diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..904341b936 --- /dev/null +++ b/.gitignore @@ -0,0 +1,70 @@ + +ccminer +*.o + +autom4te.cache +.deps + +Makefile +Makefile.in +INSTALL +aclocal.m4 +configure +configure.lineno +depcomp +missing +install-sh +stamp-h1 +ccminer-config.h* +compile +config.log +config.status +config.status.lineno +config.guess +config.sub + +mingw32-config.cache + +.dirstamp +.DS_Store +Desktop.ini +Thumbs.db + +*.iml + +Debug/ +Release/ +x64/Debug/ +x64/Release/ +*.suo +*.user +*.opensdf +*.sdf +*.pdb +*.VC.opendb +*.VC.db + +.settings/ +.project +.metadata +.classpath +.loadpath +.cproject +.buildpath + +compat/curl-for-windows/ + +*.ptx +*.i +*.ii +*.gpu +*.cubin +*.stub.c +*.module_id +*.hash +*.fatbin.c +*.fatbin +*.cudafe1.cpp +*.cudafe1.c +*.cudafe2.c + diff --git a/AUTHORS b/AUTHORS deleted file mode 100644 index 540821528b..0000000000 --- a/AUTHORS +++ /dev/null @@ -1 +0,0 @@ -See README.txt diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu new file mode 100644 index 0000000000..3d2ca6080e --- /dev/null +++ b/Algo256/blake256.cu @@ -0,0 +1,599 @@ +/** + * Blake-256 Cuda Kernel (Tested on SM 5/5.2) + * + * Tanguy Pruvot / SP - Jan 2016 + */ + +#include +#include + +#include "miner.h" + +extern "C" { +#include "sph/sph_blake.h" +} + +/* threads per block */ +#define TPB 512 + +/* hash by cpu with blake 256 */ +extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14) +{ + uchar hash[64]; + sph_blake256_context ctx; + + sph_blake256_set_rounds(rounds); + + sph_blake256_init(&ctx); + sph_blake256(&ctx, input, 80); + sph_blake256_close(&ctx, hash); + + memcpy(output, hash, 32); +} + +#include "cuda_helper.h" + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#endif + +__constant__ uint32_t _ALIGN(32) d_data[12]; + +/* 8 adapters max */ +static uint32_t *d_resNonce[MAX_GPUS]; +static uint32_t *h_resNonce[MAX_GPUS]; + +/* max count of found nonces in one call */ +#define NBN 2 +static __thread uint32_t extra_results[NBN] = { UINT32_MAX }; + +#define GSPREC(a,b,c,d,x,y) { \ + v[a] += (m[x] ^ c_u256[y]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ + v[c] += v[d]; \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ + v[a] += (m[y] ^ c_u256[x]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ + v[c] += v[d]; \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ + } + +__device__ __forceinline__ +void blake256_compress_14(uint32_t *h, const uint32_t *block, const uint32_t T0) +{ + uint32_t /*_ALIGN(8)*/ m[16]; + uint32_t v[16]; + + m[0] = block[0]; + m[1] = block[1]; + m[2] = block[2]; + m[3] = block[3]; + + const uint32_t c_u256[16] = { + 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 + }; + + const uint32_t c_Padding[12] = { + 0x80000000UL, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, + }; + + #pragma unroll + for (uint32_t i = 0; i < 12; i++) { + m[i+4] = c_Padding[i]; + } + + //#pragma unroll 8 + for(uint32_t i = 0; i < 8; i++) + v[i] = h[i]; + + v[ 8] = c_u256[0]; + v[ 9] = c_u256[1]; + v[10] = c_u256[2]; + v[11] = c_u256[3]; + + v[12] = c_u256[4] ^ T0; + v[13] = c_u256[5] ^ T0; + v[14] = c_u256[6]; + v[15] = c_u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC,0,1); + GSPREC(1, 5, 0x9, 0xD,2,3); + GSPREC(2, 6, 0xA, 0xE, 4,5); + GSPREC(3, 7, 0xB, 0xF, 6,7); + GSPREC(0, 5, 0xA, 0xF, 8,9); + GSPREC(1, 6, 0xB, 0xC, 
10,11); + GSPREC(2, 7, 0x8, 0xD, 12,13); + GSPREC(3, 4, 0x9, 0xE, 14,15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); + GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + 
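+	// (editor note) Each run of eight GSPREC() calls is one unrolled BLAKE-256
+	// round: four column steps then four diagonal steps, with that round's
+	// sigma permutation folded into the (x,y) message-word indices. The
+	// __byte_perm selectors 0x1032 and 0x0321 implement the 16- and 8-bit rotates.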
GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // only compute h6 & 7 + h[6U] ^= v[6U] ^ v[14U]; + h[7U] ^= v[7U] ^ v[15U]; +} + +/* ############################################################################################################################### */ +/* Precalculated 1st 64-bytes block (midstate) method */ + +__global__ __launch_bounds__(1024,1) +void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint64_t highTarget) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + uint32_t _ALIGN(16) h[8]; + + #pragma unroll + for(int i=0; i < 8; i++) { + h[i] = d_data[i]; + } + + // ------ Close: Bytes 64 to 80 ------ + + uint32_t _ALIGN(16) ending[4]; + ending[0] = d_data[8]; + ending[1] = d_data[9]; + ending[2] = d_data[10]; + ending[3] = nonce; /* our tested value */ + + blake256_compress_14(h, ending, 640); + + if (h[7] == 0 && cuda_swab32(h[6]) <= highTarget) { +#if NBN == 2 + if (resNonce[0] != UINT32_MAX) + resNonce[1] = nonce; + else + resNonce[0] = nonce; +#else + resNonce[0] = nonce; +#endif + } + } +} + +__global__ +#if __CUDA_ARCH__ >= 500 +__launch_bounds__(512, 3) /* 40 regs */ +#endif +void blake256_gpu_hash_16_8(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint64_t highTarget) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t h[8]; + const uint32_t nonce = startNonce + thread; + + #pragma unroll + for (int i = 0; i < 8; i++) { + h[i] = d_data[i]; + } + + // ------ Close: Bytes 64 to 80 ------ + + uint32_t m[16] = { + d_data[8], d_data[9], d_data[10], nonce, + 0x80000000UL, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, + }; + + const uint32_t c_u256[16] = { + 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 + }; + + uint32_t v[16]; + + #pragma unroll + for (uint32_t i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = c_u256[0]; + v[9] = c_u256[1]; + v[10] = c_u256[2]; + v[11] = c_u256[3]; + + v[12] = c_u256[4] ^ 640U; + v[13] = c_u256[5] ^ 640U; + v[14] = c_u256[6]; + v[15] = c_u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + 
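+		// (editor note) 8-round BLAKE-256 variant (blakecoin/vanilla): only
+		// rounds 1..8 are unrolled here, and the very last diagonal step is
+		// deferred into the h7 == 0 branch below so non-candidate nonces skip it.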
GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + //GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + + // only compute h6 & 7 + //h[6] ^= v[6] ^ v[14]; + //h[7] ^= v[7] ^ v[15]; + + if ((h[7]^v[7]^v[15]) == 0) // h7 + { + GSPREC(3, 4, 0x9, 0xE, 2, 10); + if (cuda_swab32(h[6]^v[6]^v[14]) <= highTarget) { +#if NBN == 2 + if (resNonce[0] != UINT32_MAX) + resNonce[1] = nonce; + else + resNonce[0] = nonce; +#else + resNonce[0] = nonce; +#endif + } + } + } +} + +__host__ +static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget, + const int8_t rounds) +{ + uint32_t result = UINT32_MAX; + + dim3 grid((threads + TPB-1)/TPB); + dim3 block(TPB); + + /* Check error on Ctrl+C or kill to prevent segfaults on exit */ + if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) + return result; + + if (rounds == 8) + blake256_gpu_hash_16_8 <<>> (threads, startNonce, d_resNonce[thr_id], highTarget); + else + blake256_gpu_hash_16 <<>> (threads, startNonce, d_resNonce[thr_id], highTarget); + + if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + result = h_resNonce[thr_id][0]; + for (int n=0; n < (NBN-1); n++) + extra_results[n] = h_resNonce[thr_id][n+1]; + } + return result; +} + +__host__ +static void blake256mid(uint32_t *output, const uint32_t *input, int8_t rounds = 14) +{ + sph_blake256_context ctx; + + sph_blake256_set_rounds(rounds); + + sph_blake256_init(&ctx); + 
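+	// Midstate trick: hash only the first 64-byte block of the 80-byte header
+	// on the CPU here; the GPU kernels then compress just the last 16 bytes
+	// (including the tested nonce).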
sph_blake256(&ctx, input, 64); + + memcpy(output, (void*)ctx.H, 32); +} + +__host__ +void blake256_cpu_setBlock_16(uint32_t *penddata, const uint32_t *midstate, const uint32_t *ptarget) +{ + uint32_t _ALIGN(64) data[11]; + memcpy(data, midstate, 32); + data[8] = penddata[0]; + data[9] = penddata[1]; + data[10]= penddata[2]; + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 32 + 12, 0, cudaMemcpyHostToDevice)); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds=14) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t _ALIGN(64) midstate[8]; + + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + uint64_t targetHigh = ((uint64_t*)ptarget)[3]; + + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 30 : 26; + if (device_sm[dev_id] < 350) intensity = 22; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + int rc = 0; + + if (opt_benchmark) { + targetHigh = 0x1ULL << 32; + ptarget[6] = swab32(0xff); + } + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1); + CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)), -1); + init[thr_id] = true; + } + + for (int k = 0; k < 16; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256mid(midstate, endiandata, blakerounds); + blake256_cpu_setBlock_16(&pdata[16], midstate, ptarget); + + do { + // GPU HASH (second block only, first is midstate) + work->nonces[0] = blake256_cpu_hash_16(thr_id, throughput, pdata[19], targetHigh, blakerounds); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhashcpu[8]; + const uint32_t Htarg = ptarget[6]; + + for (int k=16; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + be32enc(&endiandata[19], work->nonces[0]); + blake256hash(vhashcpu, endiandata, blakerounds); + + if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget)) + { + work->valid_nonces = 1; + work_set_target_ratio(work, vhashcpu); +#if NBN > 1 + if (extra_results[0] != UINT32_MAX) { + work->nonces[1] = extra_results[0]; + be32enc(&endiandata[19], work->nonces[1]); + blake256hash(vhashcpu, endiandata, blakerounds); + if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget)) { + if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio[0]) { + work_set_target_ratio(work, vhashcpu); + xchg(work->nonces[0], work->nonces[1]); + } else { + bn_set_target_ratio(work, vhashcpu, 1); + } + work->valid_nonces = 2; + } + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + extra_results[0] = UINT32_MAX; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } +#endif + return work->valid_nonces; + } + else if (vhashcpu[6] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = 
work->nonces[0] + 1; + continue; + } + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + pdata[19]); + + *hashes_done = pdata[19] - first_nonce; + + MyStreamSynchronize(NULL, 0, device_map[thr_id]); + return rc; +} + +// cleanup +extern "C" void free_blake256(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaDeviceSynchronize(); + + cudaFreeHost(h_resNonce[thr_id]); + cudaFree(d_resNonce[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} + diff --git a/Algo256/blake2s.cu b/Algo256/blake2s.cu new file mode 100644 index 0000000000..0b4bbe0428 --- /dev/null +++ b/Algo256/blake2s.cu @@ -0,0 +1,565 @@ +/** + * Based on the SPH implementation of blake2s + * Provos Alexis - 2016 + */ + +#include "miner.h" + +#include +#include + +#include "sph/blake2s.h" +#include "sph/sph_types.h" + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#endif + +#include "cuda_helper.h" + +#ifdef __CUDA_ARCH__ + +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a) { + return __byte_perm(a, 0, 0x0321); +} + +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a) { + return __byte_perm(a, 0, 0x1032); +} + +#else +#define ROR8(u) (u >> 8) +#define ROL16(u) (u << 16) +#endif + +__device__ __forceinline__ +uint32_t xor3x(uint32_t a, uint32_t b, uint32_t c) +{ + uint32_t result; +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA +#else + result = a^b^c; +#endif + return result; +} + +static const uint32_t blake2s_IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL +}; + +static const uint8_t blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, +}; + +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = SPH_ROTR32(d ^ a, 16); \ + c = c + d; \ + b = SPH_ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = SPH_ROTR32(d ^ a, 8); \ + c = c + d; \ + b = SPH_ROTR32(b ^ c, 7); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r,0,v[0],v[4],v[ 8],v[12]); \ + G(r,1,v[1],v[5],v[ 9],v[13]); \ + G(r,2,v[2],v[6],v[10],v[14]); \ + G(r,3,v[3],v[7],v[11],v[15]); \ + G(r,4,v[0],v[5],v[10],v[15]); \ + G(r,5,v[1],v[6],v[11],v[12]); \ + G(r,6,v[2],v[7],v[ 8],v[13]); \ + G(r,7,v[3],v[4],v[ 9],v[14]); \ + } while(0) + +extern "C" void blake2s_hash(void *output, const void *input) +{ + uint32_t m[16]; + uint32_t v[16]; + uint32_t h[8]; + + uint32_t *in = (uint32_t*)input; +// COMPRESS + for(int i = 0; i < 16; ++i ) + m[i] = in[i]; + + h[0] = 0x01010020 ^ blake2s_IV[0]; + h[1] = blake2s_IV[1]; + h[2] = blake2s_IV[2]; + h[3] = blake2s_IV[3]; + h[4] = blake2s_IV[4]; + h[5] = blake2s_IV[5]; + h[6] = blake2s_IV[6]; + h[7] = blake2s_IV[7]; + + for(int i = 0; i < 8; ++i ) + v[i] = h[i]; + + v[ 8] = blake2s_IV[0]; v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; 
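+	// 0x01010020 above is word 0 of the BLAKE2s parameter block: digest length
+	// 0x20 (32 bytes), key length 0, fanout 1, depth 1. v[12] mixes in the byte
+	// counter (64 after this first block); the second compression below uses
+	// 0x50 (80 bytes total) and sets the final-block flag via ~blake2s_IV[6].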
v[11] = blake2s_IV[3]; + v[12] = 64 ^ blake2s_IV[4]; v[13] = blake2s_IV[5]; + v[14] = blake2s_IV[6]; v[15] = blake2s_IV[7]; + + ROUND( 0 ); ROUND( 1 ); + ROUND( 2 ); ROUND( 3 ); + ROUND( 4 ); ROUND( 5 ); + ROUND( 6 ); ROUND( 7 ); + ROUND( 8 ); ROUND( 9 ); + + for(size_t i = 0; i < 8; ++i) + h[i] ^= v[i] ^ v[i + 8]; + +// COMPRESS + m[0] = in[16]; m[1] = in[17]; + m[2] = in[18]; m[3] = in[19]; + for(size_t i = 4; i < 16; ++i) + m[i] = 0; + + for(size_t i = 0; i < 8; ++i) + v[i] = h[i]; + + v[ 8] = blake2s_IV[0]; v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; v[11] = blake2s_IV[3]; + v[12] = 0x50 ^ blake2s_IV[4]; v[13] = blake2s_IV[5]; + v[14] = ~blake2s_IV[6]; v[15] = blake2s_IV[7]; + + ROUND( 0 ); ROUND( 1 ); + ROUND( 2 ); ROUND( 3 ); + ROUND( 4 ); ROUND( 5 ); + ROUND( 6 ); ROUND( 7 ); + ROUND( 8 ); ROUND( 9 ); + + for(size_t i = 0; i < 8; ++i) + h[i] ^= v[i] ^ v[i + 8]; + + memcpy(output, h, 32); +} + +#define TPB 1024 +#define NPT 256 +#define maxResults 16 +#define NBN 1 + +__constant__ uint32_t _ALIGN(32) midstate[20]; + +static uint32_t *d_resNonce[MAX_GPUS]; +static uint32_t *h_resNonce[MAX_GPUS]; + +#define GS4(a,b,c,d,e,f,a1,b1,c1,d1,e1,f1,a2,b2,c2,d2,e2,f2,a3,b3,c3,d3,e3,f3){ \ + a += b + e; a1+= b1 + e1; a2+= b2 + e2; a3+= b3 + e3; \ + d = ROL16( d ^ a); d1 = ROL16(d1 ^ a1); d2 = ROL16(d2 ^ a2); d3 = ROL16(d3 ^ a3); \ + c +=d; c1+=d1; c2+=d2; c3+=d3;\ + b = ROTR32(b ^ c, 12); b1 = ROTR32(b1^c1, 12); b2 = ROTR32(b2^c2, 12); b3 = ROTR32(b3^c3, 12); \ + a += b + f; a1+= b1 + f1; a2+= b2 + f2; a3+= b3 + f3; \ + d = ROR8(d ^ a); d1 = ROR8(d1^a1); d2 = ROR8(d2^a2); d3 = ROR8(d3^a3); \ + c += d; c1 += d1; c2 += d2; c3 += d3;\ + b = ROTR32(b ^ c, 7); b1 = ROTR32(b1^c1, 7); b2 = ROTR32(b2^c2, 7); b3 = ROTR32(b3^c3, 7); \ +} + +__global__ __launch_bounds__(TPB,1) +void blake2s_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint32_t ptarget7) +{ + const uint32_t step = gridDim.x * blockDim.x; + + uint32_t m[ 3]; + uint32_t v[16]; + + m[0] = midstate[16]; + m[1] = midstate[17]; + m[2] = midstate[18]; + + const uint32_t h7 = midstate[19]; + + for(uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x ; thread data; + uint32_t *ptarget = work->target; + uint32_t *resNonces; + + const uint32_t first_nonce = pdata[19]; + + const int dev_id = device_map[thr_id]; + int rc = 0; + int intensity = is_windows() ? 
25 : 28; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + const dim3 grid((throughput + (NPT*TPB)-1)/(NPT*TPB)); + const dim3 block(TPB); + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], maxResults * sizeof(uint32_t)), -1); + CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], maxResults * sizeof(uint32_t)), -1); + init[thr_id] = true; + } + resNonces = h_resNonce[thr_id]; + + for (int i=0; i < 19; i++) { + be32enc(&endiandata[i], pdata[i]); + } + blake2s_setBlock(endiandata,ptarget[7]); + + cudaMemset(d_resNonce[thr_id], 0x00, maxResults*sizeof(uint32_t)); + + do { + if(ptarget[7]) { + blake2s_gpu_hash_nonce<<>>(throughput,pdata[19],d_resNonce[thr_id],ptarget[7]); + } else { + blake2s_gpu_hash_nonce<<>>(throughput,pdata[19],d_resNonce[thr_id]); + } + cudaMemcpy(resNonces, d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + if(resNonces[0]) + { + cudaMemcpy(resNonces, d_resNonce[thr_id], maxResults*sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t)); + + if(resNonces[0] >= maxResults) { + gpulog(LOG_WARNING, thr_id, "candidates flood: %u", resNonces[0]); + resNonces[0] = maxResults-1; + } + + uint32_t vhashcpu[8]; + uint32_t nonce = sph_bswap32(resNonces[1]); + be32enc(&endiandata[19], nonce); + blake2s_hash(vhashcpu, endiandata); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if(vhashcpu[6] <= ptarget[6] && fulltest(vhashcpu, ptarget)) + { + work_set_target_ratio(work, vhashcpu); + work->nonces[0] = nonce; + rc = work->valid_nonces = 1; + + // search for 2nd best nonce + for(uint32_t j=2; j <= resNonces[0]; j++) + { + nonce = sph_bswap32(resNonces[j]); + be32enc(&endiandata[19], nonce); + blake2s_hash(vhashcpu, endiandata); + if(vhashcpu[6] <= ptarget[6] && fulltest(vhashcpu, ptarget)) + { + gpulog(LOG_DEBUG, thr_id, "Multiple nonces: 1/%08x - %u/%08x", work->nonces[0], j, nonce); + + work->nonces[1] = nonce; + if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio[0]) { + work->shareratio[1] = work->shareratio[0]; + work->sharediff[1] = work->sharediff[0]; + xchg(work->nonces[1], work->nonces[0]); + work_set_target_ratio(work, vhashcpu); + } else if (work->valid_nonces == 1) { + bn_set_target_ratio(work, vhashcpu, 1); + } + + work->valid_nonces++; + rc = 2; + break; + } + } + pdata[19] = max(work->nonces[0], work->nonces[1]); // next scan start + return rc; + } else if (vhashcpu[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", resNonces[0]); + } + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && (uint64_t)max_nonce > (uint64_t)throughput + pdata[19]); + + *hashes_done = pdata[19] - first_nonce; + + return rc; +} + +// cleanup +extern "C" void free_blake2s(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaDeviceSynchronize(); + + cudaFreeHost(h_resNonce[thr_id]); + cudaFree(d_resNonce[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} + diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu new 
file mode 100644 index 0000000000..c2e4be60cc --- /dev/null +++ b/Algo256/bmw.cu @@ -0,0 +1,138 @@ +/** + * bmw-256 MDT + * tpruvot - 2015 + */ +extern "C" { +#include "sph/sph_bmw.h" +} + +#include +#include + +static uint32_t *d_hash[MAX_GPUS]; + +extern void bmw256_midstate_init(int thr_id, uint32_t threads); +extern void bmw256_midstate_free(int thr_id); +extern void bmw256_setBlock_80(int thr_id, void *pdata); +extern void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int swap); + +extern uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); + +// CPU Hash +extern "C" void bmw_hash(void *state, const void *input) +{ + uint32_t _ALIGN(64) hash[16]; + sph_bmw256_context ctx; + + sph_bmw256_init(&ctx); + sph_bmw256(&ctx, input, 80); + sph_bmw256_close(&ctx, (void*) hash); + + memcpy(state, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x0005; + + if (!init[thr_id]) { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_check_cpu_init(thr_id, throughput); + bmw256_midstate_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) { + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + } + + cudaGetLastError(); + bmw256_setBlock_80(thr_id, (void*)endiandata); + + cuda_check_cpu_setTarget(ptarget); + + do { + bmw256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], 1); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + bmw_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + bmw_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup 
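+// (frees the device hash buffer, the BMW midstate and the cuda_check state,
+//  then clears init[thr_id] so a later scanhash_bmw() call re-allocates them)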
+extern "C" void free_bmw(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + bmw256_midstate_free(thr_id); + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu new file mode 100644 index 0000000000..418ca07ec5 --- /dev/null +++ b/Algo256/cuda_blake256.cu @@ -0,0 +1,580 @@ +/** + * Blake-256 Cuda Kernel (Tested on SM 5.0) + * + * Tanguy Pruvot - Nov. 2014 + * + * + merged blake+keccak kernel for lyra2v2 + */ +extern "C" { +#include "sph/sph_blake.h" +} + +#include "cuda_helper.h" + +#include + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +__device__ uint32_t __byte_perm(uint32_t a, uint32_t b, uint32_t c); +#endif + +#define UINT2(x,y) make_uint2(x,y) + +__device__ __inline__ uint2 ROR8(const uint2 a) { + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.x, a.y, 0x0765); + return result; +} + +static __device__ uint64_t cuda_swab32ll(uint64_t x) { + return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); +} + +__constant__ static uint32_t c_data[3 + 1]; + +__constant__ static uint32_t sigma[16][16]; +static uint32_t c_sigma[16][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + +static const uint32_t c_IV256[8] = { + 0x6A09E667, 0xBB67AE85, + 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + +__device__ __constant__ static uint32_t cpu_h[8]; + +__device__ __constant__ static uint32_t u256[16]; +static const uint32_t c_u256[16] = { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 +}; + +__constant__ uint2 keccak_round_constants35[24] = { + { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, + { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 }, + { 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 }, + { 0x0000008aul, 0x00000000 }, { 0x00000088ul, 0x00000000 }, + { 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 }, + { 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 }, + { 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 }, + { 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 }, + { 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, + { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } +}; + + 
+#define GS2(a,b,c,d,x) { \ + const uint32_t idx1 = sigma[r][x]; \ + const uint32_t idx2 = sigma[r][(x)+1]; \ + v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \ + v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ +\ + v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \ + v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ +} + +//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) +//#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define hostGS(a,b,c,d,x) { \ + const uint32_t idx1 = c_sigma[r][x]; \ + const uint32_t idx2 = c_sigma[r][(x)+1]; \ + v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 12); \ +\ + v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); \ + } + +#define GSPREC(a,b,c,d,x,y) { \ + v[a] += (m[x] ^ u256[y]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ + v[c] += v[d]; \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ + v[a] += (m[y] ^ u256[x]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ + v[c] += v[d]; \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ + } + +/* Second part (64-80) msg never change, store it */ +__device__ __constant__ static const uint32_t c_Padding[16] = { + 0, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, +}; + +__host__ __forceinline__ +static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint32_t T0) +{ + uint32_t m[16]; + uint32_t v[16]; + + for (int i = 0; i < 16; i++) { + m[i] = block[i]; + } + + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = c_u256[0]; + v[9] = c_u256[1]; + v[10] = c_u256[2]; + v[11] = c_u256[3]; + + v[12] = c_u256[4] ^ T0; + v[13] = c_u256[5] ^ T0; + v[14] = c_u256[6]; + v[15] = c_u256[7]; + + for (int r = 0; r < 14; r++) { + /* column step */ + hostGS(0, 4, 0x8, 0xC, 0x0); + hostGS(1, 5, 0x9, 0xD, 0x2); + hostGS(2, 6, 0xA, 0xE, 0x4); + hostGS(3, 7, 0xB, 0xF, 0x6); + /* diagonal step */ + hostGS(0, 5, 0xA, 0xF, 0x8); + hostGS(1, 6, 0xB, 0xC, 0xA); + hostGS(2, 7, 0x8, 0xD, 0xC); + hostGS(3, 4, 0x9, 0xE, 0xE); + } + + for (int i = 0; i < 16; i++) { + int j = i & 7; + h[j] ^= v[i]; + } +} + +__device__ __forceinline__ +static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0) +{ + uint32_t m[16]; + uint32_t v[16]; + + m[0] = block[0]; + m[1] = block[1]; + m[2] = block[2]; + m[3] = block[3]; + + #pragma unroll + for (int i = 4; i < 16; i++) { + m[i] = c_Padding[i]; + } + + #pragma unroll 8 + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = u256[0]; + v[9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + + v[12] = u256[4] ^ T0; + v[13] = u256[5] ^ T0; + v[14] = u256[6]; + v[15] = u256[7]; + + #pragma unroll 14 + for (int r = 0; r < 14; r++) { + /* column step */ + GS2(0, 4, 0x8, 0xC, 0x0); + GS2(1, 5, 0x9, 0xD, 0x2); + GS2(2, 6, 0xA, 0xE, 0x4); + GS2(3, 7, 0xB, 0xF, 0x6); + /* diagonal step */ + GS2(0, 5, 0xA, 0xF, 0x8); + GS2(1, 6, 0xB, 0xC, 0xA); + GS2(2, 7, 0x8, 0xD, 0xC); + GS2(3, 4, 0x9, 0xE, 0xE); + } + + #pragma unroll 16 + for (int i = 0; i < 16; i++) { + int j = i & 7; + h[j] ^= v[i]; + } +} + +static void __forceinline__ __device__ keccak_block(uint2 *s) +{ + uint2 bc[5], tmpxor[5], u, v; + // uint2 s[25]; + + #pragma unroll 1 + for (int i = 0; i < 24; i++) + { + #pragma unroll + for (uint32_t x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ 
ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + u = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(u, 1); + + u = s[0]; v = s[1]; s[0] ^= (~v) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & u; s[4] ^= (~u) & v; + u = s[5]; v = s[6]; s[5] ^= (~v) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & u; s[9] ^= (~u) & v; + u = s[10]; v = s[11]; s[10] ^= (~v) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & u; s[14] ^= (~u) & v; + u = s[15]; v = s[16]; s[15] ^= (~v) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & u; s[19] ^= (~u) & v; + u = s[20]; v = s[21]; s[20] ^= (~v) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & u; s[24] ^= (~u) & v; + s[0] ^= keccak_round_constants35[i]; + } +} + + +//__launch_bounds__(256) +__global__ +void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + const uint32_t T0 = 640; + + uint32_t h[8]; + #pragma unroll 8 + for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; } + + uint32_t v[16]; + + const uint32_t c_Padding[12] = { + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640 + }; + + const uint32_t u256[16] = { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 + }; + + uint32_t m[16] = { + c_data[0], c_data[1], c_data[2], nonce, + c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3], + c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7], + c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11] + }; + + #pragma unroll 8 + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = u256[0]; + v[9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + v[12] = u256[4] ^ T0; + v[13] = u256[5] ^ T0; + v[14] = u256[6]; + v[15] = u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + 
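+		// (editor note) Unlike the 8-round kernel in blake256.cu, this merged
+		// blake+keccak kernel unrolls the full 14 BLAKE-256 rounds; only the
+		// nonce word m[3] varies per thread, m[0..2] come from the block header
+		// and m[4..15] are the fixed padding words.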
GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); + GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 
0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]); + h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]); + h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]); + h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]); + h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]); + h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]); + h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]); + h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]); + + uint2 keccak_gpu_state[25] = { 0 }; + keccak_gpu_state[0].x = h[0]; + keccak_gpu_state[0].y = h[1]; + keccak_gpu_state[1].x = h[2]; + keccak_gpu_state[1].y = h[3]; + keccak_gpu_state[2].x = h[4]; + keccak_gpu_state[2].y = h[5]; + keccak_gpu_state[3].x = h[6]; + keccak_gpu_state[3].y = h[7]; + keccak_gpu_state[4] = UINT2(1, 0); + + keccak_gpu_state[16] = UINT2(0, 0x80000000); + keccak_block(keccak_gpu_state); + + uint64_t *outputHash = (uint64_t *)Hash; + #pragma unroll 4 + for (int i = 0; i<4; i++) + outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]); + } +} + +__global__ __launch_bounds__(256, 3) +void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t h[8]; + uint32_t input[4]; + + #pragma unroll + for (int i = 0; i < 8; i++) h[i] = cpu_h[i]; + + #pragma unroll + for (int i = 0; i < 3; ++i) input[i] = c_data[i]; + + input[3] = startNonce + thread; + blake256_compress2nd(h, input, 640); + + #pragma unroll + for (int i = 0; i<4; i++) { + Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2 * i + 1])); + } + } +} + +__host__ +void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + blake256_gpu_hash_80 << > > (threads, startNonce, Hash); + MyStreamSynchronize(NULL, order, thr_id); +} + +__host__ +void blake256_cpu_setBlock_80(uint32_t *pdata) +{ + uint32_t h[8], data[20]; + + memcpy(data, pdata, 80); + memcpy(h, c_IV256, sizeof(c_IV256)); + blake256_compress1st(h, pdata, 512); + + cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_data, &data[16], sizeof(c_data), 0, cudaMemcpyHostToDevice); +} + +__host__ +void blake256_cpu_init(int thr_id, uint32_t threads) +{ + cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice); +} + +/** for lyra2v2 **/ + +__host__ +void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + blakeKeccak256_gpu_hash_80 <<>> (threads, startNonce, (uint32_t *)Hash); +} + +__host__ +void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order, cudaStream_t stream) +{ + const uint32_t threadsperblock = 256; + + 
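+	// Launch geometry: one GPU thread per nonce, rounded up to whole
+	// 256-thread blocks (e.g. a throughput of 1<<22 gives 16384 blocks).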
dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + blakeKeccak256_gpu_hash_80 <<>> (threads, startNonce, (uint32_t *)Hash); +} diff --git a/Algo256/cuda_bmw.cu b/Algo256/cuda_bmw.cu new file mode 100644 index 0000000000..c4638b9982 --- /dev/null +++ b/Algo256/cuda_bmw.cu @@ -0,0 +1,380 @@ +/** + * BMW-256 CUDA Implementation - tpruvot 2015 + * + * Not optimal but close to the sph version and easier to adapt. + */ + +#include +#include + +#define SPH_64 1 +#define USE_MIDSTATE + +extern "C" { +#include "sph/sph_bmw.h" +} + +#include "cuda_helper.h" + +__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) + +#ifndef USE_MIDSTATE +__constant__ static sph_u32 IV256[16] = { + 0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F, + 0x50515253, 0x54555657, 0x58595A5B, 0x5C5D5E5F, + 0x60616263, 0x64656667, 0x68696A6B, 0x6C6D6E6F, + 0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F +}; +#endif + +__constant__ static sph_u32 final_s[16] = { + 0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, 0xaaaaaaa3, + 0xaaaaaaa4, 0xaaaaaaa5, 0xaaaaaaa6, 0xaaaaaaa7, + 0xaaaaaaa8, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab, + 0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, 0xaaaaaaaf +}; + +static sph_bmw_small_context* d_midstate[MAX_GPUS]; + +#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 +#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 +#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 +#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 +#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 +#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 +#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 +#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 +#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28 +#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 +#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + +//#define M16_16 0, 1, 3, 4, 7, 10, 11 +//#define M16_17 1, 2, 4, 5, 8, 11, 12 +#define M16_18 2, 3, 5, 6, 9, 12, 13 +#define M16_19 3, 4, 6, 7, 10, 13, 14 +#define M16_20 4, 5, 7, 8, 11, 14, 15 +#define M16_21 5, 6, 8, 9, 12, 15, 16 +#define M16_22 6, 7, 9, 10, 13, 0, 1 +#define M16_23 7, 8, 10, 11, 14, 1, 2 +#define M16_24 8, 9, 11, 12, 15, 2, 3 +#define M16_25 9, 10, 12, 13, 0, 3, 4 +#define M16_26 10, 11, 13, 14, 1, 4, 5 +#define M16_27 11, 12, 14, 15, 2, 5, 6 +#define M16_28 12, 13, 15, 16, 3, 6, 7 +#define M16_29 13, 14, 0, 1, 4, 7, 8 +#define M16_30 14, 15, 1, 2, 5, 8, 9 +#define M16_31 15, 16, 2, 3, 6, 9, 10 + +#define ss0(x) (((x) >> 1) ^ ((x) << 3) ^ ROTL32(x, 4) ^ ROTL32(x, 19)) +#define ss1(x) (((x) >> 1) ^ ((x) << 2) ^ ROTL32(x, 8) ^ ROTL32(x, 23)) +#define ss2(x) (((x) >> 2) ^ ((x) << 1) ^ ROTL32(x, 12) ^ ROTL32(x, 25)) +#define ss3(x) (((x) >> 2) ^ ((x) << 2) ^ ROTL32(x, 15) ^ ROTL32(x, 29)) +#define ss4(x) (((x) >> 1) ^ (x)) +#define ss5(x) (((x) >> 2) ^ (x)) + +#define rs1(x) ROTL32(x, 3) +#define rs2(x) ROTL32(x, 7) +#define rs3(x) 
ROTL32(x, 13) +#define rs4(x) ROTL32(x, 16) +#define rs5(x) ROTL32(x, 19) +#define rs6(x) ROTL32(x, 23) +#define rs7(x) ROTL32(x, 27) + +#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \ + tt((data[i0] ^ h[i0]) op01 (data[i1] ^ h[i1]) op12 (data[i2] ^ h[i2]) op23 (data[i3] ^ h[i3]) op34 (data[i4] ^ h[i4])) +//#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14) +//#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15) +//#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15) +//#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13) +//#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14) +//#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15) +//#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13) +//#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14) +//#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15) +//#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14) +//#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15) +//#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9) +//#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10) +//#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11) +//#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12) +//#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13) + +__device__ +static void gpu_compress_small(const sph_u32 *data, const sph_u32 h[16], sph_u32 dh[16]) +{ + // FOLD MAKE_Qas; + + sph_u32 dx[16]; + for (int i=0; i<16; i++) + dx[i] = data[i] ^ h[i]; + + sph_u32 qt[32]; + qt[ 0] = dx[ 5] - dx[7] + dx[10] + dx[13] + dx[14]; // Ws0 + qt[ 1] = dx[ 6] - dx[8] + dx[11] + dx[14] - dx[15]; // Ws1 + qt[ 2] = dx[ 0] + dx[7] + dx[ 9] - dx[12] + dx[15]; // Ws2 + qt[ 3] = dx[ 0] - dx[1] + dx[ 8] - dx[10] + dx[13]; // Ws3 + qt[ 4] = dx[ 1] + dx[2] + dx[ 9] - dx[11] - dx[14]; // Ws4; + qt[ 5] = dx[ 3] - dx[2] + dx[10] - dx[12] + dx[15]; // Ws5; + qt[ 6] = dx[ 4] - dx[0] - dx[ 3] - dx[11] + dx[13]; // Ws6; + qt[ 7] = dx[ 1] - dx[4] - dx[ 5] - dx[12] - dx[14]; // Ws7; + qt[ 8] = dx[ 2] - dx[5] - dx[ 6] + dx[13] - dx[15]; // Ws8; + qt[ 9] = dx[ 0] - dx[3] + dx[ 6] - dx[ 7] + dx[14]; // Ws9; + qt[10] = dx[ 8] - dx[1] - dx[ 4] - dx[ 7] + dx[15]; // Ws10; + qt[11] = dx[ 8] - dx[0] - dx[ 2] - dx[ 5] + dx[ 9]; // Ws11; + qt[12] = dx[ 1] + dx[3] - dx[ 6] - dx[ 9] + dx[10]; // Ws12; + qt[13] = dx[ 2] + dx[4] + dx[ 7] + dx[10] + dx[11]; // Ws13; + qt[14] = dx[ 3] - dx[5] + dx[ 8] - dx[11] - dx[12]; // Ws14; + qt[15] = dx[12] - dx[4] - dx[ 6] - dx[ 9] + dx[13]; // Ws15; + + qt[ 0] = ss0(qt[ 0]) + h[ 1]; + qt[ 1] = ss1(qt[ 1]) + h[ 2]; + qt[ 2] = ss2(qt[ 2]) + h[ 3]; + qt[ 3] = ss3(qt[ 3]) + h[ 4]; + qt[ 4] = ss4(qt[ 4]) + h[ 5]; + + qt[ 5] = ss0(qt[ 5]) + h[ 6]; + qt[ 6] = ss1(qt[ 6]) + h[ 7]; + qt[ 7] = ss2(qt[ 7]) + h[ 8]; + qt[ 8] = ss3(qt[ 8]) + h[ 9]; + qt[ 9] = ss4(qt[ 9]) + h[10]; + + qt[10] = ss0(qt[10]) + h[11]; + qt[11] = ss1(qt[11]) + h[12]; + qt[12] = ss2(qt[12]) + h[13]; + qt[13] = ss3(qt[13]) + h[14]; + qt[14] = ss4(qt[14]) + h[15]; + + qt[15] = ss0(qt[15]) + h[ 0]; + + //MAKE_Qbs; + #define Ks(j) ((sph_u32)(0x05555555UL * j)) + #define Qs(j) (qt[j]) + + #define expand1s_in(i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + (ss1(qt[i0]) + ss2(qt[i1]) + ss3(qt[i2]) + ss0(qt[i3]) + ss1(qt[i4]) + ss2(qt[i5]) + ss3(qt[i6]) + ss0(qt[i7]) \ + + ss1(qt[i8]) + ss2(qt[i9]) + ss3(qt[i10]) + ss0(qt[i11]) + ss1(qt[i12]) + ss2(qt[i13]) + ss3(qt[i14]) + ss0(qt[i15]) \ + + ((ROTL32(data[i0m], i1m) + ROTL32(data[i3m], i4m) - 
ROTL32(data[i10m], i11m) + Ks(i16)) ^ h[i7m])) + + qt[16] = expand1s_in(16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 3, 4, 7, 10, 11); + qt[17] = expand1s_in(17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 4, 5, 8, 11, 12); + + #define expand2s_inner(qf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + (qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \ + + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \ + + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \ + + ((ROTL32(data[i0m], i1m) + ROTL32(data[i3m], i4m) - ROTL32(data[i10m], i11m) + Ks(i16)) ^ h[i7m])) + +#ifdef _MSC_VER + #define LPAR ( + #define expand2s(i16) \ + expand2s_(Qs, i16, I16_ ## i16, M16_ ## i16) + #define expand2s_(qf, i16, ix, iy) \ + expand2s_inner LPAR qf, i16, ix, iy) +#else + #define expand2s_(i16, ix, iy) \ + expand2s_inner(Qs, i16, ix, iy) + #define expand2s(i16) \ + expand2s_(i16, I16_ ## i16, M16_ ## i16) +#endif + + qt[18] = expand2s(18); + qt[19] = expand2s(19); + qt[20] = expand2s(20); + qt[21] = expand2s(21); + qt[22] = expand2s(22); + qt[23] = expand2s(23); + qt[24] = expand2s(24); + qt[25] = expand2s(25); + qt[26] = expand2s(26); + qt[27] = expand2s(27); + qt[28] = expand2s(28); + qt[29] = expand2s(29); + qt[30] = expand2s(30); + qt[31] = expand2s(31); + + sph_u32 xl, xh; + xl = Qs(16) ^ Qs(17) ^ Qs(18) ^ Qs(19) ^ Qs(20) ^ Qs(21) ^ Qs(22) ^ Qs(23); + + xh = xl ^ Qs(24) ^ Qs(25) ^ Qs(26) ^ Qs(27) ^ Qs(28) ^ Qs(29) ^ Qs(30) ^ Qs(31); + + dh[ 0] = ((xh << 5) ^ (Qs(16) >> 5) ^ data[ 0]) + (xl ^ Qs(24) ^ Qs(0)); + dh[ 1] = ((xh >> 7) ^ (Qs(17) << 8) ^ data[ 1]) + (xl ^ Qs(25) ^ Qs(1)); + dh[ 2] = ((xh >> 5) ^ (Qs(18) << 5) ^ data[ 2]) + (xl ^ Qs(26) ^ Qs(2)); + dh[ 3] = ((xh >> 1) ^ (Qs(19) << 5) ^ data[ 3]) + (xl ^ Qs(27) ^ Qs(3)); + dh[ 4] = ((xh >> 3) ^ (Qs(20) << 0) ^ data[ 4]) + (xl ^ Qs(28) ^ Qs(4)); + dh[ 5] = ((xh << 6) ^ (Qs(21) >> 6) ^ data[ 5]) + (xl ^ Qs(29) ^ Qs(5)); + dh[ 6] = ((xh >> 4) ^ (Qs(22) << 6) ^ data[ 6]) + (xl ^ Qs(30) ^ Qs(6)); + dh[ 7] = ((xh >> 11) ^ (Qs(23) << 2) ^ data[ 7]) + (xl ^ Qs(31) ^ Qs(7)); + + dh[ 8] = ROTL32(dh[4], 9) + (xh ^ Qs(24) ^ data[ 8]) + ((xl << 8) ^ Qs(23) ^ Qs( 8)); + dh[ 9] = ROTL32(dh[5], 10) + (xh ^ Qs(25) ^ data[ 9]) + ((xl >> 6) ^ Qs(16) ^ Qs( 9)); + dh[10] = ROTL32(dh[6], 11) + (xh ^ Qs(26) ^ data[10]) + ((xl << 6) ^ Qs(17) ^ Qs(10)); + dh[11] = ROTL32(dh[7], 12) + (xh ^ Qs(27) ^ data[11]) + ((xl << 4) ^ Qs(18) ^ Qs(11)); + dh[12] = ROTL32(dh[0], 13) + (xh ^ Qs(28) ^ data[12]) + ((xl >> 3) ^ Qs(19) ^ Qs(12)); + dh[13] = ROTL32(dh[1], 14) + (xh ^ Qs(29) ^ data[13]) + ((xl >> 4) ^ Qs(20) ^ Qs(13)); + dh[14] = ROTL32(dh[2], 15) + (xh ^ Qs(30) ^ data[14]) + ((xl >> 7) ^ Qs(21) ^ Qs(14)); + dh[15] = ROTL32(dh[3], 16) + (xh ^ Qs(31) ^ data[15]) + ((xl >> 2) ^ Qs(22) ^ Qs(15)); +} + +#ifndef USE_MIDSTATE + +__device__ +static void gpu_bmw256_init(sph_bmw_small_context *sc) +{ + memcpy(sc->H, IV256, sizeof sc->H); + sc->ptr = 0; + sc->bit_count = 0; +} + +__device__ +static void gpu_bmw256(sph_bmw_small_context *sc, const void *data, size_t len) +{ + sph_u32 htmp[16]; + sph_u32 *h1, *h2; + unsigned char *buf = sc->buf; + size_t ptr = sc->ptr; + + sc->bit_count += (sph_u64)len << 3; + + h1 = sc->H; + h2 = htmp; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + ptr 
+= clen; + if (ptr == sizeof sc->buf) { + sph_u32 *ht; + + gpu_compress_small((sph_u32 *) buf, h1, h2); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + sc->ptr = ptr; + if (h1 != sc->H) + memcpy(sc->H, h1, sizeof sc->H); +} + +#endif + +#define sph_enc64le(ptr, x) \ + *((uint64_t*)(ptr)) = x +#define sph_enc64le_aligned sph_enc64le + +__device__ +static void gpu_bmw256_close(sph_bmw_small_context *sc, uint2 *out) +{ + unsigned char *buf = sc->buf; + size_t ptr = sc->ptr; + + buf[ptr ++] = 0x80; + sph_u32 *h = sc->H; + + sph_u32 h1[16]; + if (ptr > (sizeof sc->buf) - 8) { + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + gpu_compress_small((sph_u32 *) buf, h, h1); + ptr = 0; + h = h1; + } + memset(buf + ptr, 0, sizeof(sc->buf) - 8 - ptr); + + sph_enc64le_aligned(buf + sizeof(sc->buf) - 8, SPH_T64(sc->bit_count)); + + sph_u32 h2[16]; + gpu_compress_small((sph_u32 *) buf, h, h2); + gpu_compress_small(h2, final_s, h1); + + uint64_t* h64 = (uint64_t*) (&h1[8]); + #pragma unroll + for (int i = 0; i < 4; i++) { + out[i] = vectorize(h64[i]); + } +} + +__global__ /* __launch_bounds__(256, 3) */ +void bmw256_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash, sph_bmw256_context *d_midstate, int swap) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nonce = startNonce + thread; + nonce = swap ? cuda_swab32(nonce): nonce; + +#ifndef USE_MIDSTATE + uint2 hash[10]; + #pragma unroll + for(int i=0;i<9;i++) + hash[i] = vectorize(c_PaddedMessage80[i]); + hash[9] = make_uint2(c_PaddedMessage80[9], nonce); + + sph_bmw256_context ctx; + gpu_bmw256_init(&ctx); + gpu_bmw256(&ctx, (void*) hash, 80); +#else + sph_bmw256_context ctx; + ctx.ptr = 16; ctx.bit_count = 640; + uint2 *buf = (uint2 *) ctx.buf; + buf[0] = vectorize(c_PaddedMessage80[8]); + buf[1] = make_uint2(c_PaddedMessage80[9], nonce); + #pragma unroll + for(int i=0;i<16;i++) + ctx.H[i] = d_midstate->H[i]; +#endif + gpu_bmw256_close(&ctx, (uint2*) &g_hash[thread << 2]); + } +} + +__host__ +void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, int swap) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + bmw256_gpu_hash_80<<>>(threads, startNonce, (uint64_t*)d_outputHash, d_midstate[thr_id], swap); +} + +__host__ +void bmw256_setBlock_80(int thr_id, void *pdata) +{ + uint64_t PaddedMessage[16]; + memcpy(PaddedMessage, pdata, 80); + memset(&PaddedMessage[10], 0, 48); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)); + + sph_bmw256_context ctx; + sph_bmw256_init(&ctx); + sph_bmw256(&ctx, (void*) PaddedMessage, 80); + CUDA_SAFE_CALL(cudaMemcpy(d_midstate[thr_id], &ctx, sizeof(sph_bmw256_context), cudaMemcpyHostToDevice)); +} + +__host__ +void bmw256_midstate_init(int thr_id, uint32_t threads) +{ + cudaMalloc(&d_midstate[thr_id], sizeof(sph_bmw256_context)); +} + +__host__ +void bmw256_midstate_free(int thr_id) +{ + cudaFree(d_midstate[thr_id]); +} diff --git a/Algo256/cuda_bmw256.cu b/Algo256/cuda_bmw256.cu new file mode 100644 index 0000000000..0fde12ee24 --- /dev/null +++ b/Algo256/cuda_bmw256.cu @@ -0,0 +1,283 @@ +#include +#include + +#include "cuda_helper.h" + +#undef SPH_ROTL32 +#define SPH_ROTL32 ROTL32 + +static uint32_t *d_gnounce[MAX_GPUS]; +static uint32_t *d_GNonce[MAX_GPUS]; + +__constant__ uint64_t pTarget[4]; + +#define shl(x, n) ((x) << (n)) +#define shr(x, n) ((x) 
>> (n)) + +#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) +#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23)) +#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) +#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) +#define ss4(x) (shr((x), 1) ^ (x)) +#define ss5(x) (shr((x), 2) ^ (x)) + +#define rs1(x) SPH_ROTL32((x), 3) +#define rs2(x) SPH_ROTL32((x), 7) +#define rs3(x) SPH_ROTL32((x), 13) +#define rs4(x) SPH_ROTL32((x), 16) +#define rs5(x) SPH_ROTL32((x), 19) +#define rs6(x) SPH_ROTL32((x), 23) +#define rs7(x) SPH_ROTL32((x), 27) + +/* Message expansion function 1 */ +__forceinline__ __device__ +uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q) +{ + return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13]) + + ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9]) + + ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5]) + + ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) + - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); +} + +/* Message expansion function 2 */ +__forceinline__ __device__ +uint32_t expand32_2(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q) +{ + return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13]) + + Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9]) + + Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5]) + + Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) + - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); +} + +__forceinline__ __device__ +void Compression256(uint32_t * M32) +{ + uint32_t Q[32], XL32, XH32; + + const uint32_t H[16] = { + 0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F, + 0x50515253, 0x54555657, 0x58595A5B, 0x5C5D5E5F, + 0x60616263, 0x64656667, 0x68696A6B, 0x6C6D6E6F, + 0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F + }; + + Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); + Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); + Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]); + Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]); + Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]); + Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]); + Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]); + Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]); + Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]); + Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]); + Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - 
(M32[9] ^ H[9]) + (M32[10] ^ H[10]); + Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]); + Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]); + Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]); + + /* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe. */ + Q[0] = ss0(Q[0]) + H[1]; + Q[1] = ss1(Q[1]) + H[2]; + Q[2] = ss2(Q[2]) + H[3]; + Q[3] = ss3(Q[3]) + H[4]; + Q[4] = ss4(Q[4]) + H[5]; + Q[5] = ss0(Q[5]) + H[6]; + Q[6] = ss1(Q[6]) + H[7]; + Q[7] = ss2(Q[7]) + H[8]; + Q[8] = ss3(Q[8]) + H[9]; + Q[9] = ss4(Q[9]) + H[10]; + Q[10] = ss0(Q[10]) + H[11]; + Q[11] = ss1(Q[11]) + H[12]; + Q[12] = ss2(Q[12]) + H[13]; + Q[13] = ss3(Q[13]) + H[14]; + Q[14] = ss4(Q[14]) + H[15]; + Q[15] = ss0(Q[15]) + H[0]; + + /* This is the Message expansion or f_1 in the documentation. */ + /* It has 16 rounds. */ + /* Blue Midnight Wish has two tunable security parameters. */ + /* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */ + /* The following relation for these parameters should is satisfied: */ + /* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */ + + #pragma unroll + for (int i=16; i<18; i++) + Q[i] = expand32_1(i, M32, H, Q); + + #pragma nounroll + for (int i=18; i<32; i++) + Q[i] = expand32_2(i, M32, H, Q); + + /* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */ + /* 16 new variables that are prooduced in the Message Expansion part. */ + XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; + XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + + + /* This part is the function f_2 - in the documentation */ + + /* Compute the double chaining pipe for the next message block. 
*/ + M32[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]); + M32[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]); + M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]); + M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]); + M32[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]); + M32[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]); + M32[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]); + M32[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]); + + M32[8] = SPH_ROTL32(M32[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]); + M32[9] = SPH_ROTL32(M32[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]); + M32[10] = SPH_ROTL32(M32[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]); + M32[11] = SPH_ROTL32(M32[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]); + M32[12] = SPH_ROTL32(M32[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]); + M32[13] = SPH_ROTL32(M32[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]); + M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); + M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); +} + +__forceinline__ __device__ +void Compression256_2(uint32_t * M32) +{ + uint32_t XL32, XH32, Q[32]; + + const uint32_t H[16] = { + 0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, 0xaaaaaaa3, + 0xaaaaaaa4, 0xaaaaaaa5, 0xaaaaaaa6, 0xaaaaaaa7, + 0xaaaaaaa8, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab, + 0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, 0xaaaaaaaf + }; + + Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); + Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); + Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]); + Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]); + Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]); + Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]); + Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]); + Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]); + Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]); + Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]); + Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]); + Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]); + Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]); + Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]); + + /* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/ + Q[0] = ss0(Q[0]) + H[1]; + Q[1] = ss1(Q[1]) + H[2]; + Q[2] = ss2(Q[2]) + H[3]; 
+ Q[3] = ss3(Q[3]) + H[4]; + Q[4] = ss4(Q[4]) + H[5]; + Q[5] = ss0(Q[5]) + H[6]; + Q[6] = ss1(Q[6]) + H[7]; + Q[7] = ss2(Q[7]) + H[8]; + Q[8] = ss3(Q[8]) + H[9]; + Q[9] = ss4(Q[9]) + H[10]; + Q[10] = ss0(Q[10]) + H[11]; + Q[11] = ss1(Q[11]) + H[12]; + Q[12] = ss2(Q[12]) + H[13]; + Q[13] = ss3(Q[13]) + H[14]; + Q[14] = ss4(Q[14]) + H[15]; + Q[15] = ss0(Q[15]) + H[0]; + + /* This is the Message expansion or f_1 in the documentation. */ + /* It has 16 rounds. */ + /* Blue Midnight Wish has two tunable security parameters. */ + /* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */ + /* The following relation for these parameters should is satisfied: */ + /* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */ + + #pragma unroll + for (int i = 16; i<18; i++) + Q[i] = expand32_1(i, M32, H, Q); + + #pragma nounroll + for (int i = 18; i<32; i++) + Q[i] = expand32_2(i, M32, H, Q); + + /* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */ + /* 16 new variables that are prooduced in the Message Expansion part. */ + XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; + XH32 = XL32 ^ Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + + M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]); + M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]); + M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); + M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); +} + +#define TPB 512 +__global__ __launch_bounds__(TPB, 2) +void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *const __restrict__ nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t message[16] = { 0 }; + + LOHI(message[0], message[1], __ldg(&g_hash[thread])); + LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads])); + LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads])); + LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads])); + + message[8]=0x80; + message[14]=0x100; + Compression256(message); + Compression256_2(message); + + if (((uint64_t*)message)[7] <= pTarget[3]) + { + uint32_t tmp = atomicExch(&nonceVector[0], startNounce + thread); + if (tmp != 0) + nonceVector[1] = tmp; + } + } +} + +__host__ +void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces) +{ + const uint32_t threadsperblock = TPB; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + cudaMemset(d_GNonce[thr_id], 0, 2 * sizeof(uint32_t)); + + bmw256_gpu_hash_32 << > >(threads, startNounce, g_hash, d_GNonce[thr_id]); + cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + resultnonces[0] = *(d_gnounce[thr_id]); + resultnonces[1] = *(d_gnounce[thr_id] + 1); +} + + +__host__ +void bmw256_cpu_init(int thr_id, uint32_t threads) +{ + cudaMalloc(&d_GNonce[thr_id], 2 * sizeof(uint32_t)); + cudaMallocHost(&d_gnounce[thr_id], 2 * sizeof(uint32_t)); +} + +__host__ +void bmw256_cpu_free(int thr_id) +{ + cudaFree(d_GNonce[thr_id]); + cudaFreeHost(d_gnounce[thr_id]); +} + +__host__ +void bmw256_setTarget(const void *pTargetIn) +{ + cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice); +} diff --git a/Algo256/cuda_cubehash256.cu b/Algo256/cuda_cubehash256.cu new file mode 100644 index 
0000000000..153e87a10e --- /dev/null +++ b/Algo256/cuda_cubehash256.cu @@ -0,0 +1,370 @@ +#include "cuda_helper.h" + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 520 +#endif + +#if __CUDA_ARCH__ < 350 +#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) +#else +#define LROT(x, bits) __funnelshift_l(x, x, bits) +#endif + +#define TPB35 576 +#define TPB50 1024 + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) + +__device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) +{ + int r; + + uint32_t x0[2][2][2][2]; + uint32_t x1[2][2][2][2]; + + for (r = 0; r < CUBEHASH_ROUNDS; r += 2) { + /* "rotate x_0jklm upwards by 7 bits" */ + x0[0][0][0][0] = ROTATEUPWARDS7(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS7(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS7(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS7(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS7(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS7(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS7(x[0][0][1][1][0]); + x0[0][1][1][1] = ROTATEUPWARDS7(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS7(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS7(x[0][1][0][0][1]); + x0[1][0][1][0] = ROTATEUPWARDS7(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS7(x[0][1][0][1][1]); + x0[1][1][0][0] = ROTATEUPWARDS7(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS7(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS7(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS7(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1jklm modulo 2^32" */ + x1[0][0][0][0] = x[1][0][0][0][0] + x[0][0][0][0][0]; + x1[0][0][0][1] = x[1][0][0][0][1] + x[0][0][0][0][1]; + x1[0][0][1][0] = x[1][0][0][1][0] + x[0][0][0][1][0]; + x1[0][0][1][1] = x[1][0][0][1][1] + x[0][0][0][1][1]; + x1[0][1][0][0] = x[1][0][1][0][0] + x[0][0][1][0][0]; + x1[0][1][0][1] = x[1][0][1][0][1] + x[0][0][1][0][1]; + x1[0][1][1][0] = x[1][0][1][1][0] + x[0][0][1][1][0]; + x1[0][1][1][1] = x[1][0][1][1][1] + x[0][0][1][1][1]; + x1[1][0][0][0] = x[1][1][0][0][0] + x[0][1][0][0][0]; + x1[1][0][0][1] = x[1][1][0][0][1] + x[0][1][0][0][1]; + x1[1][0][1][0] = x[1][1][0][1][0] + x[0][1][0][1][0]; + x1[1][0][1][1] = x[1][1][0][1][1] + x[0][1][0][1][1]; + x1[1][1][0][0] = x[1][1][1][0][0] + x[0][1][1][0][0]; + x1[1][1][0][1] = x[1][1][1][0][1] + x[0][1][1][0][1]; + x1[1][1][1][0] = x[1][1][1][1][0] + x[0][1][1][1][0]; + x1[1][1][1][1] = x[1][1][1][1][1] + x[0][1][1][1][1]; + + /* "xor x_1~jklm into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x1[1][0][0][0]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x1[1][0][0][1]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x1[1][0][1][0]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x1[1][0][1][1]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x1[1][1][0][0]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x1[1][1][0][1]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x1[1][1][1][0]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x1[1][1][1][1]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x1[0][0][0][0]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x1[0][0][0][1]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x1[0][0][1][0]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x1[0][0][1][1]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x1[0][1][0][0]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x1[0][1][0][1]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x1[0][1][1][0]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x1[0][1][1][1]; + + /* "rotate x_0jklm 
upwards by 11 bits" */ + x0[0][0][0][0] = ROTATEUPWARDS11(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS11(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS11(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS11(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS11(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS11(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS11(x[0][0][1][1][0]); + x0[0][1][1][1] = ROTATEUPWARDS11(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS11(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS11(x[0][1][0][0][1]); + x0[1][0][1][0] = ROTATEUPWARDS11(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS11(x[0][1][0][1][1]); + x0[1][1][0][0] = ROTATEUPWARDS11(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS11(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS11(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS11(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1~jk~lm modulo 2^32" */ + x[1][1][0][1][0] = x1[1][0][1][0] + x[0][0][0][0][0]; + x[1][1][0][1][1] = x1[1][0][1][1] + x[0][0][0][0][1]; + x[1][1][0][0][0] = x1[1][0][0][0] + x[0][0][0][1][0]; + x[1][1][0][0][1] = x1[1][0][0][1] + x[0][0][0][1][1]; + x[1][1][1][1][0] = x1[1][1][1][0] + x[0][0][1][0][0]; + x[1][1][1][1][1] = x1[1][1][1][1] + x[0][0][1][0][1]; + x[1][1][1][0][0] = x1[1][1][0][0] + x[0][0][1][1][0]; + x[1][1][1][0][1] = x1[1][1][0][1] + x[0][0][1][1][1]; + x[1][0][0][1][0] = x1[0][0][1][0] + x[0][1][0][0][0]; + x[1][0][0][1][1] = x1[0][0][1][1] + x[0][1][0][0][1]; + x[1][0][0][0][0] = x1[0][0][0][0] + x[0][1][0][1][0]; + x[1][0][0][0][1] = x1[0][0][0][1] + x[0][1][0][1][1]; + x[1][0][1][1][0] = x1[0][1][1][0] + x[0][1][1][0][0]; + x[1][0][1][1][1] = x1[0][1][1][1] + x[0][1][1][0][1]; + x[1][0][1][0][0] = x1[0][1][0][0] + x[0][1][1][1][0]; + x[1][0][1][0][1] = x1[0][1][0][1] + x[0][1][1][1][1]; + + /* "xor x_1~j~k~lm into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x[1][1][1][1][0]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x[1][1][1][1][1]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x[1][1][1][0][0]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x[1][1][1][0][1]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x[1][1][0][1][0]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x[1][1][0][1][1]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x[1][1][0][0][0]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x[1][1][0][0][1]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x[1][0][1][1][0]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x[1][0][1][1][1]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x[1][0][1][0][0]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x[1][0][1][0][1]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x[1][0][0][1][0]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x[1][0][0][1][1]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x[1][0][0][0][0]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x[1][0][0][0][1]; + + /* "rotate x_0jklm upwards by 7 bits" */ + x0[0][0][0][0] = ROTATEUPWARDS7(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS7(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS7(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS7(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS7(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS7(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS7(x[0][0][1][1][0]); + x0[0][1][1][1] = ROTATEUPWARDS7(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS7(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS7(x[0][1][0][0][1]); + x0[1][0][1][0] = ROTATEUPWARDS7(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS7(x[0][1][0][1][1]); + x0[1][1][0][0] = ROTATEUPWARDS7(x[0][1][1][0][0]); + x0[1][1][0][1] = 
ROTATEUPWARDS7(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS7(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS7(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1~j~k~l~m modulo 2^32" */ + x1[1][1][1][1] = x[1][1][1][1][1] + x[0][0][0][0][0]; + x1[1][1][1][0] = x[1][1][1][1][0] + x[0][0][0][0][1]; + x1[1][1][0][1] = x[1][1][1][0][1] + x[0][0][0][1][0]; + x1[1][1][0][0] = x[1][1][1][0][0] + x[0][0][0][1][1]; + x1[1][0][1][1] = x[1][1][0][1][1] + x[0][0][1][0][0]; + x1[1][0][1][0] = x[1][1][0][1][0] + x[0][0][1][0][1]; + x1[1][0][0][1] = x[1][1][0][0][1] + x[0][0][1][1][0]; + x1[1][0][0][0] = x[1][1][0][0][0] + x[0][0][1][1][1]; + x1[0][1][1][1] = x[1][0][1][1][1] + x[0][1][0][0][0]; + x1[0][1][1][0] = x[1][0][1][1][0] + x[0][1][0][0][1]; + x1[0][1][0][1] = x[1][0][1][0][1] + x[0][1][0][1][0]; + x1[0][1][0][0] = x[1][0][1][0][0] + x[0][1][0][1][1]; + x1[0][0][1][1] = x[1][0][0][1][1] + x[0][1][1][0][0]; + x1[0][0][1][0] = x[1][0][0][1][0] + x[0][1][1][0][1]; + x1[0][0][0][1] = x[1][0][0][0][1] + x[0][1][1][1][0]; + x1[0][0][0][0] = x[1][0][0][0][0] + x[0][1][1][1][1]; + + /* "xor x_1j~k~l~m into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x1[0][1][1][1]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x1[0][1][1][0]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x1[0][1][0][1]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x1[0][1][0][0]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x1[0][0][1][1]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x1[0][0][1][0]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x1[0][0][0][1]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x1[0][0][0][0]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x1[1][1][1][1]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x1[1][1][1][0]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x1[1][1][0][1]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x1[1][1][0][0]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x1[1][0][1][1]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x1[1][0][1][0]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x1[1][0][0][1]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x1[1][0][0][0]; + + /* "rotate x_0jklm upwards by 11 bits" */ + x0[0][0][0][0] = ROTATEUPWARDS11(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS11(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS11(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS11(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS11(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS11(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS11(x[0][0][1][1][0]); + x0[0][1][1][1] = ROTATEUPWARDS11(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS11(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS11(x[0][1][0][0][1]); + x0[1][0][1][0] = ROTATEUPWARDS11(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS11(x[0][1][0][1][1]); + x0[1][1][0][0] = ROTATEUPWARDS11(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS11(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS11(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS11(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1j~kl~m modulo 2^32" */ + x[1][0][1][0][1] = x1[0][1][0][1] + x[0][0][0][0][0]; + x[1][0][1][0][0] = x1[0][1][0][0] + x[0][0][0][0][1]; + x[1][0][1][1][1] = x1[0][1][1][1] + x[0][0][0][1][0]; + x[1][0][1][1][0] = x1[0][1][1][0] + x[0][0][0][1][1]; + x[1][0][0][0][1] = x1[0][0][0][1] + x[0][0][1][0][0]; + x[1][0][0][0][0] = x1[0][0][0][0] + x[0][0][1][0][1]; + x[1][0][0][1][1] = x1[0][0][1][1] + x[0][0][1][1][0]; + x[1][0][0][1][0] = x1[0][0][1][0] + x[0][0][1][1][1]; + x[1][1][1][0][1] = x1[1][1][0][1] + x[0][1][0][0][0]; + x[1][1][1][0][0] = x1[1][1][0][0] + x[0][1][0][0][1]; + x[1][1][1][1][1] = x1[1][1][1][1] + 
x[0][1][0][1][0]; + x[1][1][1][1][0] = x1[1][1][1][0] + x[0][1][0][1][1]; + x[1][1][0][0][1] = x1[1][0][0][1] + x[0][1][1][0][0]; + x[1][1][0][0][0] = x1[1][0][0][0] + x[0][1][1][0][1]; + x[1][1][0][1][1] = x1[1][0][1][1] + x[0][1][1][1][0]; + x[1][1][0][1][0] = x1[1][0][1][0] + x[0][1][1][1][1]; + + /* "xor x_1jkl~m into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x[1][0][0][0][1]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x[1][0][0][0][0]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x[1][0][0][1][1]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x[1][0][0][1][0]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x[1][0][1][0][1]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x[1][0][1][0][0]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x[1][0][1][1][1]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x[1][0][1][1][0]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x[1][1][0][0][1]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x[1][1][0][0][0]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x[1][1][0][1][1]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x[1][1][0][1][0]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x[1][1][1][0][1]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x[1][1][1][0][0]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x[1][1][1][1][1]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x[1][1][1][1][0]; + } +} + +__device__ __forceinline__ +void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) +{ + /* "the integer 1 is xored into the last state word x_11111" */ + x[1][1][1][1][1] ^= 1U; + + /* "the state is then transformed invertibly through 10r identical rounds" */ + for (int i = 0; i < 10; ++i) rrounds(x); + + /* "output the first h/8 bytes of the state" */ + hashval[0] = x[0][0][0][0][0]; + hashval[1] = x[0][0][0][0][1]; + hashval[2] = x[0][0][0][1][0]; + hashval[3] = x[0][0][0][1][1]; + hashval[4] = x[0][0][1][0][0]; + hashval[5] = x[0][0][1][0][1]; + hashval[6] = x[0][0][1][1][0]; + hashval[7] = x[0][0][1][1][1]; +} + +#if __CUDA_ARCH__ >= 500 +__global__ __launch_bounds__(TPB50, 1) +#else +__global__ __launch_bounds__(TPB35, 1) +#endif +void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { +#if __CUDA_ARCH__ >= 500 + uint2 Hash[4]; + + Hash[0] = __ldg(&g_hash[thread]); + Hash[1] = __ldg(&g_hash[thread + 1 * threads]); + Hash[2] = __ldg(&g_hash[thread + 2 * threads]); + Hash[3] = __ldg(&g_hash[thread + 3 * threads]); +#else + uint32_t Hash[8]; + + LOHI(Hash[0], Hash[1], __ldg(&((uint64_t*)g_hash)[thread])); + LOHI(Hash[2], Hash[3], __ldg(&((uint64_t*)g_hash)[thread + 1 * threads])); + LOHI(Hash[4], Hash[5], __ldg(&((uint64_t*)g_hash)[thread + 2 * threads])); + LOHI(Hash[6], Hash[7], __ldg(&((uint64_t*)g_hash)[thread + 3 * threads])); +#endif + + uint32_t x[2][2][2][2][2] = + { + 0xEA2BD4B4, 0xCCD6F29F, 0x63117E71, 0x35481EAE, + 0x22512D5B, 0xE5D94E63, 0x7E624131, 0xF4CC12BE, + 0xC2D0B696, 0x42AF2070, 0xD0720C35, 0x3361DA8C, + 0x28CCECA4, 0x8EF8AD83, 0x4680AC00, 0x40E5FBAB, + 0xD89041C3, 0x6107FBD5, 0x6C859D41, 0xF0B26679, + 0x09392549, 0x5FA25603, 0x65C892FD, 0x93CB6285, + 0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD, 0x85254725, + 0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A + }; + +#if __CUDA_ARCH__ >= 500 + x[0][0][0][0][0] ^= Hash[0].x; + x[0][0][0][0][1] ^= Hash[0].y; + x[0][0][0][1][0] ^= Hash[1].x; + x[0][0][0][1][1] ^= Hash[1].y; + x[0][0][1][0][0] ^= Hash[2].x; + x[0][0][1][0][1] ^= Hash[2].y; + x[0][0][1][1][0] ^= Hash[3].x; + x[0][0][1][1][1] ^= Hash[3].y; +#else + x[0][0][0][0][0] ^= Hash[0]; + x[0][0][0][0][1] ^= Hash[1]; + x[0][0][0][1][0] ^= 
Hash[2]; + x[0][0][0][1][1] ^= Hash[3]; + x[0][0][1][0][0] ^= Hash[4]; + x[0][0][1][0][1] ^= Hash[5]; + x[0][0][1][1][0] ^= Hash[6]; + x[0][0][1][1][1] ^= Hash[7]; +#endif + rrounds(x); + x[0][0][0][0][0] ^= 0x80U; + rrounds(x); + +#if __CUDA_ARCH__ >= 500 + Final(x, (uint32_t*)Hash); + + g_hash[thread] = Hash[0]; + g_hash[1 * threads + thread] = Hash[1]; + g_hash[2 * threads + thread] = Hash[2]; + g_hash[3 * threads + thread] = Hash[3]; +#else + Final(x, Hash); + + ((uint64_t*)g_hash)[thread] = ((uint64_t*)Hash)[0]; + ((uint64_t*)g_hash)[1 * threads + thread] = ((uint64_t*)Hash)[1]; + ((uint64_t*)g_hash)[2 * threads + thread] = ((uint64_t*)Hash)[2]; + ((uint64_t*)g_hash)[3 * threads + thread] = ((uint64_t*)Hash)[3]; +#endif + } +} + +__host__ +void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order) +{ + uint32_t tpb = TPB35; + if (cuda_arch[thr_id] >= 500) tpb = TPB50; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + + cubehash256_gpu_hash_32 <<>> (threads, startNounce, (uint2*)d_hash); +} + +__host__ +void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order, cudaStream_t stream) +{ + uint32_t tpb = TPB35; + if (cuda_arch[thr_id] >= 500) tpb = TPB50; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + + cubehash256_gpu_hash_32 <<>> (threads, startNounce, (uint2*)d_hash); +} diff --git a/cuda_fugue256.cu b/Algo256/cuda_fugue256.cu similarity index 93% rename from cuda_fugue256.cu rename to Algo256/cuda_fugue256.cu index f5ddd4f1e7..5c9d6870a3 100644 --- a/cuda_fugue256.cu +++ b/Algo256/cuda_fugue256.cu @@ -1,38 +1,24 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include #include "sph/sph_fugue.h" -#define USE_SHARED 1 - -// aus cpu-miner.c -extern int device_map[8]; +#include "cuda_helper.h" +#include -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// schon in sph_fugue.h definiert -//#define SPH_C32(x) ((uint32_t)(x ## U)) +#define USE_SHARED 1 -uint32_t *d_fugue256_hashoutput[8]; -uint32_t *d_resultNonce[8]; +uint32_t *d_fugue256_hashoutput[MAX_GPUS]; +static uint32_t *d_resultNonce[MAX_GPUS]; +static unsigned int* d_textures[MAX_GPUS][8]; __constant__ uint32_t GPUstate[30]; // Single GPU __constant__ uint32_t pTarget[8]; // Single GPU -texture mixTab0Tex; -texture mixTab1Tex; -texture mixTab2Tex; -texture mixTab3Tex; +static texture mixTab0Tex; +static texture mixTab1Tex; +static texture mixTab2Tex; +static texture mixTab3Tex; #if USE_SHARED #define mixtab0(x) (*((uint32_t*)mixtabs + ( (x)))) @@ -563,7 +549,7 @@ __global__ void __launch_bounds__(256) #else __global__ void #endif -fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) +fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) { #if USE_SHARED extern __shared__ char mixtabs[]; @@ -576,7 +562,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas __syncthreads(); #endif - int thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { /* Nimm den State und verarbeite das letztenByte (die Nounce) */ @@ -586,7 +572,7 @@ 
fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas for(int i=0;i<30;i++) sc[i] = GPUstate[i]; - uint32_t nounce = startNounce + thread; // muss noch ermittelt werden + uint32_t nounce = startNounce + thread; // muss noch ermittelt werden uint32_t q; @@ -702,7 +688,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas int i; bool rc = true; - + for (i = 7; i >= 0; i--) { if (hash[i] > pTarget[i]) { rc = false; @@ -722,62 +708,68 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas } } -#define texDef(texname, texmem, texsource, texsize) \ +#define texDef(id, texname, texmem, texsource, texsize) { \ unsigned int *texmem; \ cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } - + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} -void fugue256_cpu_init(int thr_id, int threads) +__host__ +void fugue256_cpu_init(int thr_id, uint32_t threads) { - cudaSetDevice(device_map[thr_id]); + // Link the hash tables in the GPU + texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); + texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); + texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); + texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); + + CUDA_SAFE_CALL(cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads)); + cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); +} - // Kopiere die Hash-Tabellen in den GPU-Speicher - texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); - texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); - texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); - texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); +__host__ +void fugue256_cpu_free(int thr_id) +{ + cudaFree(d_fugue256_hashoutput[thr_id]); + cudaFree(d_resultNonce[thr_id]); - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); + for (int i=0; i<4; i++) + cudaFree(d_textures[thr_id][i]); } -__host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +__host__ +void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) { - // CPU-Vorbereitungen treffen sph_fugue256_context ctx_fugue_const; sph_fugue256_init(&ctx_fugue_const); - sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern - - cudaMemcpyToSymbol( GPUstate, - ctx_fugue_const.S, - sizeof(uint32_t) * 30 ); + sph_fugue256 (&ctx_fugue_const, data, 80); + cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); + cudaMemcpyToSymbol(pTarget, pTargetIn, 32); cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); } -__host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce) +__host__ +void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) { #if USE_SHARED - const int threadsperblock = 256; // Alignment mit mixtab Grösse. 
NICHT ÄNDERN + const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN #else - const int threadsperblock = 512; // so einstellen wie gewünscht ;-) + const uint32_t threadsperblock = 512; // so einstellen wie gewünscht ;-) #endif // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs + // Größe des dynamischen Shared Memory Bereichs #if USE_SHARED size_t shared_size = 4 * 256 * sizeof(uint32_t); #else diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu new file mode 100644 index 0000000000..5d796e2ff3 --- /dev/null +++ b/Algo256/cuda_groestl256.cu @@ -0,0 +1,335 @@ +#include + +#define SPH_C32(x) ((uint32_t)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#include "cuda_helper.h" + +static uint32_t *h_GNonces[MAX_GPUS]; +static uint32_t *d_GNonces[MAX_GPUS]; +static unsigned int* d_textures[MAX_GPUS][8]; + +__constant__ uint32_t pTarget[8]; + +#define C32e(x) \ + ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) + +#define PC32up(j, r) ((uint32_t)((j) + (r))) +#define PC32dn(j, r) 0 +#define QC32up(j, r) 0xFFFFFFFF +#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) + +#define B32_0(x) __byte_perm(x, 0, 0x4440) +//((x) & 0xFF) +#define B32_1(x) __byte_perm(x, 0, 0x4441) +//(((x) >> 8) & 0xFF) +#define B32_2(x) __byte_perm(x, 0, 0x4442) +//(((x) >> 16) & 0xFF) +#define B32_3(x) __byte_perm(x, 0, 0x4443) +//((x) >> 24) + +#define MAXWELL_OR_FERMI 1 +#if MAXWELL_OR_FERMI + #define USE_SHARED 1 + // Maxwell and Fermi cards get the best speed with SHARED access it seems. + #if USE_SHARED + #define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) + #define T0dn(x) (*((uint32_t*)mixtabs + (256+(x)))) + #define T1up(x) (*((uint32_t*)mixtabs + (512+(x)))) + #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) + #define T2up(x) (*((uint32_t*)mixtabs + (1024+(x)))) + #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) + #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) + #define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x)))) + #else + #define T0up(x) tex1Dfetch(t0up2, x) + #define T0dn(x) tex1Dfetch(t0dn2, x) + #define T1up(x) tex1Dfetch(t1up2, x) + #define T1dn(x) tex1Dfetch(t1dn2, x) + #define T2up(x) tex1Dfetch(t2up2, x) + #define T2dn(x) tex1Dfetch(t2dn2, x) + #define T3up(x) tex1Dfetch(t3up2, x) + #define T3dn(x) tex1Dfetch(t3dn2, x) + #endif +#else + #define USE_SHARED 1 + // a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5! 
+ #define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) + #define T0dn(x) tex1Dfetch(t0dn2, x) + #define T1up(x) tex1Dfetch(t1up2, x) + #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) + #define T2up(x) tex1Dfetch(t2up2, x) + #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) + #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) + #define T3dn(x) tex1Dfetch(t3dn2, x) +#endif + +static texture t0up2; +static texture t0dn2; +static texture t1up2; +static texture t1dn2; +static texture t2up2; +static texture t2dn2; +static texture t3up2; +static texture t3dn2; + +#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d0] = T0up(B32_0(a[b0])) \ + ^ T1up(B32_1(a[b1])) \ + ^ T2up(B32_2(a[b2])) \ + ^ T3up(B32_3(a[b3])) \ + ^ T0dn(B32_0(a[b4])) \ + ^ T1dn(B32_1(a[b5])) \ + ^ T2dn(B32_2(a[b6])) \ + ^ T3dn(B32_3(a[b7])); \ + t[d1] = T0dn(B32_0(a[b0])) \ + ^ T1dn(B32_1(a[b1])) \ + ^ T2dn(B32_2(a[b2])) \ + ^ T3dn(B32_3(a[b3])) \ + ^ T0up(B32_0(a[b4])) \ + ^ T1up(B32_1(a[b5])) \ + ^ T2up(B32_2(a[b6])) \ + ^ T3up(B32_3(a[b7])); \ + } while (0) + + +extern uint32_t T0up_cpu[]; +extern uint32_t T0dn_cpu[]; +extern uint32_t T1up_cpu[]; +extern uint32_t T1dn_cpu[]; +extern uint32_t T2up_cpu[]; +extern uint32_t T2dn_cpu[]; +extern uint32_t T3up_cpu[]; +extern uint32_t T3dn_cpu[]; + +__device__ __forceinline__ +void groestl256_perm_P(uint32_t thread,uint32_t *a, char *mixtabs) +{ + #pragma unroll 10 + for (int r = 0; r<10; r++) + { + uint32_t t[16]; + + a[0x0] ^= PC32up(0x00, r); + a[0x2] ^= PC32up(0x10, r); + a[0x4] ^= PC32up(0x20, r); + a[0x6] ^= PC32up(0x30, r); + a[0x8] ^= PC32up(0x40, r); + a[0xA] ^= PC32up(0x50, r); + a[0xC] ^= PC32up(0x60, r); + a[0xE] ^= PC32up(0x70, r); + RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); + RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); + RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); + RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); + RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); + RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); + RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); + RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); + + #pragma unroll 16 + for (int k = 0; k<16; k++) + a[k] = t[k]; + } +} + +__device__ __forceinline__ +void groestl256_perm_Q(uint32_t thread, uint32_t *a, char *mixtabs) +{ + #pragma unroll + for (int r = 0; r<10; r++) + { + uint32_t t[16]; + + a[0x0] ^= QC32up(0x00, r); + a[0x1] ^= QC32dn(0x00, r); + a[0x2] ^= QC32up(0x10, r); + a[0x3] ^= QC32dn(0x10, r); + a[0x4] ^= QC32up(0x20, r); + a[0x5] ^= QC32dn(0x20, r); + a[0x6] ^= QC32up(0x30, r); + a[0x7] ^= QC32dn(0x30, r); + a[0x8] ^= QC32up(0x40, r); + a[0x9] ^= QC32dn(0x40, r); + a[0xA] ^= QC32up(0x50, r); + a[0xB] ^= QC32dn(0x50, r); + a[0xC] ^= QC32up(0x60, r); + a[0xD] ^= QC32dn(0x60, r); + a[0xE] ^= QC32up(0x70, r); + a[0xF] ^= QC32dn(0x70, r); + RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); + RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); + RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); + RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3); + RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5); + RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7); + RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9); + RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB); + + #pragma unroll + for (int k = 0; k<16; k++) + a[k] = t[k]; + } +} + +__global__ __launch_bounds__(256,1) +void groestl256_gpu_hash_32(uint32_t threads, uint32_t 
startNounce, uint64_t *outputHash, uint32_t *resNonces) +{ +#if USE_SHARED + extern __shared__ char mixtabs[]; + + if (threadIdx.x < 256) { + *((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(t0up2, threadIdx.x); + *((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(t0dn2, threadIdx.x); + *((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(t1up2, threadIdx.x); + *((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(t1dn2, threadIdx.x); + *((uint32_t*)mixtabs + (1024 + threadIdx.x)) = tex1Dfetch(t2up2, threadIdx.x); + *((uint32_t*)mixtabs + (1280 + threadIdx.x)) = tex1Dfetch(t2dn2, threadIdx.x); + *((uint32_t*)mixtabs + (1536 + threadIdx.x)) = tex1Dfetch(t3up2, threadIdx.x); + *((uint32_t*)mixtabs + (1792 + threadIdx.x)) = tex1Dfetch(t3dn2, threadIdx.x); + } + + __syncthreads(); +#endif + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // GROESTL + uint32_t message[16]; + uint32_t state[16]; + + #pragma unroll + for (int k = 0; k<4; k++) + LOHI(message[2*k], message[2*k+1], outputHash[k*threads+thread]); + + #pragma unroll + for (int k = 9; k<15; k++) + message[k] = 0; + + message[8] = 0x80; + message[15] = 0x01000000; + + #pragma unroll 16 + for (int u = 0; u<16; u++) + state[u] = message[u]; + + state[15] ^= 0x10000; + + // Perm + +#if USE_SHARED + groestl256_perm_P(thread, state, mixtabs); + state[15] ^= 0x10000; + groestl256_perm_Q(thread, message, mixtabs); +#else + groestl256_perm_P(thread, state, NULL); + state[15] ^= 0x10000; + groestl256_perm_P(thread, message, NULL); +#endif + #pragma unroll 16 + for (int u = 0; u<16; u++) state[u] ^= message[u]; + #pragma unroll 16 + for (int u = 0; u<16; u++) message[u] = state[u]; +#if USE_SHARED + groestl256_perm_P(thread, message, mixtabs); +#else + groestl256_perm_P(thread, message, NULL); +#endif + state[14] ^= message[14]; + state[15] ^= message[15]; + + uint32_t nonce = startNounce + thread; + if (state[15] <= pTarget[7]) { + atomicMin(&resNonces[1], resNonces[0]); + atomicMin(&resNonces[0], nonce); + } + } +} + +#define texDef(id, texname, texmem, texsource, texsize) { \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} + +__host__ +void groestl256_cpu_init(int thr_id, uint32_t threads) +{ + // Texturen mit obigem Makro initialisieren + texDef(0, t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256); + texDef(1, t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256); + texDef(2, t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256); + texDef(3, t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256); + texDef(4, t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256); + texDef(5, t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256); + texDef(6, t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256); + texDef(7, t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256); + + cudaMalloc(&d_GNonces[thr_id], 2*sizeof(uint32_t)); + cudaMallocHost(&h_GNonces[thr_id], 2*sizeof(uint32_t)); +} + +__host__ +void groestl256_cpu_free(int thr_id) +{ + for (int i=0; i<8; i++) + cudaFree(d_textures[thr_id][i]); + + cudaFree(d_GNonces[thr_id]); + cudaFreeHost(h_GNonces[thr_id]); +} + +__host__ +uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t 
startNounce, uint64_t *d_outputHash, int order) +{ + uint32_t result = UINT32_MAX; + cudaMemset(d_GNonces[thr_id], 0xff, 2*sizeof(uint32_t)); + const uint32_t threadsperblock = 256; + + // berechne wie viele Thread Blocks wir brauchen + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + +#if USE_SHARED + size_t shared_size = 8 * 256 * sizeof(uint32_t); +#else + size_t shared_size = 0; +#endif + groestl256_gpu_hash_32<<>>(threads, startNounce, d_outputHash, d_GNonces[thr_id]); + + MyStreamSynchronize(NULL, order, thr_id); + + // get first found nonce + cudaMemcpy(h_GNonces[thr_id], d_GNonces[thr_id], 1*sizeof(uint32_t), cudaMemcpyDeviceToHost); + result = *h_GNonces[thr_id]; + + return result; +} + +__host__ +uint32_t groestl256_getSecNonce(int thr_id, int num) +{ + uint32_t results[2]; + memset(results, 0xFF, sizeof(results)); + cudaMemcpy(results, d_GNonces[thr_id], sizeof(results), cudaMemcpyDeviceToHost); + if (results[1] == results[0]) + return UINT32_MAX; + return results[num]; +} + +__host__ +void groestl256_setTarget(const void *pTargetIn) +{ + cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice); +} diff --git a/Algo256/cuda_keccak256.cu b/Algo256/cuda_keccak256.cu new file mode 100644 index 0000000000..7e87bb2860 --- /dev/null +++ b/Algo256/cuda_keccak256.cu @@ -0,0 +1,383 @@ +/** + * KECCAK-256 CUDA optimised implementation, based on ccminer-alexis code + */ + +#include + +extern "C" { +#include +#include +} + +#include +#include + +#define TPB52 1024 +#define TPB50 384 +#define NPT 2 +#define NBN 2 + +static uint32_t *d_nonces[MAX_GPUS]; +static uint32_t *h_nonces[MAX_GPUS]; + +__constant__ uint2 c_message48[6]; +__constant__ uint2 c_mid[17]; + +__constant__ uint2 keccak_round_constants[24] = { + { 0x00000001, 0x00000000 }, { 0x00008082, 0x00000000 }, { 0x0000808a, 0x80000000 }, { 0x80008000, 0x80000000 }, + { 0x0000808b, 0x00000000 }, { 0x80000001, 0x00000000 }, { 0x80008081, 0x80000000 }, { 0x00008009, 0x80000000 }, + { 0x0000008a, 0x00000000 }, { 0x00000088, 0x00000000 }, { 0x80008009, 0x00000000 }, { 0x8000000a, 0x00000000 }, + { 0x8000808b, 0x00000000 }, { 0x0000008b, 0x80000000 }, { 0x00008089, 0x80000000 }, { 0x00008003, 0x80000000 }, + { 0x00008002, 0x80000000 }, { 0x00000080, 0x80000000 }, { 0x0000800a, 0x00000000 }, { 0x8000000a, 0x80000000 }, + { 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 }, { 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 } +}; + +__device__ __forceinline__ +uint2 xor3x(const uint2 a,const uint2 b,const uint2 c) { + uint2 result; +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA +#else + result = a^b^c; +#endif + return result; +} + +__device__ __forceinline__ +uint2 chi(const uint2 a,const uint2 b,const uint2 c) { // keccak chi + uint2 result; +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) +#else + result = a ^ (~b) & c; +#endif + return result; +} + +__device__ __forceinline__ +uint64_t xor5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : 
"=l"(result) : "l"(d) ,"l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} + +#if __CUDA_ARCH__ <= 500 +__global__ __launch_bounds__(TPB50, 2) +#else +__global__ __launch_bounds__(TPB52, 1) +#endif +void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint32_t *resNounce, const uint2 highTarget) +{ + uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + uint2 s[25], t[5], v, w, u[5]; +#if __CUDA_ARCH__ > 500 + uint64_t step = gridDim.x * blockDim.x; + uint64_t maxNonce = startNonce + threads; + for(uint64_t nounce = startNonce + thread; nounce 500 + #pragma unroll 22 + #else + #pragma unroll 4 + #endif + for (int i = 1; i < 23; i++) { + #pragma unroll + for(int j=0;j<5;j++) { + t[ j] = vectorize(xor5(devectorize(s[ j]),devectorize(s[j+5]),devectorize(s[j+10]),devectorize(s[j+15]),devectorize(s[j+20]))); + } + /*theta*/ + #pragma unroll + for(int j=0;j<5;j++) { + u[j] = ROL2(t[j], 1); + } + s[ 4] = xor3x(s[ 4], t[3], u[0]);s[ 9] = xor3x(s[ 9], t[3], u[0]);s[14] = xor3x(s[14], t[3], u[0]);s[19] = xor3x(s[19], t[3], u[0]);s[24] = xor3x(s[24], t[3], u[0]); + s[ 0] = xor3x(s[ 0], t[4], u[1]);s[ 5] = xor3x(s[ 5], t[4], u[1]);s[10] = xor3x(s[10], t[4], u[1]);s[15] = xor3x(s[15], t[4], u[1]);s[20] = xor3x(s[20], t[4], u[1]); + s[ 1] = xor3x(s[ 1], t[0], u[2]);s[ 6] = xor3x(s[ 6], t[0], u[2]);s[11] = xor3x(s[11], t[0], u[2]);s[16] = xor3x(s[16], t[0], u[2]);s[21] = xor3x(s[21], t[0], u[2]); + s[ 2] = xor3x(s[ 2], t[1], u[3]);s[ 7] = xor3x(s[ 7], t[1], u[3]);s[12] = xor3x(s[12], t[1], u[3]);s[17] = xor3x(s[17], t[1], u[3]);s[22] = xor3x(s[22], t[1], u[3]); + s[ 3] = xor3x(s[ 3], t[2], u[4]);s[ 8] = xor3x(s[ 8], t[2], u[4]);s[13] = xor3x(s[13], t[2], u[4]);s[18] = xor3x(s[18], t[2], u[4]);s[23] = xor3x(s[23], t[2], u[4]); + /*rho pi: b[..] = rotl(a[..] ^ d[...], ..)*/ + v = s[ 1]; + s[ 1] = ROL2(s[ 6],44); s[ 6] = ROL2(s[ 9],20); s[ 9] = ROL2(s[22],61); s[22] = ROL2(s[14],39); + s[14] = ROL2(s[20],18); s[20] = ROL2(s[ 2],62); s[ 2] = ROL2(s[12],43); s[12] = ROL2(s[13],25); + s[13] = ROL8(s[19]); s[19] = ROR8(s[23]); s[23] = ROL2(s[15],41); s[15] = ROL2(s[ 4],27); + s[ 4] = ROL2(s[24],14); s[24] = ROL2(s[21], 2); s[21] = ROL2(s[ 8],55); s[ 8] = ROL2(s[16],45); + s[16] = ROL2(s[ 5],36); s[ 5] = ROL2(s[ 3],28); s[ 3] = ROL2(s[18],21); s[18] = ROL2(s[17],15); + s[17] = ROL2(s[11],10); s[11] = ROL2(s[ 7], 6); s[ 7] = ROL2(s[10], 3); s[10] = ROL2(v, 1); + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + #pragma unroll + for(int j=0;j<25;j+=5) { + v=s[j];w=s[j + 1];s[j] = chi(s[j],s[j+1],s[j+2]);s[j+1] = chi(s[j+1],s[j+2],s[j+3]);s[j+2]=chi(s[j+2],s[j+3],s[j+4]);s[j+3]=chi(s[j+3],s[j+4],v);s[j+4]=chi(s[j+4],v,w); + } + /* iota: a[0,0] ^= round constant */ + s[ 0] ^=keccak_round_constants[ i]; + } + /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ + #pragma unroll 5 + for(int j=0;j<5;j++) { + t[ j] = xor3x(xor3x(s[j+0],s[j+5],s[j+10]), s[j+15], s[j+20]); + } + s[24] = xor3x(s[24],t[3],ROL2(t[0],1)); + s[18] = xor3x(s[18],t[2],ROL2(t[4],1)); + s[ 0] = xor3x(s[ 0],t[4],ROL2(t[1],1)); + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + s[24] = ROL2(s[24],14); + s[18] = ROL2(s[18],21); + if (devectorize(chi(s[18],s[24],s[ 0])) <= devectorize(highTarget)) { +// if(chi(s[18].x,s[24].x,s[0].x)<=highTarget.x) { +// if(chi(s[18].y,s[24].y,s[0].y)<=highTarget.y) { + const uint32_t tmp = atomicExch(&resNounce[0], nounce); + if (tmp != UINT32_MAX) + resNounce[1] = tmp; + // return; +// } + } + } +} + +__host__ +void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, const uint2 highTarget) +{ + uint32_t tpb; + dim3 grid; + if (device_sm[device_map[thr_id]] <= 500) { + tpb = TPB50; + grid.x = (threads + tpb-1)/tpb; + } else { + tpb = TPB52; + grid.x = (threads + (NPT*tpb)-1)/(NPT*tpb); + } + const dim3 block(tpb); + + keccak256_gpu_hash_80<<>>(threads, startNonce, d_nonces[thr_id], highTarget); +// cudaThreadSynchronize(); + cudaMemcpy(h_nonces[thr_id], d_nonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost); + memcpy(resNonces, h_nonces[thr_id], NBN*sizeof(uint32_t)); +} + +#if 0 +#if __CUDA_ARCH__ <= 500 +__global__ __launch_bounds__(TPB50, 2) +#else +__global__ __launch_bounds__(TPB52, 1) +#endif +void keccak256_gpu_hash_32(uint32_t threads, uint2* outputHash) +{ + uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + uint2 s[25], t[5], v, w, u[5]; + + if(thread < threads) { + #pragma unroll 25 + for (int i = 0; i<25; i++) { + if (i<4) s[i] = __ldg(&outputHash[i*threads+thread]); + else s[i] = make_uint2(0, 0); + } + s[4] = keccak_round_constants[ 0]; + s[16] = make_uint2(0, 0x80000000); + #if __CUDA_ARCH__ > 500 + #pragma unroll + #else + #pragma unroll 4 + #endif + for (uint32_t i = 0; i < 23; i++) { + /*theta*/ + #pragma unroll 5 + for(int j=0; j<5; j++) { + t[ j] = vectorize(xor5(devectorize(s[ j]),devectorize(s[j+5]),devectorize(s[j+10]),devectorize(s[j+15]),devectorize(s[j+20]))); + } + /*theta*/ + #pragma unroll 5 + for(int j=0; j<5; j++) { + u[j] = ROL2(t[j], 1); + } + s[ 4] = xor3x(s[ 4], t[3], u[0]);s[ 9] = xor3x(s[ 9], t[3], u[0]);s[14] = xor3x(s[14], t[3], u[0]);s[19] = xor3x(s[19], t[3], u[0]);s[24] = xor3x(s[24], t[3], u[0]); + s[ 0] = xor3x(s[ 0], t[4], u[1]);s[ 5] = xor3x(s[ 5], t[4], u[1]);s[10] = xor3x(s[10], t[4], u[1]);s[15] = xor3x(s[15], t[4], u[1]);s[20] = xor3x(s[20], t[4], u[1]); + s[ 1] = xor3x(s[ 1], t[0], u[2]);s[ 6] = xor3x(s[ 6], t[0], u[2]);s[11] = xor3x(s[11], t[0], u[2]);s[16] = xor3x(s[16], t[0], u[2]);s[21] = xor3x(s[21], t[0], u[2]); + s[ 2] = xor3x(s[ 2], t[1], u[3]);s[ 7] = xor3x(s[ 7], t[1], u[3]);s[12] = xor3x(s[12], t[1], u[3]);s[17] = xor3x(s[17], t[1], u[3]);s[22] = xor3x(s[22], t[1], u[3]); + s[ 3] = xor3x(s[ 3], t[2], u[4]);s[ 8] = xor3x(s[ 8], t[2], u[4]);s[13] = xor3x(s[13], t[2], u[4]);s[18] = xor3x(s[18], t[2], u[4]);s[23] = xor3x(s[23], t[2], u[4]); + /*rho pi: b[..] = rotl(a[..] 
^ d[...], ..)*/ + v = s[ 1]; + s[ 1] = ROL2(s[ 6],44); s[ 6] = ROL2(s[ 9],20); s[ 9] = ROL2(s[22],61); s[22] = ROL2(s[14],39); + s[14] = ROL2(s[20],18); s[20] = ROL2(s[ 2],62); s[ 2] = ROL2(s[12],43); s[12] = ROL2(s[13],25); + s[13] = ROL8(s[19]); s[19] = ROR8(s[23]); s[23] = ROL2(s[15],41); s[15] = ROL2(s[ 4],27); + s[ 4] = ROL2(s[24],14); s[24] = ROL2(s[21], 2); s[21] = ROL2(s[ 8],55); s[ 8] = ROL2(s[16],45); + s[16] = ROL2(s[ 5],36); s[ 5] = ROL2(s[ 3],28); s[ 3] = ROL2(s[18],21); s[18] = ROL2(s[17],15); + s[17] = ROL2(s[11],10); s[11] = ROL2(s[ 7], 6); s[ 7] = ROL2(s[10], 3); s[10] = ROL2(v, 1); + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + #pragma unroll 5 + for(int j=0; j<25; j+=5) { + v=s[j];w=s[j + 1]; s[j] = chi(v,w,s[j+2]); s[j+1] = chi(w,s[j+2],s[j+3]); s[j+2]=chi(s[j+2],s[j+3],s[j+4]); s[j+3]=chi(s[j+3],s[j+4],v); s[j+4]=chi(s[j+4],v,w); + } + /* iota: a[0,0] ^= round constant */ + s[ 0] ^=keccak_round_constants[ i]; + } + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + #pragma unroll 5 + for(int j=0;j<5;j++) { + t[ j] = xor3x(xor3x(s[j+0],s[j+5],s[j+10]), s[j+15], s[j+20]); + } + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + #pragma unroll 5 + for(int j=0;j<5;j++) { + u[j] = ROL2(t[j],1); + } + /* thetarho pi: b[..] = rotl(a[..] ^ d[...], ..) //There's no need to perform theta and -store- the result since it's unique for each a[..]*/ + s[ 4] = xor3x(s[24], t[3], u[0]); + s[ 0] = xor3x(s[ 0], t[4], u[1]); + s[ 1] = xor3x(s[ 6], t[0], u[2]); + s[ 2] = xor3x(s[12], t[1], u[3]); + s[ 3] = xor3x(s[18], t[2], u[4]); + s[ 1] = ROR2(s[ 1],20); + s[ 2] = ROR2(s[ 2],21); + s[ 3] = ROL2(s[ 3],21); + s[ 4] = ROL2(s[ 4],14); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + outputHash[0*threads+thread] = chi(s[ 0],s[ 1],s[ 2]) ^ keccak_round_constants[23]; + outputHash[1*threads+thread] = chi(s[ 1],s[ 2],s[ 3]); + outputHash[2*threads+thread] = chi(s[ 2],s[ 3],s[ 4]); + outputHash[3*threads+thread] = chi(s[ 3],s[ 4],s[ 0]); + } +} + +__host__ +void keccak256_cpu_hash_32(const int thr_id,const uint32_t threads, uint2* d_hash) +{ + uint32_t tpb = TPB52; + if (device_sm[device_map[thr_id]] == 500) tpb = TPB50; + const dim3 grid((threads + tpb-1)/tpb); + const dim3 block(tpb); + + keccak256_gpu_hash_32 <<>> (threads, d_hash); +} +#endif + +__host__ +void keccak256_setBlock_80(uint64_t *endiandata) +{ + uint64_t midstate[17], s[25]; + uint64_t t[5], u[5]; + + s[10] = 1; //(uint64_t)make_uint2(1, 0); + s[16] = ((uint64_t)1)<<63; //(uint64_t)make_uint2(0, 0x80000000); + + t[0] = endiandata[0] ^ endiandata[5] ^ s[10]; + t[1] = endiandata[1] ^ endiandata[6] ^ s[16]; + t[2] = endiandata[2] ^ endiandata[7]; + t[3] = endiandata[3] ^ endiandata[8]; + + midstate[ 0] = ROTL64(t[1], 1); //u[0] -partial + u[1] = t[ 0] ^ ROTL64(t[2], 1); //u[1] + u[2] = t[ 1] ^ ROTL64(t[3], 1); //u[2] + midstate[ 1] = t[ 2]; //u[3] -partial + midstate[ 2] = t[ 3] ^ ROTL64(t[0], 1); //u[4] + midstate[ 3] = ROTL64(endiandata[1]^u[1], 1); //v + midstate[ 4] = ROTL64(endiandata[6]^u[1], 44); + midstate[ 5] = ROTL64(endiandata[2]^u[2], 62); + midstate[ 6] = ROTL64(u[2], 61); + midstate[ 7] = ROTL64(midstate[2], 39); + midstate[ 8] = ROTL64(u[2], 43); + midstate[ 9] = ROTL64(midstate[2], 8); + midstate[10] = ROTL64(endiandata[4]^midstate[ 2],27); + midstate[11] = ROTL64(midstate[2], 14); + midstate[12] = ROTL64(u[1], 2); + midstate[13] = ROTL64(s[16] ^ u[1], 45); + midstate[14] = ROTL64(u[2],15); + midstate[15] = ROTL64(u[1],10); + midstate[16] = ROTL64(endiandata[7]^u[2], 6); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_mid, 
midstate,17*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + + // pass only what's needed + uint64_t message48[6]; + message48[0] = endiandata[9]; + message48[1] = endiandata[4]; + message48[2] = endiandata[8]; + message48[3] = endiandata[5]; + message48[4] = endiandata[3]; + message48[5] = endiandata[0]; + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_message48, message48, 6*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); +} + +__host__ +void keccak256_cpu_init(int thr_id) +{ + CUDA_SAFE_CALL(cudaMalloc(&d_nonces[thr_id], NBN*sizeof(uint32_t))); + //CUDA_SAFE_CALL(cudaMallocHost(&h_nonces[thr_id], NBN*sizeof(uint32_t))); + h_nonces[thr_id] = (uint32_t*) malloc(NBN * sizeof(uint32_t)); + if(h_nonces[thr_id] == NULL) { + gpulog(LOG_ERR,thr_id,"Host memory allocation failed"); + exit(EXIT_FAILURE); + } +} + +__host__ +void keccak256_setOutput(int thr_id) +{ + CUDA_SAFE_CALL(cudaMemset(d_nonces[thr_id], 0xff, NBN*sizeof(uint32_t))); +} + +__host__ +void keccak256_cpu_free(int thr_id) +{ + cudaFree(d_nonces[thr_id]); + //cudaFreeHost(h_nonces[thr_id]); + free(h_nonces[thr_id]); +} diff --git a/Algo256/cuda_keccak256_sm3.cu b/Algo256/cuda_keccak256_sm3.cu new file mode 100644 index 0000000000..7100ad6ed3 --- /dev/null +++ b/Algo256/cuda_keccak256_sm3.cu @@ -0,0 +1,303 @@ +#include "miner.h" + +extern "C" { +#include +#include +} + +#include "cuda_helper.h" + +static const uint64_t host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +static uint32_t *d_KNonce[MAX_GPUS]; + +__constant__ uint32_t pTarget[8]; +__constant__ uint64_t keccak_round_constants[24]; +__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?) + +#if __CUDA_ARCH__ >= 350 +__device__ __forceinline__ +static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants) +{ + size_t i; + uint2 t[5], u[5], v, w; + + #pragma unroll + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROL2(t[1], 1); + u[1] = t[0] ^ ROL2(t[2], 1); + u[2] = t[1] ^ ROL2(t[3], 1); + u[3] = t[2] ^ ROL2(t[4], 1); + u[4] = t[3] ^ ROL2(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) 
*/ + v = s[1]; + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL2(s[19], 8); + s[19] = ROL2(s[23], 56); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; + v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= vectorize(keccak_round_constants[i]); + } +} +#else + +__device__ __forceinline__ +static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants) +{ + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) 
*/ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} +#endif + +__global__ __launch_bounds__(128,5) +void keccak256_sm3_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + +#if __CUDA_ARCH__ >= 350 + uint2 keccak_gpu_state[25]; + #pragma unroll 25 + for (int i=0; i<25; i++) { + if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]); + else keccak_gpu_state[i] = make_uint2(0, 0); + } + + keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]); + keccak_gpu_state[9].y = cuda_swab32(nounce); + keccak_gpu_state[10] = make_uint2(1, 0); + keccak_gpu_state[16] = make_uint2(0, 0x80000000); + + keccak_blockv35(keccak_gpu_state,keccak_round_constants); + if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) {resNounce[0] = nounce;} +#else + uint64_t keccak_gpu_state[25]; + #pragma unroll 25 + for (int i=0; i<25; i++) { + if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i]; + else keccak_gpu_state[i] = 0; + } + keccak_gpu_state[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); + keccak_gpu_state[10] = 0x0000000000000001; + keccak_gpu_state[16] = 0x8000000000000000; + + keccak_blockv30(keccak_gpu_state, keccak_round_constants); + if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) { resNounce[0] = nounce; } +#endif + } +} + +__host__ +void keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, int order) +{ + cudaMemset(d_KNonce[thr_id], 0xff, 2*sizeof(uint32_t)); + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 0; + + keccak256_sm3_gpu_hash_80<<>>(threads, startNounce, d_KNonce[thr_id]); + + cudaMemcpy(resNonces, d_KNonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaThreadSynchronize(); +} + +#if 0 +__global__ __launch_bounds__(256,3) +void keccak256_sm3_gpu_hash_32(uint32_t threads, 
uint32_t startNounce, uint64_t *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { +#if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */ + uint2 keccak_gpu_state[25]; + #pragma unroll 25 + for (int i = 0; i<25; i++) { + if (i<4) keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]); + else keccak_gpu_state[i] = make_uint2(0, 0); + } + keccak_gpu_state[4] = make_uint2(1, 0); + keccak_gpu_state[16] = make_uint2(0, 0x80000000); + keccak_blockv35(keccak_gpu_state, keccak_round_constants); + + #pragma unroll 4 + for (int i=0; i<4; i++) + outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]); +#else + uint64_t keccak_gpu_state[25]; + #pragma unroll 25 + for (int i = 0; i<25; i++) { + if (i<4) + keccak_gpu_state[i] = outputHash[i*threads+thread]; + else + keccak_gpu_state[i] = 0; + } + keccak_gpu_state[4] = 0x0000000000000001; + keccak_gpu_state[16] = 0x8000000000000000; + + keccak_blockv30(keccak_gpu_state, keccak_round_constants); + #pragma unroll 4 + for (int i = 0; i<4; i++) + outputHash[i*threads + thread] = keccak_gpu_state[i]; +#endif + } +} + +__host__ +void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + keccak256_sm3_gpu_hash_32 <<>> (threads, startNounce, d_outputHash); + MyStreamSynchronize(NULL, order, thr_id); +} +#endif + +__host__ +void keccak256_sm3_setBlock_80(void *pdata,const void *pTargetIn) +{ + unsigned char PaddedMessage[80]; + memcpy(PaddedMessage, pdata, 80); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); +} + +__host__ +void keccak256_sm3_init(int thr_id, uint32_t threads) +{ + CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants, + sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], 2*sizeof(uint32_t))); +} + +__host__ +void keccak256_sm3_free(int thr_id) +{ + cudaFree(d_KNonce[thr_id]); +} diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu new file mode 100644 index 0000000000..cbeb660e85 --- /dev/null +++ b/Algo256/cuda_skein256.cu @@ -0,0 +1,313 @@ +#include + +#include "cuda_helper.h" + +static __forceinline__ __device__ +void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, + const int ROT0, const int ROT1, const int ROT2, const int ROT3) +{ + p0 += p1; p1 = ROL2(p1, ROT0) ^ p0; + p2 += p3; p3 = ROL2(p3, ROT1) ^ p2; + p4 += p5; p5 = ROL2(p5, ROT2) ^ p4; + p6 += p7; p7 = ROL2(p7, ROT3) ^ p6; +} + +__forceinline__ __device__ +void Round_8_512v35(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts, + uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[(R+0) % 9]; + p1 += ks[(R+1) % 9]; + p2 += ks[(R+2) % 9]; + p3 += ks[(R+3) % 9]; + p4 += ks[(R+4) % 9]; + p5 += ks[(R+5) % 9] + ts[(R+0) % 3]; + p6 += ks[(R+6) % 9] + ts[(R+1) % 
3]; + p7 += ks[(R+7) % 9] + make_uint2(R, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[(R+1) % 9]; + p1 += ks[(R+2) % 9]; + p2 += ks[(R+3) % 9]; + p3 += ks[(R+4) % 9]; + p4 += ks[(R+5) % 9]; + p5 += ks[(R+6) % 9] + ts[(R+1) % 3]; + p6 += ks[(R+7) % 9] + ts[(R+2) % 3]; + p7 += ks[(R+8) % 9] + make_uint2(R+1, 0); +} + +__forceinline__ __device__ +void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts, + uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[8]; + p1 += ks[0]; + p2 += ks[1]; + p3 += ks[2]; + p4 += ks[3]; + p5 += ks[4] + ts[2]; + p6 += ks[5] + ts[0]; + p7 += ks[6] + make_uint2(17, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[0]; + p1 += ks[1]; + p2 += ks[2]; + p3 += ks[3]; +} + +__global__ __launch_bounds__(256,3) +void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA }; + const uint2 t12[6] = { + { 0x20, 0 }, + { 0, 0xf0000000 }, + { 0x20, 0xf0000000 }, + { 0x08, 0 }, + { 0, 0xff000000 }, + { 0x08, 0xff000000 } + }; + + uint2 h[9] = { + { 0x2FDB3E13, 0xCCD044A1 }, + { 0x1A79A9EB, 0xE8359030 }, + { 0x4F816E6F, 0x55AEA061 }, + { 0xAE9B94DB, 0x2A2767A4 }, + { 0x74DD7683, 0xEC06025E }, + { 0xC4746251, 0xE7A436CD }, + { 0x393AD185, 0xC36FBAF9 }, + { 0x33EDFC13, 0x3EEDBA18 }, + { 0xC73A4E2A, 0xB69D3CFC } + }; + uint2 dt0,dt1,dt2,dt3; + uint2 p0, p1, p2, p3, p4, p5, p6, p7; + + LOHI(dt0.x,dt0.y,outputHash[thread]); + LOHI(dt1.x,dt1.y,outputHash[threads+thread]); + LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]); + LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]); + + p0 = h[0] + dt0; + p1 = h[1] + dt1; + p2 = h[2] + dt2; + p3 = h[3] + dt3; + p4 = h[4]; + p5 = h[5] + t12[0]; + p6 = h[6] + t12[1]; + p7 = h[7]; + + // forced unroll required + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 1); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 3); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 5); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 7); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 9); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 11); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 13); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 15); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 17); + + p0 ^= dt0; + p1 ^= dt1; + p2 ^= dt2; + p3 ^= dt3; + + h[0] = p0; + h[1] = p1; + h[2] = p2; + h[3] = p3; + h[4] = p4; + h[5] = p5; + h[6] = p6; + h[7] = p7; + h[8] = skein_ks_parity ^ h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7]; + + const uint2 *t = t12+3; + p5 += t12[3]; //p5 already equal h[5] + p6 += t12[4]; + + // forced unroll + 
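+ /* Output transform: h[] was re-keyed above with the chaining value of the message block and t now points at the output-type tweak words (t12[3..5]); the unrolled calls below apply the 72 Threefish-512 rounds again, each Round_8_512v35() doing 8 rounds plus two subkey injections, with the trimmed final call only producing the four 64-bit words kept as the 256-bit result. */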
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 1); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 3); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 5); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 7); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 9); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 11); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 13); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 15); + Round_8_512v35_final(h, t, p0, p1, p2, p3, p4, p5, p6, p7); + + outputHash[thread] = devectorize(p0); + outputHash[threads+thread] = devectorize(p1); + outputHash[2*threads+thread] = devectorize(p2); + outputHash[3*threads+thread] = devectorize(p3); + } +} + +static __forceinline__ __device__ +void Round512v30(uint64_t &p0, uint64_t &p1, uint64_t &p2, uint64_t &p3, + uint64_t &p4, uint64_t &p5, uint64_t &p6, uint64_t &p7, + const int ROT0, const int ROT1, const int ROT2, const int ROT3) +{ + p0 += p1; p1 = ROTL64(p1, ROT0) ^ p0; + p2 += p3; p3 = ROTL64(p3, ROT1) ^ p2; + p4 += p5; p5 = ROTL64(p5, ROT2) ^ p4; + p6 += p7; p7 = ROTL64(p7, ROT3) ^ p6; +} + +static __forceinline__ __device__ +void Round_8_512v30(uint64_t *ks, uint64_t *ts, + uint64_t &p0, uint64_t &p1, uint64_t &p2, uint64_t &p3, + uint64_t &p4, uint64_t &p5, uint64_t &p6, uint64_t &p7, int R) +{ + Round512v30(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v30(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v30(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v30(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[(R+0) % 9]; + p1 += ks[(R+1) % 9]; + p2 += ks[(R+2) % 9]; + p3 += ks[(R+3) % 9]; + p4 += ks[(R+4) % 9]; + p5 += ks[(R+5) % 9] + ts[(R+0) % 3]; + p6 += ks[(R+6) % 9] + ts[(R+1) % 3]; + p7 += ks[(R+7) % 9] + R; + + Round512v30(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v30(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v30(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v30(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[(R+1) % 9]; + p1 += ks[(R+2) % 9]; + p2 += ks[(R+3) % 9]; + p3 += ks[(R+4) % 9]; + p4 += ks[(R+5) % 9]; + p5 += ks[(R+6) % 9] + ts[(R+1) % 3]; + p6 += ks[(R+7) % 9] + ts[(R+2) % 3]; + p7 += ks[(R+8) % 9] + R+1; +} + +#define skein_ks_parity64 0x1BD11BDAA9FC1A22ull +#include +__global__ __launch_bounds__(256, 3) +void skein256_gpu_hash_32_v30(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t h[12] = { // SKEIN_IV512_256 + 0xCCD044A12FDB3E13, 0xE83590301A79A9EB, + 0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB, + 0xEC06025E74DD7683, 0xE7A436CDC4746251, + 0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13, + 0xb69d3cfcc73a4e2a, // skein_ks_parity64 ^ h[0..7] + 0x20, 0xf000000000000000, 0xf000000000000020 // t0..2 + }; + uint64_t dt0 = outputHash[thread]; + uint64_t dt1 = outputHash[threads + thread]; + uint64_t dt2 = outputHash[threads*2 + thread]; + uint64_t dt3 = outputHash[threads*3 + thread]; + + uint64_t *t = &h[9]; + uint64_t p0 = h[0] + dt0; + uint64_t p1 = h[1] + dt1; + uint64_t p2 = h[2] + dt2; + uint64_t p3 = h[3] + dt3; + uint64_t p4 = h[4]; + uint64_t p5 = h[5] + t[0]; + uint64_t p6 = h[6] + t[1]; + uint64_t p7 = h[7]; + + #pragma unroll 9 + for (int i = 1; i<19; i += 2) { + Round_8_512v30(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i); + } + + p0 ^= dt0; + p1 ^= dt1; + p2 ^= dt2; + p3 ^= dt3; + + h[0] = p0; + h[1] = p1; + h[2] = p2; + h[3] = 
p3; + h[4] = p4; + h[5] = p5; + h[6] = p6; + h[7] = p7; + h[8] = skein_ks_parity64; + + #pragma unroll 8 + for (int i = 0; i<8; i++) { + h[8] ^= h[i]; + } + + t[0] = 0x08; + t[1] = 0xff00000000000000; + t[2] = 0xff00000000000008; + + p5 += t[0]; //p5 already equal h[5] + p6 += t[1]; + + #pragma unroll 9 + for (int i = 1; i<19; i += 2) { + Round_8_512v30(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i); + } + + outputHash[thread] = p0; + outputHash[threads + thread] = p1; + outputHash[threads*2 + thread] = p2; + outputHash[threads*3 + thread] = p3; + } //thread +} + +__host__ +void skein256_cpu_init(int thr_id, uint32_t threads) +{ + cuda_get_arch(thr_id); +} + +__host__ +void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) +{ + const uint32_t threadsperblock = 256; + int dev_id = device_map[thr_id]; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + // only 1kH/s perf change between kernels on a 960... + if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) + skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash); + else + skein256_gpu_hash_32_v30<<<grid, block>>>(threads, startNounce, d_outputHash); + + MyStreamSynchronize(NULL, order, thr_id); +} + diff --git a/Algo256/decred.cu b/Algo256/decred.cu new file mode 100644 index 0000000000..e539c8e6be --- /dev/null +++ b/Algo256/decred.cu @@ -0,0 +1,475 @@ +/** + * Blake-256 Decred 180-Bytes input Cuda Kernel + * + * Tanguy Pruvot, Alexis Provos - Feb/Sep 2016 + */ + +#include +#include +#include + +extern "C" { +#include +} + +/* threads per block */ +#define TPB 640 + +/* max count of found nonces in one call (like sgminer) */ +#define MAX_RESULTS 4 + +/* hash by cpu with blake 256 */ +extern "C" void decred_hash(void *output, const void *input) +{ + sph_blake256_context ctx; + + sph_blake256_set_rounds(14); + + sph_blake256_init(&ctx); + sph_blake256(&ctx, input, 180); + sph_blake256_close(&ctx, output); +} + +#include + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#define atomicInc(p, max) (*p)++ +#endif + +__constant__ uint32_t _ALIGN(16) c_h[2]; +__constant__ uint32_t _ALIGN(16) c_data[32]; +__constant__ uint32_t _ALIGN(16) c_xors[215]; + +/* Buffers of candidate nonce(s) */ +static uint32_t *d_resNonce[MAX_GPUS]; +static uint32_t *h_resNonce[MAX_GPUS]; + +#define ROR8(a) __byte_perm(a, 0, 0x0321) +#define ROL16(a) __byte_perm(a, 0, 0x1032) + +/* macro bodies */ +#define pxorGS(a,b,c,d) { \ + v[a]+= c_xors[i++] + v[b]; \ + v[d] = ROL16(v[d] ^ v[a]); \ + v[c]+= v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 12); \ + v[a]+= c_xors[i++] + v[b]; \ + v[d] = ROR8(v[d] ^ v[a]); \ + v[c]+= v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); \ +} + +#define pxorGS2(a,b,c,d, a1,b1,c1,d1) {\ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +} + +#define pxory1GS2(a,b,c,d, a1,b1,c1,d1) { \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[ a]+= c_xors[i++] + v[ b];
v[a1]+= (c_xors[i++]^nonce) + v[b1]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +} + +#define pxory0GS2(a,b,c,d, a1,b1,c1,d1) { \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[ a]+= (c_xors[i++]^nonce) + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +} + +#define pxorx1GS2(a,b,c,d, a1,b1,c1,d1) { \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= (c_xors[i++]^nonce) + v[b1]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +} + +#define pxorx0GS2(a,b,c,d, a1,b1,c1,d1) { \ + v[ a]+= (c_xors[i++]^nonce) + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +} + +__global__ __launch_bounds__(TPB,1) +void decred_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint32_t highTarget) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + if (thread < threads) + { + uint32_t v[16]; + #pragma unroll + for(int i=0; i<16; i+=4) { + *(uint4*)&v[i] = *(uint4*)&c_data[i]; + } + + const uint32_t nonce = startNonce + thread; + v[ 1]+= (nonce ^ 0x13198A2E); + v[13] = ROR8(v[13] ^ v[1]); + v[ 9]+= v[13]; + v[ 5] = ROTR32(v[5] ^ v[9], 7); + + int i = 0; + v[ 1]+= c_xors[i++];// + v[ 6]; + v[ 0]+= v[5]; + v[12] = ROL16(v[12] ^ v[ 1]); v[15] = ROL16(v[15] ^ v[ 0]); + v[11]+= v[12]; v[10]+= v[15]; + v[ 6] = ROTR32(v[ 6] ^ v[11], 12); v[ 5] = ROTR32(v[5] ^ v[10], 12); + v[ 1]+= c_xors[i++] + v[ 6]; v[ 0]+= c_xors[i++] + v[ 5]; + v[12] = ROR8(v[12] ^ v[ 1]); v[15] = ROR8(v[15] ^ v[ 0]); + v[11]+= v[12]; v[10]+= v[15]; + v[ 6] = ROTR32(v[ 6] ^ v[11], 7); v[ 5] = ROTR32(v[ 5] ^ v[10], 7); + + pxorGS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorGS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2( 0, 5, 10, 15, 1, 6, 11, 12); pxory1GS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorGS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorx1GS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorx1GS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorGS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorx1GS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorGS2( 0, 4, 8, 12, 1, 5, 9, 13); pxory1GS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2( 2, 7, 8, 13, 3, 4, 9, 
14); + pxorGS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2( 2, 6, 10, 14, 3, 7, 11, 15); pxory1GS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorGS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorx1GS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorGS2( 0, 4, 8, 12, 1, 5, 9, 13); pxory0GS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorGS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorx0GS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxory1GS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorGS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2( 0, 5, 10, 15, 1, 6, 11, 12); pxory1GS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorGS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorx1GS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS2( 2, 7, 8, 13, 3, 4, 9, 14); + pxorx1GS2( 0, 4, 8, 12, 1, 5, 9, 13); pxorGS2( 2, 6, 10, 14, 3, 7, 11, 15); pxorGS2( 0, 5, 10, 15, 1, 6, 11, 12); pxorGS( 2, 7, 8, 13); + + if ((c_h[1]^v[15]) == v[7]) { + v[ 3] += c_xors[i++] + v[4]; + v[14] = ROL16(v[14] ^ v[3]); + v[ 9] += v[14]; + v[ 4] = ROTR32(v[4] ^ v[9], 12); + v[ 3] += c_xors[i++] + v[4]; + v[14] = ROR8(v[14] ^ v[3]); + if(cuda_swab32((c_h[0]^v[6]^v[14])) <= highTarget) { + uint32_t pos = atomicInc(&resNonce[0], UINT32_MAX)+1; + resNonce[pos] = nonce; + return; + } + } + } +} + +__host__ +void decred_cpu_setBlock_52(const uint32_t *input) +{ +/* + Precompute everything possible and pass it on constant memory +*/ + const uint32_t z[16] = { + 0x243F6A88U, 0x85A308D3U, 0x13198A2EU, 0x03707344U, + 0xA4093822U, 0x299F31D0U, 0x082EFA98U, 0xEC4E6C89U, + 0x452821E6U, 0x38D01377U, 0xBE5466CFU, 0x34E90C6CU, + 0xC0AC29B7U, 0xC97C50DDU, 0x3F84D5B5U, 0xB5470917U + }; + + int i=0; + uint32_t _ALIGN(64) preXOR[215]; + uint32_t _ALIGN(64) data[16]; + uint32_t _ALIGN(64) m[16]; + uint32_t _ALIGN(64) h[ 2]; + + sph_blake256_context ctx; + sph_blake256_set_rounds(14); + sph_blake256_init(&ctx); + sph_blake256(&ctx, input, 128); + + data[ 0] = ctx.H[0]; + data[ 1] = ctx.H[1]; + data[ 2] = ctx.H[2]; + data[ 3] = ctx.H[3]; + data[ 4] = ctx.H[4]; + data[ 5] = ctx.H[5]; + data[ 8] = ctx.H[6]; + + data[12] = swab32(input[35]); + data[13] = ctx.H[7]; + + // pre swab32 + m[ 0] = swab32(input[32]); m[ 1] = swab32(input[33]); + m[ 2] = swab32(input[34]); m[ 3] = 0; + m[ 4] = swab32(input[36]); m[ 5] = swab32(input[37]); + m[ 6] = swab32(input[38]); m[ 7] = swab32(input[39]); + m[ 8] = swab32(input[40]); m[ 9] = swab32(input[41]); + m[10] = swab32(input[42]); m[11] = swab32(input[43]); + m[12] = swab32(input[44]); m[13] = 0x80000001; + m[14] = 0; + m[15] = 0x000005a0; + + h[ 0] = data[ 8]; + h[ 1] = data[13]; + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_h,h, 8, 0, cudaMemcpyHostToDevice)); + + data[ 0]+= (m[ 0] ^ z[1]) + data[ 4]; + data[12] = SPH_ROTR32(z[4] ^ SPH_C32(0x5A0) ^ data[ 0], 16); + + data[ 8] = z[0]+data[12]; + data[ 4] = SPH_ROTR32(data[ 4] ^ data[ 8], 12); + data[ 0]+= (m[ 1] ^ z[0]) + data[ 4]; + data[12] = SPH_ROTR32(data[12] ^ data[ 0],8); + data[ 8]+= data[12]; + data[ 4] = SPH_ROTR32(data[ 4] ^ data[ 8], 7); + + data[ 1]+= (m[ 2] ^ z[3]) + data[ 5]; + data[13] = SPH_ROTR32((z[5] ^ SPH_C32(0x5A0)) ^ data[ 1], 16); + data[ 9] = z[1]+data[13]; + data[ 5] = SPH_ROTR32(data[ 5] ^ data[ 9], 12); + data[ 1]+= data[ 5]; //+nonce ^ ... 
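+ /* For reference: each group of statements above and below hand-unrolls one BLAKE-256 G() step, i.e. a += (m[x] ^ z[y]) + b; d = ROTR32(d ^ a, 16); c += d; b = ROTR32(b ^ c, 12); a += (m[y] ^ z[x]) + b; d = ROTR32(d ^ a, 8); c += d; b = ROTR32(b ^ c, 7). Everything that does not depend on the nonce is evaluated here on the CPU and passed to the GPU through c_data/c_xors, so the kernel only has to finish the nonce-dependent work. */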
+ + data[ 2]+= (m[ 4] ^ z[5]) + h[ 0]; + data[14] = SPH_ROTR32(z[6] ^ data[ 2],16); + data[10] = z[2] + data[14]; + data[ 6] = SPH_ROTR32(h[ 0] ^ data[10], 12); + data[ 2]+= (m[ 5] ^ z[4]) + data[ 6]; + data[14] = SPH_ROTR32(data[14] ^ data[ 2], 8); + data[10]+= data[14]; + data[ 6] = SPH_ROTR32(data[ 6] ^ data[10], 7); + + data[ 3]+= (m[ 6] ^ z[7]) + h[ 1]; + data[15] = SPH_ROTR32(z[7] ^ data[ 3],16); + data[11] = z[3] + data[15]; + data[ 7] = SPH_ROTR32(h[ 1] ^ data[11], 12); + data[ 3]+= (m[ 7] ^ z[6]) + data[ 7]; + data[15] = SPH_ROTR32(data[15] ^ data[ 3],8); + data[11]+= data[15]; + data[ 7] = SPH_ROTR32(data[11] ^ data[ 7], 7); + data[ 0]+= m[ 8] ^ z[9]; + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, 64, 0, cudaMemcpyHostToDevice)); + +#define precalcXORGS(x,y) { \ + preXOR[i++]= (m[x] ^ z[y]); \ + preXOR[i++]= (m[y] ^ z[x]); \ +} +#define precalcXORGS2(x,y,x1,y1){\ + preXOR[i++] = (m[ x] ^ z[ y]);\ + preXOR[i++] = (m[x1] ^ z[y1]);\ + preXOR[i++] = (m[ y] ^ z[ x]);\ + preXOR[i++] = (m[y1] ^ z[x1]);\ +} + precalcXORGS(10,11); + preXOR[ 0]+=data[ 6]; + preXOR[i++] = (m[9] ^ z[8]); + precalcXORGS2(12,13,14,15); + precalcXORGS2(14,10, 4, 8); + precalcXORGS2( 9,15,13, 6); + precalcXORGS2( 1,12, 0, 2); + precalcXORGS2(11, 7, 5, 3); + precalcXORGS2(11, 8,12, 0); + precalcXORGS2( 5, 2,15,13); + precalcXORGS2(10,14, 3, 6); + precalcXORGS2( 7, 1, 9, 4); + precalcXORGS2( 7, 9, 3, 1); + precalcXORGS2(13,12,11,14); + precalcXORGS2( 2, 6, 5,10); + precalcXORGS2( 4, 0,15, 8); + precalcXORGS2( 9, 0, 5, 7); + precalcXORGS2( 2, 4,10,15); + precalcXORGS2(14, 1,11,12); + precalcXORGS2( 6, 8, 3,13); + precalcXORGS2( 2,12, 6,10); + precalcXORGS2( 0,11, 8, 3); + precalcXORGS2( 4,13, 7, 5); + precalcXORGS2(15,14, 1, 9); + precalcXORGS2(12, 5, 1,15); + precalcXORGS2(14,13, 4,10); + precalcXORGS2( 0, 7, 6, 3); + precalcXORGS2( 9, 2, 8,11); + precalcXORGS2(13,11, 7,14); + precalcXORGS2(12, 1, 3, 9); + precalcXORGS2( 5, 0,15, 4); + precalcXORGS2( 8, 6, 2,10); + precalcXORGS2( 6,15,14, 9); + precalcXORGS2(11, 3, 0, 8); + precalcXORGS2(12, 2,13, 7); + precalcXORGS2( 1, 4,10, 5); + precalcXORGS2(10, 2, 8, 4); + precalcXORGS2( 7, 6, 1, 5); + precalcXORGS2(15,11, 9,14); + precalcXORGS2( 3,12,13, 0); + precalcXORGS2( 0, 1, 2, 3); + precalcXORGS2( 4, 5, 6, 7); + precalcXORGS2( 8, 9,10,11); + precalcXORGS2(12,13,14,15); + precalcXORGS2(14,10, 4, 8); + precalcXORGS2( 9,15,13, 6); + precalcXORGS2( 1,12, 0, 2); + precalcXORGS2(11, 7, 5, 3); + precalcXORGS2(11, 8,12, 0); + precalcXORGS2( 5, 2,15,13); + precalcXORGS2(10,14, 3, 6); + precalcXORGS2( 7, 1, 9, 4); + precalcXORGS2( 7, 9, 3, 1); + precalcXORGS2(13,12,11,14); + precalcXORGS2( 2, 6, 5,10); + precalcXORGS( 4, 0); + precalcXORGS(15, 8); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_xors, preXOR, 215*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); +} + +/* ############################################################################################################################### */ + +static bool init[MAX_GPUS] = { 0 }; + +// nonce position is different in decred +#define DCR_NONCE_OFT32 35 + +extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[48]; + + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t *pnonce = &pdata[DCR_NONCE_OFT32]; + + const uint32_t first_nonce = *pnonce; + const uint32_t targetHigh = opt_benchmark ? 0x1ULL : ptarget[6]; + + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
29 : 25; + if (device_sm[dev_id] < 350) intensity = 22; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + const dim3 grid((throughput + TPB-1)/(TPB)); + const dim3 block(TPB); + + if (!init[thr_id]) { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], MAX_RESULTS*sizeof(uint32_t)), -1); + CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], MAX_RESULTS*sizeof(uint32_t)), -1); + init[thr_id] = true; + } + memcpy(endiandata, pdata, 180); + + decred_cpu_setBlock_52(endiandata); + cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t)); + + do { + uint32_t* resNonces = h_resNonce[thr_id]; + + if (resNonces[0]) cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t)); + + // GPU HASH + decred_gpu_hash_nonce <<<grid, block>>> (throughput, (*pnonce), d_resNonce[thr_id], targetHigh); + + *hashes_done = (*pnonce) - first_nonce + throughput; + + // first cell contains the valid nonces count + cudaMemcpy(resNonces, d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + if (resNonces[0]) + { + uint32_t _ALIGN(64) vhash[8]; + + cudaMemcpy(resNonces, d_resNonce[thr_id], (resNonces[0]+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost); + + be32enc(&endiandata[DCR_NONCE_OFT32], resNonces[1]); + decred_hash(vhash, endiandata); + if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)) + { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[0] = swab32(resNonces[1]); + *pnonce = work->nonces[0]; + + // search for other nonces + for(uint32_t n=2; n <= resNonces[0]; n++) + { + be32enc(&endiandata[DCR_NONCE_OFT32], resNonces[n]); + decred_hash(vhash, endiandata); + if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)) { + work->nonces[1] = swab32(resNonces[n]); + if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) { + // do we really want the best share submitted first? it depends...
+ work->shareratio[1] = work->shareratio[0]; + work->sharediff[1] = work->sharediff[0]; + xchg(work->nonces[1], work->nonces[0]); + work_set_target_ratio(work, vhash); + work->valid_nonces++; + } else if (work->valid_nonces == 1) { + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + } + work->valid_nonces = 2; // MAX_NONCES submit limited to 2 + + gpulog(LOG_DEBUG, thr_id, "multiple nonces 1:%08x (%g) %u:%08x (%g)", + work->nonces[0], work->sharediff[0], n, work->nonces[1], work->sharediff[1]); + + } else if (vhash[6] > ptarget[6]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result %u for %08x does not validate on CPU!", n, resNonces[n]); + } + } + return work->valid_nonces; + + } else if (vhash[6] > ptarget[6]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", resNonces[1]); + } + } + *pnonce += throughput; + + } while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + (*pnonce)); + + *hashes_done = (*pnonce) - first_nonce; + MyStreamSynchronize(NULL, 0, device_map[thr_id]); + return 0; +} + +// cleanup +extern "C" void free_decred(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaDeviceSynchronize(); + cudaFreeHost(h_resNonce[thr_id]); + cudaFree(d_resNonce[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu new file mode 100644 index 0000000000..1388a54ade --- /dev/null +++ b/Algo256/keccak256.cu @@ -0,0 +1,175 @@ +/* + * Keccak 256 + * + */ + +extern "C" +{ +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_keccak.h" + +#include "miner.h" +} + +#include "cuda_helper.h" + +// SM5+ cuda +extern void keccak256_cpu_init(int thr_id); +extern void keccak256_cpu_free(int thr_id); +extern void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, const uint2 highTarget); +extern void keccak256_setBlock_80(uint64_t *endiandata); +extern void keccak256_setOutput(int thr_id); + +// compat +extern void keccak256_sm3_init(int thr_id, uint32_t threads); +extern void keccak256_sm3_free(int thr_id); +extern void keccak256_sm3_setBlock_80(void *pdata, const void *ptarget); +extern void keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, int order); + +// CPU Hash +extern "C" void keccak256_hash(void *state, const void *input) +{ + uint32_t _ALIGN(64) hash[16]; + sph_keccak_context ctx_keccak; + + sph_keccak256_init(&ctx_keccak); + sph_keccak256 (&ctx_keccak, input, 80); + sph_keccak256_close(&ctx_keccak, (void*) hash); + + memcpy(state, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + uint32_t throughput; + uint32_t intensity = 23; + if(!use_compat_kernels[thr_id]) { + if (strstr(device_name[dev_id], "GTX 1070")) intensity = 25; + if (strstr(device_name[dev_id], "GTX 1080")) intensity = 26; + } + throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + if (!init[thr_id]) + { + 
cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + + if(!use_compat_kernels[thr_id]) { + keccak256_cpu_init(thr_id); + } else { + // really useful ? + keccak256_sm3_init(thr_id, throughput); + } + + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 19; k++) { + be32enc(&endiandata[k], pdata[k]); + } + + const uint2 highTarget = make_uint2(ptarget[6], ptarget[7]); + if(use_compat_kernels[thr_id]) + keccak256_sm3_setBlock_80((void*)endiandata, ptarget); + else { + keccak256_setBlock_80((uint64_t*)endiandata); + keccak256_setOutput(thr_id); + } + + do { + int order = 0; + + *hashes_done = pdata[19] - first_nonce + throughput; + + if(use_compat_kernels[thr_id]) + keccak256_sm3_hash_80(thr_id, throughput, pdata[19], work->nonces, order++); + else { + keccak256_cpu_hash_80(thr_id, throughput, pdata[19], work->nonces, highTarget); + } + + if (work->nonces[0] != UINT32_MAX && bench_algo < 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + keccak256_hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (!use_compat_kernels[thr_id] && work->nonces[1] != UINT32_MAX) { + be32enc(&endiandata[19], work->nonces[1]); + keccak256_hash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces++; + bn_set_target_ratio(work, vhash, 1); + } + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + //keccak256_setOutput(thr_id); + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_keccak256(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + if(!use_compat_kernels[thr_id]) + keccak256_cpu_free(thr_id); + else { + keccak256_sm3_free(thr_id); + } + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/Algo256/vanilla.cu b/Algo256/vanilla.cu new file mode 100644 index 0000000000..1b63aedd21 --- /dev/null +++ b/Algo256/vanilla.cu @@ -0,0 +1,492 @@ +/** + * Optimized Blake-256 8-rounds Cuda Kernel (Tested on SM >3.0) + * Based upon Blake-256 implementation of Tanguy Pruvot - Nov. 2014 + * + * Provos Alexis - Jan. 
2016 + * + * Fixed CUDA 7.5 flaw + * minor code changes + * code cleanup + * increased nonces per thread + * removed SSE2 midstate computation + * Provos Alexis - Mar 2016 + */ + +#include +#include + +#include "miner.h" + +extern "C" { +#include "sph/sph_blake.h" +} + +#include "cuda_helper.h" + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#endif + +/* threads per block and nonces per thread */ +#define TPB 768 +#define NPT 384 +#define NBN 2 + +__constant__ uint32_t _ALIGN(16) d_data[21]; + +/* 16 gpu threads max */ +static uint32_t *d_resNonce[MAX_GPUS]; +static uint32_t *h_resNonce[MAX_GPUS]; +static cudaStream_t streams[MAX_GPUS]; + +/* hash by cpu with blake 256 */ +extern "C" void vanillahash(void *output, const void *input, int8_t blakerounds){ + uchar hash[64]; + sph_blake256_context ctx; + + sph_blake256_set_rounds(blakerounds); + + sph_blake256_init(&ctx); + sph_blake256(&ctx, input, 80); + sph_blake256_close(&ctx, hash); + + memcpy(output, hash, 32); +} +#define GS4(a,b,c,d,x,y,a1,b1,c1,d1,x1,y1,a2,b2,c2,d2,x2,y2,a3,b3,c3,d3,x3,y3) { \ + v[ a]+= (m[ x] ^ z[ y]) + v[ b]; \ + v[a1]+= (m[x1] ^ z[y1]) + v[b1]; \ + v[a2]+= (m[x2] ^ z[y2]) + v[b2]; \ + v[a3]+= (m[x3] ^ z[y3]) + v[b3]; \ + \ + v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x1032); \ + v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x1032); \ + v[d2] = __byte_perm(v[d2] ^ v[a2], 0, 0x1032); \ + v[d3] = __byte_perm(v[d3] ^ v[a3], 0, 0x1032); \ + \ + v[ c]+= v[ d]; \ + v[c1]+= v[d1]; \ + v[c2]+= v[d2]; \ + v[c3]+= v[d3]; \ + \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); \ + v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[b2] = ROTR32(v[b2] ^ v[c2], 12); \ + v[b3] = ROTR32(v[b3] ^ v[c3], 12); \ + \ + v[ a]+= (m[ y] ^ z[ x]) + v[ b]; \ + v[a1]+= (m[y1] ^ z[x1]) + v[b1]; \ + v[a2]+= (m[y2] ^ z[x2]) + v[b2]; \ + v[a3]+= (m[y3] ^ z[x3]) + v[b3]; \ + \ + v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x0321); \ + v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x0321); \ + v[d2] = __byte_perm(v[d2] ^ v[a2], 0, 0x0321); \ + v[d3] = __byte_perm(v[d3] ^ v[a3], 0, 0x0321); \ + \ + v[ c]+= v[ d]; \ + v[c1]+= v[d1]; \ + v[c2]+= v[d2]; \ + v[c3]+= v[d3]; \ + \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); \ + v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ + v[b2] = ROTR32(v[b2] ^ v[c2], 7); \ + v[b3] = ROTR32(v[b3] ^ v[c3], 7); \ +} + +#define GS3(a,b,c,d,x,y,a1,b1,c1,d1,x1,y1,a2,b2,c2,d2,x2,y2) { \ + v[ a]+= (m[ x] ^ z[ y]) + v[ b]; \ + v[a1]+= (m[x1] ^ z[y1]) + v[b1]; \ + v[a2]+= (m[x2] ^ z[y2]) + v[b2]; \ + \ + v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x1032); \ + v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x1032); \ + v[d2] = __byte_perm(v[d2] ^ v[a2], 0, 0x1032); \ + \ + v[ c]+= v[ d]; \ + v[c1]+= v[d1]; \ + v[c2]+= v[d2]; \ + \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); \ + v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[b2] = ROTR32(v[b2] ^ v[c2], 12); \ + \ + v[ a]+= (m[ y] ^ z[ x]) + v[ b]; \ + v[a1]+= (m[y1] ^ z[x1]) + v[b1]; \ + v[a2]+= (m[y2] ^ z[x2]) + v[b2]; \ + \ + v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x0321); \ + v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x0321); \ + v[d2] = __byte_perm(v[d2] ^ v[a2], 0, 0x0321); \ + \ + v[ c]+= v[ d]; \ + v[c1]+= v[d1]; \ + v[c2]+= v[d2]; \ + \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); \ + v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ + v[b2] = ROTR32(v[b2] ^ v[c2], 7); \ +} + +#define GS2(a,b,c,d,x,y,a1,b1,c1,d1,x1,y1) { \ + v[ a]+= (m[ x] ^ z[ y]) + v[ b]; \ + v[a1]+= (m[x1] ^ z[y1]) + v[b1]; \ + \ + v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x1032); \ + v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x1032); \ + \ + v[ c]+= v[ d]; \ + v[c1]+= v[d1]; \ + \ + v[ b] = ROTR32(v[ b] ^ v[ c], 
12); \ + v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + \ + v[ a]+= (m[ y] ^ z[ x]) + v[ b]; \ + v[a1]+= (m[y1] ^ z[x1]) + v[b1]; \ + \ + v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x0321); \ + v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x0321); \ + \ + v[ c]+= v[ d]; \ + v[c1]+= v[d1]; \ + \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); \ + v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +} + +#define GS(a,b,c,d,x,y) { \ + v[a] += (m[x] ^ z[y]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 12); \ + v[a] += (m[y] ^ z[x]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); \ +} + +__global__ __launch_bounds__(TPB,1) +void vanilla_gpu_hash_16_8(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce,const uint64_t highTarget){ + uint32_t _ALIGN(16) v[16]; + uint32_t _ALIGN(16) tmp[16]; + + const size_t thread = blockDim.x * blockIdx.x + threadIdx.x; + const uint64_t step = gridDim.x * blockDim.x; + const uint64_t maxNonce = startNonce + threads; + + const int8_t r[][16] = { + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } + }; + const uint32_t z[16] = { + 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 + }; + +//PREFETCH + #pragma unroll + for(int i=0;i<16;i++){ + tmp[ i] = d_data[ i]; + } + + uint32_t m[16] = { + d_data[16], d_data[17], d_data[18], 0, + 0x80000000UL, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640 + }; + + const uint32_t h7 = d_data[19]; + const uint32_t h6 = d_data[20]; +//END OF PREFETCH + uint64_t m3 = startNonce + thread; +loopstart: + if(m3>=maxNonce)return; + m[3] = m3; + + #pragma unroll + for(int i=0;i<16;i++) + v[ i] = tmp[ i]; + + v[ 1]+= m[3] ^ z[2]; + v[13] = __byte_perm(v[13] ^ v[1],0, 0x0321); + v[ 9]+= v[13]; + v[ 5] = ROTR32(v[5] ^ v[9], 7); + v[ 0]+= v[5]; + v[15] = __byte_perm(v[15] ^ v[0],0, 0x1032); + + v[10]+= v[15]; + v[ 5] = ROTR32(v[5] ^ v[10], 12); + v[ 0]+= z[8] + v[5]; + v[15] = __byte_perm(v[15] ^ v[0],0, 0x0321); + v[10]+= v[15]; + v[ 5] = ROTR32(v[5] ^ v[10], 7); + + GS3( 1, 6,11,12,10,11, 2, 7, 8,13,12,13, 3, 4, 9,14,14,15); + + #pragma unroll + for(int i=0;i<6;i++){ + GS4(0, 4, 8,12,r[i][ 0],r[i][ 1], 1, 5, 9,13,r[i][ 2],r[i][ 3], 2, 6,10,14,r[i][ 4],r[i][ 5], 3, 7,11,15,r[i][ 6],r[i][ 7]); + GS4(0, 5,10,15,r[i][ 8],r[i][ 9], 1, 6,11,12,r[i][10],r[i][11], 2, 7, 8,13,r[i][12],r[i][13], 3, 4, 9,14,r[i][14],r[i][15]); + } + GS4(0, 4, 8,12,r[6][ 0],r[6][ 1], 1, 5, 9,13,r[6][ 2],r[6][ 3], 2, 6,10,14,r[6][ 4],r[6][ 5], 3, 7,11,15,r[6][ 6],r[6][ 7]); + + v[ 0] += (m[ 5] ^ z[0]) + v[5]; + v[ 2] += (m[ 8] ^ z[6]) + v[7]; + v[13] = __byte_perm(v[13] ^ v[2],0, 0x1032); + v[15] = __byte_perm(v[15] ^ v[0],0, 0x1032); + + v[ 8] += v[13]; + v[10] += v[15]; + + v[ 5] = ROTR32(v[ 5] ^ v[10], 12); + v[ 7] = ROTR32(v[ 7] ^ v[ 8], 12); + + v[ 0] += (m[ 0] ^ z[5]) + v[5]; + v[ 2] += (m[ 6] ^ z[8]) + v[7]; + + v[15] = __byte_perm(v[15] ^ v[ 0],0, 0x0321); + v[13] = __byte_perm(v[13] ^ v[ 2],0, 0x0321); + + v[8] += v[13]; + v[7] = ROTR32(v[7] ^ v[8], 7); + + // only compute h6 & 7 + 
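+ /* Cheap early rejection: only the lanes feeding the last hash word are finished first; (v[15] ^ h7) == v[7] is equivalent to requiring word 7 of the final hash to be zero, and only then does the kernel compute the lanes needed for word 6 and compare it (byte-swapped) against the highTarget word (ptarget[6]). */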
if((v[15]^h7)==v[7]){ + v[ 1] += (m[15] ^ z[ 4]) + v[6]; + v[ 3] += (m[2] ^ z[10]) + v[4]; + v[12] = __byte_perm(v[12] ^ v[ 1],0, 0x1032); + v[14] = __byte_perm(v[14] ^ v[3],0, 0x1032); + v[11] += v[12]; + v[ 9] += v[14]; + v[ 6] = ROTR32(v[ 6] ^ v[11], 12); + + v[ 1] += (m[ 4] ^ z[15]) + v[ 6]; + v[ 3] += (m[10] ^ z[ 2]) + ROTR32(v[ 4] ^ v[ 9],12); + v[12] = __byte_perm(v[12] ^ v[ 1],0, 0x0321); + v[14] = __byte_perm(v[14] ^ v[ 3],0, 0x0321); + + v[11] += v[12]; + v[ 6] = ROTR32(v[ 6] ^ v[11], 7); + + if(cuda_swab32(h6^v[6]^v[14]) <= highTarget) { +#if NBN == 2 + /* keep the smallest nonce, + extra one if found */ + if (m[3] < resNonce[0]){ + resNonce[1] = resNonce[0]; + resNonce[0] = m[3]; + } + else + resNonce[1] = m[3]; +#else + resNonce[0] = m[3]; +#endif + return; //<-- this may cause a problem on extranonce if the extranonce is on position current_nonce + X * step where X=[1,2,3..,N] + } + } + m3+=step; + goto loopstart; +} + +__host__ +void vanilla_cpu_setBlock_16(const int thr_id,const uint32_t* endiandata, uint32_t *penddata){ + + const uint32_t _ALIGN(64) z[16] = { + SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), SPH_C32(0x13198A2E), SPH_C32(0x03707344), + SPH_C32(0xA4093822), SPH_C32(0x299F31D0), SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), + SPH_C32(0x452821E6), SPH_C32(0x38D01377), SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) + }; + uint32_t _ALIGN(64) h[22]; + + sph_blake256_context ctx; + + sph_blake256_set_rounds(8); + + sph_blake256_init(&ctx); + sph_blake256(&ctx, endiandata, 64); + + h[ 0] = ctx.H[0]; h[ 1] = ctx.H[1]; + h[ 2] = ctx.H[2]; h[21] = ctx.H[3]; + h[ 4] = ctx.H[4]; h[20] = ctx.H[5]; + h[19] = ctx.H[6]; h[16] = ctx.H[7]; + + uint32_t tmp = h[20]; + h[20] = h[19]; + h[19] = h[16]; + h[16] = penddata[ 0]; + h[17] = penddata[ 1]; + h[18] = penddata[ 2]; + h[12] = z[ 4] ^ 640; + h[ 8] = z[ 0]; + + h[ 0] += (h[16] ^ z[ 1]) + h[ 4]; + h[12] = SPH_ROTR32(h[12] ^ h[0],16); + h[ 8] += h[12]; + h[ 4] = SPH_ROTR32(h[ 4] ^ h[ 8], 12); + h[ 0] += (h[17] ^ z[ 0]) + h[ 4]; + h[12] = SPH_ROTR32(h[12] ^ h[0],8); + h[ 8] += h[12]; + h[ 4] = SPH_ROTR32(h[ 4] ^ h[ 8], 7); + + h[1] += (h[18] ^ z[ 3]) + tmp; + + h[13] = SPH_ROTR32(z[ 5] ^ 640 ^ h[1],16); + h[ 5] = ROTR32(tmp ^ (z[ 1] + h[13]), 12); + + h[ 1] += h[ 5]; + h[ 2] += (0x80000000UL ^ z[ 5]) + h[20]; + + h[14] = SPH_ROTR32(z[ 6] ^ h[2], 16); + h[ 6] = z[ 2] + h[14]; + h[ 6] = SPH_ROTR32(h[20] ^ h[ 6], 12); + + h[21] += z[ 7] + h[19]; + h[ 0] += z[ 9]; + + h[ 2] += z[ 4] + h[ 6]; + + h[ 9] = z[ 1] + h[13]; + h[10] = z[ 2] + h[14]; + + h[14] = SPH_ROTR32(h[14] ^ h[2],8); //0x0321 + h[10]+=h[14]; + + h[ 6] = SPH_ROTR32(h[ 6] ^ h[10],7); + h[15] = SPH_ROTR32(z[ 7] ^ h[21],16); + + h[11] = z[ 3] + h[15]; + h[ 7] = SPH_ROTR32(h[19] ^ h[11], 12); + h[ 3] = h[21] + h[ 7] + z[ 6]; + + h[15] = SPH_ROTR32(h[15] ^ h[ 3],8); + h[11]+= h[15]; + h[ 7] = ROTR32(h[ 7] ^ h[11],7); + + cudaMemcpyToSymbolAsync(d_data, h, 21*sizeof(uint32_t), 0, cudaMemcpyHostToDevice, streams[thr_id]); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, const int8_t blakerounds) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t targetHigh = ptarget[6]; + int dev_id = device_map[thr_id]; + + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
30 : 24; + if (device_sm[dev_id] < 350) intensity = 22; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1); + CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)), -1); + cudaStreamCreate(&streams[thr_id]); + init[thr_id] = true; + } + + uint32_t _ALIGN(64) endiandata[20]; + + for (int k = 0; k < 16; k++) + be32enc(&endiandata[k], pdata[k]); + + cudaMemsetAsync(d_resNonce[thr_id], 0xff, sizeof(uint32_t),streams[thr_id]); + + vanilla_cpu_setBlock_16(thr_id,endiandata,&pdata[16]); + + const dim3 grid((throughput + (NPT*TPB)-1)/(NPT*TPB)); + const dim3 block(TPB); + int rc = 0; + + do { + vanilla_gpu_hash_16_8<<<grid, block, 0, streams[thr_id]>>>(throughput, pdata[19], d_resNonce[thr_id], targetHigh); + cudaMemcpyAsync(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost,streams[thr_id]); + *hashes_done = pdata[19] - first_nonce + throughput; + cudaStreamSynchronize(streams[thr_id]); + + if (h_resNonce[thr_id][0] != UINT32_MAX){ + uint32_t vhashcpu[8]; + uint32_t Htarg = (uint32_t)targetHigh; + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + be32enc(&endiandata[19], h_resNonce[thr_id][0]); + vanillahash(vhashcpu, endiandata, blakerounds); + + if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget)) { + work->valid_nonces = 1; + work->nonces[0] = h_resNonce[thr_id][0]; + work_set_target_ratio(work, vhashcpu); +#if NBN > 1 + if (h_resNonce[thr_id][1] != UINT32_MAX) { + work->nonces[1] = h_resNonce[thr_id][1]; + be32enc(&endiandata[19], h_resNonce[thr_id][1]); + vanillahash(vhashcpu, endiandata, blakerounds); + if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio[0]) { + work_set_target_ratio(work, vhashcpu); + xchg(work->nonces[0], work->nonces[1]); + } + work->valid_nonces = 2; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } +#endif + return work->valid_nonces; + } + else if (vhashcpu[6] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[thr_id][0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + MyStreamSynchronize(NULL, 0, dev_id); + return rc; +} + +// cleanup +extern "C" void free_vanilla(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFreeHost(h_resNonce[thr_id]); + cudaFree(d_resNonce[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/COPYING b/COPYING deleted file mode 100644 index 805d7df96b..0000000000 --- a/COPYING +++ /dev/null @@ -1 +0,0 @@ -See LICENSE.txt diff --git a/ChangeLog b/ChangeLog deleted file mode 100644 index 540821528b..0000000000 --- a/ChangeLog +++ /dev/null @@ -1 +0,0 @@
-See README.txt diff --git a/INSTALL b/INSTALL index 2ee1151e63..cac32a1b62 100644 --- a/INSTALL +++ b/INSTALL @@ -1,9 +1,97 @@ - -It is advised to run ./autogen.sh before./configure (autoconf and automake -need to be installed on your system for autogen.sh to work) - -./configure has an option named --with-cuda that allows you to specify -where your CUDA 5.5 toolkit is installed (usually /usr/local/cuda-5.5, -but some distros may have a different default location) - -See README.txt + +You can use ./build.sh to configure and build with default options. + +It is advised to run ./autogen.sh before ./configure (autoconf and automake +need to be installed on your system for autogen.sh to work) + +./configure has an option named --with-cuda that allows you to specify +where your CUDA 6.5 toolkit is installed (usually /usr/local/cuda, +but some distros may have a different default location) + + +** How to compile on Ubuntu (16.04 LTS) + +First, install Cuda toolkit and nVidia Driver, and type `nvidia-smi` to check if your card is detected. + +Install dependencies +```sudo apt-get install libcurl4-openssl-dev libssl-dev libjansson-dev automake autotools-dev build-essential``` + +Ubuntu is now shipped with gcc 6 or 7 so please install gcc/g++ 5 and make it the default (required by the cuda toolkit) +``` +sudo apt-get install gcc-5 g++-5 +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1 +``` + +Then use the helper ./build.sh in ccminer source folder, edit configure.sh and the Makefile.am if required. +``` +./build.sh +./ccminer --version +``` + + +** How to compile on Fedora 25 ** + +Note: You may find an alternative method via rpms : +see https://negativo17.org/nvidia-driver/ and https://negativo17.org/repos/multimedia/ + + +# Step 1: gcc and dependencies +dnf install gcc gcc-c++ autoconf automake +dnf install jansson-devel openssl-devel libcurl-devel zlib-devel + +# Step 2: nvidia drivers (Download common linux drivers from nvidia site) +dnf install kernel-devel +dnf install https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm +dnf check-update +dnf install xorg-x11-drv-nvidia-cuda kmod-nvidia +ln -s libnvidia-ml.so.1 /usr/lib64/libnvidia-ml.so + +# Step 3: CUDA SDK (Download from nvidia the generic ".run" archive) +# --override is required to ignore "too recent" gcc 6.3 +# --silent is required to install only the toolkit (no kmod) +./cuda_8.0.61_375.26_linux.run --toolkit --silent --override +nvcc --version + +# add the nvcc binary path to the system +ln -s /usr/local/cuda-8.0 /usr/local/cuda # (if not already made) +echo 'export PATH=$PATH:/usr/local/cuda/bin' > /etc/profile.d/cuda.sh + +# add the cudart library path to the system +echo /usr/local/cuda/lib64 > /etc/ld.so.conf.d/cuda.conf +ldconfig + +# Step 4: Fix the toolkit incompatibility with gcc 6 + +# You need to build yourself an older GCC/G++ version, i recommend the 5.4 +# see https://gcc.gnu.org/mirrors.html +# Note: this manual method will override the default gcc, it could be better to use a custom toolchain prefix + +wget ftp://ftp.lip6.fr/pub/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2 +dnf install libmpc-devel mpfr-devel gmp-devel +./configure --prefix=/usr/local --enable-languages=c,c++,lto --disable-multilib +make -j 8 && make install +(while this step, you have the time to cook something :p) + +# or, for previous fedora versions, edit the file 
/usr/local/cuda/include/host_config.h +# and comment/delete the line 121 : #error -- unsupported GNU version! gcc versions later than 5 are not supported! + +./build.sh + +./ccminer -n + + +** How to compile on macOS ** + +# Step 1: download and install CUDA Toolkit 8 or more recent +# https://developer.nvidia.com/cuda-toolkit-archive + +# Step 2: install Homebrew +ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + +# Step 3: dependencies +brew install pkg-config autoconf automake curl openssl llvm + +./build.sh + +./ccminer -n + diff --git a/JHA/.deps/.dirstamp b/JHA/.deps/.dirstamp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/JHA/.dirstamp b/JHA/.dirstamp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu index e043e2c41c..35a13b7b9a 100644 --- a/JHA/cuda_jha_compactionTest.cu +++ b/JHA/cuda_jha_compactionTest.cu @@ -1,26 +1,18 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#include "sm_30_intrinsics.h" - #include #include -#include - -// aus cpu-miner.c -extern int device_map[8]; -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; +#include "cuda_helper.h" +#include -static uint32_t *d_tempBranch1Nonces[8]; -static uint32_t *d_numValid[8]; -static uint32_t *h_numValid[8]; +#ifdef __INTELLISENSE__ +#define __shfl_up(a,b) +#endif -static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen +static uint32_t *d_tempBranch1Nonces[MAX_GPUS]; +static uint32_t *d_numValid[MAX_GPUS]; +static uint32_t *h_numValid[MAX_GPUS]; -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen // True/False tester typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); @@ -38,13 +30,13 @@ __device__ uint32_t JackpotFalseTest(uint32_t *inpHash) } __device__ cuda_compactTestFunction_t d_JackpotTrueFunction = JackpotTrueTest, d_JackpotFalseFunction = JackpotFalseTest; -cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8]; -// Setup-Funktionen -__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads) -{ - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); +cuda_compactTestFunction_t h_JackpotTrueFunction[MAX_GPUS], h_JackpotFalseFunction[MAX_GPUS]; +// Setup-Function +__host__ +void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads) +{ cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t)); @@ -60,8 +52,30 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads) cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) } +__host__ +void jackpot_compactTest_cpu_free(int thr_id) +{ + cudaFree(d_tempBranch1Nonces[thr_id]); + cudaFree(d_numValid[thr_id]); + + cudaFree(d_partSum[0][thr_id]); + cudaFree(d_partSum[1][thr_id]); + + cudaFreeHost(h_numValid[thr_id]); +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 +/** + * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 + */ +#undef __shfl_up +#define __shfl_up(var, delta, width) (0) +#endif + // Die Summenfunktion (vom NVIDIA SDK) -__global__ void 
jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) +__global__ +void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, + uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) { extern __shared__ uint32_t sums[]; int id = ((blockIdx.x * blockDim.x) + threadIdx.x); @@ -88,7 +102,7 @@ __global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t inpHash = &inpHashes[id<<4]; }else { - // Nonce-Liste verfügbar + // Nonce-Liste verfügbar int nonce = d_validNonceTable[id] - startNounce; inpHash = &inpHashes[nonce<<4]; } @@ -173,7 +187,8 @@ __global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t } // Uniform add: add partial sums array -__global__ void jackpot_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) +__global__ +void jackpot_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) { __shared__ uint32_t buf; int id = ((blockIdx.x * blockDim.x) + threadIdx.x); @@ -190,7 +205,9 @@ __global__ void jackpot_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_su } // Der Scatter -__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) +__global__ +void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, + uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) { int id = ((blockIdx.x * blockDim.x) + threadIdx.x); uint32_t actNounce = id; @@ -205,7 +222,7 @@ __global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, c inpHash = &inpHashes[id<<4]; }else { - // Nonce-Liste verfügbar + // Nonce-Liste verfügbar int nonce = d_validNonceTable[id] - startNounce; actNounce = nonce; inpHash = &inpHashes[nonce<<4]; @@ -225,7 +242,8 @@ __global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, c } } -__host__ static uint32_t jackpot_compactTest_roundUpExp(uint32_t val) +__host__ +static uint32_t jackpot_compactTest_roundUpExp(uint32_t val) { if(val == 0) return 0; @@ -239,9 +257,9 @@ __host__ static uint32_t jackpot_compactTest_roundUpExp(uint32_t val) return mask; } -__host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm, - uint32_t *d_nonces1, cuda_compactTestFunction_t function, - uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) +__host__ +void jackpot_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1, + cuda_compactTestFunction_t function, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) { int orgThreads = threads; threads = (int)jackpot_compactTest_roundUpExp((uint32_t)threads); @@ -295,9 +313,9 @@ __host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, int threads, } ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) -__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm, - uint32_t *d_nonces1, uint32_t *d_nonces2, - uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) +__host__ +void 
jackpot_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1, + uint32_t *d_nonces2, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) { jackpot_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_JackpotTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); jackpot_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_JackpotFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); @@ -334,19 +352,18 @@ __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, ui */ } -__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, size_t *nrm1, - uint32_t *d_nonces2, size_t *nrm2, - int order) +__host__ +void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, + uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order) { // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind - // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! + // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! jackpot_compactTest_cpu_dualCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, d_nonces2, startNounce, inpHashes, d_validNonceTable); cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser - *nrm1 = (size_t)h_numValid[thr_id][0]; - *nrm2 = (size_t)h_numValid[thr_id][1]; + *nrm1 = h_numValid[thr_id][0]; + *nrm2 = h_numValid[thr_id][1]; } diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu index be5b61ac57..9192a8a5cd 100644 --- a/JHA/cuda_jha_keccak512.cu +++ b/JHA/cuda_jha_keccak512.cu @@ -1,191 +1,118 @@ - - -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#include "cuda_helper.h" +#include "miner.h" -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +// ZR5 +__constant__ uint32_t d_OriginalData[20]; -__constant__ uint64_t c_State[25]; __constant__ uint32_t c_PaddedMessage[18]; - -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return __byte_perm(x, 0, 0x0123); -} - -// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt -#if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { - uint2 result; - if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return 
__double_as_longlong(__hiloint2double(result.y, result.x)); -} -#else -#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) -#endif +__constant__ uint64_t c_State[25]; #define U32TO64_LE(p) \ - (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) #define U64TO32_LE(p, v) \ - *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); static const uint64_t host_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull }; __constant__ uint64_t c_keccak_round_constants[24]; static __device__ __forceinline__ void keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { - size_t i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ -#pragma unroll 9 - for (i = 0; i < 72 / 8; i++, in += 2) - s[i] ^= U32TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} - -__global__ void jackpot_keccak512_gpu_hash(int threads, uint32_t startNounce, uint64_t *g_hash) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = startNounce + thread; - - int hashPosition = nounce - startNounce; - - // Nachricht kopieren - uint32_t message[18]; -#pragma unroll 18 - for(int i=0;i<18;i++) - message[i] = c_PaddedMessage[i]; - - // die individuelle Nounce einsetzen - message[1] = cuda_swab32(nounce); - - // State initialisieren - uint64_t keccak_gpu_state[25]; -#pragma unroll 25 - for (int i=0; i<25; i++) - keccak_gpu_state[i] = c_State[i]; - - // den Block einmal gut durchschütteln - keccak_block(keccak_gpu_state, message, c_keccak_round_constants); - - // das Hash erzeugen - uint32_t hash[16]; - -#pragma unroll 8 - for (size_t i = 0; i < 64; i += 8) { - U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); - } - - // fertig - uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; - -#pragma unroll 16 - for(int i=0;i<16;i++) - outpHash[i] = hash[i]; - } + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ + #pragma unroll 9 + for (i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } } // Setup-Funktionen -__host__ void jackpot_keccak512_cpu_init(int thr_id, int threads) +__host__ +void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads) { - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_keccak_round_constants, - host_keccak_round_constants, - sizeof(host_keccak_round_constants), - 0, cudaMemcpyHostToDevice); + // Kopiere die Hash-Tabellen in den GPU-Speicher + cudaMemcpyToSymbol( c_keccak_round_constants, + host_keccak_round_constants, + sizeof(host_keccak_round_constants), + 0, cudaMemcpyHostToDevice); } #define cKeccakB 1600 @@ -194,379 +121,546 @@ __host__ void jackpot_keccak512_cpu_init(int thr_id, int threads) #define cKeccakR_SizeInBytes (cKeccakR / 8) #define crypto_hash_BYTES 64 -#if (cKeccakB == 1600) - typedef unsigned long long UINT64; - typedef UINT64 tKeccakLane; - #define cKeccakNumberOfRounds 24 +#if (cKeccakB == 1600) + typedef unsigned long long UINT64; + typedef UINT64 tKeccakLane; + #define cKeccakNumberOfRounds 24 #endif #define cKeccakLaneSizeInBits (sizeof(tKeccakLane) * 8) #define ROL(a, offset) ((((tKeccakLane)a) << ((offset) % cKeccakLaneSizeInBits)) ^ (((tKeccakLane)a) >> (cKeccakLaneSizeInBits-((offset) % cKeccakLaneSizeInBits)))) #if ((cKeccakB/25) == 8) - #define ROL_mult8(a, offset) ((tKeccakLane)a) + #define ROL_mult8(a, offset) ((tKeccakLane)a) #else - #define ROL_mult8(a, offset) ROL(a, offset) + #define ROL_mult8(a, offset) ROL(a, offset) #endif -void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ); -const tKeccakLane 
KeccakF_RoundConstants[cKeccakNumberOfRounds] = -{ - (tKeccakLane)0x0000000000000001ULL, - (tKeccakLane)0x0000000000008082ULL, - (tKeccakLane)0x800000000000808aULL, - (tKeccakLane)0x8000000080008000ULL, - (tKeccakLane)0x000000000000808bULL, - (tKeccakLane)0x0000000080000001ULL, - (tKeccakLane)0x8000000080008081ULL, - (tKeccakLane)0x8000000000008009ULL, - (tKeccakLane)0x000000000000008aULL, - (tKeccakLane)0x0000000000000088ULL, - (tKeccakLane)0x0000000080008009ULL, - (tKeccakLane)0x000000008000000aULL, - (tKeccakLane)0x000000008000808bULL, - (tKeccakLane)0x800000000000008bULL, - (tKeccakLane)0x8000000000008089ULL, - (tKeccakLane)0x8000000000008003ULL, - (tKeccakLane)0x8000000000008002ULL, - (tKeccakLane)0x8000000000000080ULL - #if (cKeccakB >= 400) +const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = { + (tKeccakLane)0x0000000000000001ULL, + (tKeccakLane)0x0000000000008082ULL, + (tKeccakLane)0x800000000000808aULL, + (tKeccakLane)0x8000000080008000ULL, + (tKeccakLane)0x000000000000808bULL, + (tKeccakLane)0x0000000080000001ULL, + (tKeccakLane)0x8000000080008081ULL, + (tKeccakLane)0x8000000000008009ULL, + (tKeccakLane)0x000000000000008aULL, + (tKeccakLane)0x0000000000000088ULL, + (tKeccakLane)0x0000000080008009ULL, + (tKeccakLane)0x000000008000000aULL, + (tKeccakLane)0x000000008000808bULL, + (tKeccakLane)0x800000000000008bULL, + (tKeccakLane)0x8000000000008089ULL, + (tKeccakLane)0x8000000000008003ULL, + (tKeccakLane)0x8000000000008002ULL, + (tKeccakLane)0x8000000000000080ULL +#if (cKeccakB >= 400) , (tKeccakLane)0x000000000000800aULL, - (tKeccakLane)0x800000008000000aULL - #if (cKeccakB >= 800) + (tKeccakLane)0x800000008000000aULL +#if (cKeccakB >= 800) , (tKeccakLane)0x8000000080008081ULL, - (tKeccakLane)0x8000000000008080ULL - #if (cKeccakB == 1600) + (tKeccakLane)0x8000000000008080ULL +#if (cKeccakB == 1600) , (tKeccakLane)0x0000000080000001ULL, - (tKeccakLane)0x8000000080008008ULL - #endif - #endif - #endif + (tKeccakLane)0x8000000080008008ULL +#endif +#endif +#endif }; -void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount ) +void KeccakF(tKeccakLane * state, const tKeccakLane *in, int laneCount) { - - { - while ( --laneCount >= 0 ) - { - state[laneCount] ^= in[laneCount]; - } - } - - { - tKeccakLane Aba, Abe, Abi, Abo, Abu; - tKeccakLane Aga, Age, Agi, Ago, Agu; - tKeccakLane Aka, Ake, Aki, Ako, Aku; - tKeccakLane Ama, Ame, Ami, Amo, Amu; - tKeccakLane Asa, Ase, Asi, Aso, Asu; - tKeccakLane BCa, BCe, BCi, BCo, BCu; - tKeccakLane Da, De, Di, Do, Du; - tKeccakLane Eba, Ebe, Ebi, Ebo, Ebu; - tKeccakLane Ega, Ege, Egi, Ego, Egu; - tKeccakLane Eka, Eke, Eki, Eko, Eku; - tKeccakLane Ema, Eme, Emi, Emo, Emu; - tKeccakLane Esa, Ese, Esi, Eso, Esu; - #define round laneCount - - //copyFromState(A, state) - Aba = state[ 0]; - Abe = state[ 1]; - Abi = state[ 2]; - Abo = state[ 3]; - Abu = state[ 4]; - Aga = state[ 5]; - Age = state[ 6]; - Agi = state[ 7]; - Ago = state[ 8]; - Agu = state[ 9]; - Aka = state[10]; - Ake = state[11]; - Aki = state[12]; - Ako = state[13]; - Aku = state[14]; - Ama = state[15]; - Ame = state[16]; - Ami = state[17]; - Amo = state[18]; - Amu = state[19]; - Asa = state[20]; - Ase = state[21]; - Asi = state[22]; - Aso = state[23]; - Asu = state[24]; - - for( round = 0; round < cKeccakNumberOfRounds; round += 2 ) - { - // prepareTheta - BCa = Aba^Aga^Aka^Ama^Asa; - BCe = Abe^Age^Ake^Ame^Ase; - BCi = Abi^Agi^Aki^Ami^Asi; - BCo = Abo^Ago^Ako^Amo^Aso; - BCu = Abu^Agu^Aku^Amu^Asu; - - //thetaRhoPiChiIotaPrepareTheta(round , A, E) - Da = BCu^ROL(BCe, 
1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Aba ^= Da; - BCa = Aba; - Age ^= De; - BCe = ROL(Age, 44); - Aki ^= Di; - BCi = ROL(Aki, 43); - Amo ^= Do; - BCo = ROL(Amo, 21); - Asu ^= Du; - BCu = ROL(Asu, 14); - Eba = BCa ^((~BCe)& BCi ); - Eba ^= (tKeccakLane)KeccakF_RoundConstants[round]; - Ebe = BCe ^((~BCi)& BCo ); - Ebi = BCi ^((~BCo)& BCu ); - Ebo = BCo ^((~BCu)& BCa ); - Ebu = BCu ^((~BCa)& BCe ); - - Abo ^= Do; - BCa = ROL(Abo, 28); - Agu ^= Du; - BCe = ROL(Agu, 20); - Aka ^= Da; - BCi = ROL(Aka, 3); - Ame ^= De; - BCo = ROL(Ame, 45); - Asi ^= Di; - BCu = ROL(Asi, 61); - Ega = BCa ^((~BCe)& BCi ); - Ege = BCe ^((~BCi)& BCo ); - Egi = BCi ^((~BCo)& BCu ); - Ego = BCo ^((~BCu)& BCa ); - Egu = BCu ^((~BCa)& BCe ); - - Abe ^= De; - BCa = ROL(Abe, 1); - Agi ^= Di; - BCe = ROL(Agi, 6); - Ako ^= Do; - BCi = ROL(Ako, 25); - Amu ^= Du; - BCo = ROL_mult8(Amu, 8); - Asa ^= Da; - BCu = ROL(Asa, 18); - Eka = BCa ^((~BCe)& BCi ); - Eke = BCe ^((~BCi)& BCo ); - Eki = BCi ^((~BCo)& BCu ); - Eko = BCo ^((~BCu)& BCa ); - Eku = BCu ^((~BCa)& BCe ); - - Abu ^= Du; - BCa = ROL(Abu, 27); - Aga ^= Da; - BCe = ROL(Aga, 36); - Ake ^= De; - BCi = ROL(Ake, 10); - Ami ^= Di; - BCo = ROL(Ami, 15); - Aso ^= Do; - BCu = ROL_mult8(Aso, 56); - Ema = BCa ^((~BCe)& BCi ); - Eme = BCe ^((~BCi)& BCo ); - Emi = BCi ^((~BCo)& BCu ); - Emo = BCo ^((~BCu)& BCa ); - Emu = BCu ^((~BCa)& BCe ); - - Abi ^= Di; - BCa = ROL(Abi, 62); - Ago ^= Do; - BCe = ROL(Ago, 55); - Aku ^= Du; - BCi = ROL(Aku, 39); - Ama ^= Da; - BCo = ROL(Ama, 41); - Ase ^= De; - BCu = ROL(Ase, 2); - Esa = BCa ^((~BCe)& BCi ); - Ese = BCe ^((~BCi)& BCo ); - Esi = BCi ^((~BCo)& BCu ); - Eso = BCo ^((~BCu)& BCa ); - Esu = BCu ^((~BCa)& BCe ); - - // prepareTheta - BCa = Eba^Ega^Eka^Ema^Esa; - BCe = Ebe^Ege^Eke^Eme^Ese; - BCi = Ebi^Egi^Eki^Emi^Esi; - BCo = Ebo^Ego^Eko^Emo^Eso; - BCu = Ebu^Egu^Eku^Emu^Esu; - - //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) - Da = BCu^ROL(BCe, 1); - De = BCa^ROL(BCi, 1); - Di = BCe^ROL(BCo, 1); - Do = BCi^ROL(BCu, 1); - Du = BCo^ROL(BCa, 1); - - Eba ^= Da; - BCa = Eba; - Ege ^= De; - BCe = ROL(Ege, 44); - Eki ^= Di; - BCi = ROL(Eki, 43); - Emo ^= Do; - BCo = ROL(Emo, 21); - Esu ^= Du; - BCu = ROL(Esu, 14); - Aba = BCa ^((~BCe)& BCi ); - Aba ^= (tKeccakLane)KeccakF_RoundConstants[round+1]; - Abe = BCe ^((~BCi)& BCo ); - Abi = BCi ^((~BCo)& BCu ); - Abo = BCo ^((~BCu)& BCa ); - Abu = BCu ^((~BCa)& BCe ); - - Ebo ^= Do; - BCa = ROL(Ebo, 28); - Egu ^= Du; - BCe = ROL(Egu, 20); - Eka ^= Da; - BCi = ROL(Eka, 3); - Eme ^= De; - BCo = ROL(Eme, 45); - Esi ^= Di; - BCu = ROL(Esi, 61); - Aga = BCa ^((~BCe)& BCi ); - Age = BCe ^((~BCi)& BCo ); - Agi = BCi ^((~BCo)& BCu ); - Ago = BCo ^((~BCu)& BCa ); - Agu = BCu ^((~BCa)& BCe ); - - Ebe ^= De; - BCa = ROL(Ebe, 1); - Egi ^= Di; - BCe = ROL(Egi, 6); - Eko ^= Do; - BCi = ROL(Eko, 25); - Emu ^= Du; - BCo = ROL_mult8(Emu, 8); - Esa ^= Da; - BCu = ROL(Esa, 18); - Aka = BCa ^((~BCe)& BCi ); - Ake = BCe ^((~BCi)& BCo ); - Aki = BCi ^((~BCo)& BCu ); - Ako = BCo ^((~BCu)& BCa ); - Aku = BCu ^((~BCa)& BCe ); - - Ebu ^= Du; - BCa = ROL(Ebu, 27); - Ega ^= Da; - BCe = ROL(Ega, 36); - Eke ^= De; - BCi = ROL(Eke, 10); - Emi ^= Di; - BCo = ROL(Emi, 15); - Eso ^= Do; - BCu = ROL_mult8(Eso, 56); - Ama = BCa ^((~BCe)& BCi ); - Ame = BCe ^((~BCi)& BCo ); - Ami = BCi ^((~BCo)& BCu ); - Amo = BCo ^((~BCu)& BCa ); - Amu = BCu ^((~BCa)& BCe ); - - Ebi ^= Di; - BCa = ROL(Ebi, 62); - Ego ^= Do; - BCe = ROL(Ego, 55); - Eku ^= Du; - BCi = ROL(Eku, 39); - Ema ^= 
Da; - BCo = ROL(Ema, 41); - Ese ^= De; - BCu = ROL(Ese, 2); - Asa = BCa ^((~BCe)& BCi ); - Ase = BCe ^((~BCi)& BCo ); - Asi = BCi ^((~BCo)& BCu ); - Aso = BCo ^((~BCu)& BCa ); - Asu = BCu ^((~BCa)& BCe ); - } - - //copyToState(state, A) - state[ 0] = Aba; - state[ 1] = Abe; - state[ 2] = Abi; - state[ 3] = Abo; - state[ 4] = Abu; - state[ 5] = Aga; - state[ 6] = Age; - state[ 7] = Agi; - state[ 8] = Ago; - state[ 9] = Agu; - state[10] = Aka; - state[11] = Ake; - state[12] = Aki; - state[13] = Ako; - state[14] = Aku; - state[15] = Ama; - state[16] = Ame; - state[17] = Ami; - state[18] = Amo; - state[19] = Amu; - state[20] = Asa; - state[21] = Ase; - state[22] = Asi; - state[23] = Aso; - state[24] = Asu; - - #undef round - } + while ( --laneCount >= 0 ) { + state[laneCount] ^= in[laneCount]; + } + + { + tKeccakLane Aba, Abe, Abi, Abo, Abu; + tKeccakLane Aga, Age, Agi, Ago, Agu; + tKeccakLane Aka, Ake, Aki, Ako, Aku; + tKeccakLane Ama, Ame, Ami, Amo, Amu; + tKeccakLane Asa, Ase, Asi, Aso, Asu; + tKeccakLane BCa, BCe, BCi, BCo, BCu; + tKeccakLane Da, De, Di, Do, Du; + tKeccakLane Eba, Ebe, Ebi, Ebo, Ebu; + tKeccakLane Ega, Ege, Egi, Ego, Egu; + tKeccakLane Eka, Eke, Eki, Eko, Eku; + tKeccakLane Ema, Eme, Emi, Emo, Emu; + tKeccakLane Esa, Ese, Esi, Eso, Esu; + #define round laneCount + + //copyFromState(A, state) + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( round = 0; round < cKeccakNumberOfRounds; round += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (tKeccakLane)KeccakF_RoundConstants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL_mult8(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL_mult8(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi 
^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (tKeccakLane)KeccakF_RoundConstants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL_mult8(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL_mult8(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + //copyToState(state, A) + state[ 0] = Aba; + state[ 1] = Abe; + state[ 2] = Abi; + state[ 3] = Abo; + state[ 4] = Abu; + state[ 5] = Aga; + state[ 6] = Age; + state[ 7] = Agi; + state[ 8] = Ago; + state[ 9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; + + #undef round + } } // inlen kann 72...143 betragen -__host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) +__host__ +void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen) +{ + const unsigned char *in = (const unsigned char*)pdata; + + tKeccakLane state[5 * 5]; + unsigned char temp[cKeccakR_SizeInBytes]; + + memset( state, 0, sizeof(state) ); + + for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes) + { + KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) ); + } + + // Copy state of the first round (72 Bytes) + // in Constant Memory + cudaMemcpyToSymbol( c_State, + state, + sizeof(state), + 0, 
cudaMemcpyHostToDevice); + + // second part + memcpy(temp, in, inlen); + temp[inlen++] = 1; + memset(temp + inlen, 0, cKeccakR_SizeInBytes - inlen); + temp[cKeccakR_SizeInBytes-1] |= 0x80; + + // Copy rest of the message in constant memory + cudaMemcpyToSymbol( c_PaddedMessage, + temp, + cKeccakR_SizeInBytes, + 0, cudaMemcpyHostToDevice); +} + +__global__ +void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + + int hashPosition = nounce - startNounce; + + uint32_t message[18]; + #pragma unroll 18 + for(int i=0;i<18;i++) + message[i] = c_PaddedMessage[i]; + + message[1] = cuda_swab32(nounce); + + // State init + uint64_t keccak_gpu_state[25]; + #pragma unroll 25 + for (int i=0; i<25; i++) + keccak_gpu_state[i] = c_State[i]; + + // den Block einmal gut durchschütteln + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + uint32_t hash[16]; + + #pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); + } + + // copy hash + uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; + + #pragma unroll 16 + for(int i=0;i<16;i++) + outpHash[i] = hash[i]; + } +} + +__host__ +void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) { - const unsigned char *in = (const unsigned char*)pdata; + const uint32_t threadsperblock = 256; - tKeccakLane state[5 * 5]; - unsigned char temp[cKeccakR_SizeInBytes]; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - memset( state, 0, sizeof(state) ); + size_t shared_size = 0; - for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes ) - { - KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) ); - } + jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); + MyStreamSynchronize(NULL, order, thr_id); +} - // Kopiere den state nach der ersten Runde (nach Absorption von 72 Bytes Inputdaten) - // ins Constant Memory - cudaMemcpyToSymbol( c_State, - state, - sizeof(state), - 0, cudaMemcpyHostToDevice); - // padding - memcpy( temp, in, (size_t)inlen ); - temp[inlen++] = 1; - memset( temp+inlen, 0, cKeccakR_SizeInBytes - (size_t)inlen ); - temp[cKeccakR_SizeInBytes-1] |= 0x80; +/* zr5 keccak, no nonce swab32 */ +__global__ +void zr5_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + uint32_t message[18]; + + #pragma unroll 18 + for(int i=0; i<18; i++) + message[i] = c_PaddedMessage[i]; + + message[1] = nounce; + + // Get mid-state + uint64_t keccak_gpu_state[25]; + #pragma unroll 25 + for (int i=0; i<25; i++) + keccak_gpu_state[i] = c_State[i]; + + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + uint32_t hash[16]; + + #pragma unroll 8 + for (int i = 0; i < 8; i++) { + U64TO32_LE((&hash[i*2]), keccak_gpu_state[i]); + } + + // Output (64 bytes hash required) + uint32_t hashPosition = nounce - startNounce; + //uint32_t *outpHash = (uint32_t*) (&g_hash[hashPosition*8]); + //#pragma unroll 16 + //for(int i=0; i<16; i++) + // outpHash[i] = hash[i]; + + uint4 *outpHash = (uint4*) (&g_hash[hashPosition*8]); + uint4 *psrc = (uint4*) hash; + outpHash[0] = psrc[0]; + 
outpHash[1] = psrc[1]; + outpHash[2] = psrc[2]; + outpHash[3] = psrc[3]; + } +} - // Kopiere den Rest der Message und das Padding ins Constant Memory - cudaMemcpyToSymbol( c_PaddedMessage, - temp, - cKeccakR_SizeInBytes, - 0, cudaMemcpyHostToDevice); +__host__ +void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + zr5_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); + MyStreamSynchronize(NULL, 0, thr_id); } -__host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order) +/* required for the second hash part of zr5 */ + +__global__ +void zr5_keccak512_gpu_hash_pok(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint16_t *d_poks, uint32_t version) { - const int threadsperblock = 256; + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + + uint32_t message[18]; /* 72 bytes */ + + // pok - hash[0] from prev hash + message[0] = version | (0x10000UL * d_poks[thread]); + #pragma unroll + for (int i=1; i<18; i++) { + message[i]=d_OriginalData[i]; + } + + // first bloc + uint64_t keccak_gpu_state[25] = { 0 }; + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + // second bloc + message[0] = d_OriginalData[18]; + message[1] = nounce; //cuda_swab32(nounce); + message[2] = 1; + + #pragma unroll + for(int i=3; i<17; i++) + message[i] = 0; + + message[17] = 0x80000000UL; + + keccak_block(keccak_gpu_state, message, c_keccak_round_constants); + + uint32_t hash[16]; + + #pragma unroll 8 + for (size_t i = 0; i < 8; i++) { + U64TO32_LE((&hash[i*2]), keccak_gpu_state[i]); + } + + //uint32_t *outpHash = &g_hash[thread * 16]; + //#pragma unroll 16 + //for(int i=0; i<16; i++) + // outpHash[i] = hash[i]; + + uint4 *outpHash = (uint4*) (&g_hash[thread * 16]); + uint4 *psrc = (uint4*) hash; + outpHash[0] = psrc[0]; + outpHash[1] = psrc[1]; + outpHash[2] = psrc[2]; + outpHash[3] = psrc[3]; + } +} - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); +__host__ +void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks) +{ + const uint32_t threadsperblock = 256; + const uint32_t version = (pdata[0] & (~POK_DATA_MASK)) | (use_pok ? 
POK_BOOL_MASK : 0); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - jackpot_keccak512_gpu_hash<<>>(threads, startNounce, (uint64_t*)d_hash); - MyStreamSynchronize(NULL, order, thr_id); + cudaMemcpyToSymbol(d_OriginalData, pdata, sizeof(d_OriginalData), 0, cudaMemcpyHostToDevice); + zr5_keccak512_gpu_hash_pok<<>>(threads, startNounce, d_hash, d_poks, version); + MyStreamSynchronize(NULL, 10, thr_id); } diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index 7d6d4cf5b7..78e5bd02e7 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -1,4 +1,3 @@ - extern "C" { #include "sph/sph_keccak.h" @@ -6,135 +5,131 @@ extern "C" #include "sph/sph_groestl.h" #include "sph/sph_jh.h" #include "sph/sph_skein.h" -#include "miner.h" } -#include +#include "miner.h" +#include "cuda_helper.h" +#include "quark/cuda_quark.h" -// aus cpu-miner.c -extern int device_map[8]; +static uint32_t *d_hash[MAX_GPUS] = { 0 }; -// Speicher für Input/Output der verketteten Hashfunktionen -static uint32_t *d_hash[8]; +// Speicher zur Generierung der Noncevektoren für die bedingten Hashes +static uint32_t *d_jackpotNonces[MAX_GPUS] = { 0 }; +static uint32_t *d_branch1Nonces[MAX_GPUS] = { 0 }; +static uint32_t *d_branch2Nonces[MAX_GPUS] = { 0 }; +static uint32_t *d_branch3Nonces[MAX_GPUS] = { 0 }; -extern void jackpot_keccak512_cpu_init(int thr_id, int threads); +extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads); extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen); -extern void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_init(int thr_id, int threads); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads); +extern void jackpot_compactTest_cpu_free(int thr_id); +extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, + uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order); -extern void quark_groestl512_cpu_init(int thr_id, int threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); -extern void quark_jh512_cpu_init(int thr_id, int threads); -extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_skein512_cpu_init(int thr_id, int threads); -extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +// CPU HASH JHA v8 +extern "C" void jackpothash(void *state, const void *input) +{ + uint32_t hash[16]; + unsigned int rnd; -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, 
uint32_t *d_inputHash, int order); + sph_blake512_context ctx_blake; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; -extern void jackpot_compactTest_cpu_init(int thr_id, int threads); -extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, size_t *nrm1, - uint32_t *d_nonces2, size_t *nrm2, - int order); + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, input, 80); + sph_keccak512_close(&ctx_keccak, hash); -// Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_jackpotNonces[8]; -static uint32_t *d_branch1Nonces[8]; -static uint32_t *d_branch2Nonces[8]; -static uint32_t *d_branch3Nonces[8]; + for (rnd = 0; rnd < 3; rnd++) + { + if (hash[0] & 0x01) { + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, (&hash), 64); + sph_groestl512_close(&ctx_groestl, (&hash)); + } + else { + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, (&hash), 64); + sph_skein512_close(&ctx_skein, (&hash)); + } -// Original jackpothash Funktion aus einem miner Quelltext -inline unsigned int jackpothash(void *state, const void *input) -{ - sph_blake512_context ctx_blake; - sph_groestl512_context ctx_groestl; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - - uint32_t hash[16]; - - sph_keccak512_init(&ctx_keccak); - sph_keccak512 (&ctx_keccak, input, 80); - sph_keccak512_close(&ctx_keccak, hash); - - unsigned int round; - for (round = 0; round < 3; round++) { - if (hash[0] & 0x01) { - sph_groestl512_init(&ctx_groestl); - sph_groestl512 (&ctx_groestl, (&hash), 64); - sph_groestl512_close(&ctx_groestl, (&hash)); - } - else { - sph_skein512_init(&ctx_skein); - sph_skein512 (&ctx_skein, (&hash), 64); - sph_skein512_close(&ctx_skein, (&hash)); - } - if (hash[0] & 0x01) { - sph_blake512_init(&ctx_blake); - sph_blake512 (&ctx_blake, (&hash), 64); - sph_blake512_close(&ctx_blake, (&hash)); - } - else { - sph_jh512_init(&ctx_jh); - sph_jh512 (&ctx_jh, (&hash), 64); - sph_jh512_close(&ctx_jh, (&hash)); - } - } - memcpy(state, hash, 32); - - return round; + if (hash[0] & 0x01) { + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, (&hash), 64); + sph_blake512_close(&ctx_blake, (&hash)); + } + else { + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, (&hash), 64); + sph_jh512_close(&ctx_jh, (&hash)); + } + } + memcpy(state, hash, 32); } +static bool init[MAX_GPUS] = { 0 }; -extern bool opt_benchmark; - -extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) { + uint32_t _ALIGN(64) endiandata[22]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + int dev_id = device_map[thr_id]; - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); - const uint32_t Htarg = ptarget[7]; - - const int throughput = 256*4096*4; // 100; + if (opt_benchmark) + ptarget[7] = 0x000f; - static bool init[8] = {0,0,0,0,0,0,0,0}; if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && 
gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + cuda_get_arch(thr_id); + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { + gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)"); + proper_exit(EXIT_CODE_CUDA_ERROR); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); - // Konstanten kopieren, Speicher belegen - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); jackpot_keccak512_cpu_init(thr_id, throughput); jackpot_compactTest_cpu_init(thr_id, throughput); quark_blake512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); - cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2); - cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2); - cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2); - cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput*2); + + cuda_check_cpu_init(thr_id, throughput); + + cudaMalloc(&d_branch1Nonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2); + cudaMalloc(&d_branch2Nonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2); + cudaMalloc(&d_branch3Nonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2); + + CUDA_SAFE_CALL(cudaMalloc(&d_jackpotNonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2)); + init[thr_id] = true; } - uint32_t endiandata[22]; for (int k=0; k < 22; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); jackpot_keccak512_cpu_setBlock((void*)endiandata, 80); - quark_check_cpu_setTarget(ptarget); + cuda_check_cpu_setTarget(ptarget); do { int order = 0; @@ -142,9 +137,9 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, // erstes Keccak512 Hash mit CUDA jackpot_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - size_t nrm1, nrm2, nrm3; + uint32_t nrm1, nrm2, nrm3; - // Runde 1 (ohne Gröstl) + // Runde 1 (ohne Gröstl) jackpot_compactTest_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, d_branch1Nonces[thr_id], &nrm1, @@ -167,7 +162,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, // Runde 3 (komplett) - // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) + // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], d_branch1Nonces[thr_id], &nrm1, d_branch2Nonces[thr_id], &nrm2, @@ -178,7 +173,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); } - // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) + // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], d_branch1Nonces[thr_id], &nrm1, d_branch2Nonces[thr_id], &nrm2, @@ -191,7 +186,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, // Runde 3 (komplett) - // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) + // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) 
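The comment above marks the split step of each JHA round: the surviving nonces are partitioned into two lists according to bit 0 of the first hash word, so the groestl/blake kernels only see the nonces that take that branch and the skein/jh kernels only see the rest. Conceptually this is just a predicate-based partition; the sketch below is a simplified CPU equivalent with hypothetical names (`split_by_bit0`, `SplitResult`), whereas the real `jackpot_compactTest` code performs it on the GPU with a prefix-sum scan and scatter.

```
// Illustrative CPU equivalent of the nonce split: partition indices by bit 0
// of the first hash word (16 hash words per entry, as in d_hash).
#include <cstdint>
#include <vector>

struct SplitResult {
	std::vector<uint32_t> branch1; // hash[0] & 1 == 1 -> groestl / blake path
	std::vector<uint32_t> branch2; // hash[0] & 1 == 0 -> skein / jh path
};

static SplitResult split_by_bit0(const std::vector<uint32_t> &indices,
                                 const uint32_t *hashes /* 16 words each */)
{
	SplitResult r;
	for (uint32_t idx : indices) {
		if (hashes[idx * 16] & 0x01)
			r.branch1.push_back(idx);
		else
			r.branch2.push_back(idx);
	}
	return r;
}

int main()
{
	std::vector<uint32_t> hashes(3 * 16, 0);
	hashes[0 * 16] = 1; hashes[1 * 16] = 2; hashes[2 * 16] = 5; // bit 0: 1,0,1
	SplitResult r = split_by_bit0({0, 1, 2}, hashes.data());
	// r.branch1 == {0, 2}, r.branch2 == {1}
	return 0;
}
```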
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], d_branch1Nonces[thr_id], &nrm1, d_branch2Nonces[thr_id], &nrm2, @@ -202,7 +197,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); } - // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) + // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], d_branch1Nonces[thr_id], &nrm1, d_branch2Nonces[thr_id], &nrm2, @@ -213,31 +208,88 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); } - // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) - { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); + *hashes_done = pdata[19] - first_nonce + throughput; - // diese jackpothash Funktion gibt die Zahl der Runden zurück - unsigned int rounds = jackpothash(vhash64, endiandata); + CUDA_LOG_ERROR(); - if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { + work->nonces[0] = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - pdata[19] = foundNonce; - *hashes_done = (foundNonce - first_nonce + 1)/2; - //applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds); - return 1; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", thr_id, foundNonce, rounds); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + + // jackpothash function gibt die Zahl der Runden zurück + jackpothash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); +#if 0 + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + jackpothash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } +#else + pdata[19] = work->nonces[0] + 1; // cursor +#endif + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + CUDA_LOG_ERROR(); - *hashes_done = (pdata[19] - first_nonce + 1)/2; return 0; } + +// cleanup +extern "C" void free_jackpot(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_branch1Nonces[thr_id]); + cudaFree(d_branch2Nonces[thr_id]); + cudaFree(d_branch3Nonces[thr_id]); + cudaFree(d_jackpotNonces[thr_id]); + + 
quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + jackpot_compactTest_cpu_free(thr_id); + + cudaFree(d_hash[thr_id]); + + cuda_check_cpu_free(thr_id); + CUDA_LOG_ERROR(); + + cudaDeviceSynchronize(); + + init[thr_id] = false; +} diff --git a/JHA/jha.cu b/JHA/jha.cu new file mode 100644 index 0000000000..ec7895c10d --- /dev/null +++ b/JHA/jha.cu @@ -0,0 +1,265 @@ +/** + * JHA v8 algorithm - compatible implementation + * @author tpruvot@github 05-2017 + */ + +extern "C" { +#include "sph/sph_keccak.h" +#include "sph/sph_blake.h" +#include "sph/sph_groestl.h" +#include "sph/sph_jh.h" +#include "sph/sph_skein.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "quark/cuda_quark.h" + +static uint32_t *d_hash[MAX_GPUS] = { 0 }; +static uint32_t *d_hash_br2[MAX_GPUS]; +static uint32_t *d_tempBranch[MAX_GPUS]; + +extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads); +extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen); +extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +// CPU HASH +extern "C" void jha_hash(void *output, const void *input) +{ + uint32_t hash[16]; + + sph_blake512_context ctx_blake; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, input, 80); + sph_keccak512_close(&ctx_keccak, hash); + + for (int rnd = 0; rnd < 3; rnd++) + { + if (hash[0] & 0x01) { + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, (&hash), 64); + sph_groestl512_close(&ctx_groestl, (&hash)); + } + else { + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, (&hash), 64); + sph_skein512_close(&ctx_skein, (&hash)); + } + + if (hash[0] & 0x01) { + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, (&hash), 64); + sph_blake512_close(&ctx_blake, (&hash)); + } + else { + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, (&hash), 64); + sph_jh512_close(&ctx_jh, (&hash)); + } + } + memcpy(output, hash, 32); +} + +__global__ __launch_bounds__(128, 8) +void jha_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_hash[offset]); + d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 0x01; + if (d_NonceBranch[thread]) return; + // uint4 = 4x uint32_t = 16 bytes + uint4 *pdst = (uint4*) (&d_branch2[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__global__ __launch_bounds__(128, 8) +void jha_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads && !d_NonceBranch[thread]) + { + const uint32_t offset = thread * 16U; + uint4 *pdst = (uint4*) (&d_hash[offset]); + uint4 *psrc = (uint4*) (&d_branch2[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__host__ +uint32_t jha_filter_cpu(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // extract algo permution 
hashes to a second branch buffer + jha_filter_gpu <<<grid, block>>> (threads, inpHashes, d_branch2, d_tempBranch[thr_id]); + return threads; +} + +__host__ +void jha_merge_cpu(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // put back second branch hashes to the common buffer d_hash + jha_merge_gpu <<<grid, block>>> (threads, outpHashes, d_branch2, d_tempBranch[thr_id]); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[22]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int dev_id = device_map[thr_id]; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + cuda_get_arch(thr_id); + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash_br2[thr_id], (size_t) 64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch[thr_id], sizeof(uint32_t) * throughput)); + + jackpot_keccak512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 22; k++) + be32enc(&endiandata[k], pdata[k]); + + jackpot_keccak512_cpu_setBlock((void*)endiandata, 80); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + jackpot_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + for (int rnd = 0; rnd < 3; rnd++) + { + jha_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + jha_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + jha_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + jha_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + CUDA_LOG_ERROR(); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + jha_hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if
(work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + jha_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + CUDA_LOG_ERROR(); + + return 0; +} + +// cleanup +extern "C" void free_jha(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_hash_br2[thr_id]); + cudaFree(d_tempBranch[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + CUDA_LOG_ERROR(); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 2d7f3b98d8..0000000000 --- a/LICENSE +++ /dev/null @@ -1,3 +0,0 @@ -ccminer is available under the terms of the GNU Public License version 3. - -See LICENSE.TXT for details. diff --git a/Makefile.am b/Makefile.am index 831cfd23c5..4749f5797f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,3 +1,5 @@ +# allow to use Host cuda functions in C/C++ +DEF_INCLUDES = @CUDA_INCLUDES@ if WANT_JANSSON JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson @@ -5,19 +7,23 @@ else JANSSON_INCLUDES= endif -EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ - cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \ - compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in +EXTRA_DIST = autogen.sh README.md README.txt LICENSE.txt \ + ccminer.sln ccminer.vcxproj ccminer.vcxproj.filters \ + compat/gettimeofday.c compat/getopt/getopt_long.c -SUBDIRS = compat +SUBDIRS = compat -bin_PROGRAMS = ccminer +bin_PROGRAMS = ccminer -ccminer_SOURCES = elist.h miner.h compat.h \ +ccminer_SOURCES = elist.h miner.h compat.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/sys/time.h compat/getopt/getopt.h \ - cpu-miner.c util.c sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c hefty1.c scrypt.c sha2.c \ - sph/bmw.h sph/sph_blake.h sph/sph_groestl.h sph/sph_jh.h sph/sph_keccak.h sph/sph_skein.h sph/sph_types.h \ + crc32.c hefty1.c \ + ccminer.cpp pools.cpp util.cpp bench.cpp bignum.cpp \ + api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \ + nvsettings.cpp \ + equi/equi-stratum.cpp equi/equi.cpp equi/blake2/blake2bx.cpp \ + equi/equihash.cpp equi/cuda_equi.cu \ heavy/heavy.cu \ heavy/cuda_blake512.cu heavy/cuda_blake512.h \ heavy/cuda_combine.cu heavy/cuda_combine.h \ @@ -25,36 +31,178 @@ ccminer_SOURCES = elist.h miner.h compat.h \ heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \ heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \ heavy/cuda_sha256.cu heavy/cuda_sha256.h \ - fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \ + heavy/bastion.cu heavy/cuda_bastion.cu \ + fuguecoin.cpp Algo256/cuda_fugue256.cu sph/fugue.c uint256.h \ groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ myriadgroestl.cpp cuda_myriadgroestl.cu \ - JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ - JHA/cuda_jha_compactionTest.cu 
quark/cuda_quark_checkhash.cu \ + lyra2/Lyra2.c lyra2/Sponge.c \ + lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ + lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ + lyra2/lyra2REv3.cu lyra2/cuda_lyra2v3.cu \ + lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \ + lyra2/allium.cu \ + Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ + Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu \ + Algo256/cuda_keccak256_sm3.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ + Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \ + Algo256/blake2s.cu sph/blake2s.c \ + Algo256/bmw.cu Algo256/cuda_bmw.cu \ + blake2b.cu \ + crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \ + crypto/cryptolight.cu crypto/cryptolight-core.cu crypto/cryptolight-cpu.cpp \ + crypto/cryptonight.cu crypto/cryptonight-core.cu crypto/cryptonight-extra.cu \ + crypto/cryptonight-cpu.cpp crypto/oaes_lib.cpp crypto/aesb.cpp crypto/cpu/c_keccak.c \ + JHA/jha.cu JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ + JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ - quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \ - quark/cuda_quark_compactionTest.cu \ - cuda_nist5.cu \ - sph/cubehash.c sph/echo.c sph/luffa.c sph/shavite.c sph/simd.c \ - sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ + quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \ + quark/nist5.cu \ + quark/quarkcoin.cu quark/cuda_quark_compactionTest.cu \ + neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \ + pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \ + skunk/skunk.cu skunk/cuda_skunk.cu skunk/cuda_skunk_streebog.cu \ + sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu sha256/sha256q.cu sha256/cuda_sha256q.cu \ + sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \ + sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ + sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ + sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \ + sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \ + sph/ripemd.c sph/sph_sha2.c \ + polytimos.cu \ + lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \ + qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ + tribus/tribus.cu tribus/cuda_echo512_final.cu \ + x11/x11.cu x12/x12.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ + x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu x11/exosis.cu \ + x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ - x11/x11.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ - x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu + x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \ + x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ + x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \ + x16/x16r.cu x16/x16s.cu x16/cuda_x16_echo512.cu x16/cuda_x16_fugue512.cu \ + x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \ + x16/cuda_x16_echo512_64.cu \ + x17/x17.cu x17/hmq17.cu x17/sonoa.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ + phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu phi/cuda_phi2_cubehash512.cu x11/cuda_streebog_maxwell.cu \ + x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu 
x11/cuda_streebog.cu -ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ -ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ -ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME +# scrypt +ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \ + scrypt/blake.cu scrypt/keccak.cu scrypt/sha256.cu \ + scrypt/salsa_kernel.cu scrypt/test_kernel.cu \ + scrypt/fermi_kernel.cu scrypt/kepler_kernel.cu \ + scrypt/nv_kernel.cu scrypt/nv_kernel2.cu scrypt/titan_kernel.cu + +if HAVE_NVML +nvml_defs = -DUSE_WRAPNVML +nvml_libs = -ldl +endif + +if HAVE_WINDOWS +ccminer_SOURCES += compat/winansi.c +endif + +ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ +ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ $(nvml_libs) +ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) $(DEF_INCLUDES) $(nvml_defs) + +if HAVE_OSX +ccminer_CPPFLAGS += -I/usr/local/llvm/lib/clang/4.0.0/include +ccminer_LDFLAGS += -L/usr/local/llvm/lib +ccminer_LDADD += -lomp +endif + +#ccminer_CPPFLAGS += -DUSE_LIBSODIUM +#ccminer_LDFLAGS += -Lequi/lib +#ccminer_LDADD += -lsodium +ccminer_LDADD += -lcuda + +nvcc_ARCH := +#nvcc_ARCH += -gencode=arch=compute_75,code=\"sm_75,compute_75\" # CUDA 10 req. +#nvcc_ARCH += -gencode=arch=compute_70,code=\"sm_70,compute_70\" # CUDA 9.1 +#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\" # CUDA 8 +nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\" +#nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\" +#nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" +#nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\" + +nvcc_FLAGS = $(nvcc_ARCH) @CUDA_INCLUDES@ -I. @CUDA_CFLAGS@ +nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v" # we're now targeting all major compute architectures within one binary. .cu.o: - $(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) --maxrregcount=128 -o $@ -c $< + +Algo256/blake256.o: Algo256/blake256.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< + +Algo256/cuda_bmw.o: Algo256/cuda_bmw.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $< + +crypto/cryptonight-core.o: crypto/cryptonight-core.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< + +crypto/cryptonight-extra.o: crypto/cryptonight-extra.cu + $(NVCC) $(nvcc_FLAGS) -o $@ -c $< + +heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< + +qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< -# Shavite compiles faster with 128 regs -x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu - $(NVCC) -I . 
-I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< +# Luffa and Echo are faster with 80 registers than 128 +x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< + +x11/cuda_x11_luffa512_Cubehash.o: x11/cuda_x11_luffa512_Cubehash.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $< + +x11/cuda_x11_simd512.o: x11/cuda_x11_simd512.cu + $(NVCC) $(nvcc_FLAGS) -Xcompiler -Wno-unused-variable -o $@ -c $< + +x13/cuda_x13_hamsi512.o: x13/cuda_x13_hamsi512.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=72 -o $@ -c $< + +x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< + +quark/cuda_quark_blake512.o: quark/cuda_quark_blake512.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< + +quark/cuda_jh512.o: quark/cuda_jh512.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< + +quark/cuda_quark_keccak512.o: quark/cuda_quark_keccak512.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=88 -o $@ -c $< -# ABI requiring code modules quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< + +# This object does not use cuda device code but call the different kernels (autotune) +scrypt/salsa_kernel.o: scrypt/salsa_kernel.cu + $(NVCC) $(JANSSON_INCLUDES) -I. @CUDA_INCLUDES@ @CUDA_CFLAGS@ -gencode=arch=compute_30,code=\"sm_30,compute_30\" -o $@ -c $< + +# These kernels are for older devices (SM) + +scrypt/test_kernel.o: scrypt/test_kernel.cu + $(NVCC) $(JANSSON_INCLUDES) -I. @CUDA_INCLUDES@ @CUDA_CFLAGS@ -gencode=arch=compute_30,code=\"sm_30,compute_30\" -o $@ -c $< + +scrypt/fermi_kernel.o: scrypt/fermi_kernel.cu + $(NVCC) $(JANSSON_INCLUDES) -I. @CUDA_INCLUDES@ @CUDA_CFLAGS@ -gencode=arch=compute_30,code=\"sm_30,compute_30\" -o $@ -c $< + +scrypt/kepler_kernel.o: scrypt/kepler_kernel.cu + $(NVCC) $(JANSSON_INCLUDES) -I. @CUDA_INCLUDES@ @CUDA_CFLAGS@ -gencode=arch=compute_30,code=\"sm_30,compute_30\" -o $@ -c $< + +scrypt/nv_kernel.o: scrypt/nv_kernel.cu + $(NVCC) $(JANSSON_INCLUDES) -I. @CUDA_INCLUDES@ @CUDA_CFLAGS@ -gencode=arch=compute_30,code=\"sm_30,compute_30\" -o $@ -c $< + +scrypt/titan_kernel.o: scrypt/titan_kernel.cu + $(NVCC) $(nvcc_FLAGS) -gencode=arch=compute_35,code=\"sm_35,compute_35\" -o $@ -c $< + +skein.o: skein.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< + diff --git a/Makefile.in b/Makefile.in deleted file mode 100644 index 9da7878915..0000000000 --- a/Makefile.in +++ /dev/null @@ -1,1213 +0,0 @@ -# Makefile.in generated by automake 1.11.3 from Makefile.am. -# @configure_input@ - -# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software -# Foundation, Inc. 
-# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -@SET_MAKE@ - -VPATH = @srcdir@ -pkgdatadir = $(datadir)/@PACKAGE@ -pkgincludedir = $(includedir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -pkglibexecdir = $(libexecdir)/@PACKAGE@ -am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -target_triplet = @target@ -bin_PROGRAMS = ccminer$(EXEEXT) -subdir = . -DIST_COMMON = README $(am__configure_deps) $(srcdir)/Makefile.am \ - $(srcdir)/Makefile.in $(srcdir)/cpuminer-config.h.in \ - $(top_srcdir)/configure AUTHORS COPYING ChangeLog INSTALL NEWS \ - compile config.guess config.sub depcomp install-sh missing -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ - configure.lineno config.status.lineno -mkinstalldirs = $(install_sh) -d -CONFIG_HEADER = cpuminer-config.h -CONFIG_CLEAN_FILES = -CONFIG_CLEAN_VPATH_FILES = -am__installdirs = "$(DESTDIR)$(bindir)" -PROGRAMS = $(bin_PROGRAMS) -am__dirstamp = $(am__leading_dot)dirstamp -am_ccminer_OBJECTS = ccminer-cpu-miner.$(OBJEXT) \ - ccminer-util.$(OBJEXT) ccminer-bmw.$(OBJEXT) \ - ccminer-blake.$(OBJEXT) ccminer-groestl.$(OBJEXT) \ - ccminer-jh.$(OBJEXT) ccminer-keccak.$(OBJEXT) \ - ccminer-skein.$(OBJEXT) ccminer-hefty1.$(OBJEXT) \ - ccminer-scrypt.$(OBJEXT) ccminer-sha2.$(OBJEXT) \ - heavy/heavy.$(OBJEXT) heavy/cuda_blake512.$(OBJEXT) \ - heavy/cuda_combine.$(OBJEXT) heavy/cuda_groestl512.$(OBJEXT) \ - heavy/cuda_hefty1.$(OBJEXT) heavy/cuda_keccak512.$(OBJEXT) \ - heavy/cuda_sha256.$(OBJEXT) ccminer-fuguecoin.$(OBJEXT) \ - cuda_fugue256.$(OBJEXT) ccminer-fugue.$(OBJEXT) \ - ccminer-groestlcoin.$(OBJEXT) cuda_groestlcoin.$(OBJEXT) \ - JHA/jackpotcoin.$(OBJEXT) JHA/cuda_jha_keccak512.$(OBJEXT) \ - JHA/cuda_jha_compactionTest.$(OBJEXT) \ - quark/cuda_quark_checkhash.$(OBJEXT) \ - quark/cuda_jh512.$(OBJEXT) quark/cuda_quark_blake512.$(OBJEXT) \ - quark/cuda_quark_groestl512.$(OBJEXT) \ - quark/cuda_skein512.$(OBJEXT) quark/cuda_bmw512.$(OBJEXT) \ - quark/cuda_quark_keccak512.$(OBJEXT) quark/quarkcoin.$(OBJEXT) \ - quark/animecoin.$(OBJEXT) \ - quark/cuda_quark_compactionTest.$(OBJEXT) \ - ccminer-myriadgroestl.$(OBJEXT) cuda_myriadgroestl.$(OBJEXT) -ccminer_OBJECTS = $(am_ccminer_OBJECTS) -ccminer_DEPENDENCIES = -ccminer_LINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(ccminer_LDFLAGS) \ - $(LDFLAGS) -o $@ -DEFAULT_INCLUDES = -I.@am__isrc@ -depcomp = $(SHELL) $(top_srcdir)/depcomp -am__depfiles_maybe = depfiles -am__mv = mv -f -COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ - $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -CCLD = $(CC) -LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ -CXXCOMPILE = $(CXX) 
$(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ - $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -CXXLD = $(CXX) -CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \ - -o $@ -SOURCES = $(ccminer_SOURCES) -DIST_SOURCES = $(ccminer_SOURCES) -RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ - html-recursive info-recursive install-data-recursive \ - install-dvi-recursive install-exec-recursive \ - install-html-recursive install-info-recursive \ - install-pdf-recursive install-ps-recursive install-recursive \ - installcheck-recursive installdirs-recursive pdf-recursive \ - ps-recursive uninstall-recursive -RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ - distclean-recursive maintainer-clean-recursive -AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ - $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ - distdir dist dist-all distcheck -ETAGS = etags -CTAGS = ctags -DIST_SUBDIRS = $(SUBDIRS) -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -distdir = $(PACKAGE)-$(VERSION) -top_distdir = $(distdir) -am__remove_distdir = \ - if test -d "$(distdir)"; then \ - find "$(distdir)" -type d ! -perm -200 -exec chmod u+w {} ';' \ - && rm -rf "$(distdir)" \ - || { sleep 5 && rm -rf "$(distdir)"; }; \ - else :; fi -am__relativize = \ - dir0=`pwd`; \ - sed_first='s,^\([^/]*\)/.*$$,\1,'; \ - sed_rest='s,^[^/]*/*,,'; \ - sed_last='s,^.*/\([^/]*\)$$,\1,'; \ - sed_butlast='s,/*[^/]*$$,,'; \ - while test -n "$$dir1"; do \ - first=`echo "$$dir1" | sed -e "$$sed_first"`; \ - if test "$$first" != "."; then \ - if test "$$first" = ".."; then \ - dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ - dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ - else \ - first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ - if test "$$first2" = "$$first"; then \ - dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ - else \ - dir2="../$$dir2"; \ - fi; \ - dir0="$$dir0"/"$$first"; \ - fi; \ - fi; \ - dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ - done; \ - reldir="$$dir2" -DIST_ARCHIVES = $(distdir).tar.gz -GZIP_ENV = --best -distuninstallcheck_listfiles = find . -type f -print -am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \ - | sed 's|^\./|$(prefix)/|' | grep -v '$(infodir)/dir$$' -distcleancheck_listfiles = find . 
-type f -print -ACLOCAL = @ACLOCAL@ -ALLOCA = @ALLOCA@ -AMTAR = @AMTAR@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -CC = @CC@ -CCAS = @CCAS@ -CCASDEPMODE = @CCASDEPMODE@ -CCASFLAGS = @CCASFLAGS@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CPP = @CPP@ -CPPFLAGS = @CPPFLAGS@ -CUDA_CFLAGS = @CUDA_CFLAGS@ -CUDA_LDFLAGS = @CUDA_LDFLAGS@ -CUDA_LIBS = @CUDA_LIBS@ -CXX = @CXX@ -CXXDEPMODE = @CXXDEPMODE@ -CXXFLAGS = @CXXFLAGS@ -CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -EXEEXT = @EXEEXT@ -GREP = @GREP@ -INSTALL = @INSTALL@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -JANSSON_LIBS = @JANSSON_LIBS@ -LDFLAGS = @LDFLAGS@ -LIBCURL = @LIBCURL@ -LIBCURL_CPPFLAGS = @LIBCURL_CPPFLAGS@ -LIBOBJS = @LIBOBJS@ -LIBS = @LIBS@ -LTLIBOBJS = @LTLIBOBJS@ -MAINT = @MAINT@ -MAKEINFO = @MAKEINFO@ -MKDIR_P = @MKDIR_P@ -NVCC = @NVCC@ -OBJEXT = @OBJEXT@ -OPENMP_CFLAGS = @OPENMP_CFLAGS@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_URL = @PACKAGE_URL@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -PTHREAD_FLAGS = @PTHREAD_FLAGS@ -PTHREAD_LIBS = @PTHREAD_LIBS@ -RANLIB = @RANLIB@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -STRIP = @STRIP@ -VERSION = @VERSION@ -WS2_LIBS = @WS2_LIBS@ -_libcurl_config = @_libcurl_config@ -abs_builddir = @abs_builddir@ -abs_srcdir = @abs_srcdir@ -abs_top_builddir = @abs_top_builddir@ -abs_top_srcdir = @abs_top_srcdir@ -ac_ct_CC = @ac_ct_CC@ -ac_ct_CXX = @ac_ct_CXX@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_vendor = @build_vendor@ -builddir = @builddir@ -datadir = @datadir@ -datarootdir = @datarootdir@ -docdir = @docdir@ -dvidir = @dvidir@ -exec_prefix = @exec_prefix@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -htmldir = @htmldir@ -includedir = @includedir@ -infodir = @infodir@ -install_sh = @install_sh@ -libdir = @libdir@ -libexecdir = @libexecdir@ -localedir = @localedir@ -localstatedir = @localstatedir@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -oldincludedir = @oldincludedir@ -pdfdir = @pdfdir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -psdir = @psdir@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -srcdir = @srcdir@ -sysconfdir = @sysconfdir@ -target = @target@ -target_alias = @target_alias@ -target_cpu = @target_cpu@ -target_os = @target_os@ -target_vendor = @target_vendor@ -top_build_prefix = @top_build_prefix@ -top_builddir = @top_builddir@ -top_srcdir = @top_srcdir@ -@WANT_JANSSON_FALSE@JANSSON_INCLUDES = -@WANT_JANSSON_TRUE@JANSSON_INCLUDES = -I$(top_srcdir)/compat/jansson -EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ - cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \ - compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in - -SUBDIRS = compat -ccminer_SOURCES = elist.h miner.h compat.h \ - compat/inttypes.h compat/stdbool.h compat/unistd.h \ - compat/sys/time.h compat/getopt/getopt.h \ - cpu-miner.c util.c sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c 
sph/keccak.c sph/skein.c hefty1.c scrypt.c sha2.c \ - sph/bmw.h sph/sph_blake.h sph/sph_groestl.h sph/sph_jh.h sph/sph_keccak.h sph/sph_skein.h sph/sph_types.h \ - heavy/heavy.cu \ - heavy/cuda_blake512.cu heavy/cuda_blake512.h \ - heavy/cuda_combine.cu heavy/cuda_combine.h \ - heavy/cuda_groestl512.cu heavy/cuda_groestl512.h \ - heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \ - heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \ - heavy/cuda_sha256.cu heavy/cuda_sha256.h \ - fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \ - groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ - JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ - JHA/cuda_jha_compactionTest.cu quark/cuda_quark_checkhash.cu \ - quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ - quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \ - quark/cuda_quark_compactionTest.cu \ - myriadgroestl.cpp cuda_myriadgroestl.cu - -ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ -ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ -ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME -all: cpuminer-config.h - $(MAKE) $(AM_MAKEFLAGS) all-recursive - -.SUFFIXES: -.SUFFIXES: .c .cpp .cu .o .obj -am--refresh: Makefile - @: -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ - *$$dep*) \ - echo ' cd $(srcdir) && $(AUTOMAKE) --gnu'; \ - $(am__cd) $(srcdir) && $(AUTOMAKE) --gnu \ - && exit 0; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile'; \ - $(am__cd) $(top_srcdir) && \ - $(AUTOMAKE) --gnu Makefile -.PRECIOUS: Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' in \ - *config.status*) \ - echo ' $(SHELL) ./config.status'; \ - $(SHELL) ./config.status;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \ - esac; - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - $(SHELL) ./config.status --recheck - -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - $(am__cd) $(srcdir) && $(AUTOCONF) -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) - $(am__cd) $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS) -$(am__aclocal_m4_deps): - -cpuminer-config.h: stamp-h1 - @if test ! -f $@; then rm -f stamp-h1; else :; fi - @if test ! 
-f $@; then $(MAKE) $(AM_MAKEFLAGS) stamp-h1; else :; fi - -stamp-h1: $(srcdir)/cpuminer-config.h.in $(top_builddir)/config.status - @rm -f stamp-h1 - cd $(top_builddir) && $(SHELL) ./config.status cpuminer-config.h -$(srcdir)/cpuminer-config.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - ($(am__cd) $(top_srcdir) && $(AUTOHEADER)) - rm -f stamp-h1 - touch $@ - -distclean-hdr: - -rm -f cpuminer-config.h stamp-h1 -install-binPROGRAMS: $(bin_PROGRAMS) - @$(NORMAL_INSTALL) - test -z "$(bindir)" || $(MKDIR_P) "$(DESTDIR)$(bindir)" - @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ - for p in $$list; do echo "$$p $$p"; done | \ - sed 's/$(EXEEXT)$$//' | \ - while read p p1; do if test -f $$p; \ - then echo "$$p"; echo "$$p"; else :; fi; \ - done | \ - sed -e 'p;s,.*/,,;n;h' -e 's|.*|.|' \ - -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \ - sed 'N;N;N;s,\n, ,g' | \ - $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \ - { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ - if ($$2 == $$4) files[d] = files[d] " " $$1; \ - else { print "f", $$3 "/" $$4, $$1; } } \ - END { for (d in files) print "f", d, files[d] }' | \ - while read type dir files; do \ - if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ - test -z "$$files" || { \ - echo " $(INSTALL_PROGRAM_ENV) $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \ - $(INSTALL_PROGRAM_ENV) $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ - } \ - ; done - -uninstall-binPROGRAMS: - @$(NORMAL_UNINSTALL) - @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ - files=`for p in $$list; do echo "$$p"; done | \ - sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \ - -e 's/$$/$(EXEEXT)/' `; \ - test -n "$$list" || exit 0; \ - echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ - cd "$(DESTDIR)$(bindir)" && rm -f $$files - -clean-binPROGRAMS: - -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) -heavy/$(am__dirstamp): - @$(MKDIR_P) heavy - @: > heavy/$(am__dirstamp) -heavy/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) heavy/$(DEPDIR) - @: > heavy/$(DEPDIR)/$(am__dirstamp) -heavy/heavy.$(OBJEXT): heavy/$(am__dirstamp) \ - heavy/$(DEPDIR)/$(am__dirstamp) -heavy/cuda_blake512.$(OBJEXT): heavy/$(am__dirstamp) \ - heavy/$(DEPDIR)/$(am__dirstamp) -heavy/cuda_combine.$(OBJEXT): heavy/$(am__dirstamp) \ - heavy/$(DEPDIR)/$(am__dirstamp) -heavy/cuda_groestl512.$(OBJEXT): heavy/$(am__dirstamp) \ - heavy/$(DEPDIR)/$(am__dirstamp) -heavy/cuda_hefty1.$(OBJEXT): heavy/$(am__dirstamp) \ - heavy/$(DEPDIR)/$(am__dirstamp) -heavy/cuda_keccak512.$(OBJEXT): heavy/$(am__dirstamp) \ - heavy/$(DEPDIR)/$(am__dirstamp) -heavy/cuda_sha256.$(OBJEXT): heavy/$(am__dirstamp) \ - heavy/$(DEPDIR)/$(am__dirstamp) -JHA/$(am__dirstamp): - @$(MKDIR_P) JHA - @: > JHA/$(am__dirstamp) -JHA/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) JHA/$(DEPDIR) - @: > JHA/$(DEPDIR)/$(am__dirstamp) -JHA/jackpotcoin.$(OBJEXT): JHA/$(am__dirstamp) \ - JHA/$(DEPDIR)/$(am__dirstamp) -JHA/cuda_jha_keccak512.$(OBJEXT): JHA/$(am__dirstamp) \ - JHA/$(DEPDIR)/$(am__dirstamp) -JHA/cuda_jha_compactionTest.$(OBJEXT): JHA/$(am__dirstamp) \ - JHA/$(DEPDIR)/$(am__dirstamp) -quark/$(am__dirstamp): - @$(MKDIR_P) quark - @: > quark/$(am__dirstamp) -quark/$(DEPDIR)/$(am__dirstamp): - @$(MKDIR_P) quark/$(DEPDIR) - @: > quark/$(DEPDIR)/$(am__dirstamp) -quark/cuda_quark_checkhash.$(OBJEXT): quark/$(am__dirstamp) \ - quark/$(DEPDIR)/$(am__dirstamp) -quark/cuda_jh512.$(OBJEXT): quark/$(am__dirstamp) \ - quark/$(DEPDIR)/$(am__dirstamp) 
-quark/cuda_quark_blake512.$(OBJEXT): quark/$(am__dirstamp) \ - quark/$(DEPDIR)/$(am__dirstamp) -quark/cuda_quark_groestl512.$(OBJEXT): quark/$(am__dirstamp) \ - quark/$(DEPDIR)/$(am__dirstamp) -quark/cuda_skein512.$(OBJEXT): quark/$(am__dirstamp) \ - quark/$(DEPDIR)/$(am__dirstamp) -quark/cuda_bmw512.$(OBJEXT): quark/$(am__dirstamp) \ - quark/$(DEPDIR)/$(am__dirstamp) -quark/cuda_quark_keccak512.$(OBJEXT): quark/$(am__dirstamp) \ - quark/$(DEPDIR)/$(am__dirstamp) -quark/quarkcoin.$(OBJEXT): quark/$(am__dirstamp) \ - quark/$(DEPDIR)/$(am__dirstamp) -quark/animecoin.$(OBJEXT): quark/$(am__dirstamp) \ - quark/$(DEPDIR)/$(am__dirstamp) -quark/cuda_quark_compactionTest.$(OBJEXT): quark/$(am__dirstamp) \ - quark/$(DEPDIR)/$(am__dirstamp) -ccminer$(EXEEXT): $(ccminer_OBJECTS) $(ccminer_DEPENDENCIES) $(EXTRA_ccminer_DEPENDENCIES) - @rm -f ccminer$(EXEEXT) - $(ccminer_LINK) $(ccminer_OBJECTS) $(ccminer_LDADD) $(LIBS) - -mostlyclean-compile: - -rm -f *.$(OBJEXT) - -rm -f JHA/cuda_jha_compactionTest.$(OBJEXT) - -rm -f JHA/cuda_jha_keccak512.$(OBJEXT) - -rm -f JHA/jackpotcoin.$(OBJEXT) - -rm -f heavy/cuda_blake512.$(OBJEXT) - -rm -f heavy/cuda_combine.$(OBJEXT) - -rm -f heavy/cuda_groestl512.$(OBJEXT) - -rm -f heavy/cuda_hefty1.$(OBJEXT) - -rm -f heavy/cuda_keccak512.$(OBJEXT) - -rm -f heavy/cuda_sha256.$(OBJEXT) - -rm -f heavy/heavy.$(OBJEXT) - -rm -f quark/animecoin.$(OBJEXT) - -rm -f quark/cuda_bmw512.$(OBJEXT) - -rm -f quark/cuda_jh512.$(OBJEXT) - -rm -f quark/cuda_quark_blake512.$(OBJEXT) - -rm -f quark/cuda_quark_checkhash.$(OBJEXT) - -rm -f quark/cuda_quark_compactionTest.$(OBJEXT) - -rm -f quark/cuda_quark_groestl512.$(OBJEXT) - -rm -f quark/cuda_quark_keccak512.$(OBJEXT) - -rm -f quark/cuda_skein512.$(OBJEXT) - -rm -f quark/quarkcoin.$(OBJEXT) - -distclean-compile: - -rm -f *.tab.c - -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-blake.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-bmw.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-cpu-miner.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-fugue.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-fuguecoin.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-groestl.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-groestlcoin.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-hefty1.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-jh.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-keccak.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-myriadgroestl.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-scrypt.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-sha2.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-skein.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-util.Po@am__quote@ - -.c.o: -@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(COMPILE) -c $< - -.c.obj: -@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po 
-@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` - -ccminer-cpu-miner.o: cpu-miner.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-cpu-miner.o -MD -MP -MF $(DEPDIR)/ccminer-cpu-miner.Tpo -c -o ccminer-cpu-miner.o `test -f 'cpu-miner.c' || echo '$(srcdir)/'`cpu-miner.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-cpu-miner.Tpo $(DEPDIR)/ccminer-cpu-miner.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='cpu-miner.c' object='ccminer-cpu-miner.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-cpu-miner.o `test -f 'cpu-miner.c' || echo '$(srcdir)/'`cpu-miner.c - -ccminer-cpu-miner.obj: cpu-miner.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-cpu-miner.obj -MD -MP -MF $(DEPDIR)/ccminer-cpu-miner.Tpo -c -o ccminer-cpu-miner.obj `if test -f 'cpu-miner.c'; then $(CYGPATH_W) 'cpu-miner.c'; else $(CYGPATH_W) '$(srcdir)/cpu-miner.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-cpu-miner.Tpo $(DEPDIR)/ccminer-cpu-miner.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='cpu-miner.c' object='ccminer-cpu-miner.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-cpu-miner.obj `if test -f 'cpu-miner.c'; then $(CYGPATH_W) 'cpu-miner.c'; else $(CYGPATH_W) '$(srcdir)/cpu-miner.c'; fi` - -ccminer-util.o: util.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-util.o -MD -MP -MF $(DEPDIR)/ccminer-util.Tpo -c -o ccminer-util.o `test -f 'util.c' || echo '$(srcdir)/'`util.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-util.Tpo $(DEPDIR)/ccminer-util.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='util.c' object='ccminer-util.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-util.o `test -f 'util.c' || echo '$(srcdir)/'`util.c - -ccminer-util.obj: util.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-util.obj -MD -MP -MF $(DEPDIR)/ccminer-util.Tpo -c -o ccminer-util.obj `if test -f 'util.c'; then $(CYGPATH_W) 'util.c'; else $(CYGPATH_W) '$(srcdir)/util.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-util.Tpo $(DEPDIR)/ccminer-util.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='util.c' object='ccminer-util.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-util.obj `if test -f 'util.c'; then $(CYGPATH_W) 'util.c'; else $(CYGPATH_W) 
'$(srcdir)/util.c'; fi` - -ccminer-bmw.o: sph/bmw.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-bmw.o -MD -MP -MF $(DEPDIR)/ccminer-bmw.Tpo -c -o ccminer-bmw.o `test -f 'sph/bmw.c' || echo '$(srcdir)/'`sph/bmw.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-bmw.Tpo $(DEPDIR)/ccminer-bmw.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/bmw.c' object='ccminer-bmw.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-bmw.o `test -f 'sph/bmw.c' || echo '$(srcdir)/'`sph/bmw.c - -ccminer-bmw.obj: sph/bmw.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-bmw.obj -MD -MP -MF $(DEPDIR)/ccminer-bmw.Tpo -c -o ccminer-bmw.obj `if test -f 'sph/bmw.c'; then $(CYGPATH_W) 'sph/bmw.c'; else $(CYGPATH_W) '$(srcdir)/sph/bmw.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-bmw.Tpo $(DEPDIR)/ccminer-bmw.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/bmw.c' object='ccminer-bmw.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-bmw.obj `if test -f 'sph/bmw.c'; then $(CYGPATH_W) 'sph/bmw.c'; else $(CYGPATH_W) '$(srcdir)/sph/bmw.c'; fi` - -ccminer-blake.o: sph/blake.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-blake.o -MD -MP -MF $(DEPDIR)/ccminer-blake.Tpo -c -o ccminer-blake.o `test -f 'sph/blake.c' || echo '$(srcdir)/'`sph/blake.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-blake.Tpo $(DEPDIR)/ccminer-blake.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/blake.c' object='ccminer-blake.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-blake.o `test -f 'sph/blake.c' || echo '$(srcdir)/'`sph/blake.c - -ccminer-blake.obj: sph/blake.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-blake.obj -MD -MP -MF $(DEPDIR)/ccminer-blake.Tpo -c -o ccminer-blake.obj `if test -f 'sph/blake.c'; then $(CYGPATH_W) 'sph/blake.c'; else $(CYGPATH_W) '$(srcdir)/sph/blake.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-blake.Tpo $(DEPDIR)/ccminer-blake.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/blake.c' object='ccminer-blake.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-blake.obj `if test -f 'sph/blake.c'; then $(CYGPATH_W) 'sph/blake.c'; else $(CYGPATH_W) '$(srcdir)/sph/blake.c'; fi` - -ccminer-groestl.o: sph/groestl.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-groestl.o -MD -MP -MF $(DEPDIR)/ccminer-groestl.Tpo -c -o 
ccminer-groestl.o `test -f 'sph/groestl.c' || echo '$(srcdir)/'`sph/groestl.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-groestl.Tpo $(DEPDIR)/ccminer-groestl.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/groestl.c' object='ccminer-groestl.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-groestl.o `test -f 'sph/groestl.c' || echo '$(srcdir)/'`sph/groestl.c - -ccminer-groestl.obj: sph/groestl.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-groestl.obj -MD -MP -MF $(DEPDIR)/ccminer-groestl.Tpo -c -o ccminer-groestl.obj `if test -f 'sph/groestl.c'; then $(CYGPATH_W) 'sph/groestl.c'; else $(CYGPATH_W) '$(srcdir)/sph/groestl.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-groestl.Tpo $(DEPDIR)/ccminer-groestl.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/groestl.c' object='ccminer-groestl.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-groestl.obj `if test -f 'sph/groestl.c'; then $(CYGPATH_W) 'sph/groestl.c'; else $(CYGPATH_W) '$(srcdir)/sph/groestl.c'; fi` - -ccminer-jh.o: sph/jh.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-jh.o -MD -MP -MF $(DEPDIR)/ccminer-jh.Tpo -c -o ccminer-jh.o `test -f 'sph/jh.c' || echo '$(srcdir)/'`sph/jh.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-jh.Tpo $(DEPDIR)/ccminer-jh.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/jh.c' object='ccminer-jh.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-jh.o `test -f 'sph/jh.c' || echo '$(srcdir)/'`sph/jh.c - -ccminer-jh.obj: sph/jh.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-jh.obj -MD -MP -MF $(DEPDIR)/ccminer-jh.Tpo -c -o ccminer-jh.obj `if test -f 'sph/jh.c'; then $(CYGPATH_W) 'sph/jh.c'; else $(CYGPATH_W) '$(srcdir)/sph/jh.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-jh.Tpo $(DEPDIR)/ccminer-jh.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/jh.c' object='ccminer-jh.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-jh.obj `if test -f 'sph/jh.c'; then $(CYGPATH_W) 'sph/jh.c'; else $(CYGPATH_W) '$(srcdir)/sph/jh.c'; fi` - -ccminer-keccak.o: sph/keccak.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-keccak.o -MD -MP -MF $(DEPDIR)/ccminer-keccak.Tpo -c -o ccminer-keccak.o `test -f 'sph/keccak.c' || echo '$(srcdir)/'`sph/keccak.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-keccak.Tpo $(DEPDIR)/ccminer-keccak.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/keccak.c' 
object='ccminer-keccak.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-keccak.o `test -f 'sph/keccak.c' || echo '$(srcdir)/'`sph/keccak.c - -ccminer-keccak.obj: sph/keccak.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-keccak.obj -MD -MP -MF $(DEPDIR)/ccminer-keccak.Tpo -c -o ccminer-keccak.obj `if test -f 'sph/keccak.c'; then $(CYGPATH_W) 'sph/keccak.c'; else $(CYGPATH_W) '$(srcdir)/sph/keccak.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-keccak.Tpo $(DEPDIR)/ccminer-keccak.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/keccak.c' object='ccminer-keccak.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-keccak.obj `if test -f 'sph/keccak.c'; then $(CYGPATH_W) 'sph/keccak.c'; else $(CYGPATH_W) '$(srcdir)/sph/keccak.c'; fi` - -ccminer-skein.o: sph/skein.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-skein.o -MD -MP -MF $(DEPDIR)/ccminer-skein.Tpo -c -o ccminer-skein.o `test -f 'sph/skein.c' || echo '$(srcdir)/'`sph/skein.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-skein.Tpo $(DEPDIR)/ccminer-skein.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/skein.c' object='ccminer-skein.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-skein.o `test -f 'sph/skein.c' || echo '$(srcdir)/'`sph/skein.c - -ccminer-skein.obj: sph/skein.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-skein.obj -MD -MP -MF $(DEPDIR)/ccminer-skein.Tpo -c -o ccminer-skein.obj `if test -f 'sph/skein.c'; then $(CYGPATH_W) 'sph/skein.c'; else $(CYGPATH_W) '$(srcdir)/sph/skein.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-skein.Tpo $(DEPDIR)/ccminer-skein.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/skein.c' object='ccminer-skein.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-skein.obj `if test -f 'sph/skein.c'; then $(CYGPATH_W) 'sph/skein.c'; else $(CYGPATH_W) '$(srcdir)/sph/skein.c'; fi` - -ccminer-hefty1.o: hefty1.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-hefty1.o -MD -MP -MF $(DEPDIR)/ccminer-hefty1.Tpo -c -o ccminer-hefty1.o `test -f 'hefty1.c' || echo '$(srcdir)/'`hefty1.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-hefty1.Tpo $(DEPDIR)/ccminer-hefty1.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='hefty1.c' object='ccminer-hefty1.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) 
$(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-hefty1.o `test -f 'hefty1.c' || echo '$(srcdir)/'`hefty1.c - -ccminer-hefty1.obj: hefty1.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-hefty1.obj -MD -MP -MF $(DEPDIR)/ccminer-hefty1.Tpo -c -o ccminer-hefty1.obj `if test -f 'hefty1.c'; then $(CYGPATH_W) 'hefty1.c'; else $(CYGPATH_W) '$(srcdir)/hefty1.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-hefty1.Tpo $(DEPDIR)/ccminer-hefty1.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='hefty1.c' object='ccminer-hefty1.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-hefty1.obj `if test -f 'hefty1.c'; then $(CYGPATH_W) 'hefty1.c'; else $(CYGPATH_W) '$(srcdir)/hefty1.c'; fi` - -ccminer-scrypt.o: scrypt.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-scrypt.o -MD -MP -MF $(DEPDIR)/ccminer-scrypt.Tpo -c -o ccminer-scrypt.o `test -f 'scrypt.c' || echo '$(srcdir)/'`scrypt.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-scrypt.Tpo $(DEPDIR)/ccminer-scrypt.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='scrypt.c' object='ccminer-scrypt.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-scrypt.o `test -f 'scrypt.c' || echo '$(srcdir)/'`scrypt.c - -ccminer-scrypt.obj: scrypt.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-scrypt.obj -MD -MP -MF $(DEPDIR)/ccminer-scrypt.Tpo -c -o ccminer-scrypt.obj `if test -f 'scrypt.c'; then $(CYGPATH_W) 'scrypt.c'; else $(CYGPATH_W) '$(srcdir)/scrypt.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-scrypt.Tpo $(DEPDIR)/ccminer-scrypt.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='scrypt.c' object='ccminer-scrypt.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-scrypt.obj `if test -f 'scrypt.c'; then $(CYGPATH_W) 'scrypt.c'; else $(CYGPATH_W) '$(srcdir)/scrypt.c'; fi` - -ccminer-sha2.o: sha2.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-sha2.o -MD -MP -MF $(DEPDIR)/ccminer-sha2.Tpo -c -o ccminer-sha2.o `test -f 'sha2.c' || echo '$(srcdir)/'`sha2.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-sha2.Tpo $(DEPDIR)/ccminer-sha2.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sha2.c' object='ccminer-sha2.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-sha2.o `test -f 'sha2.c' || echo '$(srcdir)/'`sha2.c - -ccminer-sha2.obj: sha2.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) 
$(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-sha2.obj -MD -MP -MF $(DEPDIR)/ccminer-sha2.Tpo -c -o ccminer-sha2.obj `if test -f 'sha2.c'; then $(CYGPATH_W) 'sha2.c'; else $(CYGPATH_W) '$(srcdir)/sha2.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-sha2.Tpo $(DEPDIR)/ccminer-sha2.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sha2.c' object='ccminer-sha2.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-sha2.obj `if test -f 'sha2.c'; then $(CYGPATH_W) 'sha2.c'; else $(CYGPATH_W) '$(srcdir)/sha2.c'; fi` - -ccminer-fugue.o: sph/fugue.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-fugue.o -MD -MP -MF $(DEPDIR)/ccminer-fugue.Tpo -c -o ccminer-fugue.o `test -f 'sph/fugue.c' || echo '$(srcdir)/'`sph/fugue.c -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-fugue.Tpo $(DEPDIR)/ccminer-fugue.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/fugue.c' object='ccminer-fugue.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-fugue.o `test -f 'sph/fugue.c' || echo '$(srcdir)/'`sph/fugue.c - -ccminer-fugue.obj: sph/fugue.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-fugue.obj -MD -MP -MF $(DEPDIR)/ccminer-fugue.Tpo -c -o ccminer-fugue.obj `if test -f 'sph/fugue.c'; then $(CYGPATH_W) 'sph/fugue.c'; else $(CYGPATH_W) '$(srcdir)/sph/fugue.c'; fi` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-fugue.Tpo $(DEPDIR)/ccminer-fugue.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/fugue.c' object='ccminer-fugue.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-fugue.obj `if test -f 'sph/fugue.c'; then $(CYGPATH_W) 'sph/fugue.c'; else $(CYGPATH_W) '$(srcdir)/sph/fugue.c'; fi` - -.cpp.o: -@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $< - -.cpp.obj: -@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` -@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` - -ccminer-fuguecoin.o: fuguecoin.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-fuguecoin.o -MD -MP -MF $(DEPDIR)/ccminer-fuguecoin.Tpo -c -o ccminer-fuguecoin.o `test -f 'fuguecoin.cpp' || echo 
'$(srcdir)/'`fuguecoin.cpp -@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/ccminer-fuguecoin.Tpo $(DEPDIR)/ccminer-fuguecoin.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='fuguecoin.cpp' object='ccminer-fuguecoin.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-fuguecoin.o `test -f 'fuguecoin.cpp' || echo '$(srcdir)/'`fuguecoin.cpp - -ccminer-fuguecoin.obj: fuguecoin.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-fuguecoin.obj -MD -MP -MF $(DEPDIR)/ccminer-fuguecoin.Tpo -c -o ccminer-fuguecoin.obj `if test -f 'fuguecoin.cpp'; then $(CYGPATH_W) 'fuguecoin.cpp'; else $(CYGPATH_W) '$(srcdir)/fuguecoin.cpp'; fi` -@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/ccminer-fuguecoin.Tpo $(DEPDIR)/ccminer-fuguecoin.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='fuguecoin.cpp' object='ccminer-fuguecoin.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-fuguecoin.obj `if test -f 'fuguecoin.cpp'; then $(CYGPATH_W) 'fuguecoin.cpp'; else $(CYGPATH_W) '$(srcdir)/fuguecoin.cpp'; fi` - -ccminer-groestlcoin.o: groestlcoin.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-groestlcoin.o -MD -MP -MF $(DEPDIR)/ccminer-groestlcoin.Tpo -c -o ccminer-groestlcoin.o `test -f 'groestlcoin.cpp' || echo '$(srcdir)/'`groestlcoin.cpp -@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/ccminer-groestlcoin.Tpo $(DEPDIR)/ccminer-groestlcoin.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='groestlcoin.cpp' object='ccminer-groestlcoin.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-groestlcoin.o `test -f 'groestlcoin.cpp' || echo '$(srcdir)/'`groestlcoin.cpp - -ccminer-groestlcoin.obj: groestlcoin.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-groestlcoin.obj -MD -MP -MF $(DEPDIR)/ccminer-groestlcoin.Tpo -c -o ccminer-groestlcoin.obj `if test -f 'groestlcoin.cpp'; then $(CYGPATH_W) 'groestlcoin.cpp'; else $(CYGPATH_W) '$(srcdir)/groestlcoin.cpp'; fi` -@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/ccminer-groestlcoin.Tpo $(DEPDIR)/ccminer-groestlcoin.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='groestlcoin.cpp' object='ccminer-groestlcoin.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-groestlcoin.obj `if test -f 'groestlcoin.cpp'; then $(CYGPATH_W) 'groestlcoin.cpp'; else $(CYGPATH_W) '$(srcdir)/groestlcoin.cpp'; fi` - -ccminer-myriadgroestl.o: myriadgroestl.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT 
ccminer-myriadgroestl.o -MD -MP -MF $(DEPDIR)/ccminer-myriadgroestl.Tpo -c -o ccminer-myriadgroestl.o `test -f 'myriadgroestl.cpp' || echo '$(srcdir)/'`myriadgroestl.cpp -@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/ccminer-myriadgroestl.Tpo $(DEPDIR)/ccminer-myriadgroestl.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='myriadgroestl.cpp' object='ccminer-myriadgroestl.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-myriadgroestl.o `test -f 'myriadgroestl.cpp' || echo '$(srcdir)/'`myriadgroestl.cpp - -ccminer-myriadgroestl.obj: myriadgroestl.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-myriadgroestl.obj -MD -MP -MF $(DEPDIR)/ccminer-myriadgroestl.Tpo -c -o ccminer-myriadgroestl.obj `if test -f 'myriadgroestl.cpp'; then $(CYGPATH_W) 'myriadgroestl.cpp'; else $(CYGPATH_W) '$(srcdir)/myriadgroestl.cpp'; fi` -@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/ccminer-myriadgroestl.Tpo $(DEPDIR)/ccminer-myriadgroestl.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='myriadgroestl.cpp' object='ccminer-myriadgroestl.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-myriadgroestl.obj `if test -f 'myriadgroestl.cpp'; then $(CYGPATH_W) 'myriadgroestl.cpp'; else $(CYGPATH_W) '$(srcdir)/myriadgroestl.cpp'; fi` - -# This directory's subdirectories are mostly independent; you can cd -# into them and run `make' without going through this Makefile. -# To change the values of `make' variables: instead of editing Makefiles, -# (1) if the variable is set in `config.status', edit `config.status' -# (which will cause the Makefiles to be regenerated when you run `make'); -# (2) otherwise, pass the desired values on the `make' command line. 
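(A quick illustration of case (2) from the comment above, with hypothetical flags rather than values taken from this tree:

    make CFLAGS="-O2 -g" ccminer

A command-line assignment like this overrides the value in the generated Makefile for that single invocation only; editing `config.status' as in case (1) is what makes a change persist when the Makefiles are regenerated.)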
-$(RECURSIVE_TARGETS): - @fail= failcom='exit 1'; \ - for f in x $$MAKEFLAGS; do \ - case $$f in \ - *=* | --[!k]*);; \ - *k*) failcom='fail=yes';; \ - esac; \ - done; \ - dot_seen=no; \ - target=`echo $@ | sed s/-recursive//`; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - echo "Making $$target in $$subdir"; \ - if test "$$subdir" = "."; then \ - dot_seen=yes; \ - local_target="$$target-am"; \ - else \ - local_target="$$target"; \ - fi; \ - ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || eval $$failcom; \ - done; \ - if test "$$dot_seen" = "no"; then \ - $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ - fi; test -z "$$fail" - -$(RECURSIVE_CLEAN_TARGETS): - @fail= failcom='exit 1'; \ - for f in x $$MAKEFLAGS; do \ - case $$f in \ - *=* | --[!k]*);; \ - *k*) failcom='fail=yes';; \ - esac; \ - done; \ - dot_seen=no; \ - case "$@" in \ - distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ - *) list='$(SUBDIRS)' ;; \ - esac; \ - rev=''; for subdir in $$list; do \ - if test "$$subdir" = "."; then :; else \ - rev="$$subdir $$rev"; \ - fi; \ - done; \ - rev="$$rev ."; \ - target=`echo $@ | sed s/-recursive//`; \ - for subdir in $$rev; do \ - echo "Making $$target in $$subdir"; \ - if test "$$subdir" = "."; then \ - local_target="$$target-am"; \ - else \ - local_target="$$target"; \ - fi; \ - ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || eval $$failcom; \ - done && test -z "$$fail" -tags-recursive: - list='$(SUBDIRS)'; for subdir in $$list; do \ - test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ - done -ctags-recursive: - list='$(SUBDIRS)'; for subdir in $$list; do \ - test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ - done - -ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in files) print i; }; }'`; \ - mkid -fID $$unique -tags: TAGS - -TAGS: tags-recursive $(HEADERS) $(SOURCES) cpuminer-config.h.in $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - set x; \ - here=`pwd`; \ - if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ - include_option=--etags-include; \ - empty_fix=.; \ - else \ - include_option=--include; \ - empty_fix=; \ - fi; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - test ! 
-f $$subdir/TAGS || \ - set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ - fi; \ - done; \ - list='$(SOURCES) $(HEADERS) cpuminer-config.h.in $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in files) print i; }; }'`; \ - shift; \ - if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - if test $$# -gt 0; then \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - "$$@" $$unique; \ - else \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$unique; \ - fi; \ - fi -ctags: CTAGS -CTAGS: ctags-recursive $(HEADERS) $(SOURCES) cpuminer-config.h.in $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - list='$(SOURCES) $(HEADERS) cpuminer-config.h.in $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in files) print i; }; }'`; \ - test -z "$(CTAGS_ARGS)$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$unique - -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && $(am__cd) $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) "$$here" - -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -distdir: $(DISTFILES) - $(am__remove_distdir) - test -d "$(distdir)" || mkdir "$(distdir)" - @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - list='$(DISTFILES)'; \ - dist_files=`for file in $$list; do echo $$file; done | \ - sed -e "s|^$$srcdirstrip/||;t" \ - -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ - case $$dist_files in \ - */*) $(MKDIR_P) `echo "$$dist_files" | \ - sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ - sort -u` ;; \ - esac; \ - for file in $$dist_files; do \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - if test -d $$d/$$file; then \ - dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test -d "$(distdir)/$$file"; then \ - find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ - find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ - else \ - test -f "$(distdir)/$$file" \ - || cp -p $$d/$$file "$(distdir)/$$file" \ - || exit 1; \ - fi; \ - done - @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - test -d "$(distdir)/$$subdir" \ - || $(MKDIR_P) "$(distdir)/$$subdir" \ - || exit 1; \ - fi; \ - done - @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ - $(am__relativize); \ - new_distdir=$$reldir; \ - dir1=$$subdir; dir2="$(top_distdir)"; \ - $(am__relativize); \ - new_top_distdir=$$reldir; \ - echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ - echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ - ($(am__cd) $$subdir && \ - $(MAKE) $(AM_MAKEFLAGS) \ - top_distdir="$$new_top_distdir" \ - distdir="$$new_distdir" \ - am__remove_distdir=: \ - am__skip_length_check=: \ - am__skip_mode_fix=: \ - distdir) \ - || exit 1; \ - fi; \ - done - -test -n "$(am__skip_mode_fix)" \ - || find "$(distdir)" -type d ! -perm -755 \ - -exec chmod u+rwx,go+rx {} \; -o \ - ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \ - ! -type d ! -perm -400 -exec chmod a+r {} \; -o \ - ! -type d ! -perm -444 -exec $(install_sh) -c -m a+r {} {} \; \ - || chmod -R a+r "$(distdir)" -dist-gzip: distdir - tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz - $(am__remove_distdir) - -dist-bzip2: distdir - tardir=$(distdir) && $(am__tar) | BZIP2=$${BZIP2--9} bzip2 -c >$(distdir).tar.bz2 - $(am__remove_distdir) - -dist-lzip: distdir - tardir=$(distdir) && $(am__tar) | lzip -c $${LZIP_OPT--9} >$(distdir).tar.lz - $(am__remove_distdir) - -dist-lzma: distdir - tardir=$(distdir) && $(am__tar) | lzma -9 -c >$(distdir).tar.lzma - $(am__remove_distdir) - -dist-xz: distdir - tardir=$(distdir) && $(am__tar) | XZ_OPT=$${XZ_OPT--e} xz -c >$(distdir).tar.xz - $(am__remove_distdir) - -dist-tarZ: distdir - tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z - $(am__remove_distdir) - -dist-shar: distdir - shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz - $(am__remove_distdir) - -dist-zip: distdir - -rm -f $(distdir).zip - zip -rq $(distdir).zip $(distdir) - $(am__remove_distdir) - -dist dist-all: distdir - tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz - $(am__remove_distdir) - -# This target untars the dist file and tries a VPATH configuration. Then -# it guarantees that the distribution is self-contained by making another -# tarfile. 
-distcheck: dist - case '$(DIST_ARCHIVES)' in \ - *.tar.gz*) \ - GZIP=$(GZIP_ENV) gzip -dc $(distdir).tar.gz | $(am__untar) ;;\ - *.tar.bz2*) \ - bzip2 -dc $(distdir).tar.bz2 | $(am__untar) ;;\ - *.tar.lzma*) \ - lzma -dc $(distdir).tar.lzma | $(am__untar) ;;\ - *.tar.lz*) \ - lzip -dc $(distdir).tar.lz | $(am__untar) ;;\ - *.tar.xz*) \ - xz -dc $(distdir).tar.xz | $(am__untar) ;;\ - *.tar.Z*) \ - uncompress -c $(distdir).tar.Z | $(am__untar) ;;\ - *.shar.gz*) \ - GZIP=$(GZIP_ENV) gzip -dc $(distdir).shar.gz | unshar ;;\ - *.zip*) \ - unzip $(distdir).zip ;;\ - esac - chmod -R a-w $(distdir); chmod a+w $(distdir) - mkdir $(distdir)/_build - mkdir $(distdir)/_inst - chmod a-w $(distdir) - test -d $(distdir)/_build || exit 0; \ - dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \ - && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \ - && am__cwd=`pwd` \ - && $(am__cd) $(distdir)/_build \ - && ../configure --srcdir=.. --prefix="$$dc_install_base" \ - $(AM_DISTCHECK_CONFIGURE_FLAGS) \ - $(DISTCHECK_CONFIGURE_FLAGS) \ - && $(MAKE) $(AM_MAKEFLAGS) \ - && $(MAKE) $(AM_MAKEFLAGS) dvi \ - && $(MAKE) $(AM_MAKEFLAGS) check \ - && $(MAKE) $(AM_MAKEFLAGS) install \ - && $(MAKE) $(AM_MAKEFLAGS) installcheck \ - && $(MAKE) $(AM_MAKEFLAGS) uninstall \ - && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \ - distuninstallcheck \ - && chmod -R a-w "$$dc_install_base" \ - && ({ \ - (cd ../.. && umask 077 && mkdir "$$dc_destdir") \ - && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \ - && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \ - && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \ - distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \ - } || { rm -rf "$$dc_destdir"; exit 1; }) \ - && rm -rf "$$dc_destdir" \ - && $(MAKE) $(AM_MAKEFLAGS) dist \ - && rm -rf $(DIST_ARCHIVES) \ - && $(MAKE) $(AM_MAKEFLAGS) distcleancheck \ - && cd "$$am__cwd" \ - || exit 1 - $(am__remove_distdir) - @(echo "$(distdir) archives ready for distribution: "; \ - list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \ - sed -e 1h -e 1s/./=/g -e 1p -e 1x -e '$$p' -e '$$x' -distuninstallcheck: - @test -n '$(distuninstallcheck_dir)' || { \ - echo 'ERROR: trying to run $@ with an empty' \ - '$$(distuninstallcheck_dir)' >&2; \ - exit 1; \ - }; \ - $(am__cd) '$(distuninstallcheck_dir)' || { \ - echo 'ERROR: cannot chdir into $(distuninstallcheck_dir)' >&2; \ - exit 1; \ - }; \ - test `$(am__distuninstallcheck_listfiles) | wc -l` -eq 0 \ - || { echo "ERROR: files left after uninstall:" ; \ - if test -n "$(DESTDIR)"; then \ - echo " (check DESTDIR support)"; \ - fi ; \ - $(distuninstallcheck_listfiles) ; \ - exit 1; } >&2 -distcleancheck: distclean - @if test '$(srcdir)' = . 
; then \ - echo "ERROR: distcleancheck can only run from a VPATH build" ; \ - exit 1 ; \ - fi - @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \ - || { echo "ERROR: files left in build directory after distclean:" ; \ - $(distcleancheck_listfiles) ; \ - exit 1; } >&2 -check-am: all-am -check: check-recursive -all-am: Makefile $(PROGRAMS) cpuminer-config.h -installdirs: installdirs-recursive -installdirs-am: - for dir in "$(DESTDIR)$(bindir)"; do \ - test -z "$$dir" || $(MKDIR_P) "$$dir"; \ - done -install: install-recursive -install-exec: install-exec-recursive -install-data: install-data-recursive -uninstall: uninstall-recursive - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-recursive -install-strip: - if test -z '$(STRIP)'; then \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - install; \ - else \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ - fi -mostlyclean-generic: - -clean-generic: - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) - -rm -f JHA/$(DEPDIR)/$(am__dirstamp) - -rm -f JHA/$(am__dirstamp) - -rm -f heavy/$(DEPDIR)/$(am__dirstamp) - -rm -f heavy/$(am__dirstamp) - -rm -f quark/$(DEPDIR)/$(am__dirstamp) - -rm -f quark/$(am__dirstamp) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to rebuild." -clean: clean-recursive - -clean-am: clean-binPROGRAMS clean-generic mostlyclean-am - -distclean: distclean-recursive - -rm -f $(am__CONFIG_DISTCLEAN_FILES) - -rm -rf ./$(DEPDIR) - -rm -f Makefile -distclean-am: clean-am distclean-compile distclean-generic \ - distclean-hdr distclean-tags - -dvi: dvi-recursive - -dvi-am: - -html: html-recursive - -html-am: - -info: info-recursive - -info-am: - -install-data-am: - -install-dvi: install-dvi-recursive - -install-dvi-am: - -install-exec-am: install-binPROGRAMS - -install-html: install-html-recursive - -install-html-am: - -install-info: install-info-recursive - -install-info-am: - -install-man: - -install-pdf: install-pdf-recursive - -install-pdf-am: - -install-ps: install-ps-recursive - -install-ps-am: - -installcheck-am: - -maintainer-clean: maintainer-clean-recursive - -rm -f $(am__CONFIG_DISTCLEAN_FILES) - -rm -rf $(top_srcdir)/autom4te.cache - -rm -rf ./$(DEPDIR) - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-recursive - -mostlyclean-am: mostlyclean-compile mostlyclean-generic - -pdf: pdf-recursive - -pdf-am: - -ps: ps-recursive - -ps-am: - -uninstall-am: uninstall-binPROGRAMS - -.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) all \ - ctags-recursive install-am install-strip tags-recursive - -.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ - all all-am am--refresh check check-am clean clean-binPROGRAMS \ - clean-generic ctags ctags-recursive dist dist-all dist-bzip2 \ - dist-gzip dist-lzip dist-lzma dist-shar dist-tarZ dist-xz \ - dist-zip distcheck distclean distclean-compile \ - distclean-generic distclean-hdr distclean-tags distcleancheck \ - distdir distuninstallcheck dvi dvi-am html html-am info \ - 
info-am install install-am install-binPROGRAMS install-data \ - install-data-am install-dvi install-dvi-am install-exec \ - install-exec-am install-html install-html-am install-info \ - install-info-am install-man install-pdf install-pdf-am \ - install-ps install-ps-am install-strip installcheck \ - installcheck-am installdirs installdirs-am maintainer-clean \ - maintainer-clean-generic mostlyclean mostlyclean-compile \ - mostlyclean-generic pdf pdf-am ps ps-am tags tags-recursive \ - uninstall uninstall-am uninstall-binPROGRAMS - - -# we're now targeting all major compute architectures within one binary. -.cu.o: - $(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< - -# ABI requiring code modules -# this module doesn't compile with Compute 2.0 unfortunately -quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< - -JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< - -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. -.NOEXPORT: diff --git a/NEWS b/NEWS deleted file mode 100644 index d3f5a12faa..0000000000 --- a/NEWS +++ /dev/null @@ -1 +0,0 @@ - diff --git a/README b/README deleted file mode 100644 index f5c26fca65..0000000000 --- a/README +++ /dev/null @@ -1,3 +0,0 @@ - -A CUDA based miner for Heavycoin and Fuguecoin. For more information, take a look at README.txt - diff --git a/README.md b/README.md index 0a54f7fa01..5bcb50572c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,36 @@ -ccminer -======= +# ccminer -Christian Buchner's & Christian H.'s CUDA miner project +Based on Christian Buchner's & Christian H.'s CUDA project, no more active on github since 2014. + +Check the [README.txt](README.txt) for the additions + +BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot) + +A part of the recent algos were originally written by [djm34](https://github.com/djm34) and [alexis78](https://github.com/alexis78) + +This variant was tested and built on Linux (ubuntu server 14.04, 16.04, Fedora 22 to 25) +It is also built for Windows 7 to 10 with VStudio 2013, to stay compatible with Windows 7 and Vista. + +Note that the x86 releases are generally faster than x64 ones on Windows, but that tend to change with the recent drivers. + +The recommended CUDA Toolkit version was the [6.5.19](http://developer.download.nvidia.com/compute/cuda/6_5/rel/installers/cuda_6.5.19_windows_general_64.exe), but some light algos could be faster with the version 7.5 and 8.0 (like lbry, decred and skein). + +About source code dependencies +------------------------------ + +This project requires some libraries to be built : + +- OpenSSL (prebuilt for win) +- Curl (prebuilt for win) +- pthreads (prebuilt for win) + +The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64 platforms (windows). 
+ +To rebuild them, you need to clone this repository and its submodules : + git clone https://github.com/peters/curl-for-windows.git compat/curl-for-windows + + +Compile on Linux +---------------- + +Please see [INSTALL](https://github.com/tpruvot/ccminer/blob/linux/INSTALL) file or [project Wiki](https://github.com/tpruvot/ccminer/wiki/Compatibility) diff --git a/README.txt b/README.txt index a40f30e645..c2470bddb6 100644 --- a/README.txt +++ b/README.txt @@ -1,40 +1,64 @@ -ccMiner release 1.2 (June 15th 2014) - "Killer Groestl + X13" -------------------------------------------------------------- +ccminer 2.3.1 "lyra2v3, exosis and sha256q" +--------------------------------------------------------------- *************************************************************** -If you find this tool useful and like to support its continued +If you find this tool useful and like to support its continuous development, then consider a donation. - LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm - BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM - YAC donation address: Y87sptDEcpLkLeAuex6qZioDbvy1qXZEj4 - VTC donation address: VrjeFzMgvteCGarLw85KivBzmsiH9fqp4a - MAX donation address: mHrhQP9EFArechWxTFJ97s9D3jvcCvEEnt - DOGE donation address: DT9ghsGmez6ojVdEZgvaZbT2Z3TruXG6yP - HVC donation address: HNN3PyyTMkDo4RkEjkWSGMwqia1yD8mwJN - GRS donation address: FmJKJAhvyHWPeEVeLQHefr2naqgWc9ABTM - MYR donation address: MNHM7Q7HVfGpKDJgVJrY8ofwvmeugNewyf - JPC donation address: JYFBypVDkk583yKWY4M46TG5vXG8hfgD2U - SFR donation address: SR4b87aEnPfTs77bo9NnnaV21fiF6jQpAp - MNC donation address: MShgNUSYwybEbXLvJUtdNg1a7rUeiNgooK - BTQ donation address: 13GFwLiZL2DaA9XeE733PNrQX5QYLFsonS +tpruvot@github: + BTC : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo + DCR : DsUCcACGcyP8McNMRXQwbtpDxaVUYLDQDeU + +DJM34: + BTC donation address: 1NENYmxwZGHsKFmyjTc5WferTn5VTFb7Ze + +cbuchner v1.2: + LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm + BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM + *************************************************************** >>> Introduction <<< -This is a CUDA accelerated mining application for use with +This is a CUDA accelerated mining application which handle : +Decred (Blake256 14-rounds - 180 bytes) HeavyCoin & MjollnirCoin FugueCoin GroestlCoin & Myriad-Groestl -JackpotCoin +Lbry Credits +JackpotCoin (JHA) QuarkCoin family & AnimeCoin TalkCoin DarkCoin and other X11 coins +Chaincoin and Flaxscript (C11) +Saffroncoin blake (256 14-rounds) +BlakeCoin (256 8-rounds) +Qubit (Digibyte, ...) +Luffa (Joincoin) +Keccak (Maxcoin) +Pentablake (Blake 512 x5) +1Coin Triple S +Neoscrypt (FeatherCoin) +x11evo (Revolver) +phi2 (LUXCoin) +Scrypt and Scrypt:N +Scrypt-Jane (Chacha) +sib (Sibcoin) +Skein (Skein + SHA) +Signatum (Skein cubehash fugue Streebog) +SonoA (Sono) +Tribus (JH, keccak, simd) +Woodcoin (Double Skein) +Vanilla (Blake256 8-rounds - double sha256) +Vertcoin Lyra2REv3 +Boolberry (Wild Keccak) +Monero (Cryptonight v7 with -a monero) +Aeon (Cryptonight-lite) where some of these coins have a VERY NOTABLE nVidia advantage -over competing AMD (OpenCL) implementations. +over competing AMD (OpenCL Only) implementations. We did not take a big effort on improving usability, so please set your parameters carefuly. @@ -46,29 +70,89 @@ that the most of our comments are in german. 
>>> Command Line Interface <<< -This code is based on the pooler cpuminer 2.3.2 release and inherits +This code is based on the pooler cpuminer and inherits its command line interface and options. -a, --algo=ALGO specify the algorithm to use - heavy use to mine Heavycoin - mjollnir use to mine Mjollnircoin + allium use to mine Garlic + bastion use to mine Joincoin + bitcore use to mine Bitcore's Timetravel10 + blake use to mine Saffroncoin (Blake256) + blakecoin use to mine Old Blake 256 + blake2s use to mine Nevacoin (Blake2-S 256) + bmw use to mine Midnight + cryptolight use to mine AEON cryptonight variant 1 (MEM/2) + cryptonight use to mine original cryptonight + c11/flax use to mine Chaincoin and Flax + decred use to mine Decred 180 bytes Blake256-14 + deep use to mine Deepcoin + dmd-gr use to mine Diamond-Groestl + equihash use to mine ZEC, HUSH and KMD + exosis use to mine EXO + fresh use to mine Freshcoin fugue256 use to mine Fuguecoin groestl use to mine Groestlcoin - myr-gr use to mine Myriad-Groestl - jackpot use to mine Jackpotcoin - quark use to mine Quarkcoin - anime use to mine Animecoin + hsr use to mine Hshare + jackpot use to mine Sweepcoin + keccak use to mine Maxcoin + keccakc use to mine CreativeCoin + lbry use to mine LBRY Credits + luffa use to mine Joincoin + lyra2 use to mine CryptoCoin + lyra2v2 use to mine Monacoin + lyra2v3 use to mine Vertcoin + lyra2z use to mine Zerocoin (XZC) + monero use to mine Monero (XMR) + myr-gr use to mine Myriad-Groest + neoscrypt use to mine FeatherCoin, Trezarcoin, Orbitcoin, etc nist5 use to mine TalkCoin + penta use to mine Joincoin / Pentablake + phi1612 use to mine Seraph + phi2 use to mine LUXCoin + polytimos use to mine Polytimos + quark use to mine Quarkcoin + qubit use to mine Qubit + scrypt use to mine Scrypt coins (Litecoin, Dogecoin, etc) + scrypt:N use to mine Scrypt-N (:10 for 2048 iterations) + scrypt-jane use to mine Chacha coins like Cache and Ultracoin + s3 use to mine 1coin (ONE) + sha256t use to mine OneCoin (OC) + sha256q use to mine Pyrite + sia use to mine SIA + sib use to mine Sibcoin + skein use to mine Skeincoin + skein2 use to mine Woodcoin + skunk use to mine Signatum + sonoa use to mine Sono + stellite use to mine Stellite (a cryptonight variant) + timetravel use to mine MachineCoin + tribus use to mine Denarius + x11evo use to mine Revolver x11 use to mine DarkCoin + x12 use to mine GalaxyCash + x13 use to mine X13 + x14 use to mine X14 + x15 use to mine Halcyon + x16r use to mine Raven + x16s use to mine Pigeon and Eden + x17 use to mine X17 + vanilla use to mine Vanilla (Blake256) + veltor use to mine VeltorCoin + whirlpool use to mine Joincoin + wildkeccak use to mine Boolberry (Stratum only) + zr5 use to mine ZiftrCoin -d, --devices gives a comma separated list of CUDA device IDs to operate on. Device IDs start counting from 0! Alternatively give string names of your card like gtx780ti or gt640#2 (matching 2nd gt640 in the PC). 
- -f, --diff Divide difficulty by this factor (std is 1) \n\ - -v, --vote Heavycoin block vote (default: 512) - -o, --url=URL URL of mining server (default: " DEF_RPC_URL ") + -i, --intensity=N[,N] GPU threads per call 8-25 (2^N + F, default: 0=auto) + Decimals and multiple values are allowed for fine tuning + --cuda-schedule Set device threads scheduling mode (default: auto) + -f, --diff-factor Divide difficulty by this factor (default 1.0) + -m, --diff-multiplier Multiply difficulty by this value (default 1.0) + -o, --url=URL URL of mining server -O, --userpass=U:P username:password pair for mining server -u, --user=USERNAME username for mining server -p, --pass=PASSWORD password for mining server @@ -78,66 +162,394 @@ its command line interface and options. -r, --retries=N number of times to retry if a network call fails (default: retry indefinitely) -R, --retry-pause=N time to pause between retries, in seconds (default: 15) - -T, --timeout=N network timeout, in seconds (default: 270) + --shares-limit maximum shares to mine before exiting the program. + --time-limit maximum time [s] to mine before exiting the program. + -T, --timeout=N network timeout, in seconds (default: 300) -s, --scantime=N upper bound on time spent scanning current work when long polling is unavailable, in seconds (default: 5) + --submit-stale ignore stale job checks, may create more rejected shares + -n, --ndevs list cuda devices + -N, --statsavg number of samples used to display hashrate (default: 30) + --no-gbt disable getblocktemplate support (height check in solo) --no-longpoll disable X-Long-Polling support --no-stratum disable X-Stratum support -q, --quiet disable per-thread hashmeter output + --no-color disable colored output -D, --debug enable debug output -P, --protocol-dump verbose dump of protocol-level activities + -b, --api-bind=port IP:port for the miner API (default: 127.0.0.1:4068), 0 disabled + --api-remote Allow remote control, like pool switching, imply --api-allow=0/0 + --api-allow=... IP/mask of the allowed api client(s), 0/0 for all + --max-temp=N Only mine if gpu temp is less than specified value + --max-rate=N[KMG] Only mine if net hashrate is less than specified value + --max-diff=N Only mine if net difficulty is less than specified value + --max-log-rate Interval to reduce per gpu hashrate logs (default: 3) + --pstate=0 will force the Geforce 9xx to run in P0 P-State + --plimit=150W set the gpu power limit, allow multiple values for N cards + on windows this parameter use percentages (like OC tools) + --tlimit=85 Set the gpu thermal limit (windows only) + --keep-clocks prevent reset clocks and/or power limit on exit + --hide-diff Hide submitted shares diff and net difficulty -B, --background run the miner in the background --benchmark run in offline benchmark mode + --cputest debug hashes from cpu algorithms + --cpu-affinity set process affinity to specific cpu core(s) mask + --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest) -c, --config=FILE load a JSON-format configuration file + can be from an url with the http:// prefix -V, --version display version information and exit -h, --help display this help text and exit +Scrypt specific options: + -l, --launch-config gives the launch configuration for each kernel + in a comma separated list, one per device. + --interactive comma separated list of flags (0/1) specifying + which of the CUDA device you need to run at inter- + active frame rates (because it drives a display). 
+ -L, --lookup-gap Divides the per-hash memory requirement by this factor + by storing only every N'th value in the scratchpad. + Default is 1. + --texture-cache comma separated list of flags (0/1/2) specifying + which of the CUDA devices shall use the texture + cache for mining. Kepler devices may profit. + --no-autotune disable auto-tuning of kernel launch parameters + +CryptoNight specific options: + -l, --launch-config gives the launch configuration for each kernel + in a comma separated list, one per device. + --bfactor=[0-12] Run Cryptonight core kernel in smaller pieces, + From 0 (ui freeze) to 12 (smooth), win default is 11 + This is a per-device setting like the launch config. + +Wildkeccak specific: + -l, --launch-config gives the launch configuration for each kernel + in a comma separated list, one per device. + -k, --scratchpad url Url used to download the scratchpad cache. + + >>> Examples <<< Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system + ccminer -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <> -p <> -v 8 -ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <> -p <> -v 8 +Example for Heavycoin Mining on hvc.1gh.com with a dual gpu in your system + ccminer -t 2 -a heavy -o stratum+tcp://hvcpool.1gh.com:5333/ -u <> -p x -v 8 -Example for Heavycoin Mining on hvc.1gh.com with a dual gpu in your system +Example for Fuguecoin solo-mining with 4 gpu's in your system and a Fuguecoin-wallet running on localhost + ccminer -q -s 1 -t 4 -a fugue256 -o http://localhost:9089/ -u <> -p <> -ccminer.exe -t 2 -a heavy -o stratum+tcp://hvcpool.1gh.com:5333 -u <> -p x -v 8 +Example for Fuguecoin pool mining on dwarfpool.com with all your GPUs + ccminer -q -a fugue256 -o stratum+tcp://erebor.dwarfpool.com:3340/ -u YOURWALLETADDRESS.1 -p YOUREMAILADDRESS -Example for Fuguecoin solo-mining with 4 gpu's in your system and a Fuguecoin-wallet running on localhost +Example for Groestlcoin solo mining + ccminer -q -s 1 -a groestl -o http://127.0.0.1:1441/ -u USERNAME -p PASSWORD -ccminer.exe -q -s 1 -t 4 -a fugue256 -o http://localhost:9089 -u <> -p <> +Example for Boolberry + ccminer -a wildkeccak -o stratum+tcp://bbr.suprnova.cc:7777 -u tpruvot.donate -p x -k http://bbr.suprnova.cc/scratchpad.bin -l 64x360 +Example for Scrypt-N (2048) on Nicehash + ccminer -a scrypt:10 -o stratum+tcp://stratum.nicehash.com:3335 -u 3EujYFcoBzWvpUEvbe3obEG95mBuU88QBD -p x -Example for Fuguecoin pool mining on dwarfpool.com with all your GPUs +For solo-mining you typically use -o http://127.0.0.1:xxxx where xxxx represents +the rpcport number specified in your wallet's .conf file and you have to pass the same username +and password with -O (or -u -p) as specified in the wallet config. -ccminer.exe -q -a fugue256 -o stratum+tcp://erebor.dwarfpool.com:3340 -u YOURWALLETADDRESS.1 -p YOUREMAILADDRESS +The wallet must also be started with the -server option and/or with the server=1 flag in the .conf file +>>> Configuration files <<< -Example for Groestlcoin solo mining +With the -c parameter you can use a json config file to set your prefered settings. +An example is present in source tree, and is also the default one when no command line parameters are given. +This allow you to run the miner without batch/script. 
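To make that more concrete, a minimal sketch of such a config file is shown below. It assumes the JSON keys simply mirror the long command-line option names, and every value is a placeholder to adapt, not a recommendation:

    {
      "algo": "x11",
      "url": "stratum+tcp://pool.example.com:3533",
      "user": "YOUR_WALLET_OR_WORKER",
      "pass": "x",
      "intensity": 19
    }

Such a file would then be loaded with -c yourconfig.json (or, as noted in the option list above, from an http:// url).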
-ccminer.exe -q -s 1 -a groestl -o http://127.0.0.1:1441 -u USERNAME -p PASSWORD +>>> API and Monitoring <<< -For solo-mining you typically use -o 127.0.0.1:xxxx where xxxx represents -the RPC portnumber specified in your wallet's .conf file and you have to -pass the same username and password with -O as specified in the wallet's -.conf file. The wallet must also be started with the -server option and -the server flag in the wallet's .conf file set to 1 +With the -b parameter you can open your ccminer to your network, use -b 0.0.0.0:4068 if required. +On windows, setting 0.0.0.0 will ask firewall permissions on the first launch. Its normal. + +Default API feature is only enabled for localhost queries by default, on port 4068. + +You can test this api on linux with "telnet 4068" and type "help" to list the commands. +Default api format is delimited text. If required a php json wrapper is present in api/ folder. + +I plan to add a json format later, if requests are formatted in json too.. >>> Additional Notes <<< This code should be running on nVidia GPUs ranging from compute capability -3.0 up to compute capability 3.5. Support for Compute 2.0 has been dropped +3.0 up to compute capability 5.2. Support for Compute 2.0 has been dropped so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< + Jan. 30th 2019 v2.3.1 + Handle Lyra2v3 algo + Handle sha256q algo + Handle exosis algo + Handle blake2b standard algo + + June 23th 2018 v2.3 + Handle phi2 header variation for smart contracts + Handle monero, stellite, graft and cryptolight variants + Handle SonoA algo + + June 10th 2018 v2.2.6 + New phi2 algo for LUX + New allium algo for Garlic + + Apr. 02nd 2018 v2.2.5 + New x16r algo for Raven + New x16s algo for Pigeon and Eden + New x12 algo for Galaxycash + Equihash (SIMT) sync issues for the Volta generation + + Jan. 04th 2018 v2.2.4 + Improve lyra2v2 + Higher keccak default intensity + Drop SM 2.x support by default, for CUDA 9 and more recent + + Dec. 04th 2017 v2.2.3 + Polytimos Algo + Handle keccakc variant (with refreshed sha256d merkle) + Optimised keccak for SM5+, based on alexis improvements + + Oct. 09th 2017 v2.2.2 + Import and clean the hsr algo (x13 + custom hash) + Import and optimise phi algo from LuxCoin repository + Improve sib algo too for maxwell and pascal cards + Small fix to handle more than 9 cards on linux (-d 10+) + Attempt to free equihash memory "properly" + --submit-stale parameter for supernova pool (which change diff too fast) + + Sep. 01st 2017 v2.2.1 + Improve tribus algo on recent cards (up to +10%) + + Aug. 13th 2017 v2.2 + New skunk algo, using the heavy streebog algorithm + Enhance tribus algo (+10%) + equihash protocol enhancement on yiimp.ccminer.org and zpool.ca + + June 16th 2017 v2.1-tribus + Interface equihash algo with djeZo solver (from nheqminer 0.5c) + New api parameters (and multicast announces for local networks) + New tribus algo + + May. 14th 2017 v2.0 + Handle cryptonight, wildkeccak and cryptonight-lite + Add a serie of new algos: timetravel, bastion, hmq1725, sha256t + Import lyra2z from djm34 work... + Rework the common skein512 (used in most algos except skein ;) + Upgrade whirlpool algo with alexis version (2x faster) + Store the share diff of second nonce(s) in most algos + Hardware monitoring thread to get more accurate power readings + Small changes for the quiet mode & max-log-rate to reduce logs + Add bitcore and a compatible jha algo + + Dec. 
21th 2016 v1.8.4 + Improve streebog based algos, veltor and sib (from alexis work) + Blake2s greetly improved (3x), thanks to alexis too... + + Sep. 28th 2016 v1.8.3 + show intensity on startup for each cards + show-diff is now used by default, use --hide-diff if not wanted + + Sep. 22th 2016 v1.8.2 + lbry improvements by Alexis Provos + Prevent Windows hibernate while mining + veltor algo (basic implementation) + + Aug. 10th 2016 v1.8.1 + SIA Blake2-B Algo (getwork over stratum for Suprnova) + SIA Nanopool RPC (getwork over http) + Update also the older lyra2 with Nanashi version + + July 20th 2016 v1.8.0 + Pascal support with cuda 8 + lbry new multi sha / ripemd algo (LBC) + x11evo algo (XRE) + Lyra2v2, Neoscrypt and Decred improvements + Enhance windows NVAPI clock and power limits + Led support for mining/shares activity on windows + + May 18th 2016 v1.7.6 + Decred vote support + X17 cleanup and improvement + Add mining.ping stratum method and handle unknown methods + Implement a pool stats/benchmark mode (-p stats on yiimp) + Add --shares-limit parameter, can be used for benchmarks + + Mar. 13th 2016 v1.7.5 + Blake2S Algo (NEVA/OXEN) + + Feb. 28th 2016 v1.7.4 (1.7.3 was a preview, not official) + Decred simplified stratum (getwork over stratum) + Vanilla kernel by MrMad + Drop/Disable WhirlpoolX + + Feb. 11th 2016 v1.7.2 + Decred Algo (longpoll only) + Blake256 improvements/cleanup + + Jan. 26th 2016 v1.7.1 + Implement sib algo (X11 + Russian Streebog-512/GOST) + Whirlpool speed x2 with the midstate precompute + Small bug fixes about device ids mapping (and vendor names) + Add Vanilla algo (Blake256 8-rounds - double sha256) + + Nov. 06th 2015 v1.7 + Improve old devices compatibility (x11, lyra2v2, quark, qubit...) + Add windows support for SM 2.1 and drop SM 3.5 (x86) + Improve lyra2 (v1/v2) cuda implementations + Improve most common algos on SM5+ with sp blake kernel + Restore whirlpool algo (and whirlcoin variant) + Prepare algo/pool switch ability, trivial method + Add --benchmark alone to run a benchmark for all algos + Add --cuda-schedule parameter + Add --show-diff parameter, which display shares diff, + and is able to detect real solved blocks on pools. + + Aug. 28th 2015 v1.6.6 + Allow to load remote config with curl (-c http://...) + Add Lyra2REv2 algo (Vertcoin/Zoom) + Restore WhirlpoolX algo (VNL) + Drop Animecoin support + Add bmw (Midnight) algo + + July 06th 2015 v1.6.5-C11 + Nvml api power limits + Add chaincoin c11 algo (used by Flaxscript too) + Remove pluck algo + + June 23th 2015 v1.6.5 + Handle Ziftrcoin PoK solo mining + Basic compatibility with CUDA 7.0 (generally slower hashrate) + Show gpus vendor names on linux (windows test branch is pciutils) + Remove -v and -m short params specific to heavycoin + Add --diff-multiplier (-m) and rename --diff to --diff-factor (-f) + First steps to handle nvml application clocks and P0 on the GTX9xx + Various improvements on multipool and cmdline parameters + Optimize a bit qubit, deep, luffa, x11 and quark algos + + May 26th 2015 v1.6.4 + Implement multi-pool support (failover and time rotate) + try "ccminer -c pools.conf" to test the sample config + Update the API to allow remote pool switching and pool stats + Auto bind the api port to the first available when using default + Try to compute network difficulty on pools too (for most algos) + Drop Whirlpool and whirpoolx algos, no more used... 
+ + May 15th 2015 v1.6.3 + Import and adapt Neoscrypt from djm34 work (SM 5+ only) + Conditional mining options based on gpu temp, network diff and rate + background option implementation for windows too + "Multithreaded" devices (-d 0,0) intensity and stats changes + SM5+ Optimisation of skein based on sp/klaus method (+20%) + + Apr. 21th 2015 v1.6.2 + Import Scrypt, Scrypt:N and Scrypt-jane from Cudaminer + Add the --time-limit command line parameter + + Apr. 14th 2015 v1.6.1 + Add the Double Skein Algo for Woodcoin + Skein/Skein2 SM 3.0 devices support + + Mar. 27th 2015 v1.6.0 + Add the ZR5 Algo for Ziftcoin + Implement Skeincoin algo (skein + sha) + Import pluck (djm34) and whirlpoolx (alexis78) algos + Hashrate units based on hashing rate values (Hs/kHs/MHs/GHs) + Default config file (also help to debug without command line) + Various small fixes + + Feb. 11th 2015 v1.5.3 + Fix anime algo + Allow a default config file in user or ccminer folder + SM 2.1 windows binary (lyra2 and blake/blakecoin for the moment) + + Jan. 24th 2015 v1.5.2 + Allow per device intensity, example: -i 20,19.5 + Add process CPU priority and affinity mask parameters + Intelligent duplicate shares check feature (enabled if needed) + api: Fan RPM (windows), Cuda threads count, linux kernel ver. + More X11 optimisations from sp and KlausT + SM 3.0 enhancements + + Dec. 16th 2014 v1.5.1 + Add lyra2RE algo for Vertcoin based on djm34/vtc code + Multiple shares support (2 for the moment) + X11 optimisations (From klaust and sp-hash) + HTML5 WebSocket api compatibility (see api/websocket.htm) + Solo mode height checks with getblocktemplate rpc calls + + Nov. 27th 2014 v1.5.0 + Upgrade compat jansson to 2.6 (for windows) + Add pool mining.set_extranonce support + Allow intermediate intensity with decimals + Update prebuilt x86 openssl lib to 1.0.1i + Fix heavy algo on linux (broken since 1.4) + Some internal changes to use the C++ compiler + New API 1.2 with some new commands (read only) + Add some of sp x11/x15 optimisations (and tsiv x13) + + Nov. 15th 2014 v1.4.9 + Support of nvml and nvapi(windows) to monitor gpus + Fix (again) displayed hashrate for multi gpus systems + Average is now made by card (30 scans of the card) + Final API v1.1 (new fields + histo command) + Add support of telnet queries "telnet 127.0.0.1 4068" + add histo api command to get performance debug details + Add a rig sample php ui using json wrapper (php) + Restore quark/jackpot previous speed (differently) + + Nov. 12th 2014 v1.4.8 + Add a basic API and a sample php json wrapper + Add statsavg (def 20) and api-bind parameters + + Nov. 11th 2014 v1.4.7 + Average hashrate (based on the 20 last scans) + Rewrite blake algo + Add the -i (gpu threads/intensity parameter) + Add some X11 optimisations based on sp_ commits + Fix quark reported hashrate and benchmark mode for some algos + Enhance json config file param (int/float/false) (-c config.json) + Update windows prebuilt curl to 7.38.0 + + Oct. 26th 2014 v1.4.6 + Add S3 algo reusing existing code (onecoin) + Small X11 (simd512) enhancement + + Oct. 20th 2014 v1.4.5 + Add keccak algo from djm34 repo (maxcoin) + Curl 7.35 and OpenSSL are now included in the binary (and win tree) + Enhance windows terminal support (--help was broken) + + Sep. 27th 2014 v1.4.4 + First SM 5.2 Release (GTX 970 & 980) + CUDA Runtime included in binary + Colors enabled by default + + Sep. 10th 2014 v1.4.3 + Add algos from djm34 repo (deep, doom, qubit) + Goalcoin seems to be dead, not imported. 
+ Create also the pentablake algo (5x Blake 512) + + Sept 6th 2014 Almost twice the speed on blake256 algos with the "midstate" cache + + Sep. 1st 2014 add X17, optimized x15 and whirl + add blake (256 variant) + color support on Windows, + remove some dll dependencies (pthreads, msvcp) + + Aug. 18th 2014 add X14, X15, Whirl, and Fresh algos, + also add colors and nvprof cmd line support June 15th 2014 add X13 and Diamond Groestl support. Thanks to tsiv and to Bombadil for the contributions! @@ -203,7 +615,12 @@ features. Notable contributors to this application are: -Christian Buchner, Christian H. (Germany): CUDA implementation +Christian Buchner, Christian H. (Germany): Initial CUDA implementation + +djm34, tsiv, sp and klausT for cuda algos implementation and optimisation + +Tanguy Pruvot : 750Ti tuning, blake, colors, zr5, skein, general code cleanup + API monitoring, linux Config/Makefile and vstudio libs... and also many thanks to anyone else who contributed to the original cpuminer application (Jeff Garzik, pooler), it's original HVC-fork @@ -216,3 +633,4 @@ With kind regards, Christian Buchner ( Christian.Buchner@gmail.com ) Christian H. ( Chris84 ) + Tanguy Pruvot ( tpruvot@github ) diff --git a/aclocal.m4 b/aclocal.m4 deleted file mode 100644 index 911786cc97..0000000000 --- a/aclocal.m4 +++ /dev/null @@ -1,1319 +0,0 @@ -# generated automatically by aclocal 1.11.3 -*- Autoconf -*- - -# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, -# 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, -# Inc. -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -m4_ifndef([AC_AUTOCONF_VERSION], - [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl -m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.68],, -[m4_warning([this file was generated for autoconf 2.68. -You have another version of autoconf. It may work, but is not guaranteed to. -If you have problems, you may need to regenerate the build system entirely. -To do so, use the procedure documented by the package, typically `autoreconf'.])]) - -# LIBCURL_CHECK_CONFIG ([DEFAULT-ACTION], [MINIMUM-VERSION], -# [ACTION-IF-YES], [ACTION-IF-NO]) -# ---------------------------------------------------------- -# David Shaw May-09-2006 -# -# Checks for libcurl. DEFAULT-ACTION is the string yes or no to -# specify whether to default to --with-libcurl or --without-libcurl. -# If not supplied, DEFAULT-ACTION is yes. MINIMUM-VERSION is the -# minimum version of libcurl to accept. Pass the version as a regular -# version number like 7.10.1. If not supplied, any version is -# accepted. ACTION-IF-YES is a list of shell commands to run if -# libcurl was successfully found and passed the various tests. -# ACTION-IF-NO is a list of shell commands that are run otherwise. -# Note that using --without-libcurl does run ACTION-IF-NO. -# -# This macro #defines HAVE_LIBCURL if a working libcurl setup is -# found, and sets @LIBCURL@ and @LIBCURL_CPPFLAGS@ to the necessary -# values. Other useful defines are LIBCURL_FEATURE_xxx where xxx are -# the various features supported by libcurl, and LIBCURL_PROTOCOL_yyy -# where yyy are the various protocols supported by libcurl. 
Both xxx -# and yyy are capitalized. See the list of AH_TEMPLATEs at the top of -# the macro for the complete list of possible defines. Shell -# variables $libcurl_feature_xxx and $libcurl_protocol_yyy are also -# defined to 'yes' for those features and protocols that were found. -# Note that xxx and yyy keep the same capitalization as in the -# curl-config list (e.g. it's "HTTP" and not "http"). -# -# Users may override the detected values by doing something like: -# LIBCURL="-lcurl" LIBCURL_CPPFLAGS="-I/usr/myinclude" ./configure -# -# For the sake of sanity, this macro assumes that any libcurl that is -# found is after version 7.7.2, the first version that included the -# curl-config script. Note that it is very important for people -# packaging binary versions of libcurl to include this script! -# Without curl-config, we can only guess what protocols are available, -# or use curl_version_info to figure it out at runtime. - -AC_DEFUN([LIBCURL_CHECK_CONFIG], -[ - AH_TEMPLATE([LIBCURL_FEATURE_SSL],[Defined if libcurl supports SSL]) - AH_TEMPLATE([LIBCURL_FEATURE_KRB4],[Defined if libcurl supports KRB4]) - AH_TEMPLATE([LIBCURL_FEATURE_IPV6],[Defined if libcurl supports IPv6]) - AH_TEMPLATE([LIBCURL_FEATURE_LIBZ],[Defined if libcurl supports libz]) - AH_TEMPLATE([LIBCURL_FEATURE_ASYNCHDNS],[Defined if libcurl supports AsynchDNS]) - AH_TEMPLATE([LIBCURL_FEATURE_IDN],[Defined if libcurl supports IDN]) - AH_TEMPLATE([LIBCURL_FEATURE_SSPI],[Defined if libcurl supports SSPI]) - AH_TEMPLATE([LIBCURL_FEATURE_NTLM],[Defined if libcurl supports NTLM]) - - AH_TEMPLATE([LIBCURL_PROTOCOL_HTTP],[Defined if libcurl supports HTTP]) - AH_TEMPLATE([LIBCURL_PROTOCOL_HTTPS],[Defined if libcurl supports HTTPS]) - AH_TEMPLATE([LIBCURL_PROTOCOL_FTP],[Defined if libcurl supports FTP]) - AH_TEMPLATE([LIBCURL_PROTOCOL_FTPS],[Defined if libcurl supports FTPS]) - AH_TEMPLATE([LIBCURL_PROTOCOL_FILE],[Defined if libcurl supports FILE]) - AH_TEMPLATE([LIBCURL_PROTOCOL_TELNET],[Defined if libcurl supports TELNET]) - AH_TEMPLATE([LIBCURL_PROTOCOL_LDAP],[Defined if libcurl supports LDAP]) - AH_TEMPLATE([LIBCURL_PROTOCOL_DICT],[Defined if libcurl supports DICT]) - AH_TEMPLATE([LIBCURL_PROTOCOL_TFTP],[Defined if libcurl supports TFTP]) - AH_TEMPLATE([LIBCURL_PROTOCOL_RTSP],[Defined if libcurl supports RTSP]) - AH_TEMPLATE([LIBCURL_PROTOCOL_POP3],[Defined if libcurl supports POP3]) - AH_TEMPLATE([LIBCURL_PROTOCOL_IMAP],[Defined if libcurl supports IMAP]) - AH_TEMPLATE([LIBCURL_PROTOCOL_SMTP],[Defined if libcurl supports SMTP]) - - AC_ARG_WITH(libcurl, - AC_HELP_STRING([--with-libcurl=PREFIX],[look for the curl library in PREFIX/lib and headers in PREFIX/include]), - [_libcurl_with=$withval],[_libcurl_with=ifelse([$1],,[yes],[$1])]) - - if test "$_libcurl_with" != "no" ; then - - AC_PROG_AWK - - _libcurl_version_parse="eval $AWK '{split(\$NF,A,\".\"); X=256*256*A[[1]]+256*A[[2]]+A[[3]]; print X;}'" - - _libcurl_try_link=yes - - if test -d "$_libcurl_with" ; then - LIBCURL_CPPFLAGS="-I$withval/include" - _libcurl_ldflags="-L$withval/lib" - AC_PATH_PROG([_libcurl_config],[curl-config],[], - ["$withval/bin"]) - else - AC_PATH_PROG([_libcurl_config],[curl-config],[],[$PATH]) - fi - - if test x$_libcurl_config != "x" ; then - AC_CACHE_CHECK([for the version of libcurl], - [libcurl_cv_lib_curl_version], - [libcurl_cv_lib_curl_version=`$_libcurl_config --version | $AWK '{print $[]2}'`]) - - _libcurl_version=`echo $libcurl_cv_lib_curl_version | $_libcurl_version_parse` - _libcurl_wanted=`echo ifelse([$2],,[0],[$2]) | 
$_libcurl_version_parse` - - if test $_libcurl_wanted -gt 0 ; then - AC_CACHE_CHECK([for libcurl >= version $2], - [libcurl_cv_lib_version_ok], - [ - if test $_libcurl_version -ge $_libcurl_wanted ; then - libcurl_cv_lib_version_ok=yes - else - libcurl_cv_lib_version_ok=no - fi - ]) - fi - - if test $_libcurl_wanted -eq 0 || test x$libcurl_cv_lib_version_ok = xyes ; then - if test x"$LIBCURL_CPPFLAGS" = "x" ; then - LIBCURL_CPPFLAGS=`$_libcurl_config --cflags` - fi - if test x"$LIBCURL" = "x" ; then - LIBCURL=`$_libcurl_config --libs` - - # This is so silly, but Apple actually has a bug in their - # curl-config script. Fixed in Tiger, but there are still - # lots of Panther installs around. - case "${host}" in - powerpc-apple-darwin7*) - LIBCURL=`echo $LIBCURL | sed -e 's|-arch i386||g'` - ;; - esac - fi - - # All curl-config scripts support --feature - _libcurl_features=`$_libcurl_config --feature` - - # Is it modern enough to have --protocols? (7.12.4) - if test $_libcurl_version -ge 461828 ; then - _libcurl_protocols=`$_libcurl_config --protocols` - fi - else - _libcurl_try_link=no - fi - - unset _libcurl_wanted - fi - - if test $_libcurl_try_link = yes ; then - - # we didn't find curl-config, so let's see if the user-supplied - # link line (or failing that, "-lcurl") is enough. - LIBCURL=${LIBCURL-"$_libcurl_ldflags -lcurl"} - - AC_CACHE_CHECK([whether libcurl is usable], - [libcurl_cv_lib_curl_usable], - [ - _libcurl_save_cppflags=$CPPFLAGS - CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS" - _libcurl_save_libs=$LIBS - LIBS="$LIBCURL $LIBS" - - AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ],[ -/* Try and use a few common options to force a failure if we are - missing symbols or can't link. */ -int x; -curl_easy_setopt(NULL,CURLOPT_URL,NULL); -x=CURL_ERROR_SIZE; -x=CURLOPT_WRITEFUNCTION; -x=CURLOPT_FILE; -x=CURLOPT_ERRORBUFFER; -x=CURLOPT_STDERR; -x=CURLOPT_VERBOSE; -])],libcurl_cv_lib_curl_usable=yes,libcurl_cv_lib_curl_usable=no) - - CPPFLAGS=$_libcurl_save_cppflags - LIBS=$_libcurl_save_libs - unset _libcurl_save_cppflags - unset _libcurl_save_libs - ]) - - if test $libcurl_cv_lib_curl_usable = yes ; then - - # Does curl_free() exist in this version of libcurl? 
- # If not, fake it with free() - - _libcurl_save_cppflags=$CPPFLAGS - CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS" - _libcurl_save_libs=$LIBS - LIBS="$LIBS $LIBCURL" - - AC_CHECK_FUNC(curl_free,, - AC_DEFINE(curl_free,free, - [Define curl_free() as free() if our version of curl lacks curl_free.])) - - CPPFLAGS=$_libcurl_save_cppflags - LIBS=$_libcurl_save_libs - unset _libcurl_save_cppflags - unset _libcurl_save_libs - - AC_DEFINE(HAVE_LIBCURL,1, - [Define to 1 if you have a functional curl library.]) - AC_SUBST(LIBCURL_CPPFLAGS) - AC_SUBST(LIBCURL) - - for _libcurl_feature in $_libcurl_features ; do - AC_DEFINE_UNQUOTED(AS_TR_CPP(libcurl_feature_$_libcurl_feature),[1]) - eval AS_TR_SH(libcurl_feature_$_libcurl_feature)=yes - done - - if test "x$_libcurl_protocols" = "x" ; then - - # We don't have --protocols, so just assume that all - # protocols are available - _libcurl_protocols="HTTP FTP FILE TELNET LDAP DICT TFTP" - - if test x$libcurl_feature_SSL = xyes ; then - _libcurl_protocols="$_libcurl_protocols HTTPS" - - # FTPS wasn't standards-compliant until version - # 7.11.0 (0x070b00 == 461568) - if test $_libcurl_version -ge 461568; then - _libcurl_protocols="$_libcurl_protocols FTPS" - fi - fi - - # RTSP, IMAP, POP3 and SMTP were added in - # 7.20.0 (0x071400 == 463872) - if test $_libcurl_version -ge 463872; then - _libcurl_protocols="$_libcurl_protocols RTSP IMAP POP3 SMTP" - fi - fi - - for _libcurl_protocol in $_libcurl_protocols ; do - AC_DEFINE_UNQUOTED(AS_TR_CPP(libcurl_protocol_$_libcurl_protocol),[1]) - eval AS_TR_SH(libcurl_protocol_$_libcurl_protocol)=yes - done - else - unset LIBCURL - unset LIBCURL_CPPFLAGS - fi - fi - - unset _libcurl_try_link - unset _libcurl_version_parse - unset _libcurl_config - unset _libcurl_feature - unset _libcurl_features - unset _libcurl_protocol - unset _libcurl_protocols - unset _libcurl_version - unset _libcurl_ldflags - fi - - if test x$_libcurl_with = xno || test x$libcurl_cv_lib_curl_usable != xyes ; then - # This is the IF-NO path - ifelse([$4],,:,[$4]) - else - # This is the IF-YES path - ifelse([$3],,:,[$3]) - fi - - unset _libcurl_with -])dnl - -# Copyright (C) 2002, 2003, 2005, 2006, 2007, 2008, 2011 Free Software -# Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 1 - -# AM_AUTOMAKE_VERSION(VERSION) -# ---------------------------- -# Automake X.Y traces this macro to ensure aclocal.m4 has been -# generated from the m4 files accompanying Automake X.Y. -# (This private macro should not be called outside this file.) -AC_DEFUN([AM_AUTOMAKE_VERSION], -[am__api_version='1.11' -dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to -dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.11.3], [], - [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl -]) - -# _AM_AUTOCONF_VERSION(VERSION) -# ----------------------------- -# aclocal traces this macro to find the Autoconf version. -# This is a private macro too. Using m4_define simplifies -# the logic in aclocal, which can simply ignore this definition. -m4_define([_AM_AUTOCONF_VERSION], []) - -# AM_SET_CURRENT_AUTOMAKE_VERSION -# ------------------------------- -# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. -# This function is AC_REQUIREd by AM_INIT_AUTOMAKE. 
-AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.11.3])dnl -m4_ifndef([AC_AUTOCONF_VERSION], - [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl -_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))]) - -# Figure out how to run the assembler. -*- Autoconf -*- - -# Copyright (C) 2001, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 5 - -# AM_PROG_AS -# ---------- -AC_DEFUN([AM_PROG_AS], -[# By default we simply use the C compiler to build assembly code. -AC_REQUIRE([AC_PROG_CC]) -test "${CCAS+set}" = set || CCAS=$CC -test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS -AC_ARG_VAR([CCAS], [assembler compiler command (defaults to CC)]) -AC_ARG_VAR([CCASFLAGS], [assembler compiler flags (defaults to CFLAGS)]) -_AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl -]) - -# AM_AUX_DIR_EXPAND -*- Autoconf -*- - -# Copyright (C) 2001, 2003, 2005, 2011 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 1 - -# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets -# $ac_aux_dir to `$srcdir/foo'. In other projects, it is set to -# `$srcdir', `$srcdir/..', or `$srcdir/../..'. -# -# Of course, Automake must honor this variable whenever it calls a -# tool from the auxiliary directory. The problem is that $srcdir (and -# therefore $ac_aux_dir as well) can be either absolute or relative, -# depending on how configure is run. This is pretty annoying, since -# it makes $ac_aux_dir quite unusable in subdirectories: in the top -# source directory, any form will work fine, but in subdirectories a -# relative path needs to be adjusted first. -# -# $ac_aux_dir/missing -# fails when called from a subdirectory if $ac_aux_dir is relative -# $top_srcdir/$ac_aux_dir/missing -# fails if $ac_aux_dir is absolute, -# fails when called from a subdirectory in a VPATH build with -# a relative $ac_aux_dir -# -# The reason of the latter failure is that $top_srcdir and $ac_aux_dir -# are both prefixed by $srcdir. In an in-source build this is usually -# harmless because $srcdir is `.', but things will broke when you -# start a VPATH build or use an absolute $srcdir. -# -# So we could use something similar to $top_srcdir/$ac_aux_dir/missing, -# iff we strip the leading $srcdir from $ac_aux_dir. That would be: -# am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"` -# and then we would define $MISSING as -# MISSING="\${SHELL} $am_aux_dir/missing" -# This will work as long as MISSING is not called from configure, because -# unfortunately $(top_srcdir) has no meaning in configure. -# However there are other variables, like CC, which are often used in -# configure, and could therefore not use this "fixed" $ac_aux_dir. -# -# Another solution, used here, is to always expand $ac_aux_dir to an -# absolute PATH. The drawback is that using absolute paths prevent a -# configured tree to be moved without reconfiguration. - -AC_DEFUN([AM_AUX_DIR_EXPAND], -[dnl Rely on autoconf to set up CDPATH properly. 
-AC_PREREQ([2.50])dnl -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` -]) - -# AM_CONDITIONAL -*- Autoconf -*- - -# Copyright (C) 1997, 2000, 2001, 2003, 2004, 2005, 2006, 2008 -# Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 9 - -# AM_CONDITIONAL(NAME, SHELL-CONDITION) -# ------------------------------------- -# Define a conditional. -AC_DEFUN([AM_CONDITIONAL], -[AC_PREREQ(2.52)dnl - ifelse([$1], [TRUE], [AC_FATAL([$0: invalid condition: $1])], - [$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl -AC_SUBST([$1_TRUE])dnl -AC_SUBST([$1_FALSE])dnl -_AM_SUBST_NOTMAKE([$1_TRUE])dnl -_AM_SUBST_NOTMAKE([$1_FALSE])dnl -m4_define([_AM_COND_VALUE_$1], [$2])dnl -if $2; then - $1_TRUE= - $1_FALSE='#' -else - $1_TRUE='#' - $1_FALSE= -fi -AC_CONFIG_COMMANDS_PRE( -[if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then - AC_MSG_ERROR([[conditional "$1" was never defined. -Usually this means the macro was only invoked conditionally.]]) -fi])]) - -# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2009, -# 2010, 2011 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 12 - -# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be -# written in clear, in which case automake, when reading aclocal.m4, -# will think it sees a *use*, and therefore will trigger all it's -# C support machinery. Also note that it means that autoscan, seeing -# CC etc. in the Makefile, will ask for an AC_PROG_CC use... - - -# _AM_DEPENDENCIES(NAME) -# ---------------------- -# See how the compiler implements dependency checking. -# NAME is "CC", "CXX", "GCJ", or "OBJC". -# We try a few techniques and use that to set a single cache variable. -# -# We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was -# modified to invoke _AM_DEPENDENCIES(CC); we would have a circular -# dependency, and given that the user is not expected to run this macro, -# just rely on AC_PROG_CC. -AC_DEFUN([_AM_DEPENDENCIES], -[AC_REQUIRE([AM_SET_DEPDIR])dnl -AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl -AC_REQUIRE([AM_MAKE_INCLUDE])dnl -AC_REQUIRE([AM_DEP_TRACK])dnl - -ifelse([$1], CC, [depcc="$CC" am_compiler_list=], - [$1], CXX, [depcc="$CXX" am_compiler_list=], - [$1], OBJC, [depcc="$OBJC" am_compiler_list='gcc3 gcc'], - [$1], UPC, [depcc="$UPC" am_compiler_list=], - [$1], GCJ, [depcc="$GCJ" am_compiler_list='gcc3 gcc'], - [depcc="$$1" am_compiler_list=]) - -AC_CACHE_CHECK([dependency style of $depcc], - [am_cv_$1_dependencies_compiler_type], -[if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then - # We make a subdir and do the tests there. Otherwise we can end up - # making bogus files that we don't know about and never remove. For - # instance it was reported that on HP-UX the gcc test will end up - # making a dummy file named `D' -- because `-MD' means `put the output - # in D'. - rm -rf conftest.dir - mkdir conftest.dir - # Copy depcomp to subdir because otherwise we won't find it if we're - # using a relative directory. - cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. 
For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. - mkdir sub - - am_cv_$1_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp` - fi - am__universal=false - m4_case([$1], [CC], - [case " $depcc " in #( - *\ -arch\ *\ -arch\ *) am__universal=true ;; - esac], - [CXX], - [case " $depcc " in #( - *\ -arch\ *\ -arch\ *) am__universal=true ;; - esac]) - - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with - # Solaris 8's {/usr,}/bin/sh. - touch sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - - # We check with `-c' and `-o' for the sake of the "dashmstdout" - # mode. It turns out that the SunPro C++ compiler does not properly - # handle `-M -o', and we need to detect this. Also, some Intel - # versions had trouble with output in subdirs - am__obj=sub/conftest.${OBJEXT-o} - am__minus_obj="-o $am__obj" - case $depmode in - gcc) - # This depmode causes a compiler race in universal mode. - test "$am__universal" = false || continue - ;; - nosideeffect) - # after this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested - if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - msvc7 | msvc7msys | msvisualcpp | msvcmsys) - # This compiler won't grok `-c -o', but also, the minuso test has - # not run yet. These depmodes are late enough in the game, and - # so weak that their functioning should not be impacted. - am__obj=conftest.${OBJEXT-o} - am__minus_obj= - ;; - none) break ;; - esac - if depmode=$depmode \ - source=sub/conftest.c object=$am__obj \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep $am__obj sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_$1_dependencies_compiler_type=$depmode - break - fi - fi - done - - cd .. 
- rm -rf conftest.dir -else - am_cv_$1_dependencies_compiler_type=none -fi -]) -AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type]) -AM_CONDITIONAL([am__fastdep$1], [ - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_$1_dependencies_compiler_type" = gcc3]) -]) - - -# AM_SET_DEPDIR -# ------------- -# Choose a directory name for dependency files. -# This macro is AC_REQUIREd in _AM_DEPENDENCIES -AC_DEFUN([AM_SET_DEPDIR], -[AC_REQUIRE([AM_SET_LEADING_DOT])dnl -AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl -]) - - -# AM_DEP_TRACK -# ------------ -AC_DEFUN([AM_DEP_TRACK], -[AC_ARG_ENABLE(dependency-tracking, -[ --disable-dependency-tracking speeds up one-time build - --enable-dependency-tracking do not reject slow dependency extractors]) -if test "x$enable_dependency_tracking" != xno; then - am_depcomp="$ac_aux_dir/depcomp" - AMDEPBACKSLASH='\' - am__nodep='_no' -fi -AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno]) -AC_SUBST([AMDEPBACKSLASH])dnl -_AM_SUBST_NOTMAKE([AMDEPBACKSLASH])dnl -AC_SUBST([am__nodep])dnl -_AM_SUBST_NOTMAKE([am__nodep])dnl -]) - -# Generate code to set up dependency tracking. -*- Autoconf -*- - -# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2008 -# Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -#serial 5 - -# _AM_OUTPUT_DEPENDENCY_COMMANDS -# ------------------------------ -AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], -[{ - # Autoconf 2.62 quotes --file arguments for eval, but not when files - # are listed without --file. Let's play safe and only enable the eval - # if we detect the quoting. - case $CONFIG_FILES in - *\'*) eval set x "$CONFIG_FILES" ;; - *) set x $CONFIG_FILES ;; - esac - shift - for mf - do - # Strip MF so we end up with the name of the file. - mf=`echo "$mf" | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile or not. - # We used to match only the files named `Makefile.in', but - # some people rename them; so instead we look at the file content. - # Grep'ing the first line is not enough: some people post-process - # each Makefile.in and add a new line on top of each file to say so. - # Grep'ing the whole file is not good either: AIX grep has a line - # limit of 2048, but all sed's we know have understand at least 4000. - if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then - dirpart=`AS_DIRNAME("$mf")` - else - continue - fi - # Extract the definition of DEPDIR, am__include, and am__quote - # from the Makefile without running `make'. - DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` - test -z "$DEPDIR" && continue - am__include=`sed -n 's/^am__include = //p' < "$mf"` - test -z "am__include" && continue - am__quote=`sed -n 's/^am__quote = //p' < "$mf"` - # When using ansi2knr, U may be empty or an underscore; expand it - U=`sed -n 's/^U = //p' < "$mf"` - # Find all dependency output files, they are included files with - # $(DEPDIR) in their names. We invoke sed twice because it is the - # simplest approach to changing $(DEPDIR) to its actual value in the - # expansion. - for file in `sed -n " - s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ - sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do - # Make sure the directory exists. 
- test -f "$dirpart/$file" && continue - fdir=`AS_DIRNAME(["$file"])` - AS_MKDIR_P([$dirpart/$fdir]) - # echo "creating $dirpart/$file" - echo '# dummy' > "$dirpart/$file" - done - done -} -])# _AM_OUTPUT_DEPENDENCY_COMMANDS - - -# AM_OUTPUT_DEPENDENCY_COMMANDS -# ----------------------------- -# This macro should only be invoked once -- use via AC_REQUIRE. -# -# This code is only required when automatic dependency tracking -# is enabled. FIXME. This creates each `.P' file that we will -# need in order to bootstrap the dependency handling code. -AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], -[AC_CONFIG_COMMANDS([depfiles], - [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS], - [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"]) -]) - -# Do all the work for Automake. -*- Autoconf -*- - -# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, -# 2005, 2006, 2008, 2009 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 16 - -# This macro actually does too much. Some checks are only needed if -# your package does certain things. But this isn't really a big deal. - -# AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE]) -# AM_INIT_AUTOMAKE([OPTIONS]) -# ----------------------------------------------- -# The call with PACKAGE and VERSION arguments is the old style -# call (pre autoconf-2.50), which is being phased out. PACKAGE -# and VERSION should now be passed to AC_INIT and removed from -# the call to AM_INIT_AUTOMAKE. -# We support both call styles for the transition. After -# the next Automake release, Autoconf can make the AC_INIT -# arguments mandatory, and then we can depend on a new Autoconf -# release and drop the old call support. -AC_DEFUN([AM_INIT_AUTOMAKE], -[AC_PREREQ([2.62])dnl -dnl Autoconf wants to disallow AM_ names. We explicitly allow -dnl the ones we care about. -m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl -AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl -AC_REQUIRE([AC_PROG_INSTALL])dnl -if test "`cd $srcdir && pwd`" != "`pwd`"; then - # Use -I$(srcdir) only when $(srcdir) != ., so that make's output - # is not polluted with repeated "-I." - AC_SUBST([am__isrc], [' -I$(srcdir)'])_AM_SUBST_NOTMAKE([am__isrc])dnl - # test to see if srcdir already configured - if test -f $srcdir/config.status; then - AC_MSG_ERROR([source directory already configured; run "make distclean" there first]) - fi -fi - -# test whether we have cygpath -if test -z "$CYGPATH_W"; then - if (cygpath --version) >/dev/null 2>/dev/null; then - CYGPATH_W='cygpath -w' - else - CYGPATH_W=echo - fi -fi -AC_SUBST([CYGPATH_W]) - -# Define the identity of the package. -dnl Distinguish between old-style and new-style calls. -m4_ifval([$2], -[m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl - AC_SUBST([PACKAGE], [$1])dnl - AC_SUBST([VERSION], [$2])], -[_AM_SET_OPTIONS([$1])dnl -dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT. -m4_if(m4_ifdef([AC_PACKAGE_NAME], 1)m4_ifdef([AC_PACKAGE_VERSION], 1), 11,, - [m4_fatal([AC_INIT should be called with package and version arguments])])dnl - AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl - AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl - -_AM_IF_OPTION([no-define],, -[AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package]) - AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])dnl - -# Some tools Automake needs. 
-AC_REQUIRE([AM_SANITY_CHECK])dnl -AC_REQUIRE([AC_ARG_PROGRAM])dnl -AM_MISSING_PROG(ACLOCAL, aclocal-${am__api_version}) -AM_MISSING_PROG(AUTOCONF, autoconf) -AM_MISSING_PROG(AUTOMAKE, automake-${am__api_version}) -AM_MISSING_PROG(AUTOHEADER, autoheader) -AM_MISSING_PROG(MAKEINFO, makeinfo) -AC_REQUIRE([AM_PROG_INSTALL_SH])dnl -AC_REQUIRE([AM_PROG_INSTALL_STRIP])dnl -AC_REQUIRE([AM_PROG_MKDIR_P])dnl -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. -AC_REQUIRE([AC_PROG_AWK])dnl -AC_REQUIRE([AC_PROG_MAKE_SET])dnl -AC_REQUIRE([AM_SET_LEADING_DOT])dnl -_AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])], - [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])], - [_AM_PROG_TAR([v7])])]) -_AM_IF_OPTION([no-dependencies],, -[AC_PROVIDE_IFELSE([AC_PROG_CC], - [_AM_DEPENDENCIES(CC)], - [define([AC_PROG_CC], - defn([AC_PROG_CC])[_AM_DEPENDENCIES(CC)])])dnl -AC_PROVIDE_IFELSE([AC_PROG_CXX], - [_AM_DEPENDENCIES(CXX)], - [define([AC_PROG_CXX], - defn([AC_PROG_CXX])[_AM_DEPENDENCIES(CXX)])])dnl -AC_PROVIDE_IFELSE([AC_PROG_OBJC], - [_AM_DEPENDENCIES(OBJC)], - [define([AC_PROG_OBJC], - defn([AC_PROG_OBJC])[_AM_DEPENDENCIES(OBJC)])])dnl -]) -_AM_IF_OPTION([silent-rules], [AC_REQUIRE([AM_SILENT_RULES])])dnl -dnl The `parallel-tests' driver may need to know about EXEEXT, so add the -dnl `am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This macro -dnl is hooked onto _AC_COMPILER_EXEEXT early, see below. -AC_CONFIG_COMMANDS_PRE(dnl -[m4_provide_if([_AM_COMPILER_EXEEXT], - [AM_CONDITIONAL([am__EXEEXT], [test -n "$EXEEXT"])])])dnl -]) - -dnl Hook into `_AC_COMPILER_EXEEXT' early to learn its expansion. Do not -dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further -dnl mangled by Autoconf and run in a shell conditional statement. -m4_define([_AC_COMPILER_EXEEXT], -m4_defn([_AC_COMPILER_EXEEXT])[m4_provide([_AM_COMPILER_EXEEXT])]) - - -# When config.status generates a header, we must update the stamp-h file. -# This file resides in the same directory as the config header -# that is generated. The stamp files are numbered to have different names. - -# Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the -# loop where config.status creates the headers, so we can generate -# our stamp files there. -AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK], -[# Compute $1's index in $config_headers. -_am_arg=$1 -_am_stamp_count=1 -for _am_header in $config_headers :; do - case $_am_header in - $_am_arg | $_am_arg:* ) - break ;; - * ) - _am_stamp_count=`expr $_am_stamp_count + 1` ;; - esac -done -echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count]) - -# Copyright (C) 2001, 2003, 2005, 2008, 2011 Free Software Foundation, -# Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 1 - -# AM_PROG_INSTALL_SH -# ------------------ -# Define $install_sh. -AC_DEFUN([AM_PROG_INSTALL_SH], -[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -if test x"${install_sh}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; - *) - install_sh="\${SHELL} $am_aux_dir/install-sh" - esac -fi -AC_SUBST(install_sh)]) - -# Copyright (C) 2003, 2005 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. 
- -# serial 2 - -# Check whether the underlying file-system supports filenames -# with a leading dot. For instance MS-DOS doesn't. -AC_DEFUN([AM_SET_LEADING_DOT], -[rm -rf .tst 2>/dev/null -mkdir .tst 2>/dev/null -if test -d .tst; then - am__leading_dot=. -else - am__leading_dot=_ -fi -rmdir .tst 2>/dev/null -AC_SUBST([am__leading_dot])]) - -# Add --enable-maintainer-mode option to configure. -*- Autoconf -*- -# From Jim Meyering - -# Copyright (C) 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, -# 2011 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 5 - -# AM_MAINTAINER_MODE([DEFAULT-MODE]) -# ---------------------------------- -# Control maintainer-specific portions of Makefiles. -# Default is to disable them, unless `enable' is passed literally. -# For symmetry, `disable' may be passed as well. Anyway, the user -# can override the default with the --enable/--disable switch. -AC_DEFUN([AM_MAINTAINER_MODE], -[m4_case(m4_default([$1], [disable]), - [enable], [m4_define([am_maintainer_other], [disable])], - [disable], [m4_define([am_maintainer_other], [enable])], - [m4_define([am_maintainer_other], [enable]) - m4_warn([syntax], [unexpected argument to AM@&t@_MAINTAINER_MODE: $1])]) -AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles]) - dnl maintainer-mode's default is 'disable' unless 'enable' is passed - AC_ARG_ENABLE([maintainer-mode], -[ --][am_maintainer_other][-maintainer-mode am_maintainer_other make rules and dependencies not useful - (and sometimes confusing) to the casual installer], - [USE_MAINTAINER_MODE=$enableval], - [USE_MAINTAINER_MODE=]m4_if(am_maintainer_other, [enable], [no], [yes])) - AC_MSG_RESULT([$USE_MAINTAINER_MODE]) - AM_CONDITIONAL([MAINTAINER_MODE], [test $USE_MAINTAINER_MODE = yes]) - MAINT=$MAINTAINER_MODE_TRUE - AC_SUBST([MAINT])dnl -] -) - -AU_DEFUN([jm_MAINTAINER_MODE], [AM_MAINTAINER_MODE]) - -# Check to see how 'make' treats includes. -*- Autoconf -*- - -# Copyright (C) 2001, 2002, 2003, 2005, 2009 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 4 - -# AM_MAKE_INCLUDE() -# ----------------- -# Check to see how make treats includes. -AC_DEFUN([AM_MAKE_INCLUDE], -[am_make=${MAKE-make} -cat > confinc << 'END' -am__doit: - @echo this is the am__doit target -.PHONY: am__doit -END -# If we don't find an include directive, just comment out the code. -AC_MSG_CHECKING([for style of include used by $am_make]) -am__include="#" -am__quote= -_am_result=none -# First try GNU make style include. -echo "include confinc" > confmf -# Ignore all kinds of additional output from `make'. -case `$am_make -s -f confmf 2> /dev/null` in #( -*the\ am__doit\ target*) - am__include=include - am__quote= - _am_result=GNU - ;; -esac -# Now try BSD make style include. -if test "$am__include" = "#"; then - echo '.include "confinc"' > confmf - case `$am_make -s -f confmf 2> /dev/null` in #( - *the\ am__doit\ target*) - am__include=.include - am__quote="\"" - _am_result=BSD - ;; - esac -fi -AC_SUBST([am__include]) -AC_SUBST([am__quote]) -AC_MSG_RESULT([$_am_result]) -rm -f confinc confmf -]) - -# Copyright (C) 1999, 2000, 2001, 2003, 2004, 2005, 2008 -# Free Software Foundation, Inc. 
-# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 6 - -# AM_PROG_CC_C_O -# -------------- -# Like AC_PROG_CC_C_O, but changed for automake. -AC_DEFUN([AM_PROG_CC_C_O], -[AC_REQUIRE([AC_PROG_CC_C_O])dnl -AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -AC_REQUIRE_AUX_FILE([compile])dnl -# FIXME: we rely on the cache variable name because -# there is no other way. -set dummy $CC -am_cc=`echo $[2] | sed ['s/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/']` -eval am_t=\$ac_cv_prog_cc_${am_cc}_c_o -if test "$am_t" != yes; then - # Losing compiler, so override with the script. - # FIXME: It is wrong to rewrite CC. - # But if we don't then we get into trouble of one sort or another. - # A longer-term fix would be to have automake use am__CC in this case, - # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)" - CC="$am_aux_dir/compile $CC" -fi -dnl Make sure AC_PROG_CC is never called again, or it will override our -dnl setting of CC. -m4_define([AC_PROG_CC], - [m4_fatal([AC_PROG_CC cannot be called after AM_PROG_CC_C_O])]) -]) - -# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- - -# Copyright (C) 1997, 1999, 2000, 2001, 2003, 2004, 2005, 2008 -# Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 6 - -# AM_MISSING_PROG(NAME, PROGRAM) -# ------------------------------ -AC_DEFUN([AM_MISSING_PROG], -[AC_REQUIRE([AM_MISSING_HAS_RUN]) -$1=${$1-"${am_missing_run}$2"} -AC_SUBST($1)]) - - -# AM_MISSING_HAS_RUN -# ------------------ -# Define MISSING if not defined so far and test if it supports --run. -# If it does, set am_missing_run to use it, otherwise, to nothing. -AC_DEFUN([AM_MISSING_HAS_RUN], -[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -AC_REQUIRE_AUX_FILE([missing])dnl -if test x"${MISSING+set}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;; - *) - MISSING="\${SHELL} $am_aux_dir/missing" ;; - esac -fi -# Use eval to expand $SHELL -if eval "$MISSING --run true"; then - am_missing_run="$MISSING --run " -else - am_missing_run= - AC_MSG_WARN([`missing' script is too old or missing]) -fi -]) - -# Copyright (C) 2003, 2004, 2005, 2006, 2011 Free Software Foundation, -# Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 1 - -# AM_PROG_MKDIR_P -# --------------- -# Check for `mkdir -p'. -AC_DEFUN([AM_PROG_MKDIR_P], -[AC_PREREQ([2.60])dnl -AC_REQUIRE([AC_PROG_MKDIR_P])dnl -dnl Automake 1.8 to 1.9.6 used to define mkdir_p. We now use MKDIR_P, -dnl while keeping a definition of mkdir_p for backward compatibility. -dnl @MKDIR_P@ is magic: AC_OUTPUT adjusts its value for each Makefile. -dnl However we cannot define mkdir_p as $(MKDIR_P) for the sake of -dnl Makefile.ins that do not define MKDIR_P, so we do our own -dnl adjustment using top_builddir (which is defined more often than -dnl MKDIR_P). -AC_SUBST([mkdir_p], ["$MKDIR_P"])dnl -case $mkdir_p in - [[\\/$]]* | ?:[[\\/]]*) ;; - */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;; -esac -]) - -# Helper functions for option handling. 
-*- Autoconf -*- - -# Copyright (C) 2001, 2002, 2003, 2005, 2008, 2010 Free Software -# Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 5 - -# _AM_MANGLE_OPTION(NAME) -# ----------------------- -AC_DEFUN([_AM_MANGLE_OPTION], -[[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])]) - -# _AM_SET_OPTION(NAME) -# -------------------- -# Set option NAME. Presently that only means defining a flag for this option. -AC_DEFUN([_AM_SET_OPTION], -[m4_define(_AM_MANGLE_OPTION([$1]), 1)]) - -# _AM_SET_OPTIONS(OPTIONS) -# ------------------------ -# OPTIONS is a space-separated list of Automake options. -AC_DEFUN([_AM_SET_OPTIONS], -[m4_foreach_w([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])]) - -# _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET]) -# ------------------------------------------- -# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise. -AC_DEFUN([_AM_IF_OPTION], -[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])]) - -# Check to make sure that the build environment is sane. -*- Autoconf -*- - -# Copyright (C) 1996, 1997, 2000, 2001, 2003, 2005, 2008 -# Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 5 - -# AM_SANITY_CHECK -# --------------- -AC_DEFUN([AM_SANITY_CHECK], -[AC_MSG_CHECKING([whether build environment is sane]) -# Just in case -sleep 1 -echo timestamp > conftest.file -# Reject unsafe characters in $srcdir or the absolute working directory -# name. Accept space and tab only in the latter. -am_lf=' -' -case `pwd` in - *[[\\\"\#\$\&\'\`$am_lf]]*) - AC_MSG_ERROR([unsafe absolute working directory name]);; -esac -case $srcdir in - *[[\\\"\#\$\&\'\`$am_lf\ \ ]]*) - AC_MSG_ERROR([unsafe srcdir value: `$srcdir']);; -esac - -# Do `set' in a subshell so we don't clobber the current shell's -# arguments. Must try -L first in case configure is actually a -# symlink; some systems play weird games with the mod time of symlinks -# (eg FreeBSD returns the mod time of the symlink's containing -# directory). -if ( - set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null` - if test "$[*]" = "X"; then - # -L didn't work. - set X `ls -t "$srcdir/configure" conftest.file` - fi - rm -f conftest.file - if test "$[*]" != "X $srcdir/configure conftest.file" \ - && test "$[*]" != "X conftest.file $srcdir/configure"; then - - # If neither matched, then we have a broken ls. This can happen - # if, for instance, CONFIG_SHELL is bash and it inherits a - # broken ls alias from the environment. This has actually - # happened. Such a system could not be considered "sane". - AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken -alias in your environment]) - fi - - test "$[2]" = conftest.file - ) -then - # Ok. - : -else - AC_MSG_ERROR([newly created file is older than distributed files! -Check your system clock]) -fi -AC_MSG_RESULT(yes)]) - -# Copyright (C) 2001, 2003, 2005, 2011 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. 
- -# serial 1 - -# AM_PROG_INSTALL_STRIP -# --------------------- -# One issue with vendor `install' (even GNU) is that you can't -# specify the program used to strip binaries. This is especially -# annoying in cross-compiling environments, where the build's strip -# is unlikely to handle the host's binaries. -# Fortunately install-sh will honor a STRIPPROG variable, so we -# always use install-sh in `make install-strip', and initialize -# STRIPPROG with the value of the STRIP variable (set by the user). -AC_DEFUN([AM_PROG_INSTALL_STRIP], -[AC_REQUIRE([AM_PROG_INSTALL_SH])dnl -# Installed binaries are usually stripped using `strip' when the user -# run `make install-strip'. However `strip' might not be the right -# tool to use in cross-compilation environments, therefore Automake -# will honor the `STRIP' environment variable to overrule this program. -dnl Don't test for $cross_compiling = yes, because it might be `maybe'. -if test "$cross_compiling" != no; then - AC_CHECK_TOOL([STRIP], [strip], :) -fi -INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" -AC_SUBST([INSTALL_STRIP_PROGRAM])]) - -# Copyright (C) 2006, 2008, 2010 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 3 - -# _AM_SUBST_NOTMAKE(VARIABLE) -# --------------------------- -# Prevent Automake from outputting VARIABLE = @VARIABLE@ in Makefile.in. -# This macro is traced by Automake. -AC_DEFUN([_AM_SUBST_NOTMAKE]) - -# AM_SUBST_NOTMAKE(VARIABLE) -# -------------------------- -# Public sister of _AM_SUBST_NOTMAKE. -AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)]) - -# Check how to create a tarball. -*- Autoconf -*- - -# Copyright (C) 2004, 2005, 2012 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 2 - -# _AM_PROG_TAR(FORMAT) -# -------------------- -# Check how to create a tarball in format FORMAT. -# FORMAT should be one of `v7', `ustar', or `pax'. -# -# Substitute a variable $(am__tar) that is a command -# writing to stdout a FORMAT-tarball containing the directory -# $tardir. -# tardir=directory && $(am__tar) > result.tar -# -# Substitute a variable $(am__untar) that extract such -# a tarball read from stdin. -# $(am__untar) < result.tar -AC_DEFUN([_AM_PROG_TAR], -[# Always define AMTAR for backward compatibility. Yes, it's still used -# in the wild :-( We should find a proper way to deprecate it ... -AC_SUBST([AMTAR], ['$${TAR-tar}']) -m4_if([$1], [v7], - [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'], - [m4_case([$1], [ustar],, [pax],, - [m4_fatal([Unknown tar format])]) -AC_MSG_CHECKING([how to create a $1 tar archive]) -# Loop over all known methods to create a tar archive until one works. -_am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none' -_am_tools=${am_cv_prog_tar_$1-$_am_tools} -# Do not fold the above two line into one, because Tru64 sh and -# Solaris sh will not grok spaces in the rhs of `-'. 
-for _am_tool in $_am_tools -do - case $_am_tool in - gnutar) - for _am_tar in tar gnutar gtar; - do - AM_RUN_LOG([$_am_tar --version]) && break - done - am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"' - am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"' - am__untar="$_am_tar -xf -" - ;; - plaintar) - # Must skip GNU tar: if it does not support --format= it doesn't create - # ustar tarball either. - (tar --version) >/dev/null 2>&1 && continue - am__tar='tar chf - "$$tardir"' - am__tar_='tar chf - "$tardir"' - am__untar='tar xf -' - ;; - pax) - am__tar='pax -L -x $1 -w "$$tardir"' - am__tar_='pax -L -x $1 -w "$tardir"' - am__untar='pax -r' - ;; - cpio) - am__tar='find "$$tardir" -print | cpio -o -H $1 -L' - am__tar_='find "$tardir" -print | cpio -o -H $1 -L' - am__untar='cpio -i -H $1 -d' - ;; - none) - am__tar=false - am__tar_=false - am__untar=false - ;; - esac - - # If the value was cached, stop now. We just wanted to have am__tar - # and am__untar set. - test -n "${am_cv_prog_tar_$1}" && break - - # tar/untar a dummy directory, and stop if the command works - rm -rf conftest.dir - mkdir conftest.dir - echo GrepMe > conftest.dir/file - AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar]) - rm -rf conftest.dir - if test -s conftest.tar; then - AM_RUN_LOG([$am__untar /dev/null 2>&1 && break - fi -done -rm -rf conftest.dir - -AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool]) -AC_MSG_RESULT([$am_cv_prog_tar_$1])]) -AC_SUBST([am__tar]) -AC_SUBST([am__untar]) -]) # _AM_PROG_TAR - diff --git a/algos.h b/algos.h new file mode 100644 index 0000000000..e33d182c4d --- /dev/null +++ b/algos.h @@ -0,0 +1,252 @@ +#ifndef ALGOS_H +#define ALGOS_H + +#include +#include "compat.h" + +enum sha_algos { + ALGO_BLAKECOIN = 0, + ALGO_BLAKE, + ALGO_BLAKE2B, + ALGO_BLAKE2S, + ALGO_ALLIUM, + ALGO_BMW, + ALGO_BASTION, + ALGO_C11, + ALGO_CRYPTOLIGHT, + ALGO_CRYPTONIGHT, + ALGO_DEEP, + ALGO_DECRED, + ALGO_DMD_GR, + ALGO_EQUIHASH, + ALGO_EXOSIS, + ALGO_FRESH, + ALGO_FUGUE256, /* Fugue256 */ + ALGO_GROESTL, + ALGO_HEAVY, /* Heavycoin hash */ + ALGO_HMQ1725, + ALGO_HSR, + ALGO_KECCAK, + ALGO_KECCAKC, /* refreshed Keccak with pool factor 256 */ + ALGO_JACKPOT, + ALGO_JHA, + ALGO_LBRY, + ALGO_LUFFA, + ALGO_LYRA2, + ALGO_LYRA2v2, + ALGO_LYRA2v3, + ALGO_LYRA2Z, + ALGO_MJOLLNIR, /* Hefty hash */ + ALGO_MYR_GR, + ALGO_NEOSCRYPT, + ALGO_NIST5, + ALGO_PENTABLAKE, + ALGO_PHI, + ALGO_PHI2, + ALGO_POLYTIMOS, + ALGO_QUARK, + ALGO_QUBIT, + ALGO_SCRYPT, + ALGO_SCRYPT_JANE, + ALGO_SHA256D, + ALGO_SHA256T, + ALGO_SHA256Q, + ALGO_SIA, + ALGO_SIB, + ALGO_SKEIN, + ALGO_SKEIN2, + ALGO_SKUNK, + ALGO_SONOA, + ALGO_S3, + ALGO_TIMETRAVEL, + ALGO_TRIBUS, + ALGO_BITCORE, + ALGO_X11EVO, + ALGO_X11, + ALGO_X12, + ALGO_X13, + ALGO_X14, + ALGO_X15, + ALGO_X16R, + ALGO_X16S, + ALGO_X17, + ALGO_VANILLA, + ALGO_VELTOR, + ALGO_WHIRLCOIN, + ALGO_WHIRLPOOL, + ALGO_WHIRLPOOLX, + ALGO_WILDKECCAK, + ALGO_ZR5, + ALGO_MONERO, + ALGO_GRAFT, + ALGO_STELLITE, + ALGO_AUTO, + ALGO_COUNT +}; + +extern volatile enum sha_algos opt_algo; + +static const char *algo_names[] = { + "blakecoin", + "blake", + "blake2b", + "blake2s", + "allium", + "bmw", + "bastion", + "c11", + "cryptolight", + "cryptonight", + "deep", + "decred", + "dmd-gr", + "equihash", + "exosis", + "fresh", + "fugue256", + "groestl", + "heavy", + "hmq1725", + "hsr", + "keccak", + "keccakc", + "jackpot", + "jha", + "lbry", + "luffa", + "lyra2", + "lyra2v2", + "lyra2v3", + "lyra2z", + "mjollnir", + "myr-gr", + 
"neoscrypt", + "nist5", + "penta", + "phi", + "phi2", + "polytimos", + "quark", + "qubit", + "scrypt", + "scrypt-jane", + "sha256d", + "sha256t", + "sha256q", + "sia", + "sib", + "skein", + "skein2", + "skunk", + "sonoa", + "s3", + "timetravel", + "tribus", + "bitcore", + "x11evo", + "x11", + "x12", + "x13", + "x14", + "x15", + "x16r", + "x16s", + "x17", + "vanilla", + "veltor", + "whirlcoin", + "whirlpool", + "whirlpoolx", + "wildkeccak", + "zr5", + "monero", + "graft", + "stellite", + "auto", /* reserved for multi algo */ + "" +}; + +// string to int/enum +static inline int algo_to_int(char* arg) +{ + int i; + + for (i = 0; i < ALGO_COUNT; i++) { + if (algo_names[i] && !strcasecmp(arg, algo_names[i])) { + return i; + } + } + + if (i == ALGO_COUNT) { + // some aliases... + if (!strcasecmp("all", arg)) + i = ALGO_AUTO; + else if (!strcasecmp("cryptonight-light", arg)) + i = ALGO_CRYPTOLIGHT; + else if (!strcasecmp("cryptonight-lite", arg)) + i = ALGO_CRYPTOLIGHT; + else if (!strcasecmp("flax", arg)) + i = ALGO_C11; + else if (!strcasecmp("diamond", arg)) + i = ALGO_DMD_GR; + else if (!strcasecmp("equi", arg)) + i = ALGO_EQUIHASH; + else if (!strcasecmp("doom", arg)) + i = ALGO_LUFFA; + else if (!strcasecmp("hmq17", arg)) + i = ALGO_HMQ1725; + else if (!strcasecmp("hshare", arg)) + i = ALGO_HSR; + else if (!strcasecmp("lyra2re", arg)) + i = ALGO_LYRA2; + else if (!strcasecmp("lyra2rev2", arg)) + i = ALGO_LYRA2v2; + else if (!strcasecmp("lyra2rev3", arg)) + i = ALGO_LYRA2v3; + else if (!strcasecmp("phi1612", arg)) + i = ALGO_PHI; + else if (!strcasecmp("bitcoin", arg)) + i = ALGO_SHA256D; + else if (!strcasecmp("sha256", arg)) + i = ALGO_SHA256D; + else if (!strcasecmp("thorsriddle", arg)) + i = ALGO_VELTOR; + else if (!strcasecmp("timetravel10", arg)) + i = ALGO_BITCORE; + else if (!strcasecmp("whirl", arg)) + i = ALGO_WHIRLPOOL; + else if (!strcasecmp("ziftr", arg)) + i = ALGO_ZR5; + else + i = -1; + } + + return i; +} + +static inline int get_cryptonight_algo(int fork) +{ + int algo = ALGO_COUNT; + + switch (fork) { + case 8: + algo = ALGO_GRAFT; + break; + + case 7: + algo = ALGO_MONERO; + break; + + case 3: + algo = ALGO_STELLITE; + break; + + default: + algo = ALGO_CRYPTONIGHT; + break; + } + + return algo; +} + +#endif diff --git a/api.cpp b/api.cpp new file mode 100644 index 0000000000..6edfd3109f --- /dev/null +++ b/api.cpp @@ -0,0 +1,1353 @@ +/* + * Copyright 2014 ccminer team + * + * Implementation by tpruvot (based on cgminer) + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. 
+ */ +#define APIVERSION "1.9" + +#ifdef WIN32 +# define _WINSOCK_DEPRECATED_NO_WARNINGS +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "miner.h" +#include "nvml.h" +#include "algos.h" + +#ifndef WIN32 +# include +# include +# include +# include +# include +# define SOCKETTYPE long +# define SOCKETFAIL(a) ((a) < 0) +# define INVSOCK -1 /* INVALID_SOCKET */ +# define INVINETADDR -1 /* INADDR_NONE */ +# define CLOSESOCKET close +# define SOCKETINIT {} +# define SOCKERRMSG strerror(errno) +#else +# define SOCKETTYPE SOCKET +# define SOCKETFAIL(a) ((a) == SOCKET_ERROR) +# define INVSOCK INVALID_SOCKET +# define INVINETADDR INADDR_NONE +# define CLOSESOCKET closesocket +# define in_addr_t uint32_t +#endif + +#define GROUP(g) (toupper(g)) +#define PRIVGROUP GROUP('W') +#define NOPRIVGROUP GROUP('R') +#define ISPRIVGROUP(g) (GROUP(g) == PRIVGROUP) +#define GROUPOFFSET(g) (GROUP(g) - GROUP('A')) +#define VALIDGROUP(g) (GROUP(g) >= GROUP('A') && GROUP(g) <= GROUP('Z')) +#define COMMANDS(g) (apigroups[GROUPOFFSET(g)].commands) +#define DEFINEDGROUP(g) (ISPRIVGROUP(g) || COMMANDS(g) != NULL) +struct APIGROUPS { + // This becomes a string like: "|cmd1|cmd2|cmd3|" so it's quick to search + char *commands; +} apigroups['Z' - 'A' + 1]; // only A=0 to Z=25 (R: noprivs, W: allprivs) + +struct IP4ACCESS { + in_addr_t ip; + in_addr_t mask; + char group; +}; + +static int ips = 1; +static struct IP4ACCESS *ipaccess = NULL; + +#define MYBUFSIZ 16384 +#define SOCK_REC_BUFSZ 1024 +#define QUEUE 10 + +#define ALLIP4 "0.0.0.0" +static const char *localaddr = "127.0.0.1"; +static const char *UNAVAILABLE = " - API will not be available"; +static const char *MUNAVAILABLE = " - API multicast listener will not be available"; +static char *buffer = NULL; +static time_t startup = 0; +static int bye = 0; + +extern char *opt_api_bind; +extern int opt_api_port; +extern char *opt_api_allow; +extern char *opt_api_groups; +extern bool opt_api_mcast; +extern char *opt_api_mcast_addr; +extern char *opt_api_mcast_code; +extern char *opt_api_mcast_des; +extern int opt_api_mcast_port; + +// current stratum... 
+extern struct stratum_ctx stratum; + +// sysinfos.cpp +extern int num_cpus; +extern float cpu_temp(int); +extern uint32_t cpu_clock(int); + +char driver_version[32] = { 0 }; + +/***************************************************************/ + +static void gpustatus(int thr_id) +{ + struct pool_infos *p = &pools[cur_pooln]; + + if (thr_id >= 0 && thr_id < opt_n_threads) { + struct cgpu_info *cgpu = &thr_info[thr_id].gpu; + double khashes_per_watt = 0; + int gpuid = cgpu->gpu_id; + char buf[512]; *buf = '\0'; + char* card; + + cuda_gpu_info(cgpu); + cgpu->gpu_plimit = device_plimit[cgpu->gpu_id]; + +#ifdef USE_WRAPNVML + cgpu->has_monitoring = true; + cgpu->gpu_bus = gpu_busid(cgpu); + cgpu->gpu_temp = gpu_temp(cgpu); + cgpu->gpu_fan = (uint16_t) gpu_fanpercent(cgpu); + cgpu->gpu_fan_rpm = (uint16_t) gpu_fanrpm(cgpu); + cgpu->gpu_power = gpu_power(cgpu); // mWatts + cgpu->gpu_plimit = gpu_plimit(cgpu); // mW or % +#endif + cgpu->khashes = stats_get_speed(thr_id, 0.0) / 1000.0; + if (cgpu->monitor.gpu_power) { + cgpu->gpu_power = cgpu->monitor.gpu_power; + khashes_per_watt = (double)cgpu->khashes / cgpu->monitor.gpu_power; + khashes_per_watt *= 1000; // power in mW + //gpulog(LOG_BLUE, thr_id, "KHW: %g", khashes_per_watt); + } + + card = device_name[gpuid]; + + snprintf(buf, sizeof(buf), "GPU=%d;BUS=%hd;CARD=%s;TEMP=%.1f;" + "POWER=%u;FAN=%hu;RPM=%hu;" + "FREQ=%u;MEMFREQ=%u;GPUF=%u;MEMF=%u;" + "KHS=%.2f;KHW=%.5f;PLIM=%u;" + "ACC=%u;REJ=%u;HWF=%u;I=%.1f;THR=%u|", + gpuid, cgpu->gpu_bus, card, cgpu->gpu_temp, + cgpu->gpu_power, cgpu->gpu_fan, cgpu->gpu_fan_rpm, + cgpu->gpu_clock/1000, cgpu->gpu_memclock/1000, // base freqs in MHz + cgpu->monitor.gpu_clock, cgpu->monitor.gpu_memclock, // current + cgpu->khashes, khashes_per_watt, cgpu->gpu_plimit, + cgpu->accepted, (unsigned) cgpu->rejected, (unsigned) cgpu->hw_errors, + cgpu->intensity, cgpu->throughput); + + // append to buffer for multi gpus + strcat(buffer, buf); + } +} + +/** +* Returns gpu/thread specific stats +*/ +static char *getthreads(char *params) +{ + *buffer = '\0'; + for (int i = 0; i < opt_n_threads; i++) + gpustatus(i); + return buffer; +} + +/*****************************************************************************/ + +/** +* Returns miner global infos +*/ +static char *getsummary(char *params) +{ + char algo[64] = { 0 }; + time_t ts = time(NULL); + double accps, uptime = difftime(ts, startup); + uint32_t wait_time = 0, solved_count = 0; + uint32_t accepted_count = 0, rejected_count = 0; + for (int p = 0; p < num_pools; p++) { + wait_time += pools[p].wait_time; + accepted_count += pools[p].accepted_count; + rejected_count += pools[p].rejected_count; + solved_count += pools[p].solved_count; + } + accps = (60.0 * accepted_count) / (uptime ? uptime : 1.0); + + get_currentalgo(algo, sizeof(algo)); + + *buffer = '\0'; + sprintf(buffer, "NAME=%s;VER=%s;API=%s;" + "ALGO=%s;GPUS=%d;KHS=%.2f;SOLV=%d;ACC=%d;REJ=%d;" + "ACCMN=%.3f;DIFF=%.6f;NETKHS=%.0f;" + "POOLS=%u;WAIT=%u;UPTIME=%.0f;TS=%u|", + PACKAGE_NAME, PACKAGE_VERSION, APIVERSION, + algo, active_gpus, (double)global_hashrate / 1000., + solved_count, accepted_count, rejected_count, + accps, net_diff > 1e-6 ? net_diff : stratum_diff, (double)net_hashrate / 1000., + num_pools, wait_time, uptime, (uint32_t) ts); + return buffer; +} + +/** + * Returns some infos about current pool + */ +static char *getpoolnfo(char *params) +{ + char *s = buffer; + char jobid[128] = { 0 }; + char extra[96] = { 0 }; + int pooln = params ? 
atoi(params) % num_pools : cur_pooln; + struct pool_infos *p = &pools[pooln]; + uint32_t last_share = 0; + if (p->last_share_time) + last_share = (uint32_t) (time(NULL) - p->last_share_time); + + *s = '\0'; + + if (stratum.job.job_id) + strncpy(jobid, stratum.job.job_id, sizeof(stratum.job.job_id)); + if (stratum.job.xnonce2) { + /* used temporary to be sure all is ok */ + sprintf(extra, "0x"); + if (p->algo == ALGO_DECRED) { + char compat[32] = { 0 }; + cbin2hex(&extra[2], (const char*) stratum.xnonce1, min(36, stratum.xnonce2_size)); + cbin2hex(compat, (const char*) stratum.job.xnonce2, 4); + memcpy(&extra[2], compat, 8); // compat extranonce + } else { + cbin2hex(&extra[2], (const char*) stratum.job.xnonce2, stratum.xnonce2_size); + } + } + + snprintf(s, MYBUFSIZ, "POOL=%s;ALGO=%s;URL=%s;USER=%s;SOLV=%d;ACC=%d;REJ=%d;STALE=%u;H=%u;JOB=%s;DIFF=%.6f;" + "BEST=%.6f;N2SZ=%d;N2=%s;PING=%u;DISCO=%u;WAIT=%u;UPTIME=%u;LAST=%u|", + strlen(p->name) ? p->name : p->short_url, algo_names[p->algo], + p->url, p->type & POOL_STRATUM ? p->user : "", + p->solved_count, p->accepted_count, p->rejected_count, p->stales_count, + stratum.job.height, jobid, stratum_diff, p->best_share, + (int) stratum.xnonce2_size, extra, stratum.answer_msec, + p->disconnects, p->wait_time, p->work_time, last_share); + + return s; +} + +/*****************************************************************************/ + +static void gpuhwinfos(int gpu_id) +{ + char buf[512]; + char pstate[8]; + char* card; + struct cgpu_info *cgpu = NULL; + + for (int g = 0; g < opt_n_threads; g++) { + if (device_map[g] == gpu_id) { + cgpu = &thr_info[g].gpu; + break; + } + } + + if (cgpu == NULL) + return; + + cuda_gpu_info(cgpu); + cgpu->gpu_plimit = device_plimit[cgpu->gpu_id]; + +#ifdef USE_WRAPNVML + cgpu->has_monitoring = true; + cgpu->gpu_bus = gpu_busid(cgpu); + cgpu->gpu_temp = gpu_temp(cgpu); + cgpu->gpu_fan = (uint16_t) gpu_fanpercent(cgpu); + cgpu->gpu_fan_rpm = (uint16_t) gpu_fanrpm(cgpu); + cgpu->gpu_pstate = (int16_t) gpu_pstate(cgpu); + cgpu->gpu_power = gpu_power(cgpu); + cgpu->gpu_plimit = gpu_plimit(cgpu); + gpu_info(cgpu); +#ifdef WIN32 + if (opt_debug) nvapi_pstateinfo(cgpu->gpu_id); +#endif +#endif + + memset(pstate, 0, sizeof(pstate)); + if (cgpu->gpu_pstate != -1) + snprintf(pstate, sizeof(pstate), "P%d", (int) cgpu->gpu_pstate); + + card = device_name[gpu_id]; + + snprintf(buf, sizeof(buf), "GPU=%d;BUS=%hd;CARD=%s;SM=%hu;MEM=%u;" + "TEMP=%.1f;FAN=%hu;RPM=%hu;FREQ=%u;MEMFREQ=%u;GPUF=%u;MEMF=%u;" + "PST=%s;POWER=%u;PLIM=%u;" + "VID=%hx;PID=%hx;NVML=%d;NVAPI=%d;SN=%s;BIOS=%s|", + gpu_id, cgpu->gpu_bus, card, cgpu->gpu_arch, (uint32_t) cgpu->gpu_mem, + cgpu->gpu_temp, cgpu->gpu_fan, cgpu->gpu_fan_rpm, + cgpu->gpu_clock/1000U, cgpu->gpu_memclock/1000U, // base clocks + cgpu->monitor.gpu_clock, cgpu->monitor.gpu_memclock, // current + pstate, cgpu->gpu_power, cgpu->gpu_plimit, + cgpu->gpu_vid, cgpu->gpu_pid, cgpu->nvml_id, cgpu->nvapi_id, + cgpu->gpu_sn, cgpu->gpu_desc); + + strcat(buffer, buf); +} + +#ifndef WIN32 +static char os_version[64] = "linux "; +#endif + +static const char* os_name() +{ +#ifdef WIN32 + return "windows"; +#else + FILE *fd = fopen("/proc/version", "r"); + if (!fd) + return "linux"; + if (!fscanf(fd, "Linux version %48s", &os_version[6])) { + fclose(fd); + return "linux"; + } + fclose(fd); + os_version[48] = '\0'; + return (const char*) os_version; +#endif +} + +/** + * System and CPU Infos + */ +static void syshwinfos() +{ + char buf[256]; + + int cputc = (int) cpu_temp(0); + uint32_t cpuclk = 
cpu_clock(0); + + memset(buf, 0, sizeof(buf)); + snprintf(buf, sizeof(buf), "OS=%s;NVDRIVER=%s;CPUS=%d;CPUTEMP=%d;CPUFREQ=%d|", + os_name(), driver_version, num_cpus, cputc, cpuclk/1000); + strcat(buffer, buf); +} + +/** + * Returns gpu and system (todo) informations + */ +static char *gethwinfos(char *params) +{ + *buffer = '\0'; + for (int i = 0; i < cuda_num_devices(); i++) + gpuhwinfos(i); + syshwinfos(); + return buffer; +} + +/*****************************************************************************/ + +/** + * Returns the last 50 scans stats + * optional param thread id (default all) + */ +static char *gethistory(char *params) +{ + struct stats_data data[50]; + int thrid = params ? atoi(params) : -1; + char *p = buffer; + int records = stats_get_history(thrid, data, ARRAY_SIZE(data)); + *buffer = '\0'; + for (int i = 0; i < records; i++) { + time_t ts = data[i].tm_stat; + p += sprintf(p, "GPU=%d;H=%u;KHS=%.2f;DIFF=%g;" + "COUNT=%u;FOUND=%u;ID=%u;TS=%u|", + data[i].gpu_id, data[i].height, data[i].hashrate, data[i].difficulty, + data[i].hashcount, data[i].hashfound, data[i].uid, (uint32_t)ts); + } + return buffer; +} + +/** + * Returns the job scans ranges (debug purpose, only with -D) + */ +static char *getscanlog(char *params) +{ + struct hashlog_data data[50]; + char *p = buffer; + int records = hashlog_get_history(data, ARRAY_SIZE(data)); + *buffer = '\0'; + for (int i = 0; i < records; i++) { + time_t ts = data[i].tm_upd; + p += sprintf(p, "H=%u;P=%u;JOB=%u;ID=%d;DIFF=%g;" + "N=0x%x;FROM=0x%x;SCANTO=0x%x;" + "COUNT=0x%x;FOUND=%u;TS=%u|", + data[i].height, data[i].npool, data[i].njobid, (int)data[i].job_nonce_id, data[i].sharediff, + data[i].nonce, data[i].scanned_from, data[i].scanned_to, + (data[i].scanned_to - data[i].scanned_from), data[i].tm_sent ? 1 : 0, (uint32_t)ts); + } + return buffer; +} + +/** + * Some debug infos about memory usage + */ +static char *getmeminfo(char *params) +{ + uint64_t smem, hmem, totmem; + uint32_t srec, hrec; + + stats_getmeminfo(&smem, &srec); + hashlog_getmeminfo(&hmem, &hrec); + totmem = smem + hmem; + + *buffer = '\0'; + sprintf(buffer, "STATS=%u;HASHLOG=%u;MEM=%lu|", + srec, hrec, totmem); + + return buffer; +} + +/*****************************************************************************/ + +/** + * Set pool by index (pools array in json config) + * switchpool|1| + */ +static char *remote_switchpool(char *params) +{ + bool ret = false; + *buffer = '\0'; + if (!params || strlen(params) == 0) { + // rotate pool test + ret = pool_switch_next(-1); + } else { + int n = atoi(params); + if (n == cur_pooln) + ret = true; + else if (n < num_pools) + ret = pool_switch(-1, n); + } + sprintf(buffer, "%s|", ret ? "ok" : "fail"); + return buffer; +} + +/** + * Change pool url (see --url parameter) + * seturl|stratum+tcp://:@mine.xpool.ca:1131| + */ +static char *remote_seturl(char *params) +{ + bool ret; + *buffer = '\0'; + if (!params || strlen(params) == 0) { + // rotate pool test + ret = pool_switch_next(-1); + } else { + ret = pool_switch_url(params); + } + sprintf(buffer, "%s|", ret ? 
"ok" : "fail"); + return buffer; +} + +/** + * Ask the miner to quit + */ +static char *remote_quit(char *params) +{ + *buffer = '\0'; + bye = 1; + sprintf(buffer, "%s", "bye|"); + return buffer; +} + +/*****************************************************************************/ + +static char *gethelp(char *params); +struct CMDS { + const char *name; + char *(*func)(char *); + bool iswritemode; +} cmds[] = { + { "summary", getsummary, false }, + { "threads", getthreads, false }, + { "pool", getpoolnfo, false }, + { "histo", gethistory, false }, + { "hwinfo", gethwinfos, false }, + { "meminfo", getmeminfo, false }, + { "scanlog", getscanlog, false }, + + /* remote functions */ + { "seturl", remote_seturl, true }, /* prefer switchpool, deprecated */ + { "switchpool", remote_switchpool, true }, + { "quit", remote_quit, true }, + + /* keep it the last */ + { "help", gethelp, false }, +}; +#define CMDMAX ARRAY_SIZE(cmds) + +static char *gethelp(char *params) +{ + *buffer = '\0'; + char * p = buffer; + for (int i = 0; i < CMDMAX-1; i++) { + bool displayed = !cmds[i].iswritemode || opt_api_allow; + if (displayed) p += sprintf(p, "%s\n", cmds[i].name); + } + sprintf(p, "|"); + return buffer; +} + +/*****************************************************************************/ + +static int send_result(SOCKETTYPE c, char *result) +{ + int n; + if (!result) { + n = send(c, "", 1, 0); + } else { + // ignore failure - it's closed immediately anyway + n = send(c, result, (int) strlen(result) + 1, 0); + } + return n; +} + +/* ---- Base64 Encoding/Decoding Table --- */ +static const char table64[]= + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +static size_t base64_encode(const uchar *indata, size_t insize, char *outptr, size_t outlen) +{ + uchar ibuf[3]; + uchar obuf[4]; + int i, inputparts, inlen = (int) insize; + size_t len = 0; + char *output, *outbuf; + + memset(outptr, 0, outlen); + + outbuf = output = (char*)calloc(1, inlen * 4 / 3 + 4); + if (outbuf == NULL) { + return -1; + } + + while (inlen > 0) { + for (i = inputparts = 0; i < 3; i++) { + if (inlen > 0) { + inputparts++; + ibuf[i] = (uchar) *indata; + indata++; inlen--; + } + else + ibuf[i] = 0; + } + + obuf[0] = (uchar) ((ibuf[0] & 0xFC) >> 2); + obuf[1] = (uchar) (((ibuf[0] & 0x03) << 4) | ((ibuf[1] & 0xF0) >> 4)); + obuf[2] = (uchar) (((ibuf[1] & 0x0F) << 2) | ((ibuf[2] & 0xC0) >> 6)); + obuf[3] = (uchar) (ibuf[2] & 0x3F); + + switch(inputparts) { + case 1: /* only one byte read */ + snprintf(output, 5, "%c%c==", + table64[obuf[0]], + table64[obuf[1]]); + break; + case 2: /* two bytes read */ + snprintf(output, 5, "%c%c%c=", + table64[obuf[0]], + table64[obuf[1]], + table64[obuf[2]]); + break; + default: + snprintf(output, 5, "%c%c%c%c", + table64[obuf[0]], + table64[obuf[1]], + table64[obuf[2]], + table64[obuf[3]] ); + break; + } + if ((len+4) > outlen) + break; + output += 4; len += 4; + } + len = snprintf(outptr, len, "%s", outbuf); + // todo: seems to be missing on linux + if (strlen(outptr) == 27) + strcat(outptr, "="); + free(outbuf); + + return len; +} + +#include "compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h" + +/* websocket handshake (tested in Chrome) */ +static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey) +{ + char answer[256]; + char inpkey[128] = { 0 }; + char seckey[64]; + uchar sha1[20]; + SHA_CTX ctx; + + if (opt_protocol) + applog(LOG_DEBUG, "clientkey: %s", clientkey); + + sprintf(inpkey, "%s258EAFA5-E914-47DA-95CA-C5AB0DC85B11", clientkey); + + // SHA-1 
test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo=" + //sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11"); + + SHA1_Init(&ctx); + SHA1_Update(&ctx, inpkey, strlen(inpkey)); + SHA1_Final(sha1, &ctx); + + base64_encode(sha1, 20, seckey, sizeof(seckey)); + + sprintf(answer, + "HTTP/1.1 101 Switching Protocol\r\n" + "Upgrade: WebSocket\r\nConnection: Upgrade\r\n" + "Sec-WebSocket-Accept: %s\r\n" + "Sec-WebSocket-Protocol: text\r\n" + "\r\n", seckey); + + // data result as tcp frame + + uchar hd[10] = { 0 }; + hd[0] = 129; // 0x1 text frame (FIN + opcode) + uint64_t datalen = (uint64_t) strlen(result); + uint8_t frames = 2; + if (datalen <= 125) { + hd[1] = (uchar) (datalen); + } else if (datalen <= 65535) { + hd[1] = (uchar) 126; + hd[2] = (uchar) (datalen >> 8); + hd[3] = (uchar) (datalen); + frames = 4; + } else { + hd[1] = (uchar) 127; + hd[2] = (uchar) (datalen >> 56); + hd[3] = (uchar) (datalen >> 48); + hd[4] = (uchar) (datalen >> 40); + hd[5] = (uchar) (datalen >> 32); + hd[6] = (uchar) (datalen >> 24); + hd[7] = (uchar) (datalen >> 16); + hd[8] = (uchar) (datalen >> 8); + hd[9] = (uchar) (datalen); + frames = 10; + } + + size_t handlen = strlen(answer); + uchar *data = (uchar*) calloc(1, handlen + frames + (size_t) datalen + 1); + if (data == NULL) + return -1; + else { + uchar *p = data; + // HTTP header 101 + memcpy(p, answer, handlen); + p += handlen; + // WebSocket Frame - Header + Data + memcpy(p, hd, frames); + memcpy(p + frames, result, (size_t)datalen); + send(c, (const char*)data, (int) (strlen(answer) + frames + datalen + 1), 0); + free(data); + } + return 0; +} + +/* + * Interpret --api-groups G:cmd1:cmd2:cmd3,P:cmd4,*,... + */ +static void setup_groups() +{ + const char *api_groups = opt_api_groups ? 
opt_api_groups : ""; + char *buf, *cmd, *ptr, *next, *colon; + char commands[512] = { 0 }; + char cmdbuf[128] = { 0 }; + char group; + bool addstar; + int i; + + buf = (char *)calloc(1, strlen(api_groups) + 1); + if (unlikely(!buf)) + proper_exit(1); //, "Failed to malloc ipgroups buf"); + + strcpy(buf, api_groups); + + next = buf; + // for each group defined + while (next && *next) { + ptr = next; + next = strchr(ptr, ','); + if (next) + *(next++) = '\0'; + + // Validate the group + if (*(ptr+1) != ':') { + colon = strchr(ptr, ':'); + if (colon) + *colon = '\0'; + proper_exit(1); //, "API invalid group name '%s'", ptr); + } + + group = GROUP(*ptr); + if (!VALIDGROUP(group)) + proper_exit(1); //, "API invalid group name '%c'", *ptr); + + if (group == PRIVGROUP) + proper_exit(1); //, "API group name can't be '%c'", PRIVGROUP); + + if (group == NOPRIVGROUP) + proper_exit(1); //, "API group name can't be '%c'", NOPRIVGROUP); + + if (apigroups[GROUPOFFSET(group)].commands != NULL) + proper_exit(1); //, "API duplicate group name '%c'", *ptr); + + ptr += 2; + + // Validate the command list (and handle '*') + cmd = &(commands[0]); + *(cmd++) = '|'; + *cmd = '\0'; + addstar = false; + while (ptr && *ptr) { + colon = strchr(ptr, ':'); + if (colon) + *(colon++) = '\0'; + + if (strcmp(ptr, "*") == 0) + addstar = true; + else { + bool did = false; + for (i = 0; i < CMDMAX-1; i++) { + if (strcasecmp(ptr, cmds[i].name) == 0) { + did = true; + break; + } + } + if (did) { + // skip duplicates + sprintf(cmdbuf, "|%s|", cmds[i].name); + if (strstr(commands, cmdbuf) == NULL) { + strcpy(cmd, cmds[i].name); + cmd += strlen(cmds[i].name); + *(cmd++) = '|'; + *cmd = '\0'; + } + } else { + proper_exit(1); //, "API unknown command '%s' in group '%c'", ptr, group); + } + } + + ptr = colon; + } + + // * = allow all non-iswritemode commands + if (addstar) { + for (i = 0; i < CMDMAX-1; i++) { + if (cmds[i].iswritemode == false) { + // skip duplicates + sprintf(cmdbuf, "|%s|", cmds[i].name); + if (strstr(commands, cmdbuf) == NULL) { + strcpy(cmd, cmds[i].name); + cmd += strlen(cmds[i].name); + *(cmd++) = '|'; + *cmd = '\0'; + } + } + } + } + + ptr = apigroups[GROUPOFFSET(group)].commands = (char *)calloc(1, strlen(commands) + 1); + if (unlikely(!ptr)) + proper_exit(1); //, "Failed to malloc group commands buf"); + + strcpy(ptr, commands); + } + + // Now define R (NOPRIVGROUP) as all non-iswritemode commands + cmd = &(commands[0]); + *(cmd++) = '|'; + *cmd = '\0'; + for (i = 0; i < CMDMAX-1; i++) { + if (cmds[i].iswritemode == false) { + strcpy(cmd, cmds[i].name); + cmd += strlen(cmds[i].name); + *(cmd++) = '|'; + *cmd = '\0'; + } + } + + ptr = apigroups[GROUPOFFSET(NOPRIVGROUP)].commands = (char *)calloc(1, strlen(commands) + 1); + if (unlikely(!ptr)) + proper_exit(1); //, "Failed to malloc noprivgroup commands buf"); + + strcpy(ptr, commands); + + // W (PRIVGROUP) is handled as a special case since it simply means all commands + + free(buf); + return; +} + +/* + * Interpret [W:]IP[/Prefix][,[R|W:]IP2[/Prefix2][,...]] --api-allow option + * special case of 0/0 allows /0 (means all IP addresses) + */ +#define ALLIPS "0/0" +/* + * N.B. 
IP4 addresses are by Definition 32bit big endian on all platforms + */ +static void setup_ipaccess() +{ + char *buf, *ptr, *comma, *slash, *dot; + int ipcount, mask, octet, i; + char group; + + buf = (char*) calloc(1, strlen(opt_api_allow) + 1); + if (unlikely(!buf)) + proper_exit(1);//, "Failed to malloc ipaccess buf"); + + strcpy(buf, opt_api_allow); + ipcount = 1; + ptr = buf; + while (*ptr) if (*(ptr++) == ',') + ipcount++; + + // possibly more than needed, but never less + ipaccess = (struct IP4ACCESS *) calloc(ipcount, sizeof(struct IP4ACCESS)); + if (unlikely(!ipaccess)) + proper_exit(1);//, "Failed to calloc ipaccess"); + + ips = 0; + ptr = buf; + while (ptr && *ptr) { + while (*ptr == ' ' || *ptr == '\t') + ptr++; + + if (*ptr == ',') { + ptr++; + continue; + } + + comma = strchr(ptr, ','); + if (comma) + *(comma++) = '\0'; + + group = NOPRIVGROUP; + + if (isalpha(*ptr) && *(ptr+1) == ':') { + if (DEFINEDGROUP(*ptr)) + group = GROUP(*ptr); + ptr += 2; + } + + ipaccess[ips].group = group; + + if (strcmp(ptr, ALLIPS) == 0 || strcmp(ptr, ALLIP4) == 0) + ipaccess[ips].ip = ipaccess[ips].mask = 0; + else + { + slash = strchr(ptr, '/'); + if (!slash) + ipaccess[ips].mask = 0xffffffff; + else { + *(slash++) = '\0'; + mask = atoi(slash); + if (mask < 1 || mask > 32) + goto popipo; // skip invalid/zero + + ipaccess[ips].mask = 0; + while (mask-- >= 0) { + octet = 1 << (mask % 8); + ipaccess[ips].mask |= (octet << (24 - (8 * (mask >> 3)))); + } + } + + ipaccess[ips].ip = 0; // missing default to '.0' + for (i = 0; ptr && (i < 4); i++) { + dot = strchr(ptr, '.'); + if (dot) + *(dot++) = '\0'; + octet = atoi(ptr); + + if (octet < 0 || octet > 0xff) + goto popipo; // skip invalid + + ipaccess[ips].ip |= (octet << (24 - (i * 8))); + + ptr = dot; + } + + ipaccess[ips].ip &= ipaccess[ips].mask; + } + + ips++; +popipo: + ptr = comma; + } + + free(buf); +} + +static bool check_connect(struct sockaddr_in *cli, char **connectaddr, char *group) +{ + bool addrok = false; + + *connectaddr = inet_ntoa(cli->sin_addr); + + *group = NOPRIVGROUP; + if (opt_api_allow) { + int client_ip = htonl(cli->sin_addr.s_addr); + for (int i = 0; i < ips; i++) { + if ((client_ip & ipaccess[i].mask) == ipaccess[i].ip) { + addrok = true; + *group = ipaccess[i].group; + break; + } + } + } + else if (strcmp(opt_api_bind, ALLIP4) == 0) + addrok = true; + else + addrok = (strcmp(*connectaddr, localaddr) == 0); + + return addrok; +} + +static void mcast() +{ + struct sockaddr_in listen; + struct ip_mreq grp; + struct sockaddr_in came_from; + time_t bindstart; + char *binderror; + SOCKETTYPE mcast_sock; + SOCKETTYPE reply_sock; + socklen_t came_from_siz; + char *connectaddr; + ssize_t rep; + int bound; + int count; + int reply_port; + bool addrok; + char group; + + char expect[] = "ccminer-"; // first 8 bytes constant + char *expect_code; + size_t expect_code_len; + char buf[1024]; + char replybuf[1024]; + + memset(&grp, 0, sizeof(grp)); + grp.imr_multiaddr.s_addr = inet_addr(opt_api_mcast_addr); + if (grp.imr_multiaddr.s_addr == INADDR_NONE) + proper_exit(1); //, "Invalid Multicast Address"); + grp.imr_interface.s_addr = INADDR_ANY; + + mcast_sock = socket(AF_INET, SOCK_DGRAM, 0); + + int optval = 1; + if (SOCKETFAIL(setsockopt(mcast_sock, SOL_SOCKET, SO_REUSEADDR, (const char *)(&optval), sizeof(optval)))) { + applog(LOG_ERR, "API mcast setsockopt SO_REUSEADDR failed (%s)%s", strerror(errno), MUNAVAILABLE); + goto die; + } + + memset(&listen, 0, sizeof(listen)); + listen.sin_family = AF_INET; + listen.sin_addr.s_addr = 
INADDR_ANY; + listen.sin_port = htons(opt_api_mcast_port); + + // try for more than 1 minute ... in case the old one hasn't completely gone yet + bound = 0; + bindstart = time(NULL); + while (bound == 0) { + if (SOCKETFAIL(bind(mcast_sock, (struct sockaddr *)(&listen), sizeof(listen)))) { + binderror = strerror(errno);; + if ((time(NULL) - bindstart) > 61) + break; + else + sleep(30); + } + else + bound = 1; + } + + if (bound == 0) { + applog(LOG_ERR, "API mcast bind to port %d failed (%s)%s", opt_api_port, binderror, MUNAVAILABLE); + goto die; + } + + if (SOCKETFAIL(setsockopt(mcast_sock, IPPROTO_IP, IP_ADD_MEMBERSHIP, (const char *)(&grp), sizeof(grp)))) { + applog(LOG_ERR, "API mcast join failed (%s)%s", strerror(errno), MUNAVAILABLE); + goto die; + } + + expect_code_len = sizeof(expect) + strlen(opt_api_mcast_code); + expect_code = (char *)calloc(1, expect_code_len + 1); + if (!expect_code) + proper_exit(1); //, "Failed to malloc mcast expect_code"); + snprintf(expect_code, expect_code_len + 1, "%s%s-", expect, opt_api_mcast_code); + + count = 0; + while (42) { + sleep(1); + + count++; + came_from_siz = sizeof(came_from); + if (SOCKETFAIL(rep = recvfrom(mcast_sock, buf, sizeof(buf) - 1, + 0, (struct sockaddr *)(&came_from), &came_from_siz))) { + applog(LOG_DEBUG, "API mcast failed count=%d (%s) (%d)", + count, strerror(errno), (int)mcast_sock); + continue; + } + + addrok = check_connect(&came_from, &connectaddr, &group); + applog(LOG_DEBUG, "API mcast from %s - %s", + connectaddr, addrok ? "Accepted" : "Ignored"); + if (!addrok) { + continue; + } + + buf[rep] = '\0'; + if (rep > 0 && buf[rep - 1] == '\n') + buf[--rep] = '\0'; + + applog(LOG_DEBUG, "API mcast request rep=%d (%s) from %s:%d", + (int)rep, buf, + inet_ntoa(came_from.sin_addr), + ntohs(came_from.sin_port)); + + if ((size_t)rep > expect_code_len && memcmp(buf, expect_code, expect_code_len) == 0) { + reply_port = atoi(&buf[expect_code_len]); + if (reply_port < 1 || reply_port > 65535) { + applog(LOG_DEBUG, "API mcast request ignored - invalid port (%s)", + &buf[expect_code_len]); + } + else { + applog(LOG_DEBUG, "API mcast request OK port %s=%d", + &buf[expect_code_len], reply_port); + + came_from.sin_port = htons(reply_port); + reply_sock = socket(AF_INET, SOCK_DGRAM, 0); + + snprintf(replybuf, sizeof(replybuf), + "ccm-%s-%d-%s", opt_api_mcast_code, opt_api_port, opt_api_mcast_des); + + rep = sendto(reply_sock, replybuf, (int) strlen(replybuf) + 1, + 0, (struct sockaddr *)(&came_from), (int) sizeof(came_from)); + if (SOCKETFAIL(rep)) { + applog(LOG_DEBUG, "API mcast send reply failed (%s) (%d)", + strerror(errno), (int)reply_sock); + } else { + applog(LOG_DEBUG, "API mcast send reply (%s) succeeded (%d) (%d)", + replybuf, (int)rep, (int)reply_sock); + } + + CLOSESOCKET(reply_sock); + } + } + else + applog(LOG_DEBUG, "API mcast request was no good"); + } + +die: + CLOSESOCKET(mcast_sock); +} + +static void *mcast_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info *)userdata; + + pthread_detach(pthread_self()); + pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); + + mcast(); + + //PTH(mythr) = 0L; + + return NULL; +} + +void mcast_init() +{ + struct thr_info *thr; + + thr = (struct thr_info *)calloc(1, sizeof(*thr)); + if (!thr) + proper_exit(1); //, "Failed to calloc mcast thr"); + + if (unlikely(pthread_create(&thr->pth, NULL, mcast_thread, thr))) + proper_exit(1); //, "API mcast thread create failed"); +} + +static void api() +{ + const char *addr = opt_api_bind; + unsigned short port = (unsigned 
short) opt_api_port; // 4068 + char buf[MYBUFSIZ]; + int n, bound; + char *connectaddr; + char *binderror; + char group; + time_t bindstart; + struct sockaddr_in serv; + struct sockaddr_in cli; + socklen_t clisiz; + bool addrok = false; + long long counter; + char *result; + char *params; + int i; + + SOCKETTYPE c; + SOCKETTYPE *apisock; + if (!opt_api_port && opt_debug) { + applog(LOG_DEBUG, "API disabled"); + return; + } + + setup_groups(); + + if (opt_api_allow) { + setup_ipaccess(); + if (ips == 0) { + applog(LOG_WARNING, "API not running (no valid IPs specified)%s", UNAVAILABLE); + } + } + + apisock = (SOCKETTYPE*) calloc(1, sizeof(*apisock)); + *apisock = INVSOCK; + + sleep(1); + + *apisock = socket(AF_INET, SOCK_STREAM, 0); + if (*apisock == INVSOCK) { + applog(LOG_ERR, "API initialisation failed (%s)%s", strerror(errno), UNAVAILABLE); + return; + } + + memset(&serv, 0, sizeof(serv)); + serv.sin_family = AF_INET; + serv.sin_addr.s_addr = inet_addr(addr); // TODO: allow bind to ip/interface + if (serv.sin_addr.s_addr == (in_addr_t)INVINETADDR) { + applog(LOG_ERR, "API initialisation 2 failed (%s)%s", strerror(errno), UNAVAILABLE); + // free(apisock); FIXME!! + return; + } + + serv.sin_port = htons(port); + +#ifndef WIN32 + // On linux with SO_REUSEADDR, bind will get the port if the previous + // socket is closed (even if it is still in TIME_WAIT) but fail if + // another program has it open - which is what we want + int optval = 1; + // If it doesn't work, we don't really care - just show a debug message + if (SOCKETFAIL(setsockopt(*apisock, SOL_SOCKET, SO_REUSEADDR, (void *)(&optval), sizeof(optval)))) + applog(LOG_DEBUG, "API setsockopt SO_REUSEADDR failed (ignored): %s", SOCKERRMSG); +#else + // On windows a 2nd program can bind to a port>1024 already in use unless + // SO_EXCLUSIVEADDRUSE is used - however then the bind to a closed port + // in TIME_WAIT will fail until the timeout - so we leave the options alone +#endif + + // try for 1 minute ... 
in case the old one hasn't completely gone yet + bound = 0; + bindstart = time(NULL); + while (bound == 0) { + if (bind(*apisock, (struct sockaddr *)(&serv), sizeof(serv)) < 0) { + binderror = strerror(errno); + if ((time(NULL) - bindstart) > 61) + break; + else if (opt_api_port == 4068) { + /* when port is default one, use first available */ + if (opt_debug) + applog(LOG_DEBUG, "API bind to port %d failed, trying port %u", + port, (uint32_t) port+1); + port++; + serv.sin_port = htons(port); + sleep(1); + } else { + if (!opt_quiet || opt_debug) + applog(LOG_WARNING, "API bind to port %u failed - trying again in 20sec", + (uint32_t) port); + sleep(20); + } + } + else { + bound = 1; + if (opt_api_port != port) { + applog(LOG_WARNING, "API bind to port %d failed - using port %u", + opt_api_port, (uint32_t)port); + opt_api_port = port; + } + } + } + + if (bound == 0) { + applog(LOG_WARNING, "API bind to port %d failed (%s)%s", port, binderror, UNAVAILABLE); + free(apisock); + return; + } + + if (SOCKETFAIL(listen(*apisock, QUEUE))) { + applog(LOG_ERR, "API initialisation 3 failed (%s)%s", strerror(errno), UNAVAILABLE); + CLOSESOCKET(*apisock); + free(apisock); + return; + } + + if (opt_api_allow && strcmp(opt_api_bind, "127.0.0.1") == 0) + applog(LOG_WARNING, "API open locally in full access mode on port %d", opt_api_port); + else if (opt_api_allow) + applog(LOG_WARNING, "API open in full access mode to %s on port %d", opt_api_allow, opt_api_port); + else if (strcmp(opt_api_bind, "127.0.0.1") != 0) + applog(LOG_INFO, "API open to the network in read-only mode on port %d", opt_api_port); + + if (opt_api_mcast) + mcast_init(); + + buffer = (char *) calloc(1, MYBUFSIZ + 1); + + counter = 0; + while (bye == 0 && !abort_flag) { + counter++; + + clisiz = sizeof(cli); + c = accept(*apisock, (struct sockaddr*) (&cli), &clisiz); + if (SOCKETFAIL(c)) { + applog(LOG_ERR, "API failed (%s)%s", strerror(errno), UNAVAILABLE); + CLOSESOCKET(*apisock); + free(apisock); + free(buffer); + return; + } + + addrok = check_connect(&cli, &connectaddr, &group); + if (opt_debug && opt_protocol) + applog(LOG_DEBUG, "API: connection from %s - %s", + connectaddr, addrok ? "Accepted" : "Ignored"); + + if (addrok) { + bool fail; + char *wskey = NULL; + n = recv(c, &buf[0], SOCK_REC_BUFSZ, 0); + + fail = SOCKETFAIL(n) || n < 0; + if (fail) + buf[0] = '\0'; + else if (n > 0 && buf[n-1] == '\n') { + /* telnet compat \r\n */ + buf[n-1] = '\0'; n--; + if (n > 0 && buf[n-1] == '\r') + buf[n-1] = '\0'; + } + else buf[n] = '\0'; + + //if (opt_debug && opt_protocol && n > 0) + // applog(LOG_DEBUG, "API: recv command: (%d) '%s'+char(%x)", n, buf, buf[n-1]); + + if (!fail) { + char *msg = NULL; + /* Websocket requests compat. */ + if ((msg = strstr(buf, "GET /")) && strlen(msg) > 5) { + char cmd[256] = { 0 }; + sscanf(&msg[5], "%s\n", cmd); + params = strchr(cmd, '/'); + if (params) + *(params++) = '|'; + params = strchr(cmd, '/'); + if (params) + *(params++) = '\0'; + wskey = strstr(msg, "Sec-WebSocket-Key"); + if (wskey) { + char *eol = strchr(wskey, '\r'); + if (eol) *eol = '\0'; + wskey = strchr(wskey, ':'); + wskey++; + while ((*wskey) == ' ') wskey++; // ltrim + } + n = sprintf(buf, "%s", cmd); + } + + params = strchr(buf, '|'); + if (params != NULL) + *(params++) = '\0'; + + if (opt_debug && opt_protocol && n > 0) + applog(LOG_DEBUG, "API: exec command %s(%s)", buf, params ? 
params : ""); + + for (i = 0; i < CMDMAX; i++) { + if (strcmp(buf, cmds[i].name) == 0 && strlen(buf)) { + if (params && strlen(params)) { + // remove possible trailing | + if (params[strlen(params)-1] == '|') + params[strlen(params)-1] = '\0'; + } + result = (cmds[i].func)(params); + if (wskey) { + websocket_handshake(c, result, wskey); + break; + } + send_result(c, result); + break; + } + } + } + CLOSESOCKET(c); + } + } + + CLOSESOCKET(*apisock); + free(apisock); + free(buffer); +} + +/* external access */ +void *api_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info*)userdata; + + startup = time(NULL); + api(); + tq_freeze(mythr->q); + + if (bye) { + // quit command + proper_exit(1); + } + + return NULL; +} + +/* to be able to report the default value set in each algo */ +void api_set_throughput(int thr_id, uint32_t throughput) +{ + if (thr_id < MAX_GPUS && thr_info) { + struct cgpu_info *cgpu = &thr_info[thr_id].gpu; + cgpu->intensity = throughput2intensity(throughput); + if (cgpu->throughput != throughput) cgpu->throughput = throughput; + } + // to display in bench results + if (opt_benchmark) + bench_set_throughput(thr_id, throughput); +} diff --git a/api/index.php b/api/index.php new file mode 100644 index 0000000000..231fc393a7 --- /dev/null +++ b/api/index.php @@ -0,0 +1,280 @@ +'local-sample.php', + //'EPSYTOUR'=>'epsytour.php', /* copy local.php file and edit target IP:PORT */ +); + +// 3 seconds max. +set_time_limit(3); +error_reporting(0); + +function getdataFromPeers() +{ + global $host, $configs; + $data = array(); + foreach ($configs as $name => $conf) { + + $json = file_get_contents($host.$conf); + + $data[$name] = json_decode($json, TRUE); + } + return $data; +} + +function ignoreField($key) +{ + $ignored = array( + 'API','VER','GPU','BUS','POOLS', + 'CARD','GPUS','CPU','TS','URL', + ); + return in_array($key, $ignored); +} + +function translateField($key) +{ + $intl = array(); + $intl['NAME'] = 'Software'; + $intl['VER'] = 'Version'; + + $intl['ALGO'] = 'Algorithm'; + $intl['GPUS'] = 'GPUs'; + $intl['CPUS'] = 'Threads'; + $intl['KHS'] = 'Hash rate'; + $intl['ACC'] = 'Accepted shares'; + $intl['ACCMN'] = 'Accepted / mn'; + $intl['REJ'] = 'Rejected'; + $intl['SOLV'] = 'Solved'; + $intl['BEST'] = 'Best share'; + $intl['STALE'] = 'Stale shares'; + $intl['LAST'] = 'Last share'; + $intl['DIFF'] = 'Difficulty'; + $intl['NETKHS'] = 'Net Rate'; + $intl['UPTIME'] = 'Miner up time'; + $intl['TS'] = 'Last update'; + $intl['THR'] = 'Throughput'; + $intl['WAIT'] = 'Wait time'; + + $intl['H'] = 'Bloc height'; + $intl['I'] = 'Intensity'; + $intl['HWF'] = 'Failures'; + $intl['POOL'] = 'Pool'; + $intl['POOLS'] = 'Pools'; + + $intl['TEMP'] = 'T°c'; + $intl['FAN'] = 'Fan %'; + $intl['CPUFREQ'] = 'CPU Freq.'; + $intl['FREQ'] = 'Base Freq.'; + $intl['MEMFREQ'] = 'Mem. Freq.'; + $intl['GPUF'] = 'Curr Freq.'; + $intl['MEMF'] = 'Mem. 
Freq.'; + $intl['KHW'] = 'Efficiency'; + $intl['POWER'] = 'Power'; + $intl['PLIM'] = 'P.Limit'; + $intl['PST'] = 'P-State'; + + // pool infos + $intl['POOL'] = 'Pool'; + $intl['PING'] = 'Ping (ms)'; + $intl['DISCO'] = 'Disconnects'; + $intl['USER'] = 'User'; + + if (isset($intl[$key])) + return $intl[$key]; + else + return $key; +} + +function translateValue($key,$val,$data=array()) +{ + switch ($key) { + case 'UPTIME': + case 'WAIT': + $min = floor(intval($val) / 60); + $sec = intval($val) % 60; + $val = "${min}mn${sec}s"; + if ($min > 180) { + $hrs = floor($min / 60); + $min = $min % 60; + $val = "${hrs}h${min}mn"; + } + break; + case 'NAME': + $val = $data['NAME'].' '.$data['VER']; + break; + case 'CPUFREQ': + case 'FREQ': + case 'MEMFREQ': + case 'GPUF': + case 'MEMF': + $val = sprintf("%d MHz", $val); + break; + case 'POWER': + $val = sprintf("%d W", round(floatval($val)/1000.0)); + break; + case 'TS': + $val = strftime("%H:%M:%S", (int) $val); + break; + case 'KHS': + case 'NETKHS': + $val = ''.$val.' kH/s'; + break; + case 'KHW': + $val = $val.' kH/W'; + break; + case 'NAME': + case 'POOL'; + case 'USER': + // long fields + $val = ''.$val.''; + break; + } + return $val; +} + +function filterPoolInfos($stats) +{ + $keys = array('USER','H','PING','DISCO'); + $data = array(); + $pool = array_pop($stats); + // simplify URL to host only + $data['POOL'] = $pool['URL']; + if (strstr($pool['URL'],'://')) { + $parts = explode(':', $pool['URL']); + $data['POOL'] = substr($parts[1],2); + } + foreach ($pool as $key=>$val) { + if (in_array($key, $keys)) + $data[$key] = $val; + } + return $data; +} + +function displayData($data) +{ + $htm = ''; + $totals = array(); + foreach ($data as $name => $stats) { + if (!isset($stats['summary'])) + continue; + $htm .= ''."\n"; + $htm .= '\n"; + if (!empty($stats)) { + $summary = (array) $stats['summary']; + foreach ($summary as $key=>$val) { + if (!empty($val) && !ignoreField($key)) + $htm .= ''. + '\n"; + } + if (isset($summary['KHS'])) + @ $totals[$summary['ALGO']] += floatval($summary['KHS']); + + if (isset($stats['pool']) && !empty($stats['pool']) ) { + $pool = filterPoolInfos($stats['pool']); + $htm .= ''."\n"; + foreach ($pool as $key=>$val) { + if (!empty($val) && !ignoreField($key)) + $htm .= ''. + '\n"; + } + } + + foreach ($stats['threads'] as $g=>$gpu) { + $card = isset($gpu['CARD']) ? $gpu['CARD'] : ''; + $htm .= '\n"; + foreach ($gpu as $key=>$val) { + if (!empty($val) && !ignoreField($key)) + $htm .= ''. + '\n"; + } + } + } + $htm .= "
'.$name."
'.translateField($key).''.translateValue($key, $val, $summary)."
POOL
'.translateField($key).''.translateValue($key, $val)."
'.$g." $card
'.translateField($key).''.translateValue($key, $val)."
\n"; + } + // totals + if (!empty($totals)) { + $htm .= '

Total Hash rate

'."\n"; + foreach ($totals as $algo => $hashrate) { + $htm .= '
  • '.$algo.":$hashrate kH/s
  • \n"; + } + $htm .= '
    '; + } + return $htm; +} + +$data = getdataFromPeers(); + +?> + + + ccminer rig api sample + + + + + + + +
    + +
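The api/local-sample.php hunk that follows polls this text interface from PHP and re-exposes it as JSON. For reference, a minimal C sketch of the same exchange (illustrative only, not part of the patch; it assumes the default 127.0.0.1:4068 bind and ignores partial reads):

/* query the ccminer text API: KEY=VALUE fields separated by ';', records end with '|' */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	char reply[4096] = { 0 };
	struct sockaddr_in api;

	memset(&api, 0, sizeof(api));
	api.sin_family = AF_INET;
	api.sin_port = htons(4068); /* default --api-bind port */
	inet_pton(AF_INET, "127.0.0.1", &api.sin_addr);

	int s = socket(AF_INET, SOCK_STREAM, 0);
	if (s < 0 || connect(s, (struct sockaddr*) &api, sizeof(api)) < 0)
		return 1;

	send(s, "summary|", 8, 0);
	recv(s, reply, sizeof(reply) - 1, 0); /* the miner '\0'-terminates its answer */
	close(s);

	for (char *field = strtok(reply, ";|"); field; field = strtok(NULL, ";|"))
		printf("%s\n", field); /* e.g. NAME=..., ALGO=..., KHS=..., UPTIME=... */
	return 0;
}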
    + + + + + diff --git a/api/local-sample.php b/api/local-sample.php new file mode 100644 index 0000000000..33ed49c658 --- /dev/null +++ b/api/local-sample.php @@ -0,0 +1,137 @@ + 0) { + $err = socket_last_error($socket); + echo "."; + if ($timeout > 1 && ($err == 115 || $err == 114)) { + $timeout--; + usleep(50); + $res = socket_connect($socket, API_HOST, $port); + continue; + } + $error = socket_strerror($err); + $msg = "socket connect($port) failed"; + echo "ERR: $msg '$error'\n"; + socket_close($socket); + return NULL; + } + + socket_set_block($socket); + + return $socket; +} + +function readsockline($socket) +{ + $line = ''; + while (true) { + $byte = socket_read($socket, 1); + if ($byte === false || $byte === '') + break; + if ($byte === "\0") + break; + $line .= $byte; + } + return $line; +} + + +function request($cmd) +{ + $socket = getsock(API_PORT); + if ($socket == null) + return NULL; + + socket_write($socket, $cmd, strlen($cmd)); + $line = readsockline($socket); + socket_close($socket); + + if (strlen($line) == 0) { + echo "WARN: '$cmd' returned nothing\n"; + return $line; + } + + echo "$cmd returned '$line'\n"; + + $data = array(); + + $objs = explode('|', $line); + foreach ($objs as $obj) + { + if (strlen($obj) > 0) + { + $items = explode(';', $obj); + $item = $items[0]; + $id = explode('=', $items[0], 2); + if (count($id) == 1) + $name = $id[0]; + else + $name = $id[0].$id[1]; + + if (strlen($name) == 0) + $name = 'null'; + + if (isset($data[$name])) { + $num = 1; + while (isset($data[$name.$num])) + $num++; + $name .= $num; + } + + $counter = 0; + foreach ($items as $item) + { + $id = explode('=', $item, 2); + if (count($id) == 2) + $data[$name][$id[0]] = $id[1]; + else + $data[$name][$counter] = $id[0]; + + $counter++; + } + + } + } + if ($cmd == 'summary') + return array_pop($data); + else + return $data; +} + +ob_start(); + +error_reporting(0); + +$summary = request('summary'); +$threads = request('threads'); +$pool = request('pool'); + +ob_end_clean(); /* swap to debug */ +//echo ob_get_clean()."\n"; + +header("Content-Type: application/json"); +echo json_encode(compact('summary', 'threads', 'pool'))."\n"; +?> diff --git a/api/summary.pl b/api/summary.pl new file mode 100755 index 0000000000..e04a8ec055 --- /dev/null +++ b/api/summary.pl @@ -0,0 +1,34 @@ +#!/usr/bin/perl + +# sample script to query ccminer API + +my $command = "summary|"; + +use Socket; +use IO::Socket::INET; + +my $sock = new IO::Socket::INET ( + PeerAddr => '127.0.0.1', + PeerPort => 4068, + Proto => 'tcp', + ReuseAddr => 1, + Timeout => 10, +); + +if ($sock) { + + print $sock $command; + my $res = ""; + + while(<$sock>) { + $res .= $_; + } + + close($sock); + print("$res\n"); + +} else { + + print("ccminer socket failed\n"); + +} diff --git a/api/websocket.htm b/api/websocket.htm new file mode 100644 index 0000000000..bf2e1cb102 --- /dev/null +++ b/api/websocket.htm @@ -0,0 +1,130 @@ + + + + + + + + +
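The bench.cpp hunk below makes every GPU thread finish and report the current algorithm before any thread frees its memory and moves on, using the two pthread barriers (miner_barr, algo_barr) set up in bench_init(); the patch ships a compat shim for macOS, which lacks POSIX barriers. A stripped-down sketch of that two-barrier rendezvous (thread count and loop body are made up for illustration):

/* N worker threads rendezvous twice per round, like bench_algo_switch_next() */
#include <pthread.h>
#include <stdio.h>

#define THREADS 4
static pthread_barrier_t miner_barr, algo_barr;

static void* worker(void *arg)
{
	int id = (int)(long) arg;
	for (int round = 0; round < 3; round++) {
		/* ... benchmark the current algo ... */
		pthread_barrier_wait(&miner_barr); /* all threads done hashing */
		printf("thread %d: round %d stats\n", id, round);
		pthread_barrier_wait(&algo_barr);  /* all stats printed, now switch algo */
	}
	return NULL;
}

int main(void)
{
	pthread_t th[THREADS];
	pthread_barrier_init(&miner_barr, NULL, THREADS);
	pthread_barrier_init(&algo_barr, NULL, THREADS);
	for (long i = 0; i < THREADS; i++)
		pthread_create(&th[i], NULL, worker, (void*) i);
	for (int i = 0; i < THREADS; i++)
		pthread_join(th[i], NULL);
	pthread_barrier_destroy(&miner_barr);
	pthread_barrier_destroy(&algo_barr);
	return 0;
}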
    + + + diff --git a/bench.cpp b/bench.cpp new file mode 100644 index 0000000000..d3c770137a --- /dev/null +++ b/bench.cpp @@ -0,0 +1,265 @@ +/** + * Made to benchmark and test algo switch + * + * 2015 - tpruvot@github + */ + +#include + +#include "miner.h" +#include "algos.h" +#include + +#ifdef __APPLE__ +#include "compat/pthreads/pthread_barrier.hpp" +#endif + +int bench_algo = -1; + +static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 }; +static uint32_t algo_throughput[MAX_GPUS][ALGO_COUNT] = { 0 }; +static int algo_mem_used[MAX_GPUS][ALGO_COUNT] = { 0 }; +static int device_mem_free[MAX_GPUS] = { 0 }; + +static pthread_barrier_t miner_barr; +static pthread_barrier_t algo_barr; +static pthread_mutex_t bench_lock = PTHREAD_MUTEX_INITIALIZER; + +extern double thr_hashrates[MAX_GPUS]; + +void bench_init(int threads) +{ + bench_algo = opt_algo = (enum sha_algos) 0; /* first */ + applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]); + pthread_barrier_init(&miner_barr, NULL, threads); + pthread_barrier_init(&algo_barr, NULL, threads); + // required for usage of first algo. + for (int n=0; n < opt_n_threads; n++) { + device_mem_free[n] = cuda_available_memory(n); + } +} + +void bench_free() +{ + pthread_barrier_destroy(&miner_barr); + pthread_barrier_destroy(&algo_barr); +} + +// required to switch algos +void algo_free_all(int thr_id) +{ + // only initialized algos will be freed + free_allium(thr_id); + free_bastion(thr_id); + free_bitcore(thr_id); + free_blake256(thr_id); + free_blake2b(thr_id); + free_blake2s(thr_id); + free_bmw(thr_id); + free_c11(thr_id); + free_cryptolight(thr_id); + free_cryptonight(thr_id); + free_decred(thr_id); + free_deep(thr_id); + free_equihash(thr_id); + free_exosis(thr_id); + free_keccak256(thr_id); + free_fresh(thr_id); + free_fugue256(thr_id); + free_groestlcoin(thr_id); +#ifdef WITH_HEAVY_ALGO + free_heavy(thr_id); +#endif + free_hmq17(thr_id); + free_hsr(thr_id); + free_jackpot(thr_id); + free_jha(thr_id); + free_lbry(thr_id); + free_luffa(thr_id); + free_lyra2(thr_id); + free_lyra2v2(thr_id); + free_lyra2v3(thr_id); + free_lyra2Z(thr_id); + free_myriad(thr_id); + free_neoscrypt(thr_id); + free_nist5(thr_id); + free_pentablake(thr_id); + free_phi(thr_id); + free_phi2(thr_id); + free_polytimos(thr_id); + free_quark(thr_id); + free_qubit(thr_id); + free_skeincoin(thr_id); + free_skein2(thr_id); + free_skunk(thr_id); + free_sha256d(thr_id); + free_sha256t(thr_id); + free_sha256q(thr_id); + free_sia(thr_id); + free_sib(thr_id); + free_sonoa(thr_id); + free_s3(thr_id); + free_vanilla(thr_id); + free_veltor(thr_id); + free_whirl(thr_id); + //free_whirlx(thr_id); + free_wildkeccak(thr_id); + free_x11evo(thr_id); + free_x11(thr_id); + free_x12(thr_id); + free_x13(thr_id); + free_x14(thr_id); + free_x15(thr_id); + free_x16r(thr_id); + free_x16s(thr_id); + free_x17(thr_id); + free_zr5(thr_id); + free_scrypt(thr_id); + free_scrypt_jane(thr_id); + free_timetravel(thr_id); + free_tribus(thr_id); + free_bitcore(thr_id); +} + +// benchmark all algos (called once per mining thread) +bool bench_algo_switch_next(int thr_id) +{ + int algo = (int) opt_algo; + int prev_algo = algo; + int dev_id = device_map[thr_id % MAX_GPUS]; + int mfree, mused; + // doesnt seems enough to prevent device slow down + // after some algo switchs + bool need_reset = (gpu_threads == 1); + + algo++; + + // skip some duplicated algos + if (algo == ALGO_C11) algo++; // same as x11 + if (algo == ALGO_DMD_GR) algo++; // same as groestl + if (algo == ALGO_HEAVY) algo++; // 
dead + if (algo == ALGO_MJOLLNIR) algo++; // same as heavy + if (algo == ALGO_KECCAKC) algo++; // same as keccak + if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool + if (algo == ALGO_WHIRLPOOLX) algo++; // disabled + // todo: algo switch from RPC 2.0 + if (algo == ALGO_CRYPTOLIGHT) algo++; + if (algo == ALGO_CRYPTONIGHT) algo++; + if (algo == ALGO_WILDKECCAK) algo++; + if (algo == ALGO_QUARK) algo++; // to fix + if (algo == ALGO_LBRY && CUDART_VERSION < 7000) algo++; + + if (device_sm[dev_id] && device_sm[dev_id] < 300) { + // incompatible SM 2.1 kernels... + if (algo == ALGO_GROESTL) algo++; + if (algo == ALGO_MYR_GR) algo++; + if (algo == ALGO_NEOSCRYPT) algo++; + if (algo == ALGO_WHIRLPOOLX) algo++; + } + // and unwanted ones... + if (algo == ALGO_SCRYPT) algo++; + if (algo == ALGO_SCRYPT_JANE) algo++; + + // Set cryptonight variant + switch (algo) { + case ALGO_MONERO: + cryptonight_fork = 7; + break; + case ALGO_GRAFT: + cryptonight_fork = 8; + break; + case ALGO_STELLITE: + cryptonight_fork = 3; + break; + case ALGO_CRYPTONIGHT: + cryptonight_fork = 1; + break; + } + + // free current algo memory and track mem usage + mused = cuda_available_memory(thr_id); + algo_free_all(thr_id); + CUDA_LOG_ERROR(); + + // device can take some time to free + mfree = cuda_available_memory(thr_id); + if (device_mem_free[thr_id] > mfree) { + sleep(1); + mfree = cuda_available_memory(thr_id); + } + + // we need to wait completion on all cards before the switch + if (opt_n_threads > 1) { + pthread_barrier_wait(&miner_barr); + } + + char rate[32] = { 0 }; + double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]); + format_hashrate(hashrate, rate); + gpulog(LOG_NOTICE, thr_id, "%s hashrate = %s", algo_names[prev_algo], rate); + + // ensure memory leak is still real after the barrier + if (device_mem_free[thr_id] > mfree) { + mfree = cuda_available_memory(thr_id); + } + + // check if there is memory leak + if (device_mem_free[thr_id] - mfree > 1) { + gpulog(LOG_WARNING, thr_id, "possible %d MB memory leak in %s! 
%d MB free", + (device_mem_free[thr_id] - mfree), algo_names[prev_algo], mfree); + cuda_reset_device(thr_id, NULL); // force to free the leak + need_reset = false; + mfree = cuda_available_memory(thr_id); + } + // store used memory per algo + algo_mem_used[thr_id][opt_algo] = device_mem_free[thr_id] - mused; + device_mem_free[thr_id] = mfree; + + // store to dump a table per gpu later + algo_hashrates[thr_id][prev_algo] = hashrate; + + // wait the other threads to display logs correctly + if (opt_n_threads > 1) { + pthread_barrier_wait(&algo_barr); + } + + if (algo == ALGO_AUTO) + return false; // all algos done + + // mutex primary used for the stats purge + pthread_mutex_lock(&bench_lock); + stats_purge_all(); + + opt_algo = (enum sha_algos) algo; + global_hashrate = 0; + thr_hashrates[thr_id] = 0; // reset for minmax64 + pthread_mutex_unlock(&bench_lock); + + if (need_reset) + cuda_reset_device(thr_id, NULL); + + if (thr_id == 0) + applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]); + + return true; +} + +void bench_set_throughput(int thr_id, uint32_t throughput) +{ + algo_throughput[thr_id][opt_algo] = throughput; +} + +void bench_display_results() +{ + for (int n=0; n < opt_n_threads; n++) + { + int dev_id = device_map[n]; + applog(LOG_BLUE, "Benchmark results for GPU #%d - %s:", dev_id, device_name[dev_id]); + for (int i=0; i < ALGO_COUNT-1; i++) { + double rate = algo_hashrates[n][i]; + if (rate == 0.0) continue; + applog(LOG_INFO, "%12s : %12.1f kH/s, %5d MB, %8u thr.", algo_names[i], + rate / 1024., algo_mem_used[n][i], algo_throughput[n][i]); + } + } +} diff --git a/bignum.cpp b/bignum.cpp new file mode 100644 index 0000000000..fe4cc3452f --- /dev/null +++ b/bignum.cpp @@ -0,0 +1,100 @@ +/** + * Wrapper to OpenSSL BIGNUM used by net diff (nBits) + */ + +#include + +#include "uint256.h" + +#include +#if OPENSSL_VERSION_NUMBER < 0x10100000L +#include "compat/bignum_ssl10.hpp" +#else +#include "bignum.hpp" +#endif + +#include "miner.h" // hex2bin + +extern "C" double bn_convert_nbits(const uint32_t nBits) +{ + uint256 bn = CBigNum().SetCompact(nBits).getuint256(); + return bn.getdouble(); +} + +// copy the big number to 32-bytes uchar +extern "C" void bn_nbits_to_uchar(const uint32_t nBits, unsigned char *target) +{ + char buff[65]; + uint256 bn = CBigNum().SetCompact(nBits).getuint256(); + + snprintf(buff, 65, "%s\n", bn.ToString().c_str()); buff[64] = '\0'; + hex2bin(target, buff, 32); +} + +// unused, but should allow more than 256bits targets +#if 0 +extern "C" double bn_hash_target_ratio(uint32_t* hash, uint32_t* target) +{ + double dhash; + + if (!opt_showdiff) + return 0.0; + + CBigNum h(0), t(0); + std::vector vch(32); + + memcpy(&vch[0], (void*) target, 32); + t.setvch(vch); + memcpy(&vch[0], (void*) hash, 32); + h.setvch(vch); + + dhash = h.getuint256().getdouble(); + if (dhash > 0.) + return t.getuint256().getdouble() / dhash; + else + return dhash; +} +#endif + +// compute the diff ratio between a found hash and the target +extern "C" double bn_hash_target_ratio(uint32_t* hash, uint32_t* target) +{ + uint256 h, t; + double dhash; + + if (!opt_showdiff) + return 0.0; + + memcpy(&t, (void*) target, 32); + memcpy(&h, (void*) hash, 32); + + dhash = h.getdouble(); + if (dhash > 0.) 
+ return t.getdouble() / dhash; + else + return dhash; +} + +// store ratio in work struct +extern "C" void bn_store_hash_target_ratio(uint32_t* hash, uint32_t* target, struct work* work, int nonce) +{ + // only if the option is enabled (to reduce cpu usage) + if (!opt_showdiff) return; + if (nonce < 0 || nonce >= MAX_NONCES) return; + + work->shareratio[nonce] = bn_hash_target_ratio(hash, target); + work->sharediff[nonce] = work->targetdiff * work->shareratio[nonce]; +} + +// new method to save all nonce(s) share diff/ration +extern "C" void bn_set_target_ratio(struct work* work, uint32_t* hash, int nonce) +{ + bn_store_hash_target_ratio(hash, work->target, work, nonce); +} + +// compat (only store single nonce share diff per work) +extern "C" void work_set_target_ratio(struct work* work, uint32_t* hash) +{ + bn_store_hash_target_ratio(hash, work->target, work, work->submit_nonce_id); +} + diff --git a/bignum.hpp b/bignum.hpp new file mode 100644 index 0000000000..414b6cb54c --- /dev/null +++ b/bignum.hpp @@ -0,0 +1,584 @@ +// Copyright (c) 2009-2010 Satoshi Nakamoto +// Copyright (c) 2009-2012 The Bitcoin developers +// Distributed under the MIT/X11 software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. +#ifndef BITCOIN_BIGNUM_H +#define BITCOIN_BIGNUM_H + +#ifndef PROTOCOL_VERSION +#define PROTOCOL_VERSION 70001 +#endif + +#include +#include + +#include + +#include "serialize.hpp" + +/** Errors thrown by the bignum class */ +class bignum_error : public std::runtime_error +{ +public: + explicit bignum_error(const std::string& str) : std::runtime_error(str) {} +}; + + +/** RAII encapsulated BN_CTX (OpenSSL bignum context) */ +class CAutoBN_CTX +{ +protected: + BN_CTX* pctx; + BN_CTX* operator=(BN_CTX* pnew) { return pctx = pnew; } + +public: + CAutoBN_CTX() + { + pctx = BN_CTX_new(); + if (pctx == NULL) + throw bignum_error("CAutoBN_CTX : BN_CTX_new() returned NULL"); + } + + ~CAutoBN_CTX() + { + if (pctx != NULL) + BN_CTX_free(pctx); + } + + operator BN_CTX*() { return pctx; } + BN_CTX& operator*() { return *pctx; } + BN_CTX** operator&() { return &pctx; } + bool operator!() { return (pctx == NULL); } +}; + + +/** C++ wrapper for BIGNUM (OpenSSL bignum) */ +class CBigNum +{ +private: + BIGNUM *self = NULL; + + void init() + { + if (self) + BN_clear_free(self); + self = BN_new(); + if (!self) + throw bignum_error("CBigNum::init(): BN_new() returned NULL"); + } + +public: + BIGNUM *get() { return self; } + const BIGNUM *getc() const { return self; } + + CBigNum() + { + init(); + } + + CBigNum(const CBigNum& b) + { + init(); + if (!BN_copy(self, b.getc())) + { + BN_clear_free(self); + throw bignum_error("CBigNum::CBigNum(const CBigNum&) : BN_copy failed"); + } + } + + CBigNum& operator=(const CBigNum& b) + { + if (!BN_copy(self, b.getc())) + throw bignum_error("CBigNum::operator= : BN_copy failed"); + return (*this); + } + + ~CBigNum() + { + BN_clear_free(self); + } + + //CBigNum(char n) is not portable. Use 'signed char' or 'unsigned char'. 
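bn_convert_nbits() above expands the 4-byte nBits field into a 256-bit target through CBigNum::SetCompact() further down: the top byte of nBits is a size in bytes and the low 24 bits are the mantissa, so the target equals mantissa * 256^(size-3). A plain-integer sketch of that decoding (the sample value 0x1b0404cb is the usual textbook one, not taken from this patch; the sign bit and sizes below 3 are ignored):

/* approximate an nBits compact target without OpenSSL (positive targets only) */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t nBits = 0x1b0404cb;            /* sample value */
	uint32_t size = nBits >> 24;            /* 0x1b = 27 bytes */
	uint32_t mantissa = nBits & 0x00ffffff; /* 0x0404cb */

	long double target = (long double) mantissa;
	for (uint32_t i = 3; i < size; i++)
		target *= 256.0L;                   /* shift left one byte per step */

	printf("target ~ %.6Le\n", target);     /* ~ 0x0404cb * 2^192 for this sample */
	return 0;
}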
+ CBigNum(signed char n) { init(); if (n >= 0) setulong(n); else setint64(n); } + CBigNum(short n) { init(); if (n >= 0) setulong(n); else setint64(n); } + CBigNum(int n) { init(); if (n >= 0) setulong(n); else setint64(n); } + CBigNum(long n) { init(); if (n >= 0) setulong(n); else setint64(n); } + CBigNum(int64 n) { init(); setint64(n); } + CBigNum(unsigned char n) { init(); setulong(n); } + CBigNum(unsigned short n) { init(); setulong(n); } + CBigNum(unsigned int n) { init(); setulong(n); } + CBigNum(unsigned long n) { init(); setulong(n); } + CBigNum(uint64 n) { init(); setuint64(n); } + explicit CBigNum(uint256 n) { init(); setuint256(n); } + + explicit CBigNum(const std::vector& vch) + { + init(); + setvch(vch); + } + + void setulong(unsigned long n) + { + if (!BN_set_word(self, n)) + throw bignum_error("CBigNum conversion from unsigned long : BN_set_word failed"); + } + + unsigned long getulong() const + { + return BN_get_word(self); + } + + unsigned int getuint() const + { + return BN_get_word(self); + } + + int getint() const + { + unsigned long n = BN_get_word(self); + if (!BN_is_negative(self)) + return (n > (unsigned long)std::numeric_limits::max() ? std::numeric_limits::max() : n); + else + return (n > (unsigned long)std::numeric_limits::max() ? std::numeric_limits::min() : -(int)n); + } + + void setint64(int64 sn) + { + unsigned char pch[sizeof(sn) + 6]; + unsigned char* p = pch + 4; + bool fNegative; + uint64 n; + + if (sn < (int64)0) + { + // Since the minimum signed integer cannot be represented as positive so long as its type is signed, and it's not well-defined what happens if you make it unsigned before negating it, we instead increment the negative integer by 1, convert it, then increment the (now positive) unsigned integer by 1 to compensate + n = -(sn + 1); + ++n; + fNegative = true; + } else { + n = sn; + fNegative = false; + } + + bool fLeadingZeroes = true; + for (int i = 0; i < 8; i++) + { + unsigned char c = (n >> 56) & 0xff; + n <<= 8; + if (fLeadingZeroes) + { + if (c == 0) + continue; + if (c & 0x80) + *p++ = (fNegative ? 
0x80 : 0); + else if (fNegative) + c |= 0x80; + fLeadingZeroes = false; + } + *p++ = c; + } + unsigned int nSize = p - (pch + 4); + pch[0] = (nSize >> 24) & 0xff; + pch[1] = (nSize >> 16) & 0xff; + pch[2] = (nSize >> 8) & 0xff; + pch[3] = (nSize) & 0xff; + BN_mpi2bn(pch, p - pch, self); + } + + uint64 getuint64() + { + unsigned int nSize = BN_bn2mpi(self, NULL); + if (nSize < 4) + return 0; + std::vector vch(nSize); + BN_bn2mpi(self, &vch[0]); + if (vch.size() > 4) + vch[4] &= 0x7f; + uint64 n = 0; + for (unsigned int i = 0, j = vch.size()-1; i < sizeof(n) && j >= 4; i++, j--) + ((unsigned char*)&n)[i] = vch[j]; + return n; + } + + void setuint64(uint64 n) + { + unsigned char pch[sizeof(n) + 6]; + unsigned char* p = pch + 4; + bool fLeadingZeroes = true; + for (int i = 0; i < 8; i++) + { + unsigned char c = (n >> 56) & 0xff; + n <<= 8; + if (fLeadingZeroes) + { + if (c == 0) + continue; + if (c & 0x80) + *p++ = 0; + fLeadingZeroes = false; + } + *p++ = c; + } + unsigned int nSize = p - (pch + 4); + pch[0] = (nSize >> 24) & 0xff; + pch[1] = (nSize >> 16) & 0xff; + pch[2] = (nSize >> 8) & 0xff; + pch[3] = (nSize) & 0xff; + BN_mpi2bn(pch, p - pch, self); + } + + void setuint256(uint256 n) + { + unsigned char pch[sizeof(n) + 6]; + unsigned char* p = pch + 4; + bool fLeadingZeroes = true; + unsigned char* pbegin = (unsigned char*)&n; + unsigned char* psrc = pbegin + sizeof(n); + while (psrc != pbegin) + { + unsigned char c = *(--psrc); + if (fLeadingZeroes) + { + if (c == 0) + continue; + if (c & 0x80) + *p++ = 0; + fLeadingZeroes = false; + } + *p++ = c; + } + unsigned int nSize = p - (pch + 4); + pch[0] = (nSize >> 24) & 0xff; + pch[1] = (nSize >> 16) & 0xff; + pch[2] = (nSize >> 8) & 0xff; + pch[3] = (nSize >> 0) & 0xff; + BN_mpi2bn(pch, p - pch, self); + } + + uint256 getuint256() + { + unsigned int nSize = BN_bn2mpi(self, NULL); + if (nSize < 4) + return 0; + std::vector vch(nSize); + BN_bn2mpi(self, &vch[0]); + if (vch.size() > 4) + vch[4] &= 0x7f; + uint256 n = 0; + for (unsigned int i = 0, j = vch.size()-1; i < sizeof(n) && j >= 4; i++, j--) + ((unsigned char*)&n)[i] = vch[j]; + return n; + } + + + void setvch(const std::vector& vch) + { + std::vector vch2(vch.size() + 4); + unsigned int nSize = vch.size(); + // BIGNUM's byte stream format expects 4 bytes of + // big endian size data info at the front + vch2[0] = (nSize >> 24) & 0xff; + vch2[1] = (nSize >> 16) & 0xff; + vch2[2] = (nSize >> 8) & 0xff; + vch2[3] = (nSize >> 0) & 0xff; + // swap data to big endian + reverse_copy(vch.begin(), vch.end(), vch2.begin() + 4); + BN_mpi2bn(&vch2[0], vch2.size(), self); + } + + std::vector getvch() const + { + unsigned int nSize = BN_bn2mpi(self, NULL); + if (nSize <= 4) + return std::vector(); + std::vector vch(nSize); + BN_bn2mpi(self, &vch[0]); + vch.erase(vch.begin(), vch.begin() + 4); + reverse(vch.begin(), vch.end()); + return vch; + } + + CBigNum& SetCompact(unsigned int nCompact) + { + unsigned int nSize = nCompact >> 24; + std::vector vch(4 + nSize); + vch[3] = nSize; + if (nSize >= 1) vch[4] = (nCompact >> 16) & 0xff; + if (nSize >= 2) vch[5] = (nCompact >> 8) & 0xff; + if (nSize >= 3) vch[6] = (nCompact >> 0) & 0xff; + BN_mpi2bn(&vch[0], vch.size(), self); + return *this; + } + + unsigned int GetCompact() const + { + unsigned int nSize = BN_bn2mpi(self, NULL); + std::vector vch(nSize); + nSize -= 4; + BN_bn2mpi(self, &vch[0]); + unsigned int nCompact = nSize << 24; + if (nSize >= 1) nCompact |= (vch[4] << 16); + if (nSize >= 2) nCompact |= (vch[5] << 8); + if (nSize >= 3) nCompact |= 
(vch[6] << 0); + return nCompact; + } + + void SetHex(const std::string& str) + { + // skip 0x + const char* psz = str.c_str(); + while (isspace(*psz)) + psz++; + bool fNegative = false; + if (*psz == '-') + { + fNegative = true; + psz++; + } + if (psz[0] == '0' && tolower(psz[1]) == 'x') + psz += 2; + while (isspace(*psz)) + psz++; + + // hex string to bignum + static const signed char phexdigit[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0 }; + *this = 0; + while (isxdigit(*psz)) + { + *this <<= 4; + int n = phexdigit[(unsigned char)*psz++]; + *this += n; + } + if (fNegative) + *this = 0 - *this; + } + + std::string ToString(int nBase=10) const + { + CAutoBN_CTX pctx; + CBigNum bnBase = nBase; + CBigNum bn0 = 0; + std::string str; + CBigNum bn = *this; + BN_set_negative(bn.get(), false); + CBigNum dv; + CBigNum rem; + if (BN_cmp(bn.getc(), bn0.getc()) == 0) + return "0"; + while (BN_cmp(bn.getc(), bn0.getc()) > 0) + { + if (!BN_div(dv.get(), rem.get(), bn.getc(), bnBase.getc(), pctx)) + throw bignum_error("CBigNum::ToString() : BN_div failed"); + bn = dv; + unsigned int c = rem.getulong(); + str += "0123456789abcdef"[c]; + } + if (BN_is_negative(self)) + str += "-"; + reverse(str.begin(), str.end()); + return str; + } + + std::string GetHex() const + { + return ToString(16); + } + + unsigned int GetSerializeSize(int nType=0, int nVersion=PROTOCOL_VERSION) const + { + return ::GetSerializeSize(getvch(), nType, nVersion); + } + + template + void Serialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) const + { + ::Serialize(s, getvch(), nType, nVersion); + } + + template + void Unserialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) + { + std::vector vch; + ::Unserialize(s, vch, nType, nVersion); + setvch(vch); + } + + + bool operator!() const + { + return BN_is_zero(self); + } + + CBigNum& operator+=(const CBigNum& b) + { + if (!BN_add(self, self, b.getc())) + throw bignum_error("CBigNum::operator+= : BN_add failed"); + return *this; + } + + CBigNum& operator-=(const CBigNum& b) + { + *this = *this - b; + return *this; + } + + CBigNum& operator*=(const CBigNum& b) + { + CAutoBN_CTX pctx; + if (!BN_mul(self, self, b.getc(), pctx)) + throw bignum_error("CBigNum::operator*= : BN_mul failed"); + return *this; + } + + CBigNum& operator/=(const CBigNum& b) + { + *this = *this / b; + return *this; + } + + CBigNum& operator%=(const CBigNum& b) + { + *this = *this % b; + return *this; + } + + CBigNum& operator<<=(unsigned int shift) + { + if (!BN_lshift(self, self, shift)) + throw bignum_error("CBigNum:operator<<= : BN_lshift failed"); + return *this; + } + + CBigNum& operator>>=(unsigned int shift) + { + // Note: BN_rshift segfaults on 64-bit if 2^shift is greater than the number + // if built on ubuntu 9.04 or 9.10, probably depends on version of OpenSSL + CBigNum a = 1; + a <<= shift; + if (BN_cmp(a.getc(), self) > 0) + { + *this = 0; + return *this; + } + + if (!BN_rshift(self, self, shift)) + throw bignum_error("CBigNum:operator>>= : BN_rshift failed"); + return *this; + } + + + CBigNum& operator++() + { + // prefix operator + if (!BN_add(self, self, BN_value_one())) + throw bignum_error("CBigNum::operator++ : BN_add failed"); + return *this; + } + + const CBigNum operator++(int) + { + // postfix operator + const CBigNum ret = *this; + ++(*this); + return ret; 
+ } + + CBigNum& operator--() + { + // prefix operator + CBigNum r; + if (!BN_sub(r.get(), self, BN_value_one())) + throw bignum_error("CBigNum::operator-- : BN_sub failed"); + *this = r; + return *this; + } + + const CBigNum operator--(int) + { + // postfix operator + const CBigNum ret = *this; + --(*this); + return ret; + } + + + friend inline const CBigNum operator-(const CBigNum& a, const CBigNum& b); + friend inline const CBigNum operator/(const CBigNum& a, const CBigNum& b); + friend inline const CBigNum operator%(const CBigNum& a, const CBigNum& b); +}; + + + +inline const CBigNum operator+(const CBigNum& a, const CBigNum& b) +{ + CBigNum r; + if (!BN_add(r.get(), a.getc(), b.getc())) + throw bignum_error("CBigNum::operator+ : BN_add failed"); + return r; +} + +inline const CBigNum operator-(const CBigNum& a, const CBigNum& b) +{ + CBigNum r; + if (!BN_sub(r.get(), a.getc(), b.getc())) + throw bignum_error("CBigNum::operator- : BN_sub failed"); + return r; +} + +inline const CBigNum operator-(const CBigNum& a) +{ + CBigNum r(a); + BN_set_negative(r.get(), !BN_is_negative(r.getc())); + return r; +} + +inline const CBigNum operator*(const CBigNum& a, const CBigNum& b) +{ + CAutoBN_CTX pctx; + CBigNum r; + if (!BN_mul(r.get(), a.getc(), b.getc(), pctx)) + throw bignum_error("CBigNum::operator* : BN_mul failed"); + return r; +} + +inline const CBigNum operator/(const CBigNum& a, const CBigNum& b) +{ + CAutoBN_CTX pctx; + CBigNum r; + if (!BN_div(r.get(), NULL, a.getc(), b.getc(), pctx)) + throw bignum_error("CBigNum::operator/ : BN_div failed"); + return r; +} + +inline const CBigNum operator%(const CBigNum& a, const CBigNum& b) +{ + CAutoBN_CTX pctx; + CBigNum r; + if (!BN_mod(r.get(), a.getc(), b.getc(), pctx)) + throw bignum_error("CBigNum::operator% : BN_div failed"); + return r; +} + +inline const CBigNum operator<<(const CBigNum& a, unsigned int shift) +{ + CBigNum r; + if (!BN_lshift(r.get(), a.getc(), shift)) + throw bignum_error("CBigNum:operator<< : BN_lshift failed"); + return r; +} + +inline const CBigNum operator>>(const CBigNum& a, unsigned int shift) +{ + CBigNum r = a; + r >>= shift; + return r; +} + +inline bool operator==(const CBigNum& a, const CBigNum& b) { return (BN_cmp(a.getc(), b.getc()) == 0); } +inline bool operator!=(const CBigNum& a, const CBigNum& b) { return (BN_cmp(a.getc(), b.getc()) != 0); } +inline bool operator<=(const CBigNum& a, const CBigNum& b) { return (BN_cmp(a.getc(), b.getc()) <= 0); } +inline bool operator>=(const CBigNum& a, const CBigNum& b) { return (BN_cmp(a.getc(), b.getc()) >= 0); } +inline bool operator<(const CBigNum& a, const CBigNum& b) { return (BN_cmp(a.getc(), b.getc()) < 0); } +inline bool operator>(const CBigNum& a, const CBigNum& b) { return (BN_cmp(a.getc(), b.getc()) > 0); } + +#endif diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu deleted file mode 100644 index f4a5f2a325..0000000000 --- a/bitslice_transformations_quad.cu +++ /dev/null @@ -1,418 +0,0 @@ - -__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output) -{ - int n = threadIdx.x % 4; - uint32_t other[8]; -#pragma unroll 8 - for (int i = 0; i < 8; i++) { - input[i] =__shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4); - other[i] =__shfl((int)input[i], (threadIdx.x + 1) % 4, 4); - input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); - other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); - if (threadIdx.x & 1) { - input[i] = __byte_perm(input[i], 0, 0x1032); - other[i] = __byte_perm(other[i], 0, 0x1032); - } - 
output[i] = 0; - } - - output[ 0] |= (input[ 0] & 0x00000001); - output[ 0] |= ((other[ 0] & 0x00000001) << 1); - output[ 0] |= ((input[ 1] & 0x00000001) << 2); - output[ 0] |= ((other[ 1] & 0x00000001) << 3); - output[ 0] |= ((input[ 2] & 0x00000001) << 4); - output[ 0] |= ((other[ 2] & 0x00000001) << 5); - output[ 0] |= ((input[ 3] & 0x00000001) << 6); - output[ 0] |= ((other[ 3] & 0x00000001) << 7); - output[ 0] |= ((input[ 4] & 0x00000001) << 8); - output[ 0] |= ((other[ 4] & 0x00000001) << 9); - output[ 0] |= ((input[ 5] & 0x00000001) <<10); - output[ 0] |= ((other[ 5] & 0x00000001) <<11); - output[ 0] |= ((input[ 6] & 0x00000001) <<12); - output[ 0] |= ((other[ 6] & 0x00000001) <<13); - output[ 0] |= ((input[ 7] & 0x00000001) <<14); - output[ 0] |= ((other[ 7] & 0x00000001) <<15); - output[ 0] |= ((input[ 0] & 0x00000100) << 8); - output[ 0] |= ((other[ 0] & 0x00000100) << 9); - output[ 0] |= ((input[ 1] & 0x00000100) <<10); - output[ 0] |= ((other[ 1] & 0x00000100) <<11); - output[ 0] |= ((input[ 2] & 0x00000100) <<12); - output[ 0] |= ((other[ 2] & 0x00000100) <<13); - output[ 0] |= ((input[ 3] & 0x00000100) <<14); - output[ 0] |= ((other[ 3] & 0x00000100) <<15); - output[ 0] |= ((input[ 4] & 0x00000100) <<16); - output[ 0] |= ((other[ 4] & 0x00000100) <<17); - output[ 0] |= ((input[ 5] & 0x00000100) <<18); - output[ 0] |= ((other[ 5] & 0x00000100) <<19); - output[ 0] |= ((input[ 6] & 0x00000100) <<20); - output[ 0] |= ((other[ 6] & 0x00000100) <<21); - output[ 0] |= ((input[ 7] & 0x00000100) <<22); - output[ 0] |= ((other[ 7] & 0x00000100) <<23); - output[ 1] |= ((input[ 0] & 0x00000002) >> 1); - output[ 1] |= (other[ 0] & 0x00000002); - output[ 1] |= ((input[ 1] & 0x00000002) << 1); - output[ 1] |= ((other[ 1] & 0x00000002) << 2); - output[ 1] |= ((input[ 2] & 0x00000002) << 3); - output[ 1] |= ((other[ 2] & 0x00000002) << 4); - output[ 1] |= ((input[ 3] & 0x00000002) << 5); - output[ 1] |= ((other[ 3] & 0x00000002) << 6); - output[ 1] |= ((input[ 4] & 0x00000002) << 7); - output[ 1] |= ((other[ 4] & 0x00000002) << 8); - output[ 1] |= ((input[ 5] & 0x00000002) << 9); - output[ 1] |= ((other[ 5] & 0x00000002) <<10); - output[ 1] |= ((input[ 6] & 0x00000002) <<11); - output[ 1] |= ((other[ 6] & 0x00000002) <<12); - output[ 1] |= ((input[ 7] & 0x00000002) <<13); - output[ 1] |= ((other[ 7] & 0x00000002) <<14); - output[ 1] |= ((input[ 0] & 0x00000200) << 7); - output[ 1] |= ((other[ 0] & 0x00000200) << 8); - output[ 1] |= ((input[ 1] & 0x00000200) << 9); - output[ 1] |= ((other[ 1] & 0x00000200) <<10); - output[ 1] |= ((input[ 2] & 0x00000200) <<11); - output[ 1] |= ((other[ 2] & 0x00000200) <<12); - output[ 1] |= ((input[ 3] & 0x00000200) <<13); - output[ 1] |= ((other[ 3] & 0x00000200) <<14); - output[ 1] |= ((input[ 4] & 0x00000200) <<15); - output[ 1] |= ((other[ 4] & 0x00000200) <<16); - output[ 1] |= ((input[ 5] & 0x00000200) <<17); - output[ 1] |= ((other[ 5] & 0x00000200) <<18); - output[ 1] |= ((input[ 6] & 0x00000200) <<19); - output[ 1] |= ((other[ 6] & 0x00000200) <<20); - output[ 1] |= ((input[ 7] & 0x00000200) <<21); - output[ 1] |= ((other[ 7] & 0x00000200) <<22); - output[ 2] |= ((input[ 0] & 0x00000004) >> 2); - output[ 2] |= ((other[ 0] & 0x00000004) >> 1); - output[ 2] |= (input[ 1] & 0x00000004); - output[ 2] |= ((other[ 1] & 0x00000004) << 1); - output[ 2] |= ((input[ 2] & 0x00000004) << 2); - output[ 2] |= ((other[ 2] & 0x00000004) << 3); - output[ 2] |= ((input[ 3] & 0x00000004) << 4); - output[ 2] |= ((other[ 3] & 0x00000004) << 5); - output[ 2] |= ((input[ 
4] & 0x00000004) << 6); - output[ 2] |= ((other[ 4] & 0x00000004) << 7); - output[ 2] |= ((input[ 5] & 0x00000004) << 8); - output[ 2] |= ((other[ 5] & 0x00000004) << 9); - output[ 2] |= ((input[ 6] & 0x00000004) <<10); - output[ 2] |= ((other[ 6] & 0x00000004) <<11); - output[ 2] |= ((input[ 7] & 0x00000004) <<12); - output[ 2] |= ((other[ 7] & 0x00000004) <<13); - output[ 2] |= ((input[ 0] & 0x00000400) << 6); - output[ 2] |= ((other[ 0] & 0x00000400) << 7); - output[ 2] |= ((input[ 1] & 0x00000400) << 8); - output[ 2] |= ((other[ 1] & 0x00000400) << 9); - output[ 2] |= ((input[ 2] & 0x00000400) <<10); - output[ 2] |= ((other[ 2] & 0x00000400) <<11); - output[ 2] |= ((input[ 3] & 0x00000400) <<12); - output[ 2] |= ((other[ 3] & 0x00000400) <<13); - output[ 2] |= ((input[ 4] & 0x00000400) <<14); - output[ 2] |= ((other[ 4] & 0x00000400) <<15); - output[ 2] |= ((input[ 5] & 0x00000400) <<16); - output[ 2] |= ((other[ 5] & 0x00000400) <<17); - output[ 2] |= ((input[ 6] & 0x00000400) <<18); - output[ 2] |= ((other[ 6] & 0x00000400) <<19); - output[ 2] |= ((input[ 7] & 0x00000400) <<20); - output[ 2] |= ((other[ 7] & 0x00000400) <<21); - output[ 3] |= ((input[ 0] & 0x00000008) >> 3); - output[ 3] |= ((other[ 0] & 0x00000008) >> 2); - output[ 3] |= ((input[ 1] & 0x00000008) >> 1); - output[ 3] |= (other[ 1] & 0x00000008); - output[ 3] |= ((input[ 2] & 0x00000008) << 1); - output[ 3] |= ((other[ 2] & 0x00000008) << 2); - output[ 3] |= ((input[ 3] & 0x00000008) << 3); - output[ 3] |= ((other[ 3] & 0x00000008) << 4); - output[ 3] |= ((input[ 4] & 0x00000008) << 5); - output[ 3] |= ((other[ 4] & 0x00000008) << 6); - output[ 3] |= ((input[ 5] & 0x00000008) << 7); - output[ 3] |= ((other[ 5] & 0x00000008) << 8); - output[ 3] |= ((input[ 6] & 0x00000008) << 9); - output[ 3] |= ((other[ 6] & 0x00000008) <<10); - output[ 3] |= ((input[ 7] & 0x00000008) <<11); - output[ 3] |= ((other[ 7] & 0x00000008) <<12); - output[ 3] |= ((input[ 0] & 0x00000800) << 5); - output[ 3] |= ((other[ 0] & 0x00000800) << 6); - output[ 3] |= ((input[ 1] & 0x00000800) << 7); - output[ 3] |= ((other[ 1] & 0x00000800) << 8); - output[ 3] |= ((input[ 2] & 0x00000800) << 9); - output[ 3] |= ((other[ 2] & 0x00000800) <<10); - output[ 3] |= ((input[ 3] & 0x00000800) <<11); - output[ 3] |= ((other[ 3] & 0x00000800) <<12); - output[ 3] |= ((input[ 4] & 0x00000800) <<13); - output[ 3] |= ((other[ 4] & 0x00000800) <<14); - output[ 3] |= ((input[ 5] & 0x00000800) <<15); - output[ 3] |= ((other[ 5] & 0x00000800) <<16); - output[ 3] |= ((input[ 6] & 0x00000800) <<17); - output[ 3] |= ((other[ 6] & 0x00000800) <<18); - output[ 3] |= ((input[ 7] & 0x00000800) <<19); - output[ 3] |= ((other[ 7] & 0x00000800) <<20); - output[ 4] |= ((input[ 0] & 0x00000010) >> 4); - output[ 4] |= ((other[ 0] & 0x00000010) >> 3); - output[ 4] |= ((input[ 1] & 0x00000010) >> 2); - output[ 4] |= ((other[ 1] & 0x00000010) >> 1); - output[ 4] |= (input[ 2] & 0x00000010); - output[ 4] |= ((other[ 2] & 0x00000010) << 1); - output[ 4] |= ((input[ 3] & 0x00000010) << 2); - output[ 4] |= ((other[ 3] & 0x00000010) << 3); - output[ 4] |= ((input[ 4] & 0x00000010) << 4); - output[ 4] |= ((other[ 4] & 0x00000010) << 5); - output[ 4] |= ((input[ 5] & 0x00000010) << 6); - output[ 4] |= ((other[ 5] & 0x00000010) << 7); - output[ 4] |= ((input[ 6] & 0x00000010) << 8); - output[ 4] |= ((other[ 6] & 0x00000010) << 9); - output[ 4] |= ((input[ 7] & 0x00000010) <<10); - output[ 4] |= ((other[ 7] & 0x00000010) <<11); - output[ 4] |= ((input[ 0] & 0x00001000) << 4); - output[ 4] |= 
((other[ 0] & 0x00001000) << 5); - output[ 4] |= ((input[ 1] & 0x00001000) << 6); - output[ 4] |= ((other[ 1] & 0x00001000) << 7); - output[ 4] |= ((input[ 2] & 0x00001000) << 8); - output[ 4] |= ((other[ 2] & 0x00001000) << 9); - output[ 4] |= ((input[ 3] & 0x00001000) <<10); - output[ 4] |= ((other[ 3] & 0x00001000) <<11); - output[ 4] |= ((input[ 4] & 0x00001000) <<12); - output[ 4] |= ((other[ 4] & 0x00001000) <<13); - output[ 4] |= ((input[ 5] & 0x00001000) <<14); - output[ 4] |= ((other[ 5] & 0x00001000) <<15); - output[ 4] |= ((input[ 6] & 0x00001000) <<16); - output[ 4] |= ((other[ 6] & 0x00001000) <<17); - output[ 4] |= ((input[ 7] & 0x00001000) <<18); - output[ 4] |= ((other[ 7] & 0x00001000) <<19); - output[ 5] |= ((input[ 0] & 0x00000020) >> 5); - output[ 5] |= ((other[ 0] & 0x00000020) >> 4); - output[ 5] |= ((input[ 1] & 0x00000020) >> 3); - output[ 5] |= ((other[ 1] & 0x00000020) >> 2); - output[ 5] |= ((input[ 2] & 0x00000020) >> 1); - output[ 5] |= (other[ 2] & 0x00000020); - output[ 5] |= ((input[ 3] & 0x00000020) << 1); - output[ 5] |= ((other[ 3] & 0x00000020) << 2); - output[ 5] |= ((input[ 4] & 0x00000020) << 3); - output[ 5] |= ((other[ 4] & 0x00000020) << 4); - output[ 5] |= ((input[ 5] & 0x00000020) << 5); - output[ 5] |= ((other[ 5] & 0x00000020) << 6); - output[ 5] |= ((input[ 6] & 0x00000020) << 7); - output[ 5] |= ((other[ 6] & 0x00000020) << 8); - output[ 5] |= ((input[ 7] & 0x00000020) << 9); - output[ 5] |= ((other[ 7] & 0x00000020) <<10); - output[ 5] |= ((input[ 0] & 0x00002000) << 3); - output[ 5] |= ((other[ 0] & 0x00002000) << 4); - output[ 5] |= ((input[ 1] & 0x00002000) << 5); - output[ 5] |= ((other[ 1] & 0x00002000) << 6); - output[ 5] |= ((input[ 2] & 0x00002000) << 7); - output[ 5] |= ((other[ 2] & 0x00002000) << 8); - output[ 5] |= ((input[ 3] & 0x00002000) << 9); - output[ 5] |= ((other[ 3] & 0x00002000) <<10); - output[ 5] |= ((input[ 4] & 0x00002000) <<11); - output[ 5] |= ((other[ 4] & 0x00002000) <<12); - output[ 5] |= ((input[ 5] & 0x00002000) <<13); - output[ 5] |= ((other[ 5] & 0x00002000) <<14); - output[ 5] |= ((input[ 6] & 0x00002000) <<15); - output[ 5] |= ((other[ 6] & 0x00002000) <<16); - output[ 5] |= ((input[ 7] & 0x00002000) <<17); - output[ 5] |= ((other[ 7] & 0x00002000) <<18); - output[ 6] |= ((input[ 0] & 0x00000040) >> 6); - output[ 6] |= ((other[ 0] & 0x00000040) >> 5); - output[ 6] |= ((input[ 1] & 0x00000040) >> 4); - output[ 6] |= ((other[ 1] & 0x00000040) >> 3); - output[ 6] |= ((input[ 2] & 0x00000040) >> 2); - output[ 6] |= ((other[ 2] & 0x00000040) >> 1); - output[ 6] |= (input[ 3] & 0x00000040); - output[ 6] |= ((other[ 3] & 0x00000040) << 1); - output[ 6] |= ((input[ 4] & 0x00000040) << 2); - output[ 6] |= ((other[ 4] & 0x00000040) << 3); - output[ 6] |= ((input[ 5] & 0x00000040) << 4); - output[ 6] |= ((other[ 5] & 0x00000040) << 5); - output[ 6] |= ((input[ 6] & 0x00000040) << 6); - output[ 6] |= ((other[ 6] & 0x00000040) << 7); - output[ 6] |= ((input[ 7] & 0x00000040) << 8); - output[ 6] |= ((other[ 7] & 0x00000040) << 9); - output[ 6] |= ((input[ 0] & 0x00004000) << 2); - output[ 6] |= ((other[ 0] & 0x00004000) << 3); - output[ 6] |= ((input[ 1] & 0x00004000) << 4); - output[ 6] |= ((other[ 1] & 0x00004000) << 5); - output[ 6] |= ((input[ 2] & 0x00004000) << 6); - output[ 6] |= ((other[ 2] & 0x00004000) << 7); - output[ 6] |= ((input[ 3] & 0x00004000) << 8); - output[ 6] |= ((other[ 3] & 0x00004000) << 9); - output[ 6] |= ((input[ 4] & 0x00004000) <<10); - output[ 6] |= ((other[ 4] & 0x00004000) <<11); - 
output[ 6] |= ((input[ 5] & 0x00004000) <<12); - output[ 6] |= ((other[ 5] & 0x00004000) <<13); - output[ 6] |= ((input[ 6] & 0x00004000) <<14); - output[ 6] |= ((other[ 6] & 0x00004000) <<15); - output[ 6] |= ((input[ 7] & 0x00004000) <<16); - output[ 6] |= ((other[ 7] & 0x00004000) <<17); - output[ 7] |= ((input[ 0] & 0x00000080) >> 7); - output[ 7] |= ((other[ 0] & 0x00000080) >> 6); - output[ 7] |= ((input[ 1] & 0x00000080) >> 5); - output[ 7] |= ((other[ 1] & 0x00000080) >> 4); - output[ 7] |= ((input[ 2] & 0x00000080) >> 3); - output[ 7] |= ((other[ 2] & 0x00000080) >> 2); - output[ 7] |= ((input[ 3] & 0x00000080) >> 1); - output[ 7] |= (other[ 3] & 0x00000080); - output[ 7] |= ((input[ 4] & 0x00000080) << 1); - output[ 7] |= ((other[ 4] & 0x00000080) << 2); - output[ 7] |= ((input[ 5] & 0x00000080) << 3); - output[ 7] |= ((other[ 5] & 0x00000080) << 4); - output[ 7] |= ((input[ 6] & 0x00000080) << 5); - output[ 7] |= ((other[ 6] & 0x00000080) << 6); - output[ 7] |= ((input[ 7] & 0x00000080) << 7); - output[ 7] |= ((other[ 7] & 0x00000080) << 8); - output[ 7] |= ((input[ 0] & 0x00008000) << 1); - output[ 7] |= ((other[ 0] & 0x00008000) << 2); - output[ 7] |= ((input[ 1] & 0x00008000) << 3); - output[ 7] |= ((other[ 1] & 0x00008000) << 4); - output[ 7] |= ((input[ 2] & 0x00008000) << 5); - output[ 7] |= ((other[ 2] & 0x00008000) << 6); - output[ 7] |= ((input[ 3] & 0x00008000) << 7); - output[ 7] |= ((other[ 3] & 0x00008000) << 8); - output[ 7] |= ((input[ 4] & 0x00008000) << 9); - output[ 7] |= ((other[ 4] & 0x00008000) <<10); - output[ 7] |= ((input[ 5] & 0x00008000) <<11); - output[ 7] |= ((other[ 5] & 0x00008000) <<12); - output[ 7] |= ((input[ 6] & 0x00008000) <<13); - output[ 7] |= ((other[ 6] & 0x00008000) <<14); - output[ 7] |= ((input[ 7] & 0x00008000) <<15); - output[ 7] |= ((other[ 7] & 0x00008000) <<16); -} - -__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output) -{ -#pragma unroll 8 - for (int i=0; i < 16; i+=2) output[i] = 0; - - output[ 0] |= ((input[ 0] & 0x00000100) >> 8); - output[ 0] |= ((input[ 1] & 0x00000100) >> 7); - output[ 0] |= ((input[ 2] & 0x00000100) >> 6); - output[ 0] |= ((input[ 3] & 0x00000100) >> 5); - output[ 0] |= ((input[ 4] & 0x00000100) >> 4); - output[ 0] |= ((input[ 5] & 0x00000100) >> 3); - output[ 0] |= ((input[ 6] & 0x00000100) >> 2); - output[ 0] |= ((input[ 7] & 0x00000100) >> 1); - output[ 0] |= ((input[ 0] & 0x01000000) >>16); - output[ 0] |= ((input[ 1] & 0x01000000) >>15); - output[ 0] |= ((input[ 2] & 0x01000000) >>14); - output[ 0] |= ((input[ 3] & 0x01000000) >>13); - output[ 0] |= ((input[ 4] & 0x01000000) >>12); - output[ 0] |= ((input[ 5] & 0x01000000) >>11); - output[ 0] |= ((input[ 6] & 0x01000000) >>10); - output[ 0] |= ((input[ 7] & 0x01000000) >> 9); - output[ 2] |= ((input[ 0] & 0x00000200) >> 9); - output[ 2] |= ((input[ 1] & 0x00000200) >> 8); - output[ 2] |= ((input[ 2] & 0x00000200) >> 7); - output[ 2] |= ((input[ 3] & 0x00000200) >> 6); - output[ 2] |= ((input[ 4] & 0x00000200) >> 5); - output[ 2] |= ((input[ 5] & 0x00000200) >> 4); - output[ 2] |= ((input[ 6] & 0x00000200) >> 3); - output[ 2] |= ((input[ 7] & 0x00000200) >> 2); - output[ 2] |= ((input[ 0] & 0x02000000) >>17); - output[ 2] |= ((input[ 1] & 0x02000000) >>16); - output[ 2] |= ((input[ 2] & 0x02000000) >>15); - output[ 2] |= ((input[ 3] & 0x02000000) >>14); - output[ 2] |= ((input[ 4] & 0x02000000) >>13); - output[ 2] |= ((input[ 5] & 0x02000000) >>12); - output[ 2] |= ((input[ 6] & 0x02000000) >>11); - output[ 2] |= 
((input[ 7] & 0x02000000) >>10); - output[ 4] |= ((input[ 0] & 0x00000400) >>10); - output[ 4] |= ((input[ 1] & 0x00000400) >> 9); - output[ 4] |= ((input[ 2] & 0x00000400) >> 8); - output[ 4] |= ((input[ 3] & 0x00000400) >> 7); - output[ 4] |= ((input[ 4] & 0x00000400) >> 6); - output[ 4] |= ((input[ 5] & 0x00000400) >> 5); - output[ 4] |= ((input[ 6] & 0x00000400) >> 4); - output[ 4] |= ((input[ 7] & 0x00000400) >> 3); - output[ 4] |= ((input[ 0] & 0x04000000) >>18); - output[ 4] |= ((input[ 1] & 0x04000000) >>17); - output[ 4] |= ((input[ 2] & 0x04000000) >>16); - output[ 4] |= ((input[ 3] & 0x04000000) >>15); - output[ 4] |= ((input[ 4] & 0x04000000) >>14); - output[ 4] |= ((input[ 5] & 0x04000000) >>13); - output[ 4] |= ((input[ 6] & 0x04000000) >>12); - output[ 4] |= ((input[ 7] & 0x04000000) >>11); - output[ 6] |= ((input[ 0] & 0x00000800) >>11); - output[ 6] |= ((input[ 1] & 0x00000800) >>10); - output[ 6] |= ((input[ 2] & 0x00000800) >> 9); - output[ 6] |= ((input[ 3] & 0x00000800) >> 8); - output[ 6] |= ((input[ 4] & 0x00000800) >> 7); - output[ 6] |= ((input[ 5] & 0x00000800) >> 6); - output[ 6] |= ((input[ 6] & 0x00000800) >> 5); - output[ 6] |= ((input[ 7] & 0x00000800) >> 4); - output[ 6] |= ((input[ 0] & 0x08000000) >>19); - output[ 6] |= ((input[ 1] & 0x08000000) >>18); - output[ 6] |= ((input[ 2] & 0x08000000) >>17); - output[ 6] |= ((input[ 3] & 0x08000000) >>16); - output[ 6] |= ((input[ 4] & 0x08000000) >>15); - output[ 6] |= ((input[ 5] & 0x08000000) >>14); - output[ 6] |= ((input[ 6] & 0x08000000) >>13); - output[ 6] |= ((input[ 7] & 0x08000000) >>12); - output[ 8] |= ((input[ 0] & 0x00001000) >>12); - output[ 8] |= ((input[ 1] & 0x00001000) >>11); - output[ 8] |= ((input[ 2] & 0x00001000) >>10); - output[ 8] |= ((input[ 3] & 0x00001000) >> 9); - output[ 8] |= ((input[ 4] & 0x00001000) >> 8); - output[ 8] |= ((input[ 5] & 0x00001000) >> 7); - output[ 8] |= ((input[ 6] & 0x00001000) >> 6); - output[ 8] |= ((input[ 7] & 0x00001000) >> 5); - output[ 8] |= ((input[ 0] & 0x10000000) >>20); - output[ 8] |= ((input[ 1] & 0x10000000) >>19); - output[ 8] |= ((input[ 2] & 0x10000000) >>18); - output[ 8] |= ((input[ 3] & 0x10000000) >>17); - output[ 8] |= ((input[ 4] & 0x10000000) >>16); - output[ 8] |= ((input[ 5] & 0x10000000) >>15); - output[ 8] |= ((input[ 6] & 0x10000000) >>14); - output[ 8] |= ((input[ 7] & 0x10000000) >>13); - output[10] |= ((input[ 0] & 0x00002000) >>13); - output[10] |= ((input[ 1] & 0x00002000) >>12); - output[10] |= ((input[ 2] & 0x00002000) >>11); - output[10] |= ((input[ 3] & 0x00002000) >>10); - output[10] |= ((input[ 4] & 0x00002000) >> 9); - output[10] |= ((input[ 5] & 0x00002000) >> 8); - output[10] |= ((input[ 6] & 0x00002000) >> 7); - output[10] |= ((input[ 7] & 0x00002000) >> 6); - output[10] |= ((input[ 0] & 0x20000000) >>21); - output[10] |= ((input[ 1] & 0x20000000) >>20); - output[10] |= ((input[ 2] & 0x20000000) >>19); - output[10] |= ((input[ 3] & 0x20000000) >>18); - output[10] |= ((input[ 4] & 0x20000000) >>17); - output[10] |= ((input[ 5] & 0x20000000) >>16); - output[10] |= ((input[ 6] & 0x20000000) >>15); - output[10] |= ((input[ 7] & 0x20000000) >>14); - output[12] |= ((input[ 0] & 0x00004000) >>14); - output[12] |= ((input[ 1] & 0x00004000) >>13); - output[12] |= ((input[ 2] & 0x00004000) >>12); - output[12] |= ((input[ 3] & 0x00004000) >>11); - output[12] |= ((input[ 4] & 0x00004000) >>10); - output[12] |= ((input[ 5] & 0x00004000) >> 9); - output[12] |= ((input[ 6] & 0x00004000) >> 8); - output[12] |= ((input[ 7] & 0x00004000) 
>> 7); - output[12] |= ((input[ 0] & 0x40000000) >>22); - output[12] |= ((input[ 1] & 0x40000000) >>21); - output[12] |= ((input[ 2] & 0x40000000) >>20); - output[12] |= ((input[ 3] & 0x40000000) >>19); - output[12] |= ((input[ 4] & 0x40000000) >>18); - output[12] |= ((input[ 5] & 0x40000000) >>17); - output[12] |= ((input[ 6] & 0x40000000) >>16); - output[12] |= ((input[ 7] & 0x40000000) >>15); - output[14] |= ((input[ 0] & 0x00008000) >>15); - output[14] |= ((input[ 1] & 0x00008000) >>14); - output[14] |= ((input[ 2] & 0x00008000) >>13); - output[14] |= ((input[ 3] & 0x00008000) >>12); - output[14] |= ((input[ 4] & 0x00008000) >>11); - output[14] |= ((input[ 5] & 0x00008000) >>10); - output[14] |= ((input[ 6] & 0x00008000) >> 9); - output[14] |= ((input[ 7] & 0x00008000) >> 8); - output[14] |= ((input[ 0] & 0x80000000) >>23); - output[14] |= ((input[ 1] & 0x80000000) >>22); - output[14] |= ((input[ 2] & 0x80000000) >>21); - output[14] |= ((input[ 3] & 0x80000000) >>20); - output[14] |= ((input[ 4] & 0x80000000) >>19); - output[14] |= ((input[ 5] & 0x80000000) >>18); - output[14] |= ((input[ 6] & 0x80000000) >>17); - output[14] |= ((input[ 7] & 0x80000000) >>16); - -#pragma unroll 8 - for (int i = 0; i < 16; i+=2) { - if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032); - output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)%4, 4), 0x7610); - output[i+1] = __shfl((int)output[i], (threadIdx.x+2)%4, 4); - if ((threadIdx.x % 4) != 0) output[i] = output[i+1] = 0; - } -} diff --git a/blake2b.cu b/blake2b.cu new file mode 100644 index 0000000000..2be74f8185 --- /dev/null +++ b/blake2b.cu @@ -0,0 +1,273 @@ +/** + * Blake2-B CUDA Implementation + * + * tpruvot@github July 2016 + * + */ + +#include + +#include +#include + +#include + +#include +#include + +#define TPB 512 +#define NBN 2 + +static uint32_t *d_resNonces[MAX_GPUS]; + +__device__ uint64_t d_data[10]; + +static __constant__ const int8_t blake2b_sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } , + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +}; + +// host mem align +#define A 64 + +extern "C" void blake2b_hash(void *output, const void *input) +{ + uint8_t _ALIGN(A) hash[32]; + blake2b_ctx ctx; + + blake2b_init(&ctx, 32, NULL, 0); + blake2b_update(&ctx, input, 80); + blake2b_final(&ctx, hash); + + memcpy(output, hash, 32); +} + +// ---------------------------------------------------------------- + +__device__ __forceinline__ +static void G(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d, uint64_t const m[16]) +{ + a = a + b + m[ blake2b_sigma[r][2*i] ]; + ((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] ); + c = c + d; + ((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] ); + a = a + b + m[ blake2b_sigma[r][2*i+1] ]; + ((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] ); + c = c + d; + ((uint2*)&b)[0] = 
ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +#define ROUND(r) \ + G(r, 0, v[0], v[4], v[ 8], v[12], m); \ + G(r, 1, v[1], v[5], v[ 9], v[13], m); \ + G(r, 2, v[2], v[6], v[10], v[14], m); \ + G(r, 3, v[3], v[7], v[11], v[15], m); \ + G(r, 4, v[0], v[5], v[10], v[15], m); \ + G(r, 5, v[1], v[6], v[11], v[12], m); \ + G(r, 6, v[2], v[7], v[ 8], v[13], m); \ + G(r, 7, v[3], v[4], v[ 9], v[14], m); + +__global__ +//__launch_bounds__(128, 8) /* to force 64 regs */ +void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2) +{ + const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce; + + uint64_t m[16]; + + m[0] = d_data[0]; + m[1] = d_data[1]; + m[2] = d_data[2]; + m[3] = d_data[3]; + m[4] = d_data[4]; + m[5] = d_data[5]; + m[6] = d_data[6]; + m[7] = d_data[7]; + m[8] = d_data[8]; + ((uint32_t*)m)[18] = AS_U32(&d_data[9]); + ((uint32_t*)m)[19] = nonce; + + m[10] = m[11] = 0; + m[12] = m[13] = 0; + m[14] = m[15] = 0; + + uint64_t v[16] = { + 0x6a09e667f2bdc928, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade68281, 0x9b05688c2b3e6c1f, 0xe07c265404be4294, 0x5be0cd19137e2179 + }; + + ROUND( 0); + ROUND( 1); + ROUND( 2); + ROUND( 3); + ROUND( 4); + ROUND( 5); + ROUND( 6); + ROUND( 7); + ROUND( 8); + ROUND( 9); + ROUND(10); + ROUND(11); + + uint2 last = vectorize(v[3] ^ v[11] ^ 0xa54ff53a5f1d36f1); + if (last.y <= target2.y && last.x <= target2.x) { + resNonce[1] = resNonce[0]; + resNonce[0] = nonce; + } +} + +__host__ +uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce) +{ + uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX }; + uint32_t result = UINT32_MAX; + + dim3 grid((threads + TPB-1)/TPB); + dim3 block(TPB); + + /* Check error on Ctrl+C or kill to prevent segfaults on exit */ + if (cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) + return result; + + blake2b_gpu_hash <<>> (threads, startNonce, d_resNonces[thr_id], target2); + cudaThreadSynchronize(); + + if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + result = resNonces[0]; + secNonce = resNonces[1]; + if (secNonce == result) secNonce = UINT32_MAX; + } + return result; +} + +__host__ +void blake2b_setBlock(uint32_t *data) +{ + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice)); +} + +static bool init[MAX_GPUS] = { 0 }; + +int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(A) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 
28 : 25; + if (device_sm[dev_id] >= 520 && is_windows()) intensity = 26; + if (device_sm[dev_id] < 350) intensity = 22; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonces[thr_id], NBN * sizeof(uint32_t)), -1); + init[thr_id] = true; + } + + for (int i=0; i < 20; i++) + be32enc(&endiandata[i], pdata[i]); + + const uint2 target = make_uint2(ptarget[6], ptarget[7]); + blake2b_setBlock(endiandata); + + do { + work->nonces[0] = blake2b_hash_cuda(thr_id, throughput, pdata[19], target, work->nonces[1]); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(A) vhash[8]; + work->valid_nonces = 0; + endiandata[19] = work->nonces[0]; + blake2b_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work_set_target_ratio(work, vhash); + work->valid_nonces++; + pdata[19] = work->nonces[0] + 1; + } else { + gpu_increment_reject(thr_id); + } + + if (work->nonces[1] != UINT32_MAX) { + endiandata[19] = work->nonces[1]; + blake2b_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) { + work->sharediff[1] = work->sharediff[0]; + work->shareratio[1] = work->shareratio[0]; + xchg(work->nonces[1], work->nonces[0]); + work_set_target_ratio(work, vhash); + } else { + bn_set_target_ratio(work, vhash, 1); + } + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan start + } else { + gpu_increment_reject(thr_id); + } + } + + if (work->valid_nonces) { + work->nonces[0] = cuda_swab32(work->nonces[0]); + work->nonces[1] = cuda_swab32(work->nonces[1]); + return work->valid_nonces; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_blake2b(int thr_id) +{ + if (!init[thr_id]) + return; + + //cudaThreadSynchronize(); + + cudaFree(d_resNonces[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/build.cmd b/build.cmd new file mode 100644 index 0000000000..6bea46b5a7 --- /dev/null +++ b/build.cmd @@ -0,0 +1,4 @@ +REM COMMAND LINE MS BUILD +REM Note: /m:2 = 2 threads, but for host code only... 
+ +msbuild ccminer.vcxproj /m /p:Configuration=Release diff --git a/build.sh b/build.sh new file mode 100755 index 0000000000..17935f3968 --- /dev/null +++ b/build.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Simple script to create the Makefile and build + +# export PATH="$PATH:/usr/local/cuda/bin/" + +make distclean || echo clean + +rm -f Makefile.in +rm -f config.status +./autogen.sh || echo done + +# CFLAGS="-O2" ./configure +./configure.sh + +make -j 4 diff --git a/ccminer.conf b/ccminer.conf new file mode 100644 index 0000000000..1760afd7b5 --- /dev/null +++ b/ccminer.conf @@ -0,0 +1,20 @@ +{ + "_comment1" : "Possible keys are the long options (ccminer --help)", + "_comment2" : "todo: support /* comments */", + + "algo" : "lbry", + "intensity": 22, + + "api-bind": "127.0.0.1:4068", + "statsavg": 20, + + "max-log-rate": 60, + "quiet" : false, + "debug" : false, + "protocol" : false, + "cpu-priority" : 3, + + "url" : "stratum+tcp://yiimp.ccminer.org:3334", + "user" : "bK2wcSFJv2nnNjFvTN5Q7VX8X8unJktJsa", + "pass" : "donate" +} diff --git a/ccminer.cpp b/ccminer.cpp new file mode 100644 index 0000000000..26950745b3 --- /dev/null +++ b/ccminer.cpp @@ -0,0 +1,4374 @@ +/* + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014-2017 tpruvot + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef WIN32 +#include +#include +#else +#include +#include +#if HAVE_SYS_SYSCTL_H +#include +#if HAVE_SYS_PARAM_H +#include +#endif +#include +#endif +#endif + +#include "miner.h" +#include "algos.h" +#include "sia/sia-rpc.h" +#include "crypto/xmr-rpc.h" +#include "equi/equihash.h" + +#include + +#ifdef WIN32 +#include +#pragma comment(lib, "winmm.lib") +#include "compat/winansi.h" +BOOL WINAPI ConsoleHandler(DWORD); +#endif + +#define PROGRAM_NAME "ccminer" +#define LP_SCANTIME 60 +#define HEAVYCOIN_BLKHDR_SZ 84 +#define MNR_BLKHDR_SZ 80 + +#include "nvml.h" +#ifdef USE_WRAPNVML +nvml_handle *hnvml = NULL; +#endif + +enum workio_commands { + WC_GET_WORK, + WC_SUBMIT_WORK, + WC_ABORT, +}; + +struct workio_cmd { + enum workio_commands cmd; + struct thr_info *thr; + union { + struct work *work; + } u; + int pooln; +}; + +bool opt_debug = false; +bool opt_debug_diff = false; +bool opt_debug_threads = false; +bool opt_protocol = false; +bool opt_benchmark = false; +bool opt_showdiff = true; +bool opt_hwmonitor = true; + +// todo: limit use of these flags, +// prefer the pools[] attributes +bool want_longpoll = true; +bool have_longpoll = false; +bool want_stratum = true; +bool have_stratum = false; +bool allow_gbt = true; +bool allow_mininginfo = true; +bool check_dups = true; //false; +bool check_stratum_jobs = false; +bool opt_submit_stale = false; +bool submit_old = false; +bool use_syslog = false; +bool use_colors = true; +int use_pok = 0; +int use_roots = 0; +static bool opt_background = false; +bool opt_quiet = false; +int opt_maxlograte = 3; +static int opt_retries = -1; +static int opt_fail_pause = 30; +int opt_time_limit = -1; +int opt_shares_limit = -1; +time_t firstwork_time = 0; +int opt_timeout = 300; // curl +int opt_scantime = 10; +static json_t *opt_config; +static const bool opt_time = true; 
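/*
 * Minimal sketch, not part of the original patch: how a JSON config such as
 * the ccminer.conf sample shown earlier in this diff ("possible keys are the
 * long options") could be folded back into the long-option table defined
 * further down in this file. The parse_arg(key, value) handler is hypothetical
 * and forward declarations are omitted; only the jansson calls and the
 * option-table fields (.name / .val) come from the surrounding code.
 */
static void apply_json_config_sketch(json_t *config)
{
	for (int i = 0; i < options_count(); i++) {
		json_t *val = json_object_get(config, options[i].name);
		if (!val)
			continue;
		if (json_is_string(val)) {
			/* e.g. "algo": "lbry"  ->  --algo=lbry */
			parse_arg(options[i].val, json_string_value(val));
		} else if (json_is_integer(val)) {
			/* e.g. "intensity": 22  ->  --intensity=22 */
			char buf[16];
			sprintf(buf, "%d", (int) json_integer_value(val));
			parse_arg(options[i].val, buf);
		} else if (json_is_true(val)) {
			/* boolean keys like "quiet" behave as plain flags */
			parse_arg(options[i].val, "");
		}
	}
}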
+volatile enum sha_algos opt_algo = ALGO_AUTO; +int opt_n_threads = 0; +int gpu_threads = 1; +int64_t opt_affinity = -1L; +int opt_priority = 0; +static double opt_difficulty = 1.; +bool opt_extranonce = true; +bool opt_trust_pool = false; +uint16_t opt_vote = 9999; +int num_cpus; +int active_gpus; +bool need_nvsettings = false; +bool need_memclockrst = false; +char * device_name[MAX_GPUS]; +short device_map[MAX_GPUS] = { 0 }; +long device_sm[MAX_GPUS] = { 0 }; +short device_mpcount[MAX_GPUS] = { 0 }; +uint32_t gpus_intensity[MAX_GPUS] = { 0 }; +uint32_t device_gpu_clocks[MAX_GPUS] = { 0 }; +uint32_t device_mem_clocks[MAX_GPUS] = { 0 }; +int32_t device_mem_offsets[MAX_GPUS] = { 0 }; +uint32_t device_plimit[MAX_GPUS] = { 0 }; +uint8_t device_tlimit[MAX_GPUS] = { 0 }; +int8_t device_pstate[MAX_GPUS] = { -1, -1 }; +int32_t device_led[MAX_GPUS] = { -1, -1 }; +int opt_led_mode = 0; +int opt_cudaschedule = -1; +static bool opt_keep_clocks = false; + +// un-linked to cmdline scrypt options (useless) +int device_batchsize[MAX_GPUS] = { 0 }; +int device_texturecache[MAX_GPUS] = { 0 }; +int device_singlememory[MAX_GPUS] = { 0 }; +// implemented scrypt options +int parallel = 2; // All should be made on GPU +char *device_config[MAX_GPUS] = { 0 }; +int device_backoff[MAX_GPUS] = { 0 }; // scrypt +int device_bfactor[MAX_GPUS] = { 0 }; // cryptonight +int device_lookup_gap[MAX_GPUS] = { 0 }; +int device_interactive[MAX_GPUS] = { 0 }; +int opt_nfactor = 0; +bool opt_autotune = true; +char *jane_params = NULL; + +// pools (failover/getwork infos) +struct pool_infos pools[MAX_POOLS] = { 0 }; +int num_pools = 1; +volatile int cur_pooln = 0; +bool opt_pool_failover = true; +volatile bool pool_on_hold = false; +volatile bool pool_is_switching = false; +volatile int pool_switch_count = 0; +bool conditional_pool_rotate = false; + +extern char* opt_scratchpad_url; + +// current connection +char *rpc_user = NULL; +char *rpc_pass; +char *rpc_url; +char *short_url = NULL; + +struct stratum_ctx stratum = { 0 }; +pthread_mutex_t stratum_sock_lock; +pthread_mutex_t stratum_work_lock; + +char *opt_cert; +char *opt_proxy; +long opt_proxy_type; +struct thr_info *thr_info = NULL; +static int work_thr_id; +struct thr_api *thr_api; +int longpoll_thr_id = -1; +int stratum_thr_id = -1; +int api_thr_id = -1; +int monitor_thr_id = -1; +bool stratum_need_reset = false; +volatile bool abort_flag = false; +struct work_restart *work_restart = NULL; +static int app_exit_code = EXIT_CODE_OK; + +pthread_mutex_t applog_lock; +pthread_mutex_t stats_lock; +double thr_hashrates[MAX_GPUS] = { 0 }; +uint64_t global_hashrate = 0; +double stratum_diff = 0.0; +double net_diff = 0; +uint64_t net_hashrate = 0; +uint64_t net_blocks = 0; +// conditional mining +uint8_t conditional_state[MAX_GPUS] = { 0 }; +double opt_max_temp = 0.0; +double opt_max_diff = -1.; +double opt_max_rate = -1.; +double opt_resume_temp = 0.; +double opt_resume_diff = 0.; +double opt_resume_rate = -1.; + +int opt_statsavg = 30; + +#define API_MCAST_CODE "FTW" +#define API_MCAST_ADDR "224.0.0.75" + +// strdup on char* to allow a common free() if used +static char* opt_syslog_pfx = strdup(PROGRAM_NAME); +char *opt_api_bind = strdup("127.0.0.1"); /* 0.0.0.0 for all ips */ +int opt_api_port = 4068; /* 0 to disable */ +char *opt_api_allow = NULL; +char *opt_api_groups = NULL; +bool opt_api_mcast = false; +char *opt_api_mcast_addr = strdup(API_MCAST_ADDR); +char *opt_api_mcast_code = strdup(API_MCAST_CODE); +char *opt_api_mcast_des = strdup(""); +int opt_api_mcast_port = 4068; + 
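/*
 * Illustrative sketch only (hypothetical helper, not part of the patch): how
 * the per-GPU tables declared above (device_map[], device_sm[]) are typically
 * consumed, mirroring the intensity selection done in scanhash_blake2b() in
 * blake2b.cu earlier in this diff (the Windows-specific adjustments are left
 * out here for brevity).
 */
static int default_intensity_sketch(int thr_id)
{
	int dev_id = device_map[thr_id];   /* miner thread index -> CUDA device ordinal */
	int intensity = 25;                /* 2^25 nonces per kernel launch by default  */
	if (device_sm[dev_id] >= 500)
		intensity = 28;            /* SM 5.0+ cards can take larger batches     */
	if (device_sm[dev_id] < 350)
		intensity = 22;            /* older cards get a smaller grid            */
	return intensity;
}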
+bool opt_stratum_stats = false; + +int cryptonight_fork = 1; + +static char const usage[] = "\ +Usage: " PROGRAM_NAME " [OPTIONS]\n\ +Options:\n\ + -a, --algo=ALGO specify the hash algorithm to use\n\ + allium Garlic double lyra2\n\ + bastion Hefty bastion\n\ + bitcore Timetravel-10\n\ + blake Blake 256 (SFR)\n\ + blake2b Blake2-B 512 (BCX)\n\ + blake2s Blake2-S 256 (NEVA)\n\ + blakecoin Fast Blake 256 (8 rounds)\n\ + bmw BMW 256\n\ + cryptolight AEON cryptonight (MEM/2)\n\ + cryptonight XMR cryptonight v1 (old)\n\ + c11/flax X11 variant\n\ + decred Decred Blake256\n\ + deep Deepcoin\n\ + equihash Zcash Equihash\n\ + exosis Exosis timetravel\n\ + dmd-gr Diamond-Groestl\n\ + fresh Freshcoin (shavite 80)\n\ + fugue256 Fuguecoin\n\ + graft Cryptonight v8\n\ + groestl Groestlcoin\n" +#ifdef WITH_HEAVY_ALGO +" heavy Heavycoin\n" +#endif +" hmq1725 Doubloons / Espers\n\ + jackpot JHA v8\n\ + keccak Deprecated Keccak-256\n\ + keccakc Keccak-256 (CreativeCoin)\n\ + lbry LBRY Credits (Sha/Ripemd)\n\ + luffa Joincoin\n\ + lyra2 CryptoCoin\n\ + lyra2v2 MonaCoin\n\ + lyra2v3 Vertcoin\n\ + lyra2z ZeroCoin (3rd impl)\n\ + myr-gr Myriad-Groestl\n\ + monero XMR cryptonight (v7)\n\ + neoscrypt FeatherCoin, Phoenix, UFO...\n\ + nist5 NIST5 (TalkCoin)\n\ + penta Pentablake hash (5x Blake 512)\n\ + phi1612 LUX initial algo, for Seraph\n\ + phi2 LUX v2 with lyra2\n\ + polytimos Politimos\n\ + quark Quark\n\ + qubit Qubit\n\ + sha256d SHA256d (bitcoin)\n\ + sha256t SHA256 x3\n\ + sha256q SHA256 x4\n\ + sia SIA (Blake2B)\n\ + sib Sibcoin (X11+Streebog)\n\ + scrypt Scrypt\n\ + scrypt-jane Scrypt-jane Chacha\n\ + skein Skein SHA2 (Skeincoin)\n\ + skein2 Double Skein (Woodcoin)\n\ + skunk Skein Cube Fugue Streebog\n\ + sonoa 97 hashes based on X17 ones (Sono)\n\ + stellite Cryptonight v3\n\ + s3 S3 (1Coin)\n\ + timetravel Machinecoin permuted x8\n\ + tribus Denarius\n\ + vanilla Blake256-8 (VNL)\n\ + veltor Thorsriddle streebog\n\ + whirlcoin Old Whirlcoin (Whirlpool algo)\n\ + whirlpool Whirlpool algo\n\ + x11evo Permuted x11 (Revolver)\n\ + x11 X11 (DarkCoin)\n\ + x12 X12 (GalaxyCash)\n\ + x13 X13 (MaruCoin)\n\ + x14 X14\n\ + x15 X15\n\ + x16r X16R (Raven)\n\ + x16s X16S\n\ + x17 X17\n\ + wildkeccak Boolberry\n\ + zr5 ZR5 (ZiftrCoin)\n\ + -d, --devices Comma separated list of CUDA devices to use.\n\ + Device IDs start counting from 0! 
Alternatively takes\n\ + string names of your cards like gtx780ti or gt640#2\n\ + (matching 2nd gt640 in the PC)\n\ + -i --intensity=N[,N] GPU intensity 8.0-25.0 (default: auto) \n\ + Decimals are allowed for fine tuning \n\ + --cuda-schedule Set device threads scheduling mode (default: auto)\n\ + -f, --diff-factor Divide difficulty by this factor (default 1.0) \n\ + -m, --diff-multiplier Multiply difficulty by this value (default 1.0) \n\ + -o, --url=URL URL of mining server\n\ + -O, --userpass=U:P username:password pair for mining server\n\ + -u, --user=USERNAME username for mining server\n\ + -p, --pass=PASSWORD password for mining server\n\ + --cert=FILE certificate for mining server using SSL\n\ + -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy\n\ + -t, --threads=N number of miner threads (default: number of nVidia GPUs)\n\ + -r, --retries=N number of times to retry if a network call fails\n\ + (default: retry indefinitely)\n\ + -R, --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ + --shares-limit maximum shares [s] to mine before exiting the program.\n\ + --time-limit maximum time [s] to mine before exiting the program.\n\ + -T, --timeout=N network timeout, in seconds (default: 300)\n\ + -s, --scantime=N upper bound on time spent scanning current work when\n\ + long polling is unavailable, in seconds (default: 10)\n\ + --submit-stale ignore stale jobs checks, may create more rejected shares\n\ + -n, --ndevs list cuda devices\n\ + -N, --statsavg number of samples used to compute hashrate (default: 30)\n\ + --no-gbt disable getblocktemplate support (height check in solo)\n\ + --no-longpoll disable X-Long-Polling support\n\ + --no-stratum disable X-Stratum support\n\ + --no-extranonce disable extranonce subscribe on stratum\n\ + -q, --quiet disable per-thread hashmeter output\n\ + --no-color disable colored output\n\ + -D, --debug enable debug output\n\ + -P, --protocol-dump verbose dump of protocol-level activities\n\ + --cpu-affinity set process affinity to cpu core(s), mask 0x3 for cores 0 and 1\n\ + --cpu-priority set process priority (default: 3) 0 idle, 2 normal to 5 highest\n\ + -b, --api-bind=port IP:port for the miner API (default: 127.0.0.1:4068), 0 disabled\n\ + --api-remote Allow remote control, like pool switching, imply --api-allow=0/0\n\ + --api-allow=... IP/mask of the allowed api client(s), 0/0 for all\n\ + --max-temp=N Only mine if gpu temp is less than specified value\n\ + --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\ + --max-diff=N Only mine if net difficulty is less than specified value\n\ + Can be tuned with --resume-diff=N to set a resume value\n\ + --max-log-rate Interval to reduce per gpu hashrate logs (default: 3)\n" +#if defined(__linux) /* via nvml */ +"\ + --mem-clock=3505 Set the gpu memory max clock (346.72+ driver)\n\ + --gpu-clock=1150 Set the gpu engine max clock (346.72+ driver)\n\ + --pstate=0[,2] Set the gpu power state (352.21+ driver)\n\ + --plimit=100W Set the gpu power limit (352.21+ driver)\n" +#else /* via nvapi.dll */ +"\ + --mem-clock=3505 Set the gpu memory boost clock\n\ + --mem-clock=+500 Set the gpu memory offset\n\ + --gpu-clock=1150 Set the gpu engine boost clock\n\ + --plimit=100 Set the gpu power limit in percentage\n\ + --tlimit=80 Set the gpu thermal limit in degrees\n\ + --led=100 Set the logo led level (0=disable, 0xFF00FF for RVB)\n" +#endif +#ifdef HAVE_SYSLOG_H +"\ + -S, --syslog use system log for output messages\n\ + --syslog-prefix=... 
allow to change syslog tool name\n" +#endif +"\ + --hide-diff hide submitted block and net difficulty (old mode)\n\ + -B, --background run the miner in the background\n\ + --benchmark run in offline benchmark mode\n\ + --cputest debug hashes from cpu algorithms\n\ + -c, --config=FILE load a JSON-format configuration file\n\ + -V, --version display version information and exit\n\ + -h, --help display this help text and exit\n\ +"; + +static char const short_options[] = +#ifdef HAVE_SYSLOG_H + "S" +#endif + "a:Bc:k:i:Dhp:Px:f:m:nqr:R:s:t:T:o:u:O:Vd:N:b:l:L:"; + +struct option options[] = { + { "algo", 1, NULL, 'a' }, + { "api-bind", 1, NULL, 'b' }, + { "api-remote", 0, NULL, 1030 }, + { "api-allow", 1, NULL, 1031 }, + { "api-groups", 1, NULL, 1032 }, + { "api-mcast", 0, NULL, 1033 }, + { "api-mcast-addr", 1, NULL, 1034 }, + { "api-mcast-code", 1, NULL, 1035 }, + { "api-mcast-port", 1, NULL, 1036 }, + { "api-mcast-des", 1, NULL, 1037 }, + { "background", 0, NULL, 'B' }, + { "benchmark", 0, NULL, 1005 }, + { "cert", 1, NULL, 1001 }, + { "config", 1, NULL, 'c' }, + { "cputest", 0, NULL, 1006 }, + { "cpu-affinity", 1, NULL, 1020 }, + { "cpu-priority", 1, NULL, 1021 }, + { "cuda-schedule", 1, NULL, 1025 }, + { "debug", 0, NULL, 'D' }, + { "help", 0, NULL, 'h' }, + { "intensity", 1, NULL, 'i' }, + { "ndevs", 0, NULL, 'n' }, + { "no-color", 0, NULL, 1002 }, + { "no-extranonce", 0, NULL, 1012 }, + { "no-gbt", 0, NULL, 1011 }, + { "no-longpoll", 0, NULL, 1003 }, + { "no-stratum", 0, NULL, 1007 }, + { "no-autotune", 0, NULL, 1004 }, // scrypt + { "interactive", 1, NULL, 1050 }, // scrypt + { "lookup-gap", 1, NULL, 'L' }, // scrypt + { "texture-cache", 1, NULL, 1051 },// scrypt + { "launch-config", 1, NULL, 'l' }, // scrypt bbr xmr + { "scratchpad", 1, NULL, 'k' }, // bbr + { "bfactor", 1, NULL, 1055 }, // xmr + { "max-temp", 1, NULL, 1060 }, + { "max-diff", 1, NULL, 1061 }, + { "max-rate", 1, NULL, 1062 }, + { "resume-diff", 1, NULL, 1063 }, + { "resume-rate", 1, NULL, 1064 }, + { "resume-temp", 1, NULL, 1065 }, + { "pass", 1, NULL, 'p' }, + { "pool-name", 1, NULL, 1100 }, // pool + { "pool-algo", 1, NULL, 1101 }, // pool + { "pool-scantime", 1, NULL, 1102 }, // pool + { "pool-shares-limit", 1, NULL, 1109 }, + { "pool-time-limit", 1, NULL, 1108 }, + { "pool-max-diff", 1, NULL, 1161 }, // pool + { "pool-max-rate", 1, NULL, 1162 }, // pool + { "pool-disabled", 1, NULL, 1199 }, // pool + { "protocol-dump", 0, NULL, 'P' }, + { "proxy", 1, NULL, 'x' }, + { "quiet", 0, NULL, 'q' }, + { "retries", 1, NULL, 'r' }, + { "retry-pause", 1, NULL, 'R' }, + { "scantime", 1, NULL, 's' }, + { "show-diff", 0, NULL, 1013 }, // deprecated + { "submit-stale", 0, NULL, 1015 }, + { "hide-diff", 0, NULL, 1014 }, + { "statsavg", 1, NULL, 'N' }, + { "gpu-clock", 1, NULL, 1070 }, + { "mem-clock", 1, NULL, 1071 }, + { "pstate", 1, NULL, 1072 }, + { "plimit", 1, NULL, 1073 }, + { "keep-clocks", 0, NULL, 1074 }, + { "tlimit", 1, NULL, 1075 }, + { "led", 1, NULL, 1080 }, + { "max-log-rate", 1, NULL, 1019 }, +#ifdef HAVE_SYSLOG_H + { "syslog", 0, NULL, 'S' }, + { "syslog-prefix", 1, NULL, 1018 }, +#endif + { "shares-limit", 1, NULL, 1009 }, + { "time-limit", 1, NULL, 1008 }, + { "threads", 1, NULL, 't' }, + { "vote", 1, NULL, 1022 }, + { "trust-pool", 0, NULL, 1023 }, + { "timeout", 1, NULL, 'T' }, + { "url", 1, NULL, 'o' }, + { "user", 1, NULL, 'u' }, + { "userpass", 1, NULL, 'O' }, + { "version", 0, NULL, 'V' }, + { "devices", 1, NULL, 'd' }, + { "diff-multiplier", 1, NULL, 'm' }, + { "diff-factor", 1, NULL, 'f' }, + { "diff", 1, 
NULL, 'f' }, // compat + { 0, 0, 0, 0 } +}; + +static char const scrypt_usage[] = "\n\ +Scrypt specific options:\n\ + -l, --launch-config gives the launch configuration for each kernel\n\ + in a comma separated list, one per device.\n\ + -L, --lookup-gap Divides the per-hash memory requirement by this factor\n\ + by storing only every N'th value in the scratchpad.\n\ + Default is 1.\n\ + --interactive comma separated list of flags (0/1) specifying\n\ + which of the CUDA device you need to run at inter-\n\ + active frame rates (because it drives a display).\n\ + --texture-cache comma separated list of flags (0/1/2) specifying\n\ + which of the CUDA devices shall use the texture\n\ + cache for mining. Kepler devices may profit.\n\ + --no-autotune disable auto-tuning of kernel launch parameters\n\ +"; + +static char const xmr_usage[] = "\n\ +CryptoNight specific options:\n\ + -l, --launch-config gives the launch configuration for each kernel\n\ + in a comma separated list, one per device.\n\ + --bfactor=[0-12] Run Cryptonight core kernel in smaller pieces,\n\ + From 0 (ui freeze) to 12 (smooth), win default is 11\n\ + This is a per-device setting like the launch config.\n\ +"; + +static char const bbr_usage[] = "\n\ +Boolberry specific options:\n\ + -l, --launch-config gives the launch configuration for each kernel\n\ + in a comma separated list, one per device.\n\ + -k, --scratchpad url Url used to download the scratchpad cache.\n\ +"; + +struct work _ALIGN(64) g_work; +volatile time_t g_work_time; +pthread_mutex_t g_work_lock; + +// get const array size (defined in ccminer.cpp) +int options_count() +{ + int n = 0; + while (options[n].name != NULL) + n++; + return n; +} + +#ifdef __linux /* Linux specific policy and affinity management */ +#include +static inline void drop_policy(void) { + struct sched_param param; + param.sched_priority = 0; +#ifdef SCHED_IDLE + if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) +#endif +#ifdef SCHED_BATCH + sched_setscheduler(0, SCHED_BATCH, ¶m); +#endif +} + +static void affine_to_cpu_mask(int id, unsigned long mask) { + cpu_set_t set; + CPU_ZERO(&set); + for (uint8_t i = 0; i < num_cpus; i++) { + // cpu mask + if (mask & (1UL< +static inline void drop_policy(void) { } +static void affine_to_cpu_mask(int id, unsigned long mask) { + cpuset_t set; + CPU_ZERO(&set); + for (uint8_t i = 0; i < num_cpus; i++) { + if (mask & (1UL<data[18] : swab32(work->data[18]); + if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]); + if (opt_algo == ALGO_DECRED) nbits = work->data[29]; + if (opt_algo == ALGO_SIA) nbits = work->data[11]; // unsure if correct + if (opt_algo == ALGO_EQUIHASH) { + net_diff = equi_network_diff(work); + return; + } + + uint32_t bits = (nbits & 0xffffff); + int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 + + uint64_t diffone = 0x0000FFFF00000000ull; + double d = (double)0x0000ffff / (double)bits; + + for (int m=shift; m < 29; m++) d *= 256.0; + for (int m=29; m < shift; m++) d /= 256.0; + if (opt_algo == ALGO_DECRED && shift == 28) d *= 256.0; + if (opt_debug_diff) + applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); + + net_diff = d; +} + +/* decode data from getwork (wallets and longpoll pools) */ +static bool work_decode(const json_t *val, struct work *work) +{ + int data_size, target_size = sizeof(work->target); + int adata_sz, atarget_sz = ARRAY_SIZE(work->target); + int i; + + switch (opt_algo) { + case ALGO_DECRED: + data_size = 192; + adata_sz = 180/4; + break; + case ALGO_PHI2: + data_size = 144; + 
adata_sz = data_size / 4; + break; + case ALGO_NEOSCRYPT: + case ALGO_ZR5: + data_size = 80; + adata_sz = data_size / 4; + break; + case ALGO_CRYPTOLIGHT: + case ALGO_CRYPTONIGHT: + case ALGO_WILDKECCAK: + return rpc2_job_decode(val, work); + default: + data_size = 128; + adata_sz = data_size / 4; + } + + if (!jobj_binary(val, "data", work->data, data_size)) { + json_t *obj = json_object_get(val, "data"); + int len = obj ? (int) strlen(json_string_value(obj)) : 0; + if (!len || len > sizeof(work->data)*2) { + applog(LOG_ERR, "JSON invalid data (len %d <> %d)", len/2, data_size); + return false; + } else { + data_size = len / 2; + if (!jobj_binary(val, "data", work->data, data_size)) { + applog(LOG_ERR, "JSON invalid data (len %d)", data_size); + return false; + } + } + } + + if (!jobj_binary(val, "target", work->target, target_size)) { + applog(LOG_ERR, "JSON invalid target"); + return false; + } + + if (opt_algo == ALGO_HEAVY) { + if (unlikely(!jobj_binary(val, "maxvote", &work->maxvote, sizeof(work->maxvote)))) { + work->maxvote = 2048; + } + } else work->maxvote = 0; + + for (i = 0; i < adata_sz; i++) + work->data[i] = le32dec(work->data + i); + for (i = 0; i < atarget_sz; i++) + work->target[i] = le32dec(work->target + i); + + if (opt_algo == ALGO_PHI2) { + for (i = 20; i < 36; i++) if (work->data[i]) { + use_roots = 1; break; + } + } + + if ((opt_showdiff || opt_max_diff > 0.) && !allow_mininginfo) + calc_network_diff(work); + + work->targetdiff = target_to_diff(work->target); + + // for api stats, on longpoll pools + stratum_diff = work->targetdiff; + + work->tx_count = use_pok = 0; + if (opt_algo == ALGO_ZR5 && work->data[0] & POK_BOOL_MASK) { + use_pok = 1; + json_t *txs = json_object_get(val, "txs"); + if (txs && json_is_array(txs)) { + size_t idx, totlen = 0; + json_t *p; + + json_array_foreach(txs, idx, p) { + const int tx = work->tx_count % POK_MAX_TXS; + const char* hexstr = json_string_value(p); + size_t txlen = strlen(hexstr)/2; + work->tx_count++; + if (work->tx_count > POK_MAX_TXS || txlen >= POK_MAX_TX_SZ) { + // when tx is too big, just reset use_pok for the block + use_pok = 0; + if (opt_debug) applog(LOG_WARNING, + "pok: large block ignored, tx len: %u", txlen); + work->tx_count = 0; + break; + } + hex2bin((uchar*)work->txs[tx].data, hexstr, min(txlen, POK_MAX_TX_SZ)); + work->txs[tx].len = (uint32_t) (txlen); + totlen += txlen; + } + if (opt_debug) + applog(LOG_DEBUG, "block txs: %u, total len: %u", work->tx_count, totlen); + } + } + + /* use work ntime as job id (solo-mining) */ + cbin2hex(work->job_id, (const char*)&work->data[17], 4); + + if (opt_algo == ALGO_DECRED) { + uint16_t vote; + // always keep last bit of votebits + memcpy(&vote, &work->data[25], 2); + vote = (opt_vote << 1) | (vote & 1); + memcpy(&work->data[25], &vote, 2); + // some random extradata to make it unique + work->data[36] = (rand()*4); + work->data[37] = (rand()*4) << 8; + // required for the longpoll pool block info... + work->height = work->data[32]; + if (!have_longpoll && work->height > net_blocks + 1) { + char netinfo[64] = { 0 }; + if (opt_showdiff && net_diff > 0.) { + if (net_diff != work->targetdiff) + sprintf(netinfo, ", diff %.3f, pool %.1f", net_diff, work->targetdiff); + else + sprintf(netinfo, ", diff %.3f", net_diff); + } + applog(LOG_BLUE, "%s block %d%s", + algo_names[opt_algo], work->height, netinfo); + net_blocks = work->height - 1; + } + cbin2hex(work->job_id, (const char*)&work->data[34], 4); + } + + return true; +} + +#define YES "yes!" +#define YAY "yay!!!" 
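/*
 * Worked example for the compact-bits conversion in calc_network_diff() above
 * (values are the well-known reference case, not taken from this patch):
 * nbits = 0x1b0404cb gives shift = 0x1b (27) and bits = 0x0404cb, so
 *   d = (0xffff / (double)0x0404cb) * 256^(29 - 27) ~= 0.24883 * 65536 ~= 16307.42
 * i.e. the classic difficulty for that target. For the diff-1 target
 * nbits = 0x1d00ffff, shift = 29 and bits = 0xffff, so d = 1 and the
 * scaling loops do not run at all.
 */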
+#define BOO "booooo" + +int share_result(int result, int pooln, double sharediff, const char *reason) +{ + const char *flag; + char suppl[32] = { 0 }; + char solved[16] = { 0 }; + char s[32] = { 0 }; + double hashrate = 0.; + struct pool_infos *p = &pools[pooln]; + + pthread_mutex_lock(&stats_lock); + for (int i = 0; i < opt_n_threads; i++) { + hashrate += stats_get_speed(i, thr_hashrates[i]); + } + pthread_mutex_unlock(&stats_lock); + + result ? p->accepted_count++ : p->rejected_count++; + + p->last_share_time = time(NULL); + if (sharediff > p->best_share) + p->best_share = sharediff; + + global_hashrate = llround(hashrate); + + format_hashrate(hashrate, s); + if (opt_showdiff) + sprintf(suppl, "diff %.3f", sharediff); + else // accepted percent + sprintf(suppl, "%.2f%%", 100. * p->accepted_count / (p->accepted_count + p->rejected_count)); + + if (!net_diff || sharediff < net_diff) { + flag = use_colors ? + (result ? CL_GRN YES : CL_RED BOO) + : (result ? "(" YES ")" : "(" BOO ")"); + } else { + p->solved_count++; + flag = use_colors ? + (result ? CL_GRN YAY : CL_RED BOO) + : (result ? "(" YAY ")" : "(" BOO ")"); + sprintf(solved, " solved: %u", p->solved_count); + } + + applog(LOG_NOTICE, "accepted: %lu/%lu (%s), %s %s%s", + p->accepted_count, + p->accepted_count + p->rejected_count, + suppl, s, flag, solved); + if (reason) { + applog(LOG_WARNING, "reject reason: %s", reason); + if (!check_dups && strncasecmp(reason, "duplicate", 9) == 0) { + applog(LOG_WARNING, "enabling duplicates check feature"); + check_dups = true; + g_work_time = 0; + } + } + return 1; +} + +static bool submit_upstream_work(CURL *curl, struct work *work) +{ + char s[512]; + struct pool_infos *pool = &pools[work->pooln]; + json_t *val, *res, *reason; + bool stale_work = false; + int idnonce = work->submit_nonce_id; + + if (pool->type & POOL_STRATUM && stratum.rpc2) { + struct work submit_work; + memcpy(&submit_work, work, sizeof(struct work)); + if (!hashlog_already_submittted(submit_work.job_id, submit_work.nonces[idnonce])) { + if (rpc2_stratum_submit(pool, &submit_work)) + hashlog_remember_submit(&submit_work, submit_work.nonces[idnonce]); + stratum.job.shares_count++; + } + return true; + } + + if (pool->type & POOL_STRATUM && stratum.is_equihash) { + struct work submit_work; + memcpy(&submit_work, work, sizeof(struct work)); + //if (!hashlog_already_submittted(submit_work.job_id, submit_work.nonces[idnonce])) { + if (equi_stratum_submit(pool, &submit_work)) + hashlog_remember_submit(&submit_work, submit_work.nonces[idnonce]); + stratum.job.shares_count++; + //} + return true; + } + + /* discard if a newer block was received */ + stale_work = work->height && work->height < g_work.height; + if (have_stratum && !stale_work && !opt_submit_stale && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) { + pthread_mutex_lock(&g_work_lock); + if (strlen(work->job_id + 8)) + stale_work = strncmp(work->job_id + 8, g_work.job_id + 8, sizeof(g_work.job_id) - 8); + if (stale_work) { + pool->stales_count++; + if (opt_debug) applog(LOG_DEBUG, "outdated job %s, new %s stales=%d", + work->job_id + 8 , g_work.job_id + 8, pool->stales_count); + if (!check_stratum_jobs && pool->stales_count > 5) { + if (!opt_quiet) applog(LOG_WARNING, "Enabled stratum stale jobs workaround"); + check_stratum_jobs = true; + } + } + pthread_mutex_unlock(&g_work_lock); + } + + if (!have_stratum && !stale_work && allow_gbt) { + struct work wheight = { 0 }; + if (get_blocktemplate(curl, &wheight)) { + if (work->height && work->height < 
wheight.height) { + if (opt_debug) + applog(LOG_WARNING, "block %u was already solved", work->height); + return true; + } + } + } + + if (!stale_work && opt_algo == ALGO_ZR5 && !have_stratum) { + stale_work = (memcmp(&work->data[1], &g_work.data[1], 68)); + } + + if (!submit_old && stale_work) { + if (opt_debug) + applog(LOG_WARNING, "stale work detected, discarding"); + return true; + } + + if (pool->type & POOL_STRATUM) { + uint32_t sent = 0; + uint32_t ntime, nonce = work->nonces[idnonce]; + char *ntimestr, *noncestr, *xnonce2str, *nvotestr; + uint16_t nvote = 0; + + switch (opt_algo) { + case ALGO_BLAKE: + case ALGO_BLAKECOIN: + case ALGO_BLAKE2S: + case ALGO_BMW: + case ALGO_SHA256D: + case ALGO_SHA256T: + case ALGO_SHA256Q: + case ALGO_VANILLA: + // fast algos require that... (todo: regen hash) + check_dups = true; + le32enc(&ntime, work->data[17]); + le32enc(&nonce, work->data[19]); + break; + case ALGO_DECRED: + be16enc(&nvote, *((uint16_t*)&work->data[25])); + be32enc(&ntime, work->data[34]); + be32enc(&nonce, work->data[35]); + break; + case ALGO_HEAVY: + le32enc(&ntime, work->data[17]); + le32enc(&nonce, work->data[19]); + be16enc(&nvote, *((uint16_t*)&work->data[20])); + break; + case ALGO_LBRY: + check_dups = true; + le32enc(&ntime, work->data[25]); + //le32enc(&nonce, work->data[27]); + break; + case ALGO_SIA: + be32enc(&ntime, work->data[10]); + be32enc(&nonce, work->data[8]); + break; + case ALGO_ZR5: + check_dups = true; + be32enc(&ntime, work->data[17]); + be32enc(&nonce, work->data[19]); + break; + default: + le32enc(&ntime, work->data[17]); + le32enc(&nonce, work->data[19]); + } + noncestr = bin2hex((const uchar*)(&nonce), 4); + + if (check_dups) + sent = hashlog_already_submittted(work->job_id, nonce); + if (sent > 0) { + sent = (uint32_t) time(NULL) - sent; + if (!opt_quiet) { + applog(LOG_WARNING, "nonce %s was already sent %u seconds ago", noncestr, sent); + hashlog_dump_job(work->job_id); + } + free(noncestr); + // prevent useless computing on some pools + g_work_time = 0; + restart_threads(); + return true; + } + + ntimestr = bin2hex((const uchar*)(&ntime), 4); + + if (opt_algo == ALGO_DECRED) { + xnonce2str = bin2hex((const uchar*)&work->data[36], stratum.xnonce1_size); + } else if (opt_algo == ALGO_SIA) { + uint16_t high_nonce = swab32(work->data[9]) >> 16; + xnonce2str = bin2hex((unsigned char*)(&high_nonce), 2); + } else { + xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); + } + + // store to keep/display the solved ratio/diff + stratum.sharediff = work->sharediff[idnonce]; + + if (net_diff && stratum.sharediff > net_diff && (opt_debug || opt_debug_diff)) + applog(LOG_INFO, "share diff: %.5f, possible block found!!!", + stratum.sharediff); + else if (opt_debug_diff) + applog(LOG_DEBUG, "share diff: %.5f (x %.1f)", + stratum.sharediff, work->shareratio[idnonce]); + + if (opt_vote) { // ALGO_HEAVY + nvotestr = bin2hex((const uchar*)(&nvote), 2); + sprintf(s, "{\"method\": \"mining.submit\", \"params\": [" + "\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":%u}", + pool->user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr, stratum.job.shares_count + 10); + free(nvotestr); + } else { + sprintf(s, "{\"method\": \"mining.submit\", \"params\": [" + "\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":%u}", + pool->user, work->job_id + 8, xnonce2str, ntimestr, noncestr, stratum.job.shares_count + 10); + } + free(xnonce2str); + free(ntimestr); + free(noncestr); + + gettimeofday(&stratum.tv_submit, NULL); + if 
(unlikely(!stratum_send_line(&stratum, s))) { + applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); + return false; + } + + if (check_dups || opt_showdiff) + hashlog_remember_submit(work, nonce); + stratum.job.shares_count++; + + } else { + + int data_size = 128; + int adata_sz = data_size / sizeof(uint32_t); + + /* build hex string */ + char *str = NULL; + + if (opt_algo == ALGO_ZR5) { + data_size = 80; adata_sz = 20; + } + else if (opt_algo == ALGO_DECRED) { + data_size = 192; adata_sz = 180/4; + } + else if (opt_algo == ALGO_PHI2 && use_roots) { + data_size = 144; adata_sz = 36; + } + else if (opt_algo == ALGO_SIA) { + return sia_submit(curl, pool, work); + } + + if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) { + for (int i = 0; i < adata_sz; i++) + le32enc(work->data + i, work->data[i]); + } + str = bin2hex((uchar*)work->data, data_size); + if (unlikely(!str)) { + applog(LOG_ERR, "submit_upstream_work OOM"); + return false; + } + + /* build JSON-RPC request */ + sprintf(s, + "{\"method\": \"getwork\", \"params\": [\"%s\"], \"id\":10}\r\n", + str); + + /* issue JSON-RPC request */ + val = json_rpc_call_pool(curl, pool, s, false, false, NULL); + if (unlikely(!val)) { + applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); + return false; + } + + res = json_object_get(val, "result"); + reason = json_object_get(val, "reject-reason"); + if (!share_result(json_is_true(res), work->pooln, work->sharediff[0], + reason ? json_string_value(reason) : NULL)) + { + if (check_dups) + hashlog_purge_job(work->job_id); + } + + json_decref(val); + + free(str); + } + + return true; +} + +/* simplified method to only get some extra infos in solo mode */ +static bool gbt_work_decode(const json_t *val, struct work *work) +{ + json_t *err = json_object_get(val, "error"); + if (err && !json_is_null(err)) { + allow_gbt = false; + applog(LOG_INFO, "GBT not supported, block height unavailable"); + return false; + } + + if (!work->height) { + // complete missing data from getwork + json_t *key = json_object_get(val, "height"); + if (key && json_is_integer(key)) { + work->height = (uint32_t) json_integer_value(key); + if (!opt_quiet && work->height > g_work.height) { + if (net_diff > 0.) 
{ + char netinfo[64] = { 0 }; + char srate[32] = { 0 }; + sprintf(netinfo, "diff %.2f", net_diff); + if (net_hashrate) { + format_hashrate((double) net_hashrate, srate); + strcat(netinfo, ", net "); + strcat(netinfo, srate); + } + applog(LOG_BLUE, "%s block %d, %s", + algo_names[opt_algo], work->height, netinfo); + } else { + applog(LOG_BLUE, "%s %s block %d", short_url, + algo_names[opt_algo], work->height); + } + g_work.height = work->height; + } + } + } + + return true; +} + +#define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]" +static const char *gbt_req = + "{\"method\": \"getblocktemplate\", \"params\": [{" + // "\"capabilities\": " GBT_CAPABILITIES "" + "}], \"id\":9}\r\n"; + +static bool get_blocktemplate(CURL *curl, struct work *work) +{ + struct pool_infos *pool = &pools[work->pooln]; + if (!allow_gbt) + return false; + + int curl_err = 0; + json_t *val = json_rpc_call_pool(curl, pool, gbt_req, false, false, &curl_err); + + if (!val && curl_err == -1) { + // when getblocktemplate is not supported, disable it + allow_gbt = false; + if (!opt_quiet) { + applog(LOG_BLUE, "gbt not supported, block height notices disabled"); + } + return false; + } + + bool rc = gbt_work_decode(json_object_get(val, "result"), work); + + json_decref(val); + + return rc; +} + +// good alternative for wallet mining, difficulty and net hashrate +static const char *info_req = + "{\"method\": \"getmininginfo\", \"params\": [], \"id\":8}\r\n"; + +static bool get_mininginfo(CURL *curl, struct work *work) +{ + struct pool_infos *pool = &pools[work->pooln]; + int curl_err = 0; + + if (have_stratum || have_longpoll || !allow_mininginfo) + return false; + + json_t *val = json_rpc_call_pool(curl, pool, info_req, false, false, &curl_err); + + if (!val && curl_err == -1) { + allow_mininginfo = false; + if (opt_debug) { + applog(LOG_DEBUG, "getmininginfo not supported"); + } + return false; + } else { + json_t *res = json_object_get(val, "result"); + // "blocks": 491493 (= current work height - 1) + // "difficulty": 0.99607860999999998 + // "networkhashps": 56475980 + // "netmhashps": 351.74414726 + if (res) { + json_t *key = json_object_get(res, "difficulty"); + if (key) { + if (json_is_object(key)) + key = json_object_get(key, "proof-of-work"); + if (json_is_real(key)) + net_diff = json_real_value(key); + } + key = json_object_get(res, "networkhashps"); + if (key && json_is_integer(key)) { + net_hashrate = json_integer_value(key); + } + key = json_object_get(res, "netmhashps"); + if (key && json_is_real(key)) { + net_hashrate = (uint64_t)(json_real_value(key) * 1e6); + } + key = json_object_get(res, "blocks"); + if (key && json_is_integer(key)) { + net_blocks = json_integer_value(key); + } + } + } + json_decref(val); + return true; +} + +static const char *json_rpc_getwork = + "{\"method\":\"getwork\",\"params\":[],\"id\":0}\r\n"; + +static bool get_upstream_work(CURL *curl, struct work *work) +{ + bool rc = false; + struct timeval tv_start, tv_end, diff; + struct pool_infos *pool = &pools[work->pooln]; + const char *rpc_req = json_rpc_getwork; + json_t *val; + + gettimeofday(&tv_start, NULL); + + if (opt_algo == ALGO_SIA) { + char *sia_header = sia_getheader(curl, pool); + if (sia_header) { + rc = sia_work_decode(sia_header, work); + free(sia_header); + } + gettimeofday(&tv_end, NULL); + if (have_stratum || unlikely(work->pooln != cur_pooln)) { + return rc; + } + return rc; + } + + if (opt_debug_threads) + applog(LOG_DEBUG, "%s: want_longpoll=%d have_longpoll=%d", + __func__, 
want_longpoll, have_longpoll); + + /* want_longpoll/have_longpoll required here to init/unlock the lp thread */ + val = json_rpc_call_pool(curl, pool, rpc_req, want_longpoll, have_longpoll, NULL); + gettimeofday(&tv_end, NULL); + + if (have_stratum || unlikely(work->pooln != cur_pooln)) { + if (val) + json_decref(val); + return false; + } + + if (!val) + return false; + + rc = work_decode(json_object_get(val, "result"), work); + + if (opt_protocol && rc) { + timeval_subtract(&diff, &tv_end, &tv_start); + /* show time because curl can be slower against versions/config */ + applog(LOG_DEBUG, "got new work in %.2f ms", + (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec)); + } + + json_decref(val); + + get_mininginfo(curl, work); + get_blocktemplate(curl, work); + + return rc; +} + +static void workio_cmd_free(struct workio_cmd *wc) +{ + if (!wc) + return; + + switch (wc->cmd) { + case WC_SUBMIT_WORK: + aligned_free(wc->u.work); + break; + default: /* do nothing */ + break; + } + + memset(wc, 0, sizeof(*wc)); /* poison */ + free(wc); +} + +static void workio_abort() +{ + struct workio_cmd *wc; + + /* fill out work request message */ + wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); + if (!wc) + return; + + wc->cmd = WC_ABORT; + + /* send work request to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) { + workio_cmd_free(wc); + } +} + +static bool workio_get_work(struct workio_cmd *wc, CURL *curl) +{ + struct work *ret_work; + int failures = 0; + + ret_work = (struct work*)aligned_calloc(sizeof(struct work)); + if (!ret_work) + return false; + + /* assign pool number before rpc calls */ + ret_work->pooln = wc->pooln; + // applog(LOG_DEBUG, "%s: pool %d", __func__, wc->pooln); + + /* obtain new work from bitcoin via JSON-RPC */ + while (!get_upstream_work(curl, ret_work)) { + + if (unlikely(ret_work->pooln != cur_pooln)) { + applog(LOG_ERR, "get_work json_rpc_call failed"); + aligned_free(ret_work); + tq_push(wc->thr->q, NULL); + return true; + } + + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + applog(LOG_ERR, "get_work json_rpc_call failed"); + aligned_free(ret_work); + return false; + } + + /* pause, then restart work-request loop */ + applog(LOG_ERR, "get_work failed, retry after %d seconds", + opt_fail_pause); + sleep(opt_fail_pause); + } + + /* send work to requesting thread */ + if (!tq_push(wc->thr->q, ret_work)) + aligned_free(ret_work); + + return true; +} + +static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) +{ + int failures = 0; + uint32_t pooln = wc->pooln; + // applog(LOG_DEBUG, "%s: pool %d", __func__, wc->pooln); + + /* submit solution to bitcoin via JSON-RPC */ + while (!submit_upstream_work(curl, wc->u.work)) { + if (pooln != cur_pooln) { + applog(LOG_DEBUG, "work from pool %u discarded", pooln); + return true; + } + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + applog(LOG_ERR, "...terminating workio thread"); + return false; + } + /* pause, then restart work-request loop */ + if (!opt_benchmark) + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + + sleep(opt_fail_pause); + } + + return true; +} + +static void *workio_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info*)userdata; + CURL *curl; + bool ok = true; + + curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "CURL initialization failed"); + return NULL; + } + + while (ok && !abort_flag) { + struct workio_cmd *wc; + + /* wait for workio_cmd sent to us, on our queue */ + wc = (struct workio_cmd 
*)tq_pop(mythr->q, NULL); + if (!wc) { + ok = false; + break; + } + + /* process workio_cmd */ + switch (wc->cmd) { + case WC_GET_WORK: + ok = workio_get_work(wc, curl); + break; + case WC_SUBMIT_WORK: + if (opt_led_mode == LED_MODE_SHARES) + gpu_led_on(device_map[wc->thr->id]); + ok = workio_submit_work(wc, curl); + if (opt_led_mode == LED_MODE_SHARES) + gpu_led_off(device_map[wc->thr->id]); + break; + case WC_ABORT: + default: /* should never happen */ + ok = false; + break; + } + + if (!ok && num_pools > 1 && opt_pool_failover) { + if (opt_debug_threads) + applog(LOG_DEBUG, "%s died, failover", __func__); + ok = pool_switch_next(-1); + tq_push(wc->thr->q, NULL); // get_work() will return false + } + + workio_cmd_free(wc); + } + + if (opt_debug_threads) + applog(LOG_DEBUG, "%s() died", __func__); + curl_easy_cleanup(curl); + tq_freeze(mythr->q); + return NULL; +} + +bool get_work(struct thr_info *thr, struct work *work) +{ + struct workio_cmd *wc; + struct work *work_heap; + + if (opt_benchmark) { + memset(work->data, 0x55, 76); + //work->data[17] = swab32((uint32_t)time(NULL)); + memset(work->data + 19, 0x00, 52); + if (opt_algo == ALGO_DECRED) { + memset(&work->data[35], 0x00, 52); + } else if (opt_algo == ALGO_LBRY) { + work->data[28] = 0x80000000; + } else { + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + } + memset(work->target, 0x00, sizeof(work->target)); + return true; + } + + /* fill out work request message */ + wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); + if (!wc) + return false; + + wc->cmd = WC_GET_WORK; + wc->thr = thr; + wc->pooln = cur_pooln; + + /* send work request to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) { + workio_cmd_free(wc); + return false; + } + + /* wait for response, a unit of work */ + work_heap = (struct work *)tq_pop(thr->q, NULL); + if (!work_heap) + return false; + + /* copy returned work into storage provided by caller */ + memcpy(work, work_heap, sizeof(*work)); + aligned_free(work_heap); + + return true; +} + +static bool submit_work(struct thr_info *thr, const struct work *work_in) +{ + struct workio_cmd *wc; + /* fill out work request message */ + wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); + if (!wc) + return false; + + wc->u.work = (struct work *)aligned_calloc(sizeof(*work_in)); + if (!wc->u.work) + goto err_out; + + wc->cmd = WC_SUBMIT_WORK; + wc->thr = thr; + memcpy(wc->u.work, work_in, sizeof(struct work)); + wc->pooln = work_in->pooln; + + /* send solution to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) + goto err_out; + + return true; + +err_out: + workio_cmd_free(wc); + return false; +} + +static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) +{ + uchar merkle_root[64] = { 0 }; + int i; + + if (sctx->rpc2) + return rpc2_stratum_gen_work(sctx, work); + + if (!sctx->job.job_id) { + // applog(LOG_WARNING, "stratum_gen_work: job not yet retrieved"); + return false; + } + + pthread_mutex_lock(&stratum_work_lock); + + // store the job ntime as high part of jobid + snprintf(work->job_id, sizeof(work->job_id), "%07x %s", + be32dec(sctx->job.ntime) & 0xfffffff, sctx->job.job_id); + work->xnonce2_len = sctx->xnonce2_size; + memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); + + // also store the block number + work->height = sctx->job.height; + // and the pool of the current stratum + work->pooln = sctx->pooln; + + /* Generate merkle root */ + switch (opt_algo) { + case ALGO_DECRED: + case ALGO_EQUIHASH: + case ALGO_SIA: + // getwork over stratum, no 
merkle to generate + break; +#ifdef WITH_HEAVY_ALGO + case ALGO_HEAVY: + case ALGO_MJOLLNIR: + heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + break; +#endif + case ALGO_FUGUE256: + case ALGO_GROESTL: + case ALGO_KECCAK: + case ALGO_BLAKECOIN: + case ALGO_WHIRLCOIN: + SHA256((uchar*)sctx->job.coinbase, sctx->job.coinbase_size, (uchar*)merkle_root); + break; + case ALGO_WHIRLPOOL: + default: + sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + } + + for (i = 0; i < sctx->job.merkle_count; i++) { + memcpy(merkle_root + 32, sctx->job.merkle[i], 32); +#ifdef WITH_HEAVY_ALGO + if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) + heavycoin_hash(merkle_root, merkle_root, 64); + else +#endif + sha256d(merkle_root, merkle_root, 64); + } + + /* Increment extranonce2 */ + for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); + + /* Assemble block header */ + memset(work->data, 0, sizeof(work->data)); + work->data[0] = le32dec(sctx->job.version); + for (i = 0; i < 8; i++) + work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); + + if (opt_algo == ALGO_DECRED) { + uint16_t vote; + for (i = 0; i < 8; i++) // reversed prevhash + work->data[1 + i] = swab32(work->data[1 + i]); + // decred header (coinb1) [merkle...nonce] + memcpy(&work->data[9], sctx->job.coinbase, 108); + // last vote bit should never be changed + memcpy(&vote, &work->data[25], 2); + vote = (opt_vote << 1) | (vote & 1); + memcpy(&work->data[25], &vote, 2); + // extradata + if (sctx->xnonce1_size > sizeof(work->data)-(32*4)) { + // should never happen... + applog(LOG_ERR, "extranonce size overflow!"); + sctx->xnonce1_size = sizeof(work->data)-(32*4); + } + memcpy(&work->data[36], sctx->xnonce1, sctx->xnonce1_size); + work->data[37] = (rand()*4) << 8; // random work data + // block header suffix from coinb2 (stake version) + memcpy(&work->data[44], &sctx->job.coinbase[sctx->job.coinbase_size-4], 4); + sctx->job.height = work->data[32]; + //applog_hex(work->data, 180); + } else if (opt_algo == ALGO_EQUIHASH) { + memcpy(&work->data[9], sctx->job.coinbase, 32+32); // merkle [9..16] + reserved + work->data[25] = le32dec(sctx->job.ntime); + work->data[26] = le32dec(sctx->job.nbits); + memcpy(&work->data[27], sctx->xnonce1, sctx->xnonce1_size & 0x1F); // pool extranonce + work->data[35] = 0x80; + //applog_hex(work->data, 140); + } else if (opt_algo == ALGO_LBRY) { + for (i = 0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + for (i = 0; i < 8; i++) + work->data[17 + i] = ((uint32_t*)sctx->job.extra)[i]; + work->data[25] = le32dec(sctx->job.ntime); + work->data[26] = le32dec(sctx->job.nbits); + work->data[28] = 0x80000000; + } else if (opt_algo == ALGO_PHI2) { + for (i = 0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + work->data[17] = le32dec(sctx->job.ntime); + work->data[18] = le32dec(sctx->job.nbits); + for (i = 0; i < 16; i++) + work->data[20 + i] = ((uint32_t*)sctx->job.extra)[i]; + } else if (opt_algo == ALGO_SIA) { + uint32_t extra = 0; + memcpy(&extra, &sctx->job.coinbase[32], 2); + for (i = 0; i < 8; i++) // reversed hash + work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i]; + work->data[8] = 0; // nonce + work->data[9] = swab32(extra) | ((rand() << 8) & 0xffff); + work->data[10] = be32dec(sctx->job.ntime); + work->data[11] = be32dec(sctx->job.nbits); + memcpy(&work->data[12], sctx->job.coinbase, 32); // merkle_root + work->data[20] = 0x80000000; + if (opt_debug) applog_hex(work->data, 80); 
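/*
 * Illustrative sketch, separate from the patch above: the double-SHA256 merkle
 * folding applied by stratum_gen_work() for most algos.  The coinbase is hashed
 * first, then each 32-byte branch supplied with the stratum job is appended and
 * the 64-byte pair is hashed again.  sha256d() is rebuilt here on OpenSSL's
 * SHA256() so the snippet stands alone; the example_* names are hypothetical.
 */
#include <string.h>
#include <openssl/sha.h>

static void example_sha256d(unsigned char out32[32], const unsigned char *data, size_t len)
{
	unsigned char tmp[SHA256_DIGEST_LENGTH];
	SHA256(data, len, tmp);          /* first pass */
	SHA256(tmp, sizeof(tmp), out32); /* second pass */
}

static void example_merkle_root(unsigned char out32[32],
	const unsigned char *coinbase, size_t coinbase_size,
	const unsigned char (*branches)[32], int merkle_count)
{
	unsigned char buf[64];

	example_sha256d(buf, coinbase, coinbase_size);   /* left half holds the running root */
	for (int i = 0; i < merkle_count; i++) {
		memcpy(buf + 32, branches[i], 32);           /* right half is the next branch */
		example_sha256d(buf, buf, 64);               /* fold it into the root */
	}
	memcpy(out32, buf, 32);
}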
+ } else { + for (i = 0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + work->data[17] = le32dec(sctx->job.ntime); + work->data[18] = le32dec(sctx->job.nbits); + work->data[20] = 0x80000000; + work->data[31] = (opt_algo == ALGO_MJOLLNIR) ? 0x000002A0 : 0x00000280; + } + + if (opt_showdiff || opt_max_diff > 0.) + calc_network_diff(work); + + switch (opt_algo) { + case ALGO_MJOLLNIR: + case ALGO_HEAVY: + case ALGO_ZR5: + for (i = 0; i < 20; i++) + work->data[i] = swab32(work->data[i]); + break; + } + + // HeavyCoin (vote / reward) + if (opt_algo == ALGO_HEAVY) { + work->maxvote = 2048; + uint16_t *ext = (uint16_t*)(&work->data[20]); + ext[0] = opt_vote; + ext[1] = be16dec(sctx->job.nreward); + // applog(LOG_DEBUG, "DEBUG: vote=%hx reward=%hx", ext[0], ext[1]); + } + + pthread_mutex_unlock(&stratum_work_lock); + + if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA) { + uint32_t utm = work->data[17]; + if (opt_algo != ALGO_ZR5) utm = swab32(utm); + char *tm = atime2str(utm - sctx->srvtime_diff); + char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); + applog(LOG_DEBUG, "DEBUG: job_id=%s xnonce2=%s time=%s", + work->job_id, xnonce2str, tm); + free(tm); + free(xnonce2str); + } + + if (opt_difficulty == 0.) + opt_difficulty = 1.; + + switch (opt_algo) { + case ALGO_HMQ1725: + case ALGO_JACKPOT: + case ALGO_JHA: + case ALGO_NEOSCRYPT: + case ALGO_SCRYPT: + case ALGO_SCRYPT_JANE: + work_set_target(work, sctx->job.diff / (65536.0 * opt_difficulty)); + break; + case ALGO_ALLIUM: + case ALGO_DMD_GR: + case ALGO_FRESH: + case ALGO_FUGUE256: + case ALGO_GROESTL: + case ALGO_KECCAKC: + case ALGO_LBRY: + case ALGO_LYRA2v2: + case ALGO_LYRA2v3: + case ALGO_LYRA2Z: + case ALGO_PHI2: + case ALGO_TIMETRAVEL: + case ALGO_BITCORE: + case ALGO_EXOSIS: + case ALGO_X16R: + case ALGO_X16S: + work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty)); + break; + case ALGO_KECCAK: + case ALGO_LYRA2: + work_set_target(work, sctx->job.diff / (128.0 * opt_difficulty)); + break; + case ALGO_EQUIHASH: + equi_work_set_target(work, sctx->job.diff / opt_difficulty); + break; + default: + work_set_target(work, sctx->job.diff / opt_difficulty); + } + + if (stratum_diff != sctx->job.diff) { + char sdiff[32] = { 0 }; + // store for api stats + stratum_diff = sctx->job.diff; + if (opt_showdiff && work->targetdiff != stratum_diff) + snprintf(sdiff, 32, " (%.5f)", work->targetdiff); + applog(LOG_WARNING, "Stratum difficulty set to %g%s", stratum_diff, sdiff); + } + + return true; +} + +void restart_threads(void) +{ + if (opt_debug && !opt_quiet) + applog(LOG_DEBUG,"%s", __FUNCTION__); + + for (int i = 0; i < opt_n_threads && work_restart; i++) + work_restart[i].restart = 1; +} + +static bool wanna_mine(int thr_id) +{ + bool state = true; + bool allow_pool_rotate = (thr_id == 0 && num_pools > 1 && !pool_is_switching); + + if (opt_max_temp > 0.0) { +#ifdef USE_WRAPNVML + struct cgpu_info * cgpu = &thr_info[thr_id].gpu; + float temp = gpu_temp(cgpu); + if (temp > opt_max_temp) { + if (!conditional_state[thr_id] && !opt_quiet) + gpulog(LOG_INFO, thr_id, "temperature too high (%.0f°c), waiting...", temp); + state = false; + } else if (opt_max_temp > 0. && opt_resume_temp > 0. 
&& conditional_state[thr_id] && temp > opt_resume_temp) { + if (!thr_id && opt_debug) + applog(LOG_DEBUG, "temperature did not reach resume value %.1f...", opt_resume_temp); + state = false; + } +#endif + } + // Network Difficulty + if (opt_max_diff > 0.0 && net_diff > opt_max_diff) { + int next = pool_get_first_valid(cur_pooln+1); + if (num_pools > 1 && pools[next].max_diff != pools[cur_pooln].max_diff && opt_resume_diff <= 0.) + conditional_pool_rotate = allow_pool_rotate; + if (!thr_id && !conditional_state[thr_id] && !opt_quiet) + applog(LOG_INFO, "network diff too high, waiting..."); + state = false; + } else if (opt_max_diff > 0. && opt_resume_diff > 0. && conditional_state[thr_id] && net_diff > opt_resume_diff) { + if (!thr_id && opt_debug) + applog(LOG_DEBUG, "network diff did not reach resume value %.3f...", opt_resume_diff); + state = false; + } + // Network hashrate + if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate) { + int next = pool_get_first_valid(cur_pooln+1); + if (pools[next].max_rate != pools[cur_pooln].max_rate && opt_resume_rate <= 0.) + conditional_pool_rotate = allow_pool_rotate; + if (!thr_id && !conditional_state[thr_id] && !opt_quiet) { + char rate[32]; + format_hashrate(opt_max_rate, rate); + applog(LOG_INFO, "network hashrate too high, waiting %s...", rate); + } + state = false; + } else if (opt_max_rate > 0. && opt_resume_rate > 0. && conditional_state[thr_id] && net_hashrate > opt_resume_rate) { + if (!thr_id && opt_debug) + applog(LOG_DEBUG, "network rate did not reach resume value %.3f...", opt_resume_rate); + state = false; + } + conditional_state[thr_id] = (uint8_t) !state; // only one wait message in logs + return state; +} + +static void *miner_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info *)userdata; + int switchn = pool_switch_count; + int thr_id = mythr->id; + int dev_id = device_map[thr_id % MAX_GPUS]; + struct cgpu_info * cgpu = &thr_info[thr_id].gpu; + struct work work; + uint64_t loopcnt = 0; + uint32_t max_nonce; + uint32_t end_nonce = UINT32_MAX / opt_n_threads * (thr_id + 1) - (thr_id + 1); + time_t tm_rate_log = 0; + bool work_done = false; + bool extrajob = false; + char s[16]; + int rc = 0; + + memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized + + if (opt_priority > 0) { + int prio = 2; // default to normal +#ifndef WIN32 + prio = 0; + // note: different behavior on linux (-19 to 19) + switch (opt_priority) { + case 0: + prio = 15; + break; + case 1: + prio = 5; + break; + case 2: + prio = 0; // normal process + break; + case 3: + prio = -1; // above + break; + case 4: + prio = -10; + break; + case 5: + prio = -15; + } + if (opt_debug) + applog(LOG_DEBUG, "Thread %d priority %d (nice %d)", + thr_id, opt_priority, prio); +#endif + setpriority(PRIO_PROCESS, 0, prio); + drop_policy(); + } + + /* Cpu thread affinity */ + if (num_cpus > 1) { + if (opt_affinity == -1L && opt_n_threads > 1) { + if (opt_debug) + applog(LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)", thr_id, + thr_id % num_cpus, (1UL << (thr_id % num_cpus))); + affine_to_cpu_mask(thr_id, 1 << (thr_id % num_cpus)); + } else if (opt_affinity != -1L) { + if (opt_debug) + applog(LOG_DEBUG, "Binding thread %d to cpu mask %lx", thr_id, + (long) opt_affinity); + affine_to_cpu_mask(thr_id, (unsigned long) opt_affinity); + } + } + + gpu_led_off(dev_id); + + while (!abort_flag) { + struct timeval tv_start, tv_end, diff; + unsigned long hashes_done; + uint32_t start_nonce; + uint32_t scan_time = have_longpoll ? 
LP_SCANTIME : opt_scantime; + uint64_t max64, minmax = 0x100000; + int nodata_check_oft = 0; + bool regen = false; + + // &work.data[19] + int wcmplen = (opt_algo == ALGO_DECRED) ? 140 : 76; + int wcmpoft = 0; + + if (opt_algo == ALGO_LBRY) wcmplen = 108; + else if (opt_algo == ALGO_SIA) { + wcmpoft = (32+16)/4; + wcmplen = 32; + } + + uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); + + if (opt_algo == ALGO_WILDKECCAK) { + nonceptr = (uint32_t*) (((char*)work.data) + 1); + wcmpoft = 2; + wcmplen = 32; + } else if (opt_algo == ALGO_CRYPTOLIGHT || opt_algo == ALGO_CRYPTONIGHT) { + nonceptr = (uint32_t*) (((char*)work.data) + 39); + wcmplen = 39; + } else if (opt_algo == ALGO_EQUIHASH) { + nonceptr = &work.data[EQNONCE_OFFSET]; // 27 is pool extranonce (256bits nonce space) + wcmplen = 4+32+32; + } + + if (have_stratum) { + uint32_t sleeptime = 0; + + if (opt_algo == ALGO_DECRED || opt_algo == ALGO_WILDKECCAK /* getjob */) + work_done = true; // force "regen" hash + while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) { + usleep(100*1000); + if (sleeptime > 4) { + extrajob = true; + break; + } + sleeptime++; + } + if (sleeptime && opt_debug && !opt_quiet) + applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100); + //nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); + pthread_mutex_lock(&g_work_lock); + extrajob |= work_done; + + regen = (nonceptr[0] >= end_nonce); + if (opt_algo == ALGO_SIA) { + regen = ((nonceptr[1] & 0xFF00) >= 0xF000); + } + regen = regen || extrajob; + + if (regen) { + work_done = false; + extrajob = false; + if (stratum_gen_work(&stratum, &g_work)) + g_work_time = time(NULL); + if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) + nonceptr[0] += 0x100000; + } + } else { + uint32_t secs = 0; + pthread_mutex_lock(&g_work_lock); + secs = (uint32_t) (time(NULL) - g_work_time); + if (secs >= scan_time || nonceptr[0] >= (end_nonce - 0x100)) { + if (opt_debug && g_work_time && !opt_quiet) + applog(LOG_DEBUG, "work time %u/%us nonce %x/%x", secs, scan_time, nonceptr[0], end_nonce); + /* obtain new work from internal workio thread */ + if (unlikely(!get_work(mythr, &g_work))) { + pthread_mutex_unlock(&g_work_lock); + if (switchn != pool_switch_count) { + switchn = pool_switch_count; + continue; + } else { + applog(LOG_ERR, "work retrieval failed, exiting mining thread %d", mythr->id); + goto out; + } + } + g_work_time = time(NULL); + } + } + + // reset shares id counter on new job + if (strcmp(work.job_id, g_work.job_id)) + stratum.job.shares_count = 0; + + if (!opt_benchmark && (g_work.height != work.height || memcmp(work.target, g_work.target, sizeof(work.target)))) + { + if (opt_debug) { + uint64_t target64 = g_work.target[7] * 0x100000000ULL + g_work.target[6]; + applog(LOG_DEBUG, "job %s target change: %llx (%.1f)", g_work.job_id, target64, g_work.targetdiff); + } + memcpy(work.target, g_work.target, sizeof(work.target)); + work.targetdiff = g_work.targetdiff; + work.height = g_work.height; + //nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr + } + + if (opt_algo == ALGO_ZR5) { + // ignore pok/version header + wcmpoft = 1; + wcmplen -= 4; + } + + if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) { + uint32_t oldpos = nonceptr[0]; + bool nicehash = strstr(pools[cur_pooln].url, "nicehash") != NULL; + if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) { + memcpy(&work, &g_work, sizeof(struct work)); + if (!nicehash) nonceptr[0] = (rand()*4) << 24; + nonceptr[0] &= 0xFF000000u; // 
nicehash prefix hack + nonceptr[0] |= (0x00FFFFFFu / opt_n_threads) * thr_id; + } + // also check the end, nonce in the middle + else if (memcmp(&work.data[44/4], &g_work.data[0], 76-44)) { + memcpy(&work, &g_work, sizeof(struct work)); + } + if (oldpos & 0xFFFF) { + if (!nicehash) nonceptr[0] = oldpos + 0x1000000u; + else { + uint32_t pfx = nonceptr[0] & 0xFF000000u; + nonceptr[0] = pfx | ((oldpos + 0x8000u) & 0xFFFFFFu); + } + } + } + + else if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) { + #if 0 + if (opt_debug) { + for (int n=0; n <= (wcmplen-8); n+=8) { + if (memcmp(work.data + n, g_work.data + n, 8)) { + applog(LOG_DEBUG, "job %s work updated at offset %d:", g_work.job_id, n); + applog_hash((uchar*) &work.data[n]); + applog_compare_hash((uchar*) &g_work.data[n], (uchar*) &work.data[n]); + } + } + } + #endif + memcpy(&work, &g_work, sizeof(struct work)); + nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr + } else + nonceptr[0]++; //?? + + if (opt_algo == ALGO_DECRED) { + // suprnova job_id check without data/target/height change... + if (check_stratum_jobs && strcmp(work.job_id, g_work.job_id)) { + pthread_mutex_unlock(&g_work_lock); + continue; + } + + // use the full range per loop + nonceptr[0] = 0; + end_nonce = UINT32_MAX; + // and make an unique work (extradata) + nonceptr[1] += 1; + nonceptr[2] |= thr_id; + + } else if (opt_algo == ALGO_EQUIHASH) { + nonceptr[1]++; + nonceptr[1] |= thr_id << 24; + //applog_hex(&work.data[27], 32); + } else if (opt_algo == ALGO_WILDKECCAK) { + //nonceptr[1] += 1; + } else if (opt_algo == ALGO_SIA) { + // suprnova job_id check without data/target/height change... + if (have_stratum && strcmp(work.job_id, g_work.job_id)) { + pthread_mutex_unlock(&g_work_lock); + work_done = true; + continue; + } + nonceptr[1] += opt_n_threads; + nonceptr[1] |= thr_id; + // range max + nonceptr[0] = 0; + end_nonce = UINT32_MAX; + } else if (opt_benchmark) { + // randomize work + nonceptr[-1] += 1; + } + + pthread_mutex_unlock(&g_work_lock); + + // --benchmark [-a all] + if (opt_benchmark && bench_algo >= 0) { + //gpulog(LOG_DEBUG, thr_id, "loop %d", loopcnt); + if (loopcnt >= 3) { + if (!bench_algo_switch_next(thr_id) && thr_id == 0) + { + bench_display_results(); + proper_exit(0); + break; + } + loopcnt = 0; + } + } + loopcnt++; + + // prevent gpu scans before a job is received + if (opt_algo == ALGO_SIA) nodata_check_oft = 7; // no stratum version + else if (opt_algo == ALGO_DECRED) nodata_check_oft = 4; // testnet ver is 0 + else nodata_check_oft = 0; + if (have_stratum && work.data[nodata_check_oft] == 0 && !opt_benchmark) { + sleep(1); + if (!thr_id) pools[cur_pooln].wait_time += 1; + gpulog(LOG_DEBUG, thr_id, "no data"); + continue; + } + if (opt_algo == ALGO_WILDKECCAK && !scratchpad_size) { + sleep(1); + if (!thr_id) pools[cur_pooln].wait_time += 1; + continue; + } + + /* conditional mining */ + if (!wanna_mine(thr_id)) + { + // reset default mem offset before idle.. 
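/*
 * Illustrative sketch, separate from the patch above: how the 32-bit nonce space
 * is split between mining threads, mirroring the end_nonce and nonceptr[0]
 * formulas used in miner_thread().  Each thread gets a contiguous slice so two
 * GPUs never scan the same nonce for the same work; the example_* names are
 * hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct example_nonce_range { uint32_t start, end; };

static struct example_nonce_range example_nonce_slice(int thr_id, int n_threads)
{
	struct example_nonce_range r;
	r.start = (UINT32_MAX / n_threads) * thr_id;
	/* same formula as end_nonce above: end of the slice, minus a small guard gap */
	r.end   = UINT32_MAX / n_threads * (thr_id + 1) - (thr_id + 1);
	return r;
}

int main(void)
{
	for (int t = 0; t < 3; t++) {
		struct example_nonce_range r = example_nonce_slice(t, 3);
		printf("thread %d scans %08x..%08x\n", t, (unsigned) r.start, (unsigned) r.end);
	}
	return 0;
}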
+#if defined(WIN32) && defined(USE_WRAPNVML) + if (need_memclockrst) nvapi_toggle_clocks(thr_id, false); +#else + if (need_nvsettings) nvs_reset_clocks(dev_id); +#endif + // free gpu resources + algo_free_all(thr_id); + // clear any free error (algo switch) + cuda_clear_lasterror(); + + // conditional pool switch + if (num_pools > 1 && conditional_pool_rotate) { + if (!pool_is_switching) + pool_switch_next(thr_id); + else if (time(NULL) - firstwork_time > 35) { + if (!opt_quiet) + applog(LOG_WARNING, "Pool switching timed out..."); + if (!thr_id) pools[cur_pooln].wait_time += 1; + pool_is_switching = false; + } + sleep(1); + continue; + } + + pool_on_hold = true; + global_hashrate = 0; + sleep(5); + if (!thr_id) pools[cur_pooln].wait_time += 5; + continue; + } else { + // reapply mem offset if needed +#if defined(WIN32) && defined(USE_WRAPNVML) + if (need_memclockrst) nvapi_toggle_clocks(thr_id, true); +#else + if (need_nvsettings) nvs_set_clocks(dev_id); +#endif + } + + pool_on_hold = false; + + work_restart[thr_id].restart = 0; + + /* adjust max_nonce to meet target scan time */ + if (have_stratum) + max64 = LP_SCANTIME; + else + max64 = max(1, (int64_t) scan_time + g_work_time - time(NULL)); + + /* time limit */ + if (opt_time_limit > 0 && firstwork_time) { + int passed = (int)(time(NULL) - firstwork_time); + int remain = (int)(opt_time_limit - passed); + if (remain < 0) { + if (thr_id != 0) { + sleep(1); continue; + } + if (num_pools > 1 && pools[cur_pooln].time_limit > 0) { + if (!pool_is_switching) { + if (!opt_quiet) + applog(LOG_INFO, "Pool mining timeout of %ds reached, rotate...", opt_time_limit); + pool_switch_next(thr_id); + } else if (passed > 35) { + // ensure we dont stay locked if pool_is_switching is not reset... + applog(LOG_WARNING, "Pool switch to %d timed out...", cur_pooln); + if (!thr_id) pools[cur_pooln].wait_time += 1; + pool_is_switching = false; + } + sleep(1); + continue; + } + app_exit_code = EXIT_CODE_TIME_LIMIT; + abort_flag = true; + if (opt_benchmark) { + char rate[32]; + format_hashrate((double)global_hashrate, rate); + applog(LOG_NOTICE, "Benchmark: %s", rate); + usleep(200*1000); + fprintf(stderr, "%llu\n", (long long unsigned int) global_hashrate); + } else { + applog(LOG_NOTICE, "Mining timeout of %ds reached, exiting...", opt_time_limit); + } + workio_abort(); + break; + } + if (remain < max64) max64 = remain; + } + + /* shares limit */ + if (opt_shares_limit > 0 && firstwork_time) { + int64_t shares = (pools[cur_pooln].accepted_count + pools[cur_pooln].rejected_count); + if (shares >= opt_shares_limit) { + int passed = (int)(time(NULL) - firstwork_time); + if (thr_id != 0) { + sleep(1); continue; + } + if (num_pools > 1 && pools[cur_pooln].shares_limit > 0) { + if (!pool_is_switching) { + if (!opt_quiet) + applog(LOG_INFO, "Pool shares limit of %d reached, rotate...", opt_shares_limit); + pool_switch_next(thr_id); + } else if (passed > 35) { + // ensure we dont stay locked if pool_is_switching is not reset... 
+ applog(LOG_WARNING, "Pool switch to %d timed out...", cur_pooln); + if (!thr_id) pools[cur_pooln].wait_time += 1; + pool_is_switching = false; + } + sleep(1); + continue; + } + abort_flag = true; + app_exit_code = EXIT_CODE_OK; + applog(LOG_NOTICE, "Mining limit of %d shares reached, exiting...", opt_shares_limit); + workio_abort(); + break; + } + } + + max64 *= (uint32_t)thr_hashrates[thr_id]; + + /* on start, max64 should not be 0, + * before hashrate is computed */ + if (max64 < minmax) { + switch (opt_algo) { + case ALGO_BLAKECOIN: + case ALGO_BLAKE2S: + case ALGO_VANILLA: + minmax = 0x80000000U; + break; + case ALGO_BLAKE: + case ALGO_BMW: + case ALGO_DECRED: + case ALGO_SHA256D: + case ALGO_SHA256T: + case ALGO_SHA256Q: + //case ALGO_WHIRLPOOLX: + minmax = 0x40000000U; + break; + case ALGO_BLAKE2B: + case ALGO_KECCAK: + case ALGO_KECCAKC: + case ALGO_LBRY: + case ALGO_LUFFA: + case ALGO_SIA: + case ALGO_SKEIN: + case ALGO_SKEIN2: + case ALGO_TRIBUS: + minmax = 0x1000000; + break; + case ALGO_ALLIUM: + case ALGO_C11: + case ALGO_DEEP: + case ALGO_HEAVY: + case ALGO_JACKPOT: + case ALGO_JHA: + case ALGO_HSR: + case ALGO_LYRA2v2: + case ALGO_LYRA2v3: + case ALGO_PHI: + case ALGO_PHI2: + case ALGO_POLYTIMOS: + case ALGO_S3: + case ALGO_SKUNK: + case ALGO_TIMETRAVEL: + case ALGO_BITCORE: + case ALGO_EXOSIS: + case ALGO_X11EVO: + case ALGO_X11: + case ALGO_X12: + case ALGO_X13: + case ALGO_WHIRLCOIN: + case ALGO_WHIRLPOOL: + minmax = 0x400000; + break; + case ALGO_X14: + case ALGO_X15: + minmax = 0x300000; + break; + case ALGO_LYRA2: + case ALGO_LYRA2Z: + case ALGO_NEOSCRYPT: + case ALGO_SIB: + case ALGO_SCRYPT: + case ALGO_SONOA: + case ALGO_VELTOR: + minmax = 0x80000; + break; + case ALGO_CRYPTOLIGHT: + case ALGO_CRYPTONIGHT: + case ALGO_SCRYPT_JANE: + minmax = 0x1000; + break; + } + max64 = max(minmax-1, max64); + } + + // we can't scan more than uint32 capacity + max64 = min(UINT32_MAX, max64); + + start_nonce = nonceptr[0]; + + /* never let small ranges at end */ + if (end_nonce >= UINT32_MAX - 256) + end_nonce = UINT32_MAX; + + if ((max64 + start_nonce) >= end_nonce) + max_nonce = end_nonce; + else + max_nonce = (uint32_t) (max64 + start_nonce); + + // todo: keep it rounded to a multiple of 256 ? 
+ + if (unlikely(start_nonce > max_nonce)) { + // should not happen but seen in skein2 benchmark with 2 gpus + max_nonce = end_nonce = UINT32_MAX; + } + + work.scanned_from = start_nonce; + + gpulog(LOG_DEBUG, thr_id, "start=%08x end=%08x range=%08x", + start_nonce, max_nonce, (max_nonce-start_nonce)); + + if (opt_led_mode == LED_MODE_MINING) + gpu_led_on(dev_id); + + if (cgpu && loopcnt > 1) { + cgpu->monitor.sampling_flag = true; + pthread_cond_signal(&cgpu->monitor.sampling_signal); + } + + hashes_done = 0; + gettimeofday(&tv_start, NULL); + + // check (and reset) previous errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess && !opt_quiet) + gpulog(LOG_WARNING, thr_id, "%s", cudaGetErrorString(err)); + + work.valid_nonces = 0; + + /* scan nonces for a proof-of-work hash */ + switch (opt_algo) { + + case ALGO_ALLIUM: + rc = scanhash_allium(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_BASTION: + rc = scanhash_bastion(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_BLAKECOIN: + rc = scanhash_blake256(thr_id, &work, max_nonce, &hashes_done, 8); + break; + case ALGO_BLAKE: + rc = scanhash_blake256(thr_id, &work, max_nonce, &hashes_done, 14); + break; + case ALGO_BLAKE2B: + rc = scanhash_blake2b(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_BLAKE2S: + rc = scanhash_blake2s(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_BMW: + rc = scanhash_bmw(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_C11: + rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_CRYPTOLIGHT: + rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1); + break; + case ALGO_MONERO: + case ALGO_STELLITE: + case ALGO_GRAFT: + case ALGO_CRYPTONIGHT: + { + int cn_variant = 0; + if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork) + cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1; + rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant); + break; + } + case ALGO_DECRED: + rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_DEEP: + rc = scanhash_deep(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_EQUIHASH: + rc = scanhash_equihash(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_FRESH: + rc = scanhash_fresh(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_FUGUE256: + rc = scanhash_fugue256(thr_id, &work, max_nonce, &hashes_done); + break; + + case ALGO_GROESTL: + case ALGO_DMD_GR: + rc = scanhash_groestlcoin(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_MYR_GR: + rc = scanhash_myriad(thr_id, &work, max_nonce, &hashes_done); + break; + + case ALGO_HMQ1725: + rc = scanhash_hmq17(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_HSR: + rc = scanhash_hsr(thr_id, &work, max_nonce, &hashes_done); + break; +#ifdef WITH_HEAVY_ALGO + case ALGO_HEAVY: + rc = scanhash_heavy(thr_id, &work, max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); + break; + case ALGO_MJOLLNIR: + rc = scanhash_heavy(thr_id, &work, max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ); + break; +#endif + case ALGO_KECCAK: + case ALGO_KECCAKC: + rc = scanhash_keccak256(thr_id, &work, max_nonce, &hashes_done); + break; + + case ALGO_JACKPOT: + rc = scanhash_jackpot(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_JHA: + rc = scanhash_jha(thr_id, &work, max_nonce, &hashes_done); + break; + + case ALGO_LBRY: + rc = scanhash_lbry(thr_id, &work, max_nonce, 
&hashes_done); + break; + case ALGO_LUFFA: + rc = scanhash_luffa(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_QUARK: + rc = scanhash_quark(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_QUBIT: + rc = scanhash_qubit(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_LYRA2: + rc = scanhash_lyra2(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_LYRA2v2: + rc = scanhash_lyra2v2(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_LYRA2v3: + rc = scanhash_lyra2v3(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_LYRA2Z: + rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_NEOSCRYPT: + rc = scanhash_neoscrypt(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_NIST5: + rc = scanhash_nist5(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_PENTABLAKE: + rc = scanhash_pentablake(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_PHI: + rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_PHI2: + rc = scanhash_phi2(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_POLYTIMOS: + rc = scanhash_polytimos(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_SCRYPT: + rc = scanhash_scrypt(thr_id, &work, max_nonce, &hashes_done, + NULL, &tv_start, &tv_end); + break; + case ALGO_SCRYPT_JANE: + rc = scanhash_scrypt_jane(thr_id, &work, max_nonce, &hashes_done, + NULL, &tv_start, &tv_end); + break; + case ALGO_SKEIN: + rc = scanhash_skeincoin(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_SKEIN2: + rc = scanhash_skein2(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_SKUNK: + rc = scanhash_skunk(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_SHA256D: + rc = scanhash_sha256d(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_SHA256T: + rc = scanhash_sha256t(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_SHA256Q: + rc = scanhash_sha256q(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_SIA: + rc = scanhash_sia(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_SIB: + rc = scanhash_sib(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_SONOA: + rc = scanhash_sonoa(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_S3: + rc = scanhash_s3(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_VANILLA: + rc = scanhash_vanilla(thr_id, &work, max_nonce, &hashes_done, 8); + break; + case ALGO_VELTOR: + rc = scanhash_veltor(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_WHIRLCOIN: + case ALGO_WHIRLPOOL: + rc = scanhash_whirl(thr_id, &work, max_nonce, &hashes_done); + break; + //case ALGO_WHIRLPOOLX: + // rc = scanhash_whirlx(thr_id, &work, max_nonce, &hashes_done); + // break; + case ALGO_WILDKECCAK: + rc = scanhash_wildkeccak(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_TIMETRAVEL: + rc = scanhash_timetravel(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_TRIBUS: + rc = scanhash_tribus(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_BITCORE: + rc = scanhash_bitcore(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_EXOSIS: + rc = scanhash_exosis(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X11EVO: + rc = scanhash_x11evo(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X11: + rc = scanhash_x11(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X12: + rc = scanhash_x12(thr_id, &work, max_nonce, 
&hashes_done); + break; + case ALGO_X13: + rc = scanhash_x13(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X14: + rc = scanhash_x14(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X15: + rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X16R: + rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X16S: + rc = scanhash_x16s(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X17: + rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_ZR5: + rc = scanhash_zr5(thr_id, &work, max_nonce, &hashes_done); + break; + + default: + /* should never happen */ + goto out; + } + + if (opt_led_mode == LED_MODE_MINING) + gpu_led_off(dev_id); + + if (abort_flag) + break; // time to leave the mining loop... + + if (work_restart[thr_id].restart) + continue; + + /* record scanhash elapsed time */ + gettimeofday(&tv_end, NULL); + + switch (opt_algo) { + // algos to migrate to replace pdata[21] by work.nonces[] + case ALGO_HEAVY: + case ALGO_SCRYPT: + case ALGO_SCRYPT_JANE: + //case ALGO_WHIRLPOOLX: + work.nonces[0] = nonceptr[0]; + work.nonces[1] = nonceptr[2]; + } + + if (stratum.rpc2 && (rc == -EBUSY || work_restart[thr_id].restart)) { + // bbr scratchpad download or stale result + sleep(1); + if (!thr_id) pools[cur_pooln].wait_time += 1; + continue; + } + + if (rc > 0 && opt_debug) + applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", work.nonces[0], swab32(work.nonces[0])); + if (rc > 1 && opt_debug) + applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", work.nonces[1], swab32(work.nonces[1])); + + timeval_subtract(&diff, &tv_end, &tv_start); + + if (cgpu && diff.tv_sec) { // stop monitoring + cgpu->monitor.sampling_flag = false; + } + + if (diff.tv_usec || diff.tv_sec) { + double dtime = (double) diff.tv_sec + 1e-6 * diff.tv_usec; + + /* hashrate factors for some algos */ + double rate_factor = 1.0; + switch (opt_algo) { + case ALGO_JACKPOT: + case ALGO_QUARK: + // to stay comparable to other ccminer forks or pools + rate_factor = 0.5; + break; + } + + /* store thread hashrate */ + if (dtime > 0.0) { + pthread_mutex_lock(&stats_lock); + thr_hashrates[thr_id] = hashes_done / dtime; + thr_hashrates[thr_id] *= rate_factor; + if (loopcnt > 2) // ignore first (init time) + stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height); + pthread_mutex_unlock(&stats_lock); + } + } + + if (rc > 0) + work.scanned_to = work.nonces[0]; + if (rc > 1) + work.scanned_to = max(work.nonces[0], work.nonces[1]); + else { + work.scanned_to = max_nonce; + if (opt_debug && opt_benchmark) { + // to debug nonce ranges + gpulog(LOG_DEBUG, thr_id, "ends=%08x range=%08x", nonceptr[0], (nonceptr[0] - start_nonce)); + } + // prevent low scan ranges on next loop on fast algos (blake) + if (nonceptr[0] > UINT32_MAX - 64) + nonceptr[0] = UINT32_MAX; + } + + // only required to debug purpose + if (opt_debug && check_dups && opt_algo != ALGO_DECRED && opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA) + hashlog_remember_scan_range(&work); + + /* output */ + if (!opt_quiet && loopcnt > 1 && (time(NULL) - tm_rate_log) > opt_maxlograte) { + format_hashrate(thr_hashrates[thr_id], s); + gpulog(LOG_INFO, thr_id, "%s, %s", device_name[dev_id], s); + tm_rate_log = time(NULL); + } + + /* ignore first loop hashrate */ + if (firstwork_time && thr_id == (opt_n_threads - 1)) { + double hashrate = 0.; + pthread_mutex_lock(&stats_lock); + for (int i = 0; i < opt_n_threads && 
thr_hashrates[i]; i++) + hashrate += stats_get_speed(i, thr_hashrates[i]); + pthread_mutex_unlock(&stats_lock); + if (opt_benchmark && bench_algo == -1 && loopcnt > 2) { + format_hashrate(hashrate, s); + applog(LOG_NOTICE, "Total: %s", s); + } + + // since pool start + pools[cur_pooln].work_time = (uint32_t) (time(NULL) - firstwork_time); + + // X-Mining-Hashrate + global_hashrate = llround(hashrate); + } + + if (firstwork_time == 0) + firstwork_time = time(NULL); + + if (cgpu) cgpu->accepted += work.valid_nonces; + + /* if nonce found, submit work */ + if (rc > 0 && !opt_benchmark) { + uint32_t curnonce = nonceptr[0]; // current scan position + + if (opt_led_mode == LED_MODE_SHARES) + gpu_led_percent(dev_id, 50); + + work.submit_nonce_id = 0; + nonceptr[0] = work.nonces[0]; + if (!submit_work(mythr, &work)) + break; + nonceptr[0] = curnonce; + + // prevent stale work in solo + // we can't submit twice a block! + if (!have_stratum && !have_longpoll) { + pthread_mutex_lock(&g_work_lock); + // will force getwork + g_work_time = 0; + pthread_mutex_unlock(&g_work_lock); + continue; + } + + // second nonce found, submit too (on pool only!) + if (rc > 1 && work.nonces[1]) { + work.submit_nonce_id = 1; + nonceptr[0] = work.nonces[1]; + if (opt_algo == ALGO_ZR5) { + work.data[0] = work.data[22]; // pok + work.data[22] = 0; + } + if (!submit_work(mythr, &work)) + break; + nonceptr[0] = curnonce; + work.nonces[1] = 0; // reset + } + } + } + +out: + if (opt_led_mode) + gpu_led_off(dev_id); + if (opt_debug_threads) + applog(LOG_DEBUG, "%s() died", __func__); + tq_freeze(mythr->q); + return NULL; +} + +static void *longpoll_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info *)userdata; + struct pool_infos *pool; + CURL *curl = NULL; + char *hdr_path = NULL, *lp_url = NULL; + const char *rpc_req = json_rpc_getwork; + bool need_slash = false; + int pooln, switchn; + + curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "%s() CURL init failed", __func__); + goto out; + } + +wait_lp_url: + hdr_path = (char*)tq_pop(mythr->q, NULL); // wait /LP url + if (!hdr_path) + goto out; + + if (!(pools[cur_pooln].type & POOL_STRATUM)) { + pooln = cur_pooln; + pool = &pools[pooln]; + } else { + // hack... + have_stratum = true; + } + + // to detect pool switch during loop + switchn = pool_switch_count; + + if (opt_algo == ALGO_SIA) { + goto out; + } + + /* full URL */ + else if (strstr(hdr_path, "://")) { + lp_url = hdr_path; + hdr_path = NULL; + } + /* absolute path, on current server */ + else { + char *copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; + if (rpc_url[strlen(rpc_url) - 1] != '/') + need_slash = true; + + lp_url = (char*)malloc(strlen(rpc_url) + strlen(copy_start) + 2); + if (!lp_url) + goto out; + + sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? 
"/" : "", copy_start); + } + + if (!pool_is_switching) + applog(LOG_BLUE, "Long-polling on %s", lp_url); + + pool_is_switching = false; + + pool->type |= POOL_LONGPOLL; + +longpoll_retry: + + while (!abort_flag) { + json_t *val = NULL, *soval; + int err = 0; + + if (opt_debug_threads) + applog(LOG_DEBUG, "longpoll %d: %d count %d %d, switching=%d, have_stratum=%d", + pooln, cur_pooln, switchn, pool_switch_count, pool_is_switching, have_stratum); + + // exit on pool switch + if (switchn != pool_switch_count) + goto need_reinit; + + if (opt_algo == ALGO_SIA) { + char *sia_header = sia_getheader(curl, pool); + if (sia_header) { + pthread_mutex_lock(&g_work_lock); + if (sia_work_decode(sia_header, &g_work)) { + g_work_time = time(NULL); + } + free(sia_header); + pthread_mutex_unlock(&g_work_lock); + } + continue; + } + + val = json_rpc_longpoll(curl, lp_url, pool, rpc_req, &err); + if (have_stratum || switchn != pool_switch_count) { + if (val) + json_decref(val); + goto need_reinit; + } + if (likely(val)) { + soval = json_object_get(json_object_get(val, "result"), "submitold"); + submit_old = soval ? json_is_true(soval) : false; + pthread_mutex_lock(&g_work_lock); + if (work_decode(json_object_get(val, "result"), &g_work)) { + restart_threads(); + if (!opt_quiet) { + char netinfo[64] = { 0 }; + if (net_diff > 0.) { + sprintf(netinfo, ", diff %.3f", net_diff); + } + if (opt_showdiff) { + sprintf(&netinfo[strlen(netinfo)], ", target %.3f", g_work.targetdiff); + } + if (g_work.height) + applog(LOG_BLUE, "%s block %u%s", algo_names[opt_algo], g_work.height, netinfo); + else + applog(LOG_BLUE, "%s detected new block%s", short_url, netinfo); + } + g_work_time = time(NULL); + } + pthread_mutex_unlock(&g_work_lock); + json_decref(val); + } else { + // to check... 
+ g_work_time = 0; + if (err != CURLE_OPERATION_TIMEDOUT) { + if (opt_debug_threads) applog(LOG_DEBUG, "%s() err %d, retry in %s seconds", + __func__, err, opt_fail_pause); + sleep(opt_fail_pause); + goto longpoll_retry; + } + } + } + +out: + have_longpoll = false; + if (opt_debug_threads) + applog(LOG_DEBUG, "%s() died", __func__); + + free(hdr_path); + free(lp_url); + tq_freeze(mythr->q); + if (curl) + curl_easy_cleanup(curl); + + return NULL; + +need_reinit: + /* this thread should not die to allow pool switch */ + have_longpoll = false; + if (opt_debug_threads) + applog(LOG_DEBUG, "%s() reinit...", __func__); + if (hdr_path) free(hdr_path); hdr_path = NULL; + if (lp_url) free(lp_url); lp_url = NULL; + goto wait_lp_url; +} + +static bool stratum_handle_response(char *buf) +{ + json_t *val, *err_val, *res_val, *id_val; + json_error_t err; + struct timeval tv_answer, diff; + int num = 0, job_nonce_id = 0; + double sharediff = stratum.sharediff; + bool ret = false; + + val = JSON_LOADS(buf, &err); + if (!val) { + applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + id_val = json_object_get(val, "id"); + + if (!id_val || json_is_null(id_val)) + goto out; + + // ignore late login answers + num = (int) json_integer_value(id_val); + if (num < 4) + goto out; + + // We dont have the work anymore, so use the hashlog to get the right sharediff for multiple nonces + job_nonce_id = num - 10; + if (opt_showdiff && check_dups) + sharediff = hashlog_get_sharediff(g_work.job_id, job_nonce_id, sharediff); + + gettimeofday(&tv_answer, NULL); + timeval_subtract(&diff, &tv_answer, &stratum.tv_submit); + // store time required to the pool to answer to a submit + stratum.answer_msec = (1000 * diff.tv_sec) + (uint32_t) (0.001 * diff.tv_usec); + + if (stratum.rpc2) { + const char* reject_reason = err_val ? json_string_value(json_object_get(err_val, "message")) : NULL; + // {"id":10,"jsonrpc":"2.0","error":null,"result":{"status":"OK"}} + share_result(json_is_null(err_val), stratum.pooln, sharediff, reject_reason); + if (reject_reason) { + g_work_time = 0; + restart_threads(); + } + } else { + if (!res_val) + goto out; + share_result(json_is_true(res_val), stratum.pooln, sharediff, + err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); + } + + ret = true; +out: + if (val) + json_decref(val); + + return ret; +} + +static void *stratum_thread(void *userdata) +{ + struct thr_info *mythr = (struct thr_info *)userdata; + struct pool_infos *pool; + stratum_ctx *ctx = &stratum; + int pooln, switchn; + char *s; + +wait_stratum_url: + stratum.url = (char*)tq_pop(mythr->q, NULL); + if (!stratum.url) + goto out; + + if (!pool_is_switching) + applog(LOG_BLUE, "Starting on %s", stratum.url); + + ctx->pooln = pooln = cur_pooln; + switchn = pool_switch_count; + pool = &pools[pooln]; + + pool_is_switching = false; + stratum_need_reset = false; + + while (!abort_flag) { + int failures = 0; + + if (stratum_need_reset) { + stratum_need_reset = false; + if (stratum.url) + stratum_disconnect(&stratum); + else + stratum.url = strdup(pool->url); // may be useless + } + + while (!stratum.curl && !abort_flag) { + pthread_mutex_lock(&g_work_lock); + g_work_time = 0; + g_work.data[0] = 0; + pthread_mutex_unlock(&g_work_lock); + restart_threads(); + + if (!stratum_connect(&stratum, pool->url) || + !stratum_subscribe(&stratum) || + !stratum_authorize(&stratum, pool->user, pool->pass)) + { + stratum_disconnect(&stratum); + if (opt_retries >= 0 && ++failures > opt_retries) { + if (num_pools > 1 && opt_pool_failover) { + applog(LOG_WARNING, "Stratum connect timeout, failover..."); + pool_switch_next(-1); + } else { + applog(LOG_ERR, "...terminating workio thread"); + //tq_push(thr_info[work_thr_id].q, NULL); + workio_abort(); + proper_exit(EXIT_CODE_POOL_TIMEOUT); + goto out; + } + } + if (switchn != pool_switch_count) + goto pool_switched; + if (!opt_benchmark) + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } + } + + if (stratum.rpc2) { + rpc2_stratum_thread_stuff(pool); + } + + if (switchn != pool_switch_count) goto pool_switched; + + if (stratum.job.job_id && + (!g_work_time || strncmp(stratum.job.job_id, g_work.job_id + 8, sizeof(g_work.job_id)-8))) { + pthread_mutex_lock(&g_work_lock); + if (stratum_gen_work(&stratum, &g_work)) + g_work_time = time(NULL); + if (stratum.job.clean) { + static uint32_t last_block_height; + if ((!opt_quiet || !firstwork_time) && stratum.job.height != last_block_height) { + last_block_height = stratum.job.height; + if (net_diff > 0.) 
+ applog(LOG_BLUE, "%s block %d, diff %.3f", algo_names[opt_algo], + stratum.job.height, net_diff); + else + applog(LOG_BLUE, "%s %s block %d", pool->short_url, algo_names[opt_algo], + stratum.job.height); + } + restart_threads(); + if (check_dups || opt_showdiff) + hashlog_purge_old(); + stats_purge_old(); + } else if (opt_debug && !opt_quiet) { + applog(LOG_BLUE, "%s asks job %d for block %d", pool->short_url, + strtoul(stratum.job.job_id, NULL, 16), stratum.job.height); + } + pthread_mutex_unlock(&g_work_lock); + } + + // check we are on the right pool + if (switchn != pool_switch_count) goto pool_switched; + + if (!stratum_socket_full(&stratum, opt_timeout)) { + if (opt_debug) + applog(LOG_WARNING, "Stratum connection timed out"); + s = NULL; + } else + s = stratum_recv_line(&stratum); + + // double check we are on the right pool + if (switchn != pool_switch_count) goto pool_switched; + + if (!s) { + stratum_disconnect(&stratum); + if (!opt_quiet && !pool_on_hold) + applog(LOG_WARNING, "Stratum connection interrupted"); + continue; + } + if (!stratum_handle_method(&stratum, s)) + stratum_handle_response(s); + free(s); + } + +out: + if (opt_debug_threads) + applog(LOG_DEBUG, "%s() died", __func__); + + return NULL; + +pool_switched: + /* this thread should not die on pool switch */ + stratum_disconnect(&(pools[pooln].stratum)); + if (stratum.url) free(stratum.url); stratum.url = NULL; + if (opt_debug_threads) + applog(LOG_DEBUG, "%s() reinit...", __func__); + goto wait_stratum_url; +} + +static void show_version_and_exit(void) +{ + printf("%s v%s\n" +#ifdef WIN32 + "pthreads static %s\n" +#endif + "%s\n", + PACKAGE_NAME, PACKAGE_VERSION, +#ifdef WIN32 + PTW32_VERSION_STRING, +#endif + curl_version()); + proper_exit(EXIT_CODE_OK); +} + +static void show_usage_and_exit(int status) +{ + if (status) + fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); + else + printf(usage); + + if (opt_algo == ALGO_SCRYPT || opt_algo == ALGO_SCRYPT_JANE) { + printf(scrypt_usage); + } + else if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) { + printf(xmr_usage); + } + else if (opt_algo == ALGO_WILDKECCAK) { + printf(bbr_usage); + } + proper_exit(status); +} + +void parse_arg(int key, char *arg) +{ + char *p = arg; + int v, i; + uint64_t ul; + double d; + + switch(key) { + case 'a': /* --algo */ + p = strstr(arg, ":"); // optional factor + if (p) *p = '\0'; + + i = algo_to_int(arg); + if (i >= 0) + opt_algo = (enum sha_algos)i; + else { + applog(LOG_ERR, "Unknown algo parameter '%s'", arg); + show_usage_and_exit(1); + } + + if (p) { + opt_nfactor = atoi(p + 1); + if (opt_algo == ALGO_SCRYPT_JANE) { + free(jane_params); + jane_params = strdup(p+1); + } + } + if (!opt_nfactor) { + switch (opt_algo) { + case ALGO_SCRYPT: opt_nfactor = 9; break; + case ALGO_SCRYPT_JANE: opt_nfactor = 14; break; + } + } + + // cryptonight variants + switch (opt_algo) { + case ALGO_MONERO: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 7; + break; + case ALGO_GRAFT: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 8; + break; + case ALGO_STELLITE: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 3; + break; + case ALGO_CRYPTONIGHT: + cryptonight_fork = 1; + break; + } + + break; + case 'b': + p = strstr(arg, ":"); + if (p) { + /* ip:port */ + if (p - arg > 0) { + free(opt_api_bind); + opt_api_bind = strdup(arg); + opt_api_bind[p - arg] = '\0'; + } + opt_api_port = atoi(p + 1); + } + else if (arg && strstr(arg, ".")) { + /* ip only */ + free(opt_api_bind); + opt_api_bind = 
strdup(arg); + } + else if (arg) { + /* port or 0 to disable */ + opt_api_port = atoi(arg); + } + break; + case 1030: /* --api-remote */ + if (opt_api_allow) free(opt_api_allow); + opt_api_allow = strdup("0/0"); + break; + case 1031: /* --api-allow */ + // --api-allow 0/0 means opened to all, so assume -b 0.0.0.0 + if (!strcmp(arg, "0/0") && !strcmp(opt_api_bind, "127.0.0.1")) + parse_arg('b', (char*)"0.0.0.0"); + if (opt_api_allow) free(opt_api_allow); + opt_api_allow = strdup(arg); + break; + case 1032: /* --api-groups */ + if (opt_api_groups) free(opt_api_groups); + opt_api_groups = strdup(arg); + break; + case 1033: /* --api-mcast */ + opt_api_mcast = true; + break; + case 1034: /* --api-mcast-addr */ + free(opt_api_mcast_addr); + opt_api_mcast_addr = strdup(arg); + case 1035: /* --api-mcast-code */ + free(opt_api_mcast_code); + opt_api_mcast_code = strdup(arg); + break; + case 1036: /* --api-mcast-des */ + free(opt_api_mcast_des); + opt_api_mcast_des = strdup(arg); + break; + case 1037: /* --api-mcast-port */ + v = atoi(arg); + if (v < 1 || v > 65535) // sanity check + show_usage_and_exit(1); + opt_api_mcast_port = v; + break; + case 'B': + opt_background = true; + break; + case 'c': { + json_error_t err; + if (opt_config) { + json_decref(opt_config); + opt_config = NULL; + } + if (arg && strstr(arg, "://")) { + opt_config = json_load_url(arg, &err); + } else { + opt_config = JSON_LOADF(arg, &err); + } + if (!json_is_object(opt_config)) { + applog(LOG_ERR, "JSON decode of %s failed", arg); + proper_exit(EXIT_CODE_USAGE); + } + break; + } + case 'k': + opt_scratchpad_url = strdup(arg); + break; + case 'i': + d = atof(arg); + v = (uint32_t) d; + if (v < 0 || v > 31) + show_usage_and_exit(1); + { + int n = 0; + int ngpus = cuda_num_devices(); + uint32_t last = 0; + char * pch = strtok(arg,","); + while (pch != NULL) { + d = atof(pch); + v = (uint32_t) d; + if (v > 7) { /* 0 = default */ + if ((d - v) > 0.0) { + uint32_t adds = (uint32_t)floor((d - v) * (1 << (v - 8))) * 256; + gpus_intensity[n] = (1 << v) + adds; + applog(LOG_INFO, "Adding %u threads to intensity %u, %u cuda threads", + adds, v, gpus_intensity[n]); + } + else if (gpus_intensity[n] != (1 << v)) { + gpus_intensity[n] = (1 << v); + } + } + last = gpus_intensity[n]; + n++; + pch = strtok(NULL, ","); + } + while (n < MAX_GPUS) + gpus_intensity[n++] = last; + } + break; + case 'D': + opt_debug = true; + break; + case 'N': + v = atoi(arg); + if (v < 1) + opt_statsavg = INT_MAX; + opt_statsavg = v; + break; + case 'n': /* --ndevs */ + // to get gpu vendors... 
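/*
 * Illustrative sketch, separate from the patch above: how a fractional -i
 * intensity becomes a CUDA thread count in parse_arg().  The integer part v
 * selects 2^v threads; the fractional part adds frac * 2^v more, rounded down
 * to a multiple of 256 exactly as the (1 << (v - 8)) * 256 expression does.
 * The example_* name is hypothetical.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t example_intensity_to_threads(double intensity)
{
	uint32_t v = (uint32_t) intensity;       /* integer part */
	uint32_t threads;

	if (v < 8 || v > 31)
		return 0;                            /* the parser above keeps the default for 0..7 */

	threads = 1u << v;
	if (intensity - v > 0.0) {
		uint32_t adds = (uint32_t) floor((intensity - v) * (1 << (v - 8))) * 256;
		threads += adds;
	}
	return threads;
}

int main(void)
{
	/* -i 20.5 -> 2^20 + 2^19 = 1572864 threads */
	printf("%u\n", example_intensity_to_threads(20.5));
	return 0;
}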
+ #ifdef USE_WRAPNVML + hnvml = nvml_create(); + #ifdef WIN32 + nvapi_init(); + cuda_devicenames(); // req for leds + nvapi_init_settings(); + #endif + #endif + cuda_print_devices(); + proper_exit(EXIT_CODE_OK); + break; + case 'q': + opt_quiet = true; + break; + case 'p': + free(rpc_pass); + rpc_pass = strdup(arg); + pool_set_creds(cur_pooln); + break; + case 'P': + opt_protocol = true; + break; + case 'r': + v = atoi(arg); + if (v < -1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_retries = v; + break; + case 'R': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_fail_pause = v; + break; + case 's': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_scantime = v; + break; + case 'T': + v = atoi(arg); + if (v < 1 || v > 99999) /* sanity check */ + show_usage_and_exit(1); + opt_timeout = v; + break; + case 't': + v = atoi(arg); + if (v < 0 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_n_threads = v; + break; + case 1022: // --vote + v = atoi(arg); + if (v < 0 || v > 8192) /* sanity check */ + show_usage_and_exit(1); + opt_vote = (uint16_t)v; + break; + case 1023: // --trust-pool + opt_trust_pool = true; + break; + case 'u': + free(rpc_user); + rpc_user = strdup(arg); + pool_set_creds(cur_pooln); + break; + case 'o': /* --url */ + if (pools[cur_pooln].type != POOL_UNUSED) { + // rotate pool pointer + cur_pooln = (cur_pooln + 1) % MAX_POOLS; + num_pools = max(cur_pooln+1, num_pools); + // change some defaults if multi pools + if (opt_retries == -1) opt_retries = 1; + if (opt_fail_pause == 30) opt_fail_pause = 5; + if (opt_timeout == 300) opt_timeout = 60; + } + p = strstr(arg, "://"); + if (p) { + if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && + strncasecmp(arg, "stratum+tcp://", 14)) + show_usage_and_exit(1); + free(rpc_url); + rpc_url = strdup(arg); + short_url = &rpc_url[(p - arg) + 3]; + } else { + if (!strlen(arg) || *arg == '/') + show_usage_and_exit(1); + free(rpc_url); + rpc_url = (char*)malloc(strlen(arg) + 8); + sprintf(rpc_url, "http://%s", arg); + short_url = &rpc_url[7]; + } + p = strrchr(rpc_url, '@'); + if (p) { + char *sp, *ap; + *p = '\0'; + ap = strstr(rpc_url, "://") + 3; + sp = strchr(ap, ':'); + if (sp && sp < p) { + free(rpc_user); + rpc_user = (char*)calloc(sp - ap + 1, 1); + strncpy(rpc_user, ap, sp - ap); + free(rpc_pass); + rpc_pass = strdup(sp + 1); + } else { + free(rpc_user); + rpc_user = strdup(ap); + } + // remove user[:pass]@ from rpc_url + memmove(ap, p + 1, strlen(p + 1) + 1); + // host:port only + short_url = ap; + } + have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); + pool_set_creds(cur_pooln); + break; + case 'O': /* --userpass */ + p = strchr(arg, ':'); + if (!p) + show_usage_and_exit(1); + free(rpc_user); + rpc_user = (char*)calloc(p - arg + 1, 1); + strncpy(rpc_user, arg, p - arg); + free(rpc_pass); + rpc_pass = strdup(p + 1); + pool_set_creds(cur_pooln); + break; + case 'x': /* --proxy */ + if (!strncasecmp(arg, "socks4://", 9)) + opt_proxy_type = CURLPROXY_SOCKS4; + else if (!strncasecmp(arg, "socks5://", 9)) + opt_proxy_type = CURLPROXY_SOCKS5; +#if LIBCURL_VERSION_NUM >= 0x071200 + else if (!strncasecmp(arg, "socks4a://", 10)) + opt_proxy_type = CURLPROXY_SOCKS4A; + else if (!strncasecmp(arg, "socks5h://", 10)) + opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; +#endif + else + opt_proxy_type = CURLPROXY_HTTP; + free(opt_proxy); + opt_proxy = strdup(arg); + pool_set_creds(cur_pooln); + 
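/*
 * Illustrative sketch, separate from the patch above: stripping embedded
 * credentials from a -o URL the way the 'o' case does it — locate the last '@'
 * after the scheme, split an optional "user:pass", then move the host:port part
 * over the credentials so the stored URL no longer contains them.  The
 * example_* name, buffers and sample URL are hypothetical.
 */
#include <stdio.h>
#include <string.h>

static void example_split_pool_url(char *url, char *user, size_t usz, char *pass, size_t psz)
{
	char *host = strstr(url, "://");
	char *at, *colon;

	user[0] = pass[0] = '\0';
	host = host ? host + 3 : url;        /* start of the user/host part */
	at = strrchr(url, '@');              /* rightmost '@', as in the code above */
	if (!at)
		return;
	*at = '\0';
	colon = strchr(host, ':');
	if (colon && colon < at) {
		snprintf(user, usz, "%.*s", (int)(colon - host), host);
		snprintf(pass, psz, "%s", colon + 1);
	} else {
		snprintf(user, usz, "%s", host);
	}
	memmove(host, at + 1, strlen(at + 1) + 1);   /* drop "user[:pass]@" in place */
}

int main(void)
{
	char url[] = "stratum+tcp://worker.1:x@pool.example.com:3333";
	char user[64], pass[64];
	example_split_pool_url(url, user, sizeof user, pass, sizeof pass);
	printf("url=%s user=%s pass=%s\n", url, user, pass);   /* url keeps scheme://host:port */
	return 0;
}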
break; + case 1001: + free(opt_cert); + opt_cert = strdup(arg); + break; + case 1002: + use_colors = false; + break; + case 1004: + opt_autotune = false; + break; + case 'l': /* --launch-config */ + { + char *last = NULL, *pch = strtok(arg,","); + int n = 0; + while (pch != NULL) { + device_config[n++] = last = strdup(pch); + pch = strtok(NULL, ","); + } + while (n < MAX_GPUS) + device_config[n++] = last; + } + break; + case 'L': /* scrypt --lookup-gap */ + { + char *pch = strtok(arg,","); + int n = 0, last = atoi(arg); + while (pch != NULL) { + device_lookup_gap[n++] = last = atoi(pch); + pch = strtok(NULL, ","); + } + while (n < MAX_GPUS) + device_lookup_gap[n++] = last; + } + break; + case 1050: /* scrypt --interactive */ + { + char *pch = strtok(arg,","); + int n = 0, last = atoi(arg); + while (pch != NULL) { + device_interactive[n++] = last = atoi(pch); + pch = strtok(NULL, ","); + } + while (n < MAX_GPUS) + device_interactive[n++] = last; + } + break; + case 1051: /* scrypt --texture-cache */ + { + char *pch = strtok(arg,","); + int n = 0, last = atoi(arg); + while (pch != NULL) { + device_texturecache[n++] = last = atoi(pch); + pch = strtok(NULL, ","); + } + while (n < MAX_GPUS) + device_texturecache[n++] = last; + } + break; + case 1055: /* cryptonight --bfactor */ + { + char *pch = strtok(arg, ","); + int n = 0, last = atoi(arg); + while (pch != NULL) { + last = atoi(pch); + if (last > 15) last = 15; + device_bfactor[n++] = last; + pch = strtok(NULL, ","); + } + while (n < MAX_GPUS) + device_bfactor[n++] = last; + } + break; + case 1070: /* --gpu-clock */ + { + char *pch = strtok(arg,","); + int n = 0; + while (pch != NULL && n < MAX_GPUS) { + int dev_id = device_map[n++]; + device_gpu_clocks[dev_id] = atoi(pch); + pch = strtok(NULL, ","); + } + } + break; + case 1071: /* --mem-clock */ + { + char *pch = strtok(arg,","); + int n = 0; + while (pch != NULL && n < MAX_GPUS) { + int dev_id = device_map[n++]; + if (*pch == '+' || *pch == '-') + device_mem_offsets[dev_id] = atoi(pch); + else + device_mem_clocks[dev_id] = atoi(pch); + need_nvsettings = true; + pch = strtok(NULL, ","); + } + } + break; + case 1072: /* --pstate */ + { + char *pch = strtok(arg,","); + int n = 0; + while (pch != NULL && n < MAX_GPUS) { + int dev_id = device_map[n++]; + device_pstate[dev_id] = (int8_t) atoi(pch); + pch = strtok(NULL, ","); + } + } + break; + case 1073: /* --plimit */ + { + char *pch = strtok(arg,","); + int n = 0; + while (pch != NULL && n < MAX_GPUS) { + int dev_id = device_map[n++]; + device_plimit[dev_id] = atoi(pch); + pch = strtok(NULL, ","); + } + } + break; + case 1074: /* --keep-clocks */ + opt_keep_clocks = true; + break; + case 1075: /* --tlimit */ + { + char *pch = strtok(arg,","); + int n = 0; + while (pch != NULL && n < MAX_GPUS) { + int dev_id = device_map[n++]; + device_tlimit[dev_id] = (uint8_t) atoi(pch); + pch = strtok(NULL, ","); + } + } + break; + case 1080: /* --led */ + { + if (!opt_led_mode) + opt_led_mode = LED_MODE_SHARES; + char *pch = strtok(arg,","); + int n = 0, lastval, val; + while (pch != NULL && n < MAX_GPUS) { + int dev_id = device_map[n++]; + char * p = strstr(pch, "0x"); + val = p ? 
(int32_t) strtoul(p, NULL, 16) : atoi(pch); + if (!val && !strcmp(pch, "mining")) + opt_led_mode = LED_MODE_MINING; + else if (device_led[dev_id] == -1) + device_led[dev_id] = lastval = val; + pch = strtok(NULL, ","); + } + if (lastval) while (n < MAX_GPUS) { + device_led[n++] = lastval; + } + } + break; + case 1005: + opt_benchmark = true; + want_longpoll = false; + want_stratum = false; + have_stratum = false; + break; + case 1006: + print_hash_tests(); + proper_exit(EXIT_CODE_OK); + break; + case 1003: + want_longpoll = false; + break; + case 1007: + want_stratum = false; + opt_extranonce = false; + break; + case 1008: + opt_time_limit = atoi(arg); + break; + case 1009: + opt_shares_limit = atoi(arg); + break; + case 1011: + allow_gbt = false; + break; + case 1012: + opt_extranonce = false; + break; + case 1013: + opt_showdiff = true; + break; + case 1014: + opt_showdiff = false; + break; + case 1015: + opt_submit_stale = true; + break; + case 'S': + case 1018: + applog(LOG_INFO, "Now logging to syslog..."); + use_syslog = true; + if (arg && strlen(arg)) { + free(opt_syslog_pfx); + opt_syslog_pfx = strdup(arg); + } + break; + case 1019: // max-log-rate + opt_maxlograte = atoi(arg); + break; + case 1020: + p = strstr(arg, "0x"); + ul = p ? strtoul(p, NULL, 16) : atol(arg); + if (ul > (1UL<<num_cpus)-1) + ul = -1L; + opt_affinity = ul; + break; + case 1021: + v = atoi(arg); + if (v < 0 || v > 5) /* sanity check */ + show_usage_and_exit(1); + opt_priority = v; + break; + case 1025: // cuda-schedule + opt_cudaschedule = atoi(arg); + break; + case 1060: // max-temp + d = atof(arg); + opt_max_temp = d; + break; + case 1061: // max-diff + d = atof(arg); + opt_max_diff = d; + break; + case 1062: // max-rate + d = atof(arg); + p = strstr(arg, "K"); + if (p) d *= 1e3; + p = strstr(arg, "M"); + if (p) d *= 1e6; + p = strstr(arg, "G"); + if (p) d *= 1e9; + opt_max_rate = d; + break; + case 1063: // resume-diff + d = atof(arg); + opt_resume_diff = d; + break; + case 1064: // resume-rate + d = atof(arg); + p = strstr(arg, "K"); + if (p) d *= 1e3; + p = strstr(arg, "M"); + if (p) d *= 1e6; + p = strstr(arg, "G"); + if (p) d *= 1e9; + opt_resume_rate = d; + break; + case 1065: // resume-temp + d = atof(arg); + opt_resume_temp = d; + break; + case 'd': // --device + { + int device_thr[MAX_GPUS] = { 0 }; + int ngpus = cuda_num_devices(); + char* pch = strtok(arg,","); + opt_n_threads = 0; + while (pch != NULL && opt_n_threads < MAX_GPUS) { + if (pch[0] >= '0' && pch[0] <= '9' && strlen(pch) <= 2) + { + if (atoi(pch) < ngpus) + device_map[opt_n_threads++] = atoi(pch); + else { + applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); + proper_exit(EXIT_CODE_CUDA_NODEVICE); + } + } else { + int device = cuda_finddevice(pch); + if (device >= 0 && device < ngpus) + device_map[opt_n_threads++] = device; + else { + applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); + proper_exit(EXIT_CODE_CUDA_NODEVICE); + } + } + pch = strtok (NULL, ","); + } + // count threads per gpu + for (int n=0; n < opt_n_threads; n++) { + int device = device_map[n]; + device_thr[device]++; + } + for (int n=0; n < ngpus; n++) { + gpu_threads = max(gpu_threads, device_thr[n]); + } + } + break; + + case 'f': // --diff-factor + d = atof(arg); + if (d <= 0.) + show_usage_and_exit(1); + opt_difficulty = d; + break; + case 'm': // --diff-multiplier + d = atof(arg); + if (d <= 0.)
+ show_usage_and_exit(1); + opt_difficulty = 1.0/d; + break; + + /* PER POOL CONFIG OPTIONS */ + + case 1100: /* pool name */ + pool_set_attr(cur_pooln, "name", arg); + break; + case 1101: /* pool algo */ + pool_set_attr(cur_pooln, "algo", arg); + break; + case 1102: /* pool scantime */ + pool_set_attr(cur_pooln, "scantime", arg); + break; + case 1108: /* pool time-limit */ + pool_set_attr(cur_pooln, "time-limit", arg); + break; + case 1109: /* pool shares-limit (1.7.6) */ + pool_set_attr(cur_pooln, "shares-limit", arg); + break; + case 1161: /* pool max-diff */ + pool_set_attr(cur_pooln, "max-diff", arg); + break; + case 1162: /* pool max-rate */ + pool_set_attr(cur_pooln, "max-rate", arg); + break; + case 1199: + pool_set_attr(cur_pooln, "disabled", arg); + break; + + case 'V': + show_version_and_exit(); + case 'h': + show_usage_and_exit(0); + default: + show_usage_and_exit(1); + } + + if (use_syslog) + use_colors = false; +} + +void parse_config(json_t* json_obj) +{ + int i; + json_t *val; + + if (!json_is_object(json_obj)) + return; + + for (i = 0; i < ARRAY_SIZE(options); i++) { + + if (!options[i].name) + break; + + if (!strcasecmp(options[i].name, "config")) + continue; + + val = json_object_get(json_obj, options[i].name); + if (!val) + continue; + + if (options[i].has_arg && json_is_string(val)) { + char *s = strdup(json_string_value(val)); + if (!s) + continue; + parse_arg(options[i].val, s); + free(s); + } + else if (options[i].has_arg && json_is_integer(val)) { + char buf[16]; + sprintf(buf, "%d", (int) json_integer_value(val)); + parse_arg(options[i].val, buf); + } + else if (options[i].has_arg && json_is_real(val)) { + char buf[16]; + sprintf(buf, "%f", json_real_value(val)); + parse_arg(options[i].val, buf); + } + else if (!options[i].has_arg) { + if (json_is_true(val)) + parse_arg(options[i].val, (char*) ""); + } + else + applog(LOG_ERR, "JSON option %s invalid", + options[i].name); + } + + val = json_object_get(json_obj, "pools"); + if (val && json_typeof(val) == JSON_ARRAY) { + parse_pool_array(val); + } +} + +static void parse_cmdline(int argc, char *argv[]) +{ + int key; + + while (1) { +#if HAVE_GETOPT_LONG + key = getopt_long(argc, argv, short_options, options, NULL); +#else + key = getopt(argc, argv, short_options); +#endif + if (key < 0) + break; + + parse_arg(key, optarg); + } + if (optind < argc) { + fprintf(stderr, "%s: unsupported non-option argument '%s' (see --help)\n", + argv[0], argv[optind]); + //show_usage_and_exit(1); + } + + parse_config(opt_config); + + if (opt_algo == ALGO_HEAVY && opt_vote == 9999 && !opt_benchmark) { + fprintf(stderr, "%s: Heavycoin hash requires block reward vote parameter (see --vote)\n", + argv[0]); + show_usage_and_exit(1); + } + + if (opt_vote == 9999) { + opt_vote = 0; // default, don't vote + } +} + +static void parse_single_opt(int opt, int argc, char *argv[]) +{ + int key, prev = optind; + while (1) { +#if HAVE_GETOPT_LONG + key = getopt_long(argc, argv, short_options, options, NULL); +#else + key = getopt(argc, argv, short_options); +#endif + if (key < 0) + break; + if (key == opt /* || key == 'c'*/) + parse_arg(key, optarg); + } + //todo with a filter: parse_config(opt_config); + + optind = prev; // reset argv index +} + +#ifndef WIN32 +static void signal_handler(int sig) +{ + switch (sig) { + case SIGHUP: + applog(LOG_INFO, "SIGHUP received"); + break; + case SIGINT: + signal(sig, SIG_IGN); + applog(LOG_INFO, "SIGINT received, exiting"); + proper_exit(EXIT_CODE_KILLED); + break; + case SIGTERM: + applog(LOG_INFO, "SIGTERM 
received, exiting"); + proper_exit(EXIT_CODE_KILLED); + break; + } +} +#else +BOOL WINAPI ConsoleHandler(DWORD dwType) +{ + switch (dwType) { + case CTRL_C_EVENT: + applog(LOG_INFO, "CTRL_C_EVENT received, exiting"); + proper_exit(EXIT_CODE_KILLED); + break; + case CTRL_BREAK_EVENT: + applog(LOG_INFO, "CTRL_BREAK_EVENT received, exiting"); + proper_exit(EXIT_CODE_KILLED); + break; + case CTRL_LOGOFF_EVENT: + applog(LOG_INFO, "CTRL_LOGOFF_EVENT received, exiting"); + proper_exit(EXIT_CODE_KILLED); + break; + case CTRL_SHUTDOWN_EVENT: + applog(LOG_INFO, "CTRL_SHUTDOWN_EVENT received, exiting"); + proper_exit(EXIT_CODE_KILLED); + break; + default: + return false; + } + return true; +} +#endif + +int main(int argc, char *argv[]) +{ + struct thr_info *thr; + long flags; + int i; + + // get opt_quiet early + parse_single_opt('q', argc, argv); + + printf("*** ccminer " PACKAGE_VERSION " for nVidia GPUs by tpruvot@github ***\n"); + if (!opt_quiet) { + const char* arch = is_x64() ? "64-bits" : "32-bits"; +#ifdef _MSC_VER + printf(" Built with VC++ %d and nVidia CUDA SDK %d.%d %s\n\n", msver(), +#else + printf(" Built with the nVidia CUDA Toolkit %d.%d %s\n\n", +#endif + CUDART_VERSION/1000, (CUDART_VERSION % 1000)/10, arch); + printf(" Originally based on Christian Buchner and Christian H. project\n"); + printf(" Include some kernels from alexis78, djm34, djEzo, tsiv and krnlx.\n\n"); + printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n"); + } + + rpc_user = strdup(""); + rpc_pass = strdup(""); + rpc_url = strdup(""); + jane_params = strdup(""); + + pthread_mutex_init(&applog_lock, NULL); + pthread_mutex_init(&stratum_sock_lock, NULL); + pthread_mutex_init(&stratum_work_lock, NULL); + pthread_mutex_init(&stats_lock, NULL); + pthread_mutex_init(&g_work_lock, NULL); + + // number of cpus for thread affinity +#if defined(WIN32) + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + num_cpus = sysinfo.dwNumberOfProcessors; +#elif defined(_SC_NPROCESSORS_CONF) + num_cpus = sysconf(_SC_NPROCESSORS_CONF); +#elif defined(CTL_HW) && defined(HW_NCPU) + int req[] = { CTL_HW, HW_NCPU }; + size_t len = sizeof(num_cpus); + sysctl(req, 2, &num_cpus, &len, NULL, 0); +#else + num_cpus = 1; +#endif + if (num_cpus < 1) + num_cpus = 1; + + // number of gpus + active_gpus = cuda_num_devices(); + + for (i = 0; i < MAX_GPUS; i++) { + device_map[i] = i % active_gpus; + device_name[i] = NULL; + device_config[i] = NULL; + device_backoff[i] = is_windows() ? 12 : 2; + device_bfactor[i] = is_windows() ? 11 : 0; + device_lookup_gap[i] = 1; + device_batchsize[i] = 1024; + device_interactive[i] = -1; + device_texturecache[i] = -1; + device_singlememory[i] = -1; + device_pstate[i] = -1; + device_led[i] = -1; + } + + cuda_devicenames(); + + /* parse command line */ + parse_cmdline(argc, argv); + + if (!opt_benchmark && !strlen(rpc_url)) { + // try default config file (user then binary folder) + char defconfig[MAX_PATH] = { 0 }; + get_defconfig_path(defconfig, MAX_PATH, argv[0]); + if (strlen(defconfig)) { + if (opt_debug) + applog(LOG_DEBUG, "Using config %s", defconfig); + parse_arg('c', defconfig); + parse_cmdline(argc, argv); + } + } + + if (!strlen(rpc_url)) { + if (!opt_benchmark) { + fprintf(stderr, "%s: no URL supplied\n", argv[0]); + show_usage_and_exit(1); + } + // ensure a pool is set with default params... + pool_set_creds(0); + } + + /* init stratum data.. 
*/ + memset(&stratum.url, 0, sizeof(stratum)); + + // ensure default params are set + pool_init_defaults(); + + if (opt_debug) + pool_dump_infos(); + cur_pooln = pool_get_first_valid(0); + pool_switch(-1, cur_pooln); + + if (opt_algo == ALGO_DECRED || opt_algo == ALGO_SIA) { + allow_gbt = false; + allow_mininginfo = false; + } + + if (opt_algo == ALGO_EQUIHASH) { + opt_extranonce = false; // disable subscribe + } + + if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) { + rpc2_init(); + if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0"); + } + + if (opt_algo == ALGO_WILDKECCAK) { + rpc2_init(); + if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0"); + GetScratchpad(); + } + + flags = !opt_benchmark && strncmp(rpc_url, "https:", 6) + ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) + : CURL_GLOBAL_ALL; + if (curl_global_init(flags)) { + applog(LOG_ERR, "CURL initialization failed"); + return EXIT_CODE_SW_INIT_ERROR; + } + + if (opt_background) { +#ifndef WIN32 + i = fork(); + if (i < 0) proper_exit(EXIT_CODE_SW_INIT_ERROR); + if (i > 0) proper_exit(EXIT_CODE_OK); + i = setsid(); + if (i < 0) + applog(LOG_ERR, "setsid() failed (errno = %d)", errno); + i = chdir("/"); + if (i < 0) + applog(LOG_ERR, "chdir() failed (errno = %d)", errno); + signal(SIGHUP, signal_handler); + signal(SIGTERM, signal_handler); +#else + HWND hcon = GetConsoleWindow(); + if (hcon) { + // this method also hide parent command line window + ShowWindow(hcon, SW_HIDE); + } else { + HANDLE h = GetStdHandle(STD_OUTPUT_HANDLE); + CloseHandle(h); + FreeConsole(); + } +#endif + } + +#ifndef WIN32 + /* Always catch Ctrl+C */ + signal(SIGINT, signal_handler); +#else + SetConsoleCtrlHandler((PHANDLER_ROUTINE)ConsoleHandler, TRUE); + if (opt_priority > 0) { + DWORD prio = NORMAL_PRIORITY_CLASS; + switch (opt_priority) { + case 1: + prio = BELOW_NORMAL_PRIORITY_CLASS; + break; + case 2: + prio = NORMAL_PRIORITY_CLASS; + break; + case 3: + prio = ABOVE_NORMAL_PRIORITY_CLASS; + break; + case 4: + prio = HIGH_PRIORITY_CLASS; + break; + case 5: + prio = REALTIME_PRIORITY_CLASS; + } + SetPriorityClass(GetCurrentProcess(), prio); + } + // Prevent windows to sleep while mining + SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED); + // Enable windows high precision timer + timeBeginPeriod(1); +#endif + if (opt_affinity != -1) { + if (!opt_quiet) + applog(LOG_DEBUG, "Binding process to cpu mask %x", opt_affinity); + affine_to_cpu_mask(-1, (unsigned long)opt_affinity); + } + if (active_gpus == 0) { + applog(LOG_ERR, "No CUDA devices found! terminating."); + exit(1); + } + if (!opt_n_threads) + opt_n_threads = active_gpus; + else if (active_gpus > opt_n_threads) + active_gpus = opt_n_threads; + + // generally doesn't work well... 
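For the --cpu-affinity handling above, the whole mask is handed to affine_to_cpu_mask(), where each set bit allows the process to run on that CPU. A minimal sketch of how such a mask reads; the 0x5 value is only an example.

#include <stdio.h>

int main(void)
{
	unsigned long mask = 0x5;                 /* e.g. --cpu-affinity 0x5 */
	for (int cpu = 0; cpu < 8; cpu++)
		if (mask & (1UL << cpu))
			printf("process may run on CPU %d\n", cpu);  /* CPUs 0 and 2 */
	return 0;
}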
+ gpu_threads = max(gpu_threads, opt_n_threads / active_gpus); + + if (opt_benchmark && opt_algo == ALGO_AUTO) { + bench_init(opt_n_threads); + for (int n=0; n < MAX_GPUS; n++) { + gpus_intensity[n] = 0; // use default + } + opt_autotune = false; + } + +#ifdef HAVE_SYSLOG_H + if (use_syslog) + openlog(opt_syslog_pfx, LOG_PID, LOG_USER); +#endif + + work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); + if (!work_restart) + return EXIT_CODE_SW_INIT_ERROR; + + thr_info = (struct thr_info *)calloc(opt_n_threads + 5, sizeof(*thr)); + if (!thr_info) + return EXIT_CODE_SW_INIT_ERROR; + + /* longpoll thread */ + longpoll_thr_id = opt_n_threads + 1; + thr = &thr_info[longpoll_thr_id]; + thr->id = longpoll_thr_id; + thr->q = tq_new(); + if (!thr->q) + return EXIT_CODE_SW_INIT_ERROR; + + /* always start the longpoll thread (will wait a tq_push on workio /LP) */ + if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { + applog(LOG_ERR, "longpoll thread create failed"); + return EXIT_CODE_SW_INIT_ERROR; + } + + /* stratum thread */ + stratum_thr_id = opt_n_threads + 2; + thr = &thr_info[stratum_thr_id]; + thr->id = stratum_thr_id; + thr->q = tq_new(); + if (!thr->q) + return EXIT_CODE_SW_INIT_ERROR; + + /* always start the stratum thread (will wait a tq_push) */ + if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { + applog(LOG_ERR, "stratum thread create failed"); + return EXIT_CODE_SW_INIT_ERROR; + } + + /* init workio thread */ + work_thr_id = opt_n_threads; + thr = &thr_info[work_thr_id]; + thr->id = work_thr_id; + thr->q = tq_new(); + if (!thr->q) + return EXIT_CODE_SW_INIT_ERROR; + + if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { + applog(LOG_ERR, "workio thread create failed"); + return EXIT_CODE_SW_INIT_ERROR; + } + + /* real start of the stratum work */ + if (want_stratum && have_stratum) { + tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); + } + +#ifdef __linux__ + if (need_nvsettings) { + if (nvs_init() < 0) + need_nvsettings = false; + } +#endif + +#ifdef USE_WRAPNVML +#if defined(__linux__) || defined(_WIN64) + /* nvml is currently not the best choice on Windows (only in x64) */ + hnvml = nvml_create(); + if (hnvml) { + bool gpu_reinit = (opt_cudaschedule >= 0); //false + cuda_devicenames(); // refresh gpu vendor name + if (!opt_quiet) + applog(LOG_INFO, "NVML GPU monitoring enabled."); + for (int n=0; n < active_gpus; n++) { + if (nvml_set_pstate(hnvml, device_map[n]) == 1) + gpu_reinit = true; + if (nvml_set_plimit(hnvml, device_map[n]) == 1) + gpu_reinit = true; + if (!is_windows() && nvml_set_clocks(hnvml, device_map[n]) == 1) + gpu_reinit = true; + if (gpu_reinit) { + cuda_reset_device(n, NULL); + } + } + } +#endif +#ifdef WIN32 + if (nvapi_init() == 0) { + if (!opt_quiet) + applog(LOG_INFO, "NVAPI GPU monitoring enabled."); + if (!hnvml) { + cuda_devicenames(); // refresh gpu vendor name + } + nvapi_init_settings(); + } +#endif + else if (!hnvml && !opt_quiet) + applog(LOG_INFO, "GPU monitoring is not available."); + + // force reinit to set default device flags + if (opt_cudaschedule >= 0 && !hnvml) { + for (int n=0; n < active_gpus; n++) { + cuda_reset_device(n, NULL); + } + } +#endif + + if (opt_api_port) { + /* api thread */ + api_thr_id = opt_n_threads + 3; + thr = &thr_info[api_thr_id]; + thr->id = api_thr_id; + thr->q = tq_new(); + if (!thr->q) + return EXIT_CODE_SW_INIT_ERROR; + + /* start stratum thread */ + if (unlikely(pthread_create(&thr->pth, NULL, api_thread, thr))) { + applog(LOG_ERR, 
"api thread create failed"); + return EXIT_CODE_SW_INIT_ERROR; + } + } + +#ifdef USE_WRAPNVML + // to monitor gpu activitity during work, a thread is required + if (1) { + monitor_thr_id = opt_n_threads + 4; + thr = &thr_info[monitor_thr_id]; + thr->id = monitor_thr_id; + thr->q = tq_new(); + if (!thr->q) + return EXIT_CODE_SW_INIT_ERROR; + if (unlikely(pthread_create(&thr->pth, NULL, monitor_thread, thr))) { + applog(LOG_ERR, "Monitoring thread %d create failed", i); + return EXIT_CODE_SW_INIT_ERROR; + } + } +#endif + + /* start mining threads */ + for (i = 0; i < opt_n_threads; i++) { + thr = &thr_info[i]; + + thr->id = i; + thr->gpu.thr_id = i; + thr->gpu.gpu_id = (uint8_t) device_map[i]; + thr->gpu.gpu_arch = (uint16_t) device_sm[device_map[i]]; + thr->q = tq_new(); + if (!thr->q) + return EXIT_CODE_SW_INIT_ERROR; + + pthread_mutex_init(&thr->gpu.monitor.lock, NULL); + pthread_cond_init(&thr->gpu.monitor.sampling_signal, NULL); + + if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { + applog(LOG_ERR, "thread %d create failed", i); + return EXIT_CODE_SW_INIT_ERROR; + } + } + + applog(LOG_INFO, "%d miner thread%s started, " + "using '%s' algorithm.", + opt_n_threads, opt_n_threads > 1 ? "s":"", + algo_names[opt_algo]); + + /* main loop - simply wait for workio thread to exit */ + pthread_join(thr_info[work_thr_id].pth, NULL); + + abort_flag = true; + + /* wait for mining threads */ + for (i = 0; i < opt_n_threads; i++) { + struct cgpu_info *cgpu = &thr_info[i].gpu; + if (monitor_thr_id != -1 && cgpu) { + pthread_cond_signal(&cgpu->monitor.sampling_signal); + } + pthread_join(thr_info[i].pth, NULL); + } + + if (monitor_thr_id != -1) { + pthread_join(thr_info[monitor_thr_id].pth, NULL); + //tq_free(thr_info[monitor_thr_id].q); + } + + if (opt_debug) + applog(LOG_DEBUG, "workio thread dead, exiting."); + + proper_exit(EXIT_CODE_OK); + return 0; +} diff --git a/ccminer.sln b/ccminer.sln index a28278f2db..783caa5ce2 100644 --- a/ccminer.sln +++ b/ccminer.sln @@ -1,6 +1,7 @@ - -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual Studio 2010 +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2013 +VisualStudioVersion = 12.0.30723.0 +MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ccminer", "ccminer.vcxproj", "{36DC07F9-A4A6-4877-A146-1B960083CF6F}" EndProject Global diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 93621c4e84..26c9cd1ee4 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -1,517 +1,642 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - - - {36DC07F9-A4A6-4877-A146-1B960083CF6F} - ccminer - - - - Application - true - MultiByte - - - Application - true - MultiByte - - - Application - false - true - MultiByte - - - Application - false - true - MultiByte - - - - - - - - - - - - - - - - - - - - true - - - true - - - true - - - true - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0\include;..\OpenSSL-Win32\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) - - - true - Console - 
cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MTd.lib;ssleay32MTd.lib;%(AdditionalDependencies) - ..\pthreads\Pre-built.2\lib\x86;..\curl-7.29.0\build\lib\Debug;..\OpenSSL-Win32\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) - - - echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - - true - - - 80 - - - true - true - compute_35,sm_35 - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0_x64\include;..\OpenSSL-Win64\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) - - - true - Console - cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MTd.lib;ssleay32MTd.lib;%(AdditionalDependencies) - ..\pthreads\Pre-built.2\lib\x64;..\curl-7.29.0_x64\build\lib\Debug;..\OpenSSL-Win64\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) - - - echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - - true - - - 80 - - - true - true - compute_35,sm_35 - - - - - - - Level3 - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0\include;..\OpenSSL-Win32\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) - - - true - true - true - Console - cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MT.lib;ssleay32MT.lib;%(AdditionalDependencies) - ..\pthreads\Pre-built.2\lib\x86;..\curl-7.29.0\build\lib\Release;..\OpenSSL-Win32\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) - - - echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - - true - - - 80 - - - true - true - compute_35,sm_35 - - - - - - - Level3 - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) - .;compat;compat\jansson;compat\getopt;..\pthreads\Pre-built.2\include;..\curl-7.29.0_x64\include;..\OpenSSL-Win64\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) - - - true - true - true - Console - cudart.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.lib;ws2_32.lib;Wldap32.lib;libeay32MT.lib;ssleay32MT.lib;%(AdditionalDependencies) - ..\pthreads\Pre-built.2\lib\x64;..\curl-7.29.0_x64\build\lib\Release;..\OpenSSL-Win64\lib\VC\static;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) - - - echo copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" -copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - - true - - - 80 - - - true - true 
- compute_35,sm_35 - - - - - - - - - - - - - - - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - - - - - - - - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - - - - - - - - - - - - - - - - - - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - /TP %(AdditionalOptions) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=yes -v" %(AdditionalOptions) - -Xptxas "-abi=yes -v" %(AdditionalOptions) - -Xptxas "-abi=yes -v" %(AdditionalOptions) - -Xptxas "-abi=yes -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) 
- - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=yes -v" %(AdditionalOptions) - -Xptxas "-abi=yes -v" %(AdditionalOptions) - -Xptxas "-abi=yes -v" %(AdditionalOptions) - -Xptxas "-abi=yes -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - true - true - true - true - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - 128 - 128 - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - true - true - true - true - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - -Xptxas "-abi=no -v" %(AdditionalOptions) - - - - - - - \ No newline at end of file + + + + + Release + Win32 + + + Debug + Win32 + + + Release + x64 + + + Debug + x64 + + + + {36DC07F9-A4A6-4877-A146-1B960083CF6F} + ccminer + + + + Application + MultiByte + v120 + false + false + + 
+ false + true + + + true + + + + + + + + + + + + + + + + + + + + Level3 + Disabled + MultiThreadedDebugDLL + true + WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + true + true + true + + + true + Console + cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;pthreadVC2.lib;libcurl.x86.lib;openssl.x86.lib;zlib.x86.lib;ws2_32.lib;Wldap32.lib;nvapi.lib;%(AdditionalDependencies) + libcmt.lib;msvcrt.lib + compat\pthreads\x86;compat\curl-for-windows\out\x86\Release\lib;compat\nvapi\x86;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + /NODEFAULTLIB:LIBCMT %(AdditionalOptions) + true + + + false + true + 80 + true + true + compute_50,sm_50 + $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 + + + + + Level3 + Disabled + MultiThreadedDebugDLL + true + WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + 8Bytes + true + true + true + + + true + Console + cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;pthreadVC2.lib;libcurl.x64.lib;openssl.x64.lib;zlib.x64.lib;ws2_32.lib;Wldap32.lib;nvapi64.lib;%(AdditionalDependencies) + libcmt.lib + compat\pthreads\x64;compat\curl-for-windows\out\x64\Release\lib;compat\nvapi\amd64;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + /NODEFAULTLIB:LIBCMTD %(AdditionalOptions) + true + + + false + true + 80 + true + true + compute_52,sm_52 + $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 + 64 + + + false + + + + + Level3 + MaxSpeed + MultiThreaded + Speed + StreamingSIMDExtensions2 + false + true + true + WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + true + true + true + true + + + false + true + true + Console + cudart_static.lib;cuda.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.x86.lib;openssl.x86.lib;zlib.x86.lib;ws2_32.lib;Wldap32.lib;nvapi.lib;%(AdditionalDependencies) + libcmt.lib + compat\pthreads\x86;compat\curl-for-windows\out\x86\Release\lib;compat\nvapi\x86;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + /NODEFAULTLIB:LIBCMT %(AdditionalOptions) + false + false + true + + + false + 80 + true + true + compute_50,sm_50;compute_52,sm_52;compute_30,sm_30 + --ptxas-options="-O2" --Wno-deprecated-gpu-targets %(AdditionalOptions) + O2 + + + false + O3 + + + + + Level3 + MaxSpeed + MultiThreaded + Speed + + + true + false + true + true + 
WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) + true + true + + + false + true + true + Console + kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;pthreadVC2.lib;libcurl.x64.lib;openssl.x64.lib;zlib.x64.lib;ws2_32.lib;Wldap32.lib;cudart_static.lib;cuda.lib;nvapi64.lib;%(AdditionalDependencies) + libcmt.lib + compat\pthreads\x64;compat\curl-for-windows\out\x64\Release\lib;compat\nvapi\amd64;%(AdditionalLibraryDirectories);$(CudaToolkitLibDir) + /NODEFAULTLIB:LIBCMT %(AdditionalOptions) + false + true + + + false + 80 + true + true + compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30 + $(NVTOOLSEXT_PATH)\include + O3 + 64 + --Wno-deprecated-gpu-targets %(AdditionalOptions) + + + O3 + false + + + + + + + + + + + + + + + + + + + false + Full + + + + + + + + + + + StreamingSIMDExtensions + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 76 + + + + 64 + + + + 64 + + + 255 + + + + + + + + + 128 + + + compute_52,sm_52;compute_50,sm_50;compute_30,sm_30 + -Xptxas -dlcm=ca -Xptxas -dscm=cs %(AdditionalOptions) + 0 + compute_61,sm_61;compute_52,sm_52;compute_50,sm_50;compute_30,sm_30 + + + 160 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + compute_30,sm_30 + + + compute_30,sm_30 + + + compute_30,sm_30 + + + compute_30,sm_30 + + + compute_35,sm_35;compute_50,sm_50;compute_52,sm_52 + + + compute_30,sm_30 + + + compute_35,sm_35;compute_50,sm_50 + + + + + + + + + + + + + + + + 80 + + + + + + + + + + + + + -Xptxas "-abi=yes" %(AdditionalOptions) + + + + + + 64 + --ptxas-options="-dlcm=cg" %(AdditionalOptions) + true + + + + 128 + + + + + + + + + + + 92 + + + + + + + + 80 + --ptxas-options="-dlcm=cg" %(AdditionalOptions) + true + + + 128 + + + 80 + + + 80 + + + false + + + -Xptxas "-abi=yes" %(AdditionalOptions) + -Xptxas "-abi=yes" %(AdditionalOptions) + + + + + 88 + + + + + + + + + + + + + + + + + + + + + + + + + + + 64 + + + 48 + + + + + + + + compute_50,sm_50;compute_52,sm_52 + 64 + + + + + compute_50,sm_50;compute_52,sm_52 + + + + + + + + + 76 + + + 128 + + + + 64 + + + + + + + + + + + + + + + + 72 + + + + + + + + + + + + + + + + + + + + compute_50,sm_50;compute_52,sm_52 + + + + + + + + 80 + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 2fb6824895..3df78716e3 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -1,365 +1,1064 @@ - - - - - {2450a9c7-a97a-49e1-ba19-c8dbc5a4e3e7} - - - {c53ce808-c5c5-4c6c-99a2-3947090c62f1} - - - {5a45c1bf-81d2-4bc6-97b5-714e34f51a82} - - - {431cec61-9376-4de9-aae9-04c4250652e7} - - - {cc8bb259-5332-4a45-ba81-f4840a55b604} - - - {89362bd8-4690-4f0c-a4f7-6b2fa67a1f34} - - - {6c3cd392-b6b8-424c-87d2-10e33dbd4b41} - - - {5a31b6f4-4943-4b22-b69a-230f3cc96269} - - - {a0f072d0-a831-4c23-8d64-7a026521df9c} - - - {fe39ded0-754b-415f-a284-038a15a0aa55} - - - {17b56151-79ec-4a32-bac3-9d94ae7f68fe} - - - {d8f2e173-a0a5-455b-8efc-42511b585156} - - - 
{dd0822bd-27cc-4d5c-8e2f-cf7d9b00feb4} - - - {0dc051db-f833-476f-b3f4-c69fd03b9348} - - - {7c2a98c6-064c-4a69-b803-d6f6ff5edd0b} - - - {c3222908-22ba-4586-a637-6363f455b06d} - - - {3281db48-f394-49ea-a1ef-6ebd09828d50} - - - {dd751f2d-bfd6-42c1-8f9b-cbe94e539353} - - - {d67a2af7-4851-4d21-910e-87791bc8ee35} - - - - - Source Files\CUDA\jansson - - - Source Files\CUDA\jansson - - - Source Files\CUDA\jansson - - - Source Files\CUDA\jansson - - - Source Files\CUDA\jansson - - - Source Files\CUDA\jansson - - - Source Files\getopt - - - Source Files\gettimeofday - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files\sph - - - Source Files - - - Source Files\sph - - - Source Files\sph - - - - - Header Files - - - Header Files - - - Header Files - - - Header Files\compat\sys - - - Header Files\compat - - - Header Files\compat - - - Header Files\compat\getopt - - - Header Files\compat - - - Header Files - - - Header Files - - - Header Files - - - Header Files\CUDA - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\sph - - - Header Files\CUDA\heavy - - - Header Files\CUDA\heavy - - - Header Files\CUDA\heavy - - - Header Files\CUDA\heavy - - - Header Files\CUDA\heavy - - - Header Files\CUDA\heavy - - - Header Files\CUDA - - - Header Files\sph - - - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA\JHA - - - Source Files\CUDA\JHA - - - Source Files\CUDA\quark - - - Source Files\CUDA - - - Source Files\CUDA\JHA - - - Source Files\CUDA\quark - - - Source Files\CUDA\quark - - - Source Files\CUDA\quark - - - Source Files\CUDA\quark - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\heavy - - - Source Files\CUDA\quark - - - Source Files\CUDA\quark - - - Source Files\CUDA\quark - - - Source Files\CUDA\quark - - - Source Files\CUDA\quark - - - Source Files\CUDA - - - Source Files\CUDA\x11 - - - Source Files\CUDA\x11 - - - Source Files\CUDA\x11 - - - Source Files\CUDA\x11 - - - Source Files\CUDA\x11 - - - Source Files\CUDA\x11 - - - Source Files\CUDA\x11 - - - Source Files\CUDA\x11 - - - Source Files\CUDA\x13 - - - Source Files\CUDA\x13 - - - Source Files\CUDA\x13 - - - \ No newline at end of file + + + + + {2450a9c7-a97a-49e1-ba19-c8dbc5a4e3e7} + + + {c53ce808-c5c5-4c6c-99a2-3947090c62f1} + + + {5a45c1bf-81d2-4bc6-97b5-714e34f51a82} + + + {431cec61-9376-4de9-aae9-04c4250652e7} + + + {cc8bb259-5332-4a45-ba81-f4840a55b604} + + + {89362bd8-4690-4f0c-a4f7-6b2fa67a1f34} + + + {6c3cd392-b6b8-424c-87d2-10e33dbd4b41} + + + {5a31b6f4-4943-4b22-b69a-230f3cc96269} + + + {a0f072d0-a831-4c23-8d64-7a026521df9c} + + + {d8f2e173-a0a5-455b-8efc-42511b585156} + + + {dd0822bd-27cc-4d5c-8e2f-cf7d9b00feb4} + + + {0dc051db-f833-476f-b3f4-c69fd03b9348} + + + {7c2a98c6-064c-4a69-b803-d6f6ff5edd0b} + + + {c3222908-22ba-4586-a637-6363f455b06d} + + + 
{f3ed23a2-8ce7-41a5-b051-6da56047dc35} + + + {23ed23a2-8e7a-65a2-b051-8aa63047d352} + + + {dd751f2d-bfd6-42c1-8f9b-cbe94e539353} + + + {d67a2af7-4851-4d21-910e-87791bc8ee35} + + + {a2403c22-6777-46ab-a55a-3fcc7386c974} + + + {55dfae6a-66ba-43e2-8ceb-98ee70cbdf16} + + + {85dfae6a-66ca-4332-8cec-98ee70cbdf2f} + + + {17b56151-79ec-4a32-bac3-9d94ae7f68fe} + + + {ef6f9983-bda5-4fb2-adfa-ac4f29b74f25} + + + {9762c92c-9677-4044-8292-ff6ba4bfdd89} + + + {2ff6e4ce-7c92-4cb2-a3ad-c331e94fd81d} + + + {f5117ccb-a70d-411a-b7ea-d6faed230bc7} + + + {c26f5b02-37b5-4420-a4e8-ee1ad517dc95} + + + {2037fd0d-e7a2-4da8-956c-150aec726a99} + + + {1613763f-895c-4321-b58b-6f5849868956} + + + {3079ea1f-f768-455a-acd6-f517fac535b4} + + + {86a896c0-1688-4854-98e3-285d166069a3} + + + {86a896c0-1688-4854-98e0-285d166069a3} + + + {fea0fce3-c0fe-42f7-aa37-0cbba10b008a} + + + {af52b078-ed91-4c6e-b07a-e9243acc85d2} + + + {af387eac-e9e6-4e91-a5e8-637b1e7a8d93} + + + {0f9aec5e-5409-488f-992a-2c108590d1ac} + + + {031afae7-2a78-4e32-9738-4b589b6f7ff3} + + + {1e548d79-c217-4203-989a-a592fe2b2de3} + + + {311e8d79-1612-4f0f-8591-23a592f2b2d3} + + + {xde48d89-fx12-1323-129a-b592fe2b2de3} + + + + + Source Files\jansson + + + Source Files\jansson + + + Source Files\jansson + + + Source Files\jansson + + + Source Files\jansson + + + Source Files\jansson + + + Source Files\getopt + + + Source Files\gettimeofday + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files\jansson + + + Source Files\jansson + + + Source Files\jansson + + + Source Files\jansson + + + Header Files\compat + + + Source Files\sph + + + Source Files\sph + + + Source Files\sph + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\neoscrypt + + + Source Files\neoscrypt + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA\heavy + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files\sia + + + Source Files\crypto + + + Source Files\crypto\xmr + + + Source Files\crypto\xmr + + + Source Files\crypto\xmr + + + Source Files\crypto\xmr + + + Source Files\crypto\xmr + + + Source Files\crypto\bbr + + + Source Files\crypto\bbr + + + Source Files\equi + + + Source Files\equi + + + Source Files\equi + + + Source Files\equi + + + + + Header Files + + + Header Files\compat + + + Header Files + + + Header Files + + + Header Files\compat\sys + + + Header Files\compat + + + Header Files\compat + + + Header Files\compat\getopt + + + Header Files\compat + + + Header Files\compat + + + Header Files + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header 
Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\CUDA + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\sph + + + Header Files\compat + + + Header Files + + + Header Files\compat\nvapi + + + Header Files\compat\nvapi + + + Header Files\compat\nvapi + + + Header Files\compat\nvapi + + + Header Files\compat\nvapi + + + Header Files\compat\nvapi + + + Header Files\compat\nvapi + + + Header Files\compat\nvapi + + + Header Files\compat\nvapi + + + Header Files\compat + + + Header Files\lyra2 + + + Header Files\lyra2 + + + Header Files\lyra2 + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + Ressources + + + Source Files\CUDA\scrypt + + + Source Files\neoscrypt + + + Source Files\neoscrypt + + + Source Files\CUDA\heavy + + + Header Files + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + Source Files\CUDA\x11 + + + Source Files\sia + + + Source Files\crypto + + + Source Files\crypto\bbr + + + Source Files\crypto\bbr + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\equi + + + Source Files\equi + + + Header Files\CUDA + + + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA\JHA + + + Source Files\CUDA\JHA + + + Source Files\CUDA\JHA + + + Source Files\CUDA + + + Source Files\CUDA\JHA + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + Source Files\CUDA\quark + + + Source Files\CUDA\qubit + + + Source Files\CUDA\qubit + + + Source Files\CUDA\qubit + + + Source Files\CUDA\qubit + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x13 + + + Source Files\CUDA\x13 + + + Source Files\CUDA\x13 + + + Source Files\CUDA\x13 + + + Source Files\CUDA\x13 + + + Source Files\CUDA + + + Source Files\CUDA\x15 + + + Source Files\CUDA\x15 + + + Source Files\CUDA\x15 + + + Source Files\CUDA\x15 + + + Source Files\CUDA\x15 + + + Source Files\CUDA\x17 + + + Source Files\CUDA\x17 + + + Source Files\CUDA\x17 + + + Source Files\CUDA\x17 + + + Source Files\CUDA\x17 + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA\phi + + + Source Files\CUDA\phi + + + Source Files\CUDA\phi + + + Source Files\CUDA\phi + + + Source Files\CUDA\skunk + + + Source Files\CUDA\skunk + + + 
Source Files\CUDA\skunk + + + Source Files\CUDA\tribus + + + Source Files\CUDA\tribus + + + Source Files\CUDA\tribus + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x12 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\x11 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\CUDA\scrypt + + + Source Files\neoscrypt + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\ + + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\lbry + + + Source Files\CUDA\lbry + + + Source Files\CUDA\lbry + + + Source Files\CUDA\lbry + + + Source Files\CUDA\sha256 + + + Source Files\CUDA\sha256 + + + Source Files\CUDA\sha256 + + + Source Files\CUDA\sha256 + + + Source Files\CUDA\sha256 + + + Source Files\CUDA\sha256 + + + Source Files\sia + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + + + Source Files\CUDA + + + Source Files\equi + + + Source Files\CUDA\x15 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + + + Ressources + + + + + Ressources + + + + + Ressources + + + diff --git a/compat.h b/compat.h index c7f201cf2f..a98dab9076 100644 --- a/compat.h +++ b/compat.h @@ -4,6 +4,9 @@ #ifdef WIN32 #include +#include + +#define localtime_r(src, dst) localtime_s(dst, src) static __inline void sleep(int secs) { @@ -11,14 +14,81 @@ static __inline void sleep(int secs) } enum { - PRIO_PROCESS = 0, + PRIO_PROCESS = 0, }; +extern int opt_priority; + static __inline int setpriority(int which, int who, int prio) { - return -!SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_IDLE /*THREAD_PRIORITY_TIME_CRITICAL*/); + switch (opt_priority) { + case 5: + prio = THREAD_PRIORITY_TIME_CRITICAL; + break; + case 4: + prio = THREAD_PRIORITY_HIGHEST; + break; + case 3: + prio = THREAD_PRIORITY_ABOVE_NORMAL; + break; + case 2: + prio = THREAD_PRIORITY_NORMAL; + break; + case 1: + prio = THREAD_PRIORITY_BELOW_NORMAL; + break; + case 0: + default: + prio = THREAD_PRIORITY_IDLE; + } + return -!SetThreadPriority(GetCurrentThread(), prio); +} + +#ifdef _MSC_VER +#define snprintf(...) _snprintf(__VA_ARGS__) +#define strdup(...) 
_strdup(__VA_ARGS__) +#define strncasecmp(x,y,z) _strnicmp(x,y,z) +#define strcasecmp(x,y) _stricmp(x,y) +typedef int ssize_t; + +__inline int msver(void) { + switch (_MSC_VER) { + case 1500: return 2008; + case 1600: return 2010; + case 1700: return 2012; + case 1800: return 2013; + case 1900: return 2015; + default: return (_MSC_VER/100); + } } +#include +static __inline char * dirname(char *file) { + char buffer[_MAX_PATH] = { 0 }; + char drive[_MAX_DRIVE]; + char dir[_MAX_DIR]; + char fname[_MAX_FNAME]; + char ext[_MAX_EXT]; + _splitpath_s(file, drive, _MAX_DRIVE, dir, _MAX_DIR, fname, _MAX_FNAME, ext, _MAX_EXT); + sprintf(buffer, "%s%s", drive, dir); + return strdup(buffer); +} +#endif + #endif /* WIN32 */ +#ifdef _MSC_VER +# define __func__ __FUNCTION__ +# define __thread __declspec(thread) +# define _ALIGN(x) __declspec(align(x)) +#else +# define _ALIGN(x) __attribute__ ((aligned(x))) +/* dirname() for linux/mingw */ +#include +#endif + +#ifndef WIN32 +#define MAX_PATH PATH_MAX +#endif + #endif /* __COMPAT_H__ */ diff --git a/compat/Makefile.in b/compat/Makefile.in deleted file mode 100644 index 7b1f20d64a..0000000000 --- a/compat/Makefile.in +++ /dev/null @@ -1,559 +0,0 @@ -# Makefile.in generated by automake 1.11.3 from Makefile.am. -# @configure_input@ - -# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software -# Foundation, Inc. -# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. 
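Back to the compat.h changes above: on MSVC, dirname() is emulated with _splitpath_s() and returns a strdup()ed copy of the directory part, while linux/mingw builds rely on the libc version. A small usage sketch assuming a POSIX build; the path is only a placeholder.

#include <stdio.h>
#include <libgen.h>

int main(void)
{
	char path[] = "/usr/local/bin/ccminer";          /* placeholder path */
	printf("binary folder: %s\n", dirname(path));    /* "/usr/local/bin" */
	return 0;
}

Note the difference in ownership: the POSIX dirname() may return a pointer into (or modify) its argument, whereas the MSVC shim above hands back a heap-allocated copy.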
- -@SET_MAKE@ -VPATH = @srcdir@ -pkgdatadir = $(datadir)/@PACKAGE@ -pkgincludedir = $(includedir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -pkglibexecdir = $(libexecdir)/@PACKAGE@ -am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -target_triplet = @target@ -subdir = compat -DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -mkinstalldirs = $(install_sh) -d -CONFIG_HEADER = $(top_builddir)/cpuminer-config.h -CONFIG_CLEAN_FILES = -CONFIG_CLEAN_VPATH_FILES = -SOURCES = -DIST_SOURCES = -RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ - html-recursive info-recursive install-data-recursive \ - install-dvi-recursive install-exec-recursive \ - install-html-recursive install-info-recursive \ - install-pdf-recursive install-ps-recursive install-recursive \ - installcheck-recursive installdirs-recursive pdf-recursive \ - ps-recursive uninstall-recursive -RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ - distclean-recursive maintainer-clean-recursive -AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ - $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ - distdir -ETAGS = etags -CTAGS = ctags -DIST_SUBDIRS = jansson -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -am__relativize = \ - dir0=`pwd`; \ - sed_first='s,^\([^/]*\)/.*$$,\1,'; \ - sed_rest='s,^[^/]*/*,,'; \ - sed_last='s,^.*/\([^/]*\)$$,\1,'; \ - sed_butlast='s,/*[^/]*$$,,'; \ - while test -n "$$dir1"; do \ - first=`echo "$$dir1" | sed -e "$$sed_first"`; \ - if test "$$first" != "."; then \ - if test "$$first" = ".."; then \ - dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ - dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ - else \ - first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ - if test "$$first2" = "$$first"; then \ - dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ - else \ - dir2="../$$dir2"; \ - fi; \ - dir0="$$dir0"/"$$first"; \ - fi; \ - fi; \ - dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ - done; \ - reldir="$$dir2" -ACLOCAL = @ACLOCAL@ -ALLOCA = @ALLOCA@ -AMTAR = @AMTAR@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -CC = @CC@ -CCAS = @CCAS@ -CCASDEPMODE = @CCASDEPMODE@ -CCASFLAGS = @CCASFLAGS@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CPP = @CPP@ -CPPFLAGS = @CPPFLAGS@ -CUDA_CFLAGS = @CUDA_CFLAGS@ -CUDA_LDFLAGS = @CUDA_LDFLAGS@ -CUDA_LIBS = @CUDA_LIBS@ -CXX = @CXX@ -CXXDEPMODE = @CXXDEPMODE@ -CXXFLAGS = @CXXFLAGS@ -CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -EXEEXT = @EXEEXT@ -GREP = @GREP@ -INSTALL = @INSTALL@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -JANSSON_LIBS = @JANSSON_LIBS@ -LDFLAGS = @LDFLAGS@ -LIBCURL = @LIBCURL@ -LIBCURL_CPPFLAGS = @LIBCURL_CPPFLAGS@ -LIBOBJS = @LIBOBJS@ -LIBS = @LIBS@ -LTLIBOBJS = @LTLIBOBJS@ -MAINT = @MAINT@ -MAKEINFO = @MAKEINFO@ 
-MKDIR_P = @MKDIR_P@ -NVCC = @NVCC@ -OBJEXT = @OBJEXT@ -OPENMP_CFLAGS = @OPENMP_CFLAGS@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_URL = @PACKAGE_URL@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -PTHREAD_FLAGS = @PTHREAD_FLAGS@ -PTHREAD_LIBS = @PTHREAD_LIBS@ -RANLIB = @RANLIB@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -STRIP = @STRIP@ -VERSION = @VERSION@ -WS2_LIBS = @WS2_LIBS@ -_libcurl_config = @_libcurl_config@ -abs_builddir = @abs_builddir@ -abs_srcdir = @abs_srcdir@ -abs_top_builddir = @abs_top_builddir@ -abs_top_srcdir = @abs_top_srcdir@ -ac_ct_CC = @ac_ct_CC@ -ac_ct_CXX = @ac_ct_CXX@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_vendor = @build_vendor@ -builddir = @builddir@ -datadir = @datadir@ -datarootdir = @datarootdir@ -docdir = @docdir@ -dvidir = @dvidir@ -exec_prefix = @exec_prefix@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -htmldir = @htmldir@ -includedir = @includedir@ -infodir = @infodir@ -install_sh = @install_sh@ -libdir = @libdir@ -libexecdir = @libexecdir@ -localedir = @localedir@ -localstatedir = @localstatedir@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -oldincludedir = @oldincludedir@ -pdfdir = @pdfdir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -psdir = @psdir@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -srcdir = @srcdir@ -sysconfdir = @sysconfdir@ -target = @target@ -target_alias = @target_alias@ -target_cpu = @target_cpu@ -target_os = @target_os@ -target_vendor = @target_vendor@ -top_build_prefix = @top_build_prefix@ -top_builddir = @top_builddir@ -top_srcdir = @top_srcdir@ -@WANT_JANSSON_FALSE@SUBDIRS = -@WANT_JANSSON_TRUE@SUBDIRS = jansson -all: all-recursive - -.SUFFIXES: -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ - *$$dep*) \ - ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ - && { if test -f $@; then exit 0; else break; fi; }; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu compat/Makefile'; \ - $(am__cd) $(top_srcdir) && \ - $(AUTOMAKE) --gnu compat/Makefile -.PRECIOUS: Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' in \ - *config.status*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ - esac; - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(am__aclocal_m4_deps): - -# This directory's subdirectories are mostly independent; you can cd -# into them and run `make' without going through this Makefile. 
-# To change the values of `make' variables: instead of editing Makefiles, -# (1) if the variable is set in `config.status', edit `config.status' -# (which will cause the Makefiles to be regenerated when you run `make'); -# (2) otherwise, pass the desired values on the `make' command line. -$(RECURSIVE_TARGETS): - @fail= failcom='exit 1'; \ - for f in x $$MAKEFLAGS; do \ - case $$f in \ - *=* | --[!k]*);; \ - *k*) failcom='fail=yes';; \ - esac; \ - done; \ - dot_seen=no; \ - target=`echo $@ | sed s/-recursive//`; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - echo "Making $$target in $$subdir"; \ - if test "$$subdir" = "."; then \ - dot_seen=yes; \ - local_target="$$target-am"; \ - else \ - local_target="$$target"; \ - fi; \ - ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || eval $$failcom; \ - done; \ - if test "$$dot_seen" = "no"; then \ - $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ - fi; test -z "$$fail" - -$(RECURSIVE_CLEAN_TARGETS): - @fail= failcom='exit 1'; \ - for f in x $$MAKEFLAGS; do \ - case $$f in \ - *=* | --[!k]*);; \ - *k*) failcom='fail=yes';; \ - esac; \ - done; \ - dot_seen=no; \ - case "$@" in \ - distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ - *) list='$(SUBDIRS)' ;; \ - esac; \ - rev=''; for subdir in $$list; do \ - if test "$$subdir" = "."; then :; else \ - rev="$$subdir $$rev"; \ - fi; \ - done; \ - rev="$$rev ."; \ - target=`echo $@ | sed s/-recursive//`; \ - for subdir in $$rev; do \ - echo "Making $$target in $$subdir"; \ - if test "$$subdir" = "."; then \ - local_target="$$target-am"; \ - else \ - local_target="$$target"; \ - fi; \ - ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || eval $$failcom; \ - done && test -z "$$fail" -tags-recursive: - list='$(SUBDIRS)'; for subdir in $$list; do \ - test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ - done -ctags-recursive: - list='$(SUBDIRS)'; for subdir in $$list; do \ - test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ - done - -ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in files) print i; }; }'`; \ - mkid -fID $$unique -tags: TAGS - -TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - set x; \ - here=`pwd`; \ - if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ - include_option=--etags-include; \ - empty_fix=.; \ - else \ - include_option=--include; \ - empty_fix=; \ - fi; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - test ! 
-f $$subdir/TAGS || \ - set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ - fi; \ - done; \ - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in files) print i; }; }'`; \ - shift; \ - if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - if test $$# -gt 0; then \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - "$$@" $$unique; \ - else \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$unique; \ - fi; \ - fi -ctags: CTAGS -CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in files) print i; }; }'`; \ - test -z "$(CTAGS_ARGS)$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$unique - -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && $(am__cd) $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) "$$here" - -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -distdir: $(DISTFILES) - @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - list='$(DISTFILES)'; \ - dist_files=`for file in $$list; do echo $$file; done | \ - sed -e "s|^$$srcdirstrip/||;t" \ - -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ - case $$dist_files in \ - */*) $(MKDIR_P) `echo "$$dist_files" | \ - sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ - sort -u` ;; \ - esac; \ - for file in $$dist_files; do \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - if test -d $$d/$$file; then \ - dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test -d "$(distdir)/$$file"; then \ - find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ - find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ - else \ - test -f "$(distdir)/$$file" \ - || cp -p $$d/$$file "$(distdir)/$$file" \ - || exit 1; \ - fi; \ - done - @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - test -d "$(distdir)/$$subdir" \ - || $(MKDIR_P) "$(distdir)/$$subdir" \ - || exit 1; \ - fi; \ - done - @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ - $(am__relativize); \ - new_distdir=$$reldir; \ - dir1=$$subdir; dir2="$(top_distdir)"; \ - $(am__relativize); \ - new_top_distdir=$$reldir; \ - echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ - echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ - ($(am__cd) $$subdir && \ - $(MAKE) $(AM_MAKEFLAGS) \ - top_distdir="$$new_top_distdir" \ - distdir="$$new_distdir" \ - am__remove_distdir=: \ - am__skip_length_check=: \ - am__skip_mode_fix=: \ - distdir) \ - || exit 1; \ - fi; \ - done -check-am: all-am -check: check-recursive -all-am: Makefile -installdirs: installdirs-recursive -installdirs-am: -install: install-recursive -install-exec: install-exec-recursive -install-data: install-data-recursive -uninstall: uninstall-recursive - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-recursive -install-strip: - if test -z '$(STRIP)'; then \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - install; \ - else \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ - fi -mostlyclean-generic: - -clean-generic: - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to rebuild." 
-clean: clean-recursive - -clean-am: clean-generic mostlyclean-am - -distclean: distclean-recursive - -rm -f Makefile -distclean-am: clean-am distclean-generic distclean-tags - -dvi: dvi-recursive - -dvi-am: - -html: html-recursive - -html-am: - -info: info-recursive - -info-am: - -install-data-am: - -install-dvi: install-dvi-recursive - -install-dvi-am: - -install-exec-am: - -install-html: install-html-recursive - -install-html-am: - -install-info: install-info-recursive - -install-info-am: - -install-man: - -install-pdf: install-pdf-recursive - -install-pdf-am: - -install-ps: install-ps-recursive - -install-ps-am: - -installcheck-am: - -maintainer-clean: maintainer-clean-recursive - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-recursive - -mostlyclean-am: mostlyclean-generic - -pdf: pdf-recursive - -pdf-am: - -ps: ps-recursive - -ps-am: - -uninstall-am: - -.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \ - install-am install-strip tags-recursive - -.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ - all all-am check check-am clean clean-generic ctags \ - ctags-recursive distclean distclean-generic distclean-tags \ - distdir dvi dvi-am html html-am info info-am install \ - install-am install-data install-data-am install-dvi \ - install-dvi-am install-exec install-exec-am install-html \ - install-html-am install-info install-info-am install-man \ - install-pdf install-pdf-am install-ps install-ps-am \ - install-strip installcheck installcheck-am installdirs \ - installdirs-am maintainer-clean maintainer-clean-generic \ - mostlyclean mostlyclean-generic pdf pdf-am ps ps-am tags \ - tags-recursive uninstall uninstall-am - - -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. -.NOEXPORT: diff --git a/compat/bignum_ssl10.hpp b/compat/bignum_ssl10.hpp new file mode 100644 index 0000000000..d8236b7166 --- /dev/null +++ b/compat/bignum_ssl10.hpp @@ -0,0 +1,602 @@ +// Copyright (c) 2009-2010 Satoshi Nakamoto +// Copyright (c) 2009-2012 The Bitcoin developers +// Distributed under the MIT/X11 software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+#ifndef BITCOIN_BIGNUM_H +#define BITCOIN_BIGNUM_H + +#ifndef PROTOCOL_VERSION +#define PROTOCOL_VERSION 70001 +#endif + +#include +#include +#include +#include +#include + +#include "uint256.h" + +#include + +#include "serialize.hpp" + +using namespace std; + +/** Errors thrown by the bignum class */ +class bignum_error : public std::runtime_error +{ +public: + explicit bignum_error(const std::string& str) : std::runtime_error(str) {} +}; + + +/** RAII encapsulated BN_CTX (OpenSSL bignum context) */ +class CAutoBN_CTX +{ +protected: + BN_CTX* pctx; + BN_CTX* operator=(BN_CTX* pnew) { return pctx = pnew; } + +public: + CAutoBN_CTX() + { + pctx = BN_CTX_new(); + if (pctx == NULL) + throw bignum_error("CAutoBN_CTX : BN_CTX_new() returned NULL"); + } + + ~CAutoBN_CTX() + { + if (pctx != NULL) + BN_CTX_free(pctx); + } + + operator BN_CTX*() { return pctx; } + BN_CTX& operator*() { return *pctx; } + BN_CTX** operator&() { return &pctx; } + bool operator!() { return (pctx == NULL); } +}; + + +/** C++ wrapper for BIGNUM (OpenSSL bignum) */ +class CBigNum : public BIGNUM +{ +public: + CBigNum() + { + BN_init(this); + } + + CBigNum(const CBigNum& b) + { + BN_init(this); + if (!BN_copy(this, &b)) + { + BN_clear_free(this); + throw bignum_error("CBigNum::CBigNum(const CBigNum&) : BN_copy failed"); + } + } + + CBigNum& operator=(const CBigNum& b) + { + if (!BN_copy(this, &b)) + throw bignum_error("CBigNum::operator= : BN_copy failed"); + return (*this); + } + + ~CBigNum() + { + BN_clear_free(this); + } + + //CBigNum(char n) is not portable. Use 'signed char' or 'unsigned char'. + CBigNum(signed char n) { BN_init(this); if (n >= 0) setulong(n); else setint64(n); } + CBigNum(short n) { BN_init(this); if (n >= 0) setulong(n); else setint64(n); } + CBigNum(int n) { BN_init(this); if (n >= 0) setulong(n); else setint64(n); } + CBigNum(long n) { BN_init(this); if (n >= 0) setulong(n); else setint64(n); } + CBigNum(int64 n) { BN_init(this); setint64(n); } + CBigNum(unsigned char n) { BN_init(this); setulong(n); } + CBigNum(unsigned short n) { BN_init(this); setulong(n); } + CBigNum(unsigned int n) { BN_init(this); setulong(n); } + CBigNum(unsigned long n) { BN_init(this); setulong(n); } + CBigNum(uint64 n) { BN_init(this); setuint64(n); } + explicit CBigNum(uint256 n) { BN_init(this); setuint256(n); } + + explicit CBigNum(const std::vector& vch) + { + BN_init(this); + setvch(vch); + } + + void setulong(unsigned long n) + { + if (!BN_set_word(this, n)) + throw bignum_error("CBigNum conversion from unsigned long : BN_set_word failed"); + } + + unsigned long getulong() const + { + return (unsigned long) BN_get_word(this); + } + + unsigned int getuint() const + { + return (unsigned int) BN_get_word(this); + } + + int getint() const + { + unsigned long n = (unsigned long) BN_get_word(this); + if (!BN_is_negative(this)) + return (n > (unsigned long)std::numeric_limits::max() ? std::numeric_limits::max() : n); + else + return (n > (unsigned long)std::numeric_limits::max() ? 
std::numeric_limits::min() : -(int)n); + } + + void setint64(int64 sn) + { + unsigned char pch[sizeof(sn) + 6]; + unsigned char* p = pch + 4; + bool fNegative; + uint64 n; + + if (sn < 0LL) + { + // Since the minimum signed integer cannot be represented as positive so long as its type is signed, + // and it's not well-defined what happens if you make it unsigned before negating it, + // we instead increment the negative integer by 1, convert it, then increment the (now positive) unsigned integer by 1 to compensate + n = -(sn + 1); + ++n; + fNegative = true; + } else { + n = sn; + fNegative = false; + } + + bool fLeadingZeroes = true; + for (int i = 0; i < 8; i++) + { + unsigned char c = (n >> 56) & 0xff; + n <<= 8; + if (fLeadingZeroes) + { + if (c == 0) + continue; + if (c & 0x80) + *p++ = (fNegative ? 0x80 : 0); + else if (fNegative) + c |= 0x80; + fLeadingZeroes = false; + } + *p++ = c; + } + unsigned int nSize = (unsigned int) (p - (pch + 4)); + pch[0] = (nSize >> 24) & 0xff; + pch[1] = (nSize >> 16) & 0xff; + pch[2] = (nSize >> 8) & 0xff; + pch[3] = (nSize) & 0xff; + BN_mpi2bn(pch, (int) (p - pch), this); + } + + void setuint64(uint64 n) + { + unsigned char pch[sizeof(n) + 6]; + unsigned char* p = pch + 4; + bool fLeadingZeroes = true; + for (int i = 0; i < 8; i++) + { + unsigned char c = (n >> 56) & 0xff; + n <<= 8; + if (fLeadingZeroes) + { + if (c == 0) + continue; + if (c & 0x80) + *p++ = 0; + fLeadingZeroes = false; + } + *p++ = c; + } + unsigned int nSize = (unsigned int) (p - (pch + 4)); + pch[0] = (nSize >> 24) & 0xff; + pch[1] = (nSize >> 16) & 0xff; + pch[2] = (nSize >> 8) & 0xff; + pch[3] = (nSize) & 0xff; + BN_mpi2bn(pch, (int) (p - pch), this); + } + + void setuint256(uint256 n) + { + unsigned char pch[sizeof(n) + 6]; + unsigned char* p = pch + 4; + bool fLeadingZeroes = true; + unsigned char* pbegin = (unsigned char*)&n; + unsigned char* psrc = pbegin + sizeof(n); + while (psrc != pbegin) + { + unsigned char c = *(--psrc); + if (fLeadingZeroes) + { + if (c == 0) + continue; + if (c & 0x80) + *p++ = 0; + fLeadingZeroes = false; + } + *p++ = c; + } + unsigned int nSize = (unsigned int) (p - (pch + 4)); + pch[0] = (nSize >> 24) & 0xff; + pch[1] = (nSize >> 16) & 0xff; + pch[2] = (nSize >> 8) & 0xff; + pch[3] = (nSize >> 0) & 0xff; + BN_mpi2bn(pch, (int) (p - pch), this); + } + + uint256 getuint256() const + { + unsigned int nSize = BN_bn2mpi(this, NULL); + if (nSize < 4) + return 0; + std::vector vch(nSize); + BN_bn2mpi(this, &vch[0]); + if (vch.size() > 4) + vch[4] &= 0x7f; + uint256 n = 0; + for (unsigned int i = 0, j = (unsigned int) vch.size()-1; i < sizeof(n) && j >= 4; i++, j--) + ((unsigned char*)&n)[i] = vch[j]; + return n; + } + + void setvch(const std::vector& vch) + { + std::vector vch2(vch.size() + 4); + unsigned int nSize = (unsigned int) vch.size(); + // BIGNUM's byte stream format expects 4 bytes of + // big endian size data info at the front + vch2[0] = (nSize >> 24) & 0xff; + vch2[1] = (nSize >> 16) & 0xff; + vch2[2] = (nSize >> 8) & 0xff; + vch2[3] = (nSize >> 0) & 0xff; + // swap data to big endian + reverse_copy(vch.begin(), vch.end(), vch2.begin() + 4); + BN_mpi2bn(&vch2[0], (int) vch2.size(), this); + } + + std::vector getvch() const + { + unsigned int nSize = BN_bn2mpi(this, NULL); + if (nSize <= 4) + return std::vector(); + std::vector vch(nSize); + BN_bn2mpi(this, &vch[0]); + vch.erase(vch.begin(), vch.begin() + 4); + reverse(vch.begin(), vch.end()); + return vch; + } + + // The "compact" format is a representation of a whole + // number N using an 
unsigned 32bit number similar to a + // floating point format. + // The most significant 8 bits are the unsigned exponent of base 256. + // This exponent can be thought of as "number of bytes of N". + // The lower 23 bits are the mantissa. + // Bit number 24 (0x800000) represents the sign of N. + // N = (-1^sign) * mantissa * 256^(exponent-3) + // + // Satoshi's original implementation used BN_bn2mpi() and BN_mpi2bn(). + // MPI uses the most significant bit of the first byte as sign. + // Thus 0x1234560000 is compact (0x05123456) + // and 0xc0de000000 is compact (0x0600c0de) + // (0x05c0de00) would be -0x40de000000 + // + // Bitcoin only uses this "compact" format for encoding difficulty + // targets, which are unsigned 256bit quantities. Thus, all the + // complexities of the sign bit and using base 256 are probably an + // implementation accident. + // + // This implementation directly uses shifts instead of going + // through an intermediate MPI representation. + CBigNum& SetCompact(unsigned int nCompact) + { + unsigned int nSize = nCompact >> 24; + bool fNegative =(nCompact & 0x00800000) != 0; + unsigned int nWord = nCompact & 0x007fffff; + if (nSize <= 3) + { + nWord >>= 8*(3-nSize); + BN_set_word(this, nWord); + } + else + { + BN_set_word(this, nWord); + BN_lshift(this, this, 8*(nSize-3)); + } + BN_set_negative(this, fNegative); + return *this; + } + + unsigned int GetCompact() const + { + unsigned int nSize = BN_num_bytes(this); + unsigned int nCompact = 0; + if (nSize <= 3) + nCompact = (unsigned int) BN_get_word(this) << 8*(3-nSize); + else + { + CBigNum bn; + BN_rshift(&bn, this, 8*(nSize-3)); + nCompact = (unsigned int) BN_get_word(&bn); + } + // The 0x00800000 bit denotes the sign. + // Thus, if it is already set, divide the mantissa by 256 and increase the exponent. + if (nCompact & 0x00800000) + { + nCompact >>= 8; + nSize++; + } + nCompact |= nSize << 24; + nCompact |= (BN_is_negative(this) ? 
0x00800000 : 0); + return nCompact; + } + + void SetHex(const std::string& str) + { + // skip 0x + const char* psz = str.c_str(); + while (isspace(*psz)) + psz++; + bool fNegative = false; + if (*psz == '-') + { + fNegative = true; + psz++; + } + if (psz[0] == '0' && tolower(psz[1]) == 'x') + psz += 2; + while (isspace(*psz)) + psz++; + + // hex string to bignum + static const signed char phexdigit[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0 }; + *this = 0; + while (isxdigit(*psz)) + { + *this <<= 4; + int n = phexdigit[(unsigned char)*psz++]; + *this += n; + } + if (fNegative) + *this = 0 - *this; + } + + std::string ToString(int nBase=10) const + { + CAutoBN_CTX pctx; + CBigNum bnBase = nBase; + CBigNum bn0 = 0; + std::string str; + CBigNum bn = *this; + BN_set_negative(&bn, false); + CBigNum dv; + CBigNum rem; + if (BN_cmp(&bn, &bn0) == 0) + return "0"; + while (BN_cmp(&bn, &bn0) > 0) + { + if (!BN_div(&dv, &rem, &bn, &bnBase, pctx)) + throw bignum_error("CBigNum::ToString() : BN_div failed"); + bn = dv; + unsigned int c = rem.getulong(); + str += "0123456789abcdef"[c]; + } + if (BN_is_negative(this)) + str += "-"; + reverse(str.begin(), str.end()); + return str; + } + + std::string GetHex() const + { + return ToString(16); + } + + unsigned int GetSerializeSize(int nType=0, int nVersion=PROTOCOL_VERSION) const + { + return ::GetSerializeSize(getvch(), nType, nVersion); + } + + template + void Serialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) const + { + ::Serialize(s, getvch(), nType, nVersion); + } + + template + void Unserialize(Stream& s, int nType=0, int nVersion=PROTOCOL_VERSION) + { + std::vector vch; + ::Unserialize(s, vch, nType, nVersion); + setvch(vch); + } + + + bool operator!() const + { + return BN_is_zero(this); + } + + CBigNum& operator+=(const CBigNum& b) + { + if (!BN_add(this, this, &b)) + throw bignum_error("CBigNum::operator+= : BN_add failed"); + return *this; + } + + CBigNum& operator-=(const CBigNum& b) + { + *this = *this - b; + return *this; + } + + CBigNum& operator*=(const CBigNum& b) + { + CAutoBN_CTX pctx; + if (!BN_mul(this, this, &b, pctx)) + throw bignum_error("CBigNum::operator*= : BN_mul failed"); + return *this; + } + + CBigNum& operator/=(const CBigNum& b) + { + *this = *this / b; + return *this; + } + + CBigNum& operator%=(const CBigNum& b) + { + *this = *this % b; + return *this; + } + + CBigNum& operator<<=(unsigned int shift) + { + if (!BN_lshift(this, this, shift)) + throw bignum_error("CBigNum:operator<<= : BN_lshift failed"); + return *this; + } + + CBigNum& operator>>=(unsigned int shift) + { + // Note: BN_rshift segfaults on 64-bit if 2^shift is greater than the number + // if built on ubuntu 9.04 or 9.10, probably depends on version of OpenSSL + CBigNum a = 1; + a <<= shift; + if (BN_cmp(&a, this) > 0) + { + *this = 0; + return *this; + } + + if (!BN_rshift(this, this, shift)) + throw bignum_error("CBigNum:operator>>= : BN_rshift failed"); + return *this; + } + + + CBigNum& operator++() + { + // prefix operator + if (!BN_add(this, this, BN_value_one())) + throw bignum_error("CBigNum::operator++ : BN_add failed"); + return *this; + } + + const CBigNum operator++(int) + { + // postfix operator + const CBigNum ret = *this; + ++(*this); + return ret; + } + + CBigNum& operator--() + { + // prefix operator + CBigNum 
r; + if (!BN_sub(&r, this, BN_value_one())) + throw bignum_error("CBigNum::operator-- : BN_sub failed"); + *this = r; + return *this; + } + + const CBigNum operator--(int) + { + // postfix operator + const CBigNum ret = *this; + --(*this); + return ret; + } + + + friend inline const CBigNum operator-(const CBigNum& a, const CBigNum& b); + friend inline const CBigNum operator/(const CBigNum& a, const CBigNum& b); + friend inline const CBigNum operator%(const CBigNum& a, const CBigNum& b); +}; + + + +inline const CBigNum operator+(const CBigNum& a, const CBigNum& b) +{ + CBigNum r; + if (!BN_add(&r, &a, &b)) + throw bignum_error("CBigNum::operator+ : BN_add failed"); + return r; +} + +inline const CBigNum operator-(const CBigNum& a, const CBigNum& b) +{ + CBigNum r; + if (!BN_sub(&r, &a, &b)) + throw bignum_error("CBigNum::operator- : BN_sub failed"); + return r; +} + +inline const CBigNum operator-(const CBigNum& a) +{ + CBigNum r(a); + BN_set_negative(&r, !BN_is_negative(&r)); + return r; +} + +inline const CBigNum operator*(const CBigNum& a, const CBigNum& b) +{ + CAutoBN_CTX pctx; + CBigNum r; + if (!BN_mul(&r, &a, &b, pctx)) + throw bignum_error("CBigNum::operator* : BN_mul failed"); + return r; +} + +inline const CBigNum operator/(const CBigNum& a, const CBigNum& b) +{ + CAutoBN_CTX pctx; + CBigNum r; + if (!BN_div(&r, NULL, &a, &b, pctx)) + throw bignum_error("CBigNum::operator/ : BN_div failed"); + return r; +} + +inline const CBigNum operator%(const CBigNum& a, const CBigNum& b) +{ + CAutoBN_CTX pctx; + CBigNum r; + if (!BN_mod(&r, &a, &b, pctx)) + throw bignum_error("CBigNum::operator% : BN_div failed"); + return r; +} + +inline const CBigNum operator<<(const CBigNum& a, unsigned int shift) +{ + CBigNum r; + if (!BN_lshift(&r, &a, shift)) + throw bignum_error("CBigNum:operator<< : BN_lshift failed"); + return r; +} + +inline const CBigNum operator>>(const CBigNum& a, unsigned int shift) +{ + CBigNum r = a; + r >>= shift; + return r; +} + +inline bool operator==(const CBigNum& a, const CBigNum& b) { return (BN_cmp(&a, &b) == 0); } +inline bool operator!=(const CBigNum& a, const CBigNum& b) { return (BN_cmp(&a, &b) != 0); } +inline bool operator<=(const CBigNum& a, const CBigNum& b) { return (BN_cmp(&a, &b) <= 0); } +inline bool operator>=(const CBigNum& a, const CBigNum& b) { return (BN_cmp(&a, &b) >= 0); } +inline bool operator<(const CBigNum& a, const CBigNum& b) { return (BN_cmp(&a, &b) < 0); } +inline bool operator>(const CBigNum& a, const CBigNum& b) { return (BN_cmp(&a, &b) > 0); } + +#endif diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h new file mode 100644 index 0000000000..d110201d94 --- /dev/null +++ b/compat/ccminer-config.h @@ -0,0 +1,188 @@ +/* CONFIG ONLY FOR MS VC++ BUILD */ + +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP + systems. This function is required for `alloca.c' support on those systems. + */ +/* #undef CRAY_STACKSEG_END */ + +/* Define to 1 if using `alloca.c'. */ +/* #undef C_ALLOCA */ + +/* Define to 1 if you have `alloca', as a function or macro. */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if you have the declaration of `be32dec', and to 0 if you + don't. */ +#define HAVE_DECL_BE32DEC 0 + +/* Define to 1 if you have the declaration of `be32enc', and to 0 if you + don't. */ +#define HAVE_DECL_BE32ENC 0 + +/* Define to 1 if you have the declaration of `le32dec', and to 0 if you + don't. 
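
Editor's note: to make the "compact" (nBits) encoding documented inside the CBigNum class above concrete, here is a small self-contained sketch of the decoding rule N = (-1)^sign * mantissa * 256^(exponent-3). It is a simplified illustration, not code from this patch: it ignores the sign bit and only handles exponent >= 3, which is the case for difficulty targets.

    // Minimal sketch of the compact (nBits) decoding rule described above:
    // exponent = high byte, mantissa = low 23 bits, sign = bit 0x00800000 (ignored here).
    #include <cstdint>
    #include <cstdio>

    static unsigned long long decode_compact(uint32_t nCompact)
    {
        unsigned int nSize = nCompact >> 24;                   // base-256 exponent ("number of bytes")
        unsigned long long mantissa = nCompact & 0x007fffffULL; // 23-bit mantissa
        return mantissa << (8 * (nSize - 3));                  // * 256^(nSize - 3), assumes nSize >= 3
    }

    int main()
    {
        // Matches the example given in the class comment: compact 0x05123456 is 0x1234560000.
        printf("0x%llx\n", decode_compact(0x05123456));
        return 0;
    }
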
*/ +#define HAVE_DECL_LE32DEC 0 + +/* Define to 1 if you have the declaration of `le32enc', and to 0 if you + don't. */ +#define HAVE_DECL_LE32ENC 0 + +/* Define to 1 if you have the `getopt_long' function. */ +#define HAVE_GETOPT_LONG 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `crypto' library (-lcrypto). */ +#define HAVE_LIBCRYPTO 1 + +/* Define to 1 if you have a functional curl library. */ +#define HAVE_LIBCURL 1 + +/* Define to 1 if you have the `ssl' library (-lssl). */ +#define HAVE_LIBSSL 1 + +/* Define to 1 if you have the `z' library (-lz). */ +#define HAVE_LIBZ 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYSLOG_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ENDIAN_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Defined if libcurl supports AsynchDNS */ +/* #undef LIBCURL_FEATURE_ASYNCHDNS */ + +/* Defined if libcurl supports IDN */ +#define LIBCURL_FEATURE_IDN 1 + +/* Defined if libcurl supports IPv6 */ +#define LIBCURL_FEATURE_IPV6 1 + +/* Defined if libcurl supports KRB4 */ +/* #undef LIBCURL_FEATURE_KRB4 */ + +/* Defined if libcurl supports libz */ +#define LIBCURL_FEATURE_LIBZ 1 + +/* Defined if libcurl supports NTLM */ +#define LIBCURL_FEATURE_NTLM 1 + +/* Defined if libcurl supports SSL */ +#define LIBCURL_FEATURE_SSL 1 + +/* Defined if libcurl supports SSPI */ +/* #undef LIBCURL_FEATURE_SSPI */ + +/* Defined if libcurl supports DICT */ +/* #undef LIBCURL_PROTOCOL_DICT */ + +/* Defined if libcurl supports FILE */ +#define LIBCURL_PROTOCOL_FILE 1 + +/* Defined if libcurl supports FTP */ +#define LIBCURL_PROTOCOL_FTP 1 + +/* Defined if libcurl supports FTPS */ +#define LIBCURL_PROTOCOL_FTPS 1 + +/* Defined if libcurl supports HTTP */ +#define LIBCURL_PROTOCOL_HTTP 1 + +/* Defined if libcurl supports HTTPS */ +#define LIBCURL_PROTOCOL_HTTPS 1 + +/* Defined if libcurl supports IMAP */ +/* #undef LIBCURL_PROTOCOL_IMAP */ + +/* Defined if libcurl supports LDAP */ +/* #undef LIBCURL_PROTOCOL_LDAP */ + +/* Defined if libcurl supports POP3 */ +/* #undef LIBCURL_PROTOCOL_POP3 */ + +/* Defined if libcurl supports RTSP */ +/* #undef LIBCURL_PROTOCOL_RTSP */ + +/* Defined if libcurl supports SMTP */ +/* #undef LIBCURL_PROTOCOL_SMTP */ + +/* Defined if libcurl supports TELNET */ +/* #undef LIBCURL_PROTOCOL_TELNET */ + +/* Defined if libcurl supports TFTP */ +/* #undef LIBCURL_PROTOCOL_TFTP */ + +/* Define to 1 if your C compiler doesn't accept -c and -o together. */ +/* #undef NO_MINUS_C_MINUS_O */ + +/* Name of package */ +#define PACKAGE "ccminer" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. 
*/ +#define PACKAGE_NAME "ccminer" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://github.com/tpruvot/ccminer" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "2.3.1" + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at runtime. + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ +/* #undef STACK_DIRECTION */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define curl_free() as free() if our version of curl lacks curl_free. */ +/* #undef curl_free */ + +/* Define to `unsigned int' if does not define. */ +//#define size_t unsigned int + +#if !defined(HAVE_STRUCT_TIMESPEC) && _MSC_VER >= 1900 +#define HAVE_STRUCT_TIMESPEC +#endif diff --git a/compat/curl-for-windows/curl/include/curl/curl.h b/compat/curl-for-windows/curl/include/curl/curl.h new file mode 100644 index 0000000000..d40b2dbbf4 --- /dev/null +++ b/compat/curl-for-windows/curl/include/curl/curl.h @@ -0,0 +1,2336 @@ +#ifndef __CURL_CURL_H +#define __CURL_CURL_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2014, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +/* + * If you have libcurl problems, all docs and details are found here: + * http://curl.haxx.se/libcurl/ + * + * curl-library mailing list subscription and unsubscription web interface: + * http://cool.haxx.se/mailman/listinfo/curl-library/ + */ + +#include "curlver.h" /* libcurl version defines */ +#include "curlbuild.h" /* libcurl build definitions */ +#include "curlrules.h" /* libcurl rules enforcement */ + +/* + * Define WIN32 when build target is Win32 API + */ + +#if (defined(_WIN32) || defined(__WIN32__)) && \ + !defined(WIN32) && !defined(__SYMBIAN32__) +#define WIN32 +#endif + +#include +#include + +#if defined(__FreeBSD__) && (__FreeBSD__ >= 2) +/* Needed for __FreeBSD_version symbol definition */ +#include +#endif + +/* The include stuff here below is mainly for time_t! */ +#include +#include + +#if defined(WIN32) && !defined(_WIN32_WCE) && !defined(__CYGWIN__) +#if !(defined(_WINSOCKAPI_) || defined(_WINSOCK_H) || defined(__LWIP_OPT_H__)) +/* The check above prevents the winsock2 inclusion if winsock.h already was + included, since they can't co-exist without problems */ +#include +#include +#endif +#endif + +/* HP-UX systems version 9, 10 and 11 lack sys/select.h and so does oldish + libc5-based Linux systems. Only include it on systems that are known to + require it! 
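
Editor's note: the HAVE_* macros in the ccminer-config.h added above are consumed in the usual autoconf style elsewhere in the tree. The fragment below is a hypothetical illustration of that pattern (the fallback be32enc helper mirrors the HAVE_DECL_BE32ENC entry above, but the surrounding file is invented for the example).

    /* Hypothetical consumer of the config macros above (illustrative only). */
    #include "ccminer-config.h"
    #include <stdint.h>

    #ifdef HAVE_SYSLOG_H
    #include <syslog.h>                /* only pulled in when the configure probe succeeded */
    #endif

    #if !HAVE_DECL_BE32ENC             /* declared 0 above, so provide a local fallback */
    static inline void be32enc(void *pp, uint32_t x)
    {
        uint8_t *p = (uint8_t *)pp;
        p[0] = (x >> 24) & 0xff; p[1] = (x >> 16) & 0xff;
        p[2] = (x >> 8) & 0xff;  p[3] = x & 0xff;
    }
    #endif
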
*/ +#if defined(_AIX) || defined(__NOVELL_LIBC__) || defined(__NetBSD__) || \ + defined(__minix) || defined(__SYMBIAN32__) || defined(__INTEGRITY) || \ + defined(ANDROID) || defined(__ANDROID__) || defined(__OpenBSD__) || \ + (defined(__FreeBSD_version) && (__FreeBSD_version < 800000)) +#include +#endif + +#if !defined(WIN32) && !defined(_WIN32_WCE) +#include +#endif + +#if !defined(WIN32) && !defined(__WATCOMC__) && !defined(__VXWORKS__) +#include +#endif + +#ifdef __BEOS__ +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void CURL; + +/* + * libcurl external API function linkage decorations. + */ + +#ifdef CURL_STATICLIB +# define CURL_EXTERN +#elif defined(WIN32) || defined(_WIN32) || defined(__SYMBIAN32__) +# if defined(BUILDING_LIBCURL) +# define CURL_EXTERN __declspec(dllexport) +# else +# define CURL_EXTERN __declspec(dllimport) +# endif +#elif defined(BUILDING_LIBCURL) && defined(CURL_HIDDEN_SYMBOLS) +# define CURL_EXTERN CURL_EXTERN_SYMBOL +#else +# define CURL_EXTERN +#endif + +#ifndef curl_socket_typedef +/* socket typedef */ +#if defined(WIN32) && !defined(__LWIP_OPT_H__) +typedef SOCKET curl_socket_t; +#define CURL_SOCKET_BAD INVALID_SOCKET +#else +typedef int curl_socket_t; +#define CURL_SOCKET_BAD -1 +#endif +#define curl_socket_typedef +#endif /* curl_socket_typedef */ + +struct curl_httppost { + struct curl_httppost *next; /* next entry in the list */ + char *name; /* pointer to allocated name */ + long namelength; /* length of name length */ + char *contents; /* pointer to allocated data contents */ + long contentslength; /* length of contents field */ + char *buffer; /* pointer to allocated buffer contents */ + long bufferlength; /* length of buffer field */ + char *contenttype; /* Content-Type */ + struct curl_slist* contentheader; /* list of extra headers for this form */ + struct curl_httppost *more; /* if one field name has more than one + file, this link should link to following + files */ + long flags; /* as defined below */ +#define HTTPPOST_FILENAME (1<<0) /* specified content is a file name */ +#define HTTPPOST_READFILE (1<<1) /* specified content is a file name */ +#define HTTPPOST_PTRNAME (1<<2) /* name is only stored pointer + do not free in formfree */ +#define HTTPPOST_PTRCONTENTS (1<<3) /* contents is only stored pointer + do not free in formfree */ +#define HTTPPOST_BUFFER (1<<4) /* upload file from buffer */ +#define HTTPPOST_PTRBUFFER (1<<5) /* upload file from pointer contents */ +#define HTTPPOST_CALLBACK (1<<6) /* upload file contents by using the + regular read callback to get the data + and pass the given pointer as custom + pointer */ + + char *showfilename; /* The file name to show. If not set, the + actual file name will be used (if this + is a file part) */ + void *userp; /* custom pointer used for + HTTPPOST_CALLBACK posts */ +}; + +/* This is the CURLOPT_PROGRESSFUNCTION callback proto. It is now considered + deprecated but was the only choice up until 7.31.0 */ +typedef int (*curl_progress_callback)(void *clientp, + double dltotal, + double dlnow, + double ultotal, + double ulnow); + +/* This is the CURLOPT_XFERINFOFUNCTION callback proto. It was introduced in + 7.32.0, it avoids floating point and provides more detailed information. 
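
Editor's note: the header above points out that curl_progress_callback is deprecated in favour of curl_xferinfo_callback (added in 7.32.0). A minimal sketch of wiring the newer callback is shown below; the handle setup and URL are illustrative only and not taken from ccminer.

    // Hypothetical CURLOPT_XFERINFOFUNCTION wiring (illustrative only).
    #include <curl/curl.h>
    #include <cstdio>

    static int xferinfo(void *clientp, curl_off_t dltotal, curl_off_t dlnow,
                        curl_off_t ultotal, curl_off_t ulnow)
    {
        (void)clientp; (void)ultotal; (void)ulnow;
        fprintf(stderr, "down %ld/%ld bytes\r", (long)dlnow, (long)dltotal);
        return 0;                        // returning non-zero aborts the transfer
    }

    int main()
    {
        CURL *curl = curl_easy_init();
        if (!curl) return 1;
        curl_easy_setopt(curl, CURLOPT_URL, "http://example.com/");
        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);              // enable progress reporting
        curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, xferinfo);  // new-style, integer-based callback
        CURLcode rc = curl_easy_perform(curl);
        curl_easy_cleanup(curl);
        return rc == CURLE_OK ? 0 : 1;
    }
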
*/ +typedef int (*curl_xferinfo_callback)(void *clientp, + curl_off_t dltotal, + curl_off_t dlnow, + curl_off_t ultotal, + curl_off_t ulnow); + +#ifndef CURL_MAX_WRITE_SIZE + /* Tests have proven that 20K is a very bad buffer size for uploads on + Windows, while 16K for some odd reason performed a lot better. + We do the ifndef check to allow this value to easier be changed at build + time for those who feel adventurous. The practical minimum is about + 400 bytes since libcurl uses a buffer of this size as a scratch area + (unrelated to network send operations). */ +#define CURL_MAX_WRITE_SIZE 16384 +#endif + +#ifndef CURL_MAX_HTTP_HEADER +/* The only reason to have a max limit for this is to avoid the risk of a bad + server feeding libcurl with a never-ending header that will cause reallocs + infinitely */ +#define CURL_MAX_HTTP_HEADER (100*1024) +#endif + +/* This is a magic return code for the write callback that, when returned, + will signal libcurl to pause receiving on the current transfer. */ +#define CURL_WRITEFUNC_PAUSE 0x10000001 + +typedef size_t (*curl_write_callback)(char *buffer, + size_t size, + size_t nitems, + void *outstream); + + + +/* enumeration of file types */ +typedef enum { + CURLFILETYPE_FILE = 0, + CURLFILETYPE_DIRECTORY, + CURLFILETYPE_SYMLINK, + CURLFILETYPE_DEVICE_BLOCK, + CURLFILETYPE_DEVICE_CHAR, + CURLFILETYPE_NAMEDPIPE, + CURLFILETYPE_SOCKET, + CURLFILETYPE_DOOR, /* is possible only on Sun Solaris now */ + + CURLFILETYPE_UNKNOWN /* should never occur */ +} curlfiletype; + +#define CURLFINFOFLAG_KNOWN_FILENAME (1<<0) +#define CURLFINFOFLAG_KNOWN_FILETYPE (1<<1) +#define CURLFINFOFLAG_KNOWN_TIME (1<<2) +#define CURLFINFOFLAG_KNOWN_PERM (1<<3) +#define CURLFINFOFLAG_KNOWN_UID (1<<4) +#define CURLFINFOFLAG_KNOWN_GID (1<<5) +#define CURLFINFOFLAG_KNOWN_SIZE (1<<6) +#define CURLFINFOFLAG_KNOWN_HLINKCOUNT (1<<7) + +/* Content of this structure depends on information which is known and is + achievable (e.g. by FTP LIST parsing). Please see the url_easy_setopt(3) man + page for callbacks returning this structure -- some fields are mandatory, + some others are optional. The FLAG field has special meaning. */ +struct curl_fileinfo { + char *filename; + curlfiletype filetype; + time_t time; + unsigned int perm; + int uid; + int gid; + curl_off_t size; + long int hardlinks; + + struct { + /* If some of these fields is not NULL, it is a pointer to b_data. */ + char *time; + char *perm; + char *user; + char *group; + char *target; /* pointer to the target filename of a symlink */ + } strings; + + unsigned int flags; + + /* used internally */ + char * b_data; + size_t b_size; + size_t b_used; +}; + +/* return codes for CURLOPT_CHUNK_BGN_FUNCTION */ +#define CURL_CHUNK_BGN_FUNC_OK 0 +#define CURL_CHUNK_BGN_FUNC_FAIL 1 /* tell the lib to end the task */ +#define CURL_CHUNK_BGN_FUNC_SKIP 2 /* skip this chunk over */ + +/* if splitting of data transfer is enabled, this callback is called before + download of an individual chunk started. Note that parameter "remains" works + only for FTP wildcard downloading (for now), otherwise is not used */ +typedef long (*curl_chunk_bgn_callback)(const void *transfer_info, + void *ptr, + int remains); + +/* return codes for CURLOPT_CHUNK_END_FUNCTION */ +#define CURL_CHUNK_END_FUNC_OK 0 +#define CURL_CHUNK_END_FUNC_FAIL 1 /* tell the lib to end the task */ + +/* If splitting of data transfer is enabled this callback is called after + download of an individual chunk finished. + Note! 
After this callback was set then it have to be called FOR ALL chunks. + Even if downloading of this chunk was skipped in CHUNK_BGN_FUNC. + This is the reason why we don't need "transfer_info" parameter in this + callback and we are not interested in "remains" parameter too. */ +typedef long (*curl_chunk_end_callback)(void *ptr); + +/* return codes for FNMATCHFUNCTION */ +#define CURL_FNMATCHFUNC_MATCH 0 /* string corresponds to the pattern */ +#define CURL_FNMATCHFUNC_NOMATCH 1 /* pattern doesn't match the string */ +#define CURL_FNMATCHFUNC_FAIL 2 /* an error occurred */ + +/* callback type for wildcard downloading pattern matching. If the + string matches the pattern, return CURL_FNMATCHFUNC_MATCH value, etc. */ +typedef int (*curl_fnmatch_callback)(void *ptr, + const char *pattern, + const char *string); + +/* These are the return codes for the seek callbacks */ +#define CURL_SEEKFUNC_OK 0 +#define CURL_SEEKFUNC_FAIL 1 /* fail the entire transfer */ +#define CURL_SEEKFUNC_CANTSEEK 2 /* tell libcurl seeking can't be done, so + libcurl might try other means instead */ +typedef int (*curl_seek_callback)(void *instream, + curl_off_t offset, + int origin); /* 'whence' */ + +/* This is a return code for the read callback that, when returned, will + signal libcurl to immediately abort the current transfer. */ +#define CURL_READFUNC_ABORT 0x10000000 +/* This is a return code for the read callback that, when returned, will + signal libcurl to pause sending data on the current transfer. */ +#define CURL_READFUNC_PAUSE 0x10000001 + +typedef size_t (*curl_read_callback)(char *buffer, + size_t size, + size_t nitems, + void *instream); + +typedef enum { + CURLSOCKTYPE_IPCXN, /* socket created for a specific IP connection */ + CURLSOCKTYPE_ACCEPT, /* socket created by accept() call */ + CURLSOCKTYPE_LAST /* never use */ +} curlsocktype; + +/* The return code from the sockopt_callback can signal information back + to libcurl: */ +#define CURL_SOCKOPT_OK 0 +#define CURL_SOCKOPT_ERROR 1 /* causes libcurl to abort and return + CURLE_ABORTED_BY_CALLBACK */ +#define CURL_SOCKOPT_ALREADY_CONNECTED 2 + +typedef int (*curl_sockopt_callback)(void *clientp, + curl_socket_t curlfd, + curlsocktype purpose); + +struct curl_sockaddr { + int family; + int socktype; + int protocol; + unsigned int addrlen; /* addrlen was a socklen_t type before 7.18.0 but it + turned really ugly and painful on the systems that + lack this type */ + struct sockaddr addr; +}; + +typedef curl_socket_t +(*curl_opensocket_callback)(void *clientp, + curlsocktype purpose, + struct curl_sockaddr *address); + +typedef int +(*curl_closesocket_callback)(void *clientp, curl_socket_t item); + +typedef enum { + CURLIOE_OK, /* I/O operation successful */ + CURLIOE_UNKNOWNCMD, /* command was unknown to callback */ + CURLIOE_FAILRESTART, /* failed to restart the read */ + CURLIOE_LAST /* never use */ +} curlioerr; + +typedef enum { + CURLIOCMD_NOP, /* no operation */ + CURLIOCMD_RESTARTREAD, /* restart the read stream from start */ + CURLIOCMD_LAST /* never use */ +} curliocmd; + +typedef curlioerr (*curl_ioctl_callback)(CURL *handle, + int cmd, + void *clientp); + +/* + * The following typedef's are signatures of malloc, free, realloc, strdup and + * calloc respectively. Function pointers of these types can be passed to the + * curl_global_init_mem() function to set user defined memory management + * callback routines. 
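
Editor's note: the malloc/free/realloc/strdup/calloc typedefs above exist so an application can route libcurl's allocations through its own allocator via curl_global_init_mem(). A hedged sketch of the call shape follows; it simply forwards to the standard C allocator and is not code from this patch.

    // Hypothetical curl_global_init_mem() wiring (illustrative only).
    #include <curl/curl.h>
    #include <cstdlib>
    #include <cstring>

    static void *my_malloc(size_t n)            { return malloc(n); }
    static void  my_free(void *p)               { free(p); }
    static void *my_realloc(void *p, size_t n)  { return realloc(p, n); }
    static void *my_calloc(size_t n, size_t sz) { return calloc(n, sz); }
    static char *my_strdup(const char *s)
    {
        size_t n = strlen(s) + 1;
        char *p = (char *)malloc(n);
        return p ? (char *)memcpy(p, s, n) : NULL;
    }

    int main()
    {
        // Must be called before any other libcurl function, exactly once.
        CURLcode rc = curl_global_init_mem(CURL_GLOBAL_DEFAULT,
                                           my_malloc, my_free, my_realloc,
                                           my_strdup, my_calloc);
        if (rc != CURLE_OK) return 1;
        /* ... use libcurl as usual ... */
        curl_global_cleanup();
        return 0;
    }
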
+ */ +typedef void *(*curl_malloc_callback)(size_t size); +typedef void (*curl_free_callback)(void *ptr); +typedef void *(*curl_realloc_callback)(void *ptr, size_t size); +typedef char *(*curl_strdup_callback)(const char *str); +typedef void *(*curl_calloc_callback)(size_t nmemb, size_t size); + +/* the kind of data that is passed to information_callback*/ +typedef enum { + CURLINFO_TEXT = 0, + CURLINFO_HEADER_IN, /* 1 */ + CURLINFO_HEADER_OUT, /* 2 */ + CURLINFO_DATA_IN, /* 3 */ + CURLINFO_DATA_OUT, /* 4 */ + CURLINFO_SSL_DATA_IN, /* 5 */ + CURLINFO_SSL_DATA_OUT, /* 6 */ + CURLINFO_END +} curl_infotype; + +typedef int (*curl_debug_callback) + (CURL *handle, /* the handle/transfer this concerns */ + curl_infotype type, /* what kind of data */ + char *data, /* points to the data */ + size_t size, /* size of the data pointed to */ + void *userptr); /* whatever the user please */ + +/* All possible error codes from all sorts of curl functions. Future versions + may return other values, stay prepared. + + Always add new return codes last. Never *EVER* remove any. The return + codes must remain the same! + */ + +typedef enum { + CURLE_OK = 0, + CURLE_UNSUPPORTED_PROTOCOL, /* 1 */ + CURLE_FAILED_INIT, /* 2 */ + CURLE_URL_MALFORMAT, /* 3 */ + CURLE_NOT_BUILT_IN, /* 4 - [was obsoleted in August 2007 for + 7.17.0, reused in April 2011 for 7.21.5] */ + CURLE_COULDNT_RESOLVE_PROXY, /* 5 */ + CURLE_COULDNT_RESOLVE_HOST, /* 6 */ + CURLE_COULDNT_CONNECT, /* 7 */ + CURLE_FTP_WEIRD_SERVER_REPLY, /* 8 */ + CURLE_REMOTE_ACCESS_DENIED, /* 9 a service was denied by the server + due to lack of access - when login fails + this is not returned. */ + CURLE_FTP_ACCEPT_FAILED, /* 10 - [was obsoleted in April 2006 for + 7.15.4, reused in Dec 2011 for 7.24.0]*/ + CURLE_FTP_WEIRD_PASS_REPLY, /* 11 */ + CURLE_FTP_ACCEPT_TIMEOUT, /* 12 - timeout occurred accepting server + [was obsoleted in August 2007 for 7.17.0, + reused in Dec 2011 for 7.24.0]*/ + CURLE_FTP_WEIRD_PASV_REPLY, /* 13 */ + CURLE_FTP_WEIRD_227_FORMAT, /* 14 */ + CURLE_FTP_CANT_GET_HOST, /* 15 */ + CURLE_HTTP2, /* 16 - A problem in the http2 framing layer. 
+ [was obsoleted in August 2007 for 7.17.0, + reused in July 2014 for 7.38.0] */ + CURLE_FTP_COULDNT_SET_TYPE, /* 17 */ + CURLE_PARTIAL_FILE, /* 18 */ + CURLE_FTP_COULDNT_RETR_FILE, /* 19 */ + CURLE_OBSOLETE20, /* 20 - NOT USED */ + CURLE_QUOTE_ERROR, /* 21 - quote command failure */ + CURLE_HTTP_RETURNED_ERROR, /* 22 */ + CURLE_WRITE_ERROR, /* 23 */ + CURLE_OBSOLETE24, /* 24 - NOT USED */ + CURLE_UPLOAD_FAILED, /* 25 - failed upload "command" */ + CURLE_READ_ERROR, /* 26 - couldn't open/read from file */ + CURLE_OUT_OF_MEMORY, /* 27 */ + /* Note: CURLE_OUT_OF_MEMORY may sometimes indicate a conversion error + instead of a memory allocation error if CURL_DOES_CONVERSIONS + is defined + */ + CURLE_OPERATION_TIMEDOUT, /* 28 - the timeout time was reached */ + CURLE_OBSOLETE29, /* 29 - NOT USED */ + CURLE_FTP_PORT_FAILED, /* 30 - FTP PORT operation failed */ + CURLE_FTP_COULDNT_USE_REST, /* 31 - the REST command failed */ + CURLE_OBSOLETE32, /* 32 - NOT USED */ + CURLE_RANGE_ERROR, /* 33 - RANGE "command" didn't work */ + CURLE_HTTP_POST_ERROR, /* 34 */ + CURLE_SSL_CONNECT_ERROR, /* 35 - wrong when connecting with SSL */ + CURLE_BAD_DOWNLOAD_RESUME, /* 36 - couldn't resume download */ + CURLE_FILE_COULDNT_READ_FILE, /* 37 */ + CURLE_LDAP_CANNOT_BIND, /* 38 */ + CURLE_LDAP_SEARCH_FAILED, /* 39 */ + CURLE_OBSOLETE40, /* 40 - NOT USED */ + CURLE_FUNCTION_NOT_FOUND, /* 41 */ + CURLE_ABORTED_BY_CALLBACK, /* 42 */ + CURLE_BAD_FUNCTION_ARGUMENT, /* 43 */ + CURLE_OBSOLETE44, /* 44 - NOT USED */ + CURLE_INTERFACE_FAILED, /* 45 - CURLOPT_INTERFACE failed */ + CURLE_OBSOLETE46, /* 46 - NOT USED */ + CURLE_TOO_MANY_REDIRECTS , /* 47 - catch endless re-direct loops */ + CURLE_UNKNOWN_OPTION, /* 48 - User specified an unknown option */ + CURLE_TELNET_OPTION_SYNTAX , /* 49 - Malformed telnet option */ + CURLE_OBSOLETE50, /* 50 - NOT USED */ + CURLE_PEER_FAILED_VERIFICATION, /* 51 - peer's certificate or fingerprint + wasn't verified fine */ + CURLE_GOT_NOTHING, /* 52 - when this is a specific error */ + CURLE_SSL_ENGINE_NOTFOUND, /* 53 - SSL crypto engine not found */ + CURLE_SSL_ENGINE_SETFAILED, /* 54 - can not set SSL crypto engine as + default */ + CURLE_SEND_ERROR, /* 55 - failed sending network data */ + CURLE_RECV_ERROR, /* 56 - failure in receiving network data */ + CURLE_OBSOLETE57, /* 57 - NOT IN USE */ + CURLE_SSL_CERTPROBLEM, /* 58 - problem with the local certificate */ + CURLE_SSL_CIPHER, /* 59 - couldn't use specified cipher */ + CURLE_SSL_CACERT, /* 60 - problem with the CA cert (path?) 
*/ + CURLE_BAD_CONTENT_ENCODING, /* 61 - Unrecognized/bad encoding */ + CURLE_LDAP_INVALID_URL, /* 62 - Invalid LDAP URL */ + CURLE_FILESIZE_EXCEEDED, /* 63 - Maximum file size exceeded */ + CURLE_USE_SSL_FAILED, /* 64 - Requested FTP SSL level failed */ + CURLE_SEND_FAIL_REWIND, /* 65 - Sending the data requires a rewind + that failed */ + CURLE_SSL_ENGINE_INITFAILED, /* 66 - failed to initialise ENGINE */ + CURLE_LOGIN_DENIED, /* 67 - user, password or similar was not + accepted and we failed to login */ + CURLE_TFTP_NOTFOUND, /* 68 - file not found on server */ + CURLE_TFTP_PERM, /* 69 - permission problem on server */ + CURLE_REMOTE_DISK_FULL, /* 70 - out of disk space on server */ + CURLE_TFTP_ILLEGAL, /* 71 - Illegal TFTP operation */ + CURLE_TFTP_UNKNOWNID, /* 72 - Unknown transfer ID */ + CURLE_REMOTE_FILE_EXISTS, /* 73 - File already exists */ + CURLE_TFTP_NOSUCHUSER, /* 74 - No such user */ + CURLE_CONV_FAILED, /* 75 - conversion failed */ + CURLE_CONV_REQD, /* 76 - caller must register conversion + callbacks using curl_easy_setopt options + CURLOPT_CONV_FROM_NETWORK_FUNCTION, + CURLOPT_CONV_TO_NETWORK_FUNCTION, and + CURLOPT_CONV_FROM_UTF8_FUNCTION */ + CURLE_SSL_CACERT_BADFILE, /* 77 - could not load CACERT file, missing + or wrong format */ + CURLE_REMOTE_FILE_NOT_FOUND, /* 78 - remote file not found */ + CURLE_SSH, /* 79 - error from the SSH layer, somewhat + generic so the error message will be of + interest when this has happened */ + + CURLE_SSL_SHUTDOWN_FAILED, /* 80 - Failed to shut down the SSL + connection */ + CURLE_AGAIN, /* 81 - socket is not ready for send/recv, + wait till it's ready and try again (Added + in 7.18.2) */ + CURLE_SSL_CRL_BADFILE, /* 82 - could not load CRL file, missing or + wrong format (Added in 7.19.0) */ + CURLE_SSL_ISSUER_ERROR, /* 83 - Issuer check failed. (Added in + 7.19.0) */ + CURLE_FTP_PRET_FAILED, /* 84 - a PRET command failed */ + CURLE_RTSP_CSEQ_ERROR, /* 85 - mismatch of RTSP CSeq numbers */ + CURLE_RTSP_SESSION_ERROR, /* 86 - mismatch of RTSP Session Ids */ + CURLE_FTP_BAD_FILE_LIST, /* 87 - unable to parse FTP file list */ + CURLE_CHUNK_FAILED, /* 88 - chunk callback reported error */ + CURLE_NO_CONNECTION_AVAILABLE, /* 89 - No connection available, the + session will be queued */ + CURL_LAST /* never use! */ +} CURLcode; + +#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all + the obsolete stuff removed! */ + +/* Previously obsolete error code re-used in 7.38.0 */ +#define CURLE_OBSOLETE16 CURLE_HTTP2 + +/* Previously obsolete error codes re-used in 7.24.0 */ +#define CURLE_OBSOLETE10 CURLE_FTP_ACCEPT_FAILED +#define CURLE_OBSOLETE12 CURLE_FTP_ACCEPT_TIMEOUT + +/* compatibility with older names */ +#define CURLOPT_ENCODING CURLOPT_ACCEPT_ENCODING + +/* The following were added in 7.21.5, April 2011 */ +#define CURLE_UNKNOWN_TELNET_OPTION CURLE_UNKNOWN_OPTION + +/* The following were added in 7.17.1 */ +/* These are scheduled to disappear by 2009 */ +#define CURLE_SSL_PEER_CERTIFICATE CURLE_PEER_FAILED_VERIFICATION + +/* The following were added in 7.17.0 */ +/* These are scheduled to disappear by 2009 */ +#define CURLE_OBSOLETE CURLE_OBSOLETE50 /* no one should be using this! 
*/ +#define CURLE_BAD_PASSWORD_ENTERED CURLE_OBSOLETE46 +#define CURLE_BAD_CALLING_ORDER CURLE_OBSOLETE44 +#define CURLE_FTP_USER_PASSWORD_INCORRECT CURLE_OBSOLETE10 +#define CURLE_FTP_CANT_RECONNECT CURLE_OBSOLETE16 +#define CURLE_FTP_COULDNT_GET_SIZE CURLE_OBSOLETE32 +#define CURLE_FTP_COULDNT_SET_ASCII CURLE_OBSOLETE29 +#define CURLE_FTP_WEIRD_USER_REPLY CURLE_OBSOLETE12 +#define CURLE_FTP_WRITE_ERROR CURLE_OBSOLETE20 +#define CURLE_LIBRARY_NOT_FOUND CURLE_OBSOLETE40 +#define CURLE_MALFORMAT_USER CURLE_OBSOLETE24 +#define CURLE_SHARE_IN_USE CURLE_OBSOLETE57 +#define CURLE_URL_MALFORMAT_USER CURLE_NOT_BUILT_IN + +#define CURLE_FTP_ACCESS_DENIED CURLE_REMOTE_ACCESS_DENIED +#define CURLE_FTP_COULDNT_SET_BINARY CURLE_FTP_COULDNT_SET_TYPE +#define CURLE_FTP_QUOTE_ERROR CURLE_QUOTE_ERROR +#define CURLE_TFTP_DISKFULL CURLE_REMOTE_DISK_FULL +#define CURLE_TFTP_EXISTS CURLE_REMOTE_FILE_EXISTS +#define CURLE_HTTP_RANGE_ERROR CURLE_RANGE_ERROR +#define CURLE_FTP_SSL_FAILED CURLE_USE_SSL_FAILED + +/* The following were added earlier */ + +#define CURLE_OPERATION_TIMEOUTED CURLE_OPERATION_TIMEDOUT + +#define CURLE_HTTP_NOT_FOUND CURLE_HTTP_RETURNED_ERROR +#define CURLE_HTTP_PORT_FAILED CURLE_INTERFACE_FAILED +#define CURLE_FTP_COULDNT_STOR_FILE CURLE_UPLOAD_FAILED + +#define CURLE_FTP_PARTIAL_FILE CURLE_PARTIAL_FILE +#define CURLE_FTP_BAD_DOWNLOAD_RESUME CURLE_BAD_DOWNLOAD_RESUME + +/* This was the error code 50 in 7.7.3 and a few earlier versions, this + is no longer used by libcurl but is instead #defined here only to not + make programs break */ +#define CURLE_ALREADY_COMPLETE 99999 + +/* Provide defines for really old option names */ +#define CURLOPT_FILE CURLOPT_WRITEDATA /* name changed in 7.9.7 */ +#define CURLOPT_INFILE CURLOPT_READDATA /* name changed in 7.9.7 */ +#define CURLOPT_WRITEHEADER CURLOPT_HEADERDATA + +/* Since long deprecated options with no code in the lib that does anything + with them. */ +#define CURLOPT_WRITEINFO CURLOPT_OBSOLETE40 +#define CURLOPT_CLOSEPOLICY CURLOPT_OBSOLETE72 + +#endif /*!CURL_NO_OLDIES*/ + +/* This prototype applies to all conversion callbacks */ +typedef CURLcode (*curl_conv_callback)(char *buffer, size_t length); + +typedef CURLcode (*curl_ssl_ctx_callback)(CURL *curl, /* easy handle */ + void *ssl_ctx, /* actually an + OpenSSL SSL_CTX */ + void *userptr); + +typedef enum { + CURLPROXY_HTTP = 0, /* added in 7.10, new in 7.19.4 default is to use + CONNECT HTTP/1.1 */ + CURLPROXY_HTTP_1_0 = 1, /* added in 7.19.4, force to use CONNECT + HTTP/1.0 */ + CURLPROXY_SOCKS4 = 4, /* support added in 7.15.2, enum existed already + in 7.10 */ + CURLPROXY_SOCKS5 = 5, /* added in 7.10 */ + CURLPROXY_SOCKS4A = 6, /* added in 7.18.0 */ + CURLPROXY_SOCKS5_HOSTNAME = 7 /* Use the SOCKS5 protocol but pass along the + host name rather than the IP address. 
added + in 7.18.0 */ +} curl_proxytype; /* this enum was added in 7.10 */ + +/* + * Bitmasks for CURLOPT_HTTPAUTH and CURLOPT_PROXYAUTH options: + * + * CURLAUTH_NONE - No HTTP authentication + * CURLAUTH_BASIC - HTTP Basic authentication (default) + * CURLAUTH_DIGEST - HTTP Digest authentication + * CURLAUTH_NEGOTIATE - HTTP Negotiate (SPNEGO) authentication + * CURLAUTH_GSSNEGOTIATE - Alias for CURLAUTH_NEGOTIATE (deprecated) + * CURLAUTH_NTLM - HTTP NTLM authentication + * CURLAUTH_DIGEST_IE - HTTP Digest authentication with IE flavour + * CURLAUTH_NTLM_WB - HTTP NTLM authentication delegated to winbind helper + * CURLAUTH_ONLY - Use together with a single other type to force no + * authentication or just that single type + * CURLAUTH_ANY - All fine types set + * CURLAUTH_ANYSAFE - All fine types except Basic + */ + +#define CURLAUTH_NONE ((unsigned long)0) +#define CURLAUTH_BASIC (((unsigned long)1)<<0) +#define CURLAUTH_DIGEST (((unsigned long)1)<<1) +#define CURLAUTH_NEGOTIATE (((unsigned long)1)<<2) +/* Deprecated since the advent of CURLAUTH_NEGOTIATE */ +#define CURLAUTH_GSSNEGOTIATE CURLAUTH_NEGOTIATE +#define CURLAUTH_NTLM (((unsigned long)1)<<3) +#define CURLAUTH_DIGEST_IE (((unsigned long)1)<<4) +#define CURLAUTH_NTLM_WB (((unsigned long)1)<<5) +#define CURLAUTH_ONLY (((unsigned long)1)<<31) +#define CURLAUTH_ANY (~CURLAUTH_DIGEST_IE) +#define CURLAUTH_ANYSAFE (~(CURLAUTH_BASIC|CURLAUTH_DIGEST_IE)) + +#define CURLSSH_AUTH_ANY ~0 /* all types supported by the server */ +#define CURLSSH_AUTH_NONE 0 /* none allowed, silly but complete */ +#define CURLSSH_AUTH_PUBLICKEY (1<<0) /* public/private key files */ +#define CURLSSH_AUTH_PASSWORD (1<<1) /* password */ +#define CURLSSH_AUTH_HOST (1<<2) /* host key files */ +#define CURLSSH_AUTH_KEYBOARD (1<<3) /* keyboard interactive */ +#define CURLSSH_AUTH_AGENT (1<<4) /* agent (ssh-agent, pageant...) */ +#define CURLSSH_AUTH_DEFAULT CURLSSH_AUTH_ANY + +#define CURLGSSAPI_DELEGATION_NONE 0 /* no delegation (default) */ +#define CURLGSSAPI_DELEGATION_POLICY_FLAG (1<<0) /* if permitted by policy */ +#define CURLGSSAPI_DELEGATION_FLAG (1<<1) /* delegate always */ + +#define CURL_ERROR_SIZE 256 + +enum curl_khtype { + CURLKHTYPE_UNKNOWN, + CURLKHTYPE_RSA1, + CURLKHTYPE_RSA, + CURLKHTYPE_DSS +}; + +struct curl_khkey { + const char *key; /* points to a zero-terminated string encoded with base64 + if len is zero, otherwise to the "raw" data */ + size_t len; + enum curl_khtype keytype; +}; + +/* this is the set of return values expected from the curl_sshkeycallback + callback */ +enum curl_khstat { + CURLKHSTAT_FINE_ADD_TO_FILE, + CURLKHSTAT_FINE, + CURLKHSTAT_REJECT, /* reject the connection, return an error */ + CURLKHSTAT_DEFER, /* do not accept it, but we can't answer right now so + this causes a CURLE_DEFER error but otherwise the + connection will be left intact etc */ + CURLKHSTAT_LAST /* not for use, only a marker for last-in-list */ +}; + +/* this is the set of status codes pass in to the callback */ +enum curl_khmatch { + CURLKHMATCH_OK, /* match */ + CURLKHMATCH_MISMATCH, /* host found, key mismatch! 
*/ + CURLKHMATCH_MISSING, /* no matching host/key found */ + CURLKHMATCH_LAST /* not for use, only a marker for last-in-list */ +}; + +typedef int + (*curl_sshkeycallback) (CURL *easy, /* easy handle */ + const struct curl_khkey *knownkey, /* known */ + const struct curl_khkey *foundkey, /* found */ + enum curl_khmatch, /* libcurl's view on the keys */ + void *clientp); /* custom pointer passed from app */ + +/* parameter for the CURLOPT_USE_SSL option */ +typedef enum { + CURLUSESSL_NONE, /* do not attempt to use SSL */ + CURLUSESSL_TRY, /* try using SSL, proceed anyway otherwise */ + CURLUSESSL_CONTROL, /* SSL for the control connection or fail */ + CURLUSESSL_ALL, /* SSL for all communication or fail */ + CURLUSESSL_LAST /* not an option, never use */ +} curl_usessl; + +/* Definition of bits for the CURLOPT_SSL_OPTIONS argument: */ + +/* - ALLOW_BEAST tells libcurl to allow the BEAST SSL vulnerability in the + name of improving interoperability with older servers. Some SSL libraries + have introduced work-arounds for this flaw but those work-arounds sometimes + make the SSL communication fail. To regain functionality with those broken + servers, a user can this way allow the vulnerability back. */ +#define CURLSSLOPT_ALLOW_BEAST (1<<0) + +#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all + the obsolete stuff removed! */ + +/* Backwards compatibility with older names */ +/* These are scheduled to disappear by 2009 */ + +#define CURLFTPSSL_NONE CURLUSESSL_NONE +#define CURLFTPSSL_TRY CURLUSESSL_TRY +#define CURLFTPSSL_CONTROL CURLUSESSL_CONTROL +#define CURLFTPSSL_ALL CURLUSESSL_ALL +#define CURLFTPSSL_LAST CURLUSESSL_LAST +#define curl_ftpssl curl_usessl +#endif /*!CURL_NO_OLDIES*/ + +/* parameter for the CURLOPT_FTP_SSL_CCC option */ +typedef enum { + CURLFTPSSL_CCC_NONE, /* do not send CCC */ + CURLFTPSSL_CCC_PASSIVE, /* Let the server initiate the shutdown */ + CURLFTPSSL_CCC_ACTIVE, /* Initiate the shutdown */ + CURLFTPSSL_CCC_LAST /* not an option, never use */ +} curl_ftpccc; + +/* parameter for the CURLOPT_FTPSSLAUTH option */ +typedef enum { + CURLFTPAUTH_DEFAULT, /* let libcurl decide */ + CURLFTPAUTH_SSL, /* use "AUTH SSL" */ + CURLFTPAUTH_TLS, /* use "AUTH TLS" */ + CURLFTPAUTH_LAST /* not an option, never use */ +} curl_ftpauth; + +/* parameter for the CURLOPT_FTP_CREATE_MISSING_DIRS option */ +typedef enum { + CURLFTP_CREATE_DIR_NONE, /* do NOT create missing dirs! */ + CURLFTP_CREATE_DIR, /* (FTP/SFTP) if CWD fails, try MKD and then CWD + again if MKD succeeded, for SFTP this does + similar magic */ + CURLFTP_CREATE_DIR_RETRY, /* (FTP only) if CWD fails, try MKD and then CWD + again even if MKD failed! 
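The callback below is a sketch of the curl_sshkeycallback contract just defined: trust-on-first-use for unknown hosts, hard reject on a key mismatch. It would be installed through the CURLOPT_SSH_KEYFUNCTION / CURLOPT_SSH_KEYDATA options that appear further down; the policy itself is only an example, not a recommendation.

#include <curl/curl.h>

static int ssh_key_check(CURL *easy,
                         const struct curl_khkey *knownkey,
                         const struct curl_khkey *foundkey,
                         enum curl_khmatch match,
                         void *clientp)
{
  (void)easy; (void)knownkey; (void)foundkey; (void)clientp;
  switch(match) {
  case CURLKHMATCH_OK:
    return CURLKHSTAT_FINE;             /* key matches the known_hosts entry */
  case CURLKHMATCH_MISSING:
    return CURLKHSTAT_FINE_ADD_TO_FILE; /* first contact: remember the key */
  default:
    return CURLKHSTAT_REJECT;           /* mismatch: refuse the connection */
  }
}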
*/ + CURLFTP_CREATE_DIR_LAST /* not an option, never use */ +} curl_ftpcreatedir; + +/* parameter for the CURLOPT_FTP_FILEMETHOD option */ +typedef enum { + CURLFTPMETHOD_DEFAULT, /* let libcurl pick */ + CURLFTPMETHOD_MULTICWD, /* single CWD operation for each path part */ + CURLFTPMETHOD_NOCWD, /* no CWD at all */ + CURLFTPMETHOD_SINGLECWD, /* one CWD to full dir, then work on file */ + CURLFTPMETHOD_LAST /* not an option, never use */ +} curl_ftpmethod; + +/* bitmask defines for CURLOPT_HEADEROPT */ +#define CURLHEADER_UNIFIED 0 +#define CURLHEADER_SEPARATE (1<<0) + +/* CURLPROTO_ defines are for the CURLOPT_*PROTOCOLS options */ +#define CURLPROTO_HTTP (1<<0) +#define CURLPROTO_HTTPS (1<<1) +#define CURLPROTO_FTP (1<<2) +#define CURLPROTO_FTPS (1<<3) +#define CURLPROTO_SCP (1<<4) +#define CURLPROTO_SFTP (1<<5) +#define CURLPROTO_TELNET (1<<6) +#define CURLPROTO_LDAP (1<<7) +#define CURLPROTO_LDAPS (1<<8) +#define CURLPROTO_DICT (1<<9) +#define CURLPROTO_FILE (1<<10) +#define CURLPROTO_TFTP (1<<11) +#define CURLPROTO_IMAP (1<<12) +#define CURLPROTO_IMAPS (1<<13) +#define CURLPROTO_POP3 (1<<14) +#define CURLPROTO_POP3S (1<<15) +#define CURLPROTO_SMTP (1<<16) +#define CURLPROTO_SMTPS (1<<17) +#define CURLPROTO_RTSP (1<<18) +#define CURLPROTO_RTMP (1<<19) +#define CURLPROTO_RTMPT (1<<20) +#define CURLPROTO_RTMPE (1<<21) +#define CURLPROTO_RTMPTE (1<<22) +#define CURLPROTO_RTMPS (1<<23) +#define CURLPROTO_RTMPTS (1<<24) +#define CURLPROTO_GOPHER (1<<25) +#define CURLPROTO_ALL (~0) /* enable everything */ + +/* long may be 32 or 64 bits, but we should never depend on anything else + but 32 */ +#define CURLOPTTYPE_LONG 0 +#define CURLOPTTYPE_OBJECTPOINT 10000 +#define CURLOPTTYPE_FUNCTIONPOINT 20000 +#define CURLOPTTYPE_OFF_T 30000 + +/* name is uppercase CURLOPT_, + type is one of the defined CURLOPTTYPE_ + number is unique identifier */ +#ifdef CINIT +#undef CINIT +#endif + +#ifdef CURL_ISOCPP +#define CINIT(na,t,nu) CURLOPT_ ## na = CURLOPTTYPE_ ## t + nu +#else +/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. */ +#define LONG CURLOPTTYPE_LONG +#define OBJECTPOINT CURLOPTTYPE_OBJECTPOINT +#define FUNCTIONPOINT CURLOPTTYPE_FUNCTIONPOINT +#define OFF_T CURLOPTTYPE_OFF_T +#define CINIT(name,type,number) CURLOPT_/**/name = type + number +#endif + +/* + * This macro-mania below setups the CURLOPT_[what] enum, to be used with + * curl_easy_setopt(). The first argument in the CINIT() macro is the [what] + * word. + */ + +typedef enum { + /* This is the FILE * or void * the regular output should be written to. */ + CINIT(WRITEDATA, OBJECTPOINT, 1), + + /* The full URL to get/put */ + CINIT(URL, OBJECTPOINT, 2), + + /* Port number to connect to, if other than default. */ + CINIT(PORT, LONG, 3), + + /* Name of proxy to use. */ + CINIT(PROXY, OBJECTPOINT, 4), + + /* "user:password;options" to use when fetching. */ + CINIT(USERPWD, OBJECTPOINT, 5), + + /* "user:password" to use with proxy. */ + CINIT(PROXYUSERPWD, OBJECTPOINT, 6), + + /* Range to get, specified as an ASCII string. */ + CINIT(RANGE, OBJECTPOINT, 7), + + /* not used */ + + /* Specified file stream to upload from (use as input): */ + CINIT(READDATA, OBJECTPOINT, 9), + + /* Buffer to receive error messages in, must be at least CURL_ERROR_SIZE + * bytes big. If this is not used, error messages go to stderr instead: */ + CINIT(ERRORBUFFER, OBJECTPOINT, 10), + + /* Function that will be called to store the output (instead of fwrite). The + * parameters will use fwrite() syntax, make sure to follow them. 
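One consequence of the CINIT() encoding worth spelling out: an option's numeric value carries the argument type it expects, so value / 10000 selects the CURLOPTTYPE_ class and value % 10000 is the unique id (CURLOPT_URL below ends up as CURLOPTTYPE_OBJECTPOINT + 2, i.e. 10002). A small illustrative helper, not part of the libcurl API:

#include <curl/curl.h>

/* Illustration only: recover the CURLOPTTYPE_ class from an option value. */
static const char *curlopt_type_name(CURLoption opt)
{
  switch(((long)opt / 10000) * 10000) {
  case CURLOPTTYPE_LONG:          return "long";
  case CURLOPTTYPE_OBJECTPOINT:   return "object pointer";
  case CURLOPTTYPE_FUNCTIONPOINT: return "function pointer";
  case CURLOPTTYPE_OFF_T:         return "curl_off_t";
  default:                        return "unknown";
  }
}
/* curlopt_type_name(CURLOPT_URL)  -> "object pointer"
   curlopt_type_name(CURLOPT_PORT) -> "long" */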
*/ + CINIT(WRITEFUNCTION, FUNCTIONPOINT, 11), + + /* Function that will be called to read the input (instead of fread). The + * parameters will use fread() syntax, make sure to follow them. */ + CINIT(READFUNCTION, FUNCTIONPOINT, 12), + + /* Time-out the read operation after this amount of seconds */ + CINIT(TIMEOUT, LONG, 13), + + /* If the CURLOPT_INFILE is used, this can be used to inform libcurl about + * how large the file being sent really is. That allows better error + * checking and better verifies that the upload was successful. -1 means + * unknown size. + * + * For large file support, there is also a _LARGE version of the key + * which takes an off_t type, allowing platforms with larger off_t + * sizes to handle larger files. See below for INFILESIZE_LARGE. + */ + CINIT(INFILESIZE, LONG, 14), + + /* POST static input fields. */ + CINIT(POSTFIELDS, OBJECTPOINT, 15), + + /* Set the referrer page (needed by some CGIs) */ + CINIT(REFERER, OBJECTPOINT, 16), + + /* Set the FTP PORT string (interface name, named or numerical IP address) + Use i.e '-' to use default address. */ + CINIT(FTPPORT, OBJECTPOINT, 17), + + /* Set the User-Agent string (examined by some CGIs) */ + CINIT(USERAGENT, OBJECTPOINT, 18), + + /* If the download receives less than "low speed limit" bytes/second + * during "low speed time" seconds, the operations is aborted. + * You could i.e if you have a pretty high speed connection, abort if + * it is less than 2000 bytes/sec during 20 seconds. + */ + + /* Set the "low speed limit" */ + CINIT(LOW_SPEED_LIMIT, LONG, 19), + + /* Set the "low speed time" */ + CINIT(LOW_SPEED_TIME, LONG, 20), + + /* Set the continuation offset. + * + * Note there is also a _LARGE version of this key which uses + * off_t types, allowing for large file offsets on platforms which + * use larger-than-32-bit off_t's. Look below for RESUME_FROM_LARGE. + */ + CINIT(RESUME_FROM, LONG, 21), + + /* Set cookie in request: */ + CINIT(COOKIE, OBJECTPOINT, 22), + + /* This points to a linked list of headers, struct curl_slist kind. This + list is also used for RTSP (in spite of its name) */ + CINIT(HTTPHEADER, OBJECTPOINT, 23), + + /* This points to a linked list of post entries, struct curl_httppost */ + CINIT(HTTPPOST, OBJECTPOINT, 24), + + /* name of the file keeping your private SSL-certificate */ + CINIT(SSLCERT, OBJECTPOINT, 25), + + /* password for the SSL or SSH private key */ + CINIT(KEYPASSWD, OBJECTPOINT, 26), + + /* send TYPE parameter? */ + CINIT(CRLF, LONG, 27), + + /* send linked-list of QUOTE commands */ + CINIT(QUOTE, OBJECTPOINT, 28), + + /* send FILE * or void * to store headers to, if you use a callback it + is simply passed to the callback unmodified */ + CINIT(HEADERDATA, OBJECTPOINT, 29), + + /* point to a file to read the initial cookies from, also enables + "cookie awareness" */ + CINIT(COOKIEFILE, OBJECTPOINT, 31), + + /* What version to specifically try to use. + See CURL_SSLVERSION defines below. */ + CINIT(SSLVERSION, LONG, 32), + + /* What kind of HTTP time condition to use, see defines */ + CINIT(TIMECONDITION, LONG, 33), + + /* Time to use with the above condition. 
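A minimal sketch of the CURLOPT_WRITEFUNCTION / CURLOPT_WRITEDATA pairing described above. The callback follows the fwrite()-style parameters the comment mentions; curl_easy_init/perform/cleanup come from easy.h, and the helper name is made up.

#include <stdio.h>
#include <curl/curl.h>

/* Write callback: stream the response body into the FILE* passed via
 * CURLOPT_WRITEDATA. */
static size_t write_to_file(char *ptr, size_t size, size_t nmemb, void *userdata)
{
  return fwrite(ptr, size, nmemb, (FILE *)userdata);
}

static CURLcode download(const char *url, FILE *out)
{
  CURLcode rc;
  CURL *curl = curl_easy_init();
  if(!curl)
    return CURLE_FAILED_INIT;
  curl_easy_setopt(curl, CURLOPT_URL, url);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_to_file);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, out);
  rc = curl_easy_perform(curl);
  curl_easy_cleanup(curl);
  return rc;
}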
Specified in number of seconds + since 1 Jan 1970 */ + CINIT(TIMEVALUE, LONG, 34), + + /* 35 = OBSOLETE */ + + /* Custom request, for customizing the get command like + HTTP: DELETE, TRACE and others + FTP: to use a different list command + */ + CINIT(CUSTOMREQUEST, OBJECTPOINT, 36), + + /* HTTP request, for odd commands like DELETE, TRACE and others */ + CINIT(STDERR, OBJECTPOINT, 37), + + /* 38 is not used */ + + /* send linked-list of post-transfer QUOTE commands */ + CINIT(POSTQUOTE, OBJECTPOINT, 39), + + CINIT(OBSOLETE40, OBJECTPOINT, 40), /* OBSOLETE, do not use! */ + + CINIT(VERBOSE, LONG, 41), /* talk a lot */ + CINIT(HEADER, LONG, 42), /* throw the header out too */ + CINIT(NOPROGRESS, LONG, 43), /* shut off the progress meter */ + CINIT(NOBODY, LONG, 44), /* use HEAD to get http document */ + CINIT(FAILONERROR, LONG, 45), /* no output on http error codes >= 300 */ + CINIT(UPLOAD, LONG, 46), /* this is an upload */ + CINIT(POST, LONG, 47), /* HTTP POST method */ + CINIT(DIRLISTONLY, LONG, 48), /* bare names when listing directories */ + + CINIT(APPEND, LONG, 50), /* Append instead of overwrite on upload! */ + + /* Specify whether to read the user+password from the .netrc or the URL. + * This must be one of the CURL_NETRC_* enums below. */ + CINIT(NETRC, LONG, 51), + + CINIT(FOLLOWLOCATION, LONG, 52), /* use Location: Luke! */ + + CINIT(TRANSFERTEXT, LONG, 53), /* transfer data in text/ASCII format */ + CINIT(PUT, LONG, 54), /* HTTP PUT */ + + /* 55 = OBSOLETE */ + + /* DEPRECATED + * Function that will be called instead of the internal progress display + * function. This function should be defined as the curl_progress_callback + * prototype defines. */ + CINIT(PROGRESSFUNCTION, FUNCTIONPOINT, 56), + + /* Data passed to the CURLOPT_PROGRESSFUNCTION and CURLOPT_XFERINFOFUNCTION + callbacks */ + CINIT(PROGRESSDATA, OBJECTPOINT, 57), +#define CURLOPT_XFERINFODATA CURLOPT_PROGRESSDATA + + /* We want the referrer field set automatically when following locations */ + CINIT(AUTOREFERER, LONG, 58), + + /* Port of the proxy, can be set in the proxy string as well with: + "[host]:[port]" */ + CINIT(PROXYPORT, LONG, 59), + + /* size of the POST input data, if strlen() is not good to use */ + CINIT(POSTFIELDSIZE, LONG, 60), + + /* tunnel non-http operations through a HTTP proxy */ + CINIT(HTTPPROXYTUNNEL, LONG, 61), + + /* Set the interface string to use as outgoing network interface */ + CINIT(INTERFACE, OBJECTPOINT, 62), + + /* Set the krb4/5 security level, this also enables krb4/5 awareness. This + * is a string, 'clear', 'safe', 'confidential' or 'private'. If the string + * is set but doesn't match one of these, 'private' will be used. */ + CINIT(KRBLEVEL, OBJECTPOINT, 63), + + /* Set if we should verify the peer in ssl handshake, set 1 to verify. */ + CINIT(SSL_VERIFYPEER, LONG, 64), + + /* The CApath or CAfile used to validate the peer certificate + this option is used only if SSL_VERIFYPEER is true */ + CINIT(CAINFO, OBJECTPOINT, 65), + + /* 66 = OBSOLETE */ + /* 67 = OBSOLETE */ + + /* Maximum number of http redirects to follow */ + CINIT(MAXREDIRS, LONG, 68), + + /* Pass a long set to 1 to get the date of the requested document (if + possible)! Pass a zero to shut it off. */ + CINIT(FILETIME, LONG, 69), + + /* This points to a linked list of telnet options */ + CINIT(TELNETOPTIONS, OBJECTPOINT, 70), + + /* Max amount of cached alive connections */ + CINIT(MAXCONNECTS, LONG, 71), + + CINIT(OBSOLETE72, LONG, 72), /* OBSOLETE, do not use! 
*/ + + /* 73 = OBSOLETE */ + + /* Set to explicitly use a new connection for the upcoming transfer. + Do not use this unless you're absolutely sure of this, as it makes the + operation slower and is less friendly for the network. */ + CINIT(FRESH_CONNECT, LONG, 74), + + /* Set to explicitly forbid the upcoming transfer's connection to be re-used + when done. Do not use this unless you're absolutely sure of this, as it + makes the operation slower and is less friendly for the network. */ + CINIT(FORBID_REUSE, LONG, 75), + + /* Set to a file name that contains random data for libcurl to use to + seed the random engine when doing SSL connects. */ + CINIT(RANDOM_FILE, OBJECTPOINT, 76), + + /* Set to the Entropy Gathering Daemon socket pathname */ + CINIT(EGDSOCKET, OBJECTPOINT, 77), + + /* Time-out connect operations after this amount of seconds, if connects are + OK within this time, then fine... This only aborts the connect phase. */ + CINIT(CONNECTTIMEOUT, LONG, 78), + + /* Function that will be called to store headers (instead of fwrite). The + * parameters will use fwrite() syntax, make sure to follow them. */ + CINIT(HEADERFUNCTION, FUNCTIONPOINT, 79), + + /* Set this to force the HTTP request to get back to GET. Only really usable + if POST, PUT or a custom request have been used first. + */ + CINIT(HTTPGET, LONG, 80), + + /* Set if we should verify the Common name from the peer certificate in ssl + * handshake, set 1 to check existence, 2 to ensure that it matches the + * provided hostname. */ + CINIT(SSL_VERIFYHOST, LONG, 81), + + /* Specify which file name to write all known cookies in after completed + operation. Set file name to "-" (dash) to make it go to stdout. */ + CINIT(COOKIEJAR, OBJECTPOINT, 82), + + /* Specify which SSL ciphers to use */ + CINIT(SSL_CIPHER_LIST, OBJECTPOINT, 83), + + /* Specify which HTTP version to use! This must be set to one of the + CURL_HTTP_VERSION* enums set below. */ + CINIT(HTTP_VERSION, LONG, 84), + + /* Specifically switch on or off the FTP engine's use of the EPSV command. By + default, that one will always be attempted before the more traditional + PASV command. */ + CINIT(FTP_USE_EPSV, LONG, 85), + + /* type of the file keeping your SSL-certificate ("DER", "PEM", "ENG") */ + CINIT(SSLCERTTYPE, OBJECTPOINT, 86), + + /* name of the file keeping your private SSL-key */ + CINIT(SSLKEY, OBJECTPOINT, 87), + + /* type of the file keeping your private SSL-key ("DER", "PEM", "ENG") */ + CINIT(SSLKEYTYPE, OBJECTPOINT, 88), + + /* crypto engine for the SSL-sub system */ + CINIT(SSLENGINE, OBJECTPOINT, 89), + + /* set the crypto engine for the SSL-sub system as default + the param has no meaning... + */ + CINIT(SSLENGINE_DEFAULT, LONG, 90), + + /* Non-zero value means to use the global dns cache */ + CINIT(DNS_USE_GLOBAL_CACHE, LONG, 91), /* DEPRECATED, do not use! 
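Putting a few of the options above together: full peer and host-name verification against a CA bundle, plus a persistent cookie jar and bounded redirect following. The file paths are placeholders, and whether an explicit CA bundle path is needed depends on how the SSL backend was built.

#include <curl/curl.h>

static void harden_and_keep_cookies(CURL *curl)
{
  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 2L);        /* name must match */
  curl_easy_setopt(curl, CURLOPT_CAINFO, "/etc/ssl/certs/ca-bundle.crt");
  curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "cookies.txt"); /* read + enable engine */
  curl_easy_setopt(curl, CURLOPT_COOKIEJAR,  "cookies.txt"); /* written out on cleanup */
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
}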
*/ + + /* DNS cache timeout */ + CINIT(DNS_CACHE_TIMEOUT, LONG, 92), + + /* send linked-list of pre-transfer QUOTE commands */ + CINIT(PREQUOTE, OBJECTPOINT, 93), + + /* set the debug function */ + CINIT(DEBUGFUNCTION, FUNCTIONPOINT, 94), + + /* set the data for the debug function */ + CINIT(DEBUGDATA, OBJECTPOINT, 95), + + /* mark this as start of a cookie session */ + CINIT(COOKIESESSION, LONG, 96), + + /* The CApath directory used to validate the peer certificate + this option is used only if SSL_VERIFYPEER is true */ + CINIT(CAPATH, OBJECTPOINT, 97), + + /* Instruct libcurl to use a smaller receive buffer */ + CINIT(BUFFERSIZE, LONG, 98), + + /* Instruct libcurl to not use any signal/alarm handlers, even when using + timeouts. This option is useful for multi-threaded applications. + See libcurl-the-guide for more background information. */ + CINIT(NOSIGNAL, LONG, 99), + + /* Provide a CURLShare for mutexing non-ts data */ + CINIT(SHARE, OBJECTPOINT, 100), + + /* indicates type of proxy. accepted values are CURLPROXY_HTTP (default), + CURLPROXY_SOCKS4, CURLPROXY_SOCKS4A and CURLPROXY_SOCKS5. */ + CINIT(PROXYTYPE, LONG, 101), + + /* Set the Accept-Encoding string. Use this to tell a server you would like + the response to be compressed. Before 7.21.6, this was known as + CURLOPT_ENCODING */ + CINIT(ACCEPT_ENCODING, OBJECTPOINT, 102), + + /* Set pointer to private data */ + CINIT(PRIVATE, OBJECTPOINT, 103), + + /* Set aliases for HTTP 200 in the HTTP Response header */ + CINIT(HTTP200ALIASES, OBJECTPOINT, 104), + + /* Continue to send authentication (user+password) when following locations, + even when hostname changed. This can potentially send off the name + and password to whatever host the server decides. */ + CINIT(UNRESTRICTED_AUTH, LONG, 105), + + /* Specifically switch on or off the FTP engine's use of the EPRT command ( + it also disables the LPRT attempt). By default, those ones will always be + attempted before the good old traditional PORT command. */ + CINIT(FTP_USE_EPRT, LONG, 106), + + /* Set this to a bitmask value to enable the particular authentications + methods you like. Use this in combination with CURLOPT_USERPWD. + Note that setting multiple bits may cause extra network round-trips. */ + CINIT(HTTPAUTH, LONG, 107), + + /* Set the ssl context callback function, currently only for OpenSSL ssl_ctx + in second argument. The function must be matching the + curl_ssl_ctx_callback proto. */ + CINIT(SSL_CTX_FUNCTION, FUNCTIONPOINT, 108), + + /* Set the userdata for the ssl context callback function's third + argument */ + CINIT(SSL_CTX_DATA, OBJECTPOINT, 109), + + /* FTP Option that causes missing dirs to be created on the remote server. + In 7.19.4 we introduced the convenience enums for this option using the + CURLFTP_CREATE_DIR prefix. + */ + CINIT(FTP_CREATE_MISSING_DIRS, LONG, 110), + + /* Set this to a bitmask value to enable the particular authentications + methods you like. Use this in combination with CURLOPT_PROXYUSERPWD. + Note that setting multiple bits may cause extra network round-trips. */ + CINIT(PROXYAUTH, LONG, 111), + + /* FTP option that changes the timeout, in seconds, associated with + getting a response. This is different from transfer timeout time and + essentially places a demand on the FTP server to acknowledge commands + in a timely manner. 
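A sketch of proxy setup using the options above: an authenticating HTTP proxy, CONNECT tunnelling for non-HTTP protocols, and a no-proxy list. Host names and credentials are placeholders; CURLOPT_NOPROXY is declared a little further down in this header.

#include <curl/curl.h>

static void use_http_proxy(CURL *curl)
{
  curl_easy_setopt(curl, CURLOPT_PROXY, "proxy.example.net:3128");
  curl_easy_setopt(curl, CURLOPT_PROXYTYPE, (long)CURLPROXY_HTTP);
  curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, "proxyuser:proxypass");
  curl_easy_setopt(curl, CURLOPT_PROXYAUTH, (long)CURLAUTH_ANYSAFE);
  curl_easy_setopt(curl, CURLOPT_HTTPPROXYTUNNEL, 1L);       /* CONNECT through */
  curl_easy_setopt(curl, CURLOPT_NOPROXY, "localhost,.internal.example");
}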
*/ + CINIT(FTP_RESPONSE_TIMEOUT, LONG, 112), +#define CURLOPT_SERVER_RESPONSE_TIMEOUT CURLOPT_FTP_RESPONSE_TIMEOUT + + /* Set this option to one of the CURL_IPRESOLVE_* defines (see below) to + tell libcurl to resolve names to those IP versions only. This only has + affect on systems with support for more than one, i.e IPv4 _and_ IPv6. */ + CINIT(IPRESOLVE, LONG, 113), + + /* Set this option to limit the size of a file that will be downloaded from + an HTTP or FTP server. + + Note there is also _LARGE version which adds large file support for + platforms which have larger off_t sizes. See MAXFILESIZE_LARGE below. */ + CINIT(MAXFILESIZE, LONG, 114), + + /* See the comment for INFILESIZE above, but in short, specifies + * the size of the file being uploaded. -1 means unknown. + */ + CINIT(INFILESIZE_LARGE, OFF_T, 115), + + /* Sets the continuation offset. There is also a LONG version of this; + * look above for RESUME_FROM. + */ + CINIT(RESUME_FROM_LARGE, OFF_T, 116), + + /* Sets the maximum size of data that will be downloaded from + * an HTTP or FTP server. See MAXFILESIZE above for the LONG version. + */ + CINIT(MAXFILESIZE_LARGE, OFF_T, 117), + + /* Set this option to the file name of your .netrc file you want libcurl + to parse (using the CURLOPT_NETRC option). If not set, libcurl will do + a poor attempt to find the user's home directory and check for a .netrc + file in there. */ + CINIT(NETRC_FILE, OBJECTPOINT, 118), + + /* Enable SSL/TLS for FTP, pick one of: + CURLUSESSL_TRY - try using SSL, proceed anyway otherwise + CURLUSESSL_CONTROL - SSL for the control connection or fail + CURLUSESSL_ALL - SSL for all communication or fail + */ + CINIT(USE_SSL, LONG, 119), + + /* The _LARGE version of the standard POSTFIELDSIZE option */ + CINIT(POSTFIELDSIZE_LARGE, OFF_T, 120), + + /* Enable/disable the TCP Nagle algorithm */ + CINIT(TCP_NODELAY, LONG, 121), + + /* 122 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ + /* 123 OBSOLETE. Gone in 7.16.0 */ + /* 124 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ + /* 125 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ + /* 126 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ + /* 127 OBSOLETE. Gone in 7.16.0 */ + /* 128 OBSOLETE. Gone in 7.16.0 */ + + /* When FTP over SSL/TLS is selected (with CURLOPT_USE_SSL), this option + can be used to change libcurl's default action which is to first try + "AUTH SSL" and then "AUTH TLS" in this order, and proceed when a OK + response has been received. + + Available parameters are: + CURLFTPAUTH_DEFAULT - let libcurl decide + CURLFTPAUTH_SSL - try "AUTH SSL" first, then TLS + CURLFTPAUTH_TLS - try "AUTH TLS" first, then SSL + */ + CINIT(FTPSSLAUTH, LONG, 129), + + CINIT(IOCTLFUNCTION, FUNCTIONPOINT, 130), + CINIT(IOCTLDATA, OBJECTPOINT, 131), + + /* 132 OBSOLETE. Gone in 7.16.0 */ + /* 133 OBSOLETE. Gone in 7.16.0 */ + + /* zero terminated string for pass on to the FTP server when asked for + "account" info */ + CINIT(FTP_ACCOUNT, OBJECTPOINT, 134), + + /* feed cookies into cookie engine */ + CINIT(COOKIELIST, OBJECTPOINT, 135), + + /* ignore Content-Length */ + CINIT(IGNORE_CONTENT_LENGTH, LONG, 136), + + /* Set to non-zero to skip the IP address received in a 227 PASV FTP server + response. Typically used for FTP-SSL purposes but is not restricted to + that. libcurl will then instead use the same IP address it used for the + control connection. */ + CINIT(FTP_SKIP_PASV_IP, LONG, 137), + + /* Select "file method" to use when doing FTP, see the curl_ftpmethod + above. 
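The _LARGE variants matter on builds like this Windows one, where long is 32-bit but curl_off_t is 64-bit. A sketch of a resumable upload over explicit FTPS using them; the URL, sizes and the source FILE* are placeholders, and libcurl's default fread()-style read callback is relied on for the data.

#include <stdio.h>
#include <curl/curl.h>

static void setup_resumable_upload(CURL *curl, FILE *src,
                                   curl_off_t total_size,
                                   curl_off_t already_sent)
{
  curl_easy_setopt(curl, CURLOPT_URL, "ftp://ftp.example.com/dump.bin");
  curl_easy_setopt(curl, CURLOPT_UPLOAD, 1L);
  curl_easy_setopt(curl, CURLOPT_READDATA, src);
  curl_easy_setopt(curl, CURLOPT_INFILESIZE_LARGE, total_size);
  curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, already_sent);
  curl_easy_setopt(curl, CURLOPT_USE_SSL, (long)CURLUSESSL_CONTROL);
  curl_easy_setopt(curl, CURLOPT_FTPSSLAUTH, (long)CURLFTPAUTH_DEFAULT);
}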
*/ + CINIT(FTP_FILEMETHOD, LONG, 138), + + /* Local port number to bind the socket to */ + CINIT(LOCALPORT, LONG, 139), + + /* Number of ports to try, including the first one set with LOCALPORT. + Thus, setting it to 1 will make no additional attempts but the first. + */ + CINIT(LOCALPORTRANGE, LONG, 140), + + /* no transfer, set up connection and let application use the socket by + extracting it with CURLINFO_LASTSOCKET */ + CINIT(CONNECT_ONLY, LONG, 141), + + /* Function that will be called to convert from the + network encoding (instead of using the iconv calls in libcurl) */ + CINIT(CONV_FROM_NETWORK_FUNCTION, FUNCTIONPOINT, 142), + + /* Function that will be called to convert to the + network encoding (instead of using the iconv calls in libcurl) */ + CINIT(CONV_TO_NETWORK_FUNCTION, FUNCTIONPOINT, 143), + + /* Function that will be called to convert from UTF8 + (instead of using the iconv calls in libcurl) + Note that this is used only for SSL certificate processing */ + CINIT(CONV_FROM_UTF8_FUNCTION, FUNCTIONPOINT, 144), + + /* if the connection proceeds too quickly then need to slow it down */ + /* limit-rate: maximum number of bytes per second to send or receive */ + CINIT(MAX_SEND_SPEED_LARGE, OFF_T, 145), + CINIT(MAX_RECV_SPEED_LARGE, OFF_T, 146), + + /* Pointer to command string to send if USER/PASS fails. */ + CINIT(FTP_ALTERNATIVE_TO_USER, OBJECTPOINT, 147), + + /* callback function for setting socket options */ + CINIT(SOCKOPTFUNCTION, FUNCTIONPOINT, 148), + CINIT(SOCKOPTDATA, OBJECTPOINT, 149), + + /* set to 0 to disable session ID re-use for this transfer, default is + enabled (== 1) */ + CINIT(SSL_SESSIONID_CACHE, LONG, 150), + + /* allowed SSH authentication methods */ + CINIT(SSH_AUTH_TYPES, LONG, 151), + + /* Used by scp/sftp to do public/private key authentication */ + CINIT(SSH_PUBLIC_KEYFILE, OBJECTPOINT, 152), + CINIT(SSH_PRIVATE_KEYFILE, OBJECTPOINT, 153), + + /* Send CCC (Clear Command Channel) after authentication */ + CINIT(FTP_SSL_CCC, LONG, 154), + + /* Same as TIMEOUT and CONNECTTIMEOUT, but with ms resolution */ + CINIT(TIMEOUT_MS, LONG, 155), + CINIT(CONNECTTIMEOUT_MS, LONG, 156), + + /* set to zero to disable the libcurl's decoding and thus pass the raw body + data to the application even when it is encoded/compressed */ + CINIT(HTTP_TRANSFER_DECODING, LONG, 157), + CINIT(HTTP_CONTENT_DECODING, LONG, 158), + + /* Permission used when creating new files and directories on the remote + server for protocols that support it, SFTP/SCP/FILE */ + CINIT(NEW_FILE_PERMS, LONG, 159), + CINIT(NEW_DIRECTORY_PERMS, LONG, 160), + + /* Set the behaviour of POST when redirecting. Values must be set to one + of CURL_REDIR* defines below. This used to be called CURLOPT_POST301 */ + CINIT(POSTREDIR, LONG, 161), + + /* used by scp/sftp to verify the host's public key */ + CINIT(SSH_HOST_PUBLIC_KEY_MD5, OBJECTPOINT, 162), + + /* Callback function for opening socket (instead of socket(2)). Optionally, + callback is able change the address or refuse to connect returning + CURL_SOCKET_BAD. The callback should have type + curl_opensocket_callback */ + CINIT(OPENSOCKETFUNCTION, FUNCTIONPOINT, 163), + CINIT(OPENSOCKETDATA, OBJECTPOINT, 164), + + /* POST volatile input fields. 
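A sketch of SFTP public-key authentication built from the CURLSSH_AUTH_ bits and the key-file options above, with the millisecond timeouts added for good measure. All paths, the passphrase and the URL are placeholders.

#include <curl/curl.h>

static void setup_sftp_auth(CURL *curl)
{
  curl_easy_setopt(curl, CURLOPT_URL, "sftp://host.example.com/in/report.csv");
  curl_easy_setopt(curl, CURLOPT_SSH_AUTH_TYPES, (long)CURLSSH_AUTH_PUBLICKEY);
  curl_easy_setopt(curl, CURLOPT_SSH_PUBLIC_KEYFILE,  "/home/user/.ssh/id_rsa.pub");
  curl_easy_setopt(curl, CURLOPT_SSH_PRIVATE_KEYFILE, "/home/user/.ssh/id_rsa");
  curl_easy_setopt(curl, CURLOPT_KEYPASSWD, "key passphrase");
  curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT_MS, 5000L);
  curl_easy_setopt(curl, CURLOPT_TIMEOUT_MS, 120000L);
}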
*/ + CINIT(COPYPOSTFIELDS, OBJECTPOINT, 165), + + /* set transfer mode (;type=) when doing FTP via an HTTP proxy */ + CINIT(PROXY_TRANSFER_MODE, LONG, 166), + + /* Callback function for seeking in the input stream */ + CINIT(SEEKFUNCTION, FUNCTIONPOINT, 167), + CINIT(SEEKDATA, OBJECTPOINT, 168), + + /* CRL file */ + CINIT(CRLFILE, OBJECTPOINT, 169), + + /* Issuer certificate */ + CINIT(ISSUERCERT, OBJECTPOINT, 170), + + /* (IPv6) Address scope */ + CINIT(ADDRESS_SCOPE, LONG, 171), + + /* Collect certificate chain info and allow it to get retrievable with + CURLINFO_CERTINFO after the transfer is complete. */ + CINIT(CERTINFO, LONG, 172), + + /* "name" and "pwd" to use when fetching. */ + CINIT(USERNAME, OBJECTPOINT, 173), + CINIT(PASSWORD, OBJECTPOINT, 174), + + /* "name" and "pwd" to use with Proxy when fetching. */ + CINIT(PROXYUSERNAME, OBJECTPOINT, 175), + CINIT(PROXYPASSWORD, OBJECTPOINT, 176), + + /* Comma separated list of hostnames defining no-proxy zones. These should + match both hostnames directly, and hostnames within a domain. For + example, local.com will match local.com and www.local.com, but NOT + notlocal.com or www.notlocal.com. For compatibility with other + implementations of this, .local.com will be considered to be the same as + local.com. A single * is the only valid wildcard, and effectively + disables the use of proxy. */ + CINIT(NOPROXY, OBJECTPOINT, 177), + + /* block size for TFTP transfers */ + CINIT(TFTP_BLKSIZE, LONG, 178), + + /* Socks Service */ + CINIT(SOCKS5_GSSAPI_SERVICE, OBJECTPOINT, 179), + + /* Socks Service */ + CINIT(SOCKS5_GSSAPI_NEC, LONG, 180), + + /* set the bitmask for the protocols that are allowed to be used for the + transfer, which thus helps the app which takes URLs from users or other + external inputs and want to restrict what protocol(s) to deal + with. Defaults to CURLPROTO_ALL. */ + CINIT(PROTOCOLS, LONG, 181), + + /* set the bitmask for the protocols that libcurl is allowed to follow to, + as a subset of the CURLOPT_PROTOCOLS ones. That means the protocol needs + to be set in both bitmasks to be allowed to get redirected to. Defaults + to all protocols except FILE and SCP. */ + CINIT(REDIR_PROTOCOLS, LONG, 182), + + /* set the SSH knownhost file name to use */ + CINIT(SSH_KNOWNHOSTS, OBJECTPOINT, 183), + + /* set the SSH host key callback, must point to a curl_sshkeycallback + function */ + CINIT(SSH_KEYFUNCTION, FUNCTIONPOINT, 184), + + /* set the SSH host key callback custom pointer */ + CINIT(SSH_KEYDATA, OBJECTPOINT, 185), + + /* set the SMTP mail originator */ + CINIT(MAIL_FROM, OBJECTPOINT, 186), + + /* set the SMTP mail receiver(s) */ + CINIT(MAIL_RCPT, OBJECTPOINT, 187), + + /* FTP: send PRET before PASV */ + CINIT(FTP_USE_PRET, LONG, 188), + + /* RTSP request method (OPTIONS, SETUP, PLAY, etc...) */ + CINIT(RTSP_REQUEST, LONG, 189), + + /* The RTSP session identifier */ + CINIT(RTSP_SESSION_ID, OBJECTPOINT, 190), + + /* The RTSP stream URI */ + CINIT(RTSP_STREAM_URI, OBJECTPOINT, 191), + + /* The Transport: header to use in RTSP requests */ + CINIT(RTSP_TRANSPORT, OBJECTPOINT, 192), + + /* Manually initialize the client RTSP CSeq for this handle */ + CINIT(RTSP_CLIENT_CSEQ, LONG, 193), + + /* Manually initialize the server RTSP CSeq for this handle */ + CINIT(RTSP_SERVER_CSEQ, LONG, 194), + + /* The stream to pass to INTERLEAVEFUNCTION. 
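CURLOPT_MAIL_FROM above expects a plain string while CURLOPT_MAIL_RCPT expects a curl_slist; the sketch below shows that pairing and also locks the handle down to SMTP(S) with the CURLPROTO_ bits. Addresses and the host are placeholders, and curl_slist_append() is declared near the end of this header.

#include <curl/curl.h>

static struct curl_slist *setup_mail_envelope(CURL *curl)
{
  struct curl_slist *rcpt = NULL;
  rcpt = curl_slist_append(rcpt, "<alice@example.com>");
  rcpt = curl_slist_append(rcpt, "<bob@example.com>");

  curl_easy_setopt(curl, CURLOPT_URL, "smtp://mail.example.com");
  curl_easy_setopt(curl, CURLOPT_MAIL_FROM, "<sender@example.com>");
  curl_easy_setopt(curl, CURLOPT_MAIL_RCPT, rcpt);
  curl_easy_setopt(curl, CURLOPT_PROTOCOLS,
                   (long)(CURLPROTO_SMTP | CURLPROTO_SMTPS));
  return rcpt; /* caller frees with curl_slist_free_all() after the transfer */
}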
*/ + CINIT(INTERLEAVEDATA, OBJECTPOINT, 195), + + /* Let the application define a custom write method for RTP data */ + CINIT(INTERLEAVEFUNCTION, FUNCTIONPOINT, 196), + + /* Turn on wildcard matching */ + CINIT(WILDCARDMATCH, LONG, 197), + + /* Directory matching callback called before downloading of an + individual file (chunk) started */ + CINIT(CHUNK_BGN_FUNCTION, FUNCTIONPOINT, 198), + + /* Directory matching callback called after the file (chunk) + was downloaded, or skipped */ + CINIT(CHUNK_END_FUNCTION, FUNCTIONPOINT, 199), + + /* Change match (fnmatch-like) callback for wildcard matching */ + CINIT(FNMATCH_FUNCTION, FUNCTIONPOINT, 200), + + /* Let the application define custom chunk data pointer */ + CINIT(CHUNK_DATA, OBJECTPOINT, 201), + + /* FNMATCH_FUNCTION user pointer */ + CINIT(FNMATCH_DATA, OBJECTPOINT, 202), + + /* send linked-list of name:port:address sets */ + CINIT(RESOLVE, OBJECTPOINT, 203), + + /* Set a username for authenticated TLS */ + CINIT(TLSAUTH_USERNAME, OBJECTPOINT, 204), + + /* Set a password for authenticated TLS */ + CINIT(TLSAUTH_PASSWORD, OBJECTPOINT, 205), + + /* Set authentication type for authenticated TLS */ + CINIT(TLSAUTH_TYPE, OBJECTPOINT, 206), + + /* Set to 1 to enable the "TE:" header in HTTP requests to ask for + compressed transfer-encoded responses. Set to 0 to disable the use of TE: + in outgoing requests. The current default is 0, but it might change in a + future libcurl release. + + libcurl will ask for the compressed methods it knows of, and if that + isn't any, it will not ask for transfer-encoding at all even if this + option is set to 1. + + */ + CINIT(TRANSFER_ENCODING, LONG, 207), + + /* Callback function for closing socket (instead of close(2)). The callback + should have type curl_closesocket_callback */ + CINIT(CLOSESOCKETFUNCTION, FUNCTIONPOINT, 208), + CINIT(CLOSESOCKETDATA, OBJECTPOINT, 209), + + /* allow GSSAPI credential delegation */ + CINIT(GSSAPI_DELEGATION, LONG, 210), + + /* Set the name servers to use for DNS resolution */ + CINIT(DNS_SERVERS, OBJECTPOINT, 211), + + /* Time-out accept operations (currently for FTP only) after this amount + of miliseconds. */ + CINIT(ACCEPTTIMEOUT_MS, LONG, 212), + + /* Set TCP keepalive */ + CINIT(TCP_KEEPALIVE, LONG, 213), + + /* non-universal keepalive knobs (Linux, AIX, HP-UX, more) */ + CINIT(TCP_KEEPIDLE, LONG, 214), + CINIT(TCP_KEEPINTVL, LONG, 215), + + /* Enable/disable specific SSL features with a bitmask, see CURLSSLOPT_* */ + CINIT(SSL_OPTIONS, LONG, 216), + + /* Set the SMTP auth originator */ + CINIT(MAIL_AUTH, OBJECTPOINT, 217), + + /* Enable/disable SASL initial response */ + CINIT(SASL_IR, LONG, 218), + + /* Function that will be called instead of the internal progress display + * function. This function should be defined as the curl_xferinfo_callback + * prototype defines. (Deprecates CURLOPT_PROGRESSFUNCTION) */ + CINIT(XFERINFOFUNCTION, FUNCTIONPOINT, 219), + + /* The XOAUTH2 bearer token */ + CINIT(XOAUTH2_BEARER, OBJECTPOINT, 220), + + /* Set the interface string to use as outgoing network + * interface for DNS requests. + * Only supported by the c-ares DNS backend */ + CINIT(DNS_INTERFACE, OBJECTPOINT, 221), + + /* Set the local IPv4 address to use for outgoing DNS requests. + * Only supported by the c-ares DNS backend */ + CINIT(DNS_LOCAL_IP4, OBJECTPOINT, 222), + + /* Set the local IPv4 address to use for outgoing DNS requests. 
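A sketch of the newer progress interface above: an xferinfo-style callback (the curl_off_t based replacement for the deprecated progress callback) plus the options that switch it on. The callback signature is assumed to follow curl's documented curl_xferinfo_callback prototype; returning non-zero aborts the transfer, and the printing is only for illustration.

#include <stdio.h>
#include <curl/curl.h>

static int on_progress(void *clientp, curl_off_t dltotal, curl_off_t dlnow,
                       curl_off_t ultotal, curl_off_t ulnow)
{
  (void)clientp; (void)ultotal; (void)ulnow;
  if(dltotal > 0) {
    printf("downloaded %" CURL_FORMAT_CURL_OFF_T
           " of %" CURL_FORMAT_CURL_OFF_T " bytes\r", dlnow, dltotal);
    fflush(stdout);
  }
  return 0; /* keep going */
}

static void enable_progress(CURL *curl)
{
  curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, on_progress);
  curl_easy_setopt(curl, CURLOPT_XFERINFODATA, NULL);
  curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); /* progress is off by default */
}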
+ * Only supported by the c-ares DNS backend */ + CINIT(DNS_LOCAL_IP6, OBJECTPOINT, 223), + + /* Set authentication options directly */ + CINIT(LOGIN_OPTIONS, OBJECTPOINT, 224), + + /* Enable/disable TLS NPN extension (http2 over ssl might fail without) */ + CINIT(SSL_ENABLE_NPN, LONG, 225), + + /* Enable/disable TLS ALPN extension (http2 over ssl might fail without) */ + CINIT(SSL_ENABLE_ALPN, LONG, 226), + + /* Time to wait for a response to a HTTP request containing an + * Expect: 100-continue header before sending the data anyway. */ + CINIT(EXPECT_100_TIMEOUT_MS, LONG, 227), + + /* This points to a linked list of headers used for proxy requests only, + struct curl_slist kind */ + CINIT(PROXYHEADER, OBJECTPOINT, 228), + + /* Pass in a bitmask of "header options" */ + CINIT(HEADEROPT, LONG, 229), + + CURLOPT_LASTENTRY /* the last unused */ +} CURLoption; + +#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all + the obsolete stuff removed! */ + +/* Backwards compatibility with older names */ +/* These are scheduled to disappear by 2011 */ + +/* This was added in version 7.19.1 */ +#define CURLOPT_POST301 CURLOPT_POSTREDIR + +/* These are scheduled to disappear by 2009 */ + +/* The following were added in 7.17.0 */ +#define CURLOPT_SSLKEYPASSWD CURLOPT_KEYPASSWD +#define CURLOPT_FTPAPPEND CURLOPT_APPEND +#define CURLOPT_FTPLISTONLY CURLOPT_DIRLISTONLY +#define CURLOPT_FTP_SSL CURLOPT_USE_SSL + +/* The following were added earlier */ + +#define CURLOPT_SSLCERTPASSWD CURLOPT_KEYPASSWD +#define CURLOPT_KRB4LEVEL CURLOPT_KRBLEVEL + +#else +/* This is set if CURL_NO_OLDIES is defined at compile-time */ +#undef CURLOPT_DNS_USE_GLOBAL_CACHE /* soon obsolete */ +#endif + + + /* Below here follows defines for the CURLOPT_IPRESOLVE option. If a host + name resolves addresses using more than one IP protocol version, this + option might be handy to force libcurl to use a specific IP version. */ +#define CURL_IPRESOLVE_WHATEVER 0 /* default, resolves addresses to all IP + versions that your system allows */ +#define CURL_IPRESOLVE_V4 1 /* resolve to ipv4 addresses */ +#define CURL_IPRESOLVE_V6 2 /* resolve to ipv6 addresses */ + + /* three convenient "aliases" that follow the name scheme better */ +#define CURLOPT_RTSPHEADER CURLOPT_HTTPHEADER + + /* These enums are for use with the CURLOPT_HTTP_VERSION option. */ +enum { + CURL_HTTP_VERSION_NONE, /* setting this means we don't care, and that we'd + like the library to choose the best possible + for us! */ + CURL_HTTP_VERSION_1_0, /* please use HTTP 1.0 in the request */ + CURL_HTTP_VERSION_1_1, /* please use HTTP 1.1 in the request */ + CURL_HTTP_VERSION_2_0, /* please use HTTP 2.0 in the request */ + + CURL_HTTP_VERSION_LAST /* *ILLEGAL* http version */ +}; + +/* + * Public API enums for RTSP requests + */ +enum { + CURL_RTSPREQ_NONE, /* first in list */ + CURL_RTSPREQ_OPTIONS, + CURL_RTSPREQ_DESCRIBE, + CURL_RTSPREQ_ANNOUNCE, + CURL_RTSPREQ_SETUP, + CURL_RTSPREQ_PLAY, + CURL_RTSPREQ_PAUSE, + CURL_RTSPREQ_TEARDOWN, + CURL_RTSPREQ_GET_PARAMETER, + CURL_RTSPREQ_SET_PARAMETER, + CURL_RTSPREQ_RECORD, + CURL_RTSPREQ_RECEIVE, + CURL_RTSPREQ_LAST /* last in list */ +}; + + /* These enums are for use with the CURLOPT_NETRC option. */ +enum CURL_NETRC_OPTION { + CURL_NETRC_IGNORED, /* The .netrc will never be read. + * This is the default. */ + CURL_NETRC_OPTIONAL, /* A user:password in the URL will be preferred + * to one in the .netrc. */ + CURL_NETRC_REQUIRED, /* A user:password in the URL will be ignored. 
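One defensive way to use the CURL_HTTP_VERSION_* enum above: only ask for HTTP/2 when the runtime library actually reports the feature bit (curl_version_info() and CURL_VERSION_HTTP2 both appear near the end of this header). A sketch:

#include <curl/curl.h>

static void prefer_http2(CURL *curl)
{
  const curl_version_info_data *info = curl_version_info(CURLVERSION_NOW);
  long ver = (info->features & CURL_VERSION_HTTP2)
             ? (long)CURL_HTTP_VERSION_2_0
             : (long)CURL_HTTP_VERSION_NONE; /* let libcurl decide */
  curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, ver);
}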
+ * Unless one is set programmatically, the .netrc + * will be queried. */ + CURL_NETRC_LAST +}; + +enum { + CURL_SSLVERSION_DEFAULT, + CURL_SSLVERSION_TLSv1, /* TLS 1.x */ + CURL_SSLVERSION_SSLv2, + CURL_SSLVERSION_SSLv3, + CURL_SSLVERSION_TLSv1_0, + CURL_SSLVERSION_TLSv1_1, + CURL_SSLVERSION_TLSv1_2, + + CURL_SSLVERSION_LAST /* never use, keep last */ +}; + +enum CURL_TLSAUTH { + CURL_TLSAUTH_NONE, + CURL_TLSAUTH_SRP, + CURL_TLSAUTH_LAST /* never use, keep last */ +}; + +/* symbols to use with CURLOPT_POSTREDIR. + CURL_REDIR_POST_301, CURL_REDIR_POST_302 and CURL_REDIR_POST_303 + can be bitwise ORed so that CURL_REDIR_POST_301 | CURL_REDIR_POST_302 + | CURL_REDIR_POST_303 == CURL_REDIR_POST_ALL */ + +#define CURL_REDIR_GET_ALL 0 +#define CURL_REDIR_POST_301 1 +#define CURL_REDIR_POST_302 2 +#define CURL_REDIR_POST_303 4 +#define CURL_REDIR_POST_ALL \ + (CURL_REDIR_POST_301|CURL_REDIR_POST_302|CURL_REDIR_POST_303) + +typedef enum { + CURL_TIMECOND_NONE, + + CURL_TIMECOND_IFMODSINCE, + CURL_TIMECOND_IFUNMODSINCE, + CURL_TIMECOND_LASTMOD, + + CURL_TIMECOND_LAST +} curl_TimeCond; + + +/* curl_strequal() and curl_strnequal() are subject for removal in a future + libcurl, see lib/README.curlx for details */ +CURL_EXTERN int (curl_strequal)(const char *s1, const char *s2); +CURL_EXTERN int (curl_strnequal)(const char *s1, const char *s2, size_t n); + +/* name is uppercase CURLFORM_ */ +#ifdef CFINIT +#undef CFINIT +#endif + +#ifdef CURL_ISOCPP +#define CFINIT(name) CURLFORM_ ## name +#else +/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. */ +#define CFINIT(name) CURLFORM_/**/name +#endif + +typedef enum { + CFINIT(NOTHING), /********* the first one is unused ************/ + + /* */ + CFINIT(COPYNAME), + CFINIT(PTRNAME), + CFINIT(NAMELENGTH), + CFINIT(COPYCONTENTS), + CFINIT(PTRCONTENTS), + CFINIT(CONTENTSLENGTH), + CFINIT(FILECONTENT), + CFINIT(ARRAY), + CFINIT(OBSOLETE), + CFINIT(FILE), + + CFINIT(BUFFER), + CFINIT(BUFFERPTR), + CFINIT(BUFFERLENGTH), + + CFINIT(CONTENTTYPE), + CFINIT(CONTENTHEADER), + CFINIT(FILENAME), + CFINIT(END), + CFINIT(OBSOLETE2), + + CFINIT(STREAM), + + CURLFORM_LASTENTRY /* the last unused */ +} CURLformoption; + +#undef CFINIT /* done */ + +/* structure to be used as parameter for CURLFORM_ARRAY */ +struct curl_forms { + CURLformoption option; + const char *value; +}; + +/* use this for multipart formpost building */ +/* Returns code for curl_formadd() + * + * Returns: + * CURL_FORMADD_OK on success + * CURL_FORMADD_MEMORY if the FormInfo allocation fails + * CURL_FORMADD_OPTION_TWICE if one option is given twice for one Form + * CURL_FORMADD_NULL if a null pointer was given for a char + * CURL_FORMADD_MEMORY if the allocation of a FormInfo struct failed + * CURL_FORMADD_UNKNOWN_OPTION if an unknown option was used + * CURL_FORMADD_INCOMPLETE if the some FormInfo is not complete (or error) + * CURL_FORMADD_MEMORY if a curl_httppost struct cannot be allocated + * CURL_FORMADD_MEMORY if some allocation for string copying failed. 
+ * CURL_FORMADD_ILLEGAL_ARRAY if an illegal option is used in an array + * + ***************************************************************************/ +typedef enum { + CURL_FORMADD_OK, /* first, no error */ + + CURL_FORMADD_MEMORY, + CURL_FORMADD_OPTION_TWICE, + CURL_FORMADD_NULL, + CURL_FORMADD_UNKNOWN_OPTION, + CURL_FORMADD_INCOMPLETE, + CURL_FORMADD_ILLEGAL_ARRAY, + CURL_FORMADD_DISABLED, /* libcurl was built with this disabled */ + + CURL_FORMADD_LAST /* last */ +} CURLFORMcode; + +/* + * NAME curl_formadd() + * + * DESCRIPTION + * + * Pretty advanced function for building multi-part formposts. Each invoke + * adds one part that together construct a full post. Then use + * CURLOPT_HTTPPOST to send it off to libcurl. + */ +CURL_EXTERN CURLFORMcode curl_formadd(struct curl_httppost **httppost, + struct curl_httppost **last_post, + ...); + +/* + * callback function for curl_formget() + * The void *arg pointer will be the one passed as second argument to + * curl_formget(). + * The character buffer passed to it must not be freed. + * Should return the buffer length passed to it as the argument "len" on + * success. + */ +typedef size_t (*curl_formget_callback)(void *arg, const char *buf, + size_t len); + +/* + * NAME curl_formget() + * + * DESCRIPTION + * + * Serialize a curl_httppost struct built with curl_formadd(). + * Accepts a void pointer as second argument which will be passed to + * the curl_formget_callback function. + * Returns 0 on success. + */ +CURL_EXTERN int curl_formget(struct curl_httppost *form, void *arg, + curl_formget_callback append); +/* + * NAME curl_formfree() + * + * DESCRIPTION + * + * Free a multipart formpost previously built with curl_formadd(). + */ +CURL_EXTERN void curl_formfree(struct curl_httppost *form); + +/* + * NAME curl_getenv() + * + * DESCRIPTION + * + * Returns a malloc()'ed string that MUST be curl_free()ed after usage is + * complete. DEPRECATED - see lib/README.curlx + */ +CURL_EXTERN char *curl_getenv(const char *variable); + +/* + * NAME curl_version() + * + * DESCRIPTION + * + * Returns a static ascii string of the libcurl version. + */ +CURL_EXTERN char *curl_version(void); + +/* + * NAME curl_easy_escape() + * + * DESCRIPTION + * + * Escapes URL strings (converts all letters consider illegal in URLs to their + * %XX versions). This function returns a new allocated string or NULL if an + * error occurred. + */ +CURL_EXTERN char *curl_easy_escape(CURL *handle, + const char *string, + int length); + +/* the previous version: */ +CURL_EXTERN char *curl_escape(const char *string, + int length); + + +/* + * NAME curl_easy_unescape() + * + * DESCRIPTION + * + * Unescapes URL encoding in strings (converts all %XX codes to their 8bit + * versions). This function returns a new allocated string or NULL if an error + * occurred. + * Conversion Note: On non-ASCII platforms the ASCII %XX codes are + * converted into the host encoding. + */ +CURL_EXTERN char *curl_easy_unescape(CURL *handle, + const char *string, + int length, + int *outlength); + +/* the previous version */ +CURL_EXTERN char *curl_unescape(const char *string, + int length); + +/* + * NAME curl_free() + * + * DESCRIPTION + * + * Provided for de-allocation in the same translation unit that did the + * allocation. Added in libcurl 7.10 + */ +CURL_EXTERN void curl_free(void *p); + +/* + * NAME curl_global_init() + * + * DESCRIPTION + * + * curl_global_init() should be invoked exactly once for each application that + * uses libcurl and before any call of other libcurl functions. 
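A sketch of the curl_formadd() usage the comment above describes: two parts (one copied string, one file) chained through the same httppost/last pair, then handed to CURLOPT_HTTPPOST. Field names and the file path are placeholders.

#include <curl/curl.h>

static CURLFORMcode build_form(CURL *curl, struct curl_httppost **post)
{
  struct curl_httppost *last = NULL;
  CURLFORMcode rc;

  *post = NULL;
  rc = curl_formadd(post, &last,
                    CURLFORM_COPYNAME, "name",
                    CURLFORM_COPYCONTENTS, "ccminer",
                    CURLFORM_END);
  if(rc != CURL_FORMADD_OK)
    return rc;
  rc = curl_formadd(post, &last,
                    CURLFORM_COPYNAME, "logfile",
                    CURLFORM_FILE, "miner.log",
                    CURLFORM_END);
  if(rc == CURL_FORMADD_OK)
    curl_easy_setopt(curl, CURLOPT_HTTPPOST, *post);
  return rc; /* free with curl_formfree(*post) once the transfer is done */
}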
+ * + * This function is not thread-safe! + */ +CURL_EXTERN CURLcode curl_global_init(long flags); + +/* + * NAME curl_global_init_mem() + * + * DESCRIPTION + * + * curl_global_init() or curl_global_init_mem() should be invoked exactly once + * for each application that uses libcurl. This function can be used to + * initialize libcurl and set user defined memory management callback + * functions. Users can implement memory management routines to check for + * memory leaks, check for mis-use of the curl library etc. User registered + * callback routines with be invoked by this library instead of the system + * memory management routines like malloc, free etc. + */ +CURL_EXTERN CURLcode curl_global_init_mem(long flags, + curl_malloc_callback m, + curl_free_callback f, + curl_realloc_callback r, + curl_strdup_callback s, + curl_calloc_callback c); + +/* + * NAME curl_global_cleanup() + * + * DESCRIPTION + * + * curl_global_cleanup() should be invoked exactly once for each application + * that uses libcurl + */ +CURL_EXTERN void curl_global_cleanup(void); + +/* linked-list structure for the CURLOPT_QUOTE option (and other) */ +struct curl_slist { + char *data; + struct curl_slist *next; +}; + +/* + * NAME curl_slist_append() + * + * DESCRIPTION + * + * Appends a string to a linked list. If no list exists, it will be created + * first. Returns the new list, after appending. + */ +CURL_EXTERN struct curl_slist *curl_slist_append(struct curl_slist *, + const char *); + +/* + * NAME curl_slist_free_all() + * + * DESCRIPTION + * + * free a previously built curl_slist. + */ +CURL_EXTERN void curl_slist_free_all(struct curl_slist *); + +/* + * NAME curl_getdate() + * + * DESCRIPTION + * + * Returns the time, in seconds since 1 Jan 1970 of the time string given in + * the first argument. The time argument in the second parameter is unused + * and should be set to NULL. + */ +CURL_EXTERN time_t curl_getdate(const char *p, const time_t *unused); + +/* info about the certificate chain, only for OpenSSL builds. Asked + for with CURLOPT_CERTINFO / CURLINFO_CERTINFO */ +struct curl_certinfo { + int num_of_certs; /* number of certificates with information */ + struct curl_slist **certinfo; /* for each index in this array, there's a + linked list with textual information in the + format "name: value" */ +}; + +/* enum for the different supported SSL backends */ +typedef enum { + CURLSSLBACKEND_NONE = 0, + CURLSSLBACKEND_OPENSSL = 1, + CURLSSLBACKEND_GNUTLS = 2, + CURLSSLBACKEND_NSS = 3, + CURLSSLBACKEND_QSOSSL = 4, + CURLSSLBACKEND_GSKIT = 5, + CURLSSLBACKEND_POLARSSL = 6, + CURLSSLBACKEND_CYASSL = 7, + CURLSSLBACKEND_SCHANNEL = 8, + CURLSSLBACKEND_DARWINSSL = 9, + CURLSSLBACKEND_AXTLS = 10 +} curl_sslbackend; + +/* Information about the SSL library used and the respective internal SSL + handle, which can be used to obtain further information regarding the + connection. Asked for with CURLINFO_TLS_SESSION. 
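The curl_slist functions above are what CURLOPT_HTTPHEADER (and the QUOTE, RESOLVE and MAIL_RCPT options) expect. A sketch of the usual build, use, free-after-transfer pattern, assuming curl_global_init() already ran once at program start as required above; header values are placeholders.

#include <curl/curl.h>

static struct curl_slist *set_custom_headers(CURL *curl)
{
  struct curl_slist *hdrs = NULL;
  hdrs = curl_slist_append(hdrs, "Accept: application/json");
  hdrs = curl_slist_append(hdrs, "X-Example: demo");
  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, hdrs);
  return hdrs; /* caller: curl_slist_free_all(hdrs) after curl_easy_perform() */
}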
*/ +struct curl_tlssessioninfo { + curl_sslbackend backend; + void *internals; +}; + +#define CURLINFO_STRING 0x100000 +#define CURLINFO_LONG 0x200000 +#define CURLINFO_DOUBLE 0x300000 +#define CURLINFO_SLIST 0x400000 +#define CURLINFO_MASK 0x0fffff +#define CURLINFO_TYPEMASK 0xf00000 + +typedef enum { + CURLINFO_NONE, /* first, never use this */ + CURLINFO_EFFECTIVE_URL = CURLINFO_STRING + 1, + CURLINFO_RESPONSE_CODE = CURLINFO_LONG + 2, + CURLINFO_TOTAL_TIME = CURLINFO_DOUBLE + 3, + CURLINFO_NAMELOOKUP_TIME = CURLINFO_DOUBLE + 4, + CURLINFO_CONNECT_TIME = CURLINFO_DOUBLE + 5, + CURLINFO_PRETRANSFER_TIME = CURLINFO_DOUBLE + 6, + CURLINFO_SIZE_UPLOAD = CURLINFO_DOUBLE + 7, + CURLINFO_SIZE_DOWNLOAD = CURLINFO_DOUBLE + 8, + CURLINFO_SPEED_DOWNLOAD = CURLINFO_DOUBLE + 9, + CURLINFO_SPEED_UPLOAD = CURLINFO_DOUBLE + 10, + CURLINFO_HEADER_SIZE = CURLINFO_LONG + 11, + CURLINFO_REQUEST_SIZE = CURLINFO_LONG + 12, + CURLINFO_SSL_VERIFYRESULT = CURLINFO_LONG + 13, + CURLINFO_FILETIME = CURLINFO_LONG + 14, + CURLINFO_CONTENT_LENGTH_DOWNLOAD = CURLINFO_DOUBLE + 15, + CURLINFO_CONTENT_LENGTH_UPLOAD = CURLINFO_DOUBLE + 16, + CURLINFO_STARTTRANSFER_TIME = CURLINFO_DOUBLE + 17, + CURLINFO_CONTENT_TYPE = CURLINFO_STRING + 18, + CURLINFO_REDIRECT_TIME = CURLINFO_DOUBLE + 19, + CURLINFO_REDIRECT_COUNT = CURLINFO_LONG + 20, + CURLINFO_PRIVATE = CURLINFO_STRING + 21, + CURLINFO_HTTP_CONNECTCODE = CURLINFO_LONG + 22, + CURLINFO_HTTPAUTH_AVAIL = CURLINFO_LONG + 23, + CURLINFO_PROXYAUTH_AVAIL = CURLINFO_LONG + 24, + CURLINFO_OS_ERRNO = CURLINFO_LONG + 25, + CURLINFO_NUM_CONNECTS = CURLINFO_LONG + 26, + CURLINFO_SSL_ENGINES = CURLINFO_SLIST + 27, + CURLINFO_COOKIELIST = CURLINFO_SLIST + 28, + CURLINFO_LASTSOCKET = CURLINFO_LONG + 29, + CURLINFO_FTP_ENTRY_PATH = CURLINFO_STRING + 30, + CURLINFO_REDIRECT_URL = CURLINFO_STRING + 31, + CURLINFO_PRIMARY_IP = CURLINFO_STRING + 32, + CURLINFO_APPCONNECT_TIME = CURLINFO_DOUBLE + 33, + CURLINFO_CERTINFO = CURLINFO_SLIST + 34, + CURLINFO_CONDITION_UNMET = CURLINFO_LONG + 35, + CURLINFO_RTSP_SESSION_ID = CURLINFO_STRING + 36, + CURLINFO_RTSP_CLIENT_CSEQ = CURLINFO_LONG + 37, + CURLINFO_RTSP_SERVER_CSEQ = CURLINFO_LONG + 38, + CURLINFO_RTSP_CSEQ_RECV = CURLINFO_LONG + 39, + CURLINFO_PRIMARY_PORT = CURLINFO_LONG + 40, + CURLINFO_LOCAL_IP = CURLINFO_STRING + 41, + CURLINFO_LOCAL_PORT = CURLINFO_LONG + 42, + CURLINFO_TLS_SESSION = CURLINFO_SLIST + 43, + /* Fill in new entries below here! */ + + CURLINFO_LASTONE = 43 +} CURLINFO; + +/* CURLINFO_RESPONSE_CODE is the new name for the option previously known as + CURLINFO_HTTP_CODE */ +#define CURLINFO_HTTP_CODE CURLINFO_RESPONSE_CODE + +typedef enum { + CURLCLOSEPOLICY_NONE, /* first, never use this */ + + CURLCLOSEPOLICY_OLDEST, + CURLCLOSEPOLICY_LEAST_RECENTLY_USED, + CURLCLOSEPOLICY_LEAST_TRAFFIC, + CURLCLOSEPOLICY_SLOWEST, + CURLCLOSEPOLICY_CALLBACK, + + CURLCLOSEPOLICY_LAST /* last, never use this */ +} curl_closepolicy; + +#define CURL_GLOBAL_SSL (1<<0) +#define CURL_GLOBAL_WIN32 (1<<1) +#define CURL_GLOBAL_ALL (CURL_GLOBAL_SSL|CURL_GLOBAL_WIN32) +#define CURL_GLOBAL_NOTHING 0 +#define CURL_GLOBAL_DEFAULT CURL_GLOBAL_ALL +#define CURL_GLOBAL_ACK_EINTR (1<<2) + + +/***************************************************************************** + * Setup defines, protos etc for the sharing stuff. + */ + +/* Different data locks for a single share */ +typedef enum { + CURL_LOCK_DATA_NONE = 0, + /* CURL_LOCK_DATA_SHARE is used internally to say that + * the locking is just made to change the internal state of the share + * itself. 
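A sketch of reading a few of these CURLINFO_ values after a completed transfer; the CURLINFO_TYPEMASK class of each constant dictates the C type of the output argument (long, double, slist, and so on).

#include <stdio.h>
#include <curl/curl.h>

static void report_transfer(CURL *curl)
{
  long code = 0;
  double total = 0.0, speed = 0.0;
  curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);
  curl_easy_getinfo(curl, CURLINFO_TOTAL_TIME, &total);
  curl_easy_getinfo(curl, CURLINFO_SPEED_DOWNLOAD, &speed);
  printf("HTTP %ld in %.2fs (%.0f bytes/s)\n", code, total, speed);
}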
+ */ + CURL_LOCK_DATA_SHARE, + CURL_LOCK_DATA_COOKIE, + CURL_LOCK_DATA_DNS, + CURL_LOCK_DATA_SSL_SESSION, + CURL_LOCK_DATA_CONNECT, + CURL_LOCK_DATA_LAST +} curl_lock_data; + +/* Different lock access types */ +typedef enum { + CURL_LOCK_ACCESS_NONE = 0, /* unspecified action */ + CURL_LOCK_ACCESS_SHARED = 1, /* for read perhaps */ + CURL_LOCK_ACCESS_SINGLE = 2, /* for write perhaps */ + CURL_LOCK_ACCESS_LAST /* never use */ +} curl_lock_access; + +typedef void (*curl_lock_function)(CURL *handle, + curl_lock_data data, + curl_lock_access locktype, + void *userptr); +typedef void (*curl_unlock_function)(CURL *handle, + curl_lock_data data, + void *userptr); + +typedef void CURLSH; + +typedef enum { + CURLSHE_OK, /* all is fine */ + CURLSHE_BAD_OPTION, /* 1 */ + CURLSHE_IN_USE, /* 2 */ + CURLSHE_INVALID, /* 3 */ + CURLSHE_NOMEM, /* 4 out of memory */ + CURLSHE_NOT_BUILT_IN, /* 5 feature not present in lib */ + CURLSHE_LAST /* never use */ +} CURLSHcode; + +typedef enum { + CURLSHOPT_NONE, /* don't use */ + CURLSHOPT_SHARE, /* specify a data type to share */ + CURLSHOPT_UNSHARE, /* specify which data type to stop sharing */ + CURLSHOPT_LOCKFUNC, /* pass in a 'curl_lock_function' pointer */ + CURLSHOPT_UNLOCKFUNC, /* pass in a 'curl_unlock_function' pointer */ + CURLSHOPT_USERDATA, /* pass in a user data pointer used in the lock/unlock + callback functions */ + CURLSHOPT_LAST /* never use */ +} CURLSHoption; + +CURL_EXTERN CURLSH *curl_share_init(void); +CURL_EXTERN CURLSHcode curl_share_setopt(CURLSH *, CURLSHoption option, ...); +CURL_EXTERN CURLSHcode curl_share_cleanup(CURLSH *); + +/**************************************************************************** + * Structures for querying information about the curl library at runtime. + */ + +typedef enum { + CURLVERSION_FIRST, + CURLVERSION_SECOND, + CURLVERSION_THIRD, + CURLVERSION_FOURTH, + CURLVERSION_LAST /* never actually use this */ +} CURLversion; + +/* The 'CURLVERSION_NOW' is the symbolic name meant to be used by + basically all programs ever that want to get version information. It is + meant to be a built-in version number for what kind of struct the caller + expects. If the struct ever changes, we redefine the NOW to another enum + from above. 
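A sketch of the share interface above: one CURLSH object that lets several easy handles reuse DNS and cookie data. The lock/unlock callbacks here are no-ops; a multi-threaded application would wrap a real mutex in them. Attach the result with curl_easy_setopt(handle, CURLOPT_SHARE, share) and release it with curl_share_cleanup() once all handles are gone.

#include <curl/curl.h>

static void lock_cb(CURL *h, curl_lock_data d, curl_lock_access a, void *p)
{ (void)h; (void)d; (void)a; (void)p; /* e.g. pthread_mutex_lock(...) */ }

static void unlock_cb(CURL *h, curl_lock_data d, void *p)
{ (void)h; (void)d; (void)p; /* e.g. pthread_mutex_unlock(...) */ }

static CURLSH *make_share(void)
{
  CURLSH *share = curl_share_init();
  if(!share)
    return NULL;
  curl_share_setopt(share, CURLSHOPT_SHARE, CURL_LOCK_DATA_DNS);
  curl_share_setopt(share, CURLSHOPT_SHARE, CURL_LOCK_DATA_COOKIE);
  curl_share_setopt(share, CURLSHOPT_LOCKFUNC, lock_cb);
  curl_share_setopt(share, CURLSHOPT_UNLOCKFUNC, unlock_cb);
  return share;
}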
*/ +#define CURLVERSION_NOW CURLVERSION_FOURTH + +typedef struct { + CURLversion age; /* age of the returned struct */ + const char *version; /* LIBCURL_VERSION */ + unsigned int version_num; /* LIBCURL_VERSION_NUM */ + const char *host; /* OS/host/cpu/machine when configured */ + int features; /* bitmask, see defines below */ + const char *ssl_version; /* human readable string */ + long ssl_version_num; /* not used anymore, always 0 */ + const char *libz_version; /* human readable string */ + /* protocols is terminated by an entry with a NULL protoname */ + const char * const *protocols; + + /* The fields below this were added in CURLVERSION_SECOND */ + const char *ares; + int ares_num; + + /* This field was added in CURLVERSION_THIRD */ + const char *libidn; + + /* These field were added in CURLVERSION_FOURTH */ + + /* Same as '_libiconv_version' if built with HAVE_ICONV */ + int iconv_ver_num; + + const char *libssh_version; /* human readable string */ + +} curl_version_info_data; + +#define CURL_VERSION_IPV6 (1<<0) /* IPv6-enabled */ +#define CURL_VERSION_KERBEROS4 (1<<1) /* kerberos auth is supported */ +#define CURL_VERSION_SSL (1<<2) /* SSL options are present */ +#define CURL_VERSION_LIBZ (1<<3) /* libz features are present */ +#define CURL_VERSION_NTLM (1<<4) /* NTLM auth is supported */ +#define CURL_VERSION_GSSNEGOTIATE (1<<5) /* Negotiate auth support + (deprecated) */ +#define CURL_VERSION_DEBUG (1<<6) /* built with debug capabilities */ +#define CURL_VERSION_ASYNCHDNS (1<<7) /* asynchronous dns resolves */ +#define CURL_VERSION_SPNEGO (1<<8) /* SPNEGO auth is supported */ +#define CURL_VERSION_LARGEFILE (1<<9) /* supports files bigger than 2GB */ +#define CURL_VERSION_IDN (1<<10) /* International Domain Names support */ +#define CURL_VERSION_SSPI (1<<11) /* SSPI is supported */ +#define CURL_VERSION_CONV (1<<12) /* character conversions supported */ +#define CURL_VERSION_CURLDEBUG (1<<13) /* debug memory tracking supported */ +#define CURL_VERSION_TLSAUTH_SRP (1<<14) /* TLS-SRP auth is supported */ +#define CURL_VERSION_NTLM_WB (1<<15) /* NTLM delegating to winbind helper */ +#define CURL_VERSION_HTTP2 (1<<16) /* HTTP2 support built-in */ +#define CURL_VERSION_GSSAPI (1<<17) /* GSS-API is supported */ + + /* + * NAME curl_version_info() + * + * DESCRIPTION + * + * This function returns a pointer to a static copy of the version info + * struct. See above. + */ +CURL_EXTERN curl_version_info_data *curl_version_info(CURLversion); + +/* + * NAME curl_easy_strerror() + * + * DESCRIPTION + * + * The curl_easy_strerror function may be used to turn a CURLcode value + * into the equivalent human readable error string. This is useful + * for printing meaningful error messages. + */ +CURL_EXTERN const char *curl_easy_strerror(CURLcode); + +/* + * NAME curl_share_strerror() + * + * DESCRIPTION + * + * The curl_share_strerror function may be used to turn a CURLSHcode value + * into the equivalent human readable error string. This is useful + * for printing meaningful error messages. + */ +CURL_EXTERN const char *curl_share_strerror(CURLSHcode); + +/* + * NAME curl_easy_pause() + * + * DESCRIPTION + * + * The curl_easy_pause function pauses or unpauses transfers. Select the new + * state by setting the bitmask, use the convenience defines below. 
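A sketch of the usual error-reporting pattern built on curl_easy_strerror() above; the helper name is made up.

#include <stdio.h>
#include <curl/curl.h>

static int perform_or_report(CURL *curl)
{
  CURLcode rc = curl_easy_perform(curl);
  if(rc != CURLE_OK) {
    fprintf(stderr, "libcurl error %d: %s\n", (int)rc, curl_easy_strerror(rc));
    return -1;
  }
  return 0;
}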
+ * + */ +CURL_EXTERN CURLcode curl_easy_pause(CURL *handle, int bitmask); + +#define CURLPAUSE_RECV (1<<0) +#define CURLPAUSE_RECV_CONT (0) + +#define CURLPAUSE_SEND (1<<2) +#define CURLPAUSE_SEND_CONT (0) + +#define CURLPAUSE_ALL (CURLPAUSE_RECV|CURLPAUSE_SEND) +#define CURLPAUSE_CONT (CURLPAUSE_RECV_CONT|CURLPAUSE_SEND_CONT) + +#ifdef __cplusplus +} +#endif + +/* unfortunately, the easy.h and multi.h include files need options and info + stuff before they can be included! */ +#include "easy.h" /* nothing in curl is fun without the easy stuff */ +#include "multi.h" + +/* the typechecker doesn't work in C++ (yet) */ +#if defined(__GNUC__) && defined(__GNUC_MINOR__) && \ + ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) && \ + !defined(__cplusplus) && !defined(CURL_DISABLE_TYPECHECK) +#include "typecheck-gcc.h" +#else +#if defined(__STDC__) && (__STDC__ >= 1) +/* This preprocessor magic that replaces a call with the exact same call is + only done to make sure application authors pass exactly three arguments + to these functions. */ +#define curl_easy_setopt(handle,opt,param) curl_easy_setopt(handle,opt,param) +#define curl_easy_getinfo(handle,info,arg) curl_easy_getinfo(handle,info,arg) +#define curl_share_setopt(share,opt,param) curl_share_setopt(share,opt,param) +#define curl_multi_setopt(handle,opt,param) curl_multi_setopt(handle,opt,param) +#endif /* __STDC__ >= 1 */ +#endif /* gcc >= 4.3 && !__cplusplus */ + +#endif /* __CURL_CURL_H */ diff --git a/compat/curl-for-windows/curl/include/curl/curlbuild.h b/compat/curl-for-windows/curl/include/curl/curlbuild.h new file mode 100644 index 0000000000..aa53673f3f --- /dev/null +++ b/compat/curl-for-windows/curl/include/curl/curlbuild.h @@ -0,0 +1,180 @@ +#ifndef __CURL_CURLBUILD_H +#define __CURL_CURLBUILD_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2008, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +/* ================================================================ */ +/* NOTES FOR CONFIGURE CAPABLE SYSTEMS */ +/* ================================================================ */ + +/* + * NOTE 1: + * ------- + * + * Nothing in this file is intended to be modified or adjusted by the + * curl library user nor by the curl library builder. + * + * If you think that something actually needs to be changed, adjusted + * or fixed in this file, then, report it on the libcurl development + * mailing list: http://cool.haxx.se/mailman/listinfo/curl-library/ + * + * This header file shall only export symbols which are 'curl' or 'CURL' + * prefixed, otherwise public name space would be polluted. 
+ * + * NOTE 2: + * ------- + * + * Right now you might be staring at file include/curl/curlbuild.h.in or + * at file include/curl/curlbuild.h, this is due to the following reason: + * + * On systems capable of running the configure script, the configure process + * will overwrite the distributed include/curl/curlbuild.h file with one that + * is suitable and specific to the library being configured and built, which + * is generated from the include/curl/curlbuild.h.in template file. + * + */ + +/* ================================================================ */ +/* DEFINITION OF THESE SYMBOLS SHALL NOT TAKE PLACE ANYWHERE ELSE */ +/* ================================================================ */ + +#ifdef CURL_SIZEOF_LONG +# error "CURL_SIZEOF_LONG shall not be defined except in curlbuild.h" + Error Compilation_aborted_CURL_SIZEOF_LONG_already_defined +#endif + +#ifdef CURL_TYPEOF_CURL_SOCKLEN_T +# error "CURL_TYPEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" + Error Compilation_aborted_CURL_TYPEOF_CURL_SOCKLEN_T_already_defined +#endif + +#ifdef CURL_SIZEOF_CURL_SOCKLEN_T +# error "CURL_SIZEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" + Error Compilation_aborted_CURL_SIZEOF_CURL_SOCKLEN_T_already_defined +#endif +#ifdef CURL_TYPEOF_CURL_OFF_T +# error "CURL_TYPEOF_CURL_OFF_T shall not be defined except in curlbuild.h" + Error Compilation_aborted_CURL_TYPEOF_CURL_OFF_T_already_defined +#endif + +#ifdef CURL_FORMAT_CURL_OFF_T +# error "CURL_FORMAT_CURL_OFF_T shall not be defined except in curlbuild.h" + Error Compilation_aborted_CURL_FORMAT_CURL_OFF_T_already_defined +#endif + +#ifdef CURL_FORMAT_CURL_OFF_TU +# error "CURL_FORMAT_CURL_OFF_TU shall not be defined except in curlbuild.h" + Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_already_defined +#endif + +#ifdef CURL_FORMAT_OFF_T +# error "CURL_FORMAT_OFF_T shall not be defined except in curlbuild.h" + Error Compilation_aborted_CURL_FORMAT_OFF_T_already_defined +#endif + +#ifdef CURL_SIZEOF_CURL_OFF_T +# error "CURL_SIZEOF_CURL_OFF_T shall not be defined except in curlbuild.h" + Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_already_defined +#endif + +#ifdef CURL_SUFFIX_CURL_OFF_T +# error "CURL_SUFFIX_CURL_OFF_T shall not be defined except in curlbuild.h" + Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_T_already_defined +#endif + +#ifdef CURL_SUFFIX_CURL_OFF_TU +# error "CURL_SUFFIX_CURL_OFF_TU shall not be defined except in curlbuild.h" + Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_TU_already_defined +#endif + +/* ================================================================ */ +/* EXTERNAL INTERFACE SETTINGS FOR CONFIGURE CAPABLE SYSTEMS ONLY */ +/* ================================================================ */ + +/* Configure process defines this to 1 when it finds out that system */ +/* header file sys/types.h must be included by the external interface. */ +/* #undef CURL_PULL_SYS_TYPES_H */ +#ifdef CURL_PULL_SYS_TYPES_H +# include +#endif + +/* Configure process defines this to 1 when it finds out that system */ +/* header file stdint.h must be included by the external interface. */ +/* #undef CURL_PULL_STDINT_H */ +#ifdef CURL_PULL_STDINT_H +# include +#endif + +/* Configure process defines this to 1 when it finds out that system */ +/* header file inttypes.h must be included by the external interface. */ +/* #undef CURL_PULL_INTTYPES_H */ +#ifdef CURL_PULL_INTTYPES_H +# include +#endif + +/* The size of `long', as computed by sizeof. 
*/ +#define CURL_SIZEOF_LONG 4 + +/* Integral data type used for curl_socklen_t. */ +#define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t + +/* on windows socklen_t is in here */ +#ifdef _WIN32 +# include +# include +#endif + +#ifdef HAVE_SYS_SOCKET_H +# include +#endif + +/* Data type definition of curl_socklen_t. */ +typedef CURL_TYPEOF_CURL_SOCKLEN_T curl_socklen_t; + +/* The size of `curl_socklen_t', as computed by sizeof. */ +#define CURL_SIZEOF_CURL_SOCKLEN_T 4 + +/* Signed integral data type used for curl_off_t. */ +#define CURL_TYPEOF_CURL_OFF_T long long + +/* Data type definition of curl_off_t. */ +typedef CURL_TYPEOF_CURL_OFF_T curl_off_t; + +/* curl_off_t formatting string directive without "%" conversion specifier. */ +#define CURL_FORMAT_CURL_OFF_T "lld" + +/* unsigned curl_off_t formatting string without "%" conversion specifier. */ +#define CURL_FORMAT_CURL_OFF_TU "llu" + +/* curl_off_t formatting string directive with "%" conversion specifier. */ +#define CURL_FORMAT_OFF_T "%lld" + +/* The size of `curl_off_t', as computed by sizeof. */ +#define CURL_SIZEOF_CURL_OFF_T 8 + +/* curl_off_t constant suffix. */ +#define CURL_SUFFIX_CURL_OFF_T LL + +/* unsigned curl_off_t constant suffix. */ +#define CURL_SUFFIX_CURL_OFF_TU ULL + +#endif /* __CURL_CURLBUILD_H */ diff --git a/compat/curl-for-windows/curl/include/curl/curlrules.h b/compat/curl-for-windows/curl/include/curl/curlrules.h new file mode 100644 index 0000000000..7c2ede35b6 --- /dev/null +++ b/compat/curl-for-windows/curl/include/curl/curlrules.h @@ -0,0 +1,262 @@ +#ifndef __CURL_CURLRULES_H +#define __CURL_CURLRULES_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2012, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +/* ================================================================ */ +/* COMPILE TIME SANITY CHECKS */ +/* ================================================================ */ + +/* + * NOTE 1: + * ------- + * + * All checks done in this file are intentionally placed in a public + * header file which is pulled by curl/curl.h when an application is + * being built using an already built libcurl library. Additionally + * this file is also included and used when building the library. + * + * If compilation fails on this file it is certainly sure that the + * problem is elsewhere. It could be a problem in the curlbuild.h + * header file, or simply that you are using different compilation + * settings than those used to build the library. + * + * Nothing in this file is intended to be modified or adjusted by the + * curl library user nor by the curl library builder. + * + * Do not deactivate any check, these are done to make sure that the + * library is properly built and used. 
+ * + * You can find further help on the libcurl development mailing list: + * http://cool.haxx.se/mailman/listinfo/curl-library/ + * + * NOTE 2 + * ------ + * + * Some of the following compile time checks are based on the fact + * that the dimension of a constant array can not be a negative one. + * In this way if the compile time verification fails, the compilation + * will fail issuing an error. The error description wording is compiler + * dependent but it will be quite similar to one of the following: + * + * "negative subscript or subscript is too large" + * "array must have at least one element" + * "-1 is an illegal array size" + * "size of array is negative" + * + * If you are building an application which tries to use an already + * built libcurl library and you are getting this kind of errors on + * this file, it is a clear indication that there is a mismatch between + * how the library was built and how you are trying to use it for your + * application. Your already compiled or binary library provider is the + * only one who can give you the details you need to properly use it. + */ + +/* + * Verify that some macros are actually defined. + */ + +#ifndef CURL_SIZEOF_LONG +# error "CURL_SIZEOF_LONG definition is missing!" + Error Compilation_aborted_CURL_SIZEOF_LONG_is_missing +#endif + +#ifndef CURL_TYPEOF_CURL_SOCKLEN_T +# error "CURL_TYPEOF_CURL_SOCKLEN_T definition is missing!" + Error Compilation_aborted_CURL_TYPEOF_CURL_SOCKLEN_T_is_missing +#endif + +#ifndef CURL_SIZEOF_CURL_SOCKLEN_T +# error "CURL_SIZEOF_CURL_SOCKLEN_T definition is missing!" + Error Compilation_aborted_CURL_SIZEOF_CURL_SOCKLEN_T_is_missing +#endif + +#ifndef CURL_TYPEOF_CURL_OFF_T +# error "CURL_TYPEOF_CURL_OFF_T definition is missing!" + Error Compilation_aborted_CURL_TYPEOF_CURL_OFF_T_is_missing +#endif + +#ifndef CURL_FORMAT_CURL_OFF_T +# error "CURL_FORMAT_CURL_OFF_T definition is missing!" + Error Compilation_aborted_CURL_FORMAT_CURL_OFF_T_is_missing +#endif + +#ifndef CURL_FORMAT_CURL_OFF_TU +# error "CURL_FORMAT_CURL_OFF_TU definition is missing!" + Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_is_missing +#endif + +#ifndef CURL_FORMAT_OFF_T +# error "CURL_FORMAT_OFF_T definition is missing!" + Error Compilation_aborted_CURL_FORMAT_OFF_T_is_missing +#endif + +#ifndef CURL_SIZEOF_CURL_OFF_T +# error "CURL_SIZEOF_CURL_OFF_T definition is missing!" + Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_is_missing +#endif + +#ifndef CURL_SUFFIX_CURL_OFF_T +# error "CURL_SUFFIX_CURL_OFF_T definition is missing!" + Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_T_is_missing +#endif + +#ifndef CURL_SUFFIX_CURL_OFF_TU +# error "CURL_SUFFIX_CURL_OFF_TU definition is missing!" + Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_TU_is_missing +#endif + +/* + * Macros private to this header file. + */ + +#define CurlchkszEQ(t, s) sizeof(t) == s ? 1 : -1 + +#define CurlchkszGE(t1, t2) sizeof(t1) >= sizeof(t2) ? 1 : -1 + +/* + * Verify that the size previously defined and expected for long + * is the same as the one reported by sizeof() at compile time. + */ + +typedef char + __curl_rule_01__ + [CurlchkszEQ(long, CURL_SIZEOF_LONG)]; + +/* + * Verify that the size previously defined and expected for + * curl_off_t is actually the the same as the one reported + * by sizeof() at compile time. 
+ */ + +typedef char + __curl_rule_02__ + [CurlchkszEQ(curl_off_t, CURL_SIZEOF_CURL_OFF_T)]; + +/* + * Verify at compile time that the size of curl_off_t as reported + * by sizeof() is greater or equal than the one reported for long + * for the current compilation. + */ + +typedef char + __curl_rule_03__ + [CurlchkszGE(curl_off_t, long)]; + +/* + * Verify that the size previously defined and expected for + * curl_socklen_t is actually the the same as the one reported + * by sizeof() at compile time. + */ + +typedef char + __curl_rule_04__ + [CurlchkszEQ(curl_socklen_t, CURL_SIZEOF_CURL_SOCKLEN_T)]; + +/* + * Verify at compile time that the size of curl_socklen_t as reported + * by sizeof() is greater or equal than the one reported for int for + * the current compilation. + */ + +typedef char + __curl_rule_05__ + [CurlchkszGE(curl_socklen_t, int)]; + +/* ================================================================ */ +/* EXTERNALLY AND INTERNALLY VISIBLE DEFINITIONS */ +/* ================================================================ */ + +/* + * CURL_ISOCPP and CURL_OFF_T_C definitions are done here in order to allow + * these to be visible and exported by the external libcurl interface API, + * while also making them visible to the library internals, simply including + * curl_setup.h, without actually needing to include curl.h internally. + * If some day this section would grow big enough, all this should be moved + * to its own header file. + */ + +/* + * Figure out if we can use the ## preprocessor operator, which is supported + * by ISO/ANSI C and C++. Some compilers support it without setting __STDC__ + * or __cplusplus so we need to carefully check for them too. + */ + +#if defined(__STDC__) || defined(_MSC_VER) || defined(__cplusplus) || \ + defined(__HP_aCC) || defined(__BORLANDC__) || defined(__LCC__) || \ + defined(__POCC__) || defined(__SALFORDC__) || defined(__HIGHC__) || \ + defined(__ILEC400__) + /* This compiler is believed to have an ISO compatible preprocessor */ +#define CURL_ISOCPP +#else + /* This compiler is believed NOT to have an ISO compatible preprocessor */ +#undef CURL_ISOCPP +#endif + +/* + * Macros for minimum-width signed and unsigned curl_off_t integer constants. + */ + +#if defined(__BORLANDC__) && (__BORLANDC__ == 0x0551) +# define __CURL_OFF_T_C_HLPR2(x) x +# define __CURL_OFF_T_C_HLPR1(x) __CURL_OFF_T_C_HLPR2(x) +# define CURL_OFF_T_C(Val) __CURL_OFF_T_C_HLPR1(Val) ## \ + __CURL_OFF_T_C_HLPR1(CURL_SUFFIX_CURL_OFF_T) +# define CURL_OFF_TU_C(Val) __CURL_OFF_T_C_HLPR1(Val) ## \ + __CURL_OFF_T_C_HLPR1(CURL_SUFFIX_CURL_OFF_TU) +#else +# ifdef CURL_ISOCPP +# define __CURL_OFF_T_C_HLPR2(Val,Suffix) Val ## Suffix +# else +# define __CURL_OFF_T_C_HLPR2(Val,Suffix) Val/**/Suffix +# endif +# define __CURL_OFF_T_C_HLPR1(Val,Suffix) __CURL_OFF_T_C_HLPR2(Val,Suffix) +# define CURL_OFF_T_C(Val) __CURL_OFF_T_C_HLPR1(Val,CURL_SUFFIX_CURL_OFF_T) +# define CURL_OFF_TU_C(Val) __CURL_OFF_T_C_HLPR1(Val,CURL_SUFFIX_CURL_OFF_TU) +#endif + +/* + * Get rid of macros private to this header file. + */ + +#undef CurlchkszEQ +#undef CurlchkszGE + +/* + * Get rid of macros not intended to exist beyond this point. 
+ */ + +#undef CURL_PULL_WS2TCPIP_H +#undef CURL_PULL_SYS_TYPES_H +#undef CURL_PULL_SYS_SOCKET_H +#undef CURL_PULL_SYS_POLL_H +#undef CURL_PULL_STDINT_H +#undef CURL_PULL_INTTYPES_H + +#undef CURL_TYPEOF_CURL_SOCKLEN_T +#undef CURL_TYPEOF_CURL_OFF_T + +#ifdef CURL_NO_OLDIES +#undef CURL_FORMAT_OFF_T /* not required since 7.19.0 - obsoleted in 7.20.0 */ +#endif + +#endif /* __CURL_CURLRULES_H */ diff --git a/compat/curl-for-windows/curl/include/curl/curlver.h b/compat/curl-for-windows/curl/include/curl/curlver.h new file mode 100644 index 0000000000..7cc268f788 --- /dev/null +++ b/compat/curl-for-windows/curl/include/curl/curlver.h @@ -0,0 +1,69 @@ +#ifndef __CURL_CURLVER_H +#define __CURL_CURLVER_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2014, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +/* This header file contains nothing but libcurl version info, generated by + a script at release-time. This was made its own header file in 7.11.2 */ + +/* This is the global package copyright */ +#define LIBCURL_COPYRIGHT "1996 - 2014 Daniel Stenberg, ." + +/* This is the version number of the libcurl package from which this header + file origins: */ +#define LIBCURL_VERSION "7.38.0" + +/* The numeric version number is also available "in parts" by using these + defines: */ +#define LIBCURL_VERSION_MAJOR 7 +#define LIBCURL_VERSION_MINOR 38 +#define LIBCURL_VERSION_PATCH 0 + +/* This is the numeric version of the libcurl version number, meant for easier + parsing and comparions by programs. The LIBCURL_VERSION_NUM define will + always follow this syntax: + + 0xXXYYZZ + + Where XX, YY and ZZ are the main version, release and patch numbers in + hexadecimal (using 8 bits each). All three numbers are always represented + using two digits. 1.2 would appear as "0x010200" while version 9.11.7 + appears as "0x090b07". + + This 6-digit (24 bits) hexadecimal number does not show pre-release number, + and it is always a greater number in a more recent release. It makes + comparisons with greater than and less than work. +*/ +#define LIBCURL_VERSION_NUM 0x072600 + +/* + * This is the date and time when the full source package was created. The + * timestamp is not stored in git, as the timestamp is properly set in the + * tarballs by the maketgz script. 
+ * + * The format of the date should follow this template: + * + * "Mon Feb 12 11:35:33 UTC 2007" + */ +#define LIBCURL_TIMESTAMP "Mon Nov 03 12:00:00 UTC 2014" + +#endif /* __CURL_CURLVER_H */ diff --git a/compat/curl-for-windows/curl/include/curl/easy.h b/compat/curl-for-windows/curl/include/curl/easy.h new file mode 100644 index 0000000000..c1e3e76096 --- /dev/null +++ b/compat/curl-for-windows/curl/include/curl/easy.h @@ -0,0 +1,102 @@ +#ifndef __CURL_EASY_H +#define __CURL_EASY_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2008, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ +#ifdef __cplusplus +extern "C" { +#endif + +CURL_EXTERN CURL *curl_easy_init(void); +CURL_EXTERN CURLcode curl_easy_setopt(CURL *curl, CURLoption option, ...); +CURL_EXTERN CURLcode curl_easy_perform(CURL *curl); +CURL_EXTERN void curl_easy_cleanup(CURL *curl); + +/* + * NAME curl_easy_getinfo() + * + * DESCRIPTION + * + * Request internal information from the curl session with this function. The + * third argument MUST be a pointer to a long, a pointer to a char * or a + * pointer to a double (as the documentation describes elsewhere). The data + * pointed to will be filled in accordingly and can be relied upon only if the + * function returns CURLE_OK. This function is intended to get used *AFTER* a + * performed transfer, all results from this function are undefined until the + * transfer is completed. + */ +CURL_EXTERN CURLcode curl_easy_getinfo(CURL *curl, CURLINFO info, ...); + + +/* + * NAME curl_easy_duphandle() + * + * DESCRIPTION + * + * Creates a new curl session handle with the same options set for the handle + * passed in. Duplicating a handle could only be a matter of cloning data and + * options, internal state info and things like persistent connections cannot + * be transferred. It is useful in multithreaded applications when you can run + * curl_easy_duphandle() for each new thread to avoid a series of identical + * curl_easy_setopt() invokes in every thread. + */ +CURL_EXTERN CURL* curl_easy_duphandle(CURL *curl); + +/* + * NAME curl_easy_reset() + * + * DESCRIPTION + * + * Re-initializes a CURL handle to the default values. This puts back the + * handle to the same state as it was in when it was just created. + * + * It does keep: live connections, the Session ID cache, the DNS cache and the + * cookies. + */ +CURL_EXTERN void curl_easy_reset(CURL *curl); + +/* + * NAME curl_easy_recv() + * + * DESCRIPTION + * + * Receives data from the connected socket. Use after successful + * curl_easy_perform() with CURLOPT_CONNECT_ONLY option. 
+ */ +CURL_EXTERN CURLcode curl_easy_recv(CURL *curl, void *buffer, size_t buflen, + size_t *n); + +/* + * NAME curl_easy_send() + * + * DESCRIPTION + * + * Sends data over the connected socket. Use after successful + * curl_easy_perform() with CURLOPT_CONNECT_ONLY option. + */ +CURL_EXTERN CURLcode curl_easy_send(CURL *curl, const void *buffer, + size_t buflen, size_t *n); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/curl-for-windows/curl/include/curl/mprintf.h b/compat/curl-for-windows/curl/include/curl/mprintf.h new file mode 100644 index 0000000000..cc9e7f5d1f --- /dev/null +++ b/compat/curl-for-windows/curl/include/curl/mprintf.h @@ -0,0 +1,81 @@ +#ifndef __CURL_MPRINTF_H +#define __CURL_MPRINTF_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2013, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +#include +#include /* needed for FILE */ + +#include "curl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +CURL_EXTERN int curl_mprintf(const char *format, ...); +CURL_EXTERN int curl_mfprintf(FILE *fd, const char *format, ...); +CURL_EXTERN int curl_msprintf(char *buffer, const char *format, ...); +CURL_EXTERN int curl_msnprintf(char *buffer, size_t maxlength, + const char *format, ...); +CURL_EXTERN int curl_mvprintf(const char *format, va_list args); +CURL_EXTERN int curl_mvfprintf(FILE *fd, const char *format, va_list args); +CURL_EXTERN int curl_mvsprintf(char *buffer, const char *format, va_list args); +CURL_EXTERN int curl_mvsnprintf(char *buffer, size_t maxlength, + const char *format, va_list args); +CURL_EXTERN char *curl_maprintf(const char *format, ...); +CURL_EXTERN char *curl_mvaprintf(const char *format, va_list args); + +#ifdef _MPRINTF_REPLACE +# undef printf +# undef fprintf +# undef sprintf +# undef vsprintf +# undef snprintf +# undef vprintf +# undef vfprintf +# undef vsnprintf +# undef aprintf +# undef vaprintf +# define printf curl_mprintf +# define fprintf curl_mfprintf +#ifdef CURLDEBUG +/* When built with CURLDEBUG we define away the sprintf functions since we + don't want internal code to be using them */ +# define sprintf sprintf_was_used +# define vsprintf vsprintf_was_used +#else +# define sprintf curl_msprintf +# define vsprintf curl_mvsprintf +#endif +# define snprintf curl_msnprintf +# define vprintf curl_mvprintf +# define vfprintf curl_mvfprintf +# define vsnprintf curl_mvsnprintf +# define aprintf curl_maprintf +# define vaprintf curl_mvaprintf +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __CURL_MPRINTF_H */ diff --git a/compat/curl-for-windows/curl/include/curl/multi.h b/compat/curl-for-windows/curl/include/curl/multi.h new file mode 100644 index 0000000000..3c4acb0f6e --- /dev/null +++ b/compat/curl-for-windows/curl/include/curl/multi.h @@ -0,0 +1,399 
@@ +#ifndef __CURL_MULTI_H +#define __CURL_MULTI_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2013, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ +/* + This is an "external" header file. Don't give away any internals here! + + GOALS + + o Enable a "pull" interface. The application that uses libcurl decides where + and when to ask libcurl to get/send data. + + o Enable multiple simultaneous transfers in the same thread without making it + complicated for the application. + + o Enable the application to select() on its own file descriptors and curl's + file descriptors simultaneous easily. + +*/ + +/* + * This header file should not really need to include "curl.h" since curl.h + * itself includes this file and we expect user applications to do #include + * without the need for especially including multi.h. + * + * For some reason we added this include here at one point, and rather than to + * break existing (wrongly written) libcurl applications, we leave it as-is + * but with this warning attached. + */ +#include "curl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void CURLM; + +typedef enum { + CURLM_CALL_MULTI_PERFORM = -1, /* please call curl_multi_perform() or + curl_multi_socket*() soon */ + CURLM_OK, + CURLM_BAD_HANDLE, /* the passed-in handle is not a valid CURLM handle */ + CURLM_BAD_EASY_HANDLE, /* an easy handle was not good/valid */ + CURLM_OUT_OF_MEMORY, /* if you ever get this, you're in deep sh*t */ + CURLM_INTERNAL_ERROR, /* this is a libcurl bug */ + CURLM_BAD_SOCKET, /* the passed in socket argument did not match */ + CURLM_UNKNOWN_OPTION, /* curl_multi_setopt() with unsupported option */ + CURLM_ADDED_ALREADY, /* an easy handle already added to a multi handle was + attempted to get added - again */ + CURLM_LAST +} CURLMcode; + +/* just to make code nicer when using curl_multi_socket() you can now check + for CURLM_CALL_MULTI_SOCKET too in the same style it works for + curl_multi_perform() and CURLM_CALL_MULTI_PERFORM */ +#define CURLM_CALL_MULTI_SOCKET CURLM_CALL_MULTI_PERFORM + +typedef enum { + CURLMSG_NONE, /* first, not used */ + CURLMSG_DONE, /* This easy handle has completed. 'result' contains + the CURLcode of the transfer */ + CURLMSG_LAST /* last, not used */ +} CURLMSG; + +struct CURLMsg { + CURLMSG msg; /* what this message means */ + CURL *easy_handle; /* the handle it concerns */ + union { + void *whatever; /* message-specific data */ + CURLcode result; /* return code for transfer */ + } data; +}; +typedef struct CURLMsg CURLMsg; + +/* Based on poll(2) structure and values. + * We don't use pollfd and POLL* constants explicitly + * to cover platforms without poll(). 
*/ +#define CURL_WAIT_POLLIN 0x0001 +#define CURL_WAIT_POLLPRI 0x0002 +#define CURL_WAIT_POLLOUT 0x0004 + +struct curl_waitfd { + curl_socket_t fd; + short events; + short revents; /* not supported yet */ +}; + +/* + * Name: curl_multi_init() + * + * Desc: inititalize multi-style curl usage + * + * Returns: a new CURLM handle to use in all 'curl_multi' functions. + */ +CURL_EXTERN CURLM *curl_multi_init(void); + +/* + * Name: curl_multi_add_handle() + * + * Desc: add a standard curl handle to the multi stack + * + * Returns: CURLMcode type, general multi error code. + */ +CURL_EXTERN CURLMcode curl_multi_add_handle(CURLM *multi_handle, + CURL *curl_handle); + + /* + * Name: curl_multi_remove_handle() + * + * Desc: removes a curl handle from the multi stack again + * + * Returns: CURLMcode type, general multi error code. + */ +CURL_EXTERN CURLMcode curl_multi_remove_handle(CURLM *multi_handle, + CURL *curl_handle); + + /* + * Name: curl_multi_fdset() + * + * Desc: Ask curl for its fd_set sets. The app can use these to select() or + * poll() on. We want curl_multi_perform() called as soon as one of + * them are ready. + * + * Returns: CURLMcode type, general multi error code. + */ +CURL_EXTERN CURLMcode curl_multi_fdset(CURLM *multi_handle, + fd_set *read_fd_set, + fd_set *write_fd_set, + fd_set *exc_fd_set, + int *max_fd); + +/* + * Name: curl_multi_wait() + * + * Desc: Poll on all fds within a CURLM set as well as any + * additional fds passed to the function. + * + * Returns: CURLMcode type, general multi error code. + */ +CURL_EXTERN CURLMcode curl_multi_wait(CURLM *multi_handle, + struct curl_waitfd extra_fds[], + unsigned int extra_nfds, + int timeout_ms, + int *ret); + + /* + * Name: curl_multi_perform() + * + * Desc: When the app thinks there's data available for curl it calls this + * function to read/write whatever there is right now. This returns + * as soon as the reads and writes are done. This function does not + * require that there actually is data available for reading or that + * data can be written, it can be called just in case. It returns + * the number of handles that still transfer data in the second + * argument's integer-pointer. + * + * Returns: CURLMcode type, general multi error code. *NOTE* that this only + * returns errors etc regarding the whole multi stack. There might + * still have occurred problems on invidual transfers even when this + * returns OK. + */ +CURL_EXTERN CURLMcode curl_multi_perform(CURLM *multi_handle, + int *running_handles); + + /* + * Name: curl_multi_cleanup() + * + * Desc: Cleans up and removes a whole multi stack. It does not free or + * touch any individual easy handles in any way. We need to define + * in what state those handles will be if this function is called + * in the middle of a transfer. + * + * Returns: CURLMcode type, general multi error code. + */ +CURL_EXTERN CURLMcode curl_multi_cleanup(CURLM *multi_handle); + +/* + * Name: curl_multi_info_read() + * + * Desc: Ask the multi handle if there's any messages/informationals from + * the individual transfers. Messages include informationals such as + * error code from the transfer or just the fact that a transfer is + * completed. More details on these should be written down as well. + * + * Repeated calls to this function will return a new struct each + * time, until a special "end of msgs" struct is returned as a signal + * that there is no more to get at this point. + * + * The data the returned pointer points to will not survive calling + * curl_multi_cleanup(). 
+ * + * The 'CURLMsg' struct is meant to be very simple and only contain + * very basic informations. If more involved information is wanted, + * we will provide the particular "transfer handle" in that struct + * and that should/could/would be used in subsequent + * curl_easy_getinfo() calls (or similar). The point being that we + * must never expose complex structs to applications, as then we'll + * undoubtably get backwards compatibility problems in the future. + * + * Returns: A pointer to a filled-in struct, or NULL if it failed or ran out + * of structs. It also writes the number of messages left in the + * queue (after this read) in the integer the second argument points + * to. + */ +CURL_EXTERN CURLMsg *curl_multi_info_read(CURLM *multi_handle, + int *msgs_in_queue); + +/* + * Name: curl_multi_strerror() + * + * Desc: The curl_multi_strerror function may be used to turn a CURLMcode + * value into the equivalent human readable error string. This is + * useful for printing meaningful error messages. + * + * Returns: A pointer to a zero-terminated error message. + */ +CURL_EXTERN const char *curl_multi_strerror(CURLMcode); + +/* + * Name: curl_multi_socket() and + * curl_multi_socket_all() + * + * Desc: An alternative version of curl_multi_perform() that allows the + * application to pass in one of the file descriptors that have been + * detected to have "action" on them and let libcurl perform. + * See man page for details. + */ +#define CURL_POLL_NONE 0 +#define CURL_POLL_IN 1 +#define CURL_POLL_OUT 2 +#define CURL_POLL_INOUT 3 +#define CURL_POLL_REMOVE 4 + +#define CURL_SOCKET_TIMEOUT CURL_SOCKET_BAD + +#define CURL_CSELECT_IN 0x01 +#define CURL_CSELECT_OUT 0x02 +#define CURL_CSELECT_ERR 0x04 + +typedef int (*curl_socket_callback)(CURL *easy, /* easy handle */ + curl_socket_t s, /* socket */ + int what, /* see above */ + void *userp, /* private callback + pointer */ + void *socketp); /* private socket + pointer */ +/* + * Name: curl_multi_timer_callback + * + * Desc: Called by libcurl whenever the library detects a change in the + * maximum number of milliseconds the app is allowed to wait before + * curl_multi_socket() or curl_multi_perform() must be called + * (to allow libcurl's timed events to take place). + * + * Returns: The callback should return zero. + */ +typedef int (*curl_multi_timer_callback)(CURLM *multi, /* multi handle */ + long timeout_ms, /* see above */ + void *userp); /* private callback + pointer */ + +CURL_EXTERN CURLMcode curl_multi_socket(CURLM *multi_handle, curl_socket_t s, + int *running_handles); + +CURL_EXTERN CURLMcode curl_multi_socket_action(CURLM *multi_handle, + curl_socket_t s, + int ev_bitmask, + int *running_handles); + +CURL_EXTERN CURLMcode curl_multi_socket_all(CURLM *multi_handle, + int *running_handles); + +#ifndef CURL_ALLOW_OLD_MULTI_SOCKET +/* This macro below was added in 7.16.3 to push users who recompile to use + the new curl_multi_socket_action() instead of the old curl_multi_socket() +*/ +#define curl_multi_socket(x,y,z) curl_multi_socket_action(x,y,0,z) +#endif + +/* + * Name: curl_multi_timeout() + * + * Desc: Returns the maximum number of milliseconds the app is allowed to + * wait before curl_multi_socket() or curl_multi_perform() must be + * called (to allow libcurl's timed events to take place). + * + * Returns: CURLM error code. 
+ */ +CURL_EXTERN CURLMcode curl_multi_timeout(CURLM *multi_handle, + long *milliseconds); + +#undef CINIT /* re-using the same name as in curl.h */ + +#ifdef CURL_ISOCPP +#define CINIT(name,type,num) CURLMOPT_ ## name = CURLOPTTYPE_ ## type + num +#else +/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. */ +#define LONG CURLOPTTYPE_LONG +#define OBJECTPOINT CURLOPTTYPE_OBJECTPOINT +#define FUNCTIONPOINT CURLOPTTYPE_FUNCTIONPOINT +#define OFF_T CURLOPTTYPE_OFF_T +#define CINIT(name,type,number) CURLMOPT_/**/name = type + number +#endif + +typedef enum { + /* This is the socket callback function pointer */ + CINIT(SOCKETFUNCTION, FUNCTIONPOINT, 1), + + /* This is the argument passed to the socket callback */ + CINIT(SOCKETDATA, OBJECTPOINT, 2), + + /* set to 1 to enable pipelining for this multi handle */ + CINIT(PIPELINING, LONG, 3), + + /* This is the timer callback function pointer */ + CINIT(TIMERFUNCTION, FUNCTIONPOINT, 4), + + /* This is the argument passed to the timer callback */ + CINIT(TIMERDATA, OBJECTPOINT, 5), + + /* maximum number of entries in the connection cache */ + CINIT(MAXCONNECTS, LONG, 6), + + /* maximum number of (pipelining) connections to one host */ + CINIT(MAX_HOST_CONNECTIONS, LONG, 7), + + /* maximum number of requests in a pipeline */ + CINIT(MAX_PIPELINE_LENGTH, LONG, 8), + + /* a connection with a content-length longer than this + will not be considered for pipelining */ + CINIT(CONTENT_LENGTH_PENALTY_SIZE, OFF_T, 9), + + /* a connection with a chunk length longer than this + will not be considered for pipelining */ + CINIT(CHUNK_LENGTH_PENALTY_SIZE, OFF_T, 10), + + /* a list of site names(+port) that are blacklisted from + pipelining */ + CINIT(PIPELINING_SITE_BL, OBJECTPOINT, 11), + + /* a list of server types that are blacklisted from + pipelining */ + CINIT(PIPELINING_SERVER_BL, OBJECTPOINT, 12), + + /* maximum number of open connections in total */ + CINIT(MAX_TOTAL_CONNECTIONS, LONG, 13), + + CURLMOPT_LASTENTRY /* the last unused */ +} CURLMoption; + + +/* + * Name: curl_multi_setopt() + * + * Desc: Sets options for the multi handle. + * + * Returns: CURLM error code. + */ +CURL_EXTERN CURLMcode curl_multi_setopt(CURLM *multi_handle, + CURLMoption option, ...); + + +/* + * Name: curl_multi_assign() + * + * Desc: This function sets an association in the multi handle between the + * given socket and a private pointer of the application. This is + * (only) useful for curl_multi_socket uses. + * + * Returns: CURLM error code. + */ +CURL_EXTERN CURLMcode curl_multi_assign(CURLM *multi_handle, + curl_socket_t sockfd, void *sockp); + +#ifdef __cplusplus +} /* end of extern "C" */ +#endif + +#endif diff --git a/compat/curl-for-windows/curl/include/curl/stdcheaders.h b/compat/curl-for-windows/curl/include/curl/stdcheaders.h new file mode 100644 index 0000000000..ad82ef6335 --- /dev/null +++ b/compat/curl-for-windows/curl/include/curl/stdcheaders.h @@ -0,0 +1,33 @@ +#ifndef __STDC_HEADERS_H +#define __STDC_HEADERS_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. 
+ * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +#include + +size_t fread (void *, size_t, size_t, FILE *); +size_t fwrite (const void *, size_t, size_t, FILE *); + +int strcasecmp(const char *, const char *); +int strncasecmp(const char *, const char *, size_t); + +#endif /* __STDC_HEADERS_H */ diff --git a/compat/curl-for-windows/curl/include/curl/typecheck-gcc.h b/compat/curl-for-windows/curl/include/curl/typecheck-gcc.h new file mode 100644 index 0000000000..69d41a20d1 --- /dev/null +++ b/compat/curl-for-windows/curl/include/curl/typecheck-gcc.h @@ -0,0 +1,610 @@ +#ifndef __CURL_TYPECHECK_GCC_H +#define __CURL_TYPECHECK_GCC_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2014, Daniel Stenberg, , et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +/* wraps curl_easy_setopt() with typechecking */ + +/* To add a new kind of warning, add an + * if(_curl_is_sometype_option(_curl_opt)) + * if(!_curl_is_sometype(value)) + * _curl_easy_setopt_err_sometype(); + * block and define _curl_is_sometype_option, _curl_is_sometype and + * _curl_easy_setopt_err_sometype below + * + * NOTE: We use two nested 'if' statements here instead of the && operator, in + * order to work around gcc bug #32061. It affects only gcc 4.3.x/4.4.x + * when compiling with -Wlogical-op. 
+ * + * To add an option that uses the same type as an existing option, you'll just + * need to extend the appropriate _curl_*_option macro + */ +#define curl_easy_setopt(handle, option, value) \ +__extension__ ({ \ + __typeof__ (option) _curl_opt = option; \ + if(__builtin_constant_p(_curl_opt)) { \ + if(_curl_is_long_option(_curl_opt)) \ + if(!_curl_is_long(value)) \ + _curl_easy_setopt_err_long(); \ + if(_curl_is_off_t_option(_curl_opt)) \ + if(!_curl_is_off_t(value)) \ + _curl_easy_setopt_err_curl_off_t(); \ + if(_curl_is_string_option(_curl_opt)) \ + if(!_curl_is_string(value)) \ + _curl_easy_setopt_err_string(); \ + if(_curl_is_write_cb_option(_curl_opt)) \ + if(!_curl_is_write_cb(value)) \ + _curl_easy_setopt_err_write_callback(); \ + if((_curl_opt) == CURLOPT_READFUNCTION) \ + if(!_curl_is_read_cb(value)) \ + _curl_easy_setopt_err_read_cb(); \ + if((_curl_opt) == CURLOPT_IOCTLFUNCTION) \ + if(!_curl_is_ioctl_cb(value)) \ + _curl_easy_setopt_err_ioctl_cb(); \ + if((_curl_opt) == CURLOPT_SOCKOPTFUNCTION) \ + if(!_curl_is_sockopt_cb(value)) \ + _curl_easy_setopt_err_sockopt_cb(); \ + if((_curl_opt) == CURLOPT_OPENSOCKETFUNCTION) \ + if(!_curl_is_opensocket_cb(value)) \ + _curl_easy_setopt_err_opensocket_cb(); \ + if((_curl_opt) == CURLOPT_PROGRESSFUNCTION) \ + if(!_curl_is_progress_cb(value)) \ + _curl_easy_setopt_err_progress_cb(); \ + if((_curl_opt) == CURLOPT_DEBUGFUNCTION) \ + if(!_curl_is_debug_cb(value)) \ + _curl_easy_setopt_err_debug_cb(); \ + if((_curl_opt) == CURLOPT_SSL_CTX_FUNCTION) \ + if(!_curl_is_ssl_ctx_cb(value)) \ + _curl_easy_setopt_err_ssl_ctx_cb(); \ + if(_curl_is_conv_cb_option(_curl_opt)) \ + if(!_curl_is_conv_cb(value)) \ + _curl_easy_setopt_err_conv_cb(); \ + if((_curl_opt) == CURLOPT_SEEKFUNCTION) \ + if(!_curl_is_seek_cb(value)) \ + _curl_easy_setopt_err_seek_cb(); \ + if(_curl_is_cb_data_option(_curl_opt)) \ + if(!_curl_is_cb_data(value)) \ + _curl_easy_setopt_err_cb_data(); \ + if((_curl_opt) == CURLOPT_ERRORBUFFER) \ + if(!_curl_is_error_buffer(value)) \ + _curl_easy_setopt_err_error_buffer(); \ + if((_curl_opt) == CURLOPT_STDERR) \ + if(!_curl_is_FILE(value)) \ + _curl_easy_setopt_err_FILE(); \ + if(_curl_is_postfields_option(_curl_opt)) \ + if(!_curl_is_postfields(value)) \ + _curl_easy_setopt_err_postfields(); \ + if((_curl_opt) == CURLOPT_HTTPPOST) \ + if(!_curl_is_arr((value), struct curl_httppost)) \ + _curl_easy_setopt_err_curl_httpost(); \ + if(_curl_is_slist_option(_curl_opt)) \ + if(!_curl_is_arr((value), struct curl_slist)) \ + _curl_easy_setopt_err_curl_slist(); \ + if((_curl_opt) == CURLOPT_SHARE) \ + if(!_curl_is_ptr((value), CURLSH)) \ + _curl_easy_setopt_err_CURLSH(); \ + } \ + curl_easy_setopt(handle, _curl_opt, value); \ +}) + +/* wraps curl_easy_getinfo() with typechecking */ +/* FIXME: don't allow const pointers */ +#define curl_easy_getinfo(handle, info, arg) \ +__extension__ ({ \ + __typeof__ (info) _curl_info = info; \ + if(__builtin_constant_p(_curl_info)) { \ + if(_curl_is_string_info(_curl_info)) \ + if(!_curl_is_arr((arg), char *)) \ + _curl_easy_getinfo_err_string(); \ + if(_curl_is_long_info(_curl_info)) \ + if(!_curl_is_arr((arg), long)) \ + _curl_easy_getinfo_err_long(); \ + if(_curl_is_double_info(_curl_info)) \ + if(!_curl_is_arr((arg), double)) \ + _curl_easy_getinfo_err_double(); \ + if(_curl_is_slist_info(_curl_info)) \ + if(!_curl_is_arr((arg), struct curl_slist *)) \ + _curl_easy_getinfo_err_curl_slist(); \ + } \ + curl_easy_getinfo(handle, _curl_info, arg); \ +}) + +/* TODO: typechecking for curl_share_setopt() and 
curl_multi_setopt(), + * for now just make sure that the functions are called with three + * arguments + */ +#define curl_share_setopt(share,opt,param) curl_share_setopt(share,opt,param) +#define curl_multi_setopt(handle,opt,param) curl_multi_setopt(handle,opt,param) + + +/* the actual warnings, triggered by calling the _curl_easy_setopt_err* + * functions */ + +/* To define a new warning, use _CURL_WARNING(identifier, "message") */ +#define _CURL_WARNING(id, message) \ + static void __attribute__((__warning__(message))) \ + __attribute__((__unused__)) __attribute__((__noinline__)) \ + id(void) { __asm__(""); } + +_CURL_WARNING(_curl_easy_setopt_err_long, + "curl_easy_setopt expects a long argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_curl_off_t, + "curl_easy_setopt expects a curl_off_t argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_string, + "curl_easy_setopt expects a " + "string (char* or char[]) argument for this option" + ) +_CURL_WARNING(_curl_easy_setopt_err_write_callback, + "curl_easy_setopt expects a curl_write_callback argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_read_cb, + "curl_easy_setopt expects a curl_read_callback argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_ioctl_cb, + "curl_easy_setopt expects a curl_ioctl_callback argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_sockopt_cb, + "curl_easy_setopt expects a curl_sockopt_callback argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_opensocket_cb, + "curl_easy_setopt expects a " + "curl_opensocket_callback argument for this option" + ) +_CURL_WARNING(_curl_easy_setopt_err_progress_cb, + "curl_easy_setopt expects a curl_progress_callback argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_debug_cb, + "curl_easy_setopt expects a curl_debug_callback argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_ssl_ctx_cb, + "curl_easy_setopt expects a curl_ssl_ctx_callback argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_conv_cb, + "curl_easy_setopt expects a curl_conv_callback argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_seek_cb, + "curl_easy_setopt expects a curl_seek_callback argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_cb_data, + "curl_easy_setopt expects a " + "private data pointer as argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_error_buffer, + "curl_easy_setopt expects a " + "char buffer of CURL_ERROR_SIZE as argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_FILE, + "curl_easy_setopt expects a FILE* argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_postfields, + "curl_easy_setopt expects a void* or char* argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_curl_httpost, + "curl_easy_setopt expects a struct curl_httppost* argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_curl_slist, + "curl_easy_setopt expects a struct curl_slist* argument for this option") +_CURL_WARNING(_curl_easy_setopt_err_CURLSH, + "curl_easy_setopt expects a CURLSH* argument for this option") + +_CURL_WARNING(_curl_easy_getinfo_err_string, + "curl_easy_getinfo expects a pointer to char * for this info") +_CURL_WARNING(_curl_easy_getinfo_err_long, + "curl_easy_getinfo expects a pointer to long for this info") +_CURL_WARNING(_curl_easy_getinfo_err_double, + "curl_easy_getinfo expects a pointer to double for this info") +_CURL_WARNING(_curl_easy_getinfo_err_curl_slist, + "curl_easy_getinfo expects a pointer to struct 
curl_slist * for this info") + +/* groups of curl_easy_setops options that take the same type of argument */ + +/* To add a new option to one of the groups, just add + * (option) == CURLOPT_SOMETHING + * to the or-expression. If the option takes a long or curl_off_t, you don't + * have to do anything + */ + +/* evaluates to true if option takes a long argument */ +#define _curl_is_long_option(option) \ + (0 < (option) && (option) < CURLOPTTYPE_OBJECTPOINT) + +#define _curl_is_off_t_option(option) \ + ((option) > CURLOPTTYPE_OFF_T) + +/* evaluates to true if option takes a char* argument */ +#define _curl_is_string_option(option) \ + ((option) == CURLOPT_URL || \ + (option) == CURLOPT_PROXY || \ + (option) == CURLOPT_INTERFACE || \ + (option) == CURLOPT_NETRC_FILE || \ + (option) == CURLOPT_USERPWD || \ + (option) == CURLOPT_USERNAME || \ + (option) == CURLOPT_PASSWORD || \ + (option) == CURLOPT_PROXYUSERPWD || \ + (option) == CURLOPT_PROXYUSERNAME || \ + (option) == CURLOPT_PROXYPASSWORD || \ + (option) == CURLOPT_NOPROXY || \ + (option) == CURLOPT_ACCEPT_ENCODING || \ + (option) == CURLOPT_REFERER || \ + (option) == CURLOPT_USERAGENT || \ + (option) == CURLOPT_COOKIE || \ + (option) == CURLOPT_COOKIEFILE || \ + (option) == CURLOPT_COOKIEJAR || \ + (option) == CURLOPT_COOKIELIST || \ + (option) == CURLOPT_FTPPORT || \ + (option) == CURLOPT_FTP_ALTERNATIVE_TO_USER || \ + (option) == CURLOPT_FTP_ACCOUNT || \ + (option) == CURLOPT_RANGE || \ + (option) == CURLOPT_CUSTOMREQUEST || \ + (option) == CURLOPT_SSLCERT || \ + (option) == CURLOPT_SSLCERTTYPE || \ + (option) == CURLOPT_SSLKEY || \ + (option) == CURLOPT_SSLKEYTYPE || \ + (option) == CURLOPT_KEYPASSWD || \ + (option) == CURLOPT_SSLENGINE || \ + (option) == CURLOPT_CAINFO || \ + (option) == CURLOPT_CAPATH || \ + (option) == CURLOPT_RANDOM_FILE || \ + (option) == CURLOPT_EGDSOCKET || \ + (option) == CURLOPT_SSL_CIPHER_LIST || \ + (option) == CURLOPT_KRBLEVEL || \ + (option) == CURLOPT_SSH_HOST_PUBLIC_KEY_MD5 || \ + (option) == CURLOPT_SSH_PUBLIC_KEYFILE || \ + (option) == CURLOPT_SSH_PRIVATE_KEYFILE || \ + (option) == CURLOPT_CRLFILE || \ + (option) == CURLOPT_ISSUERCERT || \ + (option) == CURLOPT_SOCKS5_GSSAPI_SERVICE || \ + (option) == CURLOPT_SSH_KNOWNHOSTS || \ + (option) == CURLOPT_MAIL_FROM || \ + (option) == CURLOPT_RTSP_SESSION_ID || \ + (option) == CURLOPT_RTSP_STREAM_URI || \ + (option) == CURLOPT_RTSP_TRANSPORT || \ + (option) == CURLOPT_XOAUTH2_BEARER || \ + (option) == CURLOPT_DNS_SERVERS || \ + (option) == CURLOPT_DNS_INTERFACE || \ + (option) == CURLOPT_DNS_LOCAL_IP4 || \ + (option) == CURLOPT_DNS_LOCAL_IP6 || \ + (option) == CURLOPT_LOGIN_OPTIONS || \ + 0) + +/* evaluates to true if option takes a curl_write_callback argument */ +#define _curl_is_write_cb_option(option) \ + ((option) == CURLOPT_HEADERFUNCTION || \ + (option) == CURLOPT_WRITEFUNCTION) + +/* evaluates to true if option takes a curl_conv_callback argument */ +#define _curl_is_conv_cb_option(option) \ + ((option) == CURLOPT_CONV_TO_NETWORK_FUNCTION || \ + (option) == CURLOPT_CONV_FROM_NETWORK_FUNCTION || \ + (option) == CURLOPT_CONV_FROM_UTF8_FUNCTION) + +/* evaluates to true if option takes a data argument to pass to a callback */ +#define _curl_is_cb_data_option(option) \ + ((option) == CURLOPT_WRITEDATA || \ + (option) == CURLOPT_READDATA || \ + (option) == CURLOPT_IOCTLDATA || \ + (option) == CURLOPT_SOCKOPTDATA || \ + (option) == CURLOPT_OPENSOCKETDATA || \ + (option) == CURLOPT_PROGRESSDATA || \ + (option) == CURLOPT_HEADERDATA || \ + (option) == 
CURLOPT_DEBUGDATA || \ + (option) == CURLOPT_SSL_CTX_DATA || \ + (option) == CURLOPT_SEEKDATA || \ + (option) == CURLOPT_PRIVATE || \ + (option) == CURLOPT_SSH_KEYDATA || \ + (option) == CURLOPT_INTERLEAVEDATA || \ + (option) == CURLOPT_CHUNK_DATA || \ + (option) == CURLOPT_FNMATCH_DATA || \ + 0) + +/* evaluates to true if option takes a POST data argument (void* or char*) */ +#define _curl_is_postfields_option(option) \ + ((option) == CURLOPT_POSTFIELDS || \ + (option) == CURLOPT_COPYPOSTFIELDS || \ + 0) + +/* evaluates to true if option takes a struct curl_slist * argument */ +#define _curl_is_slist_option(option) \ + ((option) == CURLOPT_HTTPHEADER || \ + (option) == CURLOPT_HTTP200ALIASES || \ + (option) == CURLOPT_QUOTE || \ + (option) == CURLOPT_POSTQUOTE || \ + (option) == CURLOPT_PREQUOTE || \ + (option) == CURLOPT_TELNETOPTIONS || \ + (option) == CURLOPT_MAIL_RCPT || \ + 0) + +/* groups of curl_easy_getinfo infos that take the same type of argument */ + +/* evaluates to true if info expects a pointer to char * argument */ +#define _curl_is_string_info(info) \ + (CURLINFO_STRING < (info) && (info) < CURLINFO_LONG) + +/* evaluates to true if info expects a pointer to long argument */ +#define _curl_is_long_info(info) \ + (CURLINFO_LONG < (info) && (info) < CURLINFO_DOUBLE) + +/* evaluates to true if info expects a pointer to double argument */ +#define _curl_is_double_info(info) \ + (CURLINFO_DOUBLE < (info) && (info) < CURLINFO_SLIST) + +/* true if info expects a pointer to struct curl_slist * argument */ +#define _curl_is_slist_info(info) \ + (CURLINFO_SLIST < (info)) + + +/* typecheck helpers -- check whether given expression has requested type*/ + +/* For pointers, you can use the _curl_is_ptr/_curl_is_arr macros, + * otherwise define a new macro. Search for __builtin_types_compatible_p + * in the GCC manual. + * NOTE: these macros MUST NOT EVALUATE their arguments! The argument is + * the actual expression passed to the curl_easy_setopt macro. This + * means that you can only apply the sizeof and __typeof__ operators, no + * == or whatsoever. 
+ */ + +/* XXX: should evaluate to true iff expr is a pointer */ +#define _curl_is_any_ptr(expr) \ + (sizeof(expr) == sizeof(void*)) + +/* evaluates to true if expr is NULL */ +/* XXX: must not evaluate expr, so this check is not accurate */ +#define _curl_is_NULL(expr) \ + (__builtin_types_compatible_p(__typeof__(expr), __typeof__(NULL))) + +/* evaluates to true if expr is type*, const type* or NULL */ +#define _curl_is_ptr(expr, type) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), type *) || \ + __builtin_types_compatible_p(__typeof__(expr), const type *)) + +/* evaluates to true if expr is one of type[], type*, NULL or const type* */ +#define _curl_is_arr(expr, type) \ + (_curl_is_ptr((expr), type) || \ + __builtin_types_compatible_p(__typeof__(expr), type [])) + +/* evaluates to true if expr is a string */ +#define _curl_is_string(expr) \ + (_curl_is_arr((expr), char) || \ + _curl_is_arr((expr), signed char) || \ + _curl_is_arr((expr), unsigned char)) + +/* evaluates to true if expr is a long (no matter the signedness) + * XXX: for now, int is also accepted (and therefore short and char, which + * are promoted to int when passed to a variadic function) */ +#define _curl_is_long(expr) \ + (__builtin_types_compatible_p(__typeof__(expr), long) || \ + __builtin_types_compatible_p(__typeof__(expr), signed long) || \ + __builtin_types_compatible_p(__typeof__(expr), unsigned long) || \ + __builtin_types_compatible_p(__typeof__(expr), int) || \ + __builtin_types_compatible_p(__typeof__(expr), signed int) || \ + __builtin_types_compatible_p(__typeof__(expr), unsigned int) || \ + __builtin_types_compatible_p(__typeof__(expr), short) || \ + __builtin_types_compatible_p(__typeof__(expr), signed short) || \ + __builtin_types_compatible_p(__typeof__(expr), unsigned short) || \ + __builtin_types_compatible_p(__typeof__(expr), char) || \ + __builtin_types_compatible_p(__typeof__(expr), signed char) || \ + __builtin_types_compatible_p(__typeof__(expr), unsigned char)) + +/* evaluates to true if expr is of type curl_off_t */ +#define _curl_is_off_t(expr) \ + (__builtin_types_compatible_p(__typeof__(expr), curl_off_t)) + +/* evaluates to true if expr is abuffer suitable for CURLOPT_ERRORBUFFER */ +/* XXX: also check size of an char[] array? */ +#define _curl_is_error_buffer(expr) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), char *) || \ + __builtin_types_compatible_p(__typeof__(expr), char[])) + +/* evaluates to true if expr is of type (const) void* or (const) FILE* */ +#if 0 +#define _curl_is_cb_data(expr) \ + (_curl_is_ptr((expr), void) || \ + _curl_is_ptr((expr), FILE)) +#else /* be less strict */ +#define _curl_is_cb_data(expr) \ + _curl_is_any_ptr(expr) +#endif + +/* evaluates to true if expr is of type FILE* */ +#define _curl_is_FILE(expr) \ + (__builtin_types_compatible_p(__typeof__(expr), FILE *)) + +/* evaluates to true if expr can be passed as POST data (void* or char*) */ +#define _curl_is_postfields(expr) \ + (_curl_is_ptr((expr), void) || \ + _curl_is_arr((expr), char)) + +/* FIXME: the whole callback checking is messy... + * The idea is to tolerate char vs. void and const vs. 
not const + * pointers in arguments at least + */ +/* helper: __builtin_types_compatible_p distinguishes between functions and + * function pointers, hide it */ +#define _curl_callback_compatible(func, type) \ + (__builtin_types_compatible_p(__typeof__(func), type) || \ + __builtin_types_compatible_p(__typeof__(func), type*)) + +/* evaluates to true if expr is of type curl_read_callback or "similar" */ +#define _curl_is_read_cb(expr) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), __typeof__(fread)) || \ + __builtin_types_compatible_p(__typeof__(expr), curl_read_callback) || \ + _curl_callback_compatible((expr), _curl_read_callback1) || \ + _curl_callback_compatible((expr), _curl_read_callback2) || \ + _curl_callback_compatible((expr), _curl_read_callback3) || \ + _curl_callback_compatible((expr), _curl_read_callback4) || \ + _curl_callback_compatible((expr), _curl_read_callback5) || \ + _curl_callback_compatible((expr), _curl_read_callback6)) +typedef size_t (_curl_read_callback1)(char *, size_t, size_t, void*); +typedef size_t (_curl_read_callback2)(char *, size_t, size_t, const void*); +typedef size_t (_curl_read_callback3)(char *, size_t, size_t, FILE*); +typedef size_t (_curl_read_callback4)(void *, size_t, size_t, void*); +typedef size_t (_curl_read_callback5)(void *, size_t, size_t, const void*); +typedef size_t (_curl_read_callback6)(void *, size_t, size_t, FILE*); + +/* evaluates to true if expr is of type curl_write_callback or "similar" */ +#define _curl_is_write_cb(expr) \ + (_curl_is_read_cb(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), __typeof__(fwrite)) || \ + __builtin_types_compatible_p(__typeof__(expr), curl_write_callback) || \ + _curl_callback_compatible((expr), _curl_write_callback1) || \ + _curl_callback_compatible((expr), _curl_write_callback2) || \ + _curl_callback_compatible((expr), _curl_write_callback3) || \ + _curl_callback_compatible((expr), _curl_write_callback4) || \ + _curl_callback_compatible((expr), _curl_write_callback5) || \ + _curl_callback_compatible((expr), _curl_write_callback6)) +typedef size_t (_curl_write_callback1)(const char *, size_t, size_t, void*); +typedef size_t (_curl_write_callback2)(const char *, size_t, size_t, + const void*); +typedef size_t (_curl_write_callback3)(const char *, size_t, size_t, FILE*); +typedef size_t (_curl_write_callback4)(const void *, size_t, size_t, void*); +typedef size_t (_curl_write_callback5)(const void *, size_t, size_t, + const void*); +typedef size_t (_curl_write_callback6)(const void *, size_t, size_t, FILE*); + +/* evaluates to true if expr is of type curl_ioctl_callback or "similar" */ +#define _curl_is_ioctl_cb(expr) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), curl_ioctl_callback) || \ + _curl_callback_compatible((expr), _curl_ioctl_callback1) || \ + _curl_callback_compatible((expr), _curl_ioctl_callback2) || \ + _curl_callback_compatible((expr), _curl_ioctl_callback3) || \ + _curl_callback_compatible((expr), _curl_ioctl_callback4)) +typedef curlioerr (_curl_ioctl_callback1)(CURL *, int, void*); +typedef curlioerr (_curl_ioctl_callback2)(CURL *, int, const void*); +typedef curlioerr (_curl_ioctl_callback3)(CURL *, curliocmd, void*); +typedef curlioerr (_curl_ioctl_callback4)(CURL *, curliocmd, const void*); + +/* evaluates to true if expr is of type curl_sockopt_callback or "similar" */ +#define _curl_is_sockopt_cb(expr) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), curl_sockopt_callback) || 
\ + _curl_callback_compatible((expr), _curl_sockopt_callback1) || \ + _curl_callback_compatible((expr), _curl_sockopt_callback2)) +typedef int (_curl_sockopt_callback1)(void *, curl_socket_t, curlsocktype); +typedef int (_curl_sockopt_callback2)(const void *, curl_socket_t, + curlsocktype); + +/* evaluates to true if expr is of type curl_opensocket_callback or + "similar" */ +#define _curl_is_opensocket_cb(expr) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), curl_opensocket_callback) ||\ + _curl_callback_compatible((expr), _curl_opensocket_callback1) || \ + _curl_callback_compatible((expr), _curl_opensocket_callback2) || \ + _curl_callback_compatible((expr), _curl_opensocket_callback3) || \ + _curl_callback_compatible((expr), _curl_opensocket_callback4)) +typedef curl_socket_t (_curl_opensocket_callback1) + (void *, curlsocktype, struct curl_sockaddr *); +typedef curl_socket_t (_curl_opensocket_callback2) + (void *, curlsocktype, const struct curl_sockaddr *); +typedef curl_socket_t (_curl_opensocket_callback3) + (const void *, curlsocktype, struct curl_sockaddr *); +typedef curl_socket_t (_curl_opensocket_callback4) + (const void *, curlsocktype, const struct curl_sockaddr *); + +/* evaluates to true if expr is of type curl_progress_callback or "similar" */ +#define _curl_is_progress_cb(expr) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), curl_progress_callback) || \ + _curl_callback_compatible((expr), _curl_progress_callback1) || \ + _curl_callback_compatible((expr), _curl_progress_callback2)) +typedef int (_curl_progress_callback1)(void *, + double, double, double, double); +typedef int (_curl_progress_callback2)(const void *, + double, double, double, double); + +/* evaluates to true if expr is of type curl_debug_callback or "similar" */ +#define _curl_is_debug_cb(expr) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), curl_debug_callback) || \ + _curl_callback_compatible((expr), _curl_debug_callback1) || \ + _curl_callback_compatible((expr), _curl_debug_callback2) || \ + _curl_callback_compatible((expr), _curl_debug_callback3) || \ + _curl_callback_compatible((expr), _curl_debug_callback4) || \ + _curl_callback_compatible((expr), _curl_debug_callback5) || \ + _curl_callback_compatible((expr), _curl_debug_callback6) || \ + _curl_callback_compatible((expr), _curl_debug_callback7) || \ + _curl_callback_compatible((expr), _curl_debug_callback8)) +typedef int (_curl_debug_callback1) (CURL *, + curl_infotype, char *, size_t, void *); +typedef int (_curl_debug_callback2) (CURL *, + curl_infotype, char *, size_t, const void *); +typedef int (_curl_debug_callback3) (CURL *, + curl_infotype, const char *, size_t, void *); +typedef int (_curl_debug_callback4) (CURL *, + curl_infotype, const char *, size_t, const void *); +typedef int (_curl_debug_callback5) (CURL *, + curl_infotype, unsigned char *, size_t, void *); +typedef int (_curl_debug_callback6) (CURL *, + curl_infotype, unsigned char *, size_t, const void *); +typedef int (_curl_debug_callback7) (CURL *, + curl_infotype, const unsigned char *, size_t, void *); +typedef int (_curl_debug_callback8) (CURL *, + curl_infotype, const unsigned char *, size_t, const void *); + +/* evaluates to true if expr is of type curl_ssl_ctx_callback or "similar" */ +/* this is getting even messier... 
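/* Illustrative sketch (editor's example, not part of the vendored header):
 * the macros above are what let an fwrite-style callback -- here one whose
 * last parameter is FILE* rather than void* -- pass the CURLOPT_WRITEFUNCTION
 * type check without a warning, because it matches _curl_write_callback3.
 * The function names below are hypothetical. */
#include <stdio.h>
#include <curl/curl.h>

static size_t write_to_file(const char *ptr, size_t size, size_t nmemb, FILE *out)
{
    /* same shape as _curl_write_callback3: accepted by _curl_is_write_cb */
    return fwrite(ptr, size, nmemb, out);
}

static void fetch(const char *url, FILE *out)
{
    CURL *h = curl_easy_init();
    if (h == NULL)
        return;
    curl_easy_setopt(h, CURLOPT_URL, url);
    curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, write_to_file); /* no typecheck warning */
    curl_easy_setopt(h, CURLOPT_WRITEDATA, out);               /* handed to the callback */
    curl_easy_perform(h);
    curl_easy_cleanup(h);
}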
*/ +#define _curl_is_ssl_ctx_cb(expr) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), curl_ssl_ctx_callback) || \ + _curl_callback_compatible((expr), _curl_ssl_ctx_callback1) || \ + _curl_callback_compatible((expr), _curl_ssl_ctx_callback2) || \ + _curl_callback_compatible((expr), _curl_ssl_ctx_callback3) || \ + _curl_callback_compatible((expr), _curl_ssl_ctx_callback4) || \ + _curl_callback_compatible((expr), _curl_ssl_ctx_callback5) || \ + _curl_callback_compatible((expr), _curl_ssl_ctx_callback6) || \ + _curl_callback_compatible((expr), _curl_ssl_ctx_callback7) || \ + _curl_callback_compatible((expr), _curl_ssl_ctx_callback8)) +typedef CURLcode (_curl_ssl_ctx_callback1)(CURL *, void *, void *); +typedef CURLcode (_curl_ssl_ctx_callback2)(CURL *, void *, const void *); +typedef CURLcode (_curl_ssl_ctx_callback3)(CURL *, const void *, void *); +typedef CURLcode (_curl_ssl_ctx_callback4)(CURL *, const void *, const void *); +#ifdef HEADER_SSL_H +/* hack: if we included OpenSSL's ssl.h, we know about SSL_CTX + * this will of course break if we're included before OpenSSL headers... + */ +typedef CURLcode (_curl_ssl_ctx_callback5)(CURL *, SSL_CTX, void *); +typedef CURLcode (_curl_ssl_ctx_callback6)(CURL *, SSL_CTX, const void *); +typedef CURLcode (_curl_ssl_ctx_callback7)(CURL *, const SSL_CTX, void *); +typedef CURLcode (_curl_ssl_ctx_callback8)(CURL *, const SSL_CTX, + const void *); +#else +typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback5; +typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback6; +typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback7; +typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback8; +#endif + +/* evaluates to true if expr is of type curl_conv_callback or "similar" */ +#define _curl_is_conv_cb(expr) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), curl_conv_callback) || \ + _curl_callback_compatible((expr), _curl_conv_callback1) || \ + _curl_callback_compatible((expr), _curl_conv_callback2) || \ + _curl_callback_compatible((expr), _curl_conv_callback3) || \ + _curl_callback_compatible((expr), _curl_conv_callback4)) +typedef CURLcode (*_curl_conv_callback1)(char *, size_t length); +typedef CURLcode (*_curl_conv_callback2)(const char *, size_t length); +typedef CURLcode (*_curl_conv_callback3)(void *, size_t length); +typedef CURLcode (*_curl_conv_callback4)(const void *, size_t length); + +/* evaluates to true if expr is of type curl_seek_callback or "similar" */ +#define _curl_is_seek_cb(expr) \ + (_curl_is_NULL(expr) || \ + __builtin_types_compatible_p(__typeof__(expr), curl_seek_callback) || \ + _curl_callback_compatible((expr), _curl_seek_callback1) || \ + _curl_callback_compatible((expr), _curl_seek_callback2)) +typedef CURLcode (*_curl_seek_callback1)(void *, curl_off_t, int); +typedef CURLcode (*_curl_seek_callback2)(const void *, curl_off_t, int); + + +#endif /* __CURL_TYPECHECK_GCC_H */ diff --git a/compat/curl-for-windows/openssl/config/opensslconf.h b/compat/curl-for-windows/openssl/config/opensslconf.h new file mode 100644 index 0000000000..9bf23692d6 --- /dev/null +++ b/compat/curl-for-windows/openssl/config/opensslconf.h @@ -0,0 +1,333 @@ +/* opensslconf.h */ +/* WARNING: Generated automatically from opensslconf.h.in by Configure. 
*/ + +/* OpenSSL was configured with the following options: */ +#undef OPENSSL_SYSNAME_WIN32 +#if defined(_WIN32) +# define OPENSSL_SYSNAME_WIN32 +#endif + +#ifndef OPENSSL_DOING_MAKEDEPEND +# ifndef OPENSSL_NO_CAPIENG +# define OPENSSL_NO_CAPIENG +# endif +# ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 +# define OPENSSL_NO_EC_NISTP_64_GCC_128 +# endif +# ifndef OPENSSL_NO_GMP +# define OPENSSL_NO_GMP +# endif +# ifndef OPENSSL_NO_GOST +# define OPENSSL_NO_GOST +# endif +# ifndef OPENSSL_NO_HW_PADLOCK +# define OPENSSL_NO_HW_PADLOCK +# endif +# ifndef OPENSSL_NO_JPAKE +# define OPENSSL_NO_JPAKE +# endif +# ifndef OPENSSL_NO_KRB5 +# define OPENSSL_NO_KRB5 +# endif +# ifndef OPENSSL_NO_MD2 +# define OPENSSL_NO_MD2 +# endif +# ifndef OPENSSL_NO_RC5 +# define OPENSSL_NO_RC5 +# endif +# ifndef OPENSSL_NO_RFC3779 +# define OPENSSL_NO_RFC3779 +# endif +# ifndef OPENSSL_NO_SCTP +# define OPENSSL_NO_SCTP +# endif +# ifndef OPENSSL_NO_STORE +# define OPENSSL_NO_STORE +# endif +#endif /* OPENSSL_DOING_MAKEDEPEND */ + +#ifndef OPENSSL_THREADS +# define OPENSSL_THREADS +#endif +#ifndef OPENSSL_NO_DYNAMIC_ENGINE +# define OPENSSL_NO_DYNAMIC_ENGINE +#endif + +/* The OPENSSL_NO_* macros are also defined as NO_* if the application + asks for it. This is a transient feature that is provided for those + who haven't had the time to do the appropriate changes in their + applications. */ +#ifdef OPENSSL_ALGORITHM_DEFINES +# if defined(OPENSSL_NO_CAMELLIA) && !defined(NO_CAMELLIA) +# define NO_CAMELLIA +# endif +# if defined(OPENSSL_NO_CAPIENG) && !defined(NO_CAPIENG) +# define NO_CAPIENG +# endif +# if defined(OPENSSL_NO_CAST) && !defined(NO_CAST) +# define NO_CAST +# endif +# if defined(OPENSSL_NO_CMS) && !defined(NO_CMS) +# define NO_CMS +# endif +# if defined(OPENSSL_NO_FIPS) && !defined(NO_FIPS) +# define NO_FIPS +# endif +# if defined(OPENSSL_NO_GMP) && !defined(NO_GMP) +# define NO_GMP +# endif +# if defined(OPENSSL_NO_IDEA) && !defined(NO_IDEA) +# define NO_IDEA +# endif +# if defined(OPENSSL_NO_JPAKE) && !defined(NO_JPAKE) +# define NO_JPAKE +# endif +# if defined(OPENSSL_NO_KRB5) && !defined(NO_KRB5) +# define NO_KRB5 +# endif +# if defined(OPENSSL_NO_MD2) && !defined(NO_MD2) +# define NO_MD2 +# endif +# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2) +# define NO_MDC2 +# endif +# if defined(OPENSSL_NO_RC5) && !defined(NO_RC5) +# define NO_RC5 +# endif +# if defined(OPENSSL_NO_RFC3779) && !defined(NO_RFC3779) +# define NO_RFC3779 +# endif +# if defined(OPENSSL_NO_SEED) && !defined(NO_SEED) +# define NO_SEED +# endif +# if defined(OPENSSL_NO_SHA0) && !defined(NO_SHA0) +# define NO_SHA0 +# endif +# if defined(OPENSSL_NO_STORE) && !defined(NO_STORE) +# define NO_STORE +# endif +# if defined(OPENSSL_NO_WHRLPOOL) && !defined(NO_WHRLPOOL) +# define NO_WHRLPOOL +# endif +# if defined(OPENSSL_NO_MDC2) && !defined(NO_MDC2) +# define NO_MDC2 +# endif +#endif + +/* crypto/opensslconf.h.in */ + +#ifdef OPENSSL_DOING_MAKEDEPEND + /* Include any symbols here that have to be explicitly set to enable a feature + * that should be visible to makedepend. + * + * [Our "make depend" doesn't actually look at this, we use actual build settings + * instead; we want to make it easy to remove subdirectories with disabled algorithms.] + */ +# ifndef OPENSSL_FIPS +# define OPENSSL_FIPS +# endif +#endif + +/* Generate 80386 code? 
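/* Editor's example (not part of the generated header): application code
 * normally guards optional algorithms on the OPENSSL_NO_* macros defined
 * above; defining OPENSSL_ALGORITHM_DEFINES before including OpenSSL headers
 * additionally exposes the transitional short NO_* aliases, as the comment
 * above describes. */
#define OPENSSL_ALGORITHM_DEFINES       /* also request the legacy NO_* names */
#include <openssl/opensslconf.h>

#ifndef OPENSSL_NO_RC5
/* RC5 was enabled at configure time: RC5-specific code may go here. */
#endif
#ifdef NO_MD2                           /* short alias of OPENSSL_NO_MD2 */
/* MD2 was disabled at configure time. */
#endif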
*/ +#undef I386_ONLY + +#if !(defined(VMS) || defined(__VMS)) /* VMS uses logical names instead */ +# if defined(HEADER_CRYPTLIB_H) && !defined(OPENSSLDIR) +# if defined(_WIN32) +# define ENGINESDIR "ssl/lib/engines" +# define OPENSSLDIR "ssl" +# else +# define ENGINESDIR "/usr/local/ssl/lib/engines" +# define OPENSSLDIR "/usr/local/ssl" +# endif +# endif +#endif + +#undef OPENSSL_UNISTD +#define OPENSSL_UNISTD +#if !defined(_WIN32) && !defined(__arm__) && !defined(__mips__) && !defined(SWIG) +# include +#endif + +#undef OPENSSL_EXPORT_VAR_AS_FUNCTION +#if defined(_WIN32) +# define OPENSSL_EXPORT_VAR_AS_FUNCTION +#endif + +#if defined(HEADER_IDEA_H) +# undef IDEA_INT +# define IDEA_INT unsigned int +#endif + +#if defined(HEADER_MD2_H) +# undef MD2_INT +# define MD2_INT unsigned int +#endif + +#if defined(HEADER_RC2_H) +/* I need to put in a mod for the alpha - eay */ +# undef RC2_INT +# define RC2_INT unsigned int +#endif + +#if defined(HEADER_RC4_H) + /* using int types make the structure larger but make the code faster + * on most boxes I have tested - up to %20 faster. */ + /* + * I don't know what does "most" mean, but declaring "int" is a must on: + * - Intel P6 because partial register stalls are very expensive; + * - elder Alpha because it lacks byte load/store instructions; + */ +# undef RC4_INT +# if defined(__arm__) +# define RC4_INT unsigned char +# else +# define RC4_INT unsigned int +# endif + + /* + * This enables code handling data aligned at natural CPU word + * boundary. See crypto/rc4/rc4_enc.c for further details. + */ +# undef RC4_CHUNK +# if (defined(_M_X64) || defined(__x86_64__)) && defined(_WIN32) +# define RC4_CHUNK unsigned long long +# elif (defined(_M_X64) || defined(__x86_64__)) && !defined(_WIN32) +# define RC4_CHUNK unsigned long +# elif defined(__arm__) +# define RC4_CHUNK unsigned long +# else + /* On x86 RC4_CHUNK is not defined */ +# endif +#endif + +#if defined(HEADER_NEW_DES_H) || defined(HEADER_DES_H) + /* If this is set to 'unsigned int' on a DEC Alpha, this gives about a + * %20 speed up (longs are 8 bytes, int's are 4). */ +# undef DES_LONG +# if defined(_M_X64) || defined(__x86_64__) || defined(__arm__) || defined(__mips__) +# define DES_LONG unsigned int +# elif defined(_M_IX86) || defined(__i386__) +# define DES_LONG unsigned long +# endif +#endif + +#if defined(HEADER_BN_H) && !defined(CONFIG_HEADER_BN_H) +# define CONFIG_HEADER_BN_H + +# undef BL_LLONG +# if defined(_M_IX86) || defined(__i386__) || defined(__arm__) +# define BL_LLONG +# endif + + /* Should we define BN_DIV2W here? 
*/ + + /* Only one for the following should be defined */ + /* The prime number generation stuff may not work when + * EIGHT_BIT but I don't care since I've only used this mode + * for debuging the bignum libraries */ +# undef SIXTY_FOUR_BIT_LONG +# undef SIXTY_FOUR_BIT +# undef THIRTY_TWO_BIT +# undef SIXTEEN_BIT +# undef EIGHT_BIT +# if (defined(_M_X64) || defined(__x86_64__)) && defined(_WIN32) +# define SIXTY_FOUR_BIT +# elif (defined(_M_X64) || defined(__x86_64__)) && !defined(_WIN32) +# define SIXTY_FOUR_BIT_LONG +# elif defined(_M_IX86) || defined(__i386__) || defined(__arm__) || defined(__mips__) +# define THIRTY_TWO_BIT +# endif +#endif + +#if defined(HEADER_RC4_LOCL_H) && !defined(CONFIG_HEADER_RC4_LOCL_H) +# define CONFIG_HEADER_RC4_LOCL_H + /* if this is defined data[i] is used instead of *data, this is a %20 + * speedup on x86 */ +# undef RC4_INDEX +# if defined(_M_IX86) || defined(__i386__) +# define RC4_INDEX +# endif +#endif + +#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H) +# define CONFIG_HEADER_BF_LOCL_H +# undef BF_PTR +# if defined(__arm__) +# define BF_PTR +# endif +#endif /* HEADER_BF_LOCL_H */ + +#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H) +# define CONFIG_HEADER_DES_LOCL_H + +# ifndef DES_DEFAULT_OPTIONS + /* the following is tweaked from a config script, that is why it is a + * protected undef/define */ +# undef DES_PTR +# if !defined(_WIN32) && (defined(_M_IX86) || defined(__i386__)) +# define DES_PTR +# endif + + /* This helps C compiler generate the correct code for multiple functional + * units. It reduces register dependancies at the expense of 2 more + * registers */ +# undef DES_RISC1 +# if !defined(_WIN32) && (defined(_M_IX86) || defined(__i386__)) +# define DES_RISC1 +# endif + +# undef DES_RISC2 + +# if defined(DES_RISC1) && defined(DES_RISC2) +# error YOU SHOULD NOT HAVE BOTH DES_RISC1 AND DES_RISC2 DEFINED!!!!! +# endif + + /* Unroll the inner loop, this sometimes helps, sometimes hinders. + * Very mucy CPU dependant */ +# undef DES_UNROLL +# if !defined(_WIN32) +# define DES_UNROLL +# endif + + /* These default values were supplied by + * Peter Gutman + * They are only used if nothing else has been defined */ +# if !defined(DES_PTR) && !defined(DES_RISC1) && !defined(DES_RISC2) && !defined(DES_UNROLL) + /* Special defines which change the way the code is built depending on the + CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find + even newer MIPS CPU's, but at the moment one size fits all for + optimization options. 
Older Sparc's work better with only UNROLL, but + there's no way to tell at compile time what it is you're running on */ +# if defined( sun ) /* Newer Sparc's */ +# define DES_PTR +# define DES_RISC1 +# define DES_UNROLL +# elif defined( __ultrix ) /* Older MIPS */ +# define DES_PTR +# define DES_RISC2 +# define DES_UNROLL +# elif defined( __osf1__ ) /* Alpha */ +# define DES_PTR +# define DES_RISC2 +# elif defined ( _AIX ) /* RS6000 */ + /* Unknown */ +# elif defined( __hpux ) /* HP-PA */ + /* Unknown */ +# elif defined( __aux ) /* 68K */ + /* Unknown */ +# elif defined( __dgux ) /* 88K (but P6 in latest boxes) */ +# define DES_UNROLL +# elif defined( __sgi ) /* Newer MIPS */ +# define DES_PTR +# define DES_RISC2 +# define DES_UNROLL +# elif defined(i386) || defined(__i386__) /* x86 boxes, should be gcc */ +# define DES_PTR +# define DES_RISC1 +# define DES_UNROLL +# endif /* Systems-specific speed defines */ +# endif + +# endif /* DES_DEFAULT_OPTIONS */ +#endif /* HEADER_DES_LOCL_H */ diff --git a/compat/curl-for-windows/openssl/openssl/crypto/opensslconf.h b/compat/curl-for-windows/openssl/openssl/crypto/opensslconf.h new file mode 100644 index 0000000000..76c99d433a --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/crypto/opensslconf.h @@ -0,0 +1 @@ +#include "../../config/opensslconf.h" diff --git a/compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h b/compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h new file mode 100644 index 0000000000..8a6bf4bbbb --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/crypto/sha/sha.h @@ -0,0 +1,214 @@ +/* crypto/sha/sha.h */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. 
If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#ifndef HEADER_SHA_H +#define HEADER_SHA_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(OPENSSL_NO_SHA) || (defined(OPENSSL_NO_SHA0) && defined(OPENSSL_NO_SHA1)) +#error SHA is disabled. +#endif + +#if defined(OPENSSL_FIPS) +#define FIPS_SHA_SIZE_T size_t +#endif + +/* + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * ! SHA_LONG has to be at least 32 bits wide. If it's wider, then ! + * ! SHA_LONG_LOG2 has to be defined along. ! + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ + +#if defined(__LP32__) +#define SHA_LONG unsigned long +#elif defined(OPENSSL_SYS_CRAY) || defined(__ILP64__) +#define SHA_LONG unsigned long +#define SHA_LONG_LOG2 3 +#else +#define SHA_LONG unsigned int +#endif + +#define SHA_LBLOCK 16 +#define SHA_CBLOCK (SHA_LBLOCK*4) /* SHA treats input data as a + * contiguous array of 32 bit + * wide big-endian values. */ +#define SHA_LAST_BLOCK (SHA_CBLOCK-8) +#define SHA_DIGEST_LENGTH 20 + +typedef struct SHAstate_st + { + SHA_LONG h0,h1,h2,h3,h4; + SHA_LONG Nl,Nh; + SHA_LONG data[SHA_LBLOCK]; + unsigned int num; + } SHA_CTX; + +#ifndef OPENSSL_NO_SHA0 +#ifdef OPENSSL_FIPS +int private_SHA_Init(SHA_CTX *c); +#endif +int SHA_Init(SHA_CTX *c); +int SHA_Update(SHA_CTX *c, const void *data, size_t len); +int SHA_Final(unsigned char *md, SHA_CTX *c); +unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md); +void SHA_Transform(SHA_CTX *c, const unsigned char *data); +#endif +#ifndef OPENSSL_NO_SHA1 +#ifdef OPENSSL_FIPS +int private_SHA1_Init(SHA_CTX *c); +#endif +int SHA1_Init(SHA_CTX *c); +int SHA1_Update(SHA_CTX *c, const void *data, size_t len); +int SHA1_Final(unsigned char *md, SHA_CTX *c); +unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md); +void SHA1_Transform(SHA_CTX *c, const unsigned char *data); +#endif + +#define SHA256_CBLOCK (SHA_LBLOCK*4) /* SHA-256 treats input data as a + * contiguous array of 32 bit + * wide big-endian values. 
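/* Editor's usage sketch (not part of sha.h): the one-shot SHA1() and the
 * incremental SHA1_Init/SHA1_Update/SHA1_Final interface declared above
 * produce the same SHA_DIGEST_LENGTH (20) byte digest.  Names below are
 * hypothetical. */
#include <stddef.h>
#include <openssl/sha.h>

static void sha1_both_ways(const unsigned char *msg, size_t len,
                           unsigned char md[SHA_DIGEST_LENGTH])
{
    unsigned char md2[SHA_DIGEST_LENGTH];
    SHA_CTX ctx;

    SHA1(msg, len, md);          /* one-shot */

    SHA1_Init(&ctx);             /* incremental, e.g. for streamed input */
    SHA1_Update(&ctx, msg, len);
    SHA1_Final(md2, &ctx);
    /* md and md2 now hold the same 20-byte digest */
    (void)md2;
}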
*/ +#define SHA224_DIGEST_LENGTH 28 +#define SHA256_DIGEST_LENGTH 32 + +typedef struct SHA256state_st + { + SHA_LONG h[8]; + SHA_LONG Nl,Nh; + SHA_LONG data[SHA_LBLOCK]; + unsigned int num,md_len; + } SHA256_CTX; + +#ifndef OPENSSL_NO_SHA256 +#ifdef OPENSSL_FIPS +int private_SHA224_Init(SHA256_CTX *c); +int private_SHA256_Init(SHA256_CTX *c); +#endif +int SHA224_Init(SHA256_CTX *c); +int SHA224_Update(SHA256_CTX *c, const void *data, size_t len); +int SHA224_Final(unsigned char *md, SHA256_CTX *c); +unsigned char *SHA224(const unsigned char *d, size_t n,unsigned char *md); +int SHA256_Init(SHA256_CTX *c); +int SHA256_Update(SHA256_CTX *c, const void *data, size_t len); +int SHA256_Final(unsigned char *md, SHA256_CTX *c); +unsigned char *SHA256(const unsigned char *d, size_t n,unsigned char *md); +void SHA256_Transform(SHA256_CTX *c, const unsigned char *data); +#endif + +#define SHA384_DIGEST_LENGTH 48 +#define SHA512_DIGEST_LENGTH 64 + +#ifndef OPENSSL_NO_SHA512 +/* + * Unlike 32-bit digest algorithms, SHA-512 *relies* on SHA_LONG64 + * being exactly 64-bit wide. See Implementation Notes in sha512.c + * for further details. + */ +#define SHA512_CBLOCK (SHA_LBLOCK*8) /* SHA-512 treats input data as a + * contiguous array of 64 bit + * wide big-endian values. */ +#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) +#define SHA_LONG64 unsigned __int64 +#define U64(C) C##UI64 +#elif defined(__arch64__) +#define SHA_LONG64 unsigned long +#define U64(C) C##UL +#else +#define SHA_LONG64 unsigned long long +#define U64(C) C##ULL +#endif + +typedef struct SHA512state_st + { + SHA_LONG64 h[8]; + SHA_LONG64 Nl,Nh; + union { + SHA_LONG64 d[SHA_LBLOCK]; + unsigned char p[SHA512_CBLOCK]; + } u; + unsigned int num,md_len; + } SHA512_CTX; +#endif + +#ifndef OPENSSL_NO_SHA512 +#ifdef OPENSSL_FIPS +int private_SHA384_Init(SHA512_CTX *c); +int private_SHA512_Init(SHA512_CTX *c); +#endif +int SHA384_Init(SHA512_CTX *c); +int SHA384_Update(SHA512_CTX *c, const void *data, size_t len); +int SHA384_Final(unsigned char *md, SHA512_CTX *c); +unsigned char *SHA384(const unsigned char *d, size_t n,unsigned char *md); +int SHA512_Init(SHA512_CTX *c); +int SHA512_Update(SHA512_CTX *c, const void *data, size_t len); +int SHA512_Final(unsigned char *md, SHA512_CTX *c); +unsigned char *SHA512(const unsigned char *d, size_t n,unsigned char *md); +void SHA512_Transform(SHA512_CTX *c, const unsigned char *data); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/curl-for-windows/openssl/openssl/e_os2.h b/compat/curl-for-windows/openssl/openssl/e_os2.h new file mode 100644 index 0000000000..d22c0368f8 --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/e_os2.h @@ -0,0 +1,315 @@ +/* e_os2.h */ +/* ==================================================================== + * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. 
All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + +#include + +#ifndef HEADER_E_OS2_H +#define HEADER_E_OS2_H + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * Detect operating systems. This probably needs completing. + * The result is that at least one OPENSSL_SYS_os macro should be defined. + * However, if none is defined, Unix is assumed. + **/ + +#define OPENSSL_SYS_UNIX + +/* ----------------------- Macintosh, before MacOS X ----------------------- */ +#if defined(__MWERKS__) && defined(macintosh) || defined(OPENSSL_SYSNAME_MAC) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_MACINTOSH_CLASSIC +#endif + +/* ----------------------- NetWare ----------------------------------------- */ +#if defined(NETWARE) || defined(OPENSSL_SYSNAME_NETWARE) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_NETWARE +#endif + +/* ---------------------- Microsoft operating systems ---------------------- */ + +/* Note that MSDOS actually denotes 32-bit environments running on top of + MS-DOS, such as DJGPP one. */ +#if defined(OPENSSL_SYSNAME_MSDOS) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_MSDOS +#endif + +/* For 32 bit environment, there seems to be the CygWin environment and then + all the others that try to do the same thing Microsoft does... 
*/ +#if defined(OPENSSL_SYSNAME_UWIN) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32_UWIN +#else +# if defined(__CYGWIN32__) || defined(OPENSSL_SYSNAME_CYGWIN32) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32_CYGWIN +# else +# if defined(_WIN32) || defined(OPENSSL_SYSNAME_WIN32) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WIN32 +# endif +# if defined(OPENSSL_SYSNAME_WINNT) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WINNT +# endif +# if defined(OPENSSL_SYSNAME_WINCE) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WINCE +# endif +# endif +#endif + +/* Anything that tries to look like Microsoft is "Windows" */ +#if defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_WINNT) || defined(OPENSSL_SYS_WINCE) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_WINDOWS +# ifndef OPENSSL_SYS_MSDOS +# define OPENSSL_SYS_MSDOS +# endif +#endif + +/* DLL settings. This part is a bit tough, because it's up to the application + implementor how he or she will link the application, so it requires some + macro to be used. */ +#ifdef OPENSSL_SYS_WINDOWS +# ifndef OPENSSL_OPT_WINDLL +# if defined(_WINDLL) /* This is used when building OpenSSL to indicate that + DLL linkage should be used */ +# define OPENSSL_OPT_WINDLL +# endif +# endif +#endif + +/* -------------------------------- OpenVMS -------------------------------- */ +#if defined(__VMS) || defined(VMS) || defined(OPENSSL_SYSNAME_VMS) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_VMS +# if defined(__DECC) +# define OPENSSL_SYS_VMS_DECC +# elif defined(__DECCXX) +# define OPENSSL_SYS_VMS_DECC +# define OPENSSL_SYS_VMS_DECCXX +# else +# define OPENSSL_SYS_VMS_NODECC +# endif +#endif + +/* --------------------------------- OS/2 ---------------------------------- */ +#if defined(__EMX__) || defined(__OS2__) +# undef OPENSSL_SYS_UNIX +# define OPENSSL_SYS_OS2 +#endif + +/* --------------------------------- Unix ---------------------------------- */ +#ifdef OPENSSL_SYS_UNIX +# if defined(linux) || defined(__linux__) || defined(OPENSSL_SYSNAME_LINUX) +# define OPENSSL_SYS_LINUX +# endif +# ifdef OPENSSL_SYSNAME_MPE +# define OPENSSL_SYS_MPE +# endif +# ifdef OPENSSL_SYSNAME_SNI +# define OPENSSL_SYS_SNI +# endif +# ifdef OPENSSL_SYSNAME_ULTRASPARC +# define OPENSSL_SYS_ULTRASPARC +# endif +# ifdef OPENSSL_SYSNAME_NEWS4 +# define OPENSSL_SYS_NEWS4 +# endif +# ifdef OPENSSL_SYSNAME_MACOSX +# define OPENSSL_SYS_MACOSX +# endif +# ifdef OPENSSL_SYSNAME_MACOSX_RHAPSODY +# define OPENSSL_SYS_MACOSX_RHAPSODY +# define OPENSSL_SYS_MACOSX +# endif +# ifdef OPENSSL_SYSNAME_SUNOS +# define OPENSSL_SYS_SUNOS +#endif +# if defined(_CRAY) || defined(OPENSSL_SYSNAME_CRAY) +# define OPENSSL_SYS_CRAY +# endif +# if defined(_AIX) || defined(OPENSSL_SYSNAME_AIX) +# define OPENSSL_SYS_AIX +# endif +#endif + +/* --------------------------------- VOS ----------------------------------- */ +#if defined(__VOS__) || defined(OPENSSL_SYSNAME_VOS) +# define OPENSSL_SYS_VOS +#ifdef __HPPA__ +# define OPENSSL_SYS_VOS_HPPA +#endif +#ifdef __IA32__ +# define OPENSSL_SYS_VOS_IA32 +#endif +#endif + +/* ------------------------------- VxWorks --------------------------------- */ +#ifdef OPENSSL_SYSNAME_VXWORKS +# define OPENSSL_SYS_VXWORKS +#endif + +/* --------------------------------- BeOS ---------------------------------- */ +#if defined(__BEOS__) +# define OPENSSL_SYS_BEOS +# include +# if defined(BONE_VERSION) +# define OPENSSL_SYS_BEOS_BONE +# else +# define OPENSSL_SYS_BEOS_R5 +# endif +#endif + +/** + * That's it for OS-specific stuff + 
*****************************************************************************/ + + +/* Specials for I/O an exit */ +#ifdef OPENSSL_SYS_MSDOS +# define OPENSSL_UNISTD_IO +# define OPENSSL_DECLARE_EXIT extern void exit(int); +#else +# define OPENSSL_UNISTD_IO OPENSSL_UNISTD +# define OPENSSL_DECLARE_EXIT /* declared in unistd.h */ +#endif + +/* Definitions of OPENSSL_GLOBAL and OPENSSL_EXTERN, to define and declare + certain global symbols that, with some compilers under VMS, have to be + defined and declared explicitely with globaldef and globalref. + Definitions of OPENSSL_EXPORT and OPENSSL_IMPORT, to define and declare + DLL exports and imports for compilers under Win32. These are a little + more complicated to use. Basically, for any library that exports some + global variables, the following code must be present in the header file + that declares them, before OPENSSL_EXTERN is used: + + #ifdef SOME_BUILD_FLAG_MACRO + # undef OPENSSL_EXTERN + # define OPENSSL_EXTERN OPENSSL_EXPORT + #endif + + The default is to have OPENSSL_EXPORT, OPENSSL_IMPORT and OPENSSL_GLOBAL + have some generally sensible values, and for OPENSSL_EXTERN to have the + value OPENSSL_IMPORT. +*/ + +#if defined(OPENSSL_SYS_VMS_NODECC) +# define OPENSSL_EXPORT globalref +# define OPENSSL_IMPORT globalref +# define OPENSSL_GLOBAL globaldef +#elif defined(OPENSSL_SYS_WINDOWS) && defined(OPENSSL_OPT_WINDLL) +# define OPENSSL_EXPORT extern __declspec(dllexport) +# define OPENSSL_IMPORT extern __declspec(dllimport) +# define OPENSSL_GLOBAL +#else +# define OPENSSL_EXPORT extern +# define OPENSSL_IMPORT extern +# define OPENSSL_GLOBAL +#endif +#define OPENSSL_EXTERN OPENSSL_IMPORT + +/* Macros to allow global variables to be reached through function calls when + required (if a shared library version requires it, for example. + The way it's done allows definitions like this: + + // in foobar.c + OPENSSL_IMPLEMENT_GLOBAL(int,foobar,0) + // in foobar.h + OPENSSL_DECLARE_GLOBAL(int,foobar); + #define foobar OPENSSL_GLOBAL_REF(foobar) +*/ +#ifdef OPENSSL_EXPORT_VAR_AS_FUNCTION +# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) \ + type *_shadow_##name(void) \ + { static type _hide_##name=value; return &_hide_##name; } +# define OPENSSL_DECLARE_GLOBAL(type,name) type *_shadow_##name(void) +# define OPENSSL_GLOBAL_REF(name) (*(_shadow_##name())) +#else +# define OPENSSL_IMPLEMENT_GLOBAL(type,name,value) OPENSSL_GLOBAL type _shadow_##name=value; +# define OPENSSL_DECLARE_GLOBAL(type,name) OPENSSL_EXPORT type _shadow_##name +# define OPENSSL_GLOBAL_REF(name) _shadow_##name +#endif + +#if defined(OPENSSL_SYS_MACINTOSH_CLASSIC) && macintosh==1 && !defined(MAC_OS_GUSI_SOURCE) +# define ossl_ssize_t long +#endif + +#ifdef OPENSSL_SYS_MSDOS +# define ossl_ssize_t long +#endif + +#if defined(NeXT) || defined(OPENSSL_SYS_NEWS4) || defined(OPENSSL_SYS_SUNOS) +# define ssize_t int +#endif + +#if defined(__ultrix) && !defined(ssize_t) +# define ossl_ssize_t int +#endif + +#ifndef ossl_ssize_t +# define ossl_ssize_t ssize_t +#endif + +#ifdef __cplusplus +} +#endif +#endif diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/bn.h b/compat/curl-for-windows/openssl/openssl/include/openssl/bn.h new file mode 100644 index 0000000000..21a1a3fe35 --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/include/openssl/bn.h @@ -0,0 +1,902 @@ +/* crypto/bn/bn.h */ +/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) + * All rights reserved. 
+ * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ +/* ==================================================================== + * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* ==================================================================== + * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. + * + * Portions of the attached software ("Contribution") are developed by + * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project. + * + * The Contribution is licensed pursuant to the Eric Young open source + * license provided above. + * + * The binary polynomial arithmetic software is originally written by + * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems Laboratories. + * + */ + +#ifndef HEADER_BN_H +#define HEADER_BN_H + +#include +#ifndef OPENSSL_NO_FP_API +#include /* FILE */ +#endif +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* These preprocessor symbols control various aspects of the bignum headers and + * library code. They're not defined by any "normal" configuration, as they are + * intended for development and testing purposes. NB: defining all three can be + * useful for debugging application code as well as openssl itself. + * + * BN_DEBUG - turn on various debugging alterations to the bignum code + * BN_DEBUG_RAND - uses random poisoning of unused words to trip up + * mismanagement of bignum internals. You must also define BN_DEBUG. 
+ */ +/* #define BN_DEBUG */ +/* #define BN_DEBUG_RAND */ + +#ifndef OPENSSL_SMALL_FOOTPRINT +#define BN_MUL_COMBA +#define BN_SQR_COMBA +#define BN_RECURSION +#endif + +/* This next option uses the C libraries (2 word)/(1 word) function. + * If it is not defined, I use my C version (which is slower). + * The reason for this flag is that when the particular C compiler + * library routine is used, and the library is linked with a different + * compiler, the library is missing. This mostly happens when the + * library is built with gcc and then linked using normal cc. This would + * be a common occurrence because gcc normally produces code that is + * 2 times faster than system compilers for the big number stuff. + * For machines with only one compiler (or shared libraries), this should + * be on. Again this in only really a problem on machines + * using "long long's", are 32bit, and are not using my assembler code. */ +#if defined(OPENSSL_SYS_MSDOS) || defined(OPENSSL_SYS_WINDOWS) || \ + defined(OPENSSL_SYS_WIN32) || defined(linux) +# ifndef BN_DIV2W +# define BN_DIV2W +# endif +#endif + +/* assuming long is 64bit - this is the DEC Alpha + * unsigned long long is only 64 bits :-(, don't define + * BN_LLONG for the DEC Alpha */ +#ifdef SIXTY_FOUR_BIT_LONG +#define BN_ULLONG unsigned long long +#define BN_ULONG unsigned long +#define BN_LONG long +#define BN_BITS 128 +#define BN_BYTES 8 +#define BN_BITS2 64 +#define BN_BITS4 32 +#define BN_MASK (0xffffffffffffffffffffffffffffffffLL) +#define BN_MASK2 (0xffffffffffffffffL) +#define BN_MASK2l (0xffffffffL) +#define BN_MASK2h (0xffffffff00000000L) +#define BN_MASK2h1 (0xffffffff80000000L) +#define BN_TBIT (0x8000000000000000L) +#define BN_DEC_CONV (10000000000000000000UL) +#define BN_DEC_FMT1 "%lu" +#define BN_DEC_FMT2 "%019lu" +#define BN_DEC_NUM 19 +#define BN_HEX_FMT1 "%lX" +#define BN_HEX_FMT2 "%016lX" +#endif + +/* This is where the long long data type is 64 bits, but long is 32. + * For machines where there are 64bit registers, this is the mode to use. + * IRIX, on R4000 and above should use this mode, along with the relevant + * assembler code :-). Do NOT define BN_LLONG. + */ +#ifdef SIXTY_FOUR_BIT +#undef BN_LLONG +#undef BN_ULLONG +#define BN_ULONG unsigned long long +#define BN_LONG long long +#define BN_BITS 128 +#define BN_BYTES 8 +#define BN_BITS2 64 +#define BN_BITS4 32 +#define BN_MASK2 (0xffffffffffffffffLL) +#define BN_MASK2l (0xffffffffL) +#define BN_MASK2h (0xffffffff00000000LL) +#define BN_MASK2h1 (0xffffffff80000000LL) +#define BN_TBIT (0x8000000000000000LL) +#define BN_DEC_CONV (10000000000000000000ULL) +#define BN_DEC_FMT1 "%llu" +#define BN_DEC_FMT2 "%019llu" +#define BN_DEC_NUM 19 +#define BN_HEX_FMT1 "%llX" +#define BN_HEX_FMT2 "%016llX" +#endif + +#ifdef THIRTY_TWO_BIT +#ifdef BN_LLONG +# if defined(_WIN32) && !defined(__GNUC__) +# define BN_ULLONG unsigned __int64 +# define BN_MASK (0xffffffffffffffffI64) +# else +# define BN_ULLONG unsigned long long +# define BN_MASK (0xffffffffffffffffLL) +# endif +#endif +#define BN_ULONG unsigned int +#define BN_LONG int +#define BN_BITS 64 +#define BN_BYTES 4 +#define BN_BITS2 32 +#define BN_BITS4 16 +#define BN_MASK2 (0xffffffffL) +#define BN_MASK2l (0xffff) +#define BN_MASK2h1 (0xffff8000L) +#define BN_MASK2h (0xffff0000L) +#define BN_TBIT (0x80000000L) +#define BN_DEC_CONV (1000000000L) +#define BN_DEC_FMT1 "%u" +#define BN_DEC_FMT2 "%09u" +#define BN_DEC_NUM 9 +#define BN_HEX_FMT1 "%X" +#define BN_HEX_FMT2 "%08X" +#endif + +/* 2011-02-22 SMS. 
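/* Editor's sketch (not part of bn.h): whichever word-size branch above is
 * selected, the derived constants agree on a platform that meets that
 * branch's stated assumption -- BN_BITS2 is the bit width of one BN_ULONG
 * limb, BN_BYTES its size in bytes and BN_BITS twice BN_BITS2.  A C89-style
 * compile-time check (negative array size on mismatch); the typedef names
 * are hypothetical: */
#include <openssl/bn.h>

typedef char bn_ulong_size_matches_bn_bytes[sizeof(BN_ULONG) == BN_BYTES ? 1 : -1];
typedef char bn_bits2_is_eight_times_bytes [BN_BITS2 == 8 * BN_BYTES     ? 1 : -1];
typedef char bn_bits_is_twice_bits2        [BN_BITS  == 2 * BN_BITS2     ? 1 : -1];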
+ * In various places, a size_t variable or a type cast to size_t was + * used to perform integer-only operations on pointers. This failed on + * VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t is + * still only 32 bits. What's needed in these cases is an integer type + * with the same size as a pointer, which size_t is not certain to be. + * The only fix here is VMS-specific. + */ +#if defined(OPENSSL_SYS_VMS) +# if __INITIAL_POINTER_SIZE == 64 +# define PTR_SIZE_INT long long +# else /* __INITIAL_POINTER_SIZE == 64 */ +# define PTR_SIZE_INT int +# endif /* __INITIAL_POINTER_SIZE == 64 [else] */ +#else /* defined(OPENSSL_SYS_VMS) */ +# define PTR_SIZE_INT size_t +#endif /* defined(OPENSSL_SYS_VMS) [else] */ + +#define BN_DEFAULT_BITS 1280 + +#define BN_FLG_MALLOCED 0x01 +#define BN_FLG_STATIC_DATA 0x02 +#define BN_FLG_CONSTTIME 0x04 /* avoid leaking exponent information through timing, + * BN_mod_exp_mont() will call BN_mod_exp_mont_consttime, + * BN_div() will call BN_div_no_branch, + * BN_mod_inverse() will call BN_mod_inverse_no_branch. + */ + +#ifndef OPENSSL_NO_DEPRECATED +#define BN_FLG_EXP_CONSTTIME BN_FLG_CONSTTIME /* deprecated name for the flag */ + /* avoid leaking exponent information through timings + * (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime) */ +#endif + +#ifndef OPENSSL_NO_DEPRECATED +#define BN_FLG_FREE 0x8000 /* used for debuging */ +#endif +#define BN_set_flags(b,n) ((b)->flags|=(n)) +#define BN_get_flags(b,n) ((b)->flags&(n)) + +/* get a clone of a BIGNUM with changed flags, for *temporary* use only + * (the two BIGNUMs cannot not be used in parallel!) */ +#define BN_with_flags(dest,b,n) ((dest)->d=(b)->d, \ + (dest)->top=(b)->top, \ + (dest)->dmax=(b)->dmax, \ + (dest)->neg=(b)->neg, \ + (dest)->flags=(((dest)->flags & BN_FLG_MALLOCED) \ + | ((b)->flags & ~BN_FLG_MALLOCED) \ + | BN_FLG_STATIC_DATA \ + | (n))) + +/* Already declared in ossl_typ.h */ +#if 0 +typedef struct bignum_st BIGNUM; +/* Used for temp variables (declaration hidden in bn_lcl.h) */ +typedef struct bignum_ctx BN_CTX; +typedef struct bn_blinding_st BN_BLINDING; +typedef struct bn_mont_ctx_st BN_MONT_CTX; +typedef struct bn_recp_ctx_st BN_RECP_CTX; +typedef struct bn_gencb_st BN_GENCB; +#endif + +struct bignum_st + { + BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit chunks. */ + int top; /* Index of last used d +1. */ + /* The next are internal book keeping for bn_expand. */ + int dmax; /* Size of the d array. */ + int neg; /* one if the number is negative */ + int flags; + }; + +/* Used for montgomery multiplication */ +struct bn_mont_ctx_st + { + int ri; /* number of bits in R */ + BIGNUM RR; /* used to convert to montgomery form */ + BIGNUM N; /* The modulus */ + BIGNUM Ni; /* R*(1/R mod N) - N*Ni = 1 + * (Ni is only stored for bignum algorithm) */ + BN_ULONG n0[2];/* least significant word(s) of Ni; + (type changed with 0.9.9, was "BN_ULONG n0;" before) */ + int flags; + }; + +/* Used for reciprocal division/mod functions + * It cannot be shared between threads + */ +struct bn_recp_ctx_st + { + BIGNUM N; /* the divisor */ + BIGNUM Nr; /* the reciprocal */ + int num_bits; + int shift; + int flags; + }; + +/* Used for slow "generation" functions. 
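/* Editor's sketch (not part of bn.h): callers mark a secret exponent with
 * BN_FLG_CONSTTIME (see the flag comment above) so that BN_mod_exp takes the
 * constant-time path and does not leak the exponent through timing.  Error
 * handling is minimal and the function name is hypothetical. */
#include <openssl/bn.h>

static int mod_exp_secret(BIGNUM *r, const BIGNUM *base,
                          BIGNUM *secret_exp, const BIGNUM *mod)
{
    BN_CTX *ctx = BN_CTX_new();
    int ok;

    if (ctx == NULL)
        return 0;
    BN_set_flags(secret_exp, BN_FLG_CONSTTIME);   /* request the timing-safe code path */
    ok = BN_mod_exp(r, base, secret_exp, mod, ctx);
    BN_CTX_free(ctx);
    return ok;
}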
*/ +struct bn_gencb_st + { + unsigned int ver; /* To handle binary (in)compatibility */ + void *arg; /* callback-specific data */ + union + { + /* if(ver==1) - handles old style callbacks */ + void (*cb_1)(int, int, void *); + /* if(ver==2) - new callback style */ + int (*cb_2)(int, int, BN_GENCB *); + } cb; + }; +/* Wrapper function to make using BN_GENCB easier, */ +int BN_GENCB_call(BN_GENCB *cb, int a, int b); +/* Macro to populate a BN_GENCB structure with an "old"-style callback */ +#define BN_GENCB_set_old(gencb, callback, cb_arg) { \ + BN_GENCB *tmp_gencb = (gencb); \ + tmp_gencb->ver = 1; \ + tmp_gencb->arg = (cb_arg); \ + tmp_gencb->cb.cb_1 = (callback); } +/* Macro to populate a BN_GENCB structure with a "new"-style callback */ +#define BN_GENCB_set(gencb, callback, cb_arg) { \ + BN_GENCB *tmp_gencb = (gencb); \ + tmp_gencb->ver = 2; \ + tmp_gencb->arg = (cb_arg); \ + tmp_gencb->cb.cb_2 = (callback); } + +#define BN_prime_checks 0 /* default: select number of iterations + based on the size of the number */ + +/* number of Miller-Rabin iterations for an error rate of less than 2^-80 + * for random 'b'-bit input, b >= 100 (taken from table 4.4 in the Handbook + * of Applied Cryptography [Menezes, van Oorschot, Vanstone; CRC Press 1996]; + * original paper: Damgaard, Landrock, Pomerance: Average case error estimates + * for the strong probable prime test. -- Math. Comp. 61 (1993) 177-194) */ +#define BN_prime_checks_for_size(b) ((b) >= 1300 ? 2 : \ + (b) >= 850 ? 3 : \ + (b) >= 650 ? 4 : \ + (b) >= 550 ? 5 : \ + (b) >= 450 ? 6 : \ + (b) >= 400 ? 7 : \ + (b) >= 350 ? 8 : \ + (b) >= 300 ? 9 : \ + (b) >= 250 ? 12 : \ + (b) >= 200 ? 15 : \ + (b) >= 150 ? 18 : \ + /* b >= 100 */ 27) + +#define BN_num_bytes(a) ((BN_num_bits(a)+7)/8) + +/* Note that BN_abs_is_word didn't work reliably for w == 0 until 0.9.8 */ +#define BN_abs_is_word(a,w) ((((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w))) || \ + (((w) == 0) && ((a)->top == 0))) +#define BN_is_zero(a) ((a)->top == 0) +#define BN_is_one(a) (BN_abs_is_word((a),1) && !(a)->neg) +#define BN_is_word(a,w) (BN_abs_is_word((a),(w)) && (!(w) || !(a)->neg)) +#define BN_is_odd(a) (((a)->top > 0) && ((a)->d[0] & 1)) + +#define BN_one(a) (BN_set_word((a),1)) +#define BN_zero_ex(a) \ + do { \ + BIGNUM *_tmp_bn = (a); \ + _tmp_bn->top = 0; \ + _tmp_bn->neg = 0; \ + } while(0) +#ifdef OPENSSL_NO_DEPRECATED +#define BN_zero(a) BN_zero_ex(a) +#else +#define BN_zero(a) (BN_set_word((a),0)) +#endif + +const BIGNUM *BN_value_one(void); +char * BN_options(void); +BN_CTX *BN_CTX_new(void); +#ifndef OPENSSL_NO_DEPRECATED +void BN_CTX_init(BN_CTX *c); +#endif +void BN_CTX_free(BN_CTX *c); +void BN_CTX_start(BN_CTX *ctx); +BIGNUM *BN_CTX_get(BN_CTX *ctx); +void BN_CTX_end(BN_CTX *ctx); +int BN_rand(BIGNUM *rnd, int bits, int top,int bottom); +int BN_pseudo_rand(BIGNUM *rnd, int bits, int top,int bottom); +int BN_rand_range(BIGNUM *rnd, const BIGNUM *range); +int BN_pseudo_rand_range(BIGNUM *rnd, const BIGNUM *range); +int BN_num_bits(const BIGNUM *a); +int BN_num_bits_word(BN_ULONG); +BIGNUM *BN_new(void); +void BN_init(BIGNUM *); +void BN_clear_free(BIGNUM *a); +BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b); +void BN_swap(BIGNUM *a, BIGNUM *b); +BIGNUM *BN_bin2bn(const unsigned char *s,int len,BIGNUM *ret); +int BN_bn2bin(const BIGNUM *a, unsigned char *to); +BIGNUM *BN_mpi2bn(const unsigned char *s,int len,BIGNUM *ret); +int BN_bn2mpi(const BIGNUM *a, unsigned char *to); +int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); +int BN_usub(BIGNUM *r, const BIGNUM 
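/* Editor's sketch (not part of bn.h): a "new"-style progress callback
 * installed with BN_GENCB_set and used with BN_generate_prime_ex and
 * BN_is_prime_ex (both declared further down in this header).  Passing
 * BN_prime_checks lets the library pick the Miller-Rabin iteration count
 * from the table above.  Names are hypothetical. */
#include <stdio.h>
#include <openssl/bn.h>

static int progress_cb(int stage, int round, BN_GENCB *cb)
{
    (void)cb;
    fprintf(stderr, "prime search: stage=%d round=%d\n", stage, round);
    return 1;                            /* non-zero: keep searching */
}

static BIGNUM *make_prime(int bits)
{
    BIGNUM *p = BN_new();
    BN_CTX *ctx = BN_CTX_new();
    BN_GENCB cb;

    if (p == NULL || ctx == NULL)
        goto err;
    BN_GENCB_set(&cb, progress_cb, NULL);
    if (!BN_generate_prime_ex(p, bits, 0 /* not a "safe" prime */, NULL, NULL, &cb))
        goto err;
    if (BN_is_prime_ex(p, BN_prime_checks, ctx, NULL) != 1)
        goto err;
    BN_CTX_free(ctx);
    return p;
err:
    BN_CTX_free(ctx);
    BN_free(p);
    return NULL;
}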
*a, const BIGNUM *b); +int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); +int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); +int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx); +int BN_sqr(BIGNUM *r, const BIGNUM *a,BN_CTX *ctx); +/** BN_set_negative sets sign of a BIGNUM + * \param b pointer to the BIGNUM object + * \param n 0 if the BIGNUM b should be positive and a value != 0 otherwise + */ +void BN_set_negative(BIGNUM *b, int n); +/** BN_is_negative returns 1 if the BIGNUM is negative + * \param a pointer to the BIGNUM object + * \return 1 if a < 0 and 0 otherwise + */ +#define BN_is_negative(a) ((a)->neg != 0) + +int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, + BN_CTX *ctx); +#define BN_mod(rem,m,d,ctx) BN_div(NULL,(rem),(m),(d),(ctx)) +int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx); +int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx); +int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m); +int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx); +int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m); +int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const BIGNUM *m, BN_CTX *ctx); +int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx); +int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx); +int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m); +int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m, BN_CTX *ctx); +int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m); + +BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w); +BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w); +int BN_mul_word(BIGNUM *a, BN_ULONG w); +int BN_add_word(BIGNUM *a, BN_ULONG w); +int BN_sub_word(BIGNUM *a, BN_ULONG w); +int BN_set_word(BIGNUM *a, BN_ULONG w); +BN_ULONG BN_get_word(const BIGNUM *a); + +int BN_cmp(const BIGNUM *a, const BIGNUM *b); +void BN_free(BIGNUM *a); +int BN_is_bit_set(const BIGNUM *a, int n); +int BN_lshift(BIGNUM *r, const BIGNUM *a, int n); +int BN_lshift1(BIGNUM *r, const BIGNUM *a); +int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,BN_CTX *ctx); + +int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, + const BIGNUM *m,BN_CTX *ctx); +int BN_mod_exp_mont(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, + const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx); +int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, + const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont); +int BN_mod_exp_mont_word(BIGNUM *r, BN_ULONG a, const BIGNUM *p, + const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx); +int BN_mod_exp2_mont(BIGNUM *r, const BIGNUM *a1, const BIGNUM *p1, + const BIGNUM *a2, const BIGNUM *p2,const BIGNUM *m, + BN_CTX *ctx,BN_MONT_CTX *m_ctx); +int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, + const BIGNUM *m,BN_CTX *ctx); + +int BN_mask_bits(BIGNUM *a,int n); +#ifndef OPENSSL_NO_FP_API +int BN_print_fp(FILE *fp, const BIGNUM *a); +#endif +#ifdef HEADER_BIO_H +int BN_print(BIO *fp, const BIGNUM *a); +#else +int BN_print(void *fp, const BIGNUM *a); +#endif +int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx); +int BN_rshift(BIGNUM *r, const BIGNUM *a, int n); +int BN_rshift1(BIGNUM *r, const BIGNUM *a); +void BN_clear(BIGNUM *a); +BIGNUM *BN_dup(const BIGNUM *a); +int BN_ucmp(const BIGNUM *a, const BIGNUM *b); +int 
BN_set_bit(BIGNUM *a, int n); +int BN_clear_bit(BIGNUM *a, int n); +char * BN_bn2hex(const BIGNUM *a); +char * BN_bn2dec(const BIGNUM *a); +int BN_hex2bn(BIGNUM **a, const char *str); +int BN_dec2bn(BIGNUM **a, const char *str); +int BN_asc2bn(BIGNUM **a, const char *str); +int BN_gcd(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx); +int BN_kronecker(const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx); /* returns -2 for error */ +BIGNUM *BN_mod_inverse(BIGNUM *ret, + const BIGNUM *a, const BIGNUM *n,BN_CTX *ctx); +BIGNUM *BN_mod_sqrt(BIGNUM *ret, + const BIGNUM *a, const BIGNUM *n,BN_CTX *ctx); + +void BN_consttime_swap(BN_ULONG swap, BIGNUM *a, BIGNUM *b, int nwords); + +/* Deprecated versions */ +#ifndef OPENSSL_NO_DEPRECATED +BIGNUM *BN_generate_prime(BIGNUM *ret,int bits,int safe, + const BIGNUM *add, const BIGNUM *rem, + void (*callback)(int,int,void *),void *cb_arg); +int BN_is_prime(const BIGNUM *p,int nchecks, + void (*callback)(int,int,void *), + BN_CTX *ctx,void *cb_arg); +int BN_is_prime_fasttest(const BIGNUM *p,int nchecks, + void (*callback)(int,int,void *),BN_CTX *ctx,void *cb_arg, + int do_trial_division); +#endif /* !defined(OPENSSL_NO_DEPRECATED) */ + +/* Newer versions */ +int BN_generate_prime_ex(BIGNUM *ret,int bits,int safe, const BIGNUM *add, + const BIGNUM *rem, BN_GENCB *cb); +int BN_is_prime_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx, BN_GENCB *cb); +int BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx, + int do_trial_division, BN_GENCB *cb); + +int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx); + +int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2, + const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2, + const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb); +int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2, + BIGNUM *Xp1, BIGNUM *Xp2, + const BIGNUM *Xp, + const BIGNUM *e, BN_CTX *ctx, + BN_GENCB *cb); + +BN_MONT_CTX *BN_MONT_CTX_new(void ); +void BN_MONT_CTX_init(BN_MONT_CTX *ctx); +int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b, + BN_MONT_CTX *mont, BN_CTX *ctx); +#define BN_to_montgomery(r,a,mont,ctx) BN_mod_mul_montgomery(\ + (r),(a),&((mont)->RR),(mont),(ctx)) +int BN_from_montgomery(BIGNUM *r,const BIGNUM *a, + BN_MONT_CTX *mont, BN_CTX *ctx); +void BN_MONT_CTX_free(BN_MONT_CTX *mont); +int BN_MONT_CTX_set(BN_MONT_CTX *mont,const BIGNUM *mod,BN_CTX *ctx); +BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to,BN_MONT_CTX *from); +BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock, + const BIGNUM *mod, BN_CTX *ctx); + +/* BN_BLINDING flags */ +#define BN_BLINDING_NO_UPDATE 0x00000001 +#define BN_BLINDING_NO_RECREATE 0x00000002 + +BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod); +void BN_BLINDING_free(BN_BLINDING *b); +int BN_BLINDING_update(BN_BLINDING *b,BN_CTX *ctx); +int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx); +int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx); +int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *); +int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *); +#ifndef OPENSSL_NO_DEPRECATED +unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *); +void BN_BLINDING_set_thread_id(BN_BLINDING *, unsigned long); +#endif +CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *); +unsigned long BN_BLINDING_get_flags(const BN_BLINDING *); +void BN_BLINDING_set_flags(BN_BLINDING *, unsigned long); +BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b, + 
const BIGNUM *e, BIGNUM *m, BN_CTX *ctx, + int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, + const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx), + BN_MONT_CTX *m_ctx); + +#ifndef OPENSSL_NO_DEPRECATED +void BN_set_params(int mul,int high,int low,int mont); +int BN_get_params(int which); /* 0, mul, 1 high, 2 low, 3 mont */ +#endif + +void BN_RECP_CTX_init(BN_RECP_CTX *recp); +BN_RECP_CTX *BN_RECP_CTX_new(void); +void BN_RECP_CTX_free(BN_RECP_CTX *recp); +int BN_RECP_CTX_set(BN_RECP_CTX *recp,const BIGNUM *rdiv,BN_CTX *ctx); +int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, + BN_RECP_CTX *recp,BN_CTX *ctx); +int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, + const BIGNUM *m, BN_CTX *ctx); +int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, + BN_RECP_CTX *recp, BN_CTX *ctx); + +#ifndef OPENSSL_NO_EC2M + +/* Functions for arithmetic over binary polynomials represented by BIGNUMs. + * + * The BIGNUM::neg property of BIGNUMs representing binary polynomials is + * ignored. + * + * Note that input arguments are not const so that their bit arrays can + * be expanded to the appropriate size if needed. + */ + +int BN_GF2m_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b); /*r = a + b*/ +#define BN_GF2m_sub(r, a, b) BN_GF2m_add(r, a, b) +int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p); /*r=a mod p*/ +int BN_GF2m_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const BIGNUM *p, BN_CTX *ctx); /* r = (a * b) mod p */ +int BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, + BN_CTX *ctx); /* r = (a * a) mod p */ +int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *b, const BIGNUM *p, + BN_CTX *ctx); /* r = (1 / b) mod p */ +int BN_GF2m_mod_div(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const BIGNUM *p, BN_CTX *ctx); /* r = (a / b) mod p */ +int BN_GF2m_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const BIGNUM *p, BN_CTX *ctx); /* r = (a ^ b) mod p */ +int BN_GF2m_mod_sqrt(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, + BN_CTX *ctx); /* r = sqrt(a) mod p */ +int BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, + BN_CTX *ctx); /* r^2 + r = a mod p */ +#define BN_GF2m_cmp(a, b) BN_ucmp((a), (b)) +/* Some functions allow for representation of the irreducible polynomials + * as an unsigned int[], say p. The irreducible f(t) is then of the form: + * t^p[0] + t^p[1] + ... + t^p[k] + * where m = p[0] > p[1] > ... > p[k] = 0. 
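/*
 * Illustrative sketch (not part of this header): arithmetic in GF(2^m) with
 * the BN_GF2m_* routines declared above.  The reduction polynomial used here,
 * f(t) = t^163 + t^7 + t^6 + t^3 + 1, is the SECT163 choice; the hex operand
 * values are arbitrary and error checking is trimmed for brevity.
 */
#include <openssl/bn.h>

int gf2m_mul_demo(void)
{
	BN_CTX *ctx = BN_CTX_new();
	BIGNUM *f = BN_new(), *r = BN_new();
	BIGNUM *a = NULL, *b = NULL;

	/* build f(t) = t^163 + t^7 + t^6 + t^3 + 1 bit by bit */
	BN_set_bit(f, 163);
	BN_set_bit(f, 7);
	BN_set_bit(f, 6);
	BN_set_bit(f, 3);
	BN_set_bit(f, 0);

	BN_hex2bn(&a, "0123456789abcdef");
	BN_hex2bn(&b, "0fedcba987654321");

	BN_GF2m_mod_mul(r, a, b, f, ctx);   /* r = (a * b) mod f(t) */

	BN_free(a); BN_free(b); BN_free(r); BN_free(f);
	BN_CTX_free(ctx);
	return 1;
}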
+ */ +int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]); + /* r = a mod p */ +int BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const int p[], BN_CTX *ctx); /* r = (a * b) mod p */ +int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[], + BN_CTX *ctx); /* r = (a * a) mod p */ +int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *b, const int p[], + BN_CTX *ctx); /* r = (1 / b) mod p */ +int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const int p[], BN_CTX *ctx); /* r = (a / b) mod p */ +int BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const int p[], BN_CTX *ctx); /* r = (a ^ b) mod p */ +int BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a, + const int p[], BN_CTX *ctx); /* r = sqrt(a) mod p */ +int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a, + const int p[], BN_CTX *ctx); /* r^2 + r = a mod p */ +int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max); +int BN_GF2m_arr2poly(const int p[], BIGNUM *a); + +#endif + +/* faster mod functions for the 'NIST primes' + * 0 <= a < p^2 */ +int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx); +int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx); +int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx); +int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx); +int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx); + +const BIGNUM *BN_get0_nist_prime_192(void); +const BIGNUM *BN_get0_nist_prime_224(void); +const BIGNUM *BN_get0_nist_prime_256(void); +const BIGNUM *BN_get0_nist_prime_384(void); +const BIGNUM *BN_get0_nist_prime_521(void); + +/* library internal functions */ + +#define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\ + (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2)) +#define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words))) +BIGNUM *bn_expand2(BIGNUM *a, int words); +#ifndef OPENSSL_NO_DEPRECATED +BIGNUM *bn_dup_expand(const BIGNUM *a, int words); /* unused */ +#endif + +/* Bignum consistency macros + * There is one "API" macro, bn_fix_top(), for stripping leading zeroes from + * bignum data after direct manipulations on the data. There is also an + * "internal" macro, bn_check_top(), for verifying that there are no leading + * zeroes. Unfortunately, some auditing is required due to the fact that + * bn_fix_top() has become an overabused duct-tape because bignum data is + * occasionally passed around in an inconsistent state. So the following + * changes have been made to sort this out; + * - bn_fix_top()s implementation has been moved to bn_correct_top() + * - if BN_DEBUG isn't defined, bn_fix_top() maps to bn_correct_top(), and + * bn_check_top() is as before. + * - if BN_DEBUG *is* defined; + * - bn_check_top() tries to pollute unused words even if the bignum 'top' is + * consistent. (ed: only if BN_DEBUG_RAND is defined) + * - bn_fix_top() maps to bn_check_top() rather than "fixing" anything. + * The idea is to have debug builds flag up inconsistent bignums when they + * occur. If that occurs in a bn_fix_top(), we examine the code in question; if + * the use of bn_fix_top() was appropriate (ie. it follows directly after code + * that manipulates the bignum) it is converted to bn_correct_top(), and if it + * was not appropriate, we convert it permanently to bn_check_top() and track + * down the cause of the bug. 
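/*
 * Illustrative sketch (not part of this header): typical use of the
 * modular-arithmetic routines declared earlier in this header to compute
 * r = a^p mod m.  The decimal constants are arbitrary and most error
 * checking is trimmed for brevity; OPENSSL_free() comes from
 * <openssl/crypto.h>.
 */
#include <stdio.h>
#include <openssl/bn.h>
#include <openssl/crypto.h>

int mod_exp_demo(void)
{
	BN_CTX *ctx = BN_CTX_new();
	BIGNUM *r = BN_new();
	BIGNUM *a = NULL, *p = NULL, *m = NULL;
	int ok = 0;

	/* BN_dec2bn() allocates the BIGNUM when the target pointer is NULL */
	if (!BN_dec2bn(&a, "12345") ||
	    !BN_dec2bn(&p, "65537") ||
	    !BN_dec2bn(&m, "99991"))
		goto done;

	if (!BN_mod_exp(r, a, p, m, ctx))   /* r = a^p mod m */
		goto done;

	{
		char *dec = BN_bn2dec(r);   /* caller must OPENSSL_free() this */
		if (dec != NULL) {
			printf("a^p mod m = %s\n", dec);
			OPENSSL_free(dec);
		}
	}
	ok = 1;
done:
	BN_free(a); BN_free(p); BN_free(m); BN_free(r);
	BN_CTX_free(ctx);
	return ok;
}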
Eventually, no internal code should be using the + * bn_fix_top() macro. External applications and libraries should try this with + * their own code too, both in terms of building against the openssl headers + * with BN_DEBUG defined *and* linking with a version of OpenSSL built with it + * defined. This not only improves external code, it provides more test + * coverage for openssl's own code. + */ + +#ifdef BN_DEBUG + +/* We only need assert() when debugging */ +#include + +#ifdef BN_DEBUG_RAND +/* To avoid "make update" cvs wars due to BN_DEBUG, use some tricks */ +#ifndef RAND_pseudo_bytes +int RAND_pseudo_bytes(unsigned char *buf,int num); +#define BN_DEBUG_TRIX +#endif +#define bn_pollute(a) \ + do { \ + const BIGNUM *_bnum1 = (a); \ + if(_bnum1->top < _bnum1->dmax) { \ + unsigned char _tmp_char; \ + /* We cast away const without the compiler knowing, any \ + * *genuinely* constant variables that aren't mutable \ + * wouldn't be constructed with top!=dmax. */ \ + BN_ULONG *_not_const; \ + memcpy(&_not_const, &_bnum1->d, sizeof(BN_ULONG*)); \ + RAND_pseudo_bytes(&_tmp_char, 1); \ + memset((unsigned char *)(_not_const + _bnum1->top), _tmp_char, \ + (_bnum1->dmax - _bnum1->top) * sizeof(BN_ULONG)); \ + } \ + } while(0) +#ifdef BN_DEBUG_TRIX +#undef RAND_pseudo_bytes +#endif +#else +#define bn_pollute(a) +#endif +#define bn_check_top(a) \ + do { \ + const BIGNUM *_bnum2 = (a); \ + if (_bnum2 != NULL) { \ + assert((_bnum2->top == 0) || \ + (_bnum2->d[_bnum2->top - 1] != 0)); \ + bn_pollute(_bnum2); \ + } \ + } while(0) + +#define bn_fix_top(a) bn_check_top(a) + +#define bn_check_size(bn, bits) bn_wcheck_size(bn, ((bits+BN_BITS2-1))/BN_BITS2) +#define bn_wcheck_size(bn, words) \ + do { \ + const BIGNUM *_bnum2 = (bn); \ + assert(words <= (_bnum2)->dmax && words >= (_bnum2)->top); \ + } while(0) + +#else /* !BN_DEBUG */ + +#define bn_pollute(a) +#define bn_check_top(a) +#define bn_fix_top(a) bn_correct_top(a) +#define bn_check_size(bn, bits) +#define bn_wcheck_size(bn, words) + +#endif + +#define bn_correct_top(a) \ + { \ + BN_ULONG *ftl; \ + int tmp_top = (a)->top; \ + if (tmp_top > 0) \ + { \ + for (ftl= &((a)->d[tmp_top-1]); tmp_top > 0; tmp_top--) \ + if (*(ftl--)) break; \ + (a)->top = tmp_top; \ + } \ + bn_pollute(a); \ + } + +BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w); +BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w); +void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num); +BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d); +BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num); +BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num); + +/* Primes from RFC 2409 */ +BIGNUM *get_rfc2409_prime_768(BIGNUM *bn); +BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn); + +/* Primes from RFC 3526 */ +BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn); +BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn); +BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn); +BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn); +BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn); +BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn); + +int BN_bntest_rand(BIGNUM *rnd, int bits, int top,int bottom); + +/* BEGIN ERROR CODES */ +/* The following lines are auto generated by the script mkerr.pl. Any changes + * made after this point may be overwritten when the script is next run. + */ +void ERR_load_BN_strings(void); + +/* Error codes for the BN functions. */ + +/* Function codes. 
*/ +#define BN_F_BNRAND 127 +#define BN_F_BN_BLINDING_CONVERT_EX 100 +#define BN_F_BN_BLINDING_CREATE_PARAM 128 +#define BN_F_BN_BLINDING_INVERT_EX 101 +#define BN_F_BN_BLINDING_NEW 102 +#define BN_F_BN_BLINDING_UPDATE 103 +#define BN_F_BN_BN2DEC 104 +#define BN_F_BN_BN2HEX 105 +#define BN_F_BN_CTX_GET 116 +#define BN_F_BN_CTX_NEW 106 +#define BN_F_BN_CTX_START 129 +#define BN_F_BN_DIV 107 +#define BN_F_BN_DIV_NO_BRANCH 138 +#define BN_F_BN_DIV_RECP 130 +#define BN_F_BN_EXP 123 +#define BN_F_BN_EXPAND2 108 +#define BN_F_BN_EXPAND_INTERNAL 120 +#define BN_F_BN_GF2M_MOD 131 +#define BN_F_BN_GF2M_MOD_EXP 132 +#define BN_F_BN_GF2M_MOD_MUL 133 +#define BN_F_BN_GF2M_MOD_SOLVE_QUAD 134 +#define BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR 135 +#define BN_F_BN_GF2M_MOD_SQR 136 +#define BN_F_BN_GF2M_MOD_SQRT 137 +#define BN_F_BN_MOD_EXP2_MONT 118 +#define BN_F_BN_MOD_EXP_MONT 109 +#define BN_F_BN_MOD_EXP_MONT_CONSTTIME 124 +#define BN_F_BN_MOD_EXP_MONT_WORD 117 +#define BN_F_BN_MOD_EXP_RECP 125 +#define BN_F_BN_MOD_EXP_SIMPLE 126 +#define BN_F_BN_MOD_INVERSE 110 +#define BN_F_BN_MOD_INVERSE_NO_BRANCH 139 +#define BN_F_BN_MOD_LSHIFT_QUICK 119 +#define BN_F_BN_MOD_MUL_RECIPROCAL 111 +#define BN_F_BN_MOD_SQRT 121 +#define BN_F_BN_MPI2BN 112 +#define BN_F_BN_NEW 113 +#define BN_F_BN_RAND 114 +#define BN_F_BN_RAND_RANGE 122 +#define BN_F_BN_USUB 115 + +/* Reason codes. */ +#define BN_R_ARG2_LT_ARG3 100 +#define BN_R_BAD_RECIPROCAL 101 +#define BN_R_BIGNUM_TOO_LONG 114 +#define BN_R_CALLED_WITH_EVEN_MODULUS 102 +#define BN_R_DIV_BY_ZERO 103 +#define BN_R_ENCODING_ERROR 104 +#define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA 105 +#define BN_R_INPUT_NOT_REDUCED 110 +#define BN_R_INVALID_LENGTH 106 +#define BN_R_INVALID_RANGE 115 +#define BN_R_NOT_A_SQUARE 111 +#define BN_R_NOT_INITIALIZED 107 +#define BN_R_NO_INVERSE 108 +#define BN_R_NO_SOLUTION 116 +#define BN_R_P_IS_NOT_PRIME 112 +#define BN_R_TOO_MANY_ITERATIONS 113 +#define BN_R_TOO_MANY_TEMPORARY_VARIABLES 109 + +#ifdef __cplusplus +} +#endif +#endif diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/crypto.h b/compat/curl-for-windows/openssl/openssl/include/openssl/crypto.h new file mode 100644 index 0000000000..f92fc5182d --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/include/openssl/crypto.h @@ -0,0 +1,611 @@ +/* crypto/crypto.h */ +/* ==================================================================== + * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. 
Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. 
If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ +/* ==================================================================== + * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. + * ECDH support in OpenSSL originally developed by + * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project. + */ + +#ifndef HEADER_CRYPTO_H +#define HEADER_CRYPTO_H + +#include + +#include + +#ifndef OPENSSL_NO_FP_API +#include +#endif + +#include +#include +#include +#include + +#ifdef CHARSET_EBCDIC +#include +#endif + +/* Resolve problems on some operating systems with symbol names that clash + one way or another */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Backward compatibility to SSLeay */ +/* This is more to be used to check the correct DLL is being used + * in the MS world. 
*/ +#define SSLEAY_VERSION_NUMBER OPENSSL_VERSION_NUMBER +#define SSLEAY_VERSION 0 +/* #define SSLEAY_OPTIONS 1 no longer supported */ +#define SSLEAY_CFLAGS 2 +#define SSLEAY_BUILT_ON 3 +#define SSLEAY_PLATFORM 4 +#define SSLEAY_DIR 5 + +/* Already declared in ossl_typ.h */ +#if 0 +typedef struct crypto_ex_data_st CRYPTO_EX_DATA; +/* Called when a new object is created */ +typedef int CRYPTO_EX_new(void *parent, void *ptr, CRYPTO_EX_DATA *ad, + int idx, long argl, void *argp); +/* Called when an object is free()ed */ +typedef void CRYPTO_EX_free(void *parent, void *ptr, CRYPTO_EX_DATA *ad, + int idx, long argl, void *argp); +/* Called when we need to dup an object */ +typedef int CRYPTO_EX_dup(CRYPTO_EX_DATA *to, CRYPTO_EX_DATA *from, void *from_d, + int idx, long argl, void *argp); +#endif + +/* A generic structure to pass assorted data in a expandable way */ +typedef struct openssl_item_st + { + int code; + void *value; /* Not used for flag attributes */ + size_t value_size; /* Max size of value for output, length for input */ + size_t *value_length; /* Returned length of value for output */ + } OPENSSL_ITEM; + + +/* When changing the CRYPTO_LOCK_* list, be sure to maintin the text lock + * names in cryptlib.c + */ + +#define CRYPTO_LOCK_ERR 1 +#define CRYPTO_LOCK_EX_DATA 2 +#define CRYPTO_LOCK_X509 3 +#define CRYPTO_LOCK_X509_INFO 4 +#define CRYPTO_LOCK_X509_PKEY 5 +#define CRYPTO_LOCK_X509_CRL 6 +#define CRYPTO_LOCK_X509_REQ 7 +#define CRYPTO_LOCK_DSA 8 +#define CRYPTO_LOCK_RSA 9 +#define CRYPTO_LOCK_EVP_PKEY 10 +#define CRYPTO_LOCK_X509_STORE 11 +#define CRYPTO_LOCK_SSL_CTX 12 +#define CRYPTO_LOCK_SSL_CERT 13 +#define CRYPTO_LOCK_SSL_SESSION 14 +#define CRYPTO_LOCK_SSL_SESS_CERT 15 +#define CRYPTO_LOCK_SSL 16 +#define CRYPTO_LOCK_SSL_METHOD 17 +#define CRYPTO_LOCK_RAND 18 +#define CRYPTO_LOCK_RAND2 19 +#define CRYPTO_LOCK_MALLOC 20 +#define CRYPTO_LOCK_BIO 21 +#define CRYPTO_LOCK_GETHOSTBYNAME 22 +#define CRYPTO_LOCK_GETSERVBYNAME 23 +#define CRYPTO_LOCK_READDIR 24 +#define CRYPTO_LOCK_RSA_BLINDING 25 +#define CRYPTO_LOCK_DH 26 +#define CRYPTO_LOCK_MALLOC2 27 +#define CRYPTO_LOCK_DSO 28 +#define CRYPTO_LOCK_DYNLOCK 29 +#define CRYPTO_LOCK_ENGINE 30 +#define CRYPTO_LOCK_UI 31 +#define CRYPTO_LOCK_ECDSA 32 +#define CRYPTO_LOCK_EC 33 +#define CRYPTO_LOCK_ECDH 34 +#define CRYPTO_LOCK_BN 35 +#define CRYPTO_LOCK_EC_PRE_COMP 36 +#define CRYPTO_LOCK_STORE 37 +#define CRYPTO_LOCK_COMP 38 +#define CRYPTO_LOCK_FIPS 39 +#define CRYPTO_LOCK_FIPS2 40 +#define CRYPTO_NUM_LOCKS 41 + +#define CRYPTO_LOCK 1 +#define CRYPTO_UNLOCK 2 +#define CRYPTO_READ 4 +#define CRYPTO_WRITE 8 + +#ifndef OPENSSL_NO_LOCKING +#ifndef CRYPTO_w_lock +#define CRYPTO_w_lock(type) \ + CRYPTO_lock(CRYPTO_LOCK|CRYPTO_WRITE,type,__FILE__,__LINE__) +#define CRYPTO_w_unlock(type) \ + CRYPTO_lock(CRYPTO_UNLOCK|CRYPTO_WRITE,type,__FILE__,__LINE__) +#define CRYPTO_r_lock(type) \ + CRYPTO_lock(CRYPTO_LOCK|CRYPTO_READ,type,__FILE__,__LINE__) +#define CRYPTO_r_unlock(type) \ + CRYPTO_lock(CRYPTO_UNLOCK|CRYPTO_READ,type,__FILE__,__LINE__) +#define CRYPTO_add(addr,amount,type) \ + CRYPTO_add_lock(addr,amount,type,__FILE__,__LINE__) +#endif +#else +#define CRYPTO_w_lock(a) +#define CRYPTO_w_unlock(a) +#define CRYPTO_r_lock(a) +#define CRYPTO_r_unlock(a) +#define CRYPTO_add(a,b,c) ((*(a))+=(b)) +#endif + +/* Some applications as well as some parts of OpenSSL need to allocate + and deallocate locks in a dynamic fashion. The following typedef + makes this possible in a type-safe manner. 
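/*
 * Illustrative sketch (not part of this header): in OpenSSL 1.0.x the static
 * locks identified by the CRYPTO_LOCK_* values above only take effect once
 * the application installs a locking callback.  CRYPTO_num_locks(),
 * CRYPTO_set_locking_callback() and the CRYPTO_THREADID functions used here
 * are declared further down in this header; the pthread usage is an
 * assumption made for the example.
 */
#include <pthread.h>
#include <openssl/crypto.h>

static pthread_mutex_t *lock_cs;

static void locking_cb(int mode, int type, const char *file, int line)
{
	(void)file; (void)line;
	if (mode & CRYPTO_LOCK)
		pthread_mutex_lock(&lock_cs[type]);
	else
		pthread_mutex_unlock(&lock_cs[type]);
}

static void threadid_cb(CRYPTO_THREADID *id)
{
	/* assumes pthread_t converts meaningfully to unsigned long */
	CRYPTO_THREADID_set_numeric(id, (unsigned long)pthread_self());
}

int thread_setup(void)
{
	int i, n = CRYPTO_num_locks();

	lock_cs = OPENSSL_malloc(n * sizeof(pthread_mutex_t));
	if (lock_cs == NULL)
		return 0;
	for (i = 0; i < n; i++)
		pthread_mutex_init(&lock_cs[i], NULL);
	CRYPTO_THREADID_set_callback(threadid_cb);
	CRYPTO_set_locking_callback(locking_cb);
	return 1;
}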
*/ +/* struct CRYPTO_dynlock_value has to be defined by the application. */ +typedef struct + { + int references; + struct CRYPTO_dynlock_value *data; + } CRYPTO_dynlock; + + +/* The following can be used to detect memory leaks in the SSLeay library. + * It used, it turns on malloc checking */ + +#define CRYPTO_MEM_CHECK_OFF 0x0 /* an enume */ +#define CRYPTO_MEM_CHECK_ON 0x1 /* a bit */ +#define CRYPTO_MEM_CHECK_ENABLE 0x2 /* a bit */ +#define CRYPTO_MEM_CHECK_DISABLE 0x3 /* an enume */ + +/* The following are bit values to turn on or off options connected to the + * malloc checking functionality */ + +/* Adds time to the memory checking information */ +#define V_CRYPTO_MDEBUG_TIME 0x1 /* a bit */ +/* Adds thread number to the memory checking information */ +#define V_CRYPTO_MDEBUG_THREAD 0x2 /* a bit */ + +#define V_CRYPTO_MDEBUG_ALL (V_CRYPTO_MDEBUG_TIME | V_CRYPTO_MDEBUG_THREAD) + + +/* predec of the BIO type */ +typedef struct bio_st BIO_dummy; + +struct crypto_ex_data_st + { + STACK_OF(void) *sk; + int dummy; /* gcc is screwing up this data structure :-( */ + }; +DECLARE_STACK_OF(void) + +/* This stuff is basically class callback functions + * The current classes are SSL_CTX, SSL, SSL_SESSION, and a few more */ + +typedef struct crypto_ex_data_func_st + { + long argl; /* Arbitary long */ + void *argp; /* Arbitary void * */ + CRYPTO_EX_new *new_func; + CRYPTO_EX_free *free_func; + CRYPTO_EX_dup *dup_func; + } CRYPTO_EX_DATA_FUNCS; + +DECLARE_STACK_OF(CRYPTO_EX_DATA_FUNCS) + +/* Per class, we have a STACK of CRYPTO_EX_DATA_FUNCS for each CRYPTO_EX_DATA + * entry. + */ + +#define CRYPTO_EX_INDEX_BIO 0 +#define CRYPTO_EX_INDEX_SSL 1 +#define CRYPTO_EX_INDEX_SSL_CTX 2 +#define CRYPTO_EX_INDEX_SSL_SESSION 3 +#define CRYPTO_EX_INDEX_X509_STORE 4 +#define CRYPTO_EX_INDEX_X509_STORE_CTX 5 +#define CRYPTO_EX_INDEX_RSA 6 +#define CRYPTO_EX_INDEX_DSA 7 +#define CRYPTO_EX_INDEX_DH 8 +#define CRYPTO_EX_INDEX_ENGINE 9 +#define CRYPTO_EX_INDEX_X509 10 +#define CRYPTO_EX_INDEX_UI 11 +#define CRYPTO_EX_INDEX_ECDSA 12 +#define CRYPTO_EX_INDEX_ECDH 13 +#define CRYPTO_EX_INDEX_COMP 14 +#define CRYPTO_EX_INDEX_STORE 15 + +/* Dynamically assigned indexes start from this value (don't use directly, use + * via CRYPTO_ex_data_new_class). */ +#define CRYPTO_EX_INDEX_USER 100 + + +/* This is the default callbacks, but we can have others as well: + * this is needed in Win32 where the application malloc and the + * library malloc may not be the same. 
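/*
 * Illustrative sketch (not part of this header): routing OpenSSL's heap usage
 * through the application's allocator with CRYPTO_set_mem_functions(),
 * declared further down in this header.  It only succeeds (returns 1) if it
 * is called before OpenSSL has performed any allocation; the wrappers here
 * simply forward to the C library.
 */
#include <stdlib.h>
#include <openssl/crypto.h>

static void *app_malloc(size_t n)           { return malloc(n); }
static void *app_realloc(void *p, size_t n) { return realloc(p, n); }
static void  app_free(void *p)              { free(p); }

int use_app_allocator(void)
{
	return CRYPTO_set_mem_functions(app_malloc, app_realloc, app_free);
}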
+ */ +#define CRYPTO_malloc_init() CRYPTO_set_mem_functions(\ + malloc, realloc, free) + +#if defined CRYPTO_MDEBUG_ALL || defined CRYPTO_MDEBUG_TIME || defined CRYPTO_MDEBUG_THREAD +# ifndef CRYPTO_MDEBUG /* avoid duplicate #define */ +# define CRYPTO_MDEBUG +# endif +#endif + +/* Set standard debugging functions (not done by default + * unless CRYPTO_MDEBUG is defined) */ +#define CRYPTO_malloc_debug_init() do {\ + CRYPTO_set_mem_debug_functions(\ + CRYPTO_dbg_malloc,\ + CRYPTO_dbg_realloc,\ + CRYPTO_dbg_free,\ + CRYPTO_dbg_set_options,\ + CRYPTO_dbg_get_options);\ + } while(0) + +int CRYPTO_mem_ctrl(int mode); +int CRYPTO_is_mem_check_on(void); + +/* for applications */ +#define MemCheck_start() CRYPTO_mem_ctrl(CRYPTO_MEM_CHECK_ON) +#define MemCheck_stop() CRYPTO_mem_ctrl(CRYPTO_MEM_CHECK_OFF) + +/* for library-internal use */ +#define MemCheck_on() CRYPTO_mem_ctrl(CRYPTO_MEM_CHECK_ENABLE) +#define MemCheck_off() CRYPTO_mem_ctrl(CRYPTO_MEM_CHECK_DISABLE) +#define is_MemCheck_on() CRYPTO_is_mem_check_on() + +#define OPENSSL_malloc(num) CRYPTO_malloc((int)num,__FILE__,__LINE__) +#define OPENSSL_strdup(str) CRYPTO_strdup((str),__FILE__,__LINE__) +#define OPENSSL_realloc(addr,num) \ + CRYPTO_realloc((char *)addr,(int)num,__FILE__,__LINE__) +#define OPENSSL_realloc_clean(addr,old_num,num) \ + CRYPTO_realloc_clean(addr,old_num,num,__FILE__,__LINE__) +#define OPENSSL_remalloc(addr,num) \ + CRYPTO_remalloc((char **)addr,(int)num,__FILE__,__LINE__) +#define OPENSSL_freeFunc CRYPTO_free +#define OPENSSL_free(addr) CRYPTO_free(addr) + +#define OPENSSL_malloc_locked(num) \ + CRYPTO_malloc_locked((int)num,__FILE__,__LINE__) +#define OPENSSL_free_locked(addr) CRYPTO_free_locked(addr) + + +const char *SSLeay_version(int type); +unsigned long SSLeay(void); + +int OPENSSL_issetugid(void); + +/* An opaque type representing an implementation of "ex_data" support */ +typedef struct st_CRYPTO_EX_DATA_IMPL CRYPTO_EX_DATA_IMPL; +/* Return an opaque pointer to the current "ex_data" implementation */ +const CRYPTO_EX_DATA_IMPL *CRYPTO_get_ex_data_implementation(void); +/* Sets the "ex_data" implementation to be used (if it's not too late) */ +int CRYPTO_set_ex_data_implementation(const CRYPTO_EX_DATA_IMPL *i); +/* Get a new "ex_data" class, and return the corresponding "class_index" */ +int CRYPTO_ex_data_new_class(void); +/* Within a given class, get/register a new index */ +int CRYPTO_get_ex_new_index(int class_index, long argl, void *argp, + CRYPTO_EX_new *new_func, CRYPTO_EX_dup *dup_func, + CRYPTO_EX_free *free_func); +/* Initialise/duplicate/free CRYPTO_EX_DATA variables corresponding to a given + * class (invokes whatever per-class callbacks are applicable) */ +int CRYPTO_new_ex_data(int class_index, void *obj, CRYPTO_EX_DATA *ad); +int CRYPTO_dup_ex_data(int class_index, CRYPTO_EX_DATA *to, + CRYPTO_EX_DATA *from); +void CRYPTO_free_ex_data(int class_index, void *obj, CRYPTO_EX_DATA *ad); +/* Get/set data in a CRYPTO_EX_DATA variable corresponding to a particular index + * (relative to the class type involved) */ +int CRYPTO_set_ex_data(CRYPTO_EX_DATA *ad, int idx, void *val); +void *CRYPTO_get_ex_data(const CRYPTO_EX_DATA *ad,int idx); +/* This function cleans up all "ex_data" state. It mustn't be called under + * potential race-conditions. */ +void CRYPTO_cleanup_all_ex_data(void); + +int CRYPTO_get_new_lockid(char *name); + +int CRYPTO_num_locks(void); /* return CRYPTO_NUM_LOCKS (shared libs!) 
*/ +void CRYPTO_lock(int mode, int type,const char *file,int line); +void CRYPTO_set_locking_callback(void (*func)(int mode,int type, + const char *file,int line)); +void (*CRYPTO_get_locking_callback(void))(int mode,int type,const char *file, + int line); +void CRYPTO_set_add_lock_callback(int (*func)(int *num,int mount,int type, + const char *file, int line)); +int (*CRYPTO_get_add_lock_callback(void))(int *num,int mount,int type, + const char *file,int line); + +/* Don't use this structure directly. */ +typedef struct crypto_threadid_st + { + void *ptr; + unsigned long val; + } CRYPTO_THREADID; +/* Only use CRYPTO_THREADID_set_[numeric|pointer]() within callbacks */ +void CRYPTO_THREADID_set_numeric(CRYPTO_THREADID *id, unsigned long val); +void CRYPTO_THREADID_set_pointer(CRYPTO_THREADID *id, void *ptr); +int CRYPTO_THREADID_set_callback(void (*threadid_func)(CRYPTO_THREADID *)); +void (*CRYPTO_THREADID_get_callback(void))(CRYPTO_THREADID *); +void CRYPTO_THREADID_current(CRYPTO_THREADID *id); +int CRYPTO_THREADID_cmp(const CRYPTO_THREADID *a, const CRYPTO_THREADID *b); +void CRYPTO_THREADID_cpy(CRYPTO_THREADID *dest, const CRYPTO_THREADID *src); +unsigned long CRYPTO_THREADID_hash(const CRYPTO_THREADID *id); +#ifndef OPENSSL_NO_DEPRECATED +void CRYPTO_set_id_callback(unsigned long (*func)(void)); +unsigned long (*CRYPTO_get_id_callback(void))(void); +unsigned long CRYPTO_thread_id(void); +#endif + +const char *CRYPTO_get_lock_name(int type); +int CRYPTO_add_lock(int *pointer,int amount,int type, const char *file, + int line); + +int CRYPTO_get_new_dynlockid(void); +void CRYPTO_destroy_dynlockid(int i); +struct CRYPTO_dynlock_value *CRYPTO_get_dynlock_value(int i); +void CRYPTO_set_dynlock_create_callback(struct CRYPTO_dynlock_value *(*dyn_create_function)(const char *file, int line)); +void CRYPTO_set_dynlock_lock_callback(void (*dyn_lock_function)(int mode, struct CRYPTO_dynlock_value *l, const char *file, int line)); +void CRYPTO_set_dynlock_destroy_callback(void (*dyn_destroy_function)(struct CRYPTO_dynlock_value *l, const char *file, int line)); +struct CRYPTO_dynlock_value *(*CRYPTO_get_dynlock_create_callback(void))(const char *file,int line); +void (*CRYPTO_get_dynlock_lock_callback(void))(int mode, struct CRYPTO_dynlock_value *l, const char *file,int line); +void (*CRYPTO_get_dynlock_destroy_callback(void))(struct CRYPTO_dynlock_value *l, const char *file,int line); + +/* CRYPTO_set_mem_functions includes CRYPTO_set_locked_mem_functions -- + * call the latter last if you need different functions */ +int CRYPTO_set_mem_functions(void *(*m)(size_t),void *(*r)(void *,size_t), void (*f)(void *)); +int CRYPTO_set_locked_mem_functions(void *(*m)(size_t), void (*free_func)(void *)); +int CRYPTO_set_mem_ex_functions(void *(*m)(size_t,const char *,int), + void *(*r)(void *,size_t,const char *,int), + void (*f)(void *)); +int CRYPTO_set_locked_mem_ex_functions(void *(*m)(size_t,const char *,int), + void (*free_func)(void *)); +int CRYPTO_set_mem_debug_functions(void (*m)(void *,int,const char *,int,int), + void (*r)(void *,void *,int,const char *,int,int), + void (*f)(void *,int), + void (*so)(long), + long (*go)(void)); +void CRYPTO_get_mem_functions(void *(**m)(size_t),void *(**r)(void *, size_t), void (**f)(void *)); +void CRYPTO_get_locked_mem_functions(void *(**m)(size_t), void (**f)(void *)); +void CRYPTO_get_mem_ex_functions(void *(**m)(size_t,const char *,int), + void *(**r)(void *, size_t,const char *,int), + void (**f)(void *)); +void CRYPTO_get_locked_mem_ex_functions(void 
*(**m)(size_t,const char *,int), + void (**f)(void *)); +void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int), + void (**r)(void *,void *,int,const char *,int,int), + void (**f)(void *,int), + void (**so)(long), + long (**go)(void)); + +void *CRYPTO_malloc_locked(int num, const char *file, int line); +void CRYPTO_free_locked(void *ptr); +void *CRYPTO_malloc(int num, const char *file, int line); +char *CRYPTO_strdup(const char *str, const char *file, int line); +void CRYPTO_free(void *ptr); +void *CRYPTO_realloc(void *addr,int num, const char *file, int line); +void *CRYPTO_realloc_clean(void *addr,int old_num,int num,const char *file, + int line); +void *CRYPTO_remalloc(void *addr,int num, const char *file, int line); + +void OPENSSL_cleanse(void *ptr, size_t len); + +void CRYPTO_set_mem_debug_options(long bits); +long CRYPTO_get_mem_debug_options(void); + +#define CRYPTO_push_info(info) \ + CRYPTO_push_info_(info, __FILE__, __LINE__); +int CRYPTO_push_info_(const char *info, const char *file, int line); +int CRYPTO_pop_info(void); +int CRYPTO_remove_all_info(void); + + +/* Default debugging functions (enabled by CRYPTO_malloc_debug_init() macro; + * used as default in CRYPTO_MDEBUG compilations): */ +/* The last argument has the following significance: + * + * 0: called before the actual memory allocation has taken place + * 1: called after the actual memory allocation has taken place + */ +void CRYPTO_dbg_malloc(void *addr,int num,const char *file,int line,int before_p); +void CRYPTO_dbg_realloc(void *addr1,void *addr2,int num,const char *file,int line,int before_p); +void CRYPTO_dbg_free(void *addr,int before_p); +/* Tell the debugging code about options. By default, the following values + * apply: + * + * 0: Clear all options. + * V_CRYPTO_MDEBUG_TIME (1): Set the "Show Time" option. + * V_CRYPTO_MDEBUG_THREAD (2): Set the "Show Thread Number" option. + * V_CRYPTO_MDEBUG_ALL (3): 1 + 2 + */ +void CRYPTO_dbg_set_options(long bits); +long CRYPTO_dbg_get_options(void); + + +#ifndef OPENSSL_NO_FP_API +void CRYPTO_mem_leaks_fp(FILE *); +#endif +void CRYPTO_mem_leaks(struct bio_st *bio); +/* unsigned long order, char *file, int line, int num_bytes, char *addr */ +typedef void *CRYPTO_MEM_LEAK_CB(unsigned long, const char *, int, int, void *); +void CRYPTO_mem_leaks_cb(CRYPTO_MEM_LEAK_CB *cb); + +/* die if we have to */ +void OpenSSLDie(const char *file,int line,const char *assertion); +#define OPENSSL_assert(e) (void)((e) ? 0 : (OpenSSLDie(__FILE__, __LINE__, #e),1)) + +unsigned long *OPENSSL_ia32cap_loc(void); +#define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc())) +int OPENSSL_isservice(void); + +int FIPS_mode(void); +int FIPS_mode_set(int r); + +void OPENSSL_init(void); + +#define fips_md_init(alg) fips_md_init_ctx(alg, alg) + +#ifdef OPENSSL_FIPS +#define fips_md_init_ctx(alg, cx) \ + int alg##_Init(cx##_CTX *c) \ + { \ + if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \ + "Low level API call to digest " #alg " forbidden in FIPS mode!"); \ + return private_##alg##_Init(c); \ + } \ + int private_##alg##_Init(cx##_CTX *c) + +#define fips_cipher_abort(alg) \ + if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \ + "Low level API call to cipher " #alg " forbidden in FIPS mode!") + +#else +#define fips_md_init_ctx(alg, cx) \ + int alg##_Init(cx##_CTX *c) +#define fips_cipher_abort(alg) while(0) +#endif + +/* CRYPTO_memcmp returns zero iff the |len| bytes at |a| and |b| are equal. 
It + * takes an amount of time dependent on |len|, but independent of the contents + * of |a| and |b|. Unlike memcmp, it cannot be used to put elements into a + * defined order as the return value when a != b is undefined, other than to be + * non-zero. */ +int CRYPTO_memcmp(const void *a, const void *b, size_t len); + +/* BEGIN ERROR CODES */ +/* The following lines are auto generated by the script mkerr.pl. Any changes + * made after this point may be overwritten when the script is next run. + */ +void ERR_load_CRYPTO_strings(void); + +/* Error codes for the CRYPTO functions. */ + +/* Function codes. */ +#define CRYPTO_F_CRYPTO_GET_EX_NEW_INDEX 100 +#define CRYPTO_F_CRYPTO_GET_NEW_DYNLOCKID 103 +#define CRYPTO_F_CRYPTO_GET_NEW_LOCKID 101 +#define CRYPTO_F_CRYPTO_SET_EX_DATA 102 +#define CRYPTO_F_DEF_ADD_INDEX 104 +#define CRYPTO_F_DEF_GET_CLASS 105 +#define CRYPTO_F_FIPS_MODE_SET 109 +#define CRYPTO_F_INT_DUP_EX_DATA 106 +#define CRYPTO_F_INT_FREE_EX_DATA 107 +#define CRYPTO_F_INT_NEW_EX_DATA 108 + +/* Reason codes. */ +#define CRYPTO_R_FIPS_MODE_NOT_SUPPORTED 101 +#define CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK 100 + +#ifdef __cplusplus +} +#endif +#endif diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/e_os2.h b/compat/curl-for-windows/openssl/openssl/include/openssl/e_os2.h new file mode 100644 index 0000000000..ab3f1ee44f --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/include/openssl/e_os2.h @@ -0,0 +1 @@ +#include "../../e_os2.h" diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/opensslconf.h b/compat/curl-for-windows/openssl/openssl/include/openssl/opensslconf.h new file mode 100644 index 0000000000..221be629b7 --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/include/openssl/opensslconf.h @@ -0,0 +1 @@ +#include "../../crypto/opensslconf.h" diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/opensslv.h b/compat/curl-for-windows/openssl/openssl/include/openssl/opensslv.h new file mode 100644 index 0000000000..2411505300 --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/include/openssl/opensslv.h @@ -0,0 +1,89 @@ +#ifndef HEADER_OPENSSLV_H +#define HEADER_OPENSSLV_H + +/* Numeric release version identifier: + * MNNFFPPS: major minor fix patch status + * The status nibble has one of the values 0 for development, 1 to e for betas + * 1 to 14, and f for release. The patch level is exactly that. + * For example: + * 0.9.3-dev 0x00903000 + * 0.9.3-beta1 0x00903001 + * 0.9.3-beta2-dev 0x00903002 + * 0.9.3-beta2 0x00903002 (same as ...beta2-dev) + * 0.9.3 0x0090300f + * 0.9.3a 0x0090301f + * 0.9.4 0x0090400f + * 1.2.3z 0x102031af + * + * For continuity reasons (because 0.9.5 is already out, and is coded + * 0x00905100), between 0.9.5 and 0.9.6 the coding of the patch level + * part is slightly different, by setting the highest bit. This means + * that 0.9.5a looks like this: 0x0090581f. At 0.9.6, we can start + * with 0x0090600S... + * + * (Prior to 0.9.3-dev a different scheme was used: 0.9.2b is 0x0922.) + * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for + * major minor fix final patch/beta) + */ +#define OPENSSL_VERSION_NUMBER 0x1000109fL +#ifdef OPENSSL_FIPS +#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1i-fips 6 Aug 2014" +#else +#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1i 6 Aug 2014" +#endif +#define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT + + +/* The macros below are to be used for shared library (.so, .dll, ...) + * versioning. 
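/*
 * Illustrative sketch (not part of this header): unpacking the MNNFFPPS
 * fields of OPENSSL_VERSION_NUMBER described earlier in this header.  With
 * the value defined here, 0x1000109fL, this prints major 1, minor 0, fix 1,
 * patch 9 (the "i" in 1.0.1i) and status 0xf (release).
 */
#include <stdio.h>
#include <openssl/opensslv.h>

void print_version_fields(void)
{
	unsigned long v = OPENSSL_VERSION_NUMBER;

	printf("major %lu minor %lu fix %lu patch %lu status %#lx\n",
	       (v >> 28) & 0xf,    /* M:  1             */
	       (v >> 20) & 0xff,   /* NN: 00            */
	       (v >> 12) & 0xff,   /* FF: 01            */
	       (v >> 4)  & 0xff,   /* PP: 09 == "i"     */
	       v & 0xf);           /* S:  0xf = release */
}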
That kind of versioning works a bit differently between + * operating systems. The most usual scheme is to set a major and a minor + * number, and have the runtime loader check that the major number is equal + * to what it was at application link time, while the minor number has to + * be greater or equal to what it was at application link time. With this + * scheme, the version number is usually part of the file name, like this: + * + * libcrypto.so.0.9 + * + * Some unixen also make a softlink with the major verson number only: + * + * libcrypto.so.0 + * + * On Tru64 and IRIX 6.x it works a little bit differently. There, the + * shared library version is stored in the file, and is actually a series + * of versions, separated by colons. The rightmost version present in the + * library when linking an application is stored in the application to be + * matched at run time. When the application is run, a check is done to + * see if the library version stored in the application matches any of the + * versions in the version string of the library itself. + * This version string can be constructed in any way, depending on what + * kind of matching is desired. However, to implement the same scheme as + * the one used in the other unixen, all compatible versions, from lowest + * to highest, should be part of the string. Consecutive builds would + * give the following versions strings: + * + * 3.0 + * 3.0:3.1 + * 3.0:3.1:3.2 + * 4.0 + * 4.0:4.1 + * + * Notice how version 4 is completely incompatible with version, and + * therefore give the breach you can see. + * + * There may be other schemes as well that I haven't yet discovered. + * + * So, here's the way it works here: first of all, the library version + * number doesn't need at all to match the overall OpenSSL version. + * However, it's nice and more understandable if it actually does. + * The current library version is stored in the macro SHLIB_VERSION_NUMBER, + * which is just a piece of text in the format "M.m.e" (Major, minor, edit). + * For the sake of Tru64, IRIX, and any other OS that behaves in similar ways, + * we need to keep a history of version numbers, which is done in the + * macro SHLIB_VERSION_HISTORY. The numbers are separated by colons and + * should only keep the versions that are binary compatible with the current. + */ +#define SHLIB_VERSION_HISTORY "" +#define SHLIB_VERSION_NUMBER "1.0.0" + + +#endif /* HEADER_OPENSSLV_H */ diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/ossl_typ.h b/compat/curl-for-windows/openssl/openssl/include/openssl/ossl_typ.h new file mode 100644 index 0000000000..ea9227f6f9 --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/include/openssl/ossl_typ.h @@ -0,0 +1,202 @@ +/* ==================================================================== + * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. 
All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). 
+ * + */ + +#ifndef HEADER_OPENSSL_TYPES_H +#define HEADER_OPENSSL_TYPES_H + +#include + +#ifdef NO_ASN1_TYPEDEFS +#define ASN1_INTEGER ASN1_STRING +#define ASN1_ENUMERATED ASN1_STRING +#define ASN1_BIT_STRING ASN1_STRING +#define ASN1_OCTET_STRING ASN1_STRING +#define ASN1_PRINTABLESTRING ASN1_STRING +#define ASN1_T61STRING ASN1_STRING +#define ASN1_IA5STRING ASN1_STRING +#define ASN1_UTCTIME ASN1_STRING +#define ASN1_GENERALIZEDTIME ASN1_STRING +#define ASN1_TIME ASN1_STRING +#define ASN1_GENERALSTRING ASN1_STRING +#define ASN1_UNIVERSALSTRING ASN1_STRING +#define ASN1_BMPSTRING ASN1_STRING +#define ASN1_VISIBLESTRING ASN1_STRING +#define ASN1_UTF8STRING ASN1_STRING +#define ASN1_BOOLEAN int +#define ASN1_NULL int +#else +typedef struct asn1_string_st ASN1_INTEGER; +typedef struct asn1_string_st ASN1_ENUMERATED; +typedef struct asn1_string_st ASN1_BIT_STRING; +typedef struct asn1_string_st ASN1_OCTET_STRING; +typedef struct asn1_string_st ASN1_PRINTABLESTRING; +typedef struct asn1_string_st ASN1_T61STRING; +typedef struct asn1_string_st ASN1_IA5STRING; +typedef struct asn1_string_st ASN1_GENERALSTRING; +typedef struct asn1_string_st ASN1_UNIVERSALSTRING; +typedef struct asn1_string_st ASN1_BMPSTRING; +typedef struct asn1_string_st ASN1_UTCTIME; +typedef struct asn1_string_st ASN1_TIME; +typedef struct asn1_string_st ASN1_GENERALIZEDTIME; +typedef struct asn1_string_st ASN1_VISIBLESTRING; +typedef struct asn1_string_st ASN1_UTF8STRING; +typedef struct asn1_string_st ASN1_STRING; +typedef int ASN1_BOOLEAN; +typedef int ASN1_NULL; +#endif + +typedef struct ASN1_ITEM_st ASN1_ITEM; +typedef struct asn1_pctx_st ASN1_PCTX; + +#ifdef OPENSSL_SYS_WIN32 +#undef X509_NAME +#undef X509_EXTENSIONS +#undef X509_CERT_PAIR +#undef PKCS7_ISSUER_AND_SERIAL +#undef OCSP_REQUEST +#undef OCSP_RESPONSE +#endif + +#ifdef BIGNUM +#undef BIGNUM +#endif +typedef struct bignum_st BIGNUM; +typedef struct bignum_ctx BN_CTX; +typedef struct bn_blinding_st BN_BLINDING; +typedef struct bn_mont_ctx_st BN_MONT_CTX; +typedef struct bn_recp_ctx_st BN_RECP_CTX; +typedef struct bn_gencb_st BN_GENCB; + +typedef struct buf_mem_st BUF_MEM; + +typedef struct evp_cipher_st EVP_CIPHER; +typedef struct evp_cipher_ctx_st EVP_CIPHER_CTX; +typedef struct env_md_st EVP_MD; +typedef struct env_md_ctx_st EVP_MD_CTX; +typedef struct evp_pkey_st EVP_PKEY; + +typedef struct evp_pkey_asn1_method_st EVP_PKEY_ASN1_METHOD; + +typedef struct evp_pkey_method_st EVP_PKEY_METHOD; +typedef struct evp_pkey_ctx_st EVP_PKEY_CTX; + +typedef struct dh_st DH; +typedef struct dh_method DH_METHOD; + +typedef struct dsa_st DSA; +typedef struct dsa_method DSA_METHOD; + +typedef struct rsa_st RSA; +typedef struct rsa_meth_st RSA_METHOD; + +typedef struct rand_meth_st RAND_METHOD; + +typedef struct ecdh_method ECDH_METHOD; +typedef struct ecdsa_method ECDSA_METHOD; + +typedef struct x509_st X509; +typedef struct X509_algor_st X509_ALGOR; +typedef struct X509_crl_st X509_CRL; +typedef struct x509_crl_method_st X509_CRL_METHOD; +typedef struct x509_revoked_st X509_REVOKED; +typedef struct X509_name_st X509_NAME; +typedef struct X509_pubkey_st X509_PUBKEY; +typedef struct x509_store_st X509_STORE; +typedef struct x509_store_ctx_st X509_STORE_CTX; + +typedef struct pkcs8_priv_key_info_st PKCS8_PRIV_KEY_INFO; + +typedef struct v3_ext_ctx X509V3_CTX; +typedef struct conf_st CONF; + +typedef struct store_st STORE; +typedef struct store_method_st STORE_METHOD; + +typedef struct ui_st UI; +typedef struct ui_method_st UI_METHOD; + +typedef struct st_ERR_FNS ERR_FNS; + 
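/*
 * Illustrative sketch (not part of this header): these typedefs forward-
 * declare the library's structures so that headers and applications can pass
 * pointers around without needing (or, for types such as BN_CTX, without ever
 * seeing) the struct definitions.  BN_CTX_new() and BN_CTX_free() come from
 * <openssl/bn.h>.
 */
#include <openssl/ossl_typ.h>
#include <openssl/bn.h>

void opaque_type_demo(void)
{
	/* struct bignum_ctx is not defined in the public headers; the pointer
	 * is all the application ever handles. */
	BN_CTX *ctx = BN_CTX_new();

	if (ctx != NULL)
		BN_CTX_free(ctx);
}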
+typedef struct engine_st ENGINE; +typedef struct ssl_st SSL; +typedef struct ssl_ctx_st SSL_CTX; + +typedef struct X509_POLICY_NODE_st X509_POLICY_NODE; +typedef struct X509_POLICY_LEVEL_st X509_POLICY_LEVEL; +typedef struct X509_POLICY_TREE_st X509_POLICY_TREE; +typedef struct X509_POLICY_CACHE_st X509_POLICY_CACHE; + +typedef struct AUTHORITY_KEYID_st AUTHORITY_KEYID; +typedef struct DIST_POINT_st DIST_POINT; +typedef struct ISSUING_DIST_POINT_st ISSUING_DIST_POINT; +typedef struct NAME_CONSTRAINTS_st NAME_CONSTRAINTS; + + /* If placed in pkcs12.h, we end up with a circular depency with pkcs7.h */ +#define DECLARE_PKCS12_STACK_OF(type) /* Nothing */ +#define IMPLEMENT_PKCS12_STACK_OF(type) /* Nothing */ + +typedef struct crypto_ex_data_st CRYPTO_EX_DATA; +/* Callback types for crypto.h */ +typedef int CRYPTO_EX_new(void *parent, void *ptr, CRYPTO_EX_DATA *ad, + int idx, long argl, void *argp); +typedef void CRYPTO_EX_free(void *parent, void *ptr, CRYPTO_EX_DATA *ad, + int idx, long argl, void *argp); +typedef int CRYPTO_EX_dup(CRYPTO_EX_DATA *to, CRYPTO_EX_DATA *from, void *from_d, + int idx, long argl, void *argp); + +typedef struct ocsp_req_ctx_st OCSP_REQ_CTX; +typedef struct ocsp_response_st OCSP_RESPONSE; +typedef struct ocsp_responder_id_st OCSP_RESPID; + +#endif /* def HEADER_OPENSSL_TYPES_H */ diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/safestack.h b/compat/curl-for-windows/openssl/openssl/include/openssl/safestack.h new file mode 100644 index 0000000000..ea3aa0d800 --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/include/openssl/safestack.h @@ -0,0 +1,2663 @@ +/* ==================================================================== + * Copyright (c) 1999 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + +#ifndef HEADER_SAFESTACK_H +#define HEADER_SAFESTACK_H + +#include + +#ifndef CHECKED_PTR_OF +#define CHECKED_PTR_OF(type, p) \ + ((void*) (1 ? p : (type*)0)) +#endif + +/* In C++ we get problems because an explicit cast is needed from (void *) + * we use CHECKED_STACK_OF to ensure the correct type is passed in the macros + * below. + */ + +#define CHECKED_STACK_OF(type, p) \ + ((_STACK*) (1 ? p : (STACK_OF(type)*)0)) + +#define CHECKED_SK_FREE_FUNC(type, p) \ + ((void (*)(void *)) ((1 ? p : (void (*)(type *))0))) + +#define CHECKED_SK_FREE_FUNC2(type, p) \ + ((void (*)(void *)) ((1 ? p : (void (*)(type))0))) + +#define CHECKED_SK_CMP_FUNC(type, p) \ + ((int (*)(const void *, const void *)) \ + ((1 ? p : (int (*)(const type * const *, const type * const *))0))) + +#define STACK_OF(type) struct stack_st_##type +#define PREDECLARE_STACK_OF(type) STACK_OF(type); + +#define DECLARE_STACK_OF(type) \ +STACK_OF(type) \ + { \ + _STACK stack; \ + }; +#define DECLARE_SPECIAL_STACK_OF(type, type2) \ +STACK_OF(type) \ + { \ + _STACK stack; \ + }; + +#define IMPLEMENT_STACK_OF(type) /* nada (obsolete in new safestack approach)*/ + + +/* Strings are special: normally an lhash entry will point to a single + * (somewhat) mutable object. In the case of strings: + * + * a) Instead of a single char, there is an array of chars, NUL-terminated. + * b) The string may have be immutable. + * + * So, they need their own declarations. Especially important for + * type-checking tools, such as Deputy. + * +o * In practice, however, it appears to be hard to have a const + * string. For now, I'm settling for dealing with the fact it is a + * string at all. + */ +typedef char *OPENSSL_STRING; + +typedef const char *OPENSSL_CSTRING; + +/* Confusingly, LHASH_OF(STRING) deals with char ** throughout, but + * STACK_OF(STRING) is really more like STACK_OF(char), only, as + * mentioned above, instead of a single char each entry is a + * NUL-terminated array of chars. So, we have to implement STRING + * specially for STACK_OF. This is dealt with in the autogenerated + * macros below. + */ + +DECLARE_SPECIAL_STACK_OF(OPENSSL_STRING, char) + +/* Similarly, we sometimes use a block of characters, NOT + * nul-terminated. These should also be distinguished from "normal" + * stacks. */ + +typedef void *OPENSSL_BLOCK; +DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void) + +/* SKM_sk_... stack macros are internal to safestack.h: + * never use them directly, use sk__... 
instead */ +#define SKM_sk_new(type, cmp) \ + ((STACK_OF(type) *)sk_new(CHECKED_SK_CMP_FUNC(type, cmp))) +#define SKM_sk_new_null(type) \ + ((STACK_OF(type) *)sk_new_null()) +#define SKM_sk_free(type, st) \ + sk_free(CHECKED_STACK_OF(type, st)) +#define SKM_sk_num(type, st) \ + sk_num(CHECKED_STACK_OF(type, st)) +#define SKM_sk_value(type, st,i) \ + ((type *)sk_value(CHECKED_STACK_OF(type, st), i)) +#define SKM_sk_set(type, st,i,val) \ + sk_set(CHECKED_STACK_OF(type, st), i, CHECKED_PTR_OF(type, val)) +#define SKM_sk_zero(type, st) \ + sk_zero(CHECKED_STACK_OF(type, st)) +#define SKM_sk_push(type, st, val) \ + sk_push(CHECKED_STACK_OF(type, st), CHECKED_PTR_OF(type, val)) +#define SKM_sk_unshift(type, st, val) \ + sk_unshift(CHECKED_STACK_OF(type, st), CHECKED_PTR_OF(type, val)) +#define SKM_sk_find(type, st, val) \ + sk_find(CHECKED_STACK_OF(type, st), CHECKED_PTR_OF(type, val)) +#define SKM_sk_find_ex(type, st, val) \ + sk_find_ex(CHECKED_STACK_OF(type, st), \ + CHECKED_PTR_OF(type, val)) +#define SKM_sk_delete(type, st, i) \ + (type *)sk_delete(CHECKED_STACK_OF(type, st), i) +#define SKM_sk_delete_ptr(type, st, ptr) \ + (type *)sk_delete_ptr(CHECKED_STACK_OF(type, st), CHECKED_PTR_OF(type, ptr)) +#define SKM_sk_insert(type, st,val, i) \ + sk_insert(CHECKED_STACK_OF(type, st), CHECKED_PTR_OF(type, val), i) +#define SKM_sk_set_cmp_func(type, st, cmp) \ + ((int (*)(const type * const *,const type * const *)) \ + sk_set_cmp_func(CHECKED_STACK_OF(type, st), CHECKED_SK_CMP_FUNC(type, cmp))) +#define SKM_sk_dup(type, st) \ + (STACK_OF(type) *)sk_dup(CHECKED_STACK_OF(type, st)) +#define SKM_sk_pop_free(type, st, free_func) \ + sk_pop_free(CHECKED_STACK_OF(type, st), CHECKED_SK_FREE_FUNC(type, free_func)) +#define SKM_sk_shift(type, st) \ + (type *)sk_shift(CHECKED_STACK_OF(type, st)) +#define SKM_sk_pop(type, st) \ + (type *)sk_pop(CHECKED_STACK_OF(type, st)) +#define SKM_sk_sort(type, st) \ + sk_sort(CHECKED_STACK_OF(type, st)) +#define SKM_sk_is_sorted(type, st) \ + sk_is_sorted(CHECKED_STACK_OF(type, st)) + +#define SKM_ASN1_SET_OF_d2i(type, st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + (STACK_OF(type) *)d2i_ASN1_SET( \ + (STACK_OF(OPENSSL_BLOCK) **)CHECKED_PTR_OF(STACK_OF(type)*, st), \ + pp, length, \ + CHECKED_D2I_OF(type, d2i_func), \ + CHECKED_SK_FREE_FUNC(type, free_func), \ + ex_tag, ex_class) + +#define SKM_ASN1_SET_OF_i2d(type, st, pp, i2d_func, ex_tag, ex_class, is_set) \ + i2d_ASN1_SET((STACK_OF(OPENSSL_BLOCK) *)CHECKED_STACK_OF(type, st), pp, \ + CHECKED_I2D_OF(type, i2d_func), \ + ex_tag, ex_class, is_set) + +#define SKM_ASN1_seq_pack(type, st, i2d_func, buf, len) \ + ASN1_seq_pack(CHECKED_PTR_OF(STACK_OF(type), st), \ + CHECKED_I2D_OF(type, i2d_func), buf, len) + +#define SKM_ASN1_seq_unpack(type, buf, len, d2i_func, free_func) \ + (STACK_OF(type) *)ASN1_seq_unpack(buf, len, CHECKED_D2I_OF(type, d2i_func), CHECKED_SK_FREE_FUNC(type, free_func)) + +#define SKM_PKCS12_decrypt_d2i(type, algor, d2i_func, free_func, pass, passlen, oct, seq) \ + (STACK_OF(type) *)PKCS12_decrypt_d2i(algor, \ + CHECKED_D2I_OF(type, d2i_func), \ + CHECKED_SK_FREE_FUNC(type, free_func), \ + pass, passlen, oct, seq) + +/* This block of defines is updated by util/mkstack.pl, please do not touch! 
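/*
 * Illustrative sketch (not part of this header): using the generated
 * sk_<TYPE>_* wrappers (the sk_ASN1_INTEGER_* ones appear further down in
 * this block) instead of the raw SKM_sk_* macros.  ASN1_INTEGER_new(),
 * ASN1_INTEGER_set() and ASN1_INTEGER_free() come from <openssl/asn1.h>;
 * error checking is trimmed for brevity.
 */
#include <openssl/asn1.h>
#include <openssl/safestack.h>

void stack_demo(void)
{
	int i;
	STACK_OF(ASN1_INTEGER) *ints = sk_ASN1_INTEGER_new_null();

	for (i = 0; i < 3; i++) {
		ASN1_INTEGER *ai = ASN1_INTEGER_new();
		ASN1_INTEGER_set(ai, i);          /* element holds the value i */
		sk_ASN1_INTEGER_push(ints, ai);   /* the stack now owns ai */
	}

	for (i = 0; i < sk_ASN1_INTEGER_num(ints); i++) {
		ASN1_INTEGER *ai = sk_ASN1_INTEGER_value(ints, i);
		(void)ai;                         /* ... use the element ... */
	}

	/* frees the stack and each element via ASN1_INTEGER_free() */
	sk_ASN1_INTEGER_pop_free(ints, ASN1_INTEGER_free);
}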
*/ +#define sk_ACCESS_DESCRIPTION_new(cmp) SKM_sk_new(ACCESS_DESCRIPTION, (cmp)) +#define sk_ACCESS_DESCRIPTION_new_null() SKM_sk_new_null(ACCESS_DESCRIPTION) +#define sk_ACCESS_DESCRIPTION_free(st) SKM_sk_free(ACCESS_DESCRIPTION, (st)) +#define sk_ACCESS_DESCRIPTION_num(st) SKM_sk_num(ACCESS_DESCRIPTION, (st)) +#define sk_ACCESS_DESCRIPTION_value(st, i) SKM_sk_value(ACCESS_DESCRIPTION, (st), (i)) +#define sk_ACCESS_DESCRIPTION_set(st, i, val) SKM_sk_set(ACCESS_DESCRIPTION, (st), (i), (val)) +#define sk_ACCESS_DESCRIPTION_zero(st) SKM_sk_zero(ACCESS_DESCRIPTION, (st)) +#define sk_ACCESS_DESCRIPTION_push(st, val) SKM_sk_push(ACCESS_DESCRIPTION, (st), (val)) +#define sk_ACCESS_DESCRIPTION_unshift(st, val) SKM_sk_unshift(ACCESS_DESCRIPTION, (st), (val)) +#define sk_ACCESS_DESCRIPTION_find(st, val) SKM_sk_find(ACCESS_DESCRIPTION, (st), (val)) +#define sk_ACCESS_DESCRIPTION_find_ex(st, val) SKM_sk_find_ex(ACCESS_DESCRIPTION, (st), (val)) +#define sk_ACCESS_DESCRIPTION_delete(st, i) SKM_sk_delete(ACCESS_DESCRIPTION, (st), (i)) +#define sk_ACCESS_DESCRIPTION_delete_ptr(st, ptr) SKM_sk_delete_ptr(ACCESS_DESCRIPTION, (st), (ptr)) +#define sk_ACCESS_DESCRIPTION_insert(st, val, i) SKM_sk_insert(ACCESS_DESCRIPTION, (st), (val), (i)) +#define sk_ACCESS_DESCRIPTION_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ACCESS_DESCRIPTION, (st), (cmp)) +#define sk_ACCESS_DESCRIPTION_dup(st) SKM_sk_dup(ACCESS_DESCRIPTION, st) +#define sk_ACCESS_DESCRIPTION_pop_free(st, free_func) SKM_sk_pop_free(ACCESS_DESCRIPTION, (st), (free_func)) +#define sk_ACCESS_DESCRIPTION_shift(st) SKM_sk_shift(ACCESS_DESCRIPTION, (st)) +#define sk_ACCESS_DESCRIPTION_pop(st) SKM_sk_pop(ACCESS_DESCRIPTION, (st)) +#define sk_ACCESS_DESCRIPTION_sort(st) SKM_sk_sort(ACCESS_DESCRIPTION, (st)) +#define sk_ACCESS_DESCRIPTION_is_sorted(st) SKM_sk_is_sorted(ACCESS_DESCRIPTION, (st)) + +#define sk_ASIdOrRange_new(cmp) SKM_sk_new(ASIdOrRange, (cmp)) +#define sk_ASIdOrRange_new_null() SKM_sk_new_null(ASIdOrRange) +#define sk_ASIdOrRange_free(st) SKM_sk_free(ASIdOrRange, (st)) +#define sk_ASIdOrRange_num(st) SKM_sk_num(ASIdOrRange, (st)) +#define sk_ASIdOrRange_value(st, i) SKM_sk_value(ASIdOrRange, (st), (i)) +#define sk_ASIdOrRange_set(st, i, val) SKM_sk_set(ASIdOrRange, (st), (i), (val)) +#define sk_ASIdOrRange_zero(st) SKM_sk_zero(ASIdOrRange, (st)) +#define sk_ASIdOrRange_push(st, val) SKM_sk_push(ASIdOrRange, (st), (val)) +#define sk_ASIdOrRange_unshift(st, val) SKM_sk_unshift(ASIdOrRange, (st), (val)) +#define sk_ASIdOrRange_find(st, val) SKM_sk_find(ASIdOrRange, (st), (val)) +#define sk_ASIdOrRange_find_ex(st, val) SKM_sk_find_ex(ASIdOrRange, (st), (val)) +#define sk_ASIdOrRange_delete(st, i) SKM_sk_delete(ASIdOrRange, (st), (i)) +#define sk_ASIdOrRange_delete_ptr(st, ptr) SKM_sk_delete_ptr(ASIdOrRange, (st), (ptr)) +#define sk_ASIdOrRange_insert(st, val, i) SKM_sk_insert(ASIdOrRange, (st), (val), (i)) +#define sk_ASIdOrRange_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASIdOrRange, (st), (cmp)) +#define sk_ASIdOrRange_dup(st) SKM_sk_dup(ASIdOrRange, st) +#define sk_ASIdOrRange_pop_free(st, free_func) SKM_sk_pop_free(ASIdOrRange, (st), (free_func)) +#define sk_ASIdOrRange_shift(st) SKM_sk_shift(ASIdOrRange, (st)) +#define sk_ASIdOrRange_pop(st) SKM_sk_pop(ASIdOrRange, (st)) +#define sk_ASIdOrRange_sort(st) SKM_sk_sort(ASIdOrRange, (st)) +#define sk_ASIdOrRange_is_sorted(st) SKM_sk_is_sorted(ASIdOrRange, (st)) + +#define sk_ASN1_GENERALSTRING_new(cmp) SKM_sk_new(ASN1_GENERALSTRING, (cmp)) +#define sk_ASN1_GENERALSTRING_new_null() 
SKM_sk_new_null(ASN1_GENERALSTRING) +#define sk_ASN1_GENERALSTRING_free(st) SKM_sk_free(ASN1_GENERALSTRING, (st)) +#define sk_ASN1_GENERALSTRING_num(st) SKM_sk_num(ASN1_GENERALSTRING, (st)) +#define sk_ASN1_GENERALSTRING_value(st, i) SKM_sk_value(ASN1_GENERALSTRING, (st), (i)) +#define sk_ASN1_GENERALSTRING_set(st, i, val) SKM_sk_set(ASN1_GENERALSTRING, (st), (i), (val)) +#define sk_ASN1_GENERALSTRING_zero(st) SKM_sk_zero(ASN1_GENERALSTRING, (st)) +#define sk_ASN1_GENERALSTRING_push(st, val) SKM_sk_push(ASN1_GENERALSTRING, (st), (val)) +#define sk_ASN1_GENERALSTRING_unshift(st, val) SKM_sk_unshift(ASN1_GENERALSTRING, (st), (val)) +#define sk_ASN1_GENERALSTRING_find(st, val) SKM_sk_find(ASN1_GENERALSTRING, (st), (val)) +#define sk_ASN1_GENERALSTRING_find_ex(st, val) SKM_sk_find_ex(ASN1_GENERALSTRING, (st), (val)) +#define sk_ASN1_GENERALSTRING_delete(st, i) SKM_sk_delete(ASN1_GENERALSTRING, (st), (i)) +#define sk_ASN1_GENERALSTRING_delete_ptr(st, ptr) SKM_sk_delete_ptr(ASN1_GENERALSTRING, (st), (ptr)) +#define sk_ASN1_GENERALSTRING_insert(st, val, i) SKM_sk_insert(ASN1_GENERALSTRING, (st), (val), (i)) +#define sk_ASN1_GENERALSTRING_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_GENERALSTRING, (st), (cmp)) +#define sk_ASN1_GENERALSTRING_dup(st) SKM_sk_dup(ASN1_GENERALSTRING, st) +#define sk_ASN1_GENERALSTRING_pop_free(st, free_func) SKM_sk_pop_free(ASN1_GENERALSTRING, (st), (free_func)) +#define sk_ASN1_GENERALSTRING_shift(st) SKM_sk_shift(ASN1_GENERALSTRING, (st)) +#define sk_ASN1_GENERALSTRING_pop(st) SKM_sk_pop(ASN1_GENERALSTRING, (st)) +#define sk_ASN1_GENERALSTRING_sort(st) SKM_sk_sort(ASN1_GENERALSTRING, (st)) +#define sk_ASN1_GENERALSTRING_is_sorted(st) SKM_sk_is_sorted(ASN1_GENERALSTRING, (st)) + +#define sk_ASN1_INTEGER_new(cmp) SKM_sk_new(ASN1_INTEGER, (cmp)) +#define sk_ASN1_INTEGER_new_null() SKM_sk_new_null(ASN1_INTEGER) +#define sk_ASN1_INTEGER_free(st) SKM_sk_free(ASN1_INTEGER, (st)) +#define sk_ASN1_INTEGER_num(st) SKM_sk_num(ASN1_INTEGER, (st)) +#define sk_ASN1_INTEGER_value(st, i) SKM_sk_value(ASN1_INTEGER, (st), (i)) +#define sk_ASN1_INTEGER_set(st, i, val) SKM_sk_set(ASN1_INTEGER, (st), (i), (val)) +#define sk_ASN1_INTEGER_zero(st) SKM_sk_zero(ASN1_INTEGER, (st)) +#define sk_ASN1_INTEGER_push(st, val) SKM_sk_push(ASN1_INTEGER, (st), (val)) +#define sk_ASN1_INTEGER_unshift(st, val) SKM_sk_unshift(ASN1_INTEGER, (st), (val)) +#define sk_ASN1_INTEGER_find(st, val) SKM_sk_find(ASN1_INTEGER, (st), (val)) +#define sk_ASN1_INTEGER_find_ex(st, val) SKM_sk_find_ex(ASN1_INTEGER, (st), (val)) +#define sk_ASN1_INTEGER_delete(st, i) SKM_sk_delete(ASN1_INTEGER, (st), (i)) +#define sk_ASN1_INTEGER_delete_ptr(st, ptr) SKM_sk_delete_ptr(ASN1_INTEGER, (st), (ptr)) +#define sk_ASN1_INTEGER_insert(st, val, i) SKM_sk_insert(ASN1_INTEGER, (st), (val), (i)) +#define sk_ASN1_INTEGER_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_INTEGER, (st), (cmp)) +#define sk_ASN1_INTEGER_dup(st) SKM_sk_dup(ASN1_INTEGER, st) +#define sk_ASN1_INTEGER_pop_free(st, free_func) SKM_sk_pop_free(ASN1_INTEGER, (st), (free_func)) +#define sk_ASN1_INTEGER_shift(st) SKM_sk_shift(ASN1_INTEGER, (st)) +#define sk_ASN1_INTEGER_pop(st) SKM_sk_pop(ASN1_INTEGER, (st)) +#define sk_ASN1_INTEGER_sort(st) SKM_sk_sort(ASN1_INTEGER, (st)) +#define sk_ASN1_INTEGER_is_sorted(st) SKM_sk_is_sorted(ASN1_INTEGER, (st)) + +#define sk_ASN1_OBJECT_new(cmp) SKM_sk_new(ASN1_OBJECT, (cmp)) +#define sk_ASN1_OBJECT_new_null() SKM_sk_new_null(ASN1_OBJECT) +#define sk_ASN1_OBJECT_free(st) SKM_sk_free(ASN1_OBJECT, (st)) +#define 
sk_ASN1_OBJECT_num(st) SKM_sk_num(ASN1_OBJECT, (st)) +#define sk_ASN1_OBJECT_value(st, i) SKM_sk_value(ASN1_OBJECT, (st), (i)) +#define sk_ASN1_OBJECT_set(st, i, val) SKM_sk_set(ASN1_OBJECT, (st), (i), (val)) +#define sk_ASN1_OBJECT_zero(st) SKM_sk_zero(ASN1_OBJECT, (st)) +#define sk_ASN1_OBJECT_push(st, val) SKM_sk_push(ASN1_OBJECT, (st), (val)) +#define sk_ASN1_OBJECT_unshift(st, val) SKM_sk_unshift(ASN1_OBJECT, (st), (val)) +#define sk_ASN1_OBJECT_find(st, val) SKM_sk_find(ASN1_OBJECT, (st), (val)) +#define sk_ASN1_OBJECT_find_ex(st, val) SKM_sk_find_ex(ASN1_OBJECT, (st), (val)) +#define sk_ASN1_OBJECT_delete(st, i) SKM_sk_delete(ASN1_OBJECT, (st), (i)) +#define sk_ASN1_OBJECT_delete_ptr(st, ptr) SKM_sk_delete_ptr(ASN1_OBJECT, (st), (ptr)) +#define sk_ASN1_OBJECT_insert(st, val, i) SKM_sk_insert(ASN1_OBJECT, (st), (val), (i)) +#define sk_ASN1_OBJECT_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_OBJECT, (st), (cmp)) +#define sk_ASN1_OBJECT_dup(st) SKM_sk_dup(ASN1_OBJECT, st) +#define sk_ASN1_OBJECT_pop_free(st, free_func) SKM_sk_pop_free(ASN1_OBJECT, (st), (free_func)) +#define sk_ASN1_OBJECT_shift(st) SKM_sk_shift(ASN1_OBJECT, (st)) +#define sk_ASN1_OBJECT_pop(st) SKM_sk_pop(ASN1_OBJECT, (st)) +#define sk_ASN1_OBJECT_sort(st) SKM_sk_sort(ASN1_OBJECT, (st)) +#define sk_ASN1_OBJECT_is_sorted(st) SKM_sk_is_sorted(ASN1_OBJECT, (st)) + +#define sk_ASN1_STRING_TABLE_new(cmp) SKM_sk_new(ASN1_STRING_TABLE, (cmp)) +#define sk_ASN1_STRING_TABLE_new_null() SKM_sk_new_null(ASN1_STRING_TABLE) +#define sk_ASN1_STRING_TABLE_free(st) SKM_sk_free(ASN1_STRING_TABLE, (st)) +#define sk_ASN1_STRING_TABLE_num(st) SKM_sk_num(ASN1_STRING_TABLE, (st)) +#define sk_ASN1_STRING_TABLE_value(st, i) SKM_sk_value(ASN1_STRING_TABLE, (st), (i)) +#define sk_ASN1_STRING_TABLE_set(st, i, val) SKM_sk_set(ASN1_STRING_TABLE, (st), (i), (val)) +#define sk_ASN1_STRING_TABLE_zero(st) SKM_sk_zero(ASN1_STRING_TABLE, (st)) +#define sk_ASN1_STRING_TABLE_push(st, val) SKM_sk_push(ASN1_STRING_TABLE, (st), (val)) +#define sk_ASN1_STRING_TABLE_unshift(st, val) SKM_sk_unshift(ASN1_STRING_TABLE, (st), (val)) +#define sk_ASN1_STRING_TABLE_find(st, val) SKM_sk_find(ASN1_STRING_TABLE, (st), (val)) +#define sk_ASN1_STRING_TABLE_find_ex(st, val) SKM_sk_find_ex(ASN1_STRING_TABLE, (st), (val)) +#define sk_ASN1_STRING_TABLE_delete(st, i) SKM_sk_delete(ASN1_STRING_TABLE, (st), (i)) +#define sk_ASN1_STRING_TABLE_delete_ptr(st, ptr) SKM_sk_delete_ptr(ASN1_STRING_TABLE, (st), (ptr)) +#define sk_ASN1_STRING_TABLE_insert(st, val, i) SKM_sk_insert(ASN1_STRING_TABLE, (st), (val), (i)) +#define sk_ASN1_STRING_TABLE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_STRING_TABLE, (st), (cmp)) +#define sk_ASN1_STRING_TABLE_dup(st) SKM_sk_dup(ASN1_STRING_TABLE, st) +#define sk_ASN1_STRING_TABLE_pop_free(st, free_func) SKM_sk_pop_free(ASN1_STRING_TABLE, (st), (free_func)) +#define sk_ASN1_STRING_TABLE_shift(st) SKM_sk_shift(ASN1_STRING_TABLE, (st)) +#define sk_ASN1_STRING_TABLE_pop(st) SKM_sk_pop(ASN1_STRING_TABLE, (st)) +#define sk_ASN1_STRING_TABLE_sort(st) SKM_sk_sort(ASN1_STRING_TABLE, (st)) +#define sk_ASN1_STRING_TABLE_is_sorted(st) SKM_sk_is_sorted(ASN1_STRING_TABLE, (st)) + +#define sk_ASN1_TYPE_new(cmp) SKM_sk_new(ASN1_TYPE, (cmp)) +#define sk_ASN1_TYPE_new_null() SKM_sk_new_null(ASN1_TYPE) +#define sk_ASN1_TYPE_free(st) SKM_sk_free(ASN1_TYPE, (st)) +#define sk_ASN1_TYPE_num(st) SKM_sk_num(ASN1_TYPE, (st)) +#define sk_ASN1_TYPE_value(st, i) SKM_sk_value(ASN1_TYPE, (st), (i)) +#define sk_ASN1_TYPE_set(st, i, val) SKM_sk_set(ASN1_TYPE, (st), (i), (val)) 
+#define sk_ASN1_TYPE_zero(st) SKM_sk_zero(ASN1_TYPE, (st)) +#define sk_ASN1_TYPE_push(st, val) SKM_sk_push(ASN1_TYPE, (st), (val)) +#define sk_ASN1_TYPE_unshift(st, val) SKM_sk_unshift(ASN1_TYPE, (st), (val)) +#define sk_ASN1_TYPE_find(st, val) SKM_sk_find(ASN1_TYPE, (st), (val)) +#define sk_ASN1_TYPE_find_ex(st, val) SKM_sk_find_ex(ASN1_TYPE, (st), (val)) +#define sk_ASN1_TYPE_delete(st, i) SKM_sk_delete(ASN1_TYPE, (st), (i)) +#define sk_ASN1_TYPE_delete_ptr(st, ptr) SKM_sk_delete_ptr(ASN1_TYPE, (st), (ptr)) +#define sk_ASN1_TYPE_insert(st, val, i) SKM_sk_insert(ASN1_TYPE, (st), (val), (i)) +#define sk_ASN1_TYPE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_TYPE, (st), (cmp)) +#define sk_ASN1_TYPE_dup(st) SKM_sk_dup(ASN1_TYPE, st) +#define sk_ASN1_TYPE_pop_free(st, free_func) SKM_sk_pop_free(ASN1_TYPE, (st), (free_func)) +#define sk_ASN1_TYPE_shift(st) SKM_sk_shift(ASN1_TYPE, (st)) +#define sk_ASN1_TYPE_pop(st) SKM_sk_pop(ASN1_TYPE, (st)) +#define sk_ASN1_TYPE_sort(st) SKM_sk_sort(ASN1_TYPE, (st)) +#define sk_ASN1_TYPE_is_sorted(st) SKM_sk_is_sorted(ASN1_TYPE, (st)) + +#define sk_ASN1_UTF8STRING_new(cmp) SKM_sk_new(ASN1_UTF8STRING, (cmp)) +#define sk_ASN1_UTF8STRING_new_null() SKM_sk_new_null(ASN1_UTF8STRING) +#define sk_ASN1_UTF8STRING_free(st) SKM_sk_free(ASN1_UTF8STRING, (st)) +#define sk_ASN1_UTF8STRING_num(st) SKM_sk_num(ASN1_UTF8STRING, (st)) +#define sk_ASN1_UTF8STRING_value(st, i) SKM_sk_value(ASN1_UTF8STRING, (st), (i)) +#define sk_ASN1_UTF8STRING_set(st, i, val) SKM_sk_set(ASN1_UTF8STRING, (st), (i), (val)) +#define sk_ASN1_UTF8STRING_zero(st) SKM_sk_zero(ASN1_UTF8STRING, (st)) +#define sk_ASN1_UTF8STRING_push(st, val) SKM_sk_push(ASN1_UTF8STRING, (st), (val)) +#define sk_ASN1_UTF8STRING_unshift(st, val) SKM_sk_unshift(ASN1_UTF8STRING, (st), (val)) +#define sk_ASN1_UTF8STRING_find(st, val) SKM_sk_find(ASN1_UTF8STRING, (st), (val)) +#define sk_ASN1_UTF8STRING_find_ex(st, val) SKM_sk_find_ex(ASN1_UTF8STRING, (st), (val)) +#define sk_ASN1_UTF8STRING_delete(st, i) SKM_sk_delete(ASN1_UTF8STRING, (st), (i)) +#define sk_ASN1_UTF8STRING_delete_ptr(st, ptr) SKM_sk_delete_ptr(ASN1_UTF8STRING, (st), (ptr)) +#define sk_ASN1_UTF8STRING_insert(st, val, i) SKM_sk_insert(ASN1_UTF8STRING, (st), (val), (i)) +#define sk_ASN1_UTF8STRING_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_UTF8STRING, (st), (cmp)) +#define sk_ASN1_UTF8STRING_dup(st) SKM_sk_dup(ASN1_UTF8STRING, st) +#define sk_ASN1_UTF8STRING_pop_free(st, free_func) SKM_sk_pop_free(ASN1_UTF8STRING, (st), (free_func)) +#define sk_ASN1_UTF8STRING_shift(st) SKM_sk_shift(ASN1_UTF8STRING, (st)) +#define sk_ASN1_UTF8STRING_pop(st) SKM_sk_pop(ASN1_UTF8STRING, (st)) +#define sk_ASN1_UTF8STRING_sort(st) SKM_sk_sort(ASN1_UTF8STRING, (st)) +#define sk_ASN1_UTF8STRING_is_sorted(st) SKM_sk_is_sorted(ASN1_UTF8STRING, (st)) + +#define sk_ASN1_VALUE_new(cmp) SKM_sk_new(ASN1_VALUE, (cmp)) +#define sk_ASN1_VALUE_new_null() SKM_sk_new_null(ASN1_VALUE) +#define sk_ASN1_VALUE_free(st) SKM_sk_free(ASN1_VALUE, (st)) +#define sk_ASN1_VALUE_num(st) SKM_sk_num(ASN1_VALUE, (st)) +#define sk_ASN1_VALUE_value(st, i) SKM_sk_value(ASN1_VALUE, (st), (i)) +#define sk_ASN1_VALUE_set(st, i, val) SKM_sk_set(ASN1_VALUE, (st), (i), (val)) +#define sk_ASN1_VALUE_zero(st) SKM_sk_zero(ASN1_VALUE, (st)) +#define sk_ASN1_VALUE_push(st, val) SKM_sk_push(ASN1_VALUE, (st), (val)) +#define sk_ASN1_VALUE_unshift(st, val) SKM_sk_unshift(ASN1_VALUE, (st), (val)) +#define sk_ASN1_VALUE_find(st, val) SKM_sk_find(ASN1_VALUE, (st), (val)) +#define sk_ASN1_VALUE_find_ex(st, val) 
SKM_sk_find_ex(ASN1_VALUE, (st), (val)) +#define sk_ASN1_VALUE_delete(st, i) SKM_sk_delete(ASN1_VALUE, (st), (i)) +#define sk_ASN1_VALUE_delete_ptr(st, ptr) SKM_sk_delete_ptr(ASN1_VALUE, (st), (ptr)) +#define sk_ASN1_VALUE_insert(st, val, i) SKM_sk_insert(ASN1_VALUE, (st), (val), (i)) +#define sk_ASN1_VALUE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ASN1_VALUE, (st), (cmp)) +#define sk_ASN1_VALUE_dup(st) SKM_sk_dup(ASN1_VALUE, st) +#define sk_ASN1_VALUE_pop_free(st, free_func) SKM_sk_pop_free(ASN1_VALUE, (st), (free_func)) +#define sk_ASN1_VALUE_shift(st) SKM_sk_shift(ASN1_VALUE, (st)) +#define sk_ASN1_VALUE_pop(st) SKM_sk_pop(ASN1_VALUE, (st)) +#define sk_ASN1_VALUE_sort(st) SKM_sk_sort(ASN1_VALUE, (st)) +#define sk_ASN1_VALUE_is_sorted(st) SKM_sk_is_sorted(ASN1_VALUE, (st)) + +#define sk_BIO_new(cmp) SKM_sk_new(BIO, (cmp)) +#define sk_BIO_new_null() SKM_sk_new_null(BIO) +#define sk_BIO_free(st) SKM_sk_free(BIO, (st)) +#define sk_BIO_num(st) SKM_sk_num(BIO, (st)) +#define sk_BIO_value(st, i) SKM_sk_value(BIO, (st), (i)) +#define sk_BIO_set(st, i, val) SKM_sk_set(BIO, (st), (i), (val)) +#define sk_BIO_zero(st) SKM_sk_zero(BIO, (st)) +#define sk_BIO_push(st, val) SKM_sk_push(BIO, (st), (val)) +#define sk_BIO_unshift(st, val) SKM_sk_unshift(BIO, (st), (val)) +#define sk_BIO_find(st, val) SKM_sk_find(BIO, (st), (val)) +#define sk_BIO_find_ex(st, val) SKM_sk_find_ex(BIO, (st), (val)) +#define sk_BIO_delete(st, i) SKM_sk_delete(BIO, (st), (i)) +#define sk_BIO_delete_ptr(st, ptr) SKM_sk_delete_ptr(BIO, (st), (ptr)) +#define sk_BIO_insert(st, val, i) SKM_sk_insert(BIO, (st), (val), (i)) +#define sk_BIO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(BIO, (st), (cmp)) +#define sk_BIO_dup(st) SKM_sk_dup(BIO, st) +#define sk_BIO_pop_free(st, free_func) SKM_sk_pop_free(BIO, (st), (free_func)) +#define sk_BIO_shift(st) SKM_sk_shift(BIO, (st)) +#define sk_BIO_pop(st) SKM_sk_pop(BIO, (st)) +#define sk_BIO_sort(st) SKM_sk_sort(BIO, (st)) +#define sk_BIO_is_sorted(st) SKM_sk_is_sorted(BIO, (st)) + +#define sk_BY_DIR_ENTRY_new(cmp) SKM_sk_new(BY_DIR_ENTRY, (cmp)) +#define sk_BY_DIR_ENTRY_new_null() SKM_sk_new_null(BY_DIR_ENTRY) +#define sk_BY_DIR_ENTRY_free(st) SKM_sk_free(BY_DIR_ENTRY, (st)) +#define sk_BY_DIR_ENTRY_num(st) SKM_sk_num(BY_DIR_ENTRY, (st)) +#define sk_BY_DIR_ENTRY_value(st, i) SKM_sk_value(BY_DIR_ENTRY, (st), (i)) +#define sk_BY_DIR_ENTRY_set(st, i, val) SKM_sk_set(BY_DIR_ENTRY, (st), (i), (val)) +#define sk_BY_DIR_ENTRY_zero(st) SKM_sk_zero(BY_DIR_ENTRY, (st)) +#define sk_BY_DIR_ENTRY_push(st, val) SKM_sk_push(BY_DIR_ENTRY, (st), (val)) +#define sk_BY_DIR_ENTRY_unshift(st, val) SKM_sk_unshift(BY_DIR_ENTRY, (st), (val)) +#define sk_BY_DIR_ENTRY_find(st, val) SKM_sk_find(BY_DIR_ENTRY, (st), (val)) +#define sk_BY_DIR_ENTRY_find_ex(st, val) SKM_sk_find_ex(BY_DIR_ENTRY, (st), (val)) +#define sk_BY_DIR_ENTRY_delete(st, i) SKM_sk_delete(BY_DIR_ENTRY, (st), (i)) +#define sk_BY_DIR_ENTRY_delete_ptr(st, ptr) SKM_sk_delete_ptr(BY_DIR_ENTRY, (st), (ptr)) +#define sk_BY_DIR_ENTRY_insert(st, val, i) SKM_sk_insert(BY_DIR_ENTRY, (st), (val), (i)) +#define sk_BY_DIR_ENTRY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(BY_DIR_ENTRY, (st), (cmp)) +#define sk_BY_DIR_ENTRY_dup(st) SKM_sk_dup(BY_DIR_ENTRY, st) +#define sk_BY_DIR_ENTRY_pop_free(st, free_func) SKM_sk_pop_free(BY_DIR_ENTRY, (st), (free_func)) +#define sk_BY_DIR_ENTRY_shift(st) SKM_sk_shift(BY_DIR_ENTRY, (st)) +#define sk_BY_DIR_ENTRY_pop(st) SKM_sk_pop(BY_DIR_ENTRY, (st)) +#define sk_BY_DIR_ENTRY_sort(st) SKM_sk_sort(BY_DIR_ENTRY, (st)) +#define 
sk_BY_DIR_ENTRY_is_sorted(st) SKM_sk_is_sorted(BY_DIR_ENTRY, (st)) + +#define sk_BY_DIR_HASH_new(cmp) SKM_sk_new(BY_DIR_HASH, (cmp)) +#define sk_BY_DIR_HASH_new_null() SKM_sk_new_null(BY_DIR_HASH) +#define sk_BY_DIR_HASH_free(st) SKM_sk_free(BY_DIR_HASH, (st)) +#define sk_BY_DIR_HASH_num(st) SKM_sk_num(BY_DIR_HASH, (st)) +#define sk_BY_DIR_HASH_value(st, i) SKM_sk_value(BY_DIR_HASH, (st), (i)) +#define sk_BY_DIR_HASH_set(st, i, val) SKM_sk_set(BY_DIR_HASH, (st), (i), (val)) +#define sk_BY_DIR_HASH_zero(st) SKM_sk_zero(BY_DIR_HASH, (st)) +#define sk_BY_DIR_HASH_push(st, val) SKM_sk_push(BY_DIR_HASH, (st), (val)) +#define sk_BY_DIR_HASH_unshift(st, val) SKM_sk_unshift(BY_DIR_HASH, (st), (val)) +#define sk_BY_DIR_HASH_find(st, val) SKM_sk_find(BY_DIR_HASH, (st), (val)) +#define sk_BY_DIR_HASH_find_ex(st, val) SKM_sk_find_ex(BY_DIR_HASH, (st), (val)) +#define sk_BY_DIR_HASH_delete(st, i) SKM_sk_delete(BY_DIR_HASH, (st), (i)) +#define sk_BY_DIR_HASH_delete_ptr(st, ptr) SKM_sk_delete_ptr(BY_DIR_HASH, (st), (ptr)) +#define sk_BY_DIR_HASH_insert(st, val, i) SKM_sk_insert(BY_DIR_HASH, (st), (val), (i)) +#define sk_BY_DIR_HASH_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(BY_DIR_HASH, (st), (cmp)) +#define sk_BY_DIR_HASH_dup(st) SKM_sk_dup(BY_DIR_HASH, st) +#define sk_BY_DIR_HASH_pop_free(st, free_func) SKM_sk_pop_free(BY_DIR_HASH, (st), (free_func)) +#define sk_BY_DIR_HASH_shift(st) SKM_sk_shift(BY_DIR_HASH, (st)) +#define sk_BY_DIR_HASH_pop(st) SKM_sk_pop(BY_DIR_HASH, (st)) +#define sk_BY_DIR_HASH_sort(st) SKM_sk_sort(BY_DIR_HASH, (st)) +#define sk_BY_DIR_HASH_is_sorted(st) SKM_sk_is_sorted(BY_DIR_HASH, (st)) + +#define sk_CMS_CertificateChoices_new(cmp) SKM_sk_new(CMS_CertificateChoices, (cmp)) +#define sk_CMS_CertificateChoices_new_null() SKM_sk_new_null(CMS_CertificateChoices) +#define sk_CMS_CertificateChoices_free(st) SKM_sk_free(CMS_CertificateChoices, (st)) +#define sk_CMS_CertificateChoices_num(st) SKM_sk_num(CMS_CertificateChoices, (st)) +#define sk_CMS_CertificateChoices_value(st, i) SKM_sk_value(CMS_CertificateChoices, (st), (i)) +#define sk_CMS_CertificateChoices_set(st, i, val) SKM_sk_set(CMS_CertificateChoices, (st), (i), (val)) +#define sk_CMS_CertificateChoices_zero(st) SKM_sk_zero(CMS_CertificateChoices, (st)) +#define sk_CMS_CertificateChoices_push(st, val) SKM_sk_push(CMS_CertificateChoices, (st), (val)) +#define sk_CMS_CertificateChoices_unshift(st, val) SKM_sk_unshift(CMS_CertificateChoices, (st), (val)) +#define sk_CMS_CertificateChoices_find(st, val) SKM_sk_find(CMS_CertificateChoices, (st), (val)) +#define sk_CMS_CertificateChoices_find_ex(st, val) SKM_sk_find_ex(CMS_CertificateChoices, (st), (val)) +#define sk_CMS_CertificateChoices_delete(st, i) SKM_sk_delete(CMS_CertificateChoices, (st), (i)) +#define sk_CMS_CertificateChoices_delete_ptr(st, ptr) SKM_sk_delete_ptr(CMS_CertificateChoices, (st), (ptr)) +#define sk_CMS_CertificateChoices_insert(st, val, i) SKM_sk_insert(CMS_CertificateChoices, (st), (val), (i)) +#define sk_CMS_CertificateChoices_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CMS_CertificateChoices, (st), (cmp)) +#define sk_CMS_CertificateChoices_dup(st) SKM_sk_dup(CMS_CertificateChoices, st) +#define sk_CMS_CertificateChoices_pop_free(st, free_func) SKM_sk_pop_free(CMS_CertificateChoices, (st), (free_func)) +#define sk_CMS_CertificateChoices_shift(st) SKM_sk_shift(CMS_CertificateChoices, (st)) +#define sk_CMS_CertificateChoices_pop(st) SKM_sk_pop(CMS_CertificateChoices, (st)) +#define sk_CMS_CertificateChoices_sort(st) SKM_sk_sort(CMS_CertificateChoices, (st)) 
+#define sk_CMS_CertificateChoices_is_sorted(st) SKM_sk_is_sorted(CMS_CertificateChoices, (st)) + +#define sk_CMS_RecipientInfo_new(cmp) SKM_sk_new(CMS_RecipientInfo, (cmp)) +#define sk_CMS_RecipientInfo_new_null() SKM_sk_new_null(CMS_RecipientInfo) +#define sk_CMS_RecipientInfo_free(st) SKM_sk_free(CMS_RecipientInfo, (st)) +#define sk_CMS_RecipientInfo_num(st) SKM_sk_num(CMS_RecipientInfo, (st)) +#define sk_CMS_RecipientInfo_value(st, i) SKM_sk_value(CMS_RecipientInfo, (st), (i)) +#define sk_CMS_RecipientInfo_set(st, i, val) SKM_sk_set(CMS_RecipientInfo, (st), (i), (val)) +#define sk_CMS_RecipientInfo_zero(st) SKM_sk_zero(CMS_RecipientInfo, (st)) +#define sk_CMS_RecipientInfo_push(st, val) SKM_sk_push(CMS_RecipientInfo, (st), (val)) +#define sk_CMS_RecipientInfo_unshift(st, val) SKM_sk_unshift(CMS_RecipientInfo, (st), (val)) +#define sk_CMS_RecipientInfo_find(st, val) SKM_sk_find(CMS_RecipientInfo, (st), (val)) +#define sk_CMS_RecipientInfo_find_ex(st, val) SKM_sk_find_ex(CMS_RecipientInfo, (st), (val)) +#define sk_CMS_RecipientInfo_delete(st, i) SKM_sk_delete(CMS_RecipientInfo, (st), (i)) +#define sk_CMS_RecipientInfo_delete_ptr(st, ptr) SKM_sk_delete_ptr(CMS_RecipientInfo, (st), (ptr)) +#define sk_CMS_RecipientInfo_insert(st, val, i) SKM_sk_insert(CMS_RecipientInfo, (st), (val), (i)) +#define sk_CMS_RecipientInfo_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CMS_RecipientInfo, (st), (cmp)) +#define sk_CMS_RecipientInfo_dup(st) SKM_sk_dup(CMS_RecipientInfo, st) +#define sk_CMS_RecipientInfo_pop_free(st, free_func) SKM_sk_pop_free(CMS_RecipientInfo, (st), (free_func)) +#define sk_CMS_RecipientInfo_shift(st) SKM_sk_shift(CMS_RecipientInfo, (st)) +#define sk_CMS_RecipientInfo_pop(st) SKM_sk_pop(CMS_RecipientInfo, (st)) +#define sk_CMS_RecipientInfo_sort(st) SKM_sk_sort(CMS_RecipientInfo, (st)) +#define sk_CMS_RecipientInfo_is_sorted(st) SKM_sk_is_sorted(CMS_RecipientInfo, (st)) + +#define sk_CMS_RevocationInfoChoice_new(cmp) SKM_sk_new(CMS_RevocationInfoChoice, (cmp)) +#define sk_CMS_RevocationInfoChoice_new_null() SKM_sk_new_null(CMS_RevocationInfoChoice) +#define sk_CMS_RevocationInfoChoice_free(st) SKM_sk_free(CMS_RevocationInfoChoice, (st)) +#define sk_CMS_RevocationInfoChoice_num(st) SKM_sk_num(CMS_RevocationInfoChoice, (st)) +#define sk_CMS_RevocationInfoChoice_value(st, i) SKM_sk_value(CMS_RevocationInfoChoice, (st), (i)) +#define sk_CMS_RevocationInfoChoice_set(st, i, val) SKM_sk_set(CMS_RevocationInfoChoice, (st), (i), (val)) +#define sk_CMS_RevocationInfoChoice_zero(st) SKM_sk_zero(CMS_RevocationInfoChoice, (st)) +#define sk_CMS_RevocationInfoChoice_push(st, val) SKM_sk_push(CMS_RevocationInfoChoice, (st), (val)) +#define sk_CMS_RevocationInfoChoice_unshift(st, val) SKM_sk_unshift(CMS_RevocationInfoChoice, (st), (val)) +#define sk_CMS_RevocationInfoChoice_find(st, val) SKM_sk_find(CMS_RevocationInfoChoice, (st), (val)) +#define sk_CMS_RevocationInfoChoice_find_ex(st, val) SKM_sk_find_ex(CMS_RevocationInfoChoice, (st), (val)) +#define sk_CMS_RevocationInfoChoice_delete(st, i) SKM_sk_delete(CMS_RevocationInfoChoice, (st), (i)) +#define sk_CMS_RevocationInfoChoice_delete_ptr(st, ptr) SKM_sk_delete_ptr(CMS_RevocationInfoChoice, (st), (ptr)) +#define sk_CMS_RevocationInfoChoice_insert(st, val, i) SKM_sk_insert(CMS_RevocationInfoChoice, (st), (val), (i)) +#define sk_CMS_RevocationInfoChoice_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CMS_RevocationInfoChoice, (st), (cmp)) +#define sk_CMS_RevocationInfoChoice_dup(st) SKM_sk_dup(CMS_RevocationInfoChoice, st) +#define 
sk_CMS_RevocationInfoChoice_pop_free(st, free_func) SKM_sk_pop_free(CMS_RevocationInfoChoice, (st), (free_func)) +#define sk_CMS_RevocationInfoChoice_shift(st) SKM_sk_shift(CMS_RevocationInfoChoice, (st)) +#define sk_CMS_RevocationInfoChoice_pop(st) SKM_sk_pop(CMS_RevocationInfoChoice, (st)) +#define sk_CMS_RevocationInfoChoice_sort(st) SKM_sk_sort(CMS_RevocationInfoChoice, (st)) +#define sk_CMS_RevocationInfoChoice_is_sorted(st) SKM_sk_is_sorted(CMS_RevocationInfoChoice, (st)) + +#define sk_CMS_SignerInfo_new(cmp) SKM_sk_new(CMS_SignerInfo, (cmp)) +#define sk_CMS_SignerInfo_new_null() SKM_sk_new_null(CMS_SignerInfo) +#define sk_CMS_SignerInfo_free(st) SKM_sk_free(CMS_SignerInfo, (st)) +#define sk_CMS_SignerInfo_num(st) SKM_sk_num(CMS_SignerInfo, (st)) +#define sk_CMS_SignerInfo_value(st, i) SKM_sk_value(CMS_SignerInfo, (st), (i)) +#define sk_CMS_SignerInfo_set(st, i, val) SKM_sk_set(CMS_SignerInfo, (st), (i), (val)) +#define sk_CMS_SignerInfo_zero(st) SKM_sk_zero(CMS_SignerInfo, (st)) +#define sk_CMS_SignerInfo_push(st, val) SKM_sk_push(CMS_SignerInfo, (st), (val)) +#define sk_CMS_SignerInfo_unshift(st, val) SKM_sk_unshift(CMS_SignerInfo, (st), (val)) +#define sk_CMS_SignerInfo_find(st, val) SKM_sk_find(CMS_SignerInfo, (st), (val)) +#define sk_CMS_SignerInfo_find_ex(st, val) SKM_sk_find_ex(CMS_SignerInfo, (st), (val)) +#define sk_CMS_SignerInfo_delete(st, i) SKM_sk_delete(CMS_SignerInfo, (st), (i)) +#define sk_CMS_SignerInfo_delete_ptr(st, ptr) SKM_sk_delete_ptr(CMS_SignerInfo, (st), (ptr)) +#define sk_CMS_SignerInfo_insert(st, val, i) SKM_sk_insert(CMS_SignerInfo, (st), (val), (i)) +#define sk_CMS_SignerInfo_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CMS_SignerInfo, (st), (cmp)) +#define sk_CMS_SignerInfo_dup(st) SKM_sk_dup(CMS_SignerInfo, st) +#define sk_CMS_SignerInfo_pop_free(st, free_func) SKM_sk_pop_free(CMS_SignerInfo, (st), (free_func)) +#define sk_CMS_SignerInfo_shift(st) SKM_sk_shift(CMS_SignerInfo, (st)) +#define sk_CMS_SignerInfo_pop(st) SKM_sk_pop(CMS_SignerInfo, (st)) +#define sk_CMS_SignerInfo_sort(st) SKM_sk_sort(CMS_SignerInfo, (st)) +#define sk_CMS_SignerInfo_is_sorted(st) SKM_sk_is_sorted(CMS_SignerInfo, (st)) + +#define sk_CONF_IMODULE_new(cmp) SKM_sk_new(CONF_IMODULE, (cmp)) +#define sk_CONF_IMODULE_new_null() SKM_sk_new_null(CONF_IMODULE) +#define sk_CONF_IMODULE_free(st) SKM_sk_free(CONF_IMODULE, (st)) +#define sk_CONF_IMODULE_num(st) SKM_sk_num(CONF_IMODULE, (st)) +#define sk_CONF_IMODULE_value(st, i) SKM_sk_value(CONF_IMODULE, (st), (i)) +#define sk_CONF_IMODULE_set(st, i, val) SKM_sk_set(CONF_IMODULE, (st), (i), (val)) +#define sk_CONF_IMODULE_zero(st) SKM_sk_zero(CONF_IMODULE, (st)) +#define sk_CONF_IMODULE_push(st, val) SKM_sk_push(CONF_IMODULE, (st), (val)) +#define sk_CONF_IMODULE_unshift(st, val) SKM_sk_unshift(CONF_IMODULE, (st), (val)) +#define sk_CONF_IMODULE_find(st, val) SKM_sk_find(CONF_IMODULE, (st), (val)) +#define sk_CONF_IMODULE_find_ex(st, val) SKM_sk_find_ex(CONF_IMODULE, (st), (val)) +#define sk_CONF_IMODULE_delete(st, i) SKM_sk_delete(CONF_IMODULE, (st), (i)) +#define sk_CONF_IMODULE_delete_ptr(st, ptr) SKM_sk_delete_ptr(CONF_IMODULE, (st), (ptr)) +#define sk_CONF_IMODULE_insert(st, val, i) SKM_sk_insert(CONF_IMODULE, (st), (val), (i)) +#define sk_CONF_IMODULE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CONF_IMODULE, (st), (cmp)) +#define sk_CONF_IMODULE_dup(st) SKM_sk_dup(CONF_IMODULE, st) +#define sk_CONF_IMODULE_pop_free(st, free_func) SKM_sk_pop_free(CONF_IMODULE, (st), (free_func)) +#define sk_CONF_IMODULE_shift(st) SKM_sk_shift(CONF_IMODULE, 
(st)) +#define sk_CONF_IMODULE_pop(st) SKM_sk_pop(CONF_IMODULE, (st)) +#define sk_CONF_IMODULE_sort(st) SKM_sk_sort(CONF_IMODULE, (st)) +#define sk_CONF_IMODULE_is_sorted(st) SKM_sk_is_sorted(CONF_IMODULE, (st)) + +#define sk_CONF_MODULE_new(cmp) SKM_sk_new(CONF_MODULE, (cmp)) +#define sk_CONF_MODULE_new_null() SKM_sk_new_null(CONF_MODULE) +#define sk_CONF_MODULE_free(st) SKM_sk_free(CONF_MODULE, (st)) +#define sk_CONF_MODULE_num(st) SKM_sk_num(CONF_MODULE, (st)) +#define sk_CONF_MODULE_value(st, i) SKM_sk_value(CONF_MODULE, (st), (i)) +#define sk_CONF_MODULE_set(st, i, val) SKM_sk_set(CONF_MODULE, (st), (i), (val)) +#define sk_CONF_MODULE_zero(st) SKM_sk_zero(CONF_MODULE, (st)) +#define sk_CONF_MODULE_push(st, val) SKM_sk_push(CONF_MODULE, (st), (val)) +#define sk_CONF_MODULE_unshift(st, val) SKM_sk_unshift(CONF_MODULE, (st), (val)) +#define sk_CONF_MODULE_find(st, val) SKM_sk_find(CONF_MODULE, (st), (val)) +#define sk_CONF_MODULE_find_ex(st, val) SKM_sk_find_ex(CONF_MODULE, (st), (val)) +#define sk_CONF_MODULE_delete(st, i) SKM_sk_delete(CONF_MODULE, (st), (i)) +#define sk_CONF_MODULE_delete_ptr(st, ptr) SKM_sk_delete_ptr(CONF_MODULE, (st), (ptr)) +#define sk_CONF_MODULE_insert(st, val, i) SKM_sk_insert(CONF_MODULE, (st), (val), (i)) +#define sk_CONF_MODULE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CONF_MODULE, (st), (cmp)) +#define sk_CONF_MODULE_dup(st) SKM_sk_dup(CONF_MODULE, st) +#define sk_CONF_MODULE_pop_free(st, free_func) SKM_sk_pop_free(CONF_MODULE, (st), (free_func)) +#define sk_CONF_MODULE_shift(st) SKM_sk_shift(CONF_MODULE, (st)) +#define sk_CONF_MODULE_pop(st) SKM_sk_pop(CONF_MODULE, (st)) +#define sk_CONF_MODULE_sort(st) SKM_sk_sort(CONF_MODULE, (st)) +#define sk_CONF_MODULE_is_sorted(st) SKM_sk_is_sorted(CONF_MODULE, (st)) + +#define sk_CONF_VALUE_new(cmp) SKM_sk_new(CONF_VALUE, (cmp)) +#define sk_CONF_VALUE_new_null() SKM_sk_new_null(CONF_VALUE) +#define sk_CONF_VALUE_free(st) SKM_sk_free(CONF_VALUE, (st)) +#define sk_CONF_VALUE_num(st) SKM_sk_num(CONF_VALUE, (st)) +#define sk_CONF_VALUE_value(st, i) SKM_sk_value(CONF_VALUE, (st), (i)) +#define sk_CONF_VALUE_set(st, i, val) SKM_sk_set(CONF_VALUE, (st), (i), (val)) +#define sk_CONF_VALUE_zero(st) SKM_sk_zero(CONF_VALUE, (st)) +#define sk_CONF_VALUE_push(st, val) SKM_sk_push(CONF_VALUE, (st), (val)) +#define sk_CONF_VALUE_unshift(st, val) SKM_sk_unshift(CONF_VALUE, (st), (val)) +#define sk_CONF_VALUE_find(st, val) SKM_sk_find(CONF_VALUE, (st), (val)) +#define sk_CONF_VALUE_find_ex(st, val) SKM_sk_find_ex(CONF_VALUE, (st), (val)) +#define sk_CONF_VALUE_delete(st, i) SKM_sk_delete(CONF_VALUE, (st), (i)) +#define sk_CONF_VALUE_delete_ptr(st, ptr) SKM_sk_delete_ptr(CONF_VALUE, (st), (ptr)) +#define sk_CONF_VALUE_insert(st, val, i) SKM_sk_insert(CONF_VALUE, (st), (val), (i)) +#define sk_CONF_VALUE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CONF_VALUE, (st), (cmp)) +#define sk_CONF_VALUE_dup(st) SKM_sk_dup(CONF_VALUE, st) +#define sk_CONF_VALUE_pop_free(st, free_func) SKM_sk_pop_free(CONF_VALUE, (st), (free_func)) +#define sk_CONF_VALUE_shift(st) SKM_sk_shift(CONF_VALUE, (st)) +#define sk_CONF_VALUE_pop(st) SKM_sk_pop(CONF_VALUE, (st)) +#define sk_CONF_VALUE_sort(st) SKM_sk_sort(CONF_VALUE, (st)) +#define sk_CONF_VALUE_is_sorted(st) SKM_sk_is_sorted(CONF_VALUE, (st)) + +#define sk_CRYPTO_EX_DATA_FUNCS_new(cmp) SKM_sk_new(CRYPTO_EX_DATA_FUNCS, (cmp)) +#define sk_CRYPTO_EX_DATA_FUNCS_new_null() SKM_sk_new_null(CRYPTO_EX_DATA_FUNCS) +#define sk_CRYPTO_EX_DATA_FUNCS_free(st) SKM_sk_free(CRYPTO_EX_DATA_FUNCS, (st)) +#define 
sk_CRYPTO_EX_DATA_FUNCS_num(st) SKM_sk_num(CRYPTO_EX_DATA_FUNCS, (st)) +#define sk_CRYPTO_EX_DATA_FUNCS_value(st, i) SKM_sk_value(CRYPTO_EX_DATA_FUNCS, (st), (i)) +#define sk_CRYPTO_EX_DATA_FUNCS_set(st, i, val) SKM_sk_set(CRYPTO_EX_DATA_FUNCS, (st), (i), (val)) +#define sk_CRYPTO_EX_DATA_FUNCS_zero(st) SKM_sk_zero(CRYPTO_EX_DATA_FUNCS, (st)) +#define sk_CRYPTO_EX_DATA_FUNCS_push(st, val) SKM_sk_push(CRYPTO_EX_DATA_FUNCS, (st), (val)) +#define sk_CRYPTO_EX_DATA_FUNCS_unshift(st, val) SKM_sk_unshift(CRYPTO_EX_DATA_FUNCS, (st), (val)) +#define sk_CRYPTO_EX_DATA_FUNCS_find(st, val) SKM_sk_find(CRYPTO_EX_DATA_FUNCS, (st), (val)) +#define sk_CRYPTO_EX_DATA_FUNCS_find_ex(st, val) SKM_sk_find_ex(CRYPTO_EX_DATA_FUNCS, (st), (val)) +#define sk_CRYPTO_EX_DATA_FUNCS_delete(st, i) SKM_sk_delete(CRYPTO_EX_DATA_FUNCS, (st), (i)) +#define sk_CRYPTO_EX_DATA_FUNCS_delete_ptr(st, ptr) SKM_sk_delete_ptr(CRYPTO_EX_DATA_FUNCS, (st), (ptr)) +#define sk_CRYPTO_EX_DATA_FUNCS_insert(st, val, i) SKM_sk_insert(CRYPTO_EX_DATA_FUNCS, (st), (val), (i)) +#define sk_CRYPTO_EX_DATA_FUNCS_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CRYPTO_EX_DATA_FUNCS, (st), (cmp)) +#define sk_CRYPTO_EX_DATA_FUNCS_dup(st) SKM_sk_dup(CRYPTO_EX_DATA_FUNCS, st) +#define sk_CRYPTO_EX_DATA_FUNCS_pop_free(st, free_func) SKM_sk_pop_free(CRYPTO_EX_DATA_FUNCS, (st), (free_func)) +#define sk_CRYPTO_EX_DATA_FUNCS_shift(st) SKM_sk_shift(CRYPTO_EX_DATA_FUNCS, (st)) +#define sk_CRYPTO_EX_DATA_FUNCS_pop(st) SKM_sk_pop(CRYPTO_EX_DATA_FUNCS, (st)) +#define sk_CRYPTO_EX_DATA_FUNCS_sort(st) SKM_sk_sort(CRYPTO_EX_DATA_FUNCS, (st)) +#define sk_CRYPTO_EX_DATA_FUNCS_is_sorted(st) SKM_sk_is_sorted(CRYPTO_EX_DATA_FUNCS, (st)) + +#define sk_CRYPTO_dynlock_new(cmp) SKM_sk_new(CRYPTO_dynlock, (cmp)) +#define sk_CRYPTO_dynlock_new_null() SKM_sk_new_null(CRYPTO_dynlock) +#define sk_CRYPTO_dynlock_free(st) SKM_sk_free(CRYPTO_dynlock, (st)) +#define sk_CRYPTO_dynlock_num(st) SKM_sk_num(CRYPTO_dynlock, (st)) +#define sk_CRYPTO_dynlock_value(st, i) SKM_sk_value(CRYPTO_dynlock, (st), (i)) +#define sk_CRYPTO_dynlock_set(st, i, val) SKM_sk_set(CRYPTO_dynlock, (st), (i), (val)) +#define sk_CRYPTO_dynlock_zero(st) SKM_sk_zero(CRYPTO_dynlock, (st)) +#define sk_CRYPTO_dynlock_push(st, val) SKM_sk_push(CRYPTO_dynlock, (st), (val)) +#define sk_CRYPTO_dynlock_unshift(st, val) SKM_sk_unshift(CRYPTO_dynlock, (st), (val)) +#define sk_CRYPTO_dynlock_find(st, val) SKM_sk_find(CRYPTO_dynlock, (st), (val)) +#define sk_CRYPTO_dynlock_find_ex(st, val) SKM_sk_find_ex(CRYPTO_dynlock, (st), (val)) +#define sk_CRYPTO_dynlock_delete(st, i) SKM_sk_delete(CRYPTO_dynlock, (st), (i)) +#define sk_CRYPTO_dynlock_delete_ptr(st, ptr) SKM_sk_delete_ptr(CRYPTO_dynlock, (st), (ptr)) +#define sk_CRYPTO_dynlock_insert(st, val, i) SKM_sk_insert(CRYPTO_dynlock, (st), (val), (i)) +#define sk_CRYPTO_dynlock_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(CRYPTO_dynlock, (st), (cmp)) +#define sk_CRYPTO_dynlock_dup(st) SKM_sk_dup(CRYPTO_dynlock, st) +#define sk_CRYPTO_dynlock_pop_free(st, free_func) SKM_sk_pop_free(CRYPTO_dynlock, (st), (free_func)) +#define sk_CRYPTO_dynlock_shift(st) SKM_sk_shift(CRYPTO_dynlock, (st)) +#define sk_CRYPTO_dynlock_pop(st) SKM_sk_pop(CRYPTO_dynlock, (st)) +#define sk_CRYPTO_dynlock_sort(st) SKM_sk_sort(CRYPTO_dynlock, (st)) +#define sk_CRYPTO_dynlock_is_sorted(st) SKM_sk_is_sorted(CRYPTO_dynlock, (st)) + +#define sk_DIST_POINT_new(cmp) SKM_sk_new(DIST_POINT, (cmp)) +#define sk_DIST_POINT_new_null() SKM_sk_new_null(DIST_POINT) +#define sk_DIST_POINT_free(st) SKM_sk_free(DIST_POINT, (st)) 
+#define sk_DIST_POINT_num(st) SKM_sk_num(DIST_POINT, (st)) +#define sk_DIST_POINT_value(st, i) SKM_sk_value(DIST_POINT, (st), (i)) +#define sk_DIST_POINT_set(st, i, val) SKM_sk_set(DIST_POINT, (st), (i), (val)) +#define sk_DIST_POINT_zero(st) SKM_sk_zero(DIST_POINT, (st)) +#define sk_DIST_POINT_push(st, val) SKM_sk_push(DIST_POINT, (st), (val)) +#define sk_DIST_POINT_unshift(st, val) SKM_sk_unshift(DIST_POINT, (st), (val)) +#define sk_DIST_POINT_find(st, val) SKM_sk_find(DIST_POINT, (st), (val)) +#define sk_DIST_POINT_find_ex(st, val) SKM_sk_find_ex(DIST_POINT, (st), (val)) +#define sk_DIST_POINT_delete(st, i) SKM_sk_delete(DIST_POINT, (st), (i)) +#define sk_DIST_POINT_delete_ptr(st, ptr) SKM_sk_delete_ptr(DIST_POINT, (st), (ptr)) +#define sk_DIST_POINT_insert(st, val, i) SKM_sk_insert(DIST_POINT, (st), (val), (i)) +#define sk_DIST_POINT_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(DIST_POINT, (st), (cmp)) +#define sk_DIST_POINT_dup(st) SKM_sk_dup(DIST_POINT, st) +#define sk_DIST_POINT_pop_free(st, free_func) SKM_sk_pop_free(DIST_POINT, (st), (free_func)) +#define sk_DIST_POINT_shift(st) SKM_sk_shift(DIST_POINT, (st)) +#define sk_DIST_POINT_pop(st) SKM_sk_pop(DIST_POINT, (st)) +#define sk_DIST_POINT_sort(st) SKM_sk_sort(DIST_POINT, (st)) +#define sk_DIST_POINT_is_sorted(st) SKM_sk_is_sorted(DIST_POINT, (st)) + +#define sk_ENGINE_new(cmp) SKM_sk_new(ENGINE, (cmp)) +#define sk_ENGINE_new_null() SKM_sk_new_null(ENGINE) +#define sk_ENGINE_free(st) SKM_sk_free(ENGINE, (st)) +#define sk_ENGINE_num(st) SKM_sk_num(ENGINE, (st)) +#define sk_ENGINE_value(st, i) SKM_sk_value(ENGINE, (st), (i)) +#define sk_ENGINE_set(st, i, val) SKM_sk_set(ENGINE, (st), (i), (val)) +#define sk_ENGINE_zero(st) SKM_sk_zero(ENGINE, (st)) +#define sk_ENGINE_push(st, val) SKM_sk_push(ENGINE, (st), (val)) +#define sk_ENGINE_unshift(st, val) SKM_sk_unshift(ENGINE, (st), (val)) +#define sk_ENGINE_find(st, val) SKM_sk_find(ENGINE, (st), (val)) +#define sk_ENGINE_find_ex(st, val) SKM_sk_find_ex(ENGINE, (st), (val)) +#define sk_ENGINE_delete(st, i) SKM_sk_delete(ENGINE, (st), (i)) +#define sk_ENGINE_delete_ptr(st, ptr) SKM_sk_delete_ptr(ENGINE, (st), (ptr)) +#define sk_ENGINE_insert(st, val, i) SKM_sk_insert(ENGINE, (st), (val), (i)) +#define sk_ENGINE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ENGINE, (st), (cmp)) +#define sk_ENGINE_dup(st) SKM_sk_dup(ENGINE, st) +#define sk_ENGINE_pop_free(st, free_func) SKM_sk_pop_free(ENGINE, (st), (free_func)) +#define sk_ENGINE_shift(st) SKM_sk_shift(ENGINE, (st)) +#define sk_ENGINE_pop(st) SKM_sk_pop(ENGINE, (st)) +#define sk_ENGINE_sort(st) SKM_sk_sort(ENGINE, (st)) +#define sk_ENGINE_is_sorted(st) SKM_sk_is_sorted(ENGINE, (st)) + +#define sk_ENGINE_CLEANUP_ITEM_new(cmp) SKM_sk_new(ENGINE_CLEANUP_ITEM, (cmp)) +#define sk_ENGINE_CLEANUP_ITEM_new_null() SKM_sk_new_null(ENGINE_CLEANUP_ITEM) +#define sk_ENGINE_CLEANUP_ITEM_free(st) SKM_sk_free(ENGINE_CLEANUP_ITEM, (st)) +#define sk_ENGINE_CLEANUP_ITEM_num(st) SKM_sk_num(ENGINE_CLEANUP_ITEM, (st)) +#define sk_ENGINE_CLEANUP_ITEM_value(st, i) SKM_sk_value(ENGINE_CLEANUP_ITEM, (st), (i)) +#define sk_ENGINE_CLEANUP_ITEM_set(st, i, val) SKM_sk_set(ENGINE_CLEANUP_ITEM, (st), (i), (val)) +#define sk_ENGINE_CLEANUP_ITEM_zero(st) SKM_sk_zero(ENGINE_CLEANUP_ITEM, (st)) +#define sk_ENGINE_CLEANUP_ITEM_push(st, val) SKM_sk_push(ENGINE_CLEANUP_ITEM, (st), (val)) +#define sk_ENGINE_CLEANUP_ITEM_unshift(st, val) SKM_sk_unshift(ENGINE_CLEANUP_ITEM, (st), (val)) +#define sk_ENGINE_CLEANUP_ITEM_find(st, val) SKM_sk_find(ENGINE_CLEANUP_ITEM, (st), (val)) +#define 
sk_ENGINE_CLEANUP_ITEM_find_ex(st, val) SKM_sk_find_ex(ENGINE_CLEANUP_ITEM, (st), (val)) +#define sk_ENGINE_CLEANUP_ITEM_delete(st, i) SKM_sk_delete(ENGINE_CLEANUP_ITEM, (st), (i)) +#define sk_ENGINE_CLEANUP_ITEM_delete_ptr(st, ptr) SKM_sk_delete_ptr(ENGINE_CLEANUP_ITEM, (st), (ptr)) +#define sk_ENGINE_CLEANUP_ITEM_insert(st, val, i) SKM_sk_insert(ENGINE_CLEANUP_ITEM, (st), (val), (i)) +#define sk_ENGINE_CLEANUP_ITEM_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ENGINE_CLEANUP_ITEM, (st), (cmp)) +#define sk_ENGINE_CLEANUP_ITEM_dup(st) SKM_sk_dup(ENGINE_CLEANUP_ITEM, st) +#define sk_ENGINE_CLEANUP_ITEM_pop_free(st, free_func) SKM_sk_pop_free(ENGINE_CLEANUP_ITEM, (st), (free_func)) +#define sk_ENGINE_CLEANUP_ITEM_shift(st) SKM_sk_shift(ENGINE_CLEANUP_ITEM, (st)) +#define sk_ENGINE_CLEANUP_ITEM_pop(st) SKM_sk_pop(ENGINE_CLEANUP_ITEM, (st)) +#define sk_ENGINE_CLEANUP_ITEM_sort(st) SKM_sk_sort(ENGINE_CLEANUP_ITEM, (st)) +#define sk_ENGINE_CLEANUP_ITEM_is_sorted(st) SKM_sk_is_sorted(ENGINE_CLEANUP_ITEM, (st)) + +#define sk_ESS_CERT_ID_new(cmp) SKM_sk_new(ESS_CERT_ID, (cmp)) +#define sk_ESS_CERT_ID_new_null() SKM_sk_new_null(ESS_CERT_ID) +#define sk_ESS_CERT_ID_free(st) SKM_sk_free(ESS_CERT_ID, (st)) +#define sk_ESS_CERT_ID_num(st) SKM_sk_num(ESS_CERT_ID, (st)) +#define sk_ESS_CERT_ID_value(st, i) SKM_sk_value(ESS_CERT_ID, (st), (i)) +#define sk_ESS_CERT_ID_set(st, i, val) SKM_sk_set(ESS_CERT_ID, (st), (i), (val)) +#define sk_ESS_CERT_ID_zero(st) SKM_sk_zero(ESS_CERT_ID, (st)) +#define sk_ESS_CERT_ID_push(st, val) SKM_sk_push(ESS_CERT_ID, (st), (val)) +#define sk_ESS_CERT_ID_unshift(st, val) SKM_sk_unshift(ESS_CERT_ID, (st), (val)) +#define sk_ESS_CERT_ID_find(st, val) SKM_sk_find(ESS_CERT_ID, (st), (val)) +#define sk_ESS_CERT_ID_find_ex(st, val) SKM_sk_find_ex(ESS_CERT_ID, (st), (val)) +#define sk_ESS_CERT_ID_delete(st, i) SKM_sk_delete(ESS_CERT_ID, (st), (i)) +#define sk_ESS_CERT_ID_delete_ptr(st, ptr) SKM_sk_delete_ptr(ESS_CERT_ID, (st), (ptr)) +#define sk_ESS_CERT_ID_insert(st, val, i) SKM_sk_insert(ESS_CERT_ID, (st), (val), (i)) +#define sk_ESS_CERT_ID_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(ESS_CERT_ID, (st), (cmp)) +#define sk_ESS_CERT_ID_dup(st) SKM_sk_dup(ESS_CERT_ID, st) +#define sk_ESS_CERT_ID_pop_free(st, free_func) SKM_sk_pop_free(ESS_CERT_ID, (st), (free_func)) +#define sk_ESS_CERT_ID_shift(st) SKM_sk_shift(ESS_CERT_ID, (st)) +#define sk_ESS_CERT_ID_pop(st) SKM_sk_pop(ESS_CERT_ID, (st)) +#define sk_ESS_CERT_ID_sort(st) SKM_sk_sort(ESS_CERT_ID, (st)) +#define sk_ESS_CERT_ID_is_sorted(st) SKM_sk_is_sorted(ESS_CERT_ID, (st)) + +#define sk_EVP_MD_new(cmp) SKM_sk_new(EVP_MD, (cmp)) +#define sk_EVP_MD_new_null() SKM_sk_new_null(EVP_MD) +#define sk_EVP_MD_free(st) SKM_sk_free(EVP_MD, (st)) +#define sk_EVP_MD_num(st) SKM_sk_num(EVP_MD, (st)) +#define sk_EVP_MD_value(st, i) SKM_sk_value(EVP_MD, (st), (i)) +#define sk_EVP_MD_set(st, i, val) SKM_sk_set(EVP_MD, (st), (i), (val)) +#define sk_EVP_MD_zero(st) SKM_sk_zero(EVP_MD, (st)) +#define sk_EVP_MD_push(st, val) SKM_sk_push(EVP_MD, (st), (val)) +#define sk_EVP_MD_unshift(st, val) SKM_sk_unshift(EVP_MD, (st), (val)) +#define sk_EVP_MD_find(st, val) SKM_sk_find(EVP_MD, (st), (val)) +#define sk_EVP_MD_find_ex(st, val) SKM_sk_find_ex(EVP_MD, (st), (val)) +#define sk_EVP_MD_delete(st, i) SKM_sk_delete(EVP_MD, (st), (i)) +#define sk_EVP_MD_delete_ptr(st, ptr) SKM_sk_delete_ptr(EVP_MD, (st), (ptr)) +#define sk_EVP_MD_insert(st, val, i) SKM_sk_insert(EVP_MD, (st), (val), (i)) +#define sk_EVP_MD_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(EVP_MD, (st), 
(cmp)) +#define sk_EVP_MD_dup(st) SKM_sk_dup(EVP_MD, st) +#define sk_EVP_MD_pop_free(st, free_func) SKM_sk_pop_free(EVP_MD, (st), (free_func)) +#define sk_EVP_MD_shift(st) SKM_sk_shift(EVP_MD, (st)) +#define sk_EVP_MD_pop(st) SKM_sk_pop(EVP_MD, (st)) +#define sk_EVP_MD_sort(st) SKM_sk_sort(EVP_MD, (st)) +#define sk_EVP_MD_is_sorted(st) SKM_sk_is_sorted(EVP_MD, (st)) + +#define sk_EVP_PBE_CTL_new(cmp) SKM_sk_new(EVP_PBE_CTL, (cmp)) +#define sk_EVP_PBE_CTL_new_null() SKM_sk_new_null(EVP_PBE_CTL) +#define sk_EVP_PBE_CTL_free(st) SKM_sk_free(EVP_PBE_CTL, (st)) +#define sk_EVP_PBE_CTL_num(st) SKM_sk_num(EVP_PBE_CTL, (st)) +#define sk_EVP_PBE_CTL_value(st, i) SKM_sk_value(EVP_PBE_CTL, (st), (i)) +#define sk_EVP_PBE_CTL_set(st, i, val) SKM_sk_set(EVP_PBE_CTL, (st), (i), (val)) +#define sk_EVP_PBE_CTL_zero(st) SKM_sk_zero(EVP_PBE_CTL, (st)) +#define sk_EVP_PBE_CTL_push(st, val) SKM_sk_push(EVP_PBE_CTL, (st), (val)) +#define sk_EVP_PBE_CTL_unshift(st, val) SKM_sk_unshift(EVP_PBE_CTL, (st), (val)) +#define sk_EVP_PBE_CTL_find(st, val) SKM_sk_find(EVP_PBE_CTL, (st), (val)) +#define sk_EVP_PBE_CTL_find_ex(st, val) SKM_sk_find_ex(EVP_PBE_CTL, (st), (val)) +#define sk_EVP_PBE_CTL_delete(st, i) SKM_sk_delete(EVP_PBE_CTL, (st), (i)) +#define sk_EVP_PBE_CTL_delete_ptr(st, ptr) SKM_sk_delete_ptr(EVP_PBE_CTL, (st), (ptr)) +#define sk_EVP_PBE_CTL_insert(st, val, i) SKM_sk_insert(EVP_PBE_CTL, (st), (val), (i)) +#define sk_EVP_PBE_CTL_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(EVP_PBE_CTL, (st), (cmp)) +#define sk_EVP_PBE_CTL_dup(st) SKM_sk_dup(EVP_PBE_CTL, st) +#define sk_EVP_PBE_CTL_pop_free(st, free_func) SKM_sk_pop_free(EVP_PBE_CTL, (st), (free_func)) +#define sk_EVP_PBE_CTL_shift(st) SKM_sk_shift(EVP_PBE_CTL, (st)) +#define sk_EVP_PBE_CTL_pop(st) SKM_sk_pop(EVP_PBE_CTL, (st)) +#define sk_EVP_PBE_CTL_sort(st) SKM_sk_sort(EVP_PBE_CTL, (st)) +#define sk_EVP_PBE_CTL_is_sorted(st) SKM_sk_is_sorted(EVP_PBE_CTL, (st)) + +#define sk_EVP_PKEY_ASN1_METHOD_new(cmp) SKM_sk_new(EVP_PKEY_ASN1_METHOD, (cmp)) +#define sk_EVP_PKEY_ASN1_METHOD_new_null() SKM_sk_new_null(EVP_PKEY_ASN1_METHOD) +#define sk_EVP_PKEY_ASN1_METHOD_free(st) SKM_sk_free(EVP_PKEY_ASN1_METHOD, (st)) +#define sk_EVP_PKEY_ASN1_METHOD_num(st) SKM_sk_num(EVP_PKEY_ASN1_METHOD, (st)) +#define sk_EVP_PKEY_ASN1_METHOD_value(st, i) SKM_sk_value(EVP_PKEY_ASN1_METHOD, (st), (i)) +#define sk_EVP_PKEY_ASN1_METHOD_set(st, i, val) SKM_sk_set(EVP_PKEY_ASN1_METHOD, (st), (i), (val)) +#define sk_EVP_PKEY_ASN1_METHOD_zero(st) SKM_sk_zero(EVP_PKEY_ASN1_METHOD, (st)) +#define sk_EVP_PKEY_ASN1_METHOD_push(st, val) SKM_sk_push(EVP_PKEY_ASN1_METHOD, (st), (val)) +#define sk_EVP_PKEY_ASN1_METHOD_unshift(st, val) SKM_sk_unshift(EVP_PKEY_ASN1_METHOD, (st), (val)) +#define sk_EVP_PKEY_ASN1_METHOD_find(st, val) SKM_sk_find(EVP_PKEY_ASN1_METHOD, (st), (val)) +#define sk_EVP_PKEY_ASN1_METHOD_find_ex(st, val) SKM_sk_find_ex(EVP_PKEY_ASN1_METHOD, (st), (val)) +#define sk_EVP_PKEY_ASN1_METHOD_delete(st, i) SKM_sk_delete(EVP_PKEY_ASN1_METHOD, (st), (i)) +#define sk_EVP_PKEY_ASN1_METHOD_delete_ptr(st, ptr) SKM_sk_delete_ptr(EVP_PKEY_ASN1_METHOD, (st), (ptr)) +#define sk_EVP_PKEY_ASN1_METHOD_insert(st, val, i) SKM_sk_insert(EVP_PKEY_ASN1_METHOD, (st), (val), (i)) +#define sk_EVP_PKEY_ASN1_METHOD_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(EVP_PKEY_ASN1_METHOD, (st), (cmp)) +#define sk_EVP_PKEY_ASN1_METHOD_dup(st) SKM_sk_dup(EVP_PKEY_ASN1_METHOD, st) +#define sk_EVP_PKEY_ASN1_METHOD_pop_free(st, free_func) SKM_sk_pop_free(EVP_PKEY_ASN1_METHOD, (st), (free_func)) +#define 
sk_EVP_PKEY_ASN1_METHOD_shift(st) SKM_sk_shift(EVP_PKEY_ASN1_METHOD, (st)) +#define sk_EVP_PKEY_ASN1_METHOD_pop(st) SKM_sk_pop(EVP_PKEY_ASN1_METHOD, (st)) +#define sk_EVP_PKEY_ASN1_METHOD_sort(st) SKM_sk_sort(EVP_PKEY_ASN1_METHOD, (st)) +#define sk_EVP_PKEY_ASN1_METHOD_is_sorted(st) SKM_sk_is_sorted(EVP_PKEY_ASN1_METHOD, (st)) + +#define sk_EVP_PKEY_METHOD_new(cmp) SKM_sk_new(EVP_PKEY_METHOD, (cmp)) +#define sk_EVP_PKEY_METHOD_new_null() SKM_sk_new_null(EVP_PKEY_METHOD) +#define sk_EVP_PKEY_METHOD_free(st) SKM_sk_free(EVP_PKEY_METHOD, (st)) +#define sk_EVP_PKEY_METHOD_num(st) SKM_sk_num(EVP_PKEY_METHOD, (st)) +#define sk_EVP_PKEY_METHOD_value(st, i) SKM_sk_value(EVP_PKEY_METHOD, (st), (i)) +#define sk_EVP_PKEY_METHOD_set(st, i, val) SKM_sk_set(EVP_PKEY_METHOD, (st), (i), (val)) +#define sk_EVP_PKEY_METHOD_zero(st) SKM_sk_zero(EVP_PKEY_METHOD, (st)) +#define sk_EVP_PKEY_METHOD_push(st, val) SKM_sk_push(EVP_PKEY_METHOD, (st), (val)) +#define sk_EVP_PKEY_METHOD_unshift(st, val) SKM_sk_unshift(EVP_PKEY_METHOD, (st), (val)) +#define sk_EVP_PKEY_METHOD_find(st, val) SKM_sk_find(EVP_PKEY_METHOD, (st), (val)) +#define sk_EVP_PKEY_METHOD_find_ex(st, val) SKM_sk_find_ex(EVP_PKEY_METHOD, (st), (val)) +#define sk_EVP_PKEY_METHOD_delete(st, i) SKM_sk_delete(EVP_PKEY_METHOD, (st), (i)) +#define sk_EVP_PKEY_METHOD_delete_ptr(st, ptr) SKM_sk_delete_ptr(EVP_PKEY_METHOD, (st), (ptr)) +#define sk_EVP_PKEY_METHOD_insert(st, val, i) SKM_sk_insert(EVP_PKEY_METHOD, (st), (val), (i)) +#define sk_EVP_PKEY_METHOD_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(EVP_PKEY_METHOD, (st), (cmp)) +#define sk_EVP_PKEY_METHOD_dup(st) SKM_sk_dup(EVP_PKEY_METHOD, st) +#define sk_EVP_PKEY_METHOD_pop_free(st, free_func) SKM_sk_pop_free(EVP_PKEY_METHOD, (st), (free_func)) +#define sk_EVP_PKEY_METHOD_shift(st) SKM_sk_shift(EVP_PKEY_METHOD, (st)) +#define sk_EVP_PKEY_METHOD_pop(st) SKM_sk_pop(EVP_PKEY_METHOD, (st)) +#define sk_EVP_PKEY_METHOD_sort(st) SKM_sk_sort(EVP_PKEY_METHOD, (st)) +#define sk_EVP_PKEY_METHOD_is_sorted(st) SKM_sk_is_sorted(EVP_PKEY_METHOD, (st)) + +#define sk_GENERAL_NAME_new(cmp) SKM_sk_new(GENERAL_NAME, (cmp)) +#define sk_GENERAL_NAME_new_null() SKM_sk_new_null(GENERAL_NAME) +#define sk_GENERAL_NAME_free(st) SKM_sk_free(GENERAL_NAME, (st)) +#define sk_GENERAL_NAME_num(st) SKM_sk_num(GENERAL_NAME, (st)) +#define sk_GENERAL_NAME_value(st, i) SKM_sk_value(GENERAL_NAME, (st), (i)) +#define sk_GENERAL_NAME_set(st, i, val) SKM_sk_set(GENERAL_NAME, (st), (i), (val)) +#define sk_GENERAL_NAME_zero(st) SKM_sk_zero(GENERAL_NAME, (st)) +#define sk_GENERAL_NAME_push(st, val) SKM_sk_push(GENERAL_NAME, (st), (val)) +#define sk_GENERAL_NAME_unshift(st, val) SKM_sk_unshift(GENERAL_NAME, (st), (val)) +#define sk_GENERAL_NAME_find(st, val) SKM_sk_find(GENERAL_NAME, (st), (val)) +#define sk_GENERAL_NAME_find_ex(st, val) SKM_sk_find_ex(GENERAL_NAME, (st), (val)) +#define sk_GENERAL_NAME_delete(st, i) SKM_sk_delete(GENERAL_NAME, (st), (i)) +#define sk_GENERAL_NAME_delete_ptr(st, ptr) SKM_sk_delete_ptr(GENERAL_NAME, (st), (ptr)) +#define sk_GENERAL_NAME_insert(st, val, i) SKM_sk_insert(GENERAL_NAME, (st), (val), (i)) +#define sk_GENERAL_NAME_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(GENERAL_NAME, (st), (cmp)) +#define sk_GENERAL_NAME_dup(st) SKM_sk_dup(GENERAL_NAME, st) +#define sk_GENERAL_NAME_pop_free(st, free_func) SKM_sk_pop_free(GENERAL_NAME, (st), (free_func)) +#define sk_GENERAL_NAME_shift(st) SKM_sk_shift(GENERAL_NAME, (st)) +#define sk_GENERAL_NAME_pop(st) SKM_sk_pop(GENERAL_NAME, (st)) +#define sk_GENERAL_NAME_sort(st) 
SKM_sk_sort(GENERAL_NAME, (st)) +#define sk_GENERAL_NAME_is_sorted(st) SKM_sk_is_sorted(GENERAL_NAME, (st)) + +#define sk_GENERAL_NAMES_new(cmp) SKM_sk_new(GENERAL_NAMES, (cmp)) +#define sk_GENERAL_NAMES_new_null() SKM_sk_new_null(GENERAL_NAMES) +#define sk_GENERAL_NAMES_free(st) SKM_sk_free(GENERAL_NAMES, (st)) +#define sk_GENERAL_NAMES_num(st) SKM_sk_num(GENERAL_NAMES, (st)) +#define sk_GENERAL_NAMES_value(st, i) SKM_sk_value(GENERAL_NAMES, (st), (i)) +#define sk_GENERAL_NAMES_set(st, i, val) SKM_sk_set(GENERAL_NAMES, (st), (i), (val)) +#define sk_GENERAL_NAMES_zero(st) SKM_sk_zero(GENERAL_NAMES, (st)) +#define sk_GENERAL_NAMES_push(st, val) SKM_sk_push(GENERAL_NAMES, (st), (val)) +#define sk_GENERAL_NAMES_unshift(st, val) SKM_sk_unshift(GENERAL_NAMES, (st), (val)) +#define sk_GENERAL_NAMES_find(st, val) SKM_sk_find(GENERAL_NAMES, (st), (val)) +#define sk_GENERAL_NAMES_find_ex(st, val) SKM_sk_find_ex(GENERAL_NAMES, (st), (val)) +#define sk_GENERAL_NAMES_delete(st, i) SKM_sk_delete(GENERAL_NAMES, (st), (i)) +#define sk_GENERAL_NAMES_delete_ptr(st, ptr) SKM_sk_delete_ptr(GENERAL_NAMES, (st), (ptr)) +#define sk_GENERAL_NAMES_insert(st, val, i) SKM_sk_insert(GENERAL_NAMES, (st), (val), (i)) +#define sk_GENERAL_NAMES_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(GENERAL_NAMES, (st), (cmp)) +#define sk_GENERAL_NAMES_dup(st) SKM_sk_dup(GENERAL_NAMES, st) +#define sk_GENERAL_NAMES_pop_free(st, free_func) SKM_sk_pop_free(GENERAL_NAMES, (st), (free_func)) +#define sk_GENERAL_NAMES_shift(st) SKM_sk_shift(GENERAL_NAMES, (st)) +#define sk_GENERAL_NAMES_pop(st) SKM_sk_pop(GENERAL_NAMES, (st)) +#define sk_GENERAL_NAMES_sort(st) SKM_sk_sort(GENERAL_NAMES, (st)) +#define sk_GENERAL_NAMES_is_sorted(st) SKM_sk_is_sorted(GENERAL_NAMES, (st)) + +#define sk_GENERAL_SUBTREE_new(cmp) SKM_sk_new(GENERAL_SUBTREE, (cmp)) +#define sk_GENERAL_SUBTREE_new_null() SKM_sk_new_null(GENERAL_SUBTREE) +#define sk_GENERAL_SUBTREE_free(st) SKM_sk_free(GENERAL_SUBTREE, (st)) +#define sk_GENERAL_SUBTREE_num(st) SKM_sk_num(GENERAL_SUBTREE, (st)) +#define sk_GENERAL_SUBTREE_value(st, i) SKM_sk_value(GENERAL_SUBTREE, (st), (i)) +#define sk_GENERAL_SUBTREE_set(st, i, val) SKM_sk_set(GENERAL_SUBTREE, (st), (i), (val)) +#define sk_GENERAL_SUBTREE_zero(st) SKM_sk_zero(GENERAL_SUBTREE, (st)) +#define sk_GENERAL_SUBTREE_push(st, val) SKM_sk_push(GENERAL_SUBTREE, (st), (val)) +#define sk_GENERAL_SUBTREE_unshift(st, val) SKM_sk_unshift(GENERAL_SUBTREE, (st), (val)) +#define sk_GENERAL_SUBTREE_find(st, val) SKM_sk_find(GENERAL_SUBTREE, (st), (val)) +#define sk_GENERAL_SUBTREE_find_ex(st, val) SKM_sk_find_ex(GENERAL_SUBTREE, (st), (val)) +#define sk_GENERAL_SUBTREE_delete(st, i) SKM_sk_delete(GENERAL_SUBTREE, (st), (i)) +#define sk_GENERAL_SUBTREE_delete_ptr(st, ptr) SKM_sk_delete_ptr(GENERAL_SUBTREE, (st), (ptr)) +#define sk_GENERAL_SUBTREE_insert(st, val, i) SKM_sk_insert(GENERAL_SUBTREE, (st), (val), (i)) +#define sk_GENERAL_SUBTREE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(GENERAL_SUBTREE, (st), (cmp)) +#define sk_GENERAL_SUBTREE_dup(st) SKM_sk_dup(GENERAL_SUBTREE, st) +#define sk_GENERAL_SUBTREE_pop_free(st, free_func) SKM_sk_pop_free(GENERAL_SUBTREE, (st), (free_func)) +#define sk_GENERAL_SUBTREE_shift(st) SKM_sk_shift(GENERAL_SUBTREE, (st)) +#define sk_GENERAL_SUBTREE_pop(st) SKM_sk_pop(GENERAL_SUBTREE, (st)) +#define sk_GENERAL_SUBTREE_sort(st) SKM_sk_sort(GENERAL_SUBTREE, (st)) +#define sk_GENERAL_SUBTREE_is_sorted(st) SKM_sk_is_sorted(GENERAL_SUBTREE, (st)) + +#define sk_IPAddressFamily_new(cmp) SKM_sk_new(IPAddressFamily, (cmp)) 
+#define sk_IPAddressFamily_new_null() SKM_sk_new_null(IPAddressFamily) +#define sk_IPAddressFamily_free(st) SKM_sk_free(IPAddressFamily, (st)) +#define sk_IPAddressFamily_num(st) SKM_sk_num(IPAddressFamily, (st)) +#define sk_IPAddressFamily_value(st, i) SKM_sk_value(IPAddressFamily, (st), (i)) +#define sk_IPAddressFamily_set(st, i, val) SKM_sk_set(IPAddressFamily, (st), (i), (val)) +#define sk_IPAddressFamily_zero(st) SKM_sk_zero(IPAddressFamily, (st)) +#define sk_IPAddressFamily_push(st, val) SKM_sk_push(IPAddressFamily, (st), (val)) +#define sk_IPAddressFamily_unshift(st, val) SKM_sk_unshift(IPAddressFamily, (st), (val)) +#define sk_IPAddressFamily_find(st, val) SKM_sk_find(IPAddressFamily, (st), (val)) +#define sk_IPAddressFamily_find_ex(st, val) SKM_sk_find_ex(IPAddressFamily, (st), (val)) +#define sk_IPAddressFamily_delete(st, i) SKM_sk_delete(IPAddressFamily, (st), (i)) +#define sk_IPAddressFamily_delete_ptr(st, ptr) SKM_sk_delete_ptr(IPAddressFamily, (st), (ptr)) +#define sk_IPAddressFamily_insert(st, val, i) SKM_sk_insert(IPAddressFamily, (st), (val), (i)) +#define sk_IPAddressFamily_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(IPAddressFamily, (st), (cmp)) +#define sk_IPAddressFamily_dup(st) SKM_sk_dup(IPAddressFamily, st) +#define sk_IPAddressFamily_pop_free(st, free_func) SKM_sk_pop_free(IPAddressFamily, (st), (free_func)) +#define sk_IPAddressFamily_shift(st) SKM_sk_shift(IPAddressFamily, (st)) +#define sk_IPAddressFamily_pop(st) SKM_sk_pop(IPAddressFamily, (st)) +#define sk_IPAddressFamily_sort(st) SKM_sk_sort(IPAddressFamily, (st)) +#define sk_IPAddressFamily_is_sorted(st) SKM_sk_is_sorted(IPAddressFamily, (st)) + +#define sk_IPAddressOrRange_new(cmp) SKM_sk_new(IPAddressOrRange, (cmp)) +#define sk_IPAddressOrRange_new_null() SKM_sk_new_null(IPAddressOrRange) +#define sk_IPAddressOrRange_free(st) SKM_sk_free(IPAddressOrRange, (st)) +#define sk_IPAddressOrRange_num(st) SKM_sk_num(IPAddressOrRange, (st)) +#define sk_IPAddressOrRange_value(st, i) SKM_sk_value(IPAddressOrRange, (st), (i)) +#define sk_IPAddressOrRange_set(st, i, val) SKM_sk_set(IPAddressOrRange, (st), (i), (val)) +#define sk_IPAddressOrRange_zero(st) SKM_sk_zero(IPAddressOrRange, (st)) +#define sk_IPAddressOrRange_push(st, val) SKM_sk_push(IPAddressOrRange, (st), (val)) +#define sk_IPAddressOrRange_unshift(st, val) SKM_sk_unshift(IPAddressOrRange, (st), (val)) +#define sk_IPAddressOrRange_find(st, val) SKM_sk_find(IPAddressOrRange, (st), (val)) +#define sk_IPAddressOrRange_find_ex(st, val) SKM_sk_find_ex(IPAddressOrRange, (st), (val)) +#define sk_IPAddressOrRange_delete(st, i) SKM_sk_delete(IPAddressOrRange, (st), (i)) +#define sk_IPAddressOrRange_delete_ptr(st, ptr) SKM_sk_delete_ptr(IPAddressOrRange, (st), (ptr)) +#define sk_IPAddressOrRange_insert(st, val, i) SKM_sk_insert(IPAddressOrRange, (st), (val), (i)) +#define sk_IPAddressOrRange_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(IPAddressOrRange, (st), (cmp)) +#define sk_IPAddressOrRange_dup(st) SKM_sk_dup(IPAddressOrRange, st) +#define sk_IPAddressOrRange_pop_free(st, free_func) SKM_sk_pop_free(IPAddressOrRange, (st), (free_func)) +#define sk_IPAddressOrRange_shift(st) SKM_sk_shift(IPAddressOrRange, (st)) +#define sk_IPAddressOrRange_pop(st) SKM_sk_pop(IPAddressOrRange, (st)) +#define sk_IPAddressOrRange_sort(st) SKM_sk_sort(IPAddressOrRange, (st)) +#define sk_IPAddressOrRange_is_sorted(st) SKM_sk_is_sorted(IPAddressOrRange, (st)) + +#define sk_KRB5_APREQBODY_new(cmp) SKM_sk_new(KRB5_APREQBODY, (cmp)) +#define sk_KRB5_APREQBODY_new_null() 
SKM_sk_new_null(KRB5_APREQBODY) +#define sk_KRB5_APREQBODY_free(st) SKM_sk_free(KRB5_APREQBODY, (st)) +#define sk_KRB5_APREQBODY_num(st) SKM_sk_num(KRB5_APREQBODY, (st)) +#define sk_KRB5_APREQBODY_value(st, i) SKM_sk_value(KRB5_APREQBODY, (st), (i)) +#define sk_KRB5_APREQBODY_set(st, i, val) SKM_sk_set(KRB5_APREQBODY, (st), (i), (val)) +#define sk_KRB5_APREQBODY_zero(st) SKM_sk_zero(KRB5_APREQBODY, (st)) +#define sk_KRB5_APREQBODY_push(st, val) SKM_sk_push(KRB5_APREQBODY, (st), (val)) +#define sk_KRB5_APREQBODY_unshift(st, val) SKM_sk_unshift(KRB5_APREQBODY, (st), (val)) +#define sk_KRB5_APREQBODY_find(st, val) SKM_sk_find(KRB5_APREQBODY, (st), (val)) +#define sk_KRB5_APREQBODY_find_ex(st, val) SKM_sk_find_ex(KRB5_APREQBODY, (st), (val)) +#define sk_KRB5_APREQBODY_delete(st, i) SKM_sk_delete(KRB5_APREQBODY, (st), (i)) +#define sk_KRB5_APREQBODY_delete_ptr(st, ptr) SKM_sk_delete_ptr(KRB5_APREQBODY, (st), (ptr)) +#define sk_KRB5_APREQBODY_insert(st, val, i) SKM_sk_insert(KRB5_APREQBODY, (st), (val), (i)) +#define sk_KRB5_APREQBODY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_APREQBODY, (st), (cmp)) +#define sk_KRB5_APREQBODY_dup(st) SKM_sk_dup(KRB5_APREQBODY, st) +#define sk_KRB5_APREQBODY_pop_free(st, free_func) SKM_sk_pop_free(KRB5_APREQBODY, (st), (free_func)) +#define sk_KRB5_APREQBODY_shift(st) SKM_sk_shift(KRB5_APREQBODY, (st)) +#define sk_KRB5_APREQBODY_pop(st) SKM_sk_pop(KRB5_APREQBODY, (st)) +#define sk_KRB5_APREQBODY_sort(st) SKM_sk_sort(KRB5_APREQBODY, (st)) +#define sk_KRB5_APREQBODY_is_sorted(st) SKM_sk_is_sorted(KRB5_APREQBODY, (st)) + +#define sk_KRB5_AUTHDATA_new(cmp) SKM_sk_new(KRB5_AUTHDATA, (cmp)) +#define sk_KRB5_AUTHDATA_new_null() SKM_sk_new_null(KRB5_AUTHDATA) +#define sk_KRB5_AUTHDATA_free(st) SKM_sk_free(KRB5_AUTHDATA, (st)) +#define sk_KRB5_AUTHDATA_num(st) SKM_sk_num(KRB5_AUTHDATA, (st)) +#define sk_KRB5_AUTHDATA_value(st, i) SKM_sk_value(KRB5_AUTHDATA, (st), (i)) +#define sk_KRB5_AUTHDATA_set(st, i, val) SKM_sk_set(KRB5_AUTHDATA, (st), (i), (val)) +#define sk_KRB5_AUTHDATA_zero(st) SKM_sk_zero(KRB5_AUTHDATA, (st)) +#define sk_KRB5_AUTHDATA_push(st, val) SKM_sk_push(KRB5_AUTHDATA, (st), (val)) +#define sk_KRB5_AUTHDATA_unshift(st, val) SKM_sk_unshift(KRB5_AUTHDATA, (st), (val)) +#define sk_KRB5_AUTHDATA_find(st, val) SKM_sk_find(KRB5_AUTHDATA, (st), (val)) +#define sk_KRB5_AUTHDATA_find_ex(st, val) SKM_sk_find_ex(KRB5_AUTHDATA, (st), (val)) +#define sk_KRB5_AUTHDATA_delete(st, i) SKM_sk_delete(KRB5_AUTHDATA, (st), (i)) +#define sk_KRB5_AUTHDATA_delete_ptr(st, ptr) SKM_sk_delete_ptr(KRB5_AUTHDATA, (st), (ptr)) +#define sk_KRB5_AUTHDATA_insert(st, val, i) SKM_sk_insert(KRB5_AUTHDATA, (st), (val), (i)) +#define sk_KRB5_AUTHDATA_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_AUTHDATA, (st), (cmp)) +#define sk_KRB5_AUTHDATA_dup(st) SKM_sk_dup(KRB5_AUTHDATA, st) +#define sk_KRB5_AUTHDATA_pop_free(st, free_func) SKM_sk_pop_free(KRB5_AUTHDATA, (st), (free_func)) +#define sk_KRB5_AUTHDATA_shift(st) SKM_sk_shift(KRB5_AUTHDATA, (st)) +#define sk_KRB5_AUTHDATA_pop(st) SKM_sk_pop(KRB5_AUTHDATA, (st)) +#define sk_KRB5_AUTHDATA_sort(st) SKM_sk_sort(KRB5_AUTHDATA, (st)) +#define sk_KRB5_AUTHDATA_is_sorted(st) SKM_sk_is_sorted(KRB5_AUTHDATA, (st)) + +#define sk_KRB5_AUTHENTBODY_new(cmp) SKM_sk_new(KRB5_AUTHENTBODY, (cmp)) +#define sk_KRB5_AUTHENTBODY_new_null() SKM_sk_new_null(KRB5_AUTHENTBODY) +#define sk_KRB5_AUTHENTBODY_free(st) SKM_sk_free(KRB5_AUTHENTBODY, (st)) +#define sk_KRB5_AUTHENTBODY_num(st) SKM_sk_num(KRB5_AUTHENTBODY, (st)) +#define sk_KRB5_AUTHENTBODY_value(st, i) 
SKM_sk_value(KRB5_AUTHENTBODY, (st), (i)) +#define sk_KRB5_AUTHENTBODY_set(st, i, val) SKM_sk_set(KRB5_AUTHENTBODY, (st), (i), (val)) +#define sk_KRB5_AUTHENTBODY_zero(st) SKM_sk_zero(KRB5_AUTHENTBODY, (st)) +#define sk_KRB5_AUTHENTBODY_push(st, val) SKM_sk_push(KRB5_AUTHENTBODY, (st), (val)) +#define sk_KRB5_AUTHENTBODY_unshift(st, val) SKM_sk_unshift(KRB5_AUTHENTBODY, (st), (val)) +#define sk_KRB5_AUTHENTBODY_find(st, val) SKM_sk_find(KRB5_AUTHENTBODY, (st), (val)) +#define sk_KRB5_AUTHENTBODY_find_ex(st, val) SKM_sk_find_ex(KRB5_AUTHENTBODY, (st), (val)) +#define sk_KRB5_AUTHENTBODY_delete(st, i) SKM_sk_delete(KRB5_AUTHENTBODY, (st), (i)) +#define sk_KRB5_AUTHENTBODY_delete_ptr(st, ptr) SKM_sk_delete_ptr(KRB5_AUTHENTBODY, (st), (ptr)) +#define sk_KRB5_AUTHENTBODY_insert(st, val, i) SKM_sk_insert(KRB5_AUTHENTBODY, (st), (val), (i)) +#define sk_KRB5_AUTHENTBODY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_AUTHENTBODY, (st), (cmp)) +#define sk_KRB5_AUTHENTBODY_dup(st) SKM_sk_dup(KRB5_AUTHENTBODY, st) +#define sk_KRB5_AUTHENTBODY_pop_free(st, free_func) SKM_sk_pop_free(KRB5_AUTHENTBODY, (st), (free_func)) +#define sk_KRB5_AUTHENTBODY_shift(st) SKM_sk_shift(KRB5_AUTHENTBODY, (st)) +#define sk_KRB5_AUTHENTBODY_pop(st) SKM_sk_pop(KRB5_AUTHENTBODY, (st)) +#define sk_KRB5_AUTHENTBODY_sort(st) SKM_sk_sort(KRB5_AUTHENTBODY, (st)) +#define sk_KRB5_AUTHENTBODY_is_sorted(st) SKM_sk_is_sorted(KRB5_AUTHENTBODY, (st)) + +#define sk_KRB5_CHECKSUM_new(cmp) SKM_sk_new(KRB5_CHECKSUM, (cmp)) +#define sk_KRB5_CHECKSUM_new_null() SKM_sk_new_null(KRB5_CHECKSUM) +#define sk_KRB5_CHECKSUM_free(st) SKM_sk_free(KRB5_CHECKSUM, (st)) +#define sk_KRB5_CHECKSUM_num(st) SKM_sk_num(KRB5_CHECKSUM, (st)) +#define sk_KRB5_CHECKSUM_value(st, i) SKM_sk_value(KRB5_CHECKSUM, (st), (i)) +#define sk_KRB5_CHECKSUM_set(st, i, val) SKM_sk_set(KRB5_CHECKSUM, (st), (i), (val)) +#define sk_KRB5_CHECKSUM_zero(st) SKM_sk_zero(KRB5_CHECKSUM, (st)) +#define sk_KRB5_CHECKSUM_push(st, val) SKM_sk_push(KRB5_CHECKSUM, (st), (val)) +#define sk_KRB5_CHECKSUM_unshift(st, val) SKM_sk_unshift(KRB5_CHECKSUM, (st), (val)) +#define sk_KRB5_CHECKSUM_find(st, val) SKM_sk_find(KRB5_CHECKSUM, (st), (val)) +#define sk_KRB5_CHECKSUM_find_ex(st, val) SKM_sk_find_ex(KRB5_CHECKSUM, (st), (val)) +#define sk_KRB5_CHECKSUM_delete(st, i) SKM_sk_delete(KRB5_CHECKSUM, (st), (i)) +#define sk_KRB5_CHECKSUM_delete_ptr(st, ptr) SKM_sk_delete_ptr(KRB5_CHECKSUM, (st), (ptr)) +#define sk_KRB5_CHECKSUM_insert(st, val, i) SKM_sk_insert(KRB5_CHECKSUM, (st), (val), (i)) +#define sk_KRB5_CHECKSUM_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_CHECKSUM, (st), (cmp)) +#define sk_KRB5_CHECKSUM_dup(st) SKM_sk_dup(KRB5_CHECKSUM, st) +#define sk_KRB5_CHECKSUM_pop_free(st, free_func) SKM_sk_pop_free(KRB5_CHECKSUM, (st), (free_func)) +#define sk_KRB5_CHECKSUM_shift(st) SKM_sk_shift(KRB5_CHECKSUM, (st)) +#define sk_KRB5_CHECKSUM_pop(st) SKM_sk_pop(KRB5_CHECKSUM, (st)) +#define sk_KRB5_CHECKSUM_sort(st) SKM_sk_sort(KRB5_CHECKSUM, (st)) +#define sk_KRB5_CHECKSUM_is_sorted(st) SKM_sk_is_sorted(KRB5_CHECKSUM, (st)) + +#define sk_KRB5_ENCDATA_new(cmp) SKM_sk_new(KRB5_ENCDATA, (cmp)) +#define sk_KRB5_ENCDATA_new_null() SKM_sk_new_null(KRB5_ENCDATA) +#define sk_KRB5_ENCDATA_free(st) SKM_sk_free(KRB5_ENCDATA, (st)) +#define sk_KRB5_ENCDATA_num(st) SKM_sk_num(KRB5_ENCDATA, (st)) +#define sk_KRB5_ENCDATA_value(st, i) SKM_sk_value(KRB5_ENCDATA, (st), (i)) +#define sk_KRB5_ENCDATA_set(st, i, val) SKM_sk_set(KRB5_ENCDATA, (st), (i), (val)) +#define sk_KRB5_ENCDATA_zero(st) 
SKM_sk_zero(KRB5_ENCDATA, (st)) +#define sk_KRB5_ENCDATA_push(st, val) SKM_sk_push(KRB5_ENCDATA, (st), (val)) +#define sk_KRB5_ENCDATA_unshift(st, val) SKM_sk_unshift(KRB5_ENCDATA, (st), (val)) +#define sk_KRB5_ENCDATA_find(st, val) SKM_sk_find(KRB5_ENCDATA, (st), (val)) +#define sk_KRB5_ENCDATA_find_ex(st, val) SKM_sk_find_ex(KRB5_ENCDATA, (st), (val)) +#define sk_KRB5_ENCDATA_delete(st, i) SKM_sk_delete(KRB5_ENCDATA, (st), (i)) +#define sk_KRB5_ENCDATA_delete_ptr(st, ptr) SKM_sk_delete_ptr(KRB5_ENCDATA, (st), (ptr)) +#define sk_KRB5_ENCDATA_insert(st, val, i) SKM_sk_insert(KRB5_ENCDATA, (st), (val), (i)) +#define sk_KRB5_ENCDATA_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_ENCDATA, (st), (cmp)) +#define sk_KRB5_ENCDATA_dup(st) SKM_sk_dup(KRB5_ENCDATA, st) +#define sk_KRB5_ENCDATA_pop_free(st, free_func) SKM_sk_pop_free(KRB5_ENCDATA, (st), (free_func)) +#define sk_KRB5_ENCDATA_shift(st) SKM_sk_shift(KRB5_ENCDATA, (st)) +#define sk_KRB5_ENCDATA_pop(st) SKM_sk_pop(KRB5_ENCDATA, (st)) +#define sk_KRB5_ENCDATA_sort(st) SKM_sk_sort(KRB5_ENCDATA, (st)) +#define sk_KRB5_ENCDATA_is_sorted(st) SKM_sk_is_sorted(KRB5_ENCDATA, (st)) + +#define sk_KRB5_ENCKEY_new(cmp) SKM_sk_new(KRB5_ENCKEY, (cmp)) +#define sk_KRB5_ENCKEY_new_null() SKM_sk_new_null(KRB5_ENCKEY) +#define sk_KRB5_ENCKEY_free(st) SKM_sk_free(KRB5_ENCKEY, (st)) +#define sk_KRB5_ENCKEY_num(st) SKM_sk_num(KRB5_ENCKEY, (st)) +#define sk_KRB5_ENCKEY_value(st, i) SKM_sk_value(KRB5_ENCKEY, (st), (i)) +#define sk_KRB5_ENCKEY_set(st, i, val) SKM_sk_set(KRB5_ENCKEY, (st), (i), (val)) +#define sk_KRB5_ENCKEY_zero(st) SKM_sk_zero(KRB5_ENCKEY, (st)) +#define sk_KRB5_ENCKEY_push(st, val) SKM_sk_push(KRB5_ENCKEY, (st), (val)) +#define sk_KRB5_ENCKEY_unshift(st, val) SKM_sk_unshift(KRB5_ENCKEY, (st), (val)) +#define sk_KRB5_ENCKEY_find(st, val) SKM_sk_find(KRB5_ENCKEY, (st), (val)) +#define sk_KRB5_ENCKEY_find_ex(st, val) SKM_sk_find_ex(KRB5_ENCKEY, (st), (val)) +#define sk_KRB5_ENCKEY_delete(st, i) SKM_sk_delete(KRB5_ENCKEY, (st), (i)) +#define sk_KRB5_ENCKEY_delete_ptr(st, ptr) SKM_sk_delete_ptr(KRB5_ENCKEY, (st), (ptr)) +#define sk_KRB5_ENCKEY_insert(st, val, i) SKM_sk_insert(KRB5_ENCKEY, (st), (val), (i)) +#define sk_KRB5_ENCKEY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_ENCKEY, (st), (cmp)) +#define sk_KRB5_ENCKEY_dup(st) SKM_sk_dup(KRB5_ENCKEY, st) +#define sk_KRB5_ENCKEY_pop_free(st, free_func) SKM_sk_pop_free(KRB5_ENCKEY, (st), (free_func)) +#define sk_KRB5_ENCKEY_shift(st) SKM_sk_shift(KRB5_ENCKEY, (st)) +#define sk_KRB5_ENCKEY_pop(st) SKM_sk_pop(KRB5_ENCKEY, (st)) +#define sk_KRB5_ENCKEY_sort(st) SKM_sk_sort(KRB5_ENCKEY, (st)) +#define sk_KRB5_ENCKEY_is_sorted(st) SKM_sk_is_sorted(KRB5_ENCKEY, (st)) + +#define sk_KRB5_PRINCNAME_new(cmp) SKM_sk_new(KRB5_PRINCNAME, (cmp)) +#define sk_KRB5_PRINCNAME_new_null() SKM_sk_new_null(KRB5_PRINCNAME) +#define sk_KRB5_PRINCNAME_free(st) SKM_sk_free(KRB5_PRINCNAME, (st)) +#define sk_KRB5_PRINCNAME_num(st) SKM_sk_num(KRB5_PRINCNAME, (st)) +#define sk_KRB5_PRINCNAME_value(st, i) SKM_sk_value(KRB5_PRINCNAME, (st), (i)) +#define sk_KRB5_PRINCNAME_set(st, i, val) SKM_sk_set(KRB5_PRINCNAME, (st), (i), (val)) +#define sk_KRB5_PRINCNAME_zero(st) SKM_sk_zero(KRB5_PRINCNAME, (st)) +#define sk_KRB5_PRINCNAME_push(st, val) SKM_sk_push(KRB5_PRINCNAME, (st), (val)) +#define sk_KRB5_PRINCNAME_unshift(st, val) SKM_sk_unshift(KRB5_PRINCNAME, (st), (val)) +#define sk_KRB5_PRINCNAME_find(st, val) SKM_sk_find(KRB5_PRINCNAME, (st), (val)) +#define sk_KRB5_PRINCNAME_find_ex(st, val) SKM_sk_find_ex(KRB5_PRINCNAME, (st), 
(val)) +#define sk_KRB5_PRINCNAME_delete(st, i) SKM_sk_delete(KRB5_PRINCNAME, (st), (i)) +#define sk_KRB5_PRINCNAME_delete_ptr(st, ptr) SKM_sk_delete_ptr(KRB5_PRINCNAME, (st), (ptr)) +#define sk_KRB5_PRINCNAME_insert(st, val, i) SKM_sk_insert(KRB5_PRINCNAME, (st), (val), (i)) +#define sk_KRB5_PRINCNAME_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_PRINCNAME, (st), (cmp)) +#define sk_KRB5_PRINCNAME_dup(st) SKM_sk_dup(KRB5_PRINCNAME, st) +#define sk_KRB5_PRINCNAME_pop_free(st, free_func) SKM_sk_pop_free(KRB5_PRINCNAME, (st), (free_func)) +#define sk_KRB5_PRINCNAME_shift(st) SKM_sk_shift(KRB5_PRINCNAME, (st)) +#define sk_KRB5_PRINCNAME_pop(st) SKM_sk_pop(KRB5_PRINCNAME, (st)) +#define sk_KRB5_PRINCNAME_sort(st) SKM_sk_sort(KRB5_PRINCNAME, (st)) +#define sk_KRB5_PRINCNAME_is_sorted(st) SKM_sk_is_sorted(KRB5_PRINCNAME, (st)) + +#define sk_KRB5_TKTBODY_new(cmp) SKM_sk_new(KRB5_TKTBODY, (cmp)) +#define sk_KRB5_TKTBODY_new_null() SKM_sk_new_null(KRB5_TKTBODY) +#define sk_KRB5_TKTBODY_free(st) SKM_sk_free(KRB5_TKTBODY, (st)) +#define sk_KRB5_TKTBODY_num(st) SKM_sk_num(KRB5_TKTBODY, (st)) +#define sk_KRB5_TKTBODY_value(st, i) SKM_sk_value(KRB5_TKTBODY, (st), (i)) +#define sk_KRB5_TKTBODY_set(st, i, val) SKM_sk_set(KRB5_TKTBODY, (st), (i), (val)) +#define sk_KRB5_TKTBODY_zero(st) SKM_sk_zero(KRB5_TKTBODY, (st)) +#define sk_KRB5_TKTBODY_push(st, val) SKM_sk_push(KRB5_TKTBODY, (st), (val)) +#define sk_KRB5_TKTBODY_unshift(st, val) SKM_sk_unshift(KRB5_TKTBODY, (st), (val)) +#define sk_KRB5_TKTBODY_find(st, val) SKM_sk_find(KRB5_TKTBODY, (st), (val)) +#define sk_KRB5_TKTBODY_find_ex(st, val) SKM_sk_find_ex(KRB5_TKTBODY, (st), (val)) +#define sk_KRB5_TKTBODY_delete(st, i) SKM_sk_delete(KRB5_TKTBODY, (st), (i)) +#define sk_KRB5_TKTBODY_delete_ptr(st, ptr) SKM_sk_delete_ptr(KRB5_TKTBODY, (st), (ptr)) +#define sk_KRB5_TKTBODY_insert(st, val, i) SKM_sk_insert(KRB5_TKTBODY, (st), (val), (i)) +#define sk_KRB5_TKTBODY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(KRB5_TKTBODY, (st), (cmp)) +#define sk_KRB5_TKTBODY_dup(st) SKM_sk_dup(KRB5_TKTBODY, st) +#define sk_KRB5_TKTBODY_pop_free(st, free_func) SKM_sk_pop_free(KRB5_TKTBODY, (st), (free_func)) +#define sk_KRB5_TKTBODY_shift(st) SKM_sk_shift(KRB5_TKTBODY, (st)) +#define sk_KRB5_TKTBODY_pop(st) SKM_sk_pop(KRB5_TKTBODY, (st)) +#define sk_KRB5_TKTBODY_sort(st) SKM_sk_sort(KRB5_TKTBODY, (st)) +#define sk_KRB5_TKTBODY_is_sorted(st) SKM_sk_is_sorted(KRB5_TKTBODY, (st)) + +#define sk_MEM_OBJECT_DATA_new(cmp) SKM_sk_new(MEM_OBJECT_DATA, (cmp)) +#define sk_MEM_OBJECT_DATA_new_null() SKM_sk_new_null(MEM_OBJECT_DATA) +#define sk_MEM_OBJECT_DATA_free(st) SKM_sk_free(MEM_OBJECT_DATA, (st)) +#define sk_MEM_OBJECT_DATA_num(st) SKM_sk_num(MEM_OBJECT_DATA, (st)) +#define sk_MEM_OBJECT_DATA_value(st, i) SKM_sk_value(MEM_OBJECT_DATA, (st), (i)) +#define sk_MEM_OBJECT_DATA_set(st, i, val) SKM_sk_set(MEM_OBJECT_DATA, (st), (i), (val)) +#define sk_MEM_OBJECT_DATA_zero(st) SKM_sk_zero(MEM_OBJECT_DATA, (st)) +#define sk_MEM_OBJECT_DATA_push(st, val) SKM_sk_push(MEM_OBJECT_DATA, (st), (val)) +#define sk_MEM_OBJECT_DATA_unshift(st, val) SKM_sk_unshift(MEM_OBJECT_DATA, (st), (val)) +#define sk_MEM_OBJECT_DATA_find(st, val) SKM_sk_find(MEM_OBJECT_DATA, (st), (val)) +#define sk_MEM_OBJECT_DATA_find_ex(st, val) SKM_sk_find_ex(MEM_OBJECT_DATA, (st), (val)) +#define sk_MEM_OBJECT_DATA_delete(st, i) SKM_sk_delete(MEM_OBJECT_DATA, (st), (i)) +#define sk_MEM_OBJECT_DATA_delete_ptr(st, ptr) SKM_sk_delete_ptr(MEM_OBJECT_DATA, (st), (ptr)) +#define sk_MEM_OBJECT_DATA_insert(st, val, i) 
SKM_sk_insert(MEM_OBJECT_DATA, (st), (val), (i)) +#define sk_MEM_OBJECT_DATA_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(MEM_OBJECT_DATA, (st), (cmp)) +#define sk_MEM_OBJECT_DATA_dup(st) SKM_sk_dup(MEM_OBJECT_DATA, st) +#define sk_MEM_OBJECT_DATA_pop_free(st, free_func) SKM_sk_pop_free(MEM_OBJECT_DATA, (st), (free_func)) +#define sk_MEM_OBJECT_DATA_shift(st) SKM_sk_shift(MEM_OBJECT_DATA, (st)) +#define sk_MEM_OBJECT_DATA_pop(st) SKM_sk_pop(MEM_OBJECT_DATA, (st)) +#define sk_MEM_OBJECT_DATA_sort(st) SKM_sk_sort(MEM_OBJECT_DATA, (st)) +#define sk_MEM_OBJECT_DATA_is_sorted(st) SKM_sk_is_sorted(MEM_OBJECT_DATA, (st)) + +#define sk_MIME_HEADER_new(cmp) SKM_sk_new(MIME_HEADER, (cmp)) +#define sk_MIME_HEADER_new_null() SKM_sk_new_null(MIME_HEADER) +#define sk_MIME_HEADER_free(st) SKM_sk_free(MIME_HEADER, (st)) +#define sk_MIME_HEADER_num(st) SKM_sk_num(MIME_HEADER, (st)) +#define sk_MIME_HEADER_value(st, i) SKM_sk_value(MIME_HEADER, (st), (i)) +#define sk_MIME_HEADER_set(st, i, val) SKM_sk_set(MIME_HEADER, (st), (i), (val)) +#define sk_MIME_HEADER_zero(st) SKM_sk_zero(MIME_HEADER, (st)) +#define sk_MIME_HEADER_push(st, val) SKM_sk_push(MIME_HEADER, (st), (val)) +#define sk_MIME_HEADER_unshift(st, val) SKM_sk_unshift(MIME_HEADER, (st), (val)) +#define sk_MIME_HEADER_find(st, val) SKM_sk_find(MIME_HEADER, (st), (val)) +#define sk_MIME_HEADER_find_ex(st, val) SKM_sk_find_ex(MIME_HEADER, (st), (val)) +#define sk_MIME_HEADER_delete(st, i) SKM_sk_delete(MIME_HEADER, (st), (i)) +#define sk_MIME_HEADER_delete_ptr(st, ptr) SKM_sk_delete_ptr(MIME_HEADER, (st), (ptr)) +#define sk_MIME_HEADER_insert(st, val, i) SKM_sk_insert(MIME_HEADER, (st), (val), (i)) +#define sk_MIME_HEADER_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(MIME_HEADER, (st), (cmp)) +#define sk_MIME_HEADER_dup(st) SKM_sk_dup(MIME_HEADER, st) +#define sk_MIME_HEADER_pop_free(st, free_func) SKM_sk_pop_free(MIME_HEADER, (st), (free_func)) +#define sk_MIME_HEADER_shift(st) SKM_sk_shift(MIME_HEADER, (st)) +#define sk_MIME_HEADER_pop(st) SKM_sk_pop(MIME_HEADER, (st)) +#define sk_MIME_HEADER_sort(st) SKM_sk_sort(MIME_HEADER, (st)) +#define sk_MIME_HEADER_is_sorted(st) SKM_sk_is_sorted(MIME_HEADER, (st)) + +#define sk_MIME_PARAM_new(cmp) SKM_sk_new(MIME_PARAM, (cmp)) +#define sk_MIME_PARAM_new_null() SKM_sk_new_null(MIME_PARAM) +#define sk_MIME_PARAM_free(st) SKM_sk_free(MIME_PARAM, (st)) +#define sk_MIME_PARAM_num(st) SKM_sk_num(MIME_PARAM, (st)) +#define sk_MIME_PARAM_value(st, i) SKM_sk_value(MIME_PARAM, (st), (i)) +#define sk_MIME_PARAM_set(st, i, val) SKM_sk_set(MIME_PARAM, (st), (i), (val)) +#define sk_MIME_PARAM_zero(st) SKM_sk_zero(MIME_PARAM, (st)) +#define sk_MIME_PARAM_push(st, val) SKM_sk_push(MIME_PARAM, (st), (val)) +#define sk_MIME_PARAM_unshift(st, val) SKM_sk_unshift(MIME_PARAM, (st), (val)) +#define sk_MIME_PARAM_find(st, val) SKM_sk_find(MIME_PARAM, (st), (val)) +#define sk_MIME_PARAM_find_ex(st, val) SKM_sk_find_ex(MIME_PARAM, (st), (val)) +#define sk_MIME_PARAM_delete(st, i) SKM_sk_delete(MIME_PARAM, (st), (i)) +#define sk_MIME_PARAM_delete_ptr(st, ptr) SKM_sk_delete_ptr(MIME_PARAM, (st), (ptr)) +#define sk_MIME_PARAM_insert(st, val, i) SKM_sk_insert(MIME_PARAM, (st), (val), (i)) +#define sk_MIME_PARAM_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(MIME_PARAM, (st), (cmp)) +#define sk_MIME_PARAM_dup(st) SKM_sk_dup(MIME_PARAM, st) +#define sk_MIME_PARAM_pop_free(st, free_func) SKM_sk_pop_free(MIME_PARAM, (st), (free_func)) +#define sk_MIME_PARAM_shift(st) SKM_sk_shift(MIME_PARAM, (st)) +#define sk_MIME_PARAM_pop(st) SKM_sk_pop(MIME_PARAM, 
(st)) +#define sk_MIME_PARAM_sort(st) SKM_sk_sort(MIME_PARAM, (st)) +#define sk_MIME_PARAM_is_sorted(st) SKM_sk_is_sorted(MIME_PARAM, (st)) + +#define sk_NAME_FUNCS_new(cmp) SKM_sk_new(NAME_FUNCS, (cmp)) +#define sk_NAME_FUNCS_new_null() SKM_sk_new_null(NAME_FUNCS) +#define sk_NAME_FUNCS_free(st) SKM_sk_free(NAME_FUNCS, (st)) +#define sk_NAME_FUNCS_num(st) SKM_sk_num(NAME_FUNCS, (st)) +#define sk_NAME_FUNCS_value(st, i) SKM_sk_value(NAME_FUNCS, (st), (i)) +#define sk_NAME_FUNCS_set(st, i, val) SKM_sk_set(NAME_FUNCS, (st), (i), (val)) +#define sk_NAME_FUNCS_zero(st) SKM_sk_zero(NAME_FUNCS, (st)) +#define sk_NAME_FUNCS_push(st, val) SKM_sk_push(NAME_FUNCS, (st), (val)) +#define sk_NAME_FUNCS_unshift(st, val) SKM_sk_unshift(NAME_FUNCS, (st), (val)) +#define sk_NAME_FUNCS_find(st, val) SKM_sk_find(NAME_FUNCS, (st), (val)) +#define sk_NAME_FUNCS_find_ex(st, val) SKM_sk_find_ex(NAME_FUNCS, (st), (val)) +#define sk_NAME_FUNCS_delete(st, i) SKM_sk_delete(NAME_FUNCS, (st), (i)) +#define sk_NAME_FUNCS_delete_ptr(st, ptr) SKM_sk_delete_ptr(NAME_FUNCS, (st), (ptr)) +#define sk_NAME_FUNCS_insert(st, val, i) SKM_sk_insert(NAME_FUNCS, (st), (val), (i)) +#define sk_NAME_FUNCS_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(NAME_FUNCS, (st), (cmp)) +#define sk_NAME_FUNCS_dup(st) SKM_sk_dup(NAME_FUNCS, st) +#define sk_NAME_FUNCS_pop_free(st, free_func) SKM_sk_pop_free(NAME_FUNCS, (st), (free_func)) +#define sk_NAME_FUNCS_shift(st) SKM_sk_shift(NAME_FUNCS, (st)) +#define sk_NAME_FUNCS_pop(st) SKM_sk_pop(NAME_FUNCS, (st)) +#define sk_NAME_FUNCS_sort(st) SKM_sk_sort(NAME_FUNCS, (st)) +#define sk_NAME_FUNCS_is_sorted(st) SKM_sk_is_sorted(NAME_FUNCS, (st)) + +#define sk_OCSP_CERTID_new(cmp) SKM_sk_new(OCSP_CERTID, (cmp)) +#define sk_OCSP_CERTID_new_null() SKM_sk_new_null(OCSP_CERTID) +#define sk_OCSP_CERTID_free(st) SKM_sk_free(OCSP_CERTID, (st)) +#define sk_OCSP_CERTID_num(st) SKM_sk_num(OCSP_CERTID, (st)) +#define sk_OCSP_CERTID_value(st, i) SKM_sk_value(OCSP_CERTID, (st), (i)) +#define sk_OCSP_CERTID_set(st, i, val) SKM_sk_set(OCSP_CERTID, (st), (i), (val)) +#define sk_OCSP_CERTID_zero(st) SKM_sk_zero(OCSP_CERTID, (st)) +#define sk_OCSP_CERTID_push(st, val) SKM_sk_push(OCSP_CERTID, (st), (val)) +#define sk_OCSP_CERTID_unshift(st, val) SKM_sk_unshift(OCSP_CERTID, (st), (val)) +#define sk_OCSP_CERTID_find(st, val) SKM_sk_find(OCSP_CERTID, (st), (val)) +#define sk_OCSP_CERTID_find_ex(st, val) SKM_sk_find_ex(OCSP_CERTID, (st), (val)) +#define sk_OCSP_CERTID_delete(st, i) SKM_sk_delete(OCSP_CERTID, (st), (i)) +#define sk_OCSP_CERTID_delete_ptr(st, ptr) SKM_sk_delete_ptr(OCSP_CERTID, (st), (ptr)) +#define sk_OCSP_CERTID_insert(st, val, i) SKM_sk_insert(OCSP_CERTID, (st), (val), (i)) +#define sk_OCSP_CERTID_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(OCSP_CERTID, (st), (cmp)) +#define sk_OCSP_CERTID_dup(st) SKM_sk_dup(OCSP_CERTID, st) +#define sk_OCSP_CERTID_pop_free(st, free_func) SKM_sk_pop_free(OCSP_CERTID, (st), (free_func)) +#define sk_OCSP_CERTID_shift(st) SKM_sk_shift(OCSP_CERTID, (st)) +#define sk_OCSP_CERTID_pop(st) SKM_sk_pop(OCSP_CERTID, (st)) +#define sk_OCSP_CERTID_sort(st) SKM_sk_sort(OCSP_CERTID, (st)) +#define sk_OCSP_CERTID_is_sorted(st) SKM_sk_is_sorted(OCSP_CERTID, (st)) + +#define sk_OCSP_ONEREQ_new(cmp) SKM_sk_new(OCSP_ONEREQ, (cmp)) +#define sk_OCSP_ONEREQ_new_null() SKM_sk_new_null(OCSP_ONEREQ) +#define sk_OCSP_ONEREQ_free(st) SKM_sk_free(OCSP_ONEREQ, (st)) +#define sk_OCSP_ONEREQ_num(st) SKM_sk_num(OCSP_ONEREQ, (st)) +#define sk_OCSP_ONEREQ_value(st, i) SKM_sk_value(OCSP_ONEREQ, (st), (i)) +#define 
sk_OCSP_ONEREQ_set(st, i, val) SKM_sk_set(OCSP_ONEREQ, (st), (i), (val)) +#define sk_OCSP_ONEREQ_zero(st) SKM_sk_zero(OCSP_ONEREQ, (st)) +#define sk_OCSP_ONEREQ_push(st, val) SKM_sk_push(OCSP_ONEREQ, (st), (val)) +#define sk_OCSP_ONEREQ_unshift(st, val) SKM_sk_unshift(OCSP_ONEREQ, (st), (val)) +#define sk_OCSP_ONEREQ_find(st, val) SKM_sk_find(OCSP_ONEREQ, (st), (val)) +#define sk_OCSP_ONEREQ_find_ex(st, val) SKM_sk_find_ex(OCSP_ONEREQ, (st), (val)) +#define sk_OCSP_ONEREQ_delete(st, i) SKM_sk_delete(OCSP_ONEREQ, (st), (i)) +#define sk_OCSP_ONEREQ_delete_ptr(st, ptr) SKM_sk_delete_ptr(OCSP_ONEREQ, (st), (ptr)) +#define sk_OCSP_ONEREQ_insert(st, val, i) SKM_sk_insert(OCSP_ONEREQ, (st), (val), (i)) +#define sk_OCSP_ONEREQ_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(OCSP_ONEREQ, (st), (cmp)) +#define sk_OCSP_ONEREQ_dup(st) SKM_sk_dup(OCSP_ONEREQ, st) +#define sk_OCSP_ONEREQ_pop_free(st, free_func) SKM_sk_pop_free(OCSP_ONEREQ, (st), (free_func)) +#define sk_OCSP_ONEREQ_shift(st) SKM_sk_shift(OCSP_ONEREQ, (st)) +#define sk_OCSP_ONEREQ_pop(st) SKM_sk_pop(OCSP_ONEREQ, (st)) +#define sk_OCSP_ONEREQ_sort(st) SKM_sk_sort(OCSP_ONEREQ, (st)) +#define sk_OCSP_ONEREQ_is_sorted(st) SKM_sk_is_sorted(OCSP_ONEREQ, (st)) + +#define sk_OCSP_RESPID_new(cmp) SKM_sk_new(OCSP_RESPID, (cmp)) +#define sk_OCSP_RESPID_new_null() SKM_sk_new_null(OCSP_RESPID) +#define sk_OCSP_RESPID_free(st) SKM_sk_free(OCSP_RESPID, (st)) +#define sk_OCSP_RESPID_num(st) SKM_sk_num(OCSP_RESPID, (st)) +#define sk_OCSP_RESPID_value(st, i) SKM_sk_value(OCSP_RESPID, (st), (i)) +#define sk_OCSP_RESPID_set(st, i, val) SKM_sk_set(OCSP_RESPID, (st), (i), (val)) +#define sk_OCSP_RESPID_zero(st) SKM_sk_zero(OCSP_RESPID, (st)) +#define sk_OCSP_RESPID_push(st, val) SKM_sk_push(OCSP_RESPID, (st), (val)) +#define sk_OCSP_RESPID_unshift(st, val) SKM_sk_unshift(OCSP_RESPID, (st), (val)) +#define sk_OCSP_RESPID_find(st, val) SKM_sk_find(OCSP_RESPID, (st), (val)) +#define sk_OCSP_RESPID_find_ex(st, val) SKM_sk_find_ex(OCSP_RESPID, (st), (val)) +#define sk_OCSP_RESPID_delete(st, i) SKM_sk_delete(OCSP_RESPID, (st), (i)) +#define sk_OCSP_RESPID_delete_ptr(st, ptr) SKM_sk_delete_ptr(OCSP_RESPID, (st), (ptr)) +#define sk_OCSP_RESPID_insert(st, val, i) SKM_sk_insert(OCSP_RESPID, (st), (val), (i)) +#define sk_OCSP_RESPID_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(OCSP_RESPID, (st), (cmp)) +#define sk_OCSP_RESPID_dup(st) SKM_sk_dup(OCSP_RESPID, st) +#define sk_OCSP_RESPID_pop_free(st, free_func) SKM_sk_pop_free(OCSP_RESPID, (st), (free_func)) +#define sk_OCSP_RESPID_shift(st) SKM_sk_shift(OCSP_RESPID, (st)) +#define sk_OCSP_RESPID_pop(st) SKM_sk_pop(OCSP_RESPID, (st)) +#define sk_OCSP_RESPID_sort(st) SKM_sk_sort(OCSP_RESPID, (st)) +#define sk_OCSP_RESPID_is_sorted(st) SKM_sk_is_sorted(OCSP_RESPID, (st)) + +#define sk_OCSP_SINGLERESP_new(cmp) SKM_sk_new(OCSP_SINGLERESP, (cmp)) +#define sk_OCSP_SINGLERESP_new_null() SKM_sk_new_null(OCSP_SINGLERESP) +#define sk_OCSP_SINGLERESP_free(st) SKM_sk_free(OCSP_SINGLERESP, (st)) +#define sk_OCSP_SINGLERESP_num(st) SKM_sk_num(OCSP_SINGLERESP, (st)) +#define sk_OCSP_SINGLERESP_value(st, i) SKM_sk_value(OCSP_SINGLERESP, (st), (i)) +#define sk_OCSP_SINGLERESP_set(st, i, val) SKM_sk_set(OCSP_SINGLERESP, (st), (i), (val)) +#define sk_OCSP_SINGLERESP_zero(st) SKM_sk_zero(OCSP_SINGLERESP, (st)) +#define sk_OCSP_SINGLERESP_push(st, val) SKM_sk_push(OCSP_SINGLERESP, (st), (val)) +#define sk_OCSP_SINGLERESP_unshift(st, val) SKM_sk_unshift(OCSP_SINGLERESP, (st), (val)) +#define sk_OCSP_SINGLERESP_find(st, val) SKM_sk_find(OCSP_SINGLERESP, 
(st), (val)) +#define sk_OCSP_SINGLERESP_find_ex(st, val) SKM_sk_find_ex(OCSP_SINGLERESP, (st), (val)) +#define sk_OCSP_SINGLERESP_delete(st, i) SKM_sk_delete(OCSP_SINGLERESP, (st), (i)) +#define sk_OCSP_SINGLERESP_delete_ptr(st, ptr) SKM_sk_delete_ptr(OCSP_SINGLERESP, (st), (ptr)) +#define sk_OCSP_SINGLERESP_insert(st, val, i) SKM_sk_insert(OCSP_SINGLERESP, (st), (val), (i)) +#define sk_OCSP_SINGLERESP_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(OCSP_SINGLERESP, (st), (cmp)) +#define sk_OCSP_SINGLERESP_dup(st) SKM_sk_dup(OCSP_SINGLERESP, st) +#define sk_OCSP_SINGLERESP_pop_free(st, free_func) SKM_sk_pop_free(OCSP_SINGLERESP, (st), (free_func)) +#define sk_OCSP_SINGLERESP_shift(st) SKM_sk_shift(OCSP_SINGLERESP, (st)) +#define sk_OCSP_SINGLERESP_pop(st) SKM_sk_pop(OCSP_SINGLERESP, (st)) +#define sk_OCSP_SINGLERESP_sort(st) SKM_sk_sort(OCSP_SINGLERESP, (st)) +#define sk_OCSP_SINGLERESP_is_sorted(st) SKM_sk_is_sorted(OCSP_SINGLERESP, (st)) + +#define sk_PKCS12_SAFEBAG_new(cmp) SKM_sk_new(PKCS12_SAFEBAG, (cmp)) +#define sk_PKCS12_SAFEBAG_new_null() SKM_sk_new_null(PKCS12_SAFEBAG) +#define sk_PKCS12_SAFEBAG_free(st) SKM_sk_free(PKCS12_SAFEBAG, (st)) +#define sk_PKCS12_SAFEBAG_num(st) SKM_sk_num(PKCS12_SAFEBAG, (st)) +#define sk_PKCS12_SAFEBAG_value(st, i) SKM_sk_value(PKCS12_SAFEBAG, (st), (i)) +#define sk_PKCS12_SAFEBAG_set(st, i, val) SKM_sk_set(PKCS12_SAFEBAG, (st), (i), (val)) +#define sk_PKCS12_SAFEBAG_zero(st) SKM_sk_zero(PKCS12_SAFEBAG, (st)) +#define sk_PKCS12_SAFEBAG_push(st, val) SKM_sk_push(PKCS12_SAFEBAG, (st), (val)) +#define sk_PKCS12_SAFEBAG_unshift(st, val) SKM_sk_unshift(PKCS12_SAFEBAG, (st), (val)) +#define sk_PKCS12_SAFEBAG_find(st, val) SKM_sk_find(PKCS12_SAFEBAG, (st), (val)) +#define sk_PKCS12_SAFEBAG_find_ex(st, val) SKM_sk_find_ex(PKCS12_SAFEBAG, (st), (val)) +#define sk_PKCS12_SAFEBAG_delete(st, i) SKM_sk_delete(PKCS12_SAFEBAG, (st), (i)) +#define sk_PKCS12_SAFEBAG_delete_ptr(st, ptr) SKM_sk_delete_ptr(PKCS12_SAFEBAG, (st), (ptr)) +#define sk_PKCS12_SAFEBAG_insert(st, val, i) SKM_sk_insert(PKCS12_SAFEBAG, (st), (val), (i)) +#define sk_PKCS12_SAFEBAG_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(PKCS12_SAFEBAG, (st), (cmp)) +#define sk_PKCS12_SAFEBAG_dup(st) SKM_sk_dup(PKCS12_SAFEBAG, st) +#define sk_PKCS12_SAFEBAG_pop_free(st, free_func) SKM_sk_pop_free(PKCS12_SAFEBAG, (st), (free_func)) +#define sk_PKCS12_SAFEBAG_shift(st) SKM_sk_shift(PKCS12_SAFEBAG, (st)) +#define sk_PKCS12_SAFEBAG_pop(st) SKM_sk_pop(PKCS12_SAFEBAG, (st)) +#define sk_PKCS12_SAFEBAG_sort(st) SKM_sk_sort(PKCS12_SAFEBAG, (st)) +#define sk_PKCS12_SAFEBAG_is_sorted(st) SKM_sk_is_sorted(PKCS12_SAFEBAG, (st)) + +#define sk_PKCS7_new(cmp) SKM_sk_new(PKCS7, (cmp)) +#define sk_PKCS7_new_null() SKM_sk_new_null(PKCS7) +#define sk_PKCS7_free(st) SKM_sk_free(PKCS7, (st)) +#define sk_PKCS7_num(st) SKM_sk_num(PKCS7, (st)) +#define sk_PKCS7_value(st, i) SKM_sk_value(PKCS7, (st), (i)) +#define sk_PKCS7_set(st, i, val) SKM_sk_set(PKCS7, (st), (i), (val)) +#define sk_PKCS7_zero(st) SKM_sk_zero(PKCS7, (st)) +#define sk_PKCS7_push(st, val) SKM_sk_push(PKCS7, (st), (val)) +#define sk_PKCS7_unshift(st, val) SKM_sk_unshift(PKCS7, (st), (val)) +#define sk_PKCS7_find(st, val) SKM_sk_find(PKCS7, (st), (val)) +#define sk_PKCS7_find_ex(st, val) SKM_sk_find_ex(PKCS7, (st), (val)) +#define sk_PKCS7_delete(st, i) SKM_sk_delete(PKCS7, (st), (i)) +#define sk_PKCS7_delete_ptr(st, ptr) SKM_sk_delete_ptr(PKCS7, (st), (ptr)) +#define sk_PKCS7_insert(st, val, i) SKM_sk_insert(PKCS7, (st), (val), (i)) +#define sk_PKCS7_set_cmp_func(st, cmp) 
SKM_sk_set_cmp_func(PKCS7, (st), (cmp)) +#define sk_PKCS7_dup(st) SKM_sk_dup(PKCS7, st) +#define sk_PKCS7_pop_free(st, free_func) SKM_sk_pop_free(PKCS7, (st), (free_func)) +#define sk_PKCS7_shift(st) SKM_sk_shift(PKCS7, (st)) +#define sk_PKCS7_pop(st) SKM_sk_pop(PKCS7, (st)) +#define sk_PKCS7_sort(st) SKM_sk_sort(PKCS7, (st)) +#define sk_PKCS7_is_sorted(st) SKM_sk_is_sorted(PKCS7, (st)) + +#define sk_PKCS7_RECIP_INFO_new(cmp) SKM_sk_new(PKCS7_RECIP_INFO, (cmp)) +#define sk_PKCS7_RECIP_INFO_new_null() SKM_sk_new_null(PKCS7_RECIP_INFO) +#define sk_PKCS7_RECIP_INFO_free(st) SKM_sk_free(PKCS7_RECIP_INFO, (st)) +#define sk_PKCS7_RECIP_INFO_num(st) SKM_sk_num(PKCS7_RECIP_INFO, (st)) +#define sk_PKCS7_RECIP_INFO_value(st, i) SKM_sk_value(PKCS7_RECIP_INFO, (st), (i)) +#define sk_PKCS7_RECIP_INFO_set(st, i, val) SKM_sk_set(PKCS7_RECIP_INFO, (st), (i), (val)) +#define sk_PKCS7_RECIP_INFO_zero(st) SKM_sk_zero(PKCS7_RECIP_INFO, (st)) +#define sk_PKCS7_RECIP_INFO_push(st, val) SKM_sk_push(PKCS7_RECIP_INFO, (st), (val)) +#define sk_PKCS7_RECIP_INFO_unshift(st, val) SKM_sk_unshift(PKCS7_RECIP_INFO, (st), (val)) +#define sk_PKCS7_RECIP_INFO_find(st, val) SKM_sk_find(PKCS7_RECIP_INFO, (st), (val)) +#define sk_PKCS7_RECIP_INFO_find_ex(st, val) SKM_sk_find_ex(PKCS7_RECIP_INFO, (st), (val)) +#define sk_PKCS7_RECIP_INFO_delete(st, i) SKM_sk_delete(PKCS7_RECIP_INFO, (st), (i)) +#define sk_PKCS7_RECIP_INFO_delete_ptr(st, ptr) SKM_sk_delete_ptr(PKCS7_RECIP_INFO, (st), (ptr)) +#define sk_PKCS7_RECIP_INFO_insert(st, val, i) SKM_sk_insert(PKCS7_RECIP_INFO, (st), (val), (i)) +#define sk_PKCS7_RECIP_INFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(PKCS7_RECIP_INFO, (st), (cmp)) +#define sk_PKCS7_RECIP_INFO_dup(st) SKM_sk_dup(PKCS7_RECIP_INFO, st) +#define sk_PKCS7_RECIP_INFO_pop_free(st, free_func) SKM_sk_pop_free(PKCS7_RECIP_INFO, (st), (free_func)) +#define sk_PKCS7_RECIP_INFO_shift(st) SKM_sk_shift(PKCS7_RECIP_INFO, (st)) +#define sk_PKCS7_RECIP_INFO_pop(st) SKM_sk_pop(PKCS7_RECIP_INFO, (st)) +#define sk_PKCS7_RECIP_INFO_sort(st) SKM_sk_sort(PKCS7_RECIP_INFO, (st)) +#define sk_PKCS7_RECIP_INFO_is_sorted(st) SKM_sk_is_sorted(PKCS7_RECIP_INFO, (st)) + +#define sk_PKCS7_SIGNER_INFO_new(cmp) SKM_sk_new(PKCS7_SIGNER_INFO, (cmp)) +#define sk_PKCS7_SIGNER_INFO_new_null() SKM_sk_new_null(PKCS7_SIGNER_INFO) +#define sk_PKCS7_SIGNER_INFO_free(st) SKM_sk_free(PKCS7_SIGNER_INFO, (st)) +#define sk_PKCS7_SIGNER_INFO_num(st) SKM_sk_num(PKCS7_SIGNER_INFO, (st)) +#define sk_PKCS7_SIGNER_INFO_value(st, i) SKM_sk_value(PKCS7_SIGNER_INFO, (st), (i)) +#define sk_PKCS7_SIGNER_INFO_set(st, i, val) SKM_sk_set(PKCS7_SIGNER_INFO, (st), (i), (val)) +#define sk_PKCS7_SIGNER_INFO_zero(st) SKM_sk_zero(PKCS7_SIGNER_INFO, (st)) +#define sk_PKCS7_SIGNER_INFO_push(st, val) SKM_sk_push(PKCS7_SIGNER_INFO, (st), (val)) +#define sk_PKCS7_SIGNER_INFO_unshift(st, val) SKM_sk_unshift(PKCS7_SIGNER_INFO, (st), (val)) +#define sk_PKCS7_SIGNER_INFO_find(st, val) SKM_sk_find(PKCS7_SIGNER_INFO, (st), (val)) +#define sk_PKCS7_SIGNER_INFO_find_ex(st, val) SKM_sk_find_ex(PKCS7_SIGNER_INFO, (st), (val)) +#define sk_PKCS7_SIGNER_INFO_delete(st, i) SKM_sk_delete(PKCS7_SIGNER_INFO, (st), (i)) +#define sk_PKCS7_SIGNER_INFO_delete_ptr(st, ptr) SKM_sk_delete_ptr(PKCS7_SIGNER_INFO, (st), (ptr)) +#define sk_PKCS7_SIGNER_INFO_insert(st, val, i) SKM_sk_insert(PKCS7_SIGNER_INFO, (st), (val), (i)) +#define sk_PKCS7_SIGNER_INFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(PKCS7_SIGNER_INFO, (st), (cmp)) +#define sk_PKCS7_SIGNER_INFO_dup(st) SKM_sk_dup(PKCS7_SIGNER_INFO, st) +#define 
sk_PKCS7_SIGNER_INFO_pop_free(st, free_func) SKM_sk_pop_free(PKCS7_SIGNER_INFO, (st), (free_func)) +#define sk_PKCS7_SIGNER_INFO_shift(st) SKM_sk_shift(PKCS7_SIGNER_INFO, (st)) +#define sk_PKCS7_SIGNER_INFO_pop(st) SKM_sk_pop(PKCS7_SIGNER_INFO, (st)) +#define sk_PKCS7_SIGNER_INFO_sort(st) SKM_sk_sort(PKCS7_SIGNER_INFO, (st)) +#define sk_PKCS7_SIGNER_INFO_is_sorted(st) SKM_sk_is_sorted(PKCS7_SIGNER_INFO, (st)) + +#define sk_POLICYINFO_new(cmp) SKM_sk_new(POLICYINFO, (cmp)) +#define sk_POLICYINFO_new_null() SKM_sk_new_null(POLICYINFO) +#define sk_POLICYINFO_free(st) SKM_sk_free(POLICYINFO, (st)) +#define sk_POLICYINFO_num(st) SKM_sk_num(POLICYINFO, (st)) +#define sk_POLICYINFO_value(st, i) SKM_sk_value(POLICYINFO, (st), (i)) +#define sk_POLICYINFO_set(st, i, val) SKM_sk_set(POLICYINFO, (st), (i), (val)) +#define sk_POLICYINFO_zero(st) SKM_sk_zero(POLICYINFO, (st)) +#define sk_POLICYINFO_push(st, val) SKM_sk_push(POLICYINFO, (st), (val)) +#define sk_POLICYINFO_unshift(st, val) SKM_sk_unshift(POLICYINFO, (st), (val)) +#define sk_POLICYINFO_find(st, val) SKM_sk_find(POLICYINFO, (st), (val)) +#define sk_POLICYINFO_find_ex(st, val) SKM_sk_find_ex(POLICYINFO, (st), (val)) +#define sk_POLICYINFO_delete(st, i) SKM_sk_delete(POLICYINFO, (st), (i)) +#define sk_POLICYINFO_delete_ptr(st, ptr) SKM_sk_delete_ptr(POLICYINFO, (st), (ptr)) +#define sk_POLICYINFO_insert(st, val, i) SKM_sk_insert(POLICYINFO, (st), (val), (i)) +#define sk_POLICYINFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(POLICYINFO, (st), (cmp)) +#define sk_POLICYINFO_dup(st) SKM_sk_dup(POLICYINFO, st) +#define sk_POLICYINFO_pop_free(st, free_func) SKM_sk_pop_free(POLICYINFO, (st), (free_func)) +#define sk_POLICYINFO_shift(st) SKM_sk_shift(POLICYINFO, (st)) +#define sk_POLICYINFO_pop(st) SKM_sk_pop(POLICYINFO, (st)) +#define sk_POLICYINFO_sort(st) SKM_sk_sort(POLICYINFO, (st)) +#define sk_POLICYINFO_is_sorted(st) SKM_sk_is_sorted(POLICYINFO, (st)) + +#define sk_POLICYQUALINFO_new(cmp) SKM_sk_new(POLICYQUALINFO, (cmp)) +#define sk_POLICYQUALINFO_new_null() SKM_sk_new_null(POLICYQUALINFO) +#define sk_POLICYQUALINFO_free(st) SKM_sk_free(POLICYQUALINFO, (st)) +#define sk_POLICYQUALINFO_num(st) SKM_sk_num(POLICYQUALINFO, (st)) +#define sk_POLICYQUALINFO_value(st, i) SKM_sk_value(POLICYQUALINFO, (st), (i)) +#define sk_POLICYQUALINFO_set(st, i, val) SKM_sk_set(POLICYQUALINFO, (st), (i), (val)) +#define sk_POLICYQUALINFO_zero(st) SKM_sk_zero(POLICYQUALINFO, (st)) +#define sk_POLICYQUALINFO_push(st, val) SKM_sk_push(POLICYQUALINFO, (st), (val)) +#define sk_POLICYQUALINFO_unshift(st, val) SKM_sk_unshift(POLICYQUALINFO, (st), (val)) +#define sk_POLICYQUALINFO_find(st, val) SKM_sk_find(POLICYQUALINFO, (st), (val)) +#define sk_POLICYQUALINFO_find_ex(st, val) SKM_sk_find_ex(POLICYQUALINFO, (st), (val)) +#define sk_POLICYQUALINFO_delete(st, i) SKM_sk_delete(POLICYQUALINFO, (st), (i)) +#define sk_POLICYQUALINFO_delete_ptr(st, ptr) SKM_sk_delete_ptr(POLICYQUALINFO, (st), (ptr)) +#define sk_POLICYQUALINFO_insert(st, val, i) SKM_sk_insert(POLICYQUALINFO, (st), (val), (i)) +#define sk_POLICYQUALINFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(POLICYQUALINFO, (st), (cmp)) +#define sk_POLICYQUALINFO_dup(st) SKM_sk_dup(POLICYQUALINFO, st) +#define sk_POLICYQUALINFO_pop_free(st, free_func) SKM_sk_pop_free(POLICYQUALINFO, (st), (free_func)) +#define sk_POLICYQUALINFO_shift(st) SKM_sk_shift(POLICYQUALINFO, (st)) +#define sk_POLICYQUALINFO_pop(st) SKM_sk_pop(POLICYQUALINFO, (st)) +#define sk_POLICYQUALINFO_sort(st) SKM_sk_sort(POLICYQUALINFO, (st)) +#define 
sk_POLICYQUALINFO_is_sorted(st) SKM_sk_is_sorted(POLICYQUALINFO, (st)) + +#define sk_POLICY_MAPPING_new(cmp) SKM_sk_new(POLICY_MAPPING, (cmp)) +#define sk_POLICY_MAPPING_new_null() SKM_sk_new_null(POLICY_MAPPING) +#define sk_POLICY_MAPPING_free(st) SKM_sk_free(POLICY_MAPPING, (st)) +#define sk_POLICY_MAPPING_num(st) SKM_sk_num(POLICY_MAPPING, (st)) +#define sk_POLICY_MAPPING_value(st, i) SKM_sk_value(POLICY_MAPPING, (st), (i)) +#define sk_POLICY_MAPPING_set(st, i, val) SKM_sk_set(POLICY_MAPPING, (st), (i), (val)) +#define sk_POLICY_MAPPING_zero(st) SKM_sk_zero(POLICY_MAPPING, (st)) +#define sk_POLICY_MAPPING_push(st, val) SKM_sk_push(POLICY_MAPPING, (st), (val)) +#define sk_POLICY_MAPPING_unshift(st, val) SKM_sk_unshift(POLICY_MAPPING, (st), (val)) +#define sk_POLICY_MAPPING_find(st, val) SKM_sk_find(POLICY_MAPPING, (st), (val)) +#define sk_POLICY_MAPPING_find_ex(st, val) SKM_sk_find_ex(POLICY_MAPPING, (st), (val)) +#define sk_POLICY_MAPPING_delete(st, i) SKM_sk_delete(POLICY_MAPPING, (st), (i)) +#define sk_POLICY_MAPPING_delete_ptr(st, ptr) SKM_sk_delete_ptr(POLICY_MAPPING, (st), (ptr)) +#define sk_POLICY_MAPPING_insert(st, val, i) SKM_sk_insert(POLICY_MAPPING, (st), (val), (i)) +#define sk_POLICY_MAPPING_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(POLICY_MAPPING, (st), (cmp)) +#define sk_POLICY_MAPPING_dup(st) SKM_sk_dup(POLICY_MAPPING, st) +#define sk_POLICY_MAPPING_pop_free(st, free_func) SKM_sk_pop_free(POLICY_MAPPING, (st), (free_func)) +#define sk_POLICY_MAPPING_shift(st) SKM_sk_shift(POLICY_MAPPING, (st)) +#define sk_POLICY_MAPPING_pop(st) SKM_sk_pop(POLICY_MAPPING, (st)) +#define sk_POLICY_MAPPING_sort(st) SKM_sk_sort(POLICY_MAPPING, (st)) +#define sk_POLICY_MAPPING_is_sorted(st) SKM_sk_is_sorted(POLICY_MAPPING, (st)) + +#define sk_SRP_gN_new(cmp) SKM_sk_new(SRP_gN, (cmp)) +#define sk_SRP_gN_new_null() SKM_sk_new_null(SRP_gN) +#define sk_SRP_gN_free(st) SKM_sk_free(SRP_gN, (st)) +#define sk_SRP_gN_num(st) SKM_sk_num(SRP_gN, (st)) +#define sk_SRP_gN_value(st, i) SKM_sk_value(SRP_gN, (st), (i)) +#define sk_SRP_gN_set(st, i, val) SKM_sk_set(SRP_gN, (st), (i), (val)) +#define sk_SRP_gN_zero(st) SKM_sk_zero(SRP_gN, (st)) +#define sk_SRP_gN_push(st, val) SKM_sk_push(SRP_gN, (st), (val)) +#define sk_SRP_gN_unshift(st, val) SKM_sk_unshift(SRP_gN, (st), (val)) +#define sk_SRP_gN_find(st, val) SKM_sk_find(SRP_gN, (st), (val)) +#define sk_SRP_gN_find_ex(st, val) SKM_sk_find_ex(SRP_gN, (st), (val)) +#define sk_SRP_gN_delete(st, i) SKM_sk_delete(SRP_gN, (st), (i)) +#define sk_SRP_gN_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN, (st), (ptr)) +#define sk_SRP_gN_insert(st, val, i) SKM_sk_insert(SRP_gN, (st), (val), (i)) +#define sk_SRP_gN_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN, (st), (cmp)) +#define sk_SRP_gN_dup(st) SKM_sk_dup(SRP_gN, st) +#define sk_SRP_gN_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN, (st), (free_func)) +#define sk_SRP_gN_shift(st) SKM_sk_shift(SRP_gN, (st)) +#define sk_SRP_gN_pop(st) SKM_sk_pop(SRP_gN, (st)) +#define sk_SRP_gN_sort(st) SKM_sk_sort(SRP_gN, (st)) +#define sk_SRP_gN_is_sorted(st) SKM_sk_is_sorted(SRP_gN, (st)) + +#define sk_SRP_gN_cache_new(cmp) SKM_sk_new(SRP_gN_cache, (cmp)) +#define sk_SRP_gN_cache_new_null() SKM_sk_new_null(SRP_gN_cache) +#define sk_SRP_gN_cache_free(st) SKM_sk_free(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_num(st) SKM_sk_num(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_value(st, i) SKM_sk_value(SRP_gN_cache, (st), (i)) +#define sk_SRP_gN_cache_set(st, i, val) SKM_sk_set(SRP_gN_cache, (st), (i), (val)) +#define 
sk_SRP_gN_cache_zero(st) SKM_sk_zero(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_push(st, val) SKM_sk_push(SRP_gN_cache, (st), (val)) +#define sk_SRP_gN_cache_unshift(st, val) SKM_sk_unshift(SRP_gN_cache, (st), (val)) +#define sk_SRP_gN_cache_find(st, val) SKM_sk_find(SRP_gN_cache, (st), (val)) +#define sk_SRP_gN_cache_find_ex(st, val) SKM_sk_find_ex(SRP_gN_cache, (st), (val)) +#define sk_SRP_gN_cache_delete(st, i) SKM_sk_delete(SRP_gN_cache, (st), (i)) +#define sk_SRP_gN_cache_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN_cache, (st), (ptr)) +#define sk_SRP_gN_cache_insert(st, val, i) SKM_sk_insert(SRP_gN_cache, (st), (val), (i)) +#define sk_SRP_gN_cache_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN_cache, (st), (cmp)) +#define sk_SRP_gN_cache_dup(st) SKM_sk_dup(SRP_gN_cache, st) +#define sk_SRP_gN_cache_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN_cache, (st), (free_func)) +#define sk_SRP_gN_cache_shift(st) SKM_sk_shift(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_pop(st) SKM_sk_pop(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_sort(st) SKM_sk_sort(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_is_sorted(st) SKM_sk_is_sorted(SRP_gN_cache, (st)) + +#define sk_SRP_user_pwd_new(cmp) SKM_sk_new(SRP_user_pwd, (cmp)) +#define sk_SRP_user_pwd_new_null() SKM_sk_new_null(SRP_user_pwd) +#define sk_SRP_user_pwd_free(st) SKM_sk_free(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_num(st) SKM_sk_num(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_value(st, i) SKM_sk_value(SRP_user_pwd, (st), (i)) +#define sk_SRP_user_pwd_set(st, i, val) SKM_sk_set(SRP_user_pwd, (st), (i), (val)) +#define sk_SRP_user_pwd_zero(st) SKM_sk_zero(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_push(st, val) SKM_sk_push(SRP_user_pwd, (st), (val)) +#define sk_SRP_user_pwd_unshift(st, val) SKM_sk_unshift(SRP_user_pwd, (st), (val)) +#define sk_SRP_user_pwd_find(st, val) SKM_sk_find(SRP_user_pwd, (st), (val)) +#define sk_SRP_user_pwd_find_ex(st, val) SKM_sk_find_ex(SRP_user_pwd, (st), (val)) +#define sk_SRP_user_pwd_delete(st, i) SKM_sk_delete(SRP_user_pwd, (st), (i)) +#define sk_SRP_user_pwd_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_user_pwd, (st), (ptr)) +#define sk_SRP_user_pwd_insert(st, val, i) SKM_sk_insert(SRP_user_pwd, (st), (val), (i)) +#define sk_SRP_user_pwd_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_user_pwd, (st), (cmp)) +#define sk_SRP_user_pwd_dup(st) SKM_sk_dup(SRP_user_pwd, st) +#define sk_SRP_user_pwd_pop_free(st, free_func) SKM_sk_pop_free(SRP_user_pwd, (st), (free_func)) +#define sk_SRP_user_pwd_shift(st) SKM_sk_shift(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_pop(st) SKM_sk_pop(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_sort(st) SKM_sk_sort(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_is_sorted(st) SKM_sk_is_sorted(SRP_user_pwd, (st)) + +#define sk_SRTP_PROTECTION_PROFILE_new(cmp) SKM_sk_new(SRTP_PROTECTION_PROFILE, (cmp)) +#define sk_SRTP_PROTECTION_PROFILE_new_null() SKM_sk_new_null(SRTP_PROTECTION_PROFILE) +#define sk_SRTP_PROTECTION_PROFILE_free(st) SKM_sk_free(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_num(st) SKM_sk_num(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_value(st, i) SKM_sk_value(SRTP_PROTECTION_PROFILE, (st), (i)) +#define sk_SRTP_PROTECTION_PROFILE_set(st, i, val) SKM_sk_set(SRTP_PROTECTION_PROFILE, (st), (i), (val)) +#define sk_SRTP_PROTECTION_PROFILE_zero(st) SKM_sk_zero(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_push(st, val) SKM_sk_push(SRTP_PROTECTION_PROFILE, (st), (val)) +#define 
sk_SRTP_PROTECTION_PROFILE_unshift(st, val) SKM_sk_unshift(SRTP_PROTECTION_PROFILE, (st), (val)) +#define sk_SRTP_PROTECTION_PROFILE_find(st, val) SKM_sk_find(SRTP_PROTECTION_PROFILE, (st), (val)) +#define sk_SRTP_PROTECTION_PROFILE_find_ex(st, val) SKM_sk_find_ex(SRTP_PROTECTION_PROFILE, (st), (val)) +#define sk_SRTP_PROTECTION_PROFILE_delete(st, i) SKM_sk_delete(SRTP_PROTECTION_PROFILE, (st), (i)) +#define sk_SRTP_PROTECTION_PROFILE_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRTP_PROTECTION_PROFILE, (st), (ptr)) +#define sk_SRTP_PROTECTION_PROFILE_insert(st, val, i) SKM_sk_insert(SRTP_PROTECTION_PROFILE, (st), (val), (i)) +#define sk_SRTP_PROTECTION_PROFILE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRTP_PROTECTION_PROFILE, (st), (cmp)) +#define sk_SRTP_PROTECTION_PROFILE_dup(st) SKM_sk_dup(SRTP_PROTECTION_PROFILE, st) +#define sk_SRTP_PROTECTION_PROFILE_pop_free(st, free_func) SKM_sk_pop_free(SRTP_PROTECTION_PROFILE, (st), (free_func)) +#define sk_SRTP_PROTECTION_PROFILE_shift(st) SKM_sk_shift(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_pop(st) SKM_sk_pop(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_sort(st) SKM_sk_sort(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_is_sorted(st) SKM_sk_is_sorted(SRTP_PROTECTION_PROFILE, (st)) + +#define sk_SSL_CIPHER_new(cmp) SKM_sk_new(SSL_CIPHER, (cmp)) +#define sk_SSL_CIPHER_new_null() SKM_sk_new_null(SSL_CIPHER) +#define sk_SSL_CIPHER_free(st) SKM_sk_free(SSL_CIPHER, (st)) +#define sk_SSL_CIPHER_num(st) SKM_sk_num(SSL_CIPHER, (st)) +#define sk_SSL_CIPHER_value(st, i) SKM_sk_value(SSL_CIPHER, (st), (i)) +#define sk_SSL_CIPHER_set(st, i, val) SKM_sk_set(SSL_CIPHER, (st), (i), (val)) +#define sk_SSL_CIPHER_zero(st) SKM_sk_zero(SSL_CIPHER, (st)) +#define sk_SSL_CIPHER_push(st, val) SKM_sk_push(SSL_CIPHER, (st), (val)) +#define sk_SSL_CIPHER_unshift(st, val) SKM_sk_unshift(SSL_CIPHER, (st), (val)) +#define sk_SSL_CIPHER_find(st, val) SKM_sk_find(SSL_CIPHER, (st), (val)) +#define sk_SSL_CIPHER_find_ex(st, val) SKM_sk_find_ex(SSL_CIPHER, (st), (val)) +#define sk_SSL_CIPHER_delete(st, i) SKM_sk_delete(SSL_CIPHER, (st), (i)) +#define sk_SSL_CIPHER_delete_ptr(st, ptr) SKM_sk_delete_ptr(SSL_CIPHER, (st), (ptr)) +#define sk_SSL_CIPHER_insert(st, val, i) SKM_sk_insert(SSL_CIPHER, (st), (val), (i)) +#define sk_SSL_CIPHER_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SSL_CIPHER, (st), (cmp)) +#define sk_SSL_CIPHER_dup(st) SKM_sk_dup(SSL_CIPHER, st) +#define sk_SSL_CIPHER_pop_free(st, free_func) SKM_sk_pop_free(SSL_CIPHER, (st), (free_func)) +#define sk_SSL_CIPHER_shift(st) SKM_sk_shift(SSL_CIPHER, (st)) +#define sk_SSL_CIPHER_pop(st) SKM_sk_pop(SSL_CIPHER, (st)) +#define sk_SSL_CIPHER_sort(st) SKM_sk_sort(SSL_CIPHER, (st)) +#define sk_SSL_CIPHER_is_sorted(st) SKM_sk_is_sorted(SSL_CIPHER, (st)) + +#define sk_SSL_COMP_new(cmp) SKM_sk_new(SSL_COMP, (cmp)) +#define sk_SSL_COMP_new_null() SKM_sk_new_null(SSL_COMP) +#define sk_SSL_COMP_free(st) SKM_sk_free(SSL_COMP, (st)) +#define sk_SSL_COMP_num(st) SKM_sk_num(SSL_COMP, (st)) +#define sk_SSL_COMP_value(st, i) SKM_sk_value(SSL_COMP, (st), (i)) +#define sk_SSL_COMP_set(st, i, val) SKM_sk_set(SSL_COMP, (st), (i), (val)) +#define sk_SSL_COMP_zero(st) SKM_sk_zero(SSL_COMP, (st)) +#define sk_SSL_COMP_push(st, val) SKM_sk_push(SSL_COMP, (st), (val)) +#define sk_SSL_COMP_unshift(st, val) SKM_sk_unshift(SSL_COMP, (st), (val)) +#define sk_SSL_COMP_find(st, val) SKM_sk_find(SSL_COMP, (st), (val)) +#define sk_SSL_COMP_find_ex(st, val) SKM_sk_find_ex(SSL_COMP, (st), (val)) 
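+/*
+ * Editorial note (not part of the upstream OpenSSL safestack.h): every
+ * sk_<TYPE>_* macro in this block is a thin type-safe wrapper that forwards
+ * to the generic SKM_sk_* macros, which in turn cast down to the untyped
+ * sk_* STACK API. A minimal usage sketch for the SSL_CIPHER macros defined
+ * just above, assuming an SSL* `ssl` obtained elsewhere (SSL_get_ciphers()
+ * does return a STACK_OF(SSL_CIPHER)*):
+ *
+ *   STACK_OF(SSL_CIPHER) *ciphers = SSL_get_ciphers(ssl);
+ *   for (int i = 0; i < sk_SSL_CIPHER_num(ciphers); i++) {
+ *       const SSL_CIPHER *c = sk_SSL_CIPHER_value(ciphers, i);
+ *       printf("%s\n", SSL_CIPHER_get_name(c));
+ *   }
+ *
+ * The list returned by SSL_get_ciphers() is owned by the SSL object, so it
+ * is only read here, never freed.
+ */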
+#define sk_SSL_COMP_delete(st, i) SKM_sk_delete(SSL_COMP, (st), (i)) +#define sk_SSL_COMP_delete_ptr(st, ptr) SKM_sk_delete_ptr(SSL_COMP, (st), (ptr)) +#define sk_SSL_COMP_insert(st, val, i) SKM_sk_insert(SSL_COMP, (st), (val), (i)) +#define sk_SSL_COMP_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SSL_COMP, (st), (cmp)) +#define sk_SSL_COMP_dup(st) SKM_sk_dup(SSL_COMP, st) +#define sk_SSL_COMP_pop_free(st, free_func) SKM_sk_pop_free(SSL_COMP, (st), (free_func)) +#define sk_SSL_COMP_shift(st) SKM_sk_shift(SSL_COMP, (st)) +#define sk_SSL_COMP_pop(st) SKM_sk_pop(SSL_COMP, (st)) +#define sk_SSL_COMP_sort(st) SKM_sk_sort(SSL_COMP, (st)) +#define sk_SSL_COMP_is_sorted(st) SKM_sk_is_sorted(SSL_COMP, (st)) + +#define sk_STACK_OF_X509_NAME_ENTRY_new(cmp) SKM_sk_new(STACK_OF_X509_NAME_ENTRY, (cmp)) +#define sk_STACK_OF_X509_NAME_ENTRY_new_null() SKM_sk_new_null(STACK_OF_X509_NAME_ENTRY) +#define sk_STACK_OF_X509_NAME_ENTRY_free(st) SKM_sk_free(STACK_OF_X509_NAME_ENTRY, (st)) +#define sk_STACK_OF_X509_NAME_ENTRY_num(st) SKM_sk_num(STACK_OF_X509_NAME_ENTRY, (st)) +#define sk_STACK_OF_X509_NAME_ENTRY_value(st, i) SKM_sk_value(STACK_OF_X509_NAME_ENTRY, (st), (i)) +#define sk_STACK_OF_X509_NAME_ENTRY_set(st, i, val) SKM_sk_set(STACK_OF_X509_NAME_ENTRY, (st), (i), (val)) +#define sk_STACK_OF_X509_NAME_ENTRY_zero(st) SKM_sk_zero(STACK_OF_X509_NAME_ENTRY, (st)) +#define sk_STACK_OF_X509_NAME_ENTRY_push(st, val) SKM_sk_push(STACK_OF_X509_NAME_ENTRY, (st), (val)) +#define sk_STACK_OF_X509_NAME_ENTRY_unshift(st, val) SKM_sk_unshift(STACK_OF_X509_NAME_ENTRY, (st), (val)) +#define sk_STACK_OF_X509_NAME_ENTRY_find(st, val) SKM_sk_find(STACK_OF_X509_NAME_ENTRY, (st), (val)) +#define sk_STACK_OF_X509_NAME_ENTRY_find_ex(st, val) SKM_sk_find_ex(STACK_OF_X509_NAME_ENTRY, (st), (val)) +#define sk_STACK_OF_X509_NAME_ENTRY_delete(st, i) SKM_sk_delete(STACK_OF_X509_NAME_ENTRY, (st), (i)) +#define sk_STACK_OF_X509_NAME_ENTRY_delete_ptr(st, ptr) SKM_sk_delete_ptr(STACK_OF_X509_NAME_ENTRY, (st), (ptr)) +#define sk_STACK_OF_X509_NAME_ENTRY_insert(st, val, i) SKM_sk_insert(STACK_OF_X509_NAME_ENTRY, (st), (val), (i)) +#define sk_STACK_OF_X509_NAME_ENTRY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(STACK_OF_X509_NAME_ENTRY, (st), (cmp)) +#define sk_STACK_OF_X509_NAME_ENTRY_dup(st) SKM_sk_dup(STACK_OF_X509_NAME_ENTRY, st) +#define sk_STACK_OF_X509_NAME_ENTRY_pop_free(st, free_func) SKM_sk_pop_free(STACK_OF_X509_NAME_ENTRY, (st), (free_func)) +#define sk_STACK_OF_X509_NAME_ENTRY_shift(st) SKM_sk_shift(STACK_OF_X509_NAME_ENTRY, (st)) +#define sk_STACK_OF_X509_NAME_ENTRY_pop(st) SKM_sk_pop(STACK_OF_X509_NAME_ENTRY, (st)) +#define sk_STACK_OF_X509_NAME_ENTRY_sort(st) SKM_sk_sort(STACK_OF_X509_NAME_ENTRY, (st)) +#define sk_STACK_OF_X509_NAME_ENTRY_is_sorted(st) SKM_sk_is_sorted(STACK_OF_X509_NAME_ENTRY, (st)) + +#define sk_STORE_ATTR_INFO_new(cmp) SKM_sk_new(STORE_ATTR_INFO, (cmp)) +#define sk_STORE_ATTR_INFO_new_null() SKM_sk_new_null(STORE_ATTR_INFO) +#define sk_STORE_ATTR_INFO_free(st) SKM_sk_free(STORE_ATTR_INFO, (st)) +#define sk_STORE_ATTR_INFO_num(st) SKM_sk_num(STORE_ATTR_INFO, (st)) +#define sk_STORE_ATTR_INFO_value(st, i) SKM_sk_value(STORE_ATTR_INFO, (st), (i)) +#define sk_STORE_ATTR_INFO_set(st, i, val) SKM_sk_set(STORE_ATTR_INFO, (st), (i), (val)) +#define sk_STORE_ATTR_INFO_zero(st) SKM_sk_zero(STORE_ATTR_INFO, (st)) +#define sk_STORE_ATTR_INFO_push(st, val) SKM_sk_push(STORE_ATTR_INFO, (st), (val)) +#define sk_STORE_ATTR_INFO_unshift(st, val) SKM_sk_unshift(STORE_ATTR_INFO, (st), (val)) +#define sk_STORE_ATTR_INFO_find(st, 
val) SKM_sk_find(STORE_ATTR_INFO, (st), (val)) +#define sk_STORE_ATTR_INFO_find_ex(st, val) SKM_sk_find_ex(STORE_ATTR_INFO, (st), (val)) +#define sk_STORE_ATTR_INFO_delete(st, i) SKM_sk_delete(STORE_ATTR_INFO, (st), (i)) +#define sk_STORE_ATTR_INFO_delete_ptr(st, ptr) SKM_sk_delete_ptr(STORE_ATTR_INFO, (st), (ptr)) +#define sk_STORE_ATTR_INFO_insert(st, val, i) SKM_sk_insert(STORE_ATTR_INFO, (st), (val), (i)) +#define sk_STORE_ATTR_INFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(STORE_ATTR_INFO, (st), (cmp)) +#define sk_STORE_ATTR_INFO_dup(st) SKM_sk_dup(STORE_ATTR_INFO, st) +#define sk_STORE_ATTR_INFO_pop_free(st, free_func) SKM_sk_pop_free(STORE_ATTR_INFO, (st), (free_func)) +#define sk_STORE_ATTR_INFO_shift(st) SKM_sk_shift(STORE_ATTR_INFO, (st)) +#define sk_STORE_ATTR_INFO_pop(st) SKM_sk_pop(STORE_ATTR_INFO, (st)) +#define sk_STORE_ATTR_INFO_sort(st) SKM_sk_sort(STORE_ATTR_INFO, (st)) +#define sk_STORE_ATTR_INFO_is_sorted(st) SKM_sk_is_sorted(STORE_ATTR_INFO, (st)) + +#define sk_STORE_OBJECT_new(cmp) SKM_sk_new(STORE_OBJECT, (cmp)) +#define sk_STORE_OBJECT_new_null() SKM_sk_new_null(STORE_OBJECT) +#define sk_STORE_OBJECT_free(st) SKM_sk_free(STORE_OBJECT, (st)) +#define sk_STORE_OBJECT_num(st) SKM_sk_num(STORE_OBJECT, (st)) +#define sk_STORE_OBJECT_value(st, i) SKM_sk_value(STORE_OBJECT, (st), (i)) +#define sk_STORE_OBJECT_set(st, i, val) SKM_sk_set(STORE_OBJECT, (st), (i), (val)) +#define sk_STORE_OBJECT_zero(st) SKM_sk_zero(STORE_OBJECT, (st)) +#define sk_STORE_OBJECT_push(st, val) SKM_sk_push(STORE_OBJECT, (st), (val)) +#define sk_STORE_OBJECT_unshift(st, val) SKM_sk_unshift(STORE_OBJECT, (st), (val)) +#define sk_STORE_OBJECT_find(st, val) SKM_sk_find(STORE_OBJECT, (st), (val)) +#define sk_STORE_OBJECT_find_ex(st, val) SKM_sk_find_ex(STORE_OBJECT, (st), (val)) +#define sk_STORE_OBJECT_delete(st, i) SKM_sk_delete(STORE_OBJECT, (st), (i)) +#define sk_STORE_OBJECT_delete_ptr(st, ptr) SKM_sk_delete_ptr(STORE_OBJECT, (st), (ptr)) +#define sk_STORE_OBJECT_insert(st, val, i) SKM_sk_insert(STORE_OBJECT, (st), (val), (i)) +#define sk_STORE_OBJECT_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(STORE_OBJECT, (st), (cmp)) +#define sk_STORE_OBJECT_dup(st) SKM_sk_dup(STORE_OBJECT, st) +#define sk_STORE_OBJECT_pop_free(st, free_func) SKM_sk_pop_free(STORE_OBJECT, (st), (free_func)) +#define sk_STORE_OBJECT_shift(st) SKM_sk_shift(STORE_OBJECT, (st)) +#define sk_STORE_OBJECT_pop(st) SKM_sk_pop(STORE_OBJECT, (st)) +#define sk_STORE_OBJECT_sort(st) SKM_sk_sort(STORE_OBJECT, (st)) +#define sk_STORE_OBJECT_is_sorted(st) SKM_sk_is_sorted(STORE_OBJECT, (st)) + +#define sk_SXNETID_new(cmp) SKM_sk_new(SXNETID, (cmp)) +#define sk_SXNETID_new_null() SKM_sk_new_null(SXNETID) +#define sk_SXNETID_free(st) SKM_sk_free(SXNETID, (st)) +#define sk_SXNETID_num(st) SKM_sk_num(SXNETID, (st)) +#define sk_SXNETID_value(st, i) SKM_sk_value(SXNETID, (st), (i)) +#define sk_SXNETID_set(st, i, val) SKM_sk_set(SXNETID, (st), (i), (val)) +#define sk_SXNETID_zero(st) SKM_sk_zero(SXNETID, (st)) +#define sk_SXNETID_push(st, val) SKM_sk_push(SXNETID, (st), (val)) +#define sk_SXNETID_unshift(st, val) SKM_sk_unshift(SXNETID, (st), (val)) +#define sk_SXNETID_find(st, val) SKM_sk_find(SXNETID, (st), (val)) +#define sk_SXNETID_find_ex(st, val) SKM_sk_find_ex(SXNETID, (st), (val)) +#define sk_SXNETID_delete(st, i) SKM_sk_delete(SXNETID, (st), (i)) +#define sk_SXNETID_delete_ptr(st, ptr) SKM_sk_delete_ptr(SXNETID, (st), (ptr)) +#define sk_SXNETID_insert(st, val, i) SKM_sk_insert(SXNETID, (st), (val), (i)) +#define sk_SXNETID_set_cmp_func(st, 
cmp) SKM_sk_set_cmp_func(SXNETID, (st), (cmp)) +#define sk_SXNETID_dup(st) SKM_sk_dup(SXNETID, st) +#define sk_SXNETID_pop_free(st, free_func) SKM_sk_pop_free(SXNETID, (st), (free_func)) +#define sk_SXNETID_shift(st) SKM_sk_shift(SXNETID, (st)) +#define sk_SXNETID_pop(st) SKM_sk_pop(SXNETID, (st)) +#define sk_SXNETID_sort(st) SKM_sk_sort(SXNETID, (st)) +#define sk_SXNETID_is_sorted(st) SKM_sk_is_sorted(SXNETID, (st)) + +#define sk_UI_STRING_new(cmp) SKM_sk_new(UI_STRING, (cmp)) +#define sk_UI_STRING_new_null() SKM_sk_new_null(UI_STRING) +#define sk_UI_STRING_free(st) SKM_sk_free(UI_STRING, (st)) +#define sk_UI_STRING_num(st) SKM_sk_num(UI_STRING, (st)) +#define sk_UI_STRING_value(st, i) SKM_sk_value(UI_STRING, (st), (i)) +#define sk_UI_STRING_set(st, i, val) SKM_sk_set(UI_STRING, (st), (i), (val)) +#define sk_UI_STRING_zero(st) SKM_sk_zero(UI_STRING, (st)) +#define sk_UI_STRING_push(st, val) SKM_sk_push(UI_STRING, (st), (val)) +#define sk_UI_STRING_unshift(st, val) SKM_sk_unshift(UI_STRING, (st), (val)) +#define sk_UI_STRING_find(st, val) SKM_sk_find(UI_STRING, (st), (val)) +#define sk_UI_STRING_find_ex(st, val) SKM_sk_find_ex(UI_STRING, (st), (val)) +#define sk_UI_STRING_delete(st, i) SKM_sk_delete(UI_STRING, (st), (i)) +#define sk_UI_STRING_delete_ptr(st, ptr) SKM_sk_delete_ptr(UI_STRING, (st), (ptr)) +#define sk_UI_STRING_insert(st, val, i) SKM_sk_insert(UI_STRING, (st), (val), (i)) +#define sk_UI_STRING_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(UI_STRING, (st), (cmp)) +#define sk_UI_STRING_dup(st) SKM_sk_dup(UI_STRING, st) +#define sk_UI_STRING_pop_free(st, free_func) SKM_sk_pop_free(UI_STRING, (st), (free_func)) +#define sk_UI_STRING_shift(st) SKM_sk_shift(UI_STRING, (st)) +#define sk_UI_STRING_pop(st) SKM_sk_pop(UI_STRING, (st)) +#define sk_UI_STRING_sort(st) SKM_sk_sort(UI_STRING, (st)) +#define sk_UI_STRING_is_sorted(st) SKM_sk_is_sorted(UI_STRING, (st)) + +#define sk_X509_new(cmp) SKM_sk_new(X509, (cmp)) +#define sk_X509_new_null() SKM_sk_new_null(X509) +#define sk_X509_free(st) SKM_sk_free(X509, (st)) +#define sk_X509_num(st) SKM_sk_num(X509, (st)) +#define sk_X509_value(st, i) SKM_sk_value(X509, (st), (i)) +#define sk_X509_set(st, i, val) SKM_sk_set(X509, (st), (i), (val)) +#define sk_X509_zero(st) SKM_sk_zero(X509, (st)) +#define sk_X509_push(st, val) SKM_sk_push(X509, (st), (val)) +#define sk_X509_unshift(st, val) SKM_sk_unshift(X509, (st), (val)) +#define sk_X509_find(st, val) SKM_sk_find(X509, (st), (val)) +#define sk_X509_find_ex(st, val) SKM_sk_find_ex(X509, (st), (val)) +#define sk_X509_delete(st, i) SKM_sk_delete(X509, (st), (i)) +#define sk_X509_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509, (st), (ptr)) +#define sk_X509_insert(st, val, i) SKM_sk_insert(X509, (st), (val), (i)) +#define sk_X509_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509, (st), (cmp)) +#define sk_X509_dup(st) SKM_sk_dup(X509, st) +#define sk_X509_pop_free(st, free_func) SKM_sk_pop_free(X509, (st), (free_func)) +#define sk_X509_shift(st) SKM_sk_shift(X509, (st)) +#define sk_X509_pop(st) SKM_sk_pop(X509, (st)) +#define sk_X509_sort(st) SKM_sk_sort(X509, (st)) +#define sk_X509_is_sorted(st) SKM_sk_is_sorted(X509, (st)) + +#define sk_X509V3_EXT_METHOD_new(cmp) SKM_sk_new(X509V3_EXT_METHOD, (cmp)) +#define sk_X509V3_EXT_METHOD_new_null() SKM_sk_new_null(X509V3_EXT_METHOD) +#define sk_X509V3_EXT_METHOD_free(st) SKM_sk_free(X509V3_EXT_METHOD, (st)) +#define sk_X509V3_EXT_METHOD_num(st) SKM_sk_num(X509V3_EXT_METHOD, (st)) +#define sk_X509V3_EXT_METHOD_value(st, i) SKM_sk_value(X509V3_EXT_METHOD, (st), (i)) 
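+/*
+ * Editorial note (not part of the upstream OpenSSL safestack.h): the
+ * sk_X509_* macros defined just above are the ones most often seen in
+ * application code, e.g. when assembling an extra certificate chain. A
+ * minimal sketch, assuming `leaf_ca` is an X509* the caller already holds
+ * a reference to:
+ *
+ *   STACK_OF(X509) *chain = sk_X509_new_null();
+ *   if (chain == NULL || !sk_X509_push(chain, leaf_ca)) {
+ *       sk_X509_free(chain);              // frees the stack only
+ *       return 0;
+ *   }
+ *   ...
+ *   sk_X509_pop_free(chain, X509_free);   // frees stack and certificates
+ *
+ * sk_X509_free() releases just the container, while sk_X509_pop_free()
+ * also calls X509_free() on every element, so only one of the two should
+ * be used, depending on who owns the certificates.
+ */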
+#define sk_X509V3_EXT_METHOD_set(st, i, val) SKM_sk_set(X509V3_EXT_METHOD, (st), (i), (val)) +#define sk_X509V3_EXT_METHOD_zero(st) SKM_sk_zero(X509V3_EXT_METHOD, (st)) +#define sk_X509V3_EXT_METHOD_push(st, val) SKM_sk_push(X509V3_EXT_METHOD, (st), (val)) +#define sk_X509V3_EXT_METHOD_unshift(st, val) SKM_sk_unshift(X509V3_EXT_METHOD, (st), (val)) +#define sk_X509V3_EXT_METHOD_find(st, val) SKM_sk_find(X509V3_EXT_METHOD, (st), (val)) +#define sk_X509V3_EXT_METHOD_find_ex(st, val) SKM_sk_find_ex(X509V3_EXT_METHOD, (st), (val)) +#define sk_X509V3_EXT_METHOD_delete(st, i) SKM_sk_delete(X509V3_EXT_METHOD, (st), (i)) +#define sk_X509V3_EXT_METHOD_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509V3_EXT_METHOD, (st), (ptr)) +#define sk_X509V3_EXT_METHOD_insert(st, val, i) SKM_sk_insert(X509V3_EXT_METHOD, (st), (val), (i)) +#define sk_X509V3_EXT_METHOD_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509V3_EXT_METHOD, (st), (cmp)) +#define sk_X509V3_EXT_METHOD_dup(st) SKM_sk_dup(X509V3_EXT_METHOD, st) +#define sk_X509V3_EXT_METHOD_pop_free(st, free_func) SKM_sk_pop_free(X509V3_EXT_METHOD, (st), (free_func)) +#define sk_X509V3_EXT_METHOD_shift(st) SKM_sk_shift(X509V3_EXT_METHOD, (st)) +#define sk_X509V3_EXT_METHOD_pop(st) SKM_sk_pop(X509V3_EXT_METHOD, (st)) +#define sk_X509V3_EXT_METHOD_sort(st) SKM_sk_sort(X509V3_EXT_METHOD, (st)) +#define sk_X509V3_EXT_METHOD_is_sorted(st) SKM_sk_is_sorted(X509V3_EXT_METHOD, (st)) + +#define sk_X509_ALGOR_new(cmp) SKM_sk_new(X509_ALGOR, (cmp)) +#define sk_X509_ALGOR_new_null() SKM_sk_new_null(X509_ALGOR) +#define sk_X509_ALGOR_free(st) SKM_sk_free(X509_ALGOR, (st)) +#define sk_X509_ALGOR_num(st) SKM_sk_num(X509_ALGOR, (st)) +#define sk_X509_ALGOR_value(st, i) SKM_sk_value(X509_ALGOR, (st), (i)) +#define sk_X509_ALGOR_set(st, i, val) SKM_sk_set(X509_ALGOR, (st), (i), (val)) +#define sk_X509_ALGOR_zero(st) SKM_sk_zero(X509_ALGOR, (st)) +#define sk_X509_ALGOR_push(st, val) SKM_sk_push(X509_ALGOR, (st), (val)) +#define sk_X509_ALGOR_unshift(st, val) SKM_sk_unshift(X509_ALGOR, (st), (val)) +#define sk_X509_ALGOR_find(st, val) SKM_sk_find(X509_ALGOR, (st), (val)) +#define sk_X509_ALGOR_find_ex(st, val) SKM_sk_find_ex(X509_ALGOR, (st), (val)) +#define sk_X509_ALGOR_delete(st, i) SKM_sk_delete(X509_ALGOR, (st), (i)) +#define sk_X509_ALGOR_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_ALGOR, (st), (ptr)) +#define sk_X509_ALGOR_insert(st, val, i) SKM_sk_insert(X509_ALGOR, (st), (val), (i)) +#define sk_X509_ALGOR_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_ALGOR, (st), (cmp)) +#define sk_X509_ALGOR_dup(st) SKM_sk_dup(X509_ALGOR, st) +#define sk_X509_ALGOR_pop_free(st, free_func) SKM_sk_pop_free(X509_ALGOR, (st), (free_func)) +#define sk_X509_ALGOR_shift(st) SKM_sk_shift(X509_ALGOR, (st)) +#define sk_X509_ALGOR_pop(st) SKM_sk_pop(X509_ALGOR, (st)) +#define sk_X509_ALGOR_sort(st) SKM_sk_sort(X509_ALGOR, (st)) +#define sk_X509_ALGOR_is_sorted(st) SKM_sk_is_sorted(X509_ALGOR, (st)) + +#define sk_X509_ATTRIBUTE_new(cmp) SKM_sk_new(X509_ATTRIBUTE, (cmp)) +#define sk_X509_ATTRIBUTE_new_null() SKM_sk_new_null(X509_ATTRIBUTE) +#define sk_X509_ATTRIBUTE_free(st) SKM_sk_free(X509_ATTRIBUTE, (st)) +#define sk_X509_ATTRIBUTE_num(st) SKM_sk_num(X509_ATTRIBUTE, (st)) +#define sk_X509_ATTRIBUTE_value(st, i) SKM_sk_value(X509_ATTRIBUTE, (st), (i)) +#define sk_X509_ATTRIBUTE_set(st, i, val) SKM_sk_set(X509_ATTRIBUTE, (st), (i), (val)) +#define sk_X509_ATTRIBUTE_zero(st) SKM_sk_zero(X509_ATTRIBUTE, (st)) +#define sk_X509_ATTRIBUTE_push(st, val) SKM_sk_push(X509_ATTRIBUTE, (st), (val)) +#define 
sk_X509_ATTRIBUTE_unshift(st, val) SKM_sk_unshift(X509_ATTRIBUTE, (st), (val)) +#define sk_X509_ATTRIBUTE_find(st, val) SKM_sk_find(X509_ATTRIBUTE, (st), (val)) +#define sk_X509_ATTRIBUTE_find_ex(st, val) SKM_sk_find_ex(X509_ATTRIBUTE, (st), (val)) +#define sk_X509_ATTRIBUTE_delete(st, i) SKM_sk_delete(X509_ATTRIBUTE, (st), (i)) +#define sk_X509_ATTRIBUTE_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_ATTRIBUTE, (st), (ptr)) +#define sk_X509_ATTRIBUTE_insert(st, val, i) SKM_sk_insert(X509_ATTRIBUTE, (st), (val), (i)) +#define sk_X509_ATTRIBUTE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_ATTRIBUTE, (st), (cmp)) +#define sk_X509_ATTRIBUTE_dup(st) SKM_sk_dup(X509_ATTRIBUTE, st) +#define sk_X509_ATTRIBUTE_pop_free(st, free_func) SKM_sk_pop_free(X509_ATTRIBUTE, (st), (free_func)) +#define sk_X509_ATTRIBUTE_shift(st) SKM_sk_shift(X509_ATTRIBUTE, (st)) +#define sk_X509_ATTRIBUTE_pop(st) SKM_sk_pop(X509_ATTRIBUTE, (st)) +#define sk_X509_ATTRIBUTE_sort(st) SKM_sk_sort(X509_ATTRIBUTE, (st)) +#define sk_X509_ATTRIBUTE_is_sorted(st) SKM_sk_is_sorted(X509_ATTRIBUTE, (st)) + +#define sk_X509_CRL_new(cmp) SKM_sk_new(X509_CRL, (cmp)) +#define sk_X509_CRL_new_null() SKM_sk_new_null(X509_CRL) +#define sk_X509_CRL_free(st) SKM_sk_free(X509_CRL, (st)) +#define sk_X509_CRL_num(st) SKM_sk_num(X509_CRL, (st)) +#define sk_X509_CRL_value(st, i) SKM_sk_value(X509_CRL, (st), (i)) +#define sk_X509_CRL_set(st, i, val) SKM_sk_set(X509_CRL, (st), (i), (val)) +#define sk_X509_CRL_zero(st) SKM_sk_zero(X509_CRL, (st)) +#define sk_X509_CRL_push(st, val) SKM_sk_push(X509_CRL, (st), (val)) +#define sk_X509_CRL_unshift(st, val) SKM_sk_unshift(X509_CRL, (st), (val)) +#define sk_X509_CRL_find(st, val) SKM_sk_find(X509_CRL, (st), (val)) +#define sk_X509_CRL_find_ex(st, val) SKM_sk_find_ex(X509_CRL, (st), (val)) +#define sk_X509_CRL_delete(st, i) SKM_sk_delete(X509_CRL, (st), (i)) +#define sk_X509_CRL_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_CRL, (st), (ptr)) +#define sk_X509_CRL_insert(st, val, i) SKM_sk_insert(X509_CRL, (st), (val), (i)) +#define sk_X509_CRL_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_CRL, (st), (cmp)) +#define sk_X509_CRL_dup(st) SKM_sk_dup(X509_CRL, st) +#define sk_X509_CRL_pop_free(st, free_func) SKM_sk_pop_free(X509_CRL, (st), (free_func)) +#define sk_X509_CRL_shift(st) SKM_sk_shift(X509_CRL, (st)) +#define sk_X509_CRL_pop(st) SKM_sk_pop(X509_CRL, (st)) +#define sk_X509_CRL_sort(st) SKM_sk_sort(X509_CRL, (st)) +#define sk_X509_CRL_is_sorted(st) SKM_sk_is_sorted(X509_CRL, (st)) + +#define sk_X509_EXTENSION_new(cmp) SKM_sk_new(X509_EXTENSION, (cmp)) +#define sk_X509_EXTENSION_new_null() SKM_sk_new_null(X509_EXTENSION) +#define sk_X509_EXTENSION_free(st) SKM_sk_free(X509_EXTENSION, (st)) +#define sk_X509_EXTENSION_num(st) SKM_sk_num(X509_EXTENSION, (st)) +#define sk_X509_EXTENSION_value(st, i) SKM_sk_value(X509_EXTENSION, (st), (i)) +#define sk_X509_EXTENSION_set(st, i, val) SKM_sk_set(X509_EXTENSION, (st), (i), (val)) +#define sk_X509_EXTENSION_zero(st) SKM_sk_zero(X509_EXTENSION, (st)) +#define sk_X509_EXTENSION_push(st, val) SKM_sk_push(X509_EXTENSION, (st), (val)) +#define sk_X509_EXTENSION_unshift(st, val) SKM_sk_unshift(X509_EXTENSION, (st), (val)) +#define sk_X509_EXTENSION_find(st, val) SKM_sk_find(X509_EXTENSION, (st), (val)) +#define sk_X509_EXTENSION_find_ex(st, val) SKM_sk_find_ex(X509_EXTENSION, (st), (val)) +#define sk_X509_EXTENSION_delete(st, i) SKM_sk_delete(X509_EXTENSION, (st), (i)) +#define sk_X509_EXTENSION_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_EXTENSION, (st), (ptr)) +#define 
sk_X509_EXTENSION_insert(st, val, i) SKM_sk_insert(X509_EXTENSION, (st), (val), (i)) +#define sk_X509_EXTENSION_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_EXTENSION, (st), (cmp)) +#define sk_X509_EXTENSION_dup(st) SKM_sk_dup(X509_EXTENSION, st) +#define sk_X509_EXTENSION_pop_free(st, free_func) SKM_sk_pop_free(X509_EXTENSION, (st), (free_func)) +#define sk_X509_EXTENSION_shift(st) SKM_sk_shift(X509_EXTENSION, (st)) +#define sk_X509_EXTENSION_pop(st) SKM_sk_pop(X509_EXTENSION, (st)) +#define sk_X509_EXTENSION_sort(st) SKM_sk_sort(X509_EXTENSION, (st)) +#define sk_X509_EXTENSION_is_sorted(st) SKM_sk_is_sorted(X509_EXTENSION, (st)) + +#define sk_X509_INFO_new(cmp) SKM_sk_new(X509_INFO, (cmp)) +#define sk_X509_INFO_new_null() SKM_sk_new_null(X509_INFO) +#define sk_X509_INFO_free(st) SKM_sk_free(X509_INFO, (st)) +#define sk_X509_INFO_num(st) SKM_sk_num(X509_INFO, (st)) +#define sk_X509_INFO_value(st, i) SKM_sk_value(X509_INFO, (st), (i)) +#define sk_X509_INFO_set(st, i, val) SKM_sk_set(X509_INFO, (st), (i), (val)) +#define sk_X509_INFO_zero(st) SKM_sk_zero(X509_INFO, (st)) +#define sk_X509_INFO_push(st, val) SKM_sk_push(X509_INFO, (st), (val)) +#define sk_X509_INFO_unshift(st, val) SKM_sk_unshift(X509_INFO, (st), (val)) +#define sk_X509_INFO_find(st, val) SKM_sk_find(X509_INFO, (st), (val)) +#define sk_X509_INFO_find_ex(st, val) SKM_sk_find_ex(X509_INFO, (st), (val)) +#define sk_X509_INFO_delete(st, i) SKM_sk_delete(X509_INFO, (st), (i)) +#define sk_X509_INFO_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_INFO, (st), (ptr)) +#define sk_X509_INFO_insert(st, val, i) SKM_sk_insert(X509_INFO, (st), (val), (i)) +#define sk_X509_INFO_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_INFO, (st), (cmp)) +#define sk_X509_INFO_dup(st) SKM_sk_dup(X509_INFO, st) +#define sk_X509_INFO_pop_free(st, free_func) SKM_sk_pop_free(X509_INFO, (st), (free_func)) +#define sk_X509_INFO_shift(st) SKM_sk_shift(X509_INFO, (st)) +#define sk_X509_INFO_pop(st) SKM_sk_pop(X509_INFO, (st)) +#define sk_X509_INFO_sort(st) SKM_sk_sort(X509_INFO, (st)) +#define sk_X509_INFO_is_sorted(st) SKM_sk_is_sorted(X509_INFO, (st)) + +#define sk_X509_LOOKUP_new(cmp) SKM_sk_new(X509_LOOKUP, (cmp)) +#define sk_X509_LOOKUP_new_null() SKM_sk_new_null(X509_LOOKUP) +#define sk_X509_LOOKUP_free(st) SKM_sk_free(X509_LOOKUP, (st)) +#define sk_X509_LOOKUP_num(st) SKM_sk_num(X509_LOOKUP, (st)) +#define sk_X509_LOOKUP_value(st, i) SKM_sk_value(X509_LOOKUP, (st), (i)) +#define sk_X509_LOOKUP_set(st, i, val) SKM_sk_set(X509_LOOKUP, (st), (i), (val)) +#define sk_X509_LOOKUP_zero(st) SKM_sk_zero(X509_LOOKUP, (st)) +#define sk_X509_LOOKUP_push(st, val) SKM_sk_push(X509_LOOKUP, (st), (val)) +#define sk_X509_LOOKUP_unshift(st, val) SKM_sk_unshift(X509_LOOKUP, (st), (val)) +#define sk_X509_LOOKUP_find(st, val) SKM_sk_find(X509_LOOKUP, (st), (val)) +#define sk_X509_LOOKUP_find_ex(st, val) SKM_sk_find_ex(X509_LOOKUP, (st), (val)) +#define sk_X509_LOOKUP_delete(st, i) SKM_sk_delete(X509_LOOKUP, (st), (i)) +#define sk_X509_LOOKUP_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_LOOKUP, (st), (ptr)) +#define sk_X509_LOOKUP_insert(st, val, i) SKM_sk_insert(X509_LOOKUP, (st), (val), (i)) +#define sk_X509_LOOKUP_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_LOOKUP, (st), (cmp)) +#define sk_X509_LOOKUP_dup(st) SKM_sk_dup(X509_LOOKUP, st) +#define sk_X509_LOOKUP_pop_free(st, free_func) SKM_sk_pop_free(X509_LOOKUP, (st), (free_func)) +#define sk_X509_LOOKUP_shift(st) SKM_sk_shift(X509_LOOKUP, (st)) +#define sk_X509_LOOKUP_pop(st) SKM_sk_pop(X509_LOOKUP, (st)) +#define 
sk_X509_LOOKUP_sort(st) SKM_sk_sort(X509_LOOKUP, (st)) +#define sk_X509_LOOKUP_is_sorted(st) SKM_sk_is_sorted(X509_LOOKUP, (st)) + +#define sk_X509_NAME_new(cmp) SKM_sk_new(X509_NAME, (cmp)) +#define sk_X509_NAME_new_null() SKM_sk_new_null(X509_NAME) +#define sk_X509_NAME_free(st) SKM_sk_free(X509_NAME, (st)) +#define sk_X509_NAME_num(st) SKM_sk_num(X509_NAME, (st)) +#define sk_X509_NAME_value(st, i) SKM_sk_value(X509_NAME, (st), (i)) +#define sk_X509_NAME_set(st, i, val) SKM_sk_set(X509_NAME, (st), (i), (val)) +#define sk_X509_NAME_zero(st) SKM_sk_zero(X509_NAME, (st)) +#define sk_X509_NAME_push(st, val) SKM_sk_push(X509_NAME, (st), (val)) +#define sk_X509_NAME_unshift(st, val) SKM_sk_unshift(X509_NAME, (st), (val)) +#define sk_X509_NAME_find(st, val) SKM_sk_find(X509_NAME, (st), (val)) +#define sk_X509_NAME_find_ex(st, val) SKM_sk_find_ex(X509_NAME, (st), (val)) +#define sk_X509_NAME_delete(st, i) SKM_sk_delete(X509_NAME, (st), (i)) +#define sk_X509_NAME_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_NAME, (st), (ptr)) +#define sk_X509_NAME_insert(st, val, i) SKM_sk_insert(X509_NAME, (st), (val), (i)) +#define sk_X509_NAME_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_NAME, (st), (cmp)) +#define sk_X509_NAME_dup(st) SKM_sk_dup(X509_NAME, st) +#define sk_X509_NAME_pop_free(st, free_func) SKM_sk_pop_free(X509_NAME, (st), (free_func)) +#define sk_X509_NAME_shift(st) SKM_sk_shift(X509_NAME, (st)) +#define sk_X509_NAME_pop(st) SKM_sk_pop(X509_NAME, (st)) +#define sk_X509_NAME_sort(st) SKM_sk_sort(X509_NAME, (st)) +#define sk_X509_NAME_is_sorted(st) SKM_sk_is_sorted(X509_NAME, (st)) + +#define sk_X509_NAME_ENTRY_new(cmp) SKM_sk_new(X509_NAME_ENTRY, (cmp)) +#define sk_X509_NAME_ENTRY_new_null() SKM_sk_new_null(X509_NAME_ENTRY) +#define sk_X509_NAME_ENTRY_free(st) SKM_sk_free(X509_NAME_ENTRY, (st)) +#define sk_X509_NAME_ENTRY_num(st) SKM_sk_num(X509_NAME_ENTRY, (st)) +#define sk_X509_NAME_ENTRY_value(st, i) SKM_sk_value(X509_NAME_ENTRY, (st), (i)) +#define sk_X509_NAME_ENTRY_set(st, i, val) SKM_sk_set(X509_NAME_ENTRY, (st), (i), (val)) +#define sk_X509_NAME_ENTRY_zero(st) SKM_sk_zero(X509_NAME_ENTRY, (st)) +#define sk_X509_NAME_ENTRY_push(st, val) SKM_sk_push(X509_NAME_ENTRY, (st), (val)) +#define sk_X509_NAME_ENTRY_unshift(st, val) SKM_sk_unshift(X509_NAME_ENTRY, (st), (val)) +#define sk_X509_NAME_ENTRY_find(st, val) SKM_sk_find(X509_NAME_ENTRY, (st), (val)) +#define sk_X509_NAME_ENTRY_find_ex(st, val) SKM_sk_find_ex(X509_NAME_ENTRY, (st), (val)) +#define sk_X509_NAME_ENTRY_delete(st, i) SKM_sk_delete(X509_NAME_ENTRY, (st), (i)) +#define sk_X509_NAME_ENTRY_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_NAME_ENTRY, (st), (ptr)) +#define sk_X509_NAME_ENTRY_insert(st, val, i) SKM_sk_insert(X509_NAME_ENTRY, (st), (val), (i)) +#define sk_X509_NAME_ENTRY_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_NAME_ENTRY, (st), (cmp)) +#define sk_X509_NAME_ENTRY_dup(st) SKM_sk_dup(X509_NAME_ENTRY, st) +#define sk_X509_NAME_ENTRY_pop_free(st, free_func) SKM_sk_pop_free(X509_NAME_ENTRY, (st), (free_func)) +#define sk_X509_NAME_ENTRY_shift(st) SKM_sk_shift(X509_NAME_ENTRY, (st)) +#define sk_X509_NAME_ENTRY_pop(st) SKM_sk_pop(X509_NAME_ENTRY, (st)) +#define sk_X509_NAME_ENTRY_sort(st) SKM_sk_sort(X509_NAME_ENTRY, (st)) +#define sk_X509_NAME_ENTRY_is_sorted(st) SKM_sk_is_sorted(X509_NAME_ENTRY, (st)) + +#define sk_X509_OBJECT_new(cmp) SKM_sk_new(X509_OBJECT, (cmp)) +#define sk_X509_OBJECT_new_null() SKM_sk_new_null(X509_OBJECT) +#define sk_X509_OBJECT_free(st) SKM_sk_free(X509_OBJECT, (st)) +#define sk_X509_OBJECT_num(st) 
SKM_sk_num(X509_OBJECT, (st)) +#define sk_X509_OBJECT_value(st, i) SKM_sk_value(X509_OBJECT, (st), (i)) +#define sk_X509_OBJECT_set(st, i, val) SKM_sk_set(X509_OBJECT, (st), (i), (val)) +#define sk_X509_OBJECT_zero(st) SKM_sk_zero(X509_OBJECT, (st)) +#define sk_X509_OBJECT_push(st, val) SKM_sk_push(X509_OBJECT, (st), (val)) +#define sk_X509_OBJECT_unshift(st, val) SKM_sk_unshift(X509_OBJECT, (st), (val)) +#define sk_X509_OBJECT_find(st, val) SKM_sk_find(X509_OBJECT, (st), (val)) +#define sk_X509_OBJECT_find_ex(st, val) SKM_sk_find_ex(X509_OBJECT, (st), (val)) +#define sk_X509_OBJECT_delete(st, i) SKM_sk_delete(X509_OBJECT, (st), (i)) +#define sk_X509_OBJECT_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_OBJECT, (st), (ptr)) +#define sk_X509_OBJECT_insert(st, val, i) SKM_sk_insert(X509_OBJECT, (st), (val), (i)) +#define sk_X509_OBJECT_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_OBJECT, (st), (cmp)) +#define sk_X509_OBJECT_dup(st) SKM_sk_dup(X509_OBJECT, st) +#define sk_X509_OBJECT_pop_free(st, free_func) SKM_sk_pop_free(X509_OBJECT, (st), (free_func)) +#define sk_X509_OBJECT_shift(st) SKM_sk_shift(X509_OBJECT, (st)) +#define sk_X509_OBJECT_pop(st) SKM_sk_pop(X509_OBJECT, (st)) +#define sk_X509_OBJECT_sort(st) SKM_sk_sort(X509_OBJECT, (st)) +#define sk_X509_OBJECT_is_sorted(st) SKM_sk_is_sorted(X509_OBJECT, (st)) + +#define sk_X509_POLICY_DATA_new(cmp) SKM_sk_new(X509_POLICY_DATA, (cmp)) +#define sk_X509_POLICY_DATA_new_null() SKM_sk_new_null(X509_POLICY_DATA) +#define sk_X509_POLICY_DATA_free(st) SKM_sk_free(X509_POLICY_DATA, (st)) +#define sk_X509_POLICY_DATA_num(st) SKM_sk_num(X509_POLICY_DATA, (st)) +#define sk_X509_POLICY_DATA_value(st, i) SKM_sk_value(X509_POLICY_DATA, (st), (i)) +#define sk_X509_POLICY_DATA_set(st, i, val) SKM_sk_set(X509_POLICY_DATA, (st), (i), (val)) +#define sk_X509_POLICY_DATA_zero(st) SKM_sk_zero(X509_POLICY_DATA, (st)) +#define sk_X509_POLICY_DATA_push(st, val) SKM_sk_push(X509_POLICY_DATA, (st), (val)) +#define sk_X509_POLICY_DATA_unshift(st, val) SKM_sk_unshift(X509_POLICY_DATA, (st), (val)) +#define sk_X509_POLICY_DATA_find(st, val) SKM_sk_find(X509_POLICY_DATA, (st), (val)) +#define sk_X509_POLICY_DATA_find_ex(st, val) SKM_sk_find_ex(X509_POLICY_DATA, (st), (val)) +#define sk_X509_POLICY_DATA_delete(st, i) SKM_sk_delete(X509_POLICY_DATA, (st), (i)) +#define sk_X509_POLICY_DATA_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_POLICY_DATA, (st), (ptr)) +#define sk_X509_POLICY_DATA_insert(st, val, i) SKM_sk_insert(X509_POLICY_DATA, (st), (val), (i)) +#define sk_X509_POLICY_DATA_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_POLICY_DATA, (st), (cmp)) +#define sk_X509_POLICY_DATA_dup(st) SKM_sk_dup(X509_POLICY_DATA, st) +#define sk_X509_POLICY_DATA_pop_free(st, free_func) SKM_sk_pop_free(X509_POLICY_DATA, (st), (free_func)) +#define sk_X509_POLICY_DATA_shift(st) SKM_sk_shift(X509_POLICY_DATA, (st)) +#define sk_X509_POLICY_DATA_pop(st) SKM_sk_pop(X509_POLICY_DATA, (st)) +#define sk_X509_POLICY_DATA_sort(st) SKM_sk_sort(X509_POLICY_DATA, (st)) +#define sk_X509_POLICY_DATA_is_sorted(st) SKM_sk_is_sorted(X509_POLICY_DATA, (st)) + +#define sk_X509_POLICY_NODE_new(cmp) SKM_sk_new(X509_POLICY_NODE, (cmp)) +#define sk_X509_POLICY_NODE_new_null() SKM_sk_new_null(X509_POLICY_NODE) +#define sk_X509_POLICY_NODE_free(st) SKM_sk_free(X509_POLICY_NODE, (st)) +#define sk_X509_POLICY_NODE_num(st) SKM_sk_num(X509_POLICY_NODE, (st)) +#define sk_X509_POLICY_NODE_value(st, i) SKM_sk_value(X509_POLICY_NODE, (st), (i)) +#define sk_X509_POLICY_NODE_set(st, i, val) SKM_sk_set(X509_POLICY_NODE, 
(st), (i), (val)) +#define sk_X509_POLICY_NODE_zero(st) SKM_sk_zero(X509_POLICY_NODE, (st)) +#define sk_X509_POLICY_NODE_push(st, val) SKM_sk_push(X509_POLICY_NODE, (st), (val)) +#define sk_X509_POLICY_NODE_unshift(st, val) SKM_sk_unshift(X509_POLICY_NODE, (st), (val)) +#define sk_X509_POLICY_NODE_find(st, val) SKM_sk_find(X509_POLICY_NODE, (st), (val)) +#define sk_X509_POLICY_NODE_find_ex(st, val) SKM_sk_find_ex(X509_POLICY_NODE, (st), (val)) +#define sk_X509_POLICY_NODE_delete(st, i) SKM_sk_delete(X509_POLICY_NODE, (st), (i)) +#define sk_X509_POLICY_NODE_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_POLICY_NODE, (st), (ptr)) +#define sk_X509_POLICY_NODE_insert(st, val, i) SKM_sk_insert(X509_POLICY_NODE, (st), (val), (i)) +#define sk_X509_POLICY_NODE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_POLICY_NODE, (st), (cmp)) +#define sk_X509_POLICY_NODE_dup(st) SKM_sk_dup(X509_POLICY_NODE, st) +#define sk_X509_POLICY_NODE_pop_free(st, free_func) SKM_sk_pop_free(X509_POLICY_NODE, (st), (free_func)) +#define sk_X509_POLICY_NODE_shift(st) SKM_sk_shift(X509_POLICY_NODE, (st)) +#define sk_X509_POLICY_NODE_pop(st) SKM_sk_pop(X509_POLICY_NODE, (st)) +#define sk_X509_POLICY_NODE_sort(st) SKM_sk_sort(X509_POLICY_NODE, (st)) +#define sk_X509_POLICY_NODE_is_sorted(st) SKM_sk_is_sorted(X509_POLICY_NODE, (st)) + +#define sk_X509_PURPOSE_new(cmp) SKM_sk_new(X509_PURPOSE, (cmp)) +#define sk_X509_PURPOSE_new_null() SKM_sk_new_null(X509_PURPOSE) +#define sk_X509_PURPOSE_free(st) SKM_sk_free(X509_PURPOSE, (st)) +#define sk_X509_PURPOSE_num(st) SKM_sk_num(X509_PURPOSE, (st)) +#define sk_X509_PURPOSE_value(st, i) SKM_sk_value(X509_PURPOSE, (st), (i)) +#define sk_X509_PURPOSE_set(st, i, val) SKM_sk_set(X509_PURPOSE, (st), (i), (val)) +#define sk_X509_PURPOSE_zero(st) SKM_sk_zero(X509_PURPOSE, (st)) +#define sk_X509_PURPOSE_push(st, val) SKM_sk_push(X509_PURPOSE, (st), (val)) +#define sk_X509_PURPOSE_unshift(st, val) SKM_sk_unshift(X509_PURPOSE, (st), (val)) +#define sk_X509_PURPOSE_find(st, val) SKM_sk_find(X509_PURPOSE, (st), (val)) +#define sk_X509_PURPOSE_find_ex(st, val) SKM_sk_find_ex(X509_PURPOSE, (st), (val)) +#define sk_X509_PURPOSE_delete(st, i) SKM_sk_delete(X509_PURPOSE, (st), (i)) +#define sk_X509_PURPOSE_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_PURPOSE, (st), (ptr)) +#define sk_X509_PURPOSE_insert(st, val, i) SKM_sk_insert(X509_PURPOSE, (st), (val), (i)) +#define sk_X509_PURPOSE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_PURPOSE, (st), (cmp)) +#define sk_X509_PURPOSE_dup(st) SKM_sk_dup(X509_PURPOSE, st) +#define sk_X509_PURPOSE_pop_free(st, free_func) SKM_sk_pop_free(X509_PURPOSE, (st), (free_func)) +#define sk_X509_PURPOSE_shift(st) SKM_sk_shift(X509_PURPOSE, (st)) +#define sk_X509_PURPOSE_pop(st) SKM_sk_pop(X509_PURPOSE, (st)) +#define sk_X509_PURPOSE_sort(st) SKM_sk_sort(X509_PURPOSE, (st)) +#define sk_X509_PURPOSE_is_sorted(st) SKM_sk_is_sorted(X509_PURPOSE, (st)) + +#define sk_X509_REVOKED_new(cmp) SKM_sk_new(X509_REVOKED, (cmp)) +#define sk_X509_REVOKED_new_null() SKM_sk_new_null(X509_REVOKED) +#define sk_X509_REVOKED_free(st) SKM_sk_free(X509_REVOKED, (st)) +#define sk_X509_REVOKED_num(st) SKM_sk_num(X509_REVOKED, (st)) +#define sk_X509_REVOKED_value(st, i) SKM_sk_value(X509_REVOKED, (st), (i)) +#define sk_X509_REVOKED_set(st, i, val) SKM_sk_set(X509_REVOKED, (st), (i), (val)) +#define sk_X509_REVOKED_zero(st) SKM_sk_zero(X509_REVOKED, (st)) +#define sk_X509_REVOKED_push(st, val) SKM_sk_push(X509_REVOKED, (st), (val)) +#define sk_X509_REVOKED_unshift(st, val) SKM_sk_unshift(X509_REVOKED, 
(st), (val)) +#define sk_X509_REVOKED_find(st, val) SKM_sk_find(X509_REVOKED, (st), (val)) +#define sk_X509_REVOKED_find_ex(st, val) SKM_sk_find_ex(X509_REVOKED, (st), (val)) +#define sk_X509_REVOKED_delete(st, i) SKM_sk_delete(X509_REVOKED, (st), (i)) +#define sk_X509_REVOKED_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_REVOKED, (st), (ptr)) +#define sk_X509_REVOKED_insert(st, val, i) SKM_sk_insert(X509_REVOKED, (st), (val), (i)) +#define sk_X509_REVOKED_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_REVOKED, (st), (cmp)) +#define sk_X509_REVOKED_dup(st) SKM_sk_dup(X509_REVOKED, st) +#define sk_X509_REVOKED_pop_free(st, free_func) SKM_sk_pop_free(X509_REVOKED, (st), (free_func)) +#define sk_X509_REVOKED_shift(st) SKM_sk_shift(X509_REVOKED, (st)) +#define sk_X509_REVOKED_pop(st) SKM_sk_pop(X509_REVOKED, (st)) +#define sk_X509_REVOKED_sort(st) SKM_sk_sort(X509_REVOKED, (st)) +#define sk_X509_REVOKED_is_sorted(st) SKM_sk_is_sorted(X509_REVOKED, (st)) + +#define sk_X509_TRUST_new(cmp) SKM_sk_new(X509_TRUST, (cmp)) +#define sk_X509_TRUST_new_null() SKM_sk_new_null(X509_TRUST) +#define sk_X509_TRUST_free(st) SKM_sk_free(X509_TRUST, (st)) +#define sk_X509_TRUST_num(st) SKM_sk_num(X509_TRUST, (st)) +#define sk_X509_TRUST_value(st, i) SKM_sk_value(X509_TRUST, (st), (i)) +#define sk_X509_TRUST_set(st, i, val) SKM_sk_set(X509_TRUST, (st), (i), (val)) +#define sk_X509_TRUST_zero(st) SKM_sk_zero(X509_TRUST, (st)) +#define sk_X509_TRUST_push(st, val) SKM_sk_push(X509_TRUST, (st), (val)) +#define sk_X509_TRUST_unshift(st, val) SKM_sk_unshift(X509_TRUST, (st), (val)) +#define sk_X509_TRUST_find(st, val) SKM_sk_find(X509_TRUST, (st), (val)) +#define sk_X509_TRUST_find_ex(st, val) SKM_sk_find_ex(X509_TRUST, (st), (val)) +#define sk_X509_TRUST_delete(st, i) SKM_sk_delete(X509_TRUST, (st), (i)) +#define sk_X509_TRUST_delete_ptr(st, ptr) SKM_sk_delete_ptr(X509_TRUST, (st), (ptr)) +#define sk_X509_TRUST_insert(st, val, i) SKM_sk_insert(X509_TRUST, (st), (val), (i)) +#define sk_X509_TRUST_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_TRUST, (st), (cmp)) +#define sk_X509_TRUST_dup(st) SKM_sk_dup(X509_TRUST, st) +#define sk_X509_TRUST_pop_free(st, free_func) SKM_sk_pop_free(X509_TRUST, (st), (free_func)) +#define sk_X509_TRUST_shift(st) SKM_sk_shift(X509_TRUST, (st)) +#define sk_X509_TRUST_pop(st) SKM_sk_pop(X509_TRUST, (st)) +#define sk_X509_TRUST_sort(st) SKM_sk_sort(X509_TRUST, (st)) +#define sk_X509_TRUST_is_sorted(st) SKM_sk_is_sorted(X509_TRUST, (st)) + +#define sk_X509_VERIFY_PARAM_new(cmp) SKM_sk_new(X509_VERIFY_PARAM, (cmp)) +#define sk_X509_VERIFY_PARAM_new_null() SKM_sk_new_null(X509_VERIFY_PARAM) +#define sk_X509_VERIFY_PARAM_free(st) SKM_sk_free(X509_VERIFY_PARAM, (st)) +#define sk_X509_VERIFY_PARAM_num(st) SKM_sk_num(X509_VERIFY_PARAM, (st)) +#define sk_X509_VERIFY_PARAM_value(st, i) SKM_sk_value(X509_VERIFY_PARAM, (st), (i)) +#define sk_X509_VERIFY_PARAM_set(st, i, val) SKM_sk_set(X509_VERIFY_PARAM, (st), (i), (val)) +#define sk_X509_VERIFY_PARAM_zero(st) SKM_sk_zero(X509_VERIFY_PARAM, (st)) +#define sk_X509_VERIFY_PARAM_push(st, val) SKM_sk_push(X509_VERIFY_PARAM, (st), (val)) +#define sk_X509_VERIFY_PARAM_unshift(st, val) SKM_sk_unshift(X509_VERIFY_PARAM, (st), (val)) +#define sk_X509_VERIFY_PARAM_find(st, val) SKM_sk_find(X509_VERIFY_PARAM, (st), (val)) +#define sk_X509_VERIFY_PARAM_find_ex(st, val) SKM_sk_find_ex(X509_VERIFY_PARAM, (st), (val)) +#define sk_X509_VERIFY_PARAM_delete(st, i) SKM_sk_delete(X509_VERIFY_PARAM, (st), (i)) +#define sk_X509_VERIFY_PARAM_delete_ptr(st, ptr) 
SKM_sk_delete_ptr(X509_VERIFY_PARAM, (st), (ptr)) +#define sk_X509_VERIFY_PARAM_insert(st, val, i) SKM_sk_insert(X509_VERIFY_PARAM, (st), (val), (i)) +#define sk_X509_VERIFY_PARAM_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(X509_VERIFY_PARAM, (st), (cmp)) +#define sk_X509_VERIFY_PARAM_dup(st) SKM_sk_dup(X509_VERIFY_PARAM, st) +#define sk_X509_VERIFY_PARAM_pop_free(st, free_func) SKM_sk_pop_free(X509_VERIFY_PARAM, (st), (free_func)) +#define sk_X509_VERIFY_PARAM_shift(st) SKM_sk_shift(X509_VERIFY_PARAM, (st)) +#define sk_X509_VERIFY_PARAM_pop(st) SKM_sk_pop(X509_VERIFY_PARAM, (st)) +#define sk_X509_VERIFY_PARAM_sort(st) SKM_sk_sort(X509_VERIFY_PARAM, (st)) +#define sk_X509_VERIFY_PARAM_is_sorted(st) SKM_sk_is_sorted(X509_VERIFY_PARAM, (st)) + +#define sk_nid_triple_new(cmp) SKM_sk_new(nid_triple, (cmp)) +#define sk_nid_triple_new_null() SKM_sk_new_null(nid_triple) +#define sk_nid_triple_free(st) SKM_sk_free(nid_triple, (st)) +#define sk_nid_triple_num(st) SKM_sk_num(nid_triple, (st)) +#define sk_nid_triple_value(st, i) SKM_sk_value(nid_triple, (st), (i)) +#define sk_nid_triple_set(st, i, val) SKM_sk_set(nid_triple, (st), (i), (val)) +#define sk_nid_triple_zero(st) SKM_sk_zero(nid_triple, (st)) +#define sk_nid_triple_push(st, val) SKM_sk_push(nid_triple, (st), (val)) +#define sk_nid_triple_unshift(st, val) SKM_sk_unshift(nid_triple, (st), (val)) +#define sk_nid_triple_find(st, val) SKM_sk_find(nid_triple, (st), (val)) +#define sk_nid_triple_find_ex(st, val) SKM_sk_find_ex(nid_triple, (st), (val)) +#define sk_nid_triple_delete(st, i) SKM_sk_delete(nid_triple, (st), (i)) +#define sk_nid_triple_delete_ptr(st, ptr) SKM_sk_delete_ptr(nid_triple, (st), (ptr)) +#define sk_nid_triple_insert(st, val, i) SKM_sk_insert(nid_triple, (st), (val), (i)) +#define sk_nid_triple_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(nid_triple, (st), (cmp)) +#define sk_nid_triple_dup(st) SKM_sk_dup(nid_triple, st) +#define sk_nid_triple_pop_free(st, free_func) SKM_sk_pop_free(nid_triple, (st), (free_func)) +#define sk_nid_triple_shift(st) SKM_sk_shift(nid_triple, (st)) +#define sk_nid_triple_pop(st) SKM_sk_pop(nid_triple, (st)) +#define sk_nid_triple_sort(st) SKM_sk_sort(nid_triple, (st)) +#define sk_nid_triple_is_sorted(st) SKM_sk_is_sorted(nid_triple, (st)) + +#define sk_void_new(cmp) SKM_sk_new(void, (cmp)) +#define sk_void_new_null() SKM_sk_new_null(void) +#define sk_void_free(st) SKM_sk_free(void, (st)) +#define sk_void_num(st) SKM_sk_num(void, (st)) +#define sk_void_value(st, i) SKM_sk_value(void, (st), (i)) +#define sk_void_set(st, i, val) SKM_sk_set(void, (st), (i), (val)) +#define sk_void_zero(st) SKM_sk_zero(void, (st)) +#define sk_void_push(st, val) SKM_sk_push(void, (st), (val)) +#define sk_void_unshift(st, val) SKM_sk_unshift(void, (st), (val)) +#define sk_void_find(st, val) SKM_sk_find(void, (st), (val)) +#define sk_void_find_ex(st, val) SKM_sk_find_ex(void, (st), (val)) +#define sk_void_delete(st, i) SKM_sk_delete(void, (st), (i)) +#define sk_void_delete_ptr(st, ptr) SKM_sk_delete_ptr(void, (st), (ptr)) +#define sk_void_insert(st, val, i) SKM_sk_insert(void, (st), (val), (i)) +#define sk_void_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(void, (st), (cmp)) +#define sk_void_dup(st) SKM_sk_dup(void, st) +#define sk_void_pop_free(st, free_func) SKM_sk_pop_free(void, (st), (free_func)) +#define sk_void_shift(st) SKM_sk_shift(void, (st)) +#define sk_void_pop(st) SKM_sk_pop(void, (st)) +#define sk_void_sort(st) SKM_sk_sort(void, (st)) +#define sk_void_is_sorted(st) SKM_sk_is_sorted(void, (st)) + +#define 
sk_OPENSSL_STRING_new(cmp) ((STACK_OF(OPENSSL_STRING) *)sk_new(CHECKED_SK_CMP_FUNC(char, cmp))) +#define sk_OPENSSL_STRING_new_null() ((STACK_OF(OPENSSL_STRING) *)sk_new_null()) +#define sk_OPENSSL_STRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_PTR_OF(char, val)) +#define sk_OPENSSL_STRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_PTR_OF(char, val)) +#define sk_OPENSSL_STRING_value(st, i) ((OPENSSL_STRING)sk_value(CHECKED_STACK_OF(OPENSSL_STRING, st), i)) +#define sk_OPENSSL_STRING_num(st) SKM_sk_num(OPENSSL_STRING, st) +#define sk_OPENSSL_STRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_STRING, free_func)) +#define sk_OPENSSL_STRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_PTR_OF(char, val), i) +#define sk_OPENSSL_STRING_free(st) SKM_sk_free(OPENSSL_STRING, st) +#define sk_OPENSSL_STRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_STRING, st), i, CHECKED_PTR_OF(char, val)) +#define sk_OPENSSL_STRING_zero(st) SKM_sk_zero(OPENSSL_STRING, (st)) +#define sk_OPENSSL_STRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_PTR_OF(char, val)) +#define sk_OPENSSL_STRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_STRING), st), CHECKED_CONST_PTR_OF(char, val)) +#define sk_OPENSSL_STRING_delete(st, i) SKM_sk_delete(OPENSSL_STRING, (st), (i)) +#define sk_OPENSSL_STRING_delete_ptr(st, ptr) (OPENSSL_STRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_PTR_OF(char, ptr)) +#define sk_OPENSSL_STRING_set_cmp_func(st, cmp) \ + ((int (*)(const char * const *,const char * const *)) \ + sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_STRING, st), CHECKED_SK_CMP_FUNC(char, cmp))) +#define sk_OPENSSL_STRING_dup(st) SKM_sk_dup(OPENSSL_STRING, st) +#define sk_OPENSSL_STRING_shift(st) SKM_sk_shift(OPENSSL_STRING, (st)) +#define sk_OPENSSL_STRING_pop(st) (char *)sk_pop(CHECKED_STACK_OF(OPENSSL_STRING, st)) +#define sk_OPENSSL_STRING_sort(st) SKM_sk_sort(OPENSSL_STRING, (st)) +#define sk_OPENSSL_STRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_STRING, (st)) + + +#define sk_OPENSSL_BLOCK_new(cmp) ((STACK_OF(OPENSSL_BLOCK) *)sk_new(CHECKED_SK_CMP_FUNC(void, cmp))) +#define sk_OPENSSL_BLOCK_new_null() ((STACK_OF(OPENSSL_BLOCK) *)sk_new_null()) +#define sk_OPENSSL_BLOCK_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val)) +#define sk_OPENSSL_BLOCK_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val)) +#define sk_OPENSSL_BLOCK_value(st, i) ((OPENSSL_BLOCK)sk_value(CHECKED_STACK_OF(OPENSSL_BLOCK, st), i)) +#define sk_OPENSSL_BLOCK_num(st) SKM_sk_num(OPENSSL_BLOCK, st) +#define sk_OPENSSL_BLOCK_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_SK_FREE_FUNC2(OPENSSL_BLOCK, free_func)) +#define sk_OPENSSL_BLOCK_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val), i) +#define sk_OPENSSL_BLOCK_free(st) SKM_sk_free(OPENSSL_BLOCK, st) +#define sk_OPENSSL_BLOCK_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_BLOCK, st), i, CHECKED_PTR_OF(void, val)) +#define sk_OPENSSL_BLOCK_zero(st) SKM_sk_zero(OPENSSL_BLOCK, (st)) +#define sk_OPENSSL_BLOCK_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val)) +#define sk_OPENSSL_BLOCK_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_BLOCK), st), CHECKED_CONST_PTR_OF(void, val)) 
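The sk_OPENSSL_STRING_* block above is the one place in this generated list where the element type is a plain char *, so it is the easiest wrapper set to show in use. Below is a minimal sketch of calling code, assuming only the headers vendored in this same compat tree; the wrappers expand to the generic sk_* calls from stack.h through the CHECKED_* casts, so pushing anything other than a char * fails to compile. The function and the string values are invented for illustration.

#include <stdio.h>
#include <openssl/safestack.h>  /* STACK_OF(), OPENSSL_STRING and the typed wrappers */

static void list_names(void)
{
    int i;
    STACK_OF(OPENSSL_STRING) *names = sk_OPENSSL_STRING_new_null();

    if (names == NULL)
        return;

    /* String literals are char[] in C, so the type-checked push accepts them. */
    sk_OPENSSL_STRING_push(names, "server.example");
    sk_OPENSSL_STRING_push(names, "client.example");

    for (i = 0; i < sk_OPENSSL_STRING_num(names); i++)
        printf("%s\n", sk_OPENSSL_STRING_value(names, i));

    /* Frees only the stack itself; the literals are not owned by it. */
    sk_OPENSSL_STRING_free(names);
}

The same pattern applies to every sk_<TYPE>_* block in this header; only the element type changes.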
+#define sk_OPENSSL_BLOCK_delete(st, i) SKM_sk_delete(OPENSSL_BLOCK, (st), (i)) +#define sk_OPENSSL_BLOCK_delete_ptr(st, ptr) (OPENSSL_BLOCK *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, ptr)) +#define sk_OPENSSL_BLOCK_set_cmp_func(st, cmp) \ + ((int (*)(const void * const *,const void * const *)) \ + sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_SK_CMP_FUNC(void, cmp))) +#define sk_OPENSSL_BLOCK_dup(st) SKM_sk_dup(OPENSSL_BLOCK, st) +#define sk_OPENSSL_BLOCK_shift(st) SKM_sk_shift(OPENSSL_BLOCK, (st)) +#define sk_OPENSSL_BLOCK_pop(st) (void *)sk_pop(CHECKED_STACK_OF(OPENSSL_BLOCK, st)) +#define sk_OPENSSL_BLOCK_sort(st) SKM_sk_sort(OPENSSL_BLOCK, (st)) +#define sk_OPENSSL_BLOCK_is_sorted(st) SKM_sk_is_sorted(OPENSSL_BLOCK, (st)) + + +#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp))) +#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null()) +#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val)) +#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val)) +#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i)) +#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st) +#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func)) +#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i) +#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st) +#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val)) +#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st)) +#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val)) +#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val)) +#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i)) +#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr)) +#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp) \ + ((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \ + sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp))) +#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st) +#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st)) +#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st)) +#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st)) +#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st)) + + +#define d2i_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(ACCESS_DESCRIPTION, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(ACCESS_DESCRIPTION, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define 
ASN1_seq_pack_ACCESS_DESCRIPTION(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(ACCESS_DESCRIPTION, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_ACCESS_DESCRIPTION(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(ACCESS_DESCRIPTION, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_ASN1_INTEGER(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(ASN1_INTEGER, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_ASN1_INTEGER(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(ASN1_INTEGER, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_ASN1_INTEGER(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(ASN1_INTEGER, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_ASN1_INTEGER(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(ASN1_INTEGER, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_ASN1_OBJECT(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(ASN1_OBJECT, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_ASN1_OBJECT(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(ASN1_OBJECT, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_ASN1_OBJECT(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(ASN1_OBJECT, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_ASN1_OBJECT(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(ASN1_OBJECT, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_ASN1_TYPE(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(ASN1_TYPE, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_ASN1_TYPE(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(ASN1_TYPE, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_ASN1_TYPE(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(ASN1_TYPE, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_ASN1_TYPE(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(ASN1_TYPE, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_ASN1_UTF8STRING(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(ASN1_UTF8STRING, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_ASN1_UTF8STRING(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(ASN1_UTF8STRING, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_ASN1_UTF8STRING(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(ASN1_UTF8STRING, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_ASN1_UTF8STRING(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(ASN1_UTF8STRING, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_DIST_POINT(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(DIST_POINT, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_DIST_POINT(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(DIST_POINT, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_DIST_POINT(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(DIST_POINT, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_DIST_POINT(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(DIST_POINT, (buf), (len), (d2i_func), (free_func)) + +#define 
d2i_ASN1_SET_OF_ESS_CERT_ID(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(ESS_CERT_ID, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_ESS_CERT_ID(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(ESS_CERT_ID, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_ESS_CERT_ID(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(ESS_CERT_ID, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_ESS_CERT_ID(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(ESS_CERT_ID, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_EVP_MD(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(EVP_MD, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_EVP_MD(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(EVP_MD, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_EVP_MD(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(EVP_MD, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_EVP_MD(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(EVP_MD, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_GENERAL_NAME(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(GENERAL_NAME, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_GENERAL_NAME(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(GENERAL_NAME, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_GENERAL_NAME(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(GENERAL_NAME, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_GENERAL_NAME(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(GENERAL_NAME, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_OCSP_ONEREQ(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(OCSP_ONEREQ, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_OCSP_ONEREQ(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(OCSP_ONEREQ, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_OCSP_ONEREQ(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(OCSP_ONEREQ, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_OCSP_ONEREQ(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(OCSP_ONEREQ, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_OCSP_SINGLERESP(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(OCSP_SINGLERESP, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_OCSP_SINGLERESP(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(OCSP_SINGLERESP, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_OCSP_SINGLERESP(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(OCSP_SINGLERESP, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_OCSP_SINGLERESP(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(OCSP_SINGLERESP, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_PKCS12_SAFEBAG(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(PKCS12_SAFEBAG, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_PKCS12_SAFEBAG(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(PKCS12_SAFEBAG, (st), (pp), 
(i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_PKCS12_SAFEBAG(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(PKCS12_SAFEBAG, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_PKCS12_SAFEBAG(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(PKCS12_SAFEBAG, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_PKCS7(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(PKCS7, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_PKCS7(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(PKCS7, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_PKCS7(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(PKCS7, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_PKCS7(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(PKCS7, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_PKCS7_RECIP_INFO(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(PKCS7_RECIP_INFO, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_PKCS7_RECIP_INFO(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(PKCS7_RECIP_INFO, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_PKCS7_RECIP_INFO(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(PKCS7_RECIP_INFO, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_PKCS7_RECIP_INFO(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(PKCS7_RECIP_INFO, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_PKCS7_SIGNER_INFO(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(PKCS7_SIGNER_INFO, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_PKCS7_SIGNER_INFO(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(PKCS7_SIGNER_INFO, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_PKCS7_SIGNER_INFO(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(PKCS7_SIGNER_INFO, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_PKCS7_SIGNER_INFO(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(PKCS7_SIGNER_INFO, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_POLICYINFO(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(POLICYINFO, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_POLICYINFO(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(POLICYINFO, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_POLICYINFO(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(POLICYINFO, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_POLICYINFO(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(POLICYINFO, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_POLICYQUALINFO(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(POLICYQUALINFO, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_POLICYQUALINFO(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(POLICYQUALINFO, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_POLICYQUALINFO(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(POLICYQUALINFO, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_POLICYQUALINFO(buf, len, d2i_func, free_func) \ + 
SKM_ASN1_seq_unpack(POLICYQUALINFO, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_SXNETID(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(SXNETID, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_SXNETID(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(SXNETID, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_SXNETID(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(SXNETID, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_SXNETID(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(SXNETID, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_X509(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(X509, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_X509(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(X509, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_X509(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(X509, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_X509(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(X509, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_X509_ALGOR(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(X509_ALGOR, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_X509_ALGOR(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(X509_ALGOR, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_X509_ALGOR(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(X509_ALGOR, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_X509_ALGOR(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(X509_ALGOR, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_X509_ATTRIBUTE(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(X509_ATTRIBUTE, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_X509_ATTRIBUTE(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(X509_ATTRIBUTE, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_X509_ATTRIBUTE(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(X509_ATTRIBUTE, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_X509_ATTRIBUTE(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(X509_ATTRIBUTE, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_X509_CRL(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(X509_CRL, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_X509_CRL(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(X509_CRL, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_X509_CRL(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(X509_CRL, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_X509_CRL(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(X509_CRL, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_X509_EXTENSION(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(X509_EXTENSION, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_X509_EXTENSION(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(X509_EXTENSION, (st), (pp), (i2d_func), 
(ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_X509_EXTENSION(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(X509_EXTENSION, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_X509_EXTENSION(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(X509_EXTENSION, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_X509_NAME_ENTRY(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(X509_NAME_ENTRY, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_X509_NAME_ENTRY(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(X509_NAME_ENTRY, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_X509_NAME_ENTRY(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(X509_NAME_ENTRY, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_X509_NAME_ENTRY(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(X509_NAME_ENTRY, (buf), (len), (d2i_func), (free_func)) + +#define d2i_ASN1_SET_OF_X509_REVOKED(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ + SKM_ASN1_SET_OF_d2i(X509_REVOKED, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) +#define i2d_ASN1_SET_OF_X509_REVOKED(st, pp, i2d_func, ex_tag, ex_class, is_set) \ + SKM_ASN1_SET_OF_i2d(X509_REVOKED, (st), (pp), (i2d_func), (ex_tag), (ex_class), (is_set)) +#define ASN1_seq_pack_X509_REVOKED(st, i2d_func, buf, len) \ + SKM_ASN1_seq_pack(X509_REVOKED, (st), (i2d_func), (buf), (len)) +#define ASN1_seq_unpack_X509_REVOKED(buf, len, d2i_func, free_func) \ + SKM_ASN1_seq_unpack(X509_REVOKED, (buf), (len), (d2i_func), (free_func)) + +#define PKCS12_decrypt_d2i_PKCS12_SAFEBAG(algor, d2i_func, free_func, pass, passlen, oct, seq) \ + SKM_PKCS12_decrypt_d2i(PKCS12_SAFEBAG, (algor), (d2i_func), (free_func), (pass), (passlen), (oct), (seq)) + +#define PKCS12_decrypt_d2i_PKCS7(algor, d2i_func, free_func, pass, passlen, oct, seq) \ + SKM_PKCS12_decrypt_d2i(PKCS7, (algor), (d2i_func), (free_func), (pass), (passlen), (oct), (seq)) + +#define lh_ADDED_OBJ_new() LHM_lh_new(ADDED_OBJ,added_obj) +#define lh_ADDED_OBJ_insert(lh,inst) LHM_lh_insert(ADDED_OBJ,lh,inst) +#define lh_ADDED_OBJ_retrieve(lh,inst) LHM_lh_retrieve(ADDED_OBJ,lh,inst) +#define lh_ADDED_OBJ_delete(lh,inst) LHM_lh_delete(ADDED_OBJ,lh,inst) +#define lh_ADDED_OBJ_doall(lh,fn) LHM_lh_doall(ADDED_OBJ,lh,fn) +#define lh_ADDED_OBJ_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(ADDED_OBJ,lh,fn,arg_type,arg) +#define lh_ADDED_OBJ_error(lh) LHM_lh_error(ADDED_OBJ,lh) +#define lh_ADDED_OBJ_num_items(lh) LHM_lh_num_items(ADDED_OBJ,lh) +#define lh_ADDED_OBJ_down_load(lh) LHM_lh_down_load(ADDED_OBJ,lh) +#define lh_ADDED_OBJ_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(ADDED_OBJ,lh,out) +#define lh_ADDED_OBJ_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(ADDED_OBJ,lh,out) +#define lh_ADDED_OBJ_stats_bio(lh,out) \ + LHM_lh_stats_bio(ADDED_OBJ,lh,out) +#define lh_ADDED_OBJ_free(lh) LHM_lh_free(ADDED_OBJ,lh) + +#define lh_APP_INFO_new() LHM_lh_new(APP_INFO,app_info) +#define lh_APP_INFO_insert(lh,inst) LHM_lh_insert(APP_INFO,lh,inst) +#define lh_APP_INFO_retrieve(lh,inst) LHM_lh_retrieve(APP_INFO,lh,inst) +#define lh_APP_INFO_delete(lh,inst) LHM_lh_delete(APP_INFO,lh,inst) +#define lh_APP_INFO_doall(lh,fn) LHM_lh_doall(APP_INFO,lh,fn) +#define lh_APP_INFO_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(APP_INFO,lh,fn,arg_type,arg) +#define lh_APP_INFO_error(lh) LHM_lh_error(APP_INFO,lh) +#define lh_APP_INFO_num_items(lh) 
LHM_lh_num_items(APP_INFO,lh) +#define lh_APP_INFO_down_load(lh) LHM_lh_down_load(APP_INFO,lh) +#define lh_APP_INFO_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(APP_INFO,lh,out) +#define lh_APP_INFO_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(APP_INFO,lh,out) +#define lh_APP_INFO_stats_bio(lh,out) \ + LHM_lh_stats_bio(APP_INFO,lh,out) +#define lh_APP_INFO_free(lh) LHM_lh_free(APP_INFO,lh) + +#define lh_CONF_VALUE_new() LHM_lh_new(CONF_VALUE,conf_value) +#define lh_CONF_VALUE_insert(lh,inst) LHM_lh_insert(CONF_VALUE,lh,inst) +#define lh_CONF_VALUE_retrieve(lh,inst) LHM_lh_retrieve(CONF_VALUE,lh,inst) +#define lh_CONF_VALUE_delete(lh,inst) LHM_lh_delete(CONF_VALUE,lh,inst) +#define lh_CONF_VALUE_doall(lh,fn) LHM_lh_doall(CONF_VALUE,lh,fn) +#define lh_CONF_VALUE_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(CONF_VALUE,lh,fn,arg_type,arg) +#define lh_CONF_VALUE_error(lh) LHM_lh_error(CONF_VALUE,lh) +#define lh_CONF_VALUE_num_items(lh) LHM_lh_num_items(CONF_VALUE,lh) +#define lh_CONF_VALUE_down_load(lh) LHM_lh_down_load(CONF_VALUE,lh) +#define lh_CONF_VALUE_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(CONF_VALUE,lh,out) +#define lh_CONF_VALUE_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(CONF_VALUE,lh,out) +#define lh_CONF_VALUE_stats_bio(lh,out) \ + LHM_lh_stats_bio(CONF_VALUE,lh,out) +#define lh_CONF_VALUE_free(lh) LHM_lh_free(CONF_VALUE,lh) + +#define lh_ENGINE_PILE_new() LHM_lh_new(ENGINE_PILE,engine_pile) +#define lh_ENGINE_PILE_insert(lh,inst) LHM_lh_insert(ENGINE_PILE,lh,inst) +#define lh_ENGINE_PILE_retrieve(lh,inst) LHM_lh_retrieve(ENGINE_PILE,lh,inst) +#define lh_ENGINE_PILE_delete(lh,inst) LHM_lh_delete(ENGINE_PILE,lh,inst) +#define lh_ENGINE_PILE_doall(lh,fn) LHM_lh_doall(ENGINE_PILE,lh,fn) +#define lh_ENGINE_PILE_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(ENGINE_PILE,lh,fn,arg_type,arg) +#define lh_ENGINE_PILE_error(lh) LHM_lh_error(ENGINE_PILE,lh) +#define lh_ENGINE_PILE_num_items(lh) LHM_lh_num_items(ENGINE_PILE,lh) +#define lh_ENGINE_PILE_down_load(lh) LHM_lh_down_load(ENGINE_PILE,lh) +#define lh_ENGINE_PILE_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(ENGINE_PILE,lh,out) +#define lh_ENGINE_PILE_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(ENGINE_PILE,lh,out) +#define lh_ENGINE_PILE_stats_bio(lh,out) \ + LHM_lh_stats_bio(ENGINE_PILE,lh,out) +#define lh_ENGINE_PILE_free(lh) LHM_lh_free(ENGINE_PILE,lh) + +#define lh_ERR_STATE_new() LHM_lh_new(ERR_STATE,err_state) +#define lh_ERR_STATE_insert(lh,inst) LHM_lh_insert(ERR_STATE,lh,inst) +#define lh_ERR_STATE_retrieve(lh,inst) LHM_lh_retrieve(ERR_STATE,lh,inst) +#define lh_ERR_STATE_delete(lh,inst) LHM_lh_delete(ERR_STATE,lh,inst) +#define lh_ERR_STATE_doall(lh,fn) LHM_lh_doall(ERR_STATE,lh,fn) +#define lh_ERR_STATE_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(ERR_STATE,lh,fn,arg_type,arg) +#define lh_ERR_STATE_error(lh) LHM_lh_error(ERR_STATE,lh) +#define lh_ERR_STATE_num_items(lh) LHM_lh_num_items(ERR_STATE,lh) +#define lh_ERR_STATE_down_load(lh) LHM_lh_down_load(ERR_STATE,lh) +#define lh_ERR_STATE_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(ERR_STATE,lh,out) +#define lh_ERR_STATE_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(ERR_STATE,lh,out) +#define lh_ERR_STATE_stats_bio(lh,out) \ + LHM_lh_stats_bio(ERR_STATE,lh,out) +#define lh_ERR_STATE_free(lh) LHM_lh_free(ERR_STATE,lh) + +#define lh_ERR_STRING_DATA_new() LHM_lh_new(ERR_STRING_DATA,err_string_data) +#define lh_ERR_STRING_DATA_insert(lh,inst) LHM_lh_insert(ERR_STRING_DATA,lh,inst) 
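The lh_<TYPE>_* blocks in this part of the header do the same job for LHASH tables that the sk_<TYPE>_* blocks do for stacks: each macro expands, via the LHM_lh_* helpers in lhash.h, to the generic lh_* call with type-checked casts. As a hedged illustration (a sketch in the spirit of libcrypto's CONF code, not a copy of it), this is roughly how the lh_CONF_VALUE_* wrappers defined above are used to look up a configuration value by section and name:

#include <openssl/conf.h>       /* CONF_VALUE: char *section, *name, *value */
#include <openssl/lhash.h>
#include <openssl/safestack.h>  /* lh_CONF_VALUE_* wrappers (this header) */

static const char *conf_lookup(LHASH_OF(CONF_VALUE) *tbl,
                               const char *section, const char *name)
{
    CONF_VALUE key, *found;

    /* Only the fields the table's compare callback inspects need to be set. */
    key.section = (char *)section;
    key.name = (char *)name;

    found = lh_CONF_VALUE_retrieve(tbl, &key);
    return (found != NULL) ? found->value : NULL;
}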
+#define lh_ERR_STRING_DATA_retrieve(lh,inst) LHM_lh_retrieve(ERR_STRING_DATA,lh,inst) +#define lh_ERR_STRING_DATA_delete(lh,inst) LHM_lh_delete(ERR_STRING_DATA,lh,inst) +#define lh_ERR_STRING_DATA_doall(lh,fn) LHM_lh_doall(ERR_STRING_DATA,lh,fn) +#define lh_ERR_STRING_DATA_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(ERR_STRING_DATA,lh,fn,arg_type,arg) +#define lh_ERR_STRING_DATA_error(lh) LHM_lh_error(ERR_STRING_DATA,lh) +#define lh_ERR_STRING_DATA_num_items(lh) LHM_lh_num_items(ERR_STRING_DATA,lh) +#define lh_ERR_STRING_DATA_down_load(lh) LHM_lh_down_load(ERR_STRING_DATA,lh) +#define lh_ERR_STRING_DATA_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(ERR_STRING_DATA,lh,out) +#define lh_ERR_STRING_DATA_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(ERR_STRING_DATA,lh,out) +#define lh_ERR_STRING_DATA_stats_bio(lh,out) \ + LHM_lh_stats_bio(ERR_STRING_DATA,lh,out) +#define lh_ERR_STRING_DATA_free(lh) LHM_lh_free(ERR_STRING_DATA,lh) + +#define lh_EX_CLASS_ITEM_new() LHM_lh_new(EX_CLASS_ITEM,ex_class_item) +#define lh_EX_CLASS_ITEM_insert(lh,inst) LHM_lh_insert(EX_CLASS_ITEM,lh,inst) +#define lh_EX_CLASS_ITEM_retrieve(lh,inst) LHM_lh_retrieve(EX_CLASS_ITEM,lh,inst) +#define lh_EX_CLASS_ITEM_delete(lh,inst) LHM_lh_delete(EX_CLASS_ITEM,lh,inst) +#define lh_EX_CLASS_ITEM_doall(lh,fn) LHM_lh_doall(EX_CLASS_ITEM,lh,fn) +#define lh_EX_CLASS_ITEM_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(EX_CLASS_ITEM,lh,fn,arg_type,arg) +#define lh_EX_CLASS_ITEM_error(lh) LHM_lh_error(EX_CLASS_ITEM,lh) +#define lh_EX_CLASS_ITEM_num_items(lh) LHM_lh_num_items(EX_CLASS_ITEM,lh) +#define lh_EX_CLASS_ITEM_down_load(lh) LHM_lh_down_load(EX_CLASS_ITEM,lh) +#define lh_EX_CLASS_ITEM_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(EX_CLASS_ITEM,lh,out) +#define lh_EX_CLASS_ITEM_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(EX_CLASS_ITEM,lh,out) +#define lh_EX_CLASS_ITEM_stats_bio(lh,out) \ + LHM_lh_stats_bio(EX_CLASS_ITEM,lh,out) +#define lh_EX_CLASS_ITEM_free(lh) LHM_lh_free(EX_CLASS_ITEM,lh) + +#define lh_FUNCTION_new() LHM_lh_new(FUNCTION,function) +#define lh_FUNCTION_insert(lh,inst) LHM_lh_insert(FUNCTION,lh,inst) +#define lh_FUNCTION_retrieve(lh,inst) LHM_lh_retrieve(FUNCTION,lh,inst) +#define lh_FUNCTION_delete(lh,inst) LHM_lh_delete(FUNCTION,lh,inst) +#define lh_FUNCTION_doall(lh,fn) LHM_lh_doall(FUNCTION,lh,fn) +#define lh_FUNCTION_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(FUNCTION,lh,fn,arg_type,arg) +#define lh_FUNCTION_error(lh) LHM_lh_error(FUNCTION,lh) +#define lh_FUNCTION_num_items(lh) LHM_lh_num_items(FUNCTION,lh) +#define lh_FUNCTION_down_load(lh) LHM_lh_down_load(FUNCTION,lh) +#define lh_FUNCTION_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(FUNCTION,lh,out) +#define lh_FUNCTION_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(FUNCTION,lh,out) +#define lh_FUNCTION_stats_bio(lh,out) \ + LHM_lh_stats_bio(FUNCTION,lh,out) +#define lh_FUNCTION_free(lh) LHM_lh_free(FUNCTION,lh) + +#define lh_MEM_new() LHM_lh_new(MEM,mem) +#define lh_MEM_insert(lh,inst) LHM_lh_insert(MEM,lh,inst) +#define lh_MEM_retrieve(lh,inst) LHM_lh_retrieve(MEM,lh,inst) +#define lh_MEM_delete(lh,inst) LHM_lh_delete(MEM,lh,inst) +#define lh_MEM_doall(lh,fn) LHM_lh_doall(MEM,lh,fn) +#define lh_MEM_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(MEM,lh,fn,arg_type,arg) +#define lh_MEM_error(lh) LHM_lh_error(MEM,lh) +#define lh_MEM_num_items(lh) LHM_lh_num_items(MEM,lh) +#define lh_MEM_down_load(lh) LHM_lh_down_load(MEM,lh) +#define lh_MEM_node_stats_bio(lh,out) \ + 
LHM_lh_node_stats_bio(MEM,lh,out) +#define lh_MEM_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(MEM,lh,out) +#define lh_MEM_stats_bio(lh,out) \ + LHM_lh_stats_bio(MEM,lh,out) +#define lh_MEM_free(lh) LHM_lh_free(MEM,lh) + +#define lh_OBJ_NAME_new() LHM_lh_new(OBJ_NAME,obj_name) +#define lh_OBJ_NAME_insert(lh,inst) LHM_lh_insert(OBJ_NAME,lh,inst) +#define lh_OBJ_NAME_retrieve(lh,inst) LHM_lh_retrieve(OBJ_NAME,lh,inst) +#define lh_OBJ_NAME_delete(lh,inst) LHM_lh_delete(OBJ_NAME,lh,inst) +#define lh_OBJ_NAME_doall(lh,fn) LHM_lh_doall(OBJ_NAME,lh,fn) +#define lh_OBJ_NAME_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(OBJ_NAME,lh,fn,arg_type,arg) +#define lh_OBJ_NAME_error(lh) LHM_lh_error(OBJ_NAME,lh) +#define lh_OBJ_NAME_num_items(lh) LHM_lh_num_items(OBJ_NAME,lh) +#define lh_OBJ_NAME_down_load(lh) LHM_lh_down_load(OBJ_NAME,lh) +#define lh_OBJ_NAME_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(OBJ_NAME,lh,out) +#define lh_OBJ_NAME_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(OBJ_NAME,lh,out) +#define lh_OBJ_NAME_stats_bio(lh,out) \ + LHM_lh_stats_bio(OBJ_NAME,lh,out) +#define lh_OBJ_NAME_free(lh) LHM_lh_free(OBJ_NAME,lh) + +#define lh_OPENSSL_CSTRING_new() LHM_lh_new(OPENSSL_CSTRING,openssl_cstring) +#define lh_OPENSSL_CSTRING_insert(lh,inst) LHM_lh_insert(OPENSSL_CSTRING,lh,inst) +#define lh_OPENSSL_CSTRING_retrieve(lh,inst) LHM_lh_retrieve(OPENSSL_CSTRING,lh,inst) +#define lh_OPENSSL_CSTRING_delete(lh,inst) LHM_lh_delete(OPENSSL_CSTRING,lh,inst) +#define lh_OPENSSL_CSTRING_doall(lh,fn) LHM_lh_doall(OPENSSL_CSTRING,lh,fn) +#define lh_OPENSSL_CSTRING_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(OPENSSL_CSTRING,lh,fn,arg_type,arg) +#define lh_OPENSSL_CSTRING_error(lh) LHM_lh_error(OPENSSL_CSTRING,lh) +#define lh_OPENSSL_CSTRING_num_items(lh) LHM_lh_num_items(OPENSSL_CSTRING,lh) +#define lh_OPENSSL_CSTRING_down_load(lh) LHM_lh_down_load(OPENSSL_CSTRING,lh) +#define lh_OPENSSL_CSTRING_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(OPENSSL_CSTRING,lh,out) +#define lh_OPENSSL_CSTRING_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(OPENSSL_CSTRING,lh,out) +#define lh_OPENSSL_CSTRING_stats_bio(lh,out) \ + LHM_lh_stats_bio(OPENSSL_CSTRING,lh,out) +#define lh_OPENSSL_CSTRING_free(lh) LHM_lh_free(OPENSSL_CSTRING,lh) + +#define lh_OPENSSL_STRING_new() LHM_lh_new(OPENSSL_STRING,openssl_string) +#define lh_OPENSSL_STRING_insert(lh,inst) LHM_lh_insert(OPENSSL_STRING,lh,inst) +#define lh_OPENSSL_STRING_retrieve(lh,inst) LHM_lh_retrieve(OPENSSL_STRING,lh,inst) +#define lh_OPENSSL_STRING_delete(lh,inst) LHM_lh_delete(OPENSSL_STRING,lh,inst) +#define lh_OPENSSL_STRING_doall(lh,fn) LHM_lh_doall(OPENSSL_STRING,lh,fn) +#define lh_OPENSSL_STRING_doall_arg(lh,fn,arg_type,arg) \ + LHM_lh_doall_arg(OPENSSL_STRING,lh,fn,arg_type,arg) +#define lh_OPENSSL_STRING_error(lh) LHM_lh_error(OPENSSL_STRING,lh) +#define lh_OPENSSL_STRING_num_items(lh) LHM_lh_num_items(OPENSSL_STRING,lh) +#define lh_OPENSSL_STRING_down_load(lh) LHM_lh_down_load(OPENSSL_STRING,lh) +#define lh_OPENSSL_STRING_node_stats_bio(lh,out) \ + LHM_lh_node_stats_bio(OPENSSL_STRING,lh,out) +#define lh_OPENSSL_STRING_node_usage_stats_bio(lh,out) \ + LHM_lh_node_usage_stats_bio(OPENSSL_STRING,lh,out) +#define lh_OPENSSL_STRING_stats_bio(lh,out) \ + LHM_lh_stats_bio(OPENSSL_STRING,lh,out) +#define lh_OPENSSL_STRING_free(lh) LHM_lh_free(OPENSSL_STRING,lh) + +#define lh_SSL_SESSION_new() LHM_lh_new(SSL_SESSION,ssl_session) +#define lh_SSL_SESSION_insert(lh,inst) LHM_lh_insert(SSL_SESSION,lh,inst) +#define 
lh_SSL_SESSION_retrieve(lh,inst) LHM_lh_retrieve(SSL_SESSION,lh,inst)
+#define lh_SSL_SESSION_delete(lh,inst) LHM_lh_delete(SSL_SESSION,lh,inst)
+#define lh_SSL_SESSION_doall(lh,fn) LHM_lh_doall(SSL_SESSION,lh,fn)
+#define lh_SSL_SESSION_doall_arg(lh,fn,arg_type,arg) \
+  LHM_lh_doall_arg(SSL_SESSION,lh,fn,arg_type,arg)
+#define lh_SSL_SESSION_error(lh) LHM_lh_error(SSL_SESSION,lh)
+#define lh_SSL_SESSION_num_items(lh) LHM_lh_num_items(SSL_SESSION,lh)
+#define lh_SSL_SESSION_down_load(lh) LHM_lh_down_load(SSL_SESSION,lh)
+#define lh_SSL_SESSION_node_stats_bio(lh,out) \
+  LHM_lh_node_stats_bio(SSL_SESSION,lh,out)
+#define lh_SSL_SESSION_node_usage_stats_bio(lh,out) \
+  LHM_lh_node_usage_stats_bio(SSL_SESSION,lh,out)
+#define lh_SSL_SESSION_stats_bio(lh,out) \
+  LHM_lh_stats_bio(SSL_SESSION,lh,out)
+#define lh_SSL_SESSION_free(lh) LHM_lh_free(SSL_SESSION,lh)
+/* End of util/mkstack.pl block, you may now edit :-) */
+
+#endif /* !defined HEADER_SAFESTACK_H */
diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/sha.h b/compat/curl-for-windows/openssl/openssl/include/openssl/sha.h
new file mode 100644
index 0000000000..ab9d94c386
--- /dev/null
+++ b/compat/curl-for-windows/openssl/openssl/include/openssl/sha.h
@@ -0,0 +1 @@
+#include "../../crypto/sha/sha.h"
diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/stack.h b/compat/curl-for-windows/openssl/openssl/include/openssl/stack.h
new file mode 100644
index 0000000000..ce35e554eb
--- /dev/null
+++ b/compat/curl-for-windows/openssl/openssl/include/openssl/stack.h
@@ -0,0 +1,108 @@
+/* crypto/stack/stack.h */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the rouines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#ifndef HEADER_STACK_H +#define HEADER_STACK_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct stack_st + { + int num; + char **data; + int sorted; + + int num_alloc; + int (*comp)(const void *, const void *); + } _STACK; /* Use STACK_OF(...) instead */ + +#define M_sk_num(sk) ((sk) ? (sk)->num:-1) +#define M_sk_value(sk,n) ((sk) ? (sk)->data[n] : NULL) + +int sk_num(const _STACK *); +void *sk_value(const _STACK *, int); + +void *sk_set(_STACK *, int, void *); + +_STACK *sk_new(int (*cmp)(const void *, const void *)); +_STACK *sk_new_null(void); +void sk_free(_STACK *); +void sk_pop_free(_STACK *st, void (*func)(void *)); +int sk_insert(_STACK *sk, void *data, int where); +void *sk_delete(_STACK *st, int loc); +void *sk_delete_ptr(_STACK *st, void *p); +int sk_find(_STACK *st, void *data); +int sk_find_ex(_STACK *st, void *data); +int sk_push(_STACK *st, void *data); +int sk_unshift(_STACK *st, void *data); +void *sk_shift(_STACK *st); +void *sk_pop(_STACK *st); +void sk_zero(_STACK *st); +int (*sk_set_cmp_func(_STACK *sk, int (*c)(const void *, const void *))) + (const void *, const void *); +_STACK *sk_dup(_STACK *st); +void sk_sort(_STACK *st); +int sk_is_sorted(const _STACK *st); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/curl-for-windows/openssl/openssl/include/openssl/symhacks.h b/compat/curl-for-windows/openssl/openssl/include/openssl/symhacks.h new file mode 100644 index 0000000000..bd2f000d59 --- /dev/null +++ b/compat/curl-for-windows/openssl/openssl/include/openssl/symhacks.h @@ -0,0 +1,481 @@ +/* ==================================================================== + * Copyright (c) 1999 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. 
All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + +#ifndef HEADER_SYMHACKS_H +#define HEADER_SYMHACKS_H + +#include + +/* Hacks to solve the problem with linkers incapable of handling very long + symbol names. In the case of VMS, the limit is 31 characters on VMS for + VAX. */ +/* Note that this affects util/libeay.num and util/ssleay.num... you may + change those manually, but that's not recommended, as those files are + controlled centrally and updated on Unix, and the central definition + may disagree with yours, which in turn may come with shareable library + incompatibilities. 
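(Editor's aside, not part of the vendored header: the block that follows is plain #undef/#define aliasing. For a hypothetical symbol over the 31-character VMS limit the pattern looks like this; callers keep writing the long name, and the object file only ever records the short one:

    #undef  EXAMPLE_set_default_verify_callback_function   /* hypothetical name, 44 chars */
    #define EXAMPLE_set_default_verify_callback_function EXAMPLE_set_def_verify_cb_fn   /* 28 chars */
)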
*/ +#ifdef OPENSSL_SYS_VMS + +/* Hack a long name in crypto/ex_data.c */ +#undef CRYPTO_get_ex_data_implementation +#define CRYPTO_get_ex_data_implementation CRYPTO_get_ex_data_impl +#undef CRYPTO_set_ex_data_implementation +#define CRYPTO_set_ex_data_implementation CRYPTO_set_ex_data_impl + +/* Hack a long name in crypto/asn1/a_mbstr.c */ +#undef ASN1_STRING_set_default_mask_asc +#define ASN1_STRING_set_default_mask_asc ASN1_STRING_set_def_mask_asc + +#if 0 /* No longer needed, since safestack macro magic does the job */ +/* Hack the names created with DECLARE_ASN1_SET_OF(PKCS7_SIGNER_INFO) */ +#undef i2d_ASN1_SET_OF_PKCS7_SIGNER_INFO +#define i2d_ASN1_SET_OF_PKCS7_SIGNER_INFO i2d_ASN1_SET_OF_PKCS7_SIGINF +#undef d2i_ASN1_SET_OF_PKCS7_SIGNER_INFO +#define d2i_ASN1_SET_OF_PKCS7_SIGNER_INFO d2i_ASN1_SET_OF_PKCS7_SIGINF +#endif + +#if 0 /* No longer needed, since safestack macro magic does the job */ +/* Hack the names created with DECLARE_ASN1_SET_OF(PKCS7_RECIP_INFO) */ +#undef i2d_ASN1_SET_OF_PKCS7_RECIP_INFO +#define i2d_ASN1_SET_OF_PKCS7_RECIP_INFO i2d_ASN1_SET_OF_PKCS7_RECINF +#undef d2i_ASN1_SET_OF_PKCS7_RECIP_INFO +#define d2i_ASN1_SET_OF_PKCS7_RECIP_INFO d2i_ASN1_SET_OF_PKCS7_RECINF +#endif + +#if 0 /* No longer needed, since safestack macro magic does the job */ +/* Hack the names created with DECLARE_ASN1_SET_OF(ACCESS_DESCRIPTION) */ +#undef i2d_ASN1_SET_OF_ACCESS_DESCRIPTION +#define i2d_ASN1_SET_OF_ACCESS_DESCRIPTION i2d_ASN1_SET_OF_ACC_DESC +#undef d2i_ASN1_SET_OF_ACCESS_DESCRIPTION +#define d2i_ASN1_SET_OF_ACCESS_DESCRIPTION d2i_ASN1_SET_OF_ACC_DESC +#endif + +/* Hack the names created with DECLARE_PEM_rw(NETSCAPE_CERT_SEQUENCE) */ +#undef PEM_read_NETSCAPE_CERT_SEQUENCE +#define PEM_read_NETSCAPE_CERT_SEQUENCE PEM_read_NS_CERT_SEQ +#undef PEM_write_NETSCAPE_CERT_SEQUENCE +#define PEM_write_NETSCAPE_CERT_SEQUENCE PEM_write_NS_CERT_SEQ +#undef PEM_read_bio_NETSCAPE_CERT_SEQUENCE +#define PEM_read_bio_NETSCAPE_CERT_SEQUENCE PEM_read_bio_NS_CERT_SEQ +#undef PEM_write_bio_NETSCAPE_CERT_SEQUENCE +#define PEM_write_bio_NETSCAPE_CERT_SEQUENCE PEM_write_bio_NS_CERT_SEQ +#undef PEM_write_cb_bio_NETSCAPE_CERT_SEQUENCE +#define PEM_write_cb_bio_NETSCAPE_CERT_SEQUENCE PEM_write_cb_bio_NS_CERT_SEQ + +/* Hack the names created with DECLARE_PEM_rw(PKCS8_PRIV_KEY_INFO) */ +#undef PEM_read_PKCS8_PRIV_KEY_INFO +#define PEM_read_PKCS8_PRIV_KEY_INFO PEM_read_P8_PRIV_KEY_INFO +#undef PEM_write_PKCS8_PRIV_KEY_INFO +#define PEM_write_PKCS8_PRIV_KEY_INFO PEM_write_P8_PRIV_KEY_INFO +#undef PEM_read_bio_PKCS8_PRIV_KEY_INFO +#define PEM_read_bio_PKCS8_PRIV_KEY_INFO PEM_read_bio_P8_PRIV_KEY_INFO +#undef PEM_write_bio_PKCS8_PRIV_KEY_INFO +#define PEM_write_bio_PKCS8_PRIV_KEY_INFO PEM_write_bio_P8_PRIV_KEY_INFO +#undef PEM_write_cb_bio_PKCS8_PRIV_KEY_INFO +#define PEM_write_cb_bio_PKCS8_PRIV_KEY_INFO PEM_wrt_cb_bio_P8_PRIV_KEY_INFO + +/* Hack other PEM names */ +#undef PEM_write_bio_PKCS8PrivateKey_nid +#define PEM_write_bio_PKCS8PrivateKey_nid PEM_write_bio_PKCS8PrivKey_nid + +/* Hack some long X509 names */ +#undef X509_REVOKED_get_ext_by_critical +#define X509_REVOKED_get_ext_by_critical X509_REVOKED_get_ext_by_critic +#undef X509_policy_tree_get0_user_policies +#define X509_policy_tree_get0_user_policies X509_pcy_tree_get0_usr_policies +#undef X509_policy_node_get0_qualifiers +#define X509_policy_node_get0_qualifiers X509_pcy_node_get0_qualifiers +#undef X509_STORE_CTX_get_explicit_policy +#define X509_STORE_CTX_get_explicit_policy X509_STORE_CTX_get_expl_policy +#undef 
X509_STORE_CTX_get0_current_issuer +#define X509_STORE_CTX_get0_current_issuer X509_STORE_CTX_get0_cur_issuer + +/* Hack some long CRYPTO names */ +#undef CRYPTO_set_dynlock_destroy_callback +#define CRYPTO_set_dynlock_destroy_callback CRYPTO_set_dynlock_destroy_cb +#undef CRYPTO_set_dynlock_create_callback +#define CRYPTO_set_dynlock_create_callback CRYPTO_set_dynlock_create_cb +#undef CRYPTO_set_dynlock_lock_callback +#define CRYPTO_set_dynlock_lock_callback CRYPTO_set_dynlock_lock_cb +#undef CRYPTO_get_dynlock_lock_callback +#define CRYPTO_get_dynlock_lock_callback CRYPTO_get_dynlock_lock_cb +#undef CRYPTO_get_dynlock_destroy_callback +#define CRYPTO_get_dynlock_destroy_callback CRYPTO_get_dynlock_destroy_cb +#undef CRYPTO_get_dynlock_create_callback +#define CRYPTO_get_dynlock_create_callback CRYPTO_get_dynlock_create_cb +#undef CRYPTO_set_locked_mem_ex_functions +#define CRYPTO_set_locked_mem_ex_functions CRYPTO_set_locked_mem_ex_funcs +#undef CRYPTO_get_locked_mem_ex_functions +#define CRYPTO_get_locked_mem_ex_functions CRYPTO_get_locked_mem_ex_funcs + +/* Hack some long SSL names */ +#undef SSL_CTX_set_default_verify_paths +#define SSL_CTX_set_default_verify_paths SSL_CTX_set_def_verify_paths +#undef SSL_get_ex_data_X509_STORE_CTX_idx +#define SSL_get_ex_data_X509_STORE_CTX_idx SSL_get_ex_d_X509_STORE_CTX_idx +#undef SSL_add_file_cert_subjects_to_stack +#define SSL_add_file_cert_subjects_to_stack SSL_add_file_cert_subjs_to_stk +#undef SSL_add_dir_cert_subjects_to_stack +#define SSL_add_dir_cert_subjects_to_stack SSL_add_dir_cert_subjs_to_stk +#undef SSL_CTX_use_certificate_chain_file +#define SSL_CTX_use_certificate_chain_file SSL_CTX_use_cert_chain_file +#undef SSL_CTX_set_cert_verify_callback +#define SSL_CTX_set_cert_verify_callback SSL_CTX_set_cert_verify_cb +#undef SSL_CTX_set_default_passwd_cb_userdata +#define SSL_CTX_set_default_passwd_cb_userdata SSL_CTX_set_def_passwd_cb_ud +#undef SSL_COMP_get_compression_methods +#define SSL_COMP_get_compression_methods SSL_COMP_get_compress_methods +#undef ssl_add_clienthello_renegotiate_ext +#define ssl_add_clienthello_renegotiate_ext ssl_add_clienthello_reneg_ext +#undef ssl_add_serverhello_renegotiate_ext +#define ssl_add_serverhello_renegotiate_ext ssl_add_serverhello_reneg_ext +#undef ssl_parse_clienthello_renegotiate_ext +#define ssl_parse_clienthello_renegotiate_ext ssl_parse_clienthello_reneg_ext +#undef ssl_parse_serverhello_renegotiate_ext +#define ssl_parse_serverhello_renegotiate_ext ssl_parse_serverhello_reneg_ext +#undef SSL_srp_server_param_with_username +#define SSL_srp_server_param_with_username SSL_srp_server_param_with_un +#undef SSL_CTX_set_srp_client_pwd_callback +#define SSL_CTX_set_srp_client_pwd_callback SSL_CTX_set_srp_client_pwd_cb +#undef SSL_CTX_set_srp_verify_param_callback +#define SSL_CTX_set_srp_verify_param_callback SSL_CTX_set_srp_vfy_param_cb +#undef SSL_CTX_set_srp_username_callback +#define SSL_CTX_set_srp_username_callback SSL_CTX_set_srp_un_cb +#undef ssl_add_clienthello_use_srtp_ext +#define ssl_add_clienthello_use_srtp_ext ssl_add_clihello_use_srtp_ext +#undef ssl_add_serverhello_use_srtp_ext +#define ssl_add_serverhello_use_srtp_ext ssl_add_serhello_use_srtp_ext +#undef ssl_parse_clienthello_use_srtp_ext +#define ssl_parse_clienthello_use_srtp_ext ssl_parse_clihello_use_srtp_ext +#undef ssl_parse_serverhello_use_srtp_ext +#define ssl_parse_serverhello_use_srtp_ext ssl_parse_serhello_use_srtp_ext +#undef SSL_CTX_set_next_protos_advertised_cb +#define SSL_CTX_set_next_protos_advertised_cb 
SSL_CTX_set_next_protos_adv_cb +#undef SSL_CTX_set_next_proto_select_cb +#define SSL_CTX_set_next_proto_select_cb SSL_CTX_set_next_proto_sel_cb +#undef ssl3_cbc_record_digest_supported +#define ssl3_cbc_record_digest_supported ssl3_cbc_record_digest_support +#undef ssl_check_clienthello_tlsext_late +#define ssl_check_clienthello_tlsext_late ssl_check_clihello_tlsext_late +#undef ssl_check_clienthello_tlsext_early +#define ssl_check_clienthello_tlsext_early ssl_check_clihello_tlsext_early + +/* Hack some long ENGINE names */ +#undef ENGINE_get_default_BN_mod_exp_crt +#define ENGINE_get_default_BN_mod_exp_crt ENGINE_get_def_BN_mod_exp_crt +#undef ENGINE_set_default_BN_mod_exp_crt +#define ENGINE_set_default_BN_mod_exp_crt ENGINE_set_def_BN_mod_exp_crt +#undef ENGINE_set_load_privkey_function +#define ENGINE_set_load_privkey_function ENGINE_set_load_privkey_fn +#undef ENGINE_get_load_privkey_function +#define ENGINE_get_load_privkey_function ENGINE_get_load_privkey_fn +#undef ENGINE_unregister_pkey_asn1_meths +#define ENGINE_unregister_pkey_asn1_meths ENGINE_unreg_pkey_asn1_meths +#undef ENGINE_register_all_pkey_asn1_meths +#define ENGINE_register_all_pkey_asn1_meths ENGINE_reg_all_pkey_asn1_meths +#undef ENGINE_set_default_pkey_asn1_meths +#define ENGINE_set_default_pkey_asn1_meths ENGINE_set_def_pkey_asn1_meths +#undef ENGINE_get_pkey_asn1_meth_engine +#define ENGINE_get_pkey_asn1_meth_engine ENGINE_get_pkey_asn1_meth_eng +#undef ENGINE_set_load_ssl_client_cert_function +#define ENGINE_set_load_ssl_client_cert_function \ + ENGINE_set_ld_ssl_clnt_cert_fn +#undef ENGINE_get_ssl_client_cert_function +#define ENGINE_get_ssl_client_cert_function ENGINE_get_ssl_client_cert_fn + +/* Hack some long OCSP names */ +#undef OCSP_REQUEST_get_ext_by_critical +#define OCSP_REQUEST_get_ext_by_critical OCSP_REQUEST_get_ext_by_crit +#undef OCSP_BASICRESP_get_ext_by_critical +#define OCSP_BASICRESP_get_ext_by_critical OCSP_BASICRESP_get_ext_by_crit +#undef OCSP_SINGLERESP_get_ext_by_critical +#define OCSP_SINGLERESP_get_ext_by_critical OCSP_SINGLERESP_get_ext_by_crit + +/* Hack some long DES names */ +#undef _ossl_old_des_ede3_cfb64_encrypt +#define _ossl_old_des_ede3_cfb64_encrypt _ossl_odes_ede3_cfb64_encrypt +#undef _ossl_old_des_ede3_ofb64_encrypt +#define _ossl_old_des_ede3_ofb64_encrypt _ossl_odes_ede3_ofb64_encrypt + +/* Hack some long EVP names */ +#undef OPENSSL_add_all_algorithms_noconf +#define OPENSSL_add_all_algorithms_noconf OPENSSL_add_all_algo_noconf +#undef OPENSSL_add_all_algorithms_conf +#define OPENSSL_add_all_algorithms_conf OPENSSL_add_all_algo_conf +#undef EVP_PKEY_meth_set_verify_recover +#define EVP_PKEY_meth_set_verify_recover EVP_PKEY_meth_set_vrfy_recover + +/* Hack some long EC names */ +#undef EC_GROUP_set_point_conversion_form +#define EC_GROUP_set_point_conversion_form EC_GROUP_set_point_conv_form +#undef EC_GROUP_get_point_conversion_form +#define EC_GROUP_get_point_conversion_form EC_GROUP_get_point_conv_form +#undef EC_GROUP_clear_free_all_extra_data +#define EC_GROUP_clear_free_all_extra_data EC_GROUP_clr_free_all_xtra_data +#undef EC_KEY_set_public_key_affine_coordinates +#define EC_KEY_set_public_key_affine_coordinates \ + EC_KEY_set_pub_key_aff_coords +#undef EC_POINT_set_Jprojective_coordinates_GFp +#define EC_POINT_set_Jprojective_coordinates_GFp \ + EC_POINT_set_Jproj_coords_GFp +#undef EC_POINT_get_Jprojective_coordinates_GFp +#define EC_POINT_get_Jprojective_coordinates_GFp \ + EC_POINT_get_Jproj_coords_GFp +#undef EC_POINT_set_affine_coordinates_GFp +#define 
EC_POINT_set_affine_coordinates_GFp EC_POINT_set_affine_coords_GFp +#undef EC_POINT_get_affine_coordinates_GFp +#define EC_POINT_get_affine_coordinates_GFp EC_POINT_get_affine_coords_GFp +#undef EC_POINT_set_compressed_coordinates_GFp +#define EC_POINT_set_compressed_coordinates_GFp EC_POINT_set_compr_coords_GFp +#undef EC_POINT_set_affine_coordinates_GF2m +#define EC_POINT_set_affine_coordinates_GF2m EC_POINT_set_affine_coords_GF2m +#undef EC_POINT_get_affine_coordinates_GF2m +#define EC_POINT_get_affine_coordinates_GF2m EC_POINT_get_affine_coords_GF2m +#undef EC_POINT_set_compressed_coordinates_GF2m +#define EC_POINT_set_compressed_coordinates_GF2m \ + EC_POINT_set_compr_coords_GF2m +#undef ec_GF2m_simple_group_clear_finish +#define ec_GF2m_simple_group_clear_finish ec_GF2m_simple_grp_clr_finish +#undef ec_GF2m_simple_group_check_discriminant +#define ec_GF2m_simple_group_check_discriminant ec_GF2m_simple_grp_chk_discrim +#undef ec_GF2m_simple_point_clear_finish +#define ec_GF2m_simple_point_clear_finish ec_GF2m_simple_pt_clr_finish +#undef ec_GF2m_simple_point_set_to_infinity +#define ec_GF2m_simple_point_set_to_infinity ec_GF2m_simple_pt_set_to_inf +#undef ec_GF2m_simple_points_make_affine +#define ec_GF2m_simple_points_make_affine ec_GF2m_simple_pts_make_affine +#undef ec_GF2m_simple_point_set_affine_coordinates +#define ec_GF2m_simple_point_set_affine_coordinates \ + ec_GF2m_smp_pt_set_af_coords +#undef ec_GF2m_simple_point_get_affine_coordinates +#define ec_GF2m_simple_point_get_affine_coordinates \ + ec_GF2m_smp_pt_get_af_coords +#undef ec_GF2m_simple_set_compressed_coordinates +#define ec_GF2m_simple_set_compressed_coordinates \ + ec_GF2m_smp_set_compr_coords +#undef ec_GFp_simple_group_set_curve_GFp +#define ec_GFp_simple_group_set_curve_GFp ec_GFp_simple_grp_set_curve_GFp +#undef ec_GFp_simple_group_get_curve_GFp +#define ec_GFp_simple_group_get_curve_GFp ec_GFp_simple_grp_get_curve_GFp +#undef ec_GFp_simple_group_clear_finish +#define ec_GFp_simple_group_clear_finish ec_GFp_simple_grp_clear_finish +#undef ec_GFp_simple_group_set_generator +#define ec_GFp_simple_group_set_generator ec_GFp_simple_grp_set_generator +#undef ec_GFp_simple_group_get0_generator +#define ec_GFp_simple_group_get0_generator ec_GFp_simple_grp_gt0_generator +#undef ec_GFp_simple_group_get_cofactor +#define ec_GFp_simple_group_get_cofactor ec_GFp_simple_grp_get_cofactor +#undef ec_GFp_simple_point_clear_finish +#define ec_GFp_simple_point_clear_finish ec_GFp_simple_pt_clear_finish +#undef ec_GFp_simple_point_set_to_infinity +#define ec_GFp_simple_point_set_to_infinity ec_GFp_simple_pt_set_to_inf +#undef ec_GFp_simple_points_make_affine +#define ec_GFp_simple_points_make_affine ec_GFp_simple_pts_make_affine +#undef ec_GFp_simple_set_Jprojective_coordinates_GFp +#define ec_GFp_simple_set_Jprojective_coordinates_GFp \ + ec_GFp_smp_set_Jproj_coords_GFp +#undef ec_GFp_simple_get_Jprojective_coordinates_GFp +#define ec_GFp_simple_get_Jprojective_coordinates_GFp \ + ec_GFp_smp_get_Jproj_coords_GFp +#undef ec_GFp_simple_point_set_affine_coordinates_GFp +#define ec_GFp_simple_point_set_affine_coordinates_GFp \ + ec_GFp_smp_pt_set_af_coords_GFp +#undef ec_GFp_simple_point_get_affine_coordinates_GFp +#define ec_GFp_simple_point_get_affine_coordinates_GFp \ + ec_GFp_smp_pt_get_af_coords_GFp +#undef ec_GFp_simple_set_compressed_coordinates_GFp +#define ec_GFp_simple_set_compressed_coordinates_GFp \ + ec_GFp_smp_set_compr_coords_GFp +#undef ec_GFp_simple_point_set_affine_coordinates +#define 
ec_GFp_simple_point_set_affine_coordinates \ + ec_GFp_smp_pt_set_af_coords +#undef ec_GFp_simple_point_get_affine_coordinates +#define ec_GFp_simple_point_get_affine_coordinates \ + ec_GFp_smp_pt_get_af_coords +#undef ec_GFp_simple_set_compressed_coordinates +#define ec_GFp_simple_set_compressed_coordinates \ + ec_GFp_smp_set_compr_coords +#undef ec_GFp_simple_group_check_discriminant +#define ec_GFp_simple_group_check_discriminant ec_GFp_simple_grp_chk_discrim + +/* Hack som long STORE names */ +#undef STORE_method_set_initialise_function +#define STORE_method_set_initialise_function STORE_meth_set_initialise_fn +#undef STORE_method_set_cleanup_function +#define STORE_method_set_cleanup_function STORE_meth_set_cleanup_fn +#undef STORE_method_set_generate_function +#define STORE_method_set_generate_function STORE_meth_set_generate_fn +#undef STORE_method_set_modify_function +#define STORE_method_set_modify_function STORE_meth_set_modify_fn +#undef STORE_method_set_revoke_function +#define STORE_method_set_revoke_function STORE_meth_set_revoke_fn +#undef STORE_method_set_delete_function +#define STORE_method_set_delete_function STORE_meth_set_delete_fn +#undef STORE_method_set_list_start_function +#define STORE_method_set_list_start_function STORE_meth_set_list_start_fn +#undef STORE_method_set_list_next_function +#define STORE_method_set_list_next_function STORE_meth_set_list_next_fn +#undef STORE_method_set_list_end_function +#define STORE_method_set_list_end_function STORE_meth_set_list_end_fn +#undef STORE_method_set_update_store_function +#define STORE_method_set_update_store_function STORE_meth_set_update_store_fn +#undef STORE_method_set_lock_store_function +#define STORE_method_set_lock_store_function STORE_meth_set_lock_store_fn +#undef STORE_method_set_unlock_store_function +#define STORE_method_set_unlock_store_function STORE_meth_set_unlock_store_fn +#undef STORE_method_get_initialise_function +#define STORE_method_get_initialise_function STORE_meth_get_initialise_fn +#undef STORE_method_get_cleanup_function +#define STORE_method_get_cleanup_function STORE_meth_get_cleanup_fn +#undef STORE_method_get_generate_function +#define STORE_method_get_generate_function STORE_meth_get_generate_fn +#undef STORE_method_get_modify_function +#define STORE_method_get_modify_function STORE_meth_get_modify_fn +#undef STORE_method_get_revoke_function +#define STORE_method_get_revoke_function STORE_meth_get_revoke_fn +#undef STORE_method_get_delete_function +#define STORE_method_get_delete_function STORE_meth_get_delete_fn +#undef STORE_method_get_list_start_function +#define STORE_method_get_list_start_function STORE_meth_get_list_start_fn +#undef STORE_method_get_list_next_function +#define STORE_method_get_list_next_function STORE_meth_get_list_next_fn +#undef STORE_method_get_list_end_function +#define STORE_method_get_list_end_function STORE_meth_get_list_end_fn +#undef STORE_method_get_update_store_function +#define STORE_method_get_update_store_function STORE_meth_get_update_store_fn +#undef STORE_method_get_lock_store_function +#define STORE_method_get_lock_store_function STORE_meth_get_lock_store_fn +#undef STORE_method_get_unlock_store_function +#define STORE_method_get_unlock_store_function STORE_meth_get_unlock_store_fn + +/* Hack some long TS names */ +#undef TS_RESP_CTX_set_status_info_cond +#define TS_RESP_CTX_set_status_info_cond TS_RESP_CTX_set_stat_info_cond +#undef TS_RESP_CTX_set_clock_precision_digits +#define TS_RESP_CTX_set_clock_precision_digits 
TS_RESP_CTX_set_clk_prec_digits +#undef TS_CONF_set_clock_precision_digits +#define TS_CONF_set_clock_precision_digits TS_CONF_set_clk_prec_digits + +/* Hack some long CMS names */ +#undef CMS_RecipientInfo_ktri_get0_algs +#define CMS_RecipientInfo_ktri_get0_algs CMS_RecipInfo_ktri_get0_algs +#undef CMS_RecipientInfo_ktri_get0_signer_id +#define CMS_RecipientInfo_ktri_get0_signer_id CMS_RecipInfo_ktri_get0_sigr_id +#undef CMS_OtherRevocationInfoFormat_it +#define CMS_OtherRevocationInfoFormat_it CMS_OtherRevocInfoFormat_it +#undef CMS_KeyAgreeRecipientIdentifier_it +#define CMS_KeyAgreeRecipientIdentifier_it CMS_KeyAgreeRecipIdentifier_it +#undef CMS_OriginatorIdentifierOrKey_it +#define CMS_OriginatorIdentifierOrKey_it CMS_OriginatorIdOrKey_it +#undef cms_SignerIdentifier_get0_signer_id +#define cms_SignerIdentifier_get0_signer_id cms_SignerId_get0_signer_id + +/* Hack some long DTLS1 names */ +#undef dtls1_retransmit_buffered_messages +#define dtls1_retransmit_buffered_messages dtls1_retransmit_buffered_msgs + +/* Hack some long SRP names */ +#undef SRP_generate_server_master_secret +#define SRP_generate_server_master_secret SRP_gen_server_master_secret +#undef SRP_generate_client_master_secret +#define SRP_generate_client_master_secret SRP_gen_client_master_secret + +/* Hack some long UI names */ +#undef UI_method_get_prompt_constructor +#define UI_method_get_prompt_constructor UI_method_get_prompt_constructr +#undef UI_method_set_prompt_constructor +#define UI_method_set_prompt_constructor UI_method_set_prompt_constructr + +#endif /* defined OPENSSL_SYS_VMS */ + + +/* Case insensitive linking causes problems.... */ +#if defined(OPENSSL_SYS_VMS) || defined(OPENSSL_SYS_OS2) +#undef ERR_load_CRYPTO_strings +#define ERR_load_CRYPTO_strings ERR_load_CRYPTOlib_strings +#undef OCSP_crlID_new +#define OCSP_crlID_new OCSP_crlID2_new + +#undef d2i_ECPARAMETERS +#define d2i_ECPARAMETERS d2i_UC_ECPARAMETERS +#undef i2d_ECPARAMETERS +#define i2d_ECPARAMETERS i2d_UC_ECPARAMETERS +#undef d2i_ECPKPARAMETERS +#define d2i_ECPKPARAMETERS d2i_UC_ECPKPARAMETERS +#undef i2d_ECPKPARAMETERS +#define i2d_ECPKPARAMETERS i2d_UC_ECPKPARAMETERS + +/* These functions do not seem to exist! However, I'm paranoid... + Original command in x509v3.h: + These functions are being redefined in another directory, + and clash when the linker is case-insensitive, so let's + hide them a little, by giving them an extra 'o' at the + beginning of the name... */ +#undef X509v3_cleanup_extensions +#define X509v3_cleanup_extensions oX509v3_cleanup_extensions +#undef X509v3_add_extension +#define X509v3_add_extension oX509v3_add_extension +#undef X509v3_add_netscape_extensions +#define X509v3_add_netscape_extensions oX509v3_add_netscape_extensions +#undef X509v3_add_standard_extensions +#define X509v3_add_standard_extensions oX509v3_add_standard_extensions + +/* This one clashes with CMS_data_create */ +#undef cms_Data_create +#define cms_Data_create priv_cms_Data_create + +#endif + + +#endif /* ! 
defined HEADER_VMS_IDHACKS_H */ diff --git a/compat/curl-for-windows/out/x64/Release/lib/libcurl.x64.lib b/compat/curl-for-windows/out/x64/Release/lib/libcurl.x64.lib new file mode 100644 index 0000000000..d6f6a7a295 Binary files /dev/null and b/compat/curl-for-windows/out/x64/Release/lib/libcurl.x64.lib differ diff --git a/compat/curl-for-windows/out/x64/Release/lib/openssl.x64.lib b/compat/curl-for-windows/out/x64/Release/lib/openssl.x64.lib new file mode 100644 index 0000000000..26ed21b4dd Binary files /dev/null and b/compat/curl-for-windows/out/x64/Release/lib/openssl.x64.lib differ diff --git a/compat/curl-for-windows/out/x64/Release/lib/zlib.x64.lib b/compat/curl-for-windows/out/x64/Release/lib/zlib.x64.lib new file mode 100644 index 0000000000..22947086cb Binary files /dev/null and b/compat/curl-for-windows/out/x64/Release/lib/zlib.x64.lib differ diff --git a/compat/curl-for-windows/out/x86/Release/lib/libcurl.x86.lib b/compat/curl-for-windows/out/x86/Release/lib/libcurl.x86.lib new file mode 100644 index 0000000000..8374ca0307 Binary files /dev/null and b/compat/curl-for-windows/out/x86/Release/lib/libcurl.x86.lib differ diff --git a/compat/curl-for-windows/out/x86/Release/lib/openssl.x86.lib b/compat/curl-for-windows/out/x86/Release/lib/openssl.x86.lib new file mode 100644 index 0000000000..1f75262911 Binary files /dev/null and b/compat/curl-for-windows/out/x86/Release/lib/openssl.x86.lib differ diff --git a/compat/curl-for-windows/out/x86/Release/lib/zlib.x86.lib b/compat/curl-for-windows/out/x86/Release/lib/zlib.x86.lib new file mode 100644 index 0000000000..10dc5baca1 Binary files /dev/null and b/compat/curl-for-windows/out/x86/Release/lib/zlib.x86.lib differ diff --git a/compat/curl-for-windows/zlib/zconf.h b/compat/curl-for-windows/zlib/zconf.h new file mode 100644 index 0000000000..9987a77553 --- /dev/null +++ b/compat/curl-for-windows/zlib/zconf.h @@ -0,0 +1,511 @@ +/* zconf.h -- configuration of the zlib compression library + * Copyright (C) 1995-2013 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#ifndef ZCONF_H +#define ZCONF_H + +/* + * If you *really* need a unique prefix for all types and library functions, + * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. + * Even better than compiling with -DZ_PREFIX would be to use configure to set + * this permanently in zconf.h using "./configure --zprefix". 
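(Editor's aside, not part of the vendored header: when both the library and the application are built with -DZ_PREFIX, call sites keep the standard names and the preprocessor rewrites them to the prefixed symbols defined in the block below, e.g.

    err = compress(dest, &destLen, source, sourceLen);     /* what the source says  */
    err = z_compress(dest, &destLen, source, sourceLen);   /* what the linker sees  */

so only the link-time symbol changes, not the calling code.)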
+ */ +#ifdef Z_PREFIX /* may be set to #if 1 by ./configure */ +# define Z_PREFIX_SET + +/* all linked symbols */ +# define _dist_code z__dist_code +# define _length_code z__length_code +# define _tr_align z__tr_align +# define _tr_flush_bits z__tr_flush_bits +# define _tr_flush_block z__tr_flush_block +# define _tr_init z__tr_init +# define _tr_stored_block z__tr_stored_block +# define _tr_tally z__tr_tally +# define adler32 z_adler32 +# define adler32_combine z_adler32_combine +# define adler32_combine64 z_adler32_combine64 +# ifndef Z_SOLO +# define compress z_compress +# define compress2 z_compress2 +# define compressBound z_compressBound +# endif +# define crc32 z_crc32 +# define crc32_combine z_crc32_combine +# define crc32_combine64 z_crc32_combine64 +# define deflate z_deflate +# define deflateBound z_deflateBound +# define deflateCopy z_deflateCopy +# define deflateEnd z_deflateEnd +# define deflateInit2_ z_deflateInit2_ +# define deflateInit_ z_deflateInit_ +# define deflateParams z_deflateParams +# define deflatePending z_deflatePending +# define deflatePrime z_deflatePrime +# define deflateReset z_deflateReset +# define deflateResetKeep z_deflateResetKeep +# define deflateSetDictionary z_deflateSetDictionary +# define deflateSetHeader z_deflateSetHeader +# define deflateTune z_deflateTune +# define deflate_copyright z_deflate_copyright +# define get_crc_table z_get_crc_table +# ifndef Z_SOLO +# define gz_error z_gz_error +# define gz_intmax z_gz_intmax +# define gz_strwinerror z_gz_strwinerror +# define gzbuffer z_gzbuffer +# define gzclearerr z_gzclearerr +# define gzclose z_gzclose +# define gzclose_r z_gzclose_r +# define gzclose_w z_gzclose_w +# define gzdirect z_gzdirect +# define gzdopen z_gzdopen +# define gzeof z_gzeof +# define gzerror z_gzerror +# define gzflush z_gzflush +# define gzgetc z_gzgetc +# define gzgetc_ z_gzgetc_ +# define gzgets z_gzgets +# define gzoffset z_gzoffset +# define gzoffset64 z_gzoffset64 +# define gzopen z_gzopen +# define gzopen64 z_gzopen64 +# ifdef _WIN32 +# define gzopen_w z_gzopen_w +# endif +# define gzprintf z_gzprintf +# define gzvprintf z_gzvprintf +# define gzputc z_gzputc +# define gzputs z_gzputs +# define gzread z_gzread +# define gzrewind z_gzrewind +# define gzseek z_gzseek +# define gzseek64 z_gzseek64 +# define gzsetparams z_gzsetparams +# define gztell z_gztell +# define gztell64 z_gztell64 +# define gzungetc z_gzungetc +# define gzwrite z_gzwrite +# endif +# define inflate z_inflate +# define inflateBack z_inflateBack +# define inflateBackEnd z_inflateBackEnd +# define inflateBackInit_ z_inflateBackInit_ +# define inflateCopy z_inflateCopy +# define inflateEnd z_inflateEnd +# define inflateGetHeader z_inflateGetHeader +# define inflateInit2_ z_inflateInit2_ +# define inflateInit_ z_inflateInit_ +# define inflateMark z_inflateMark +# define inflatePrime z_inflatePrime +# define inflateReset z_inflateReset +# define inflateReset2 z_inflateReset2 +# define inflateSetDictionary z_inflateSetDictionary +# define inflateGetDictionary z_inflateGetDictionary +# define inflateSync z_inflateSync +# define inflateSyncPoint z_inflateSyncPoint +# define inflateUndermine z_inflateUndermine +# define inflateResetKeep z_inflateResetKeep +# define inflate_copyright z_inflate_copyright +# define inflate_fast z_inflate_fast +# define inflate_table z_inflate_table +# ifndef Z_SOLO +# define uncompress z_uncompress +# endif +# define zError z_zError +# ifndef Z_SOLO +# define zcalloc z_zcalloc +# define zcfree z_zcfree +# endif +# define 
zlibCompileFlags z_zlibCompileFlags +# define zlibVersion z_zlibVersion + +/* all zlib typedefs in zlib.h and zconf.h */ +# define Byte z_Byte +# define Bytef z_Bytef +# define alloc_func z_alloc_func +# define charf z_charf +# define free_func z_free_func +# ifndef Z_SOLO +# define gzFile z_gzFile +# endif +# define gz_header z_gz_header +# define gz_headerp z_gz_headerp +# define in_func z_in_func +# define intf z_intf +# define out_func z_out_func +# define uInt z_uInt +# define uIntf z_uIntf +# define uLong z_uLong +# define uLongf z_uLongf +# define voidp z_voidp +# define voidpc z_voidpc +# define voidpf z_voidpf + +/* all zlib structs in zlib.h and zconf.h */ +# define gz_header_s z_gz_header_s +# define internal_state z_internal_state + +#endif + +#if defined(__MSDOS__) && !defined(MSDOS) +# define MSDOS +#endif +#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2) +# define OS2 +#endif +#if defined(_WINDOWS) && !defined(WINDOWS) +# define WINDOWS +#endif +#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__) +# ifndef WIN32 +# define WIN32 +# endif +#endif +#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32) +# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__) +# ifndef SYS16BIT +# define SYS16BIT +# endif +# endif +#endif + +/* + * Compile with -DMAXSEG_64K if the alloc function cannot allocate more + * than 64k bytes at a time (needed on systems with 16-bit int). + */ +#ifdef SYS16BIT +# define MAXSEG_64K +#endif +#ifdef MSDOS +# define UNALIGNED_OK +#endif + +#ifdef __STDC_VERSION__ +# ifndef STDC +# define STDC +# endif +# if __STDC_VERSION__ >= 199901L +# ifndef STDC99 +# define STDC99 +# endif +# endif +#endif +#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus)) +# define STDC +#endif +#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__)) +# define STDC +#endif +#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32)) +# define STDC +#endif +#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__)) +# define STDC +#endif + +#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). */ +# define STDC +#endif + +#ifndef STDC +# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ +# define const /* note: need a more gentle solution here */ +# endif +#endif + +#if defined(ZLIB_CONST) && !defined(z_const) +# define z_const const +#else +# define z_const +#endif + +/* Some Mac compilers merge all .h files incorrectly: */ +#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__) +# define NO_DUMMY_DECL +#endif + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# ifdef MAXSEG_64K +# define MAX_MEM_LEVEL 8 +# else +# define MAX_MEM_LEVEL 9 +# endif +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2. + * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files + * created by gzip. (Files created by minigzip can still be extracted by + * gzip.) + */ +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + (1 << (windowBits+2)) + (1 << (memLevel+9)) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. For example, if you want to reduce + the default memory requirements from 256K to 128K, compile with + make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" + Of course this will generally degrade compression (there's no free lunch). 
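(Editor's worked example of the formula above, with the default parameters: for windowBits = 15 and memLevel = 8,

    (1L << (15 + 2)) + (1L << (8 + 9)) == 131072 + 131072 == 262144 bytes (~256K)

which is the 256K figure quoted; windowBits = 14 with memLevel = 7 gives 65536 + 65536 == 131072 bytes, the ~128K case.)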
+ + The memory requirements for inflate are (in bytes) 1 << windowBits + that is, 32K for windowBits=15 (default value) plus a few kilobytes + for small objects. +*/ + + /* Type declarations */ + +#ifndef OF /* function prototypes */ +# ifdef STDC +# define OF(args) args +# else +# define OF(args) () +# endif +#endif + +#ifndef Z_ARG /* function prototypes for stdarg */ +# if defined(STDC) || defined(Z_HAVE_STDARG_H) +# define Z_ARG(args) args +# else +# define Z_ARG(args) () +# endif +#endif + +/* The following definitions for FAR are needed only for MSDOS mixed + * model programming (small or medium model with some far allocations). + * This was tested only with MSC; for other MSDOS compilers you may have + * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, + * just define FAR to be empty. + */ +#ifdef SYS16BIT +# if defined(M_I86SM) || defined(M_I86MM) + /* MSC small or medium model */ +# define SMALL_MEDIUM +# ifdef _MSC_VER +# define FAR _far +# else +# define FAR far +# endif +# endif +# if (defined(__SMALL__) || defined(__MEDIUM__)) + /* Turbo C small or medium model */ +# define SMALL_MEDIUM +# ifdef __BORLANDC__ +# define FAR _far +# else +# define FAR far +# endif +# endif +#endif + +#if defined(WINDOWS) || defined(WIN32) + /* If building or using zlib as a DLL, define ZLIB_DLL. + * This is not mandatory, but it offers a little performance increase. + */ +# ifdef ZLIB_DLL +# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500)) +# ifdef ZLIB_INTERNAL +# define ZEXTERN extern __declspec(dllexport) +# else +# define ZEXTERN extern __declspec(dllimport) +# endif +# endif +# endif /* ZLIB_DLL */ + /* If building or using zlib with the WINAPI/WINAPIV calling convention, + * define ZLIB_WINAPI. + * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI. + */ +# ifdef ZLIB_WINAPI +# ifdef FAR +# undef FAR +# endif +# include + /* No need for _export, use ZLIB.DEF instead. */ + /* For complete Windows compatibility, use WINAPI, not __stdcall. 
*/ +# define ZEXPORT WINAPI +# ifdef WIN32 +# define ZEXPORTVA WINAPIV +# else +# define ZEXPORTVA FAR CDECL +# endif +# endif +#endif + +#if defined (__BEOS__) +# ifdef ZLIB_DLL +# ifdef ZLIB_INTERNAL +# define ZEXPORT __declspec(dllexport) +# define ZEXPORTVA __declspec(dllexport) +# else +# define ZEXPORT __declspec(dllimport) +# define ZEXPORTVA __declspec(dllimport) +# endif +# endif +#endif + +#ifndef ZEXTERN +# define ZEXTERN extern +#endif +#ifndef ZEXPORT +# define ZEXPORT +#endif +#ifndef ZEXPORTVA +# define ZEXPORTVA +#endif + +#ifndef FAR +# define FAR +#endif + +#if !defined(__MACTYPES__) +typedef unsigned char Byte; /* 8 bits */ +#endif +typedef unsigned int uInt; /* 16 bits or more */ +typedef unsigned long uLong; /* 32 bits or more */ + +#ifdef SMALL_MEDIUM + /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ +# define Bytef Byte FAR +#else + typedef Byte FAR Bytef; +#endif +typedef char FAR charf; +typedef int FAR intf; +typedef uInt FAR uIntf; +typedef uLong FAR uLongf; + +#ifdef STDC + typedef void const *voidpc; + typedef void FAR *voidpf; + typedef void *voidp; +#else + typedef Byte const *voidpc; + typedef Byte FAR *voidpf; + typedef Byte *voidp; +#endif + +#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC) +# include +# if (UINT_MAX == 0xffffffffUL) +# define Z_U4 unsigned +# elif (ULONG_MAX == 0xffffffffUL) +# define Z_U4 unsigned long +# elif (USHRT_MAX == 0xffffffffUL) +# define Z_U4 unsigned short +# endif +#endif + +#ifdef Z_U4 + typedef Z_U4 z_crc_t; +#else + typedef unsigned long z_crc_t; +#endif + +#ifdef HAVE_UNISTD_H /* may be set to #if 1 by ./configure */ +# define Z_HAVE_UNISTD_H +#endif + +#ifdef HAVE_STDARG_H /* may be set to #if 1 by ./configure */ +# define Z_HAVE_STDARG_H +#endif + +#ifdef STDC +# ifndef Z_SOLO +# include /* for off_t */ +# endif +#endif + +#if defined(STDC) || defined(Z_HAVE_STDARG_H) +# ifndef Z_SOLO +# include /* for va_list */ +# endif +#endif + +#ifdef _WIN32 +# ifndef Z_SOLO +# include /* for wchar_t */ +# endif +#endif + +/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and + * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even + * though the former does not conform to the LFS document), but considering + * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as + * equivalently requesting no 64-bit operations + */ +#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1 +# undef _LARGEFILE64_SOURCE +#endif + +#if defined(__WATCOMC__) && !defined(Z_HAVE_UNISTD_H) +# define Z_HAVE_UNISTD_H +#endif +#ifndef Z_SOLO +# if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE) +# include /* for SEEK_*, off_t, and _LFS64_LARGEFILE */ +# ifdef VMS +# include /* for off_t */ +# endif +# ifndef z_off_t +# define z_off_t off_t +# endif +# endif +#endif + +#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0 +# define Z_LFS64 +#endif + +#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64) +# define Z_LARGE64 +#endif + +#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64) +# define Z_WANT64 +#endif + +#if !defined(SEEK_SET) && !defined(Z_SOLO) +# define SEEK_SET 0 /* Seek from beginning of file. */ +# define SEEK_CUR 1 /* Seek from current position. 
*/ +# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ +#endif + +#ifndef z_off_t +# define z_off_t long +#endif + +#if !defined(_WIN32) && defined(Z_LARGE64) +# define z_off64_t off64_t +#else +# if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO) +# define z_off64_t __int64 +# else +# define z_off64_t z_off_t +# endif +#endif + +/* MVS linker does not support external names larger than 8 bytes */ +#if defined(__MVS__) + #pragma map(deflateInit_,"DEIN") + #pragma map(deflateInit2_,"DEIN2") + #pragma map(deflateEnd,"DEEND") + #pragma map(deflateBound,"DEBND") + #pragma map(inflateInit_,"ININ") + #pragma map(inflateInit2_,"ININ2") + #pragma map(inflateEnd,"INEND") + #pragma map(inflateSync,"INSY") + #pragma map(inflateSetDictionary,"INSEDI") + #pragma map(compressBound,"CMBND") + #pragma map(inflate_table,"INTABL") + #pragma map(inflate_fast,"INFA") + #pragma map(inflate_copyright,"INCOPY") +#endif + +#endif /* ZCONF_H */ diff --git a/compat/curl-for-windows/zlib/zlib.h b/compat/curl-for-windows/zlib/zlib.h new file mode 100644 index 0000000000..3e0c7672ac --- /dev/null +++ b/compat/curl-for-windows/zlib/zlib.h @@ -0,0 +1,1768 @@ +/* zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.8, April 28th, 2013 + + Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files http://tools.ietf.org/html/rfc1950 + (zlib format), rfc1951 (deflate format) and rfc1952 (gzip format). +*/ + +#ifndef ZLIB_H +#define ZLIB_H + +#include "zconf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZLIB_VERSION "1.2.8" +#define ZLIB_VERNUM 0x1280 +#define ZLIB_VER_MAJOR 1 +#define ZLIB_VER_MINOR 2 +#define ZLIB_VER_REVISION 8 +#define ZLIB_VER_SUBREVISION 0 + +/* + The 'zlib' compression library provides in-memory compression and + decompression functions, including integrity checks of the uncompressed data. + This version of the library supports only one compression method (deflation) + but other algorithms will be added later and will have the same stream + interface. + + Compression can be done in a single step if the buffers are large enough, + or can be done by repeated calls of the compression function. In the latter + case, the application must provide more input and/or consume the output + (providing more output space) before each call. 
+ + The compressed data format used by default by the in-memory functions is + the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped + around a deflate stream, which is itself documented in RFC 1951. + + The library also supports reading and writing files in gzip (.gz) format + with an interface similar to that of stdio using the functions that start + with "gz". The gzip format is different from the zlib format. gzip is a + gzip wrapper, documented in RFC 1952, wrapped around a deflate stream. + + This library can optionally read and write gzip streams in memory as well. + + The zlib format was designed to be compact and fast for use in memory + and on communications channels. The gzip format was designed for single- + file compression on file systems, has a larger header than zlib to maintain + directory information, and uses a different, slower check method than zlib. + + The library does not install any signal handler. The decoder checks + the consistency of the compressed data, so the library should never crash + even in case of corrupted input. +*/ + +typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); +typedef void (*free_func) OF((voidpf opaque, voidpf address)); + +struct internal_state; + +typedef struct z_stream_s { + z_const Bytef *next_in; /* next input byte */ + uInt avail_in; /* number of bytes available at next_in */ + uLong total_in; /* total number of input bytes read so far */ + + Bytef *next_out; /* next output byte should be put there */ + uInt avail_out; /* remaining free space at next_out */ + uLong total_out; /* total number of bytes output so far */ + + z_const char *msg; /* last error message, NULL if no error */ + struct internal_state FAR *state; /* not visible by applications */ + + alloc_func zalloc; /* used to allocate the internal state */ + free_func zfree; /* used to free the internal state */ + voidpf opaque; /* private data object passed to zalloc and zfree */ + + int data_type; /* best guess about the data type: binary or text */ + uLong adler; /* adler32 value of the uncompressed data */ + uLong reserved; /* reserved for future use */ +} z_stream; + +typedef z_stream FAR *z_streamp; + +/* + gzip header information passed to and from zlib routines. See RFC 1952 + for more details on the meanings of these fields. +*/ +typedef struct gz_header_s { + int text; /* true if compressed data believed to be text */ + uLong time; /* modification time */ + int xflags; /* extra flags (not used when writing a gzip file) */ + int os; /* operating system */ + Bytef *extra; /* pointer to extra field or Z_NULL if none */ + uInt extra_len; /* extra field length (valid if extra != Z_NULL) */ + uInt extra_max; /* space at extra (only when reading header) */ + Bytef *name; /* pointer to zero-terminated file name or Z_NULL */ + uInt name_max; /* space at name (only when reading header) */ + Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */ + uInt comm_max; /* space at comment (only when reading header) */ + int hcrc; /* true if there was or will be a header crc */ + int done; /* true when done reading gzip header (not used + when writing a gzip file) */ +} gz_header; + +typedef gz_header FAR *gz_headerp; + +/* + The application must update next_in and avail_in when avail_in has dropped + to zero. It must update next_out and avail_out when avail_out has dropped + to zero. The application must initialize zalloc, zfree and opaque before + calling the init function. 
All other fields are set by the compression + library and must not be updated by the application. + + The opaque value provided by the application will be passed as the first + parameter for calls of zalloc and zfree. This can be useful for custom + memory management. The compression library attaches no meaning to the + opaque value. + + zalloc must return Z_NULL if there is not enough memory for the object. + If zlib is used in a multi-threaded application, zalloc and zfree must be + thread safe. + + On 16-bit systems, the functions zalloc and zfree must be able to allocate + exactly 65536 bytes, but will not be required to allocate more than this if + the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS, pointers + returned by zalloc for objects of exactly 65536 bytes *must* have their + offset normalized to zero. The default allocation function provided by this + library ensures this (see zutil.c). To reduce memory requirements and avoid + any allocation of 64K objects, at the expense of compression ratio, compile + the library with -DMAX_WBITS=14 (see zconf.h). + + The fields total_in and total_out can be used for statistics or progress + reports. After compression, total_in holds the total size of the + uncompressed data and may be saved for use in the decompressor (particularly + if the decompressor wants to decompress everything in a single step). +*/ + + /* constants */ + +#define Z_NO_FLUSH 0 +#define Z_PARTIAL_FLUSH 1 +#define Z_SYNC_FLUSH 2 +#define Z_FULL_FLUSH 3 +#define Z_FINISH 4 +#define Z_BLOCK 5 +#define Z_TREES 6 +/* Allowed flush values; see deflate() and inflate() below for details */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) +/* Return codes for the compression/decompression functions. Negative values + * are errors, positive values are used for special but normal events. + */ + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) +/* compression levels */ + +#define Z_FILTERED 1 +#define Z_HUFFMAN_ONLY 2 +#define Z_RLE 3 +#define Z_FIXED 4 +#define Z_DEFAULT_STRATEGY 0 +/* compression strategy; see deflateInit2() below for details */ + +#define Z_BINARY 0 +#define Z_TEXT 1 +#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */ +#define Z_UNKNOWN 2 +/* Possible values of the data_type field (though see inflate()) */ + +#define Z_DEFLATED 8 +/* The deflate compression method (the only one supported in this version) */ + +#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ + +#define zlib_version zlibVersion() +/* for compatibility with versions < 1.0.2 */ + + + /* basic functions */ + +ZEXTERN const char * ZEXPORT zlibVersion OF((void)); +/* The application can compare zlibVersion and ZLIB_VERSION for consistency. + If the first character differs, the library code actually used is not + compatible with the zlib.h header file used by the application. This check + is automatically made by deflateInit and inflateInit. + */ + +/* +ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); + + Initializes the internal stream state for compression. The fields + zalloc, zfree and opaque must be initialized before by the caller. If + zalloc and zfree are set to Z_NULL, deflateInit updates them to use default + allocation functions. 
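Editor's sketch of the initialization contract just described (not part of the vendored header; error handling elided, and the possible return codes are covered as this paragraph continues below):

    z_stream strm;
    strm.zalloc = Z_NULL;   /* deflateInit will install the default allocator */
    strm.zfree  = Z_NULL;
    strm.opaque = Z_NULL;
    int ret = deflateInit(&strm, Z_DEFAULT_COMPRESSION);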
+ + The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: + 1 gives best speed, 9 gives best compression, 0 gives no compression at all + (the input data is simply copied a block at a time). Z_DEFAULT_COMPRESSION + requests a default compromise between speed and compression (currently + equivalent to level 6). + + deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if level is not a valid compression level, or + Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible + with the version assumed by the caller (ZLIB_VERSION). msg is set to null + if there is no error message. deflateInit does not perform any compression: + this will be done by deflate(). +*/ + + +ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush)); +/* + deflate compresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. deflate performs one or both of the + following actions: + + - Compress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in and avail_in are updated and + processing will resume at this point for the next call of deflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. This action is forced if the parameter flush is non zero. + Forcing flush frequently degrades the compression ratio, so this parameter + should be set only when necessary (in interactive applications). Some + output may be provided even if flush is not set. + + Before the call of deflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming more + output, and updating avail_in or avail_out accordingly; avail_out should + never be zero before the call. The application can consume the compressed + output when it wants, for example when the output buffer is full (avail_out + == 0), or after each call of deflate(). If deflate returns Z_OK and with + zero avail_out, it must be called again after making room in the output + buffer because there might be more output pending. + + Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to + decide how much data to accumulate before producing output, in order to + maximize compression. + + If the parameter flush is set to Z_SYNC_FLUSH, all pending output is + flushed to the output buffer and the output is aligned on a byte boundary, so + that the decompressor can get all input data available so far. (In + particular avail_in is zero after the call if enough output space has been + provided before the call.) Flushing may degrade compression for some + compression algorithms and so it should be used only when necessary. This + completes the current deflate block and follows it with an empty stored block + that is three bits plus filler bits to the next byte, followed by four bytes + (00 00 ff ff). + + If flush is set to Z_PARTIAL_FLUSH, all pending output is flushed to the + output buffer, but the output is not aligned to a byte boundary. All of the + input data so far will be available to the decompressor, as for Z_SYNC_FLUSH. + This completes the current deflate block and follows it with an empty fixed + codes block that is 10 bits long. 
This assures that enough bytes are output + in order for the decompressor to finish the block before the empty fixed code + block. + + If flush is set to Z_BLOCK, a deflate block is completed and emitted, as + for Z_SYNC_FLUSH, but the output is not aligned on a byte boundary, and up to + seven bits of the current block are held to be written as the next byte after + the next deflate block is completed. In this case, the decompressor may not + be provided enough bits at this point in order to complete decompression of + the data provided so far to the compressor. It may need to wait for the next + block to be emitted. This is for advanced applications that need to control + the emission of deflate blocks. + + If flush is set to Z_FULL_FLUSH, all output is flushed as with + Z_SYNC_FLUSH, and the compression state is reset so that decompression can + restart from this point if previous compressed data has been damaged or if + random access is desired. Using Z_FULL_FLUSH too often can seriously degrade + compression. + + If deflate returns with avail_out == 0, this function must be called again + with the same value of the flush parameter and more output space (updated + avail_out), until the flush is complete (deflate returns with non-zero + avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that + avail_out is greater than six to avoid repeated flush markers due to + avail_out == 0 on return. + + If the parameter flush is set to Z_FINISH, pending input is processed, + pending output is flushed and deflate returns with Z_STREAM_END if there was + enough output space; if deflate returns with Z_OK, this function must be + called again with Z_FINISH and more output space (updated avail_out) but no + more input data, until it returns with Z_STREAM_END or an error. After + deflate has returned Z_STREAM_END, the only possible operations on the stream + are deflateReset or deflateEnd. + + Z_FINISH can be used immediately after deflateInit if all the compression + is to be done in a single step. In this case, avail_out must be at least the + value returned by deflateBound (see below). Then deflate is guaranteed to + return Z_STREAM_END. If not enough output space is provided, deflate will + not return Z_STREAM_END, and it must be called again as described above. + + deflate() sets strm->adler to the adler32 checksum of all input read + so far (that is, total_in bytes). + + deflate() may update strm->data_type if it can make a good guess about + the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered + binary. This field is only for information purposes and does not affect the + compression algorithm in any manner. + + deflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if all input has been + consumed and all output has been produced (only when flush is set to + Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example + if next_in or next_out was Z_NULL), Z_BUF_ERROR if no progress is possible + (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not + fatal, and deflate() can be called again with more input and more output + space to continue compressing. +*/ + + +ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any pending + output. 
+ + deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the + stream state was inconsistent, Z_DATA_ERROR if the stream was freed + prematurely (some input or output was discarded). In the error case, msg + may be set but then points to a static string (which must not be + deallocated). +*/ + + +/* +ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); + + Initializes the internal stream state for decompression. The fields + next_in, avail_in, zalloc, zfree and opaque must be initialized before by + the caller. If next_in is not Z_NULL and avail_in is large enough (the + exact value depends on the compression method), inflateInit determines the + compression method from the zlib header and allocates all data structures + accordingly; otherwise the allocation will be deferred to the first call of + inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to + use default allocation functions. + + inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller, or Z_STREAM_ERROR if the parameters are + invalid, such as a null pointer to the structure. msg is set to null if + there is no error message. inflateInit does not perform any decompression + apart from possibly reading the zlib header if present: actual decompression + will be done by inflate(). (So next_in and avail_in may be modified, but + next_out and avail_out are unused and unchanged.) The current implementation + of inflateInit() does not process any header information -- that is deferred + until inflate() is called. +*/ + + +ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush)); +/* + inflate decompresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. inflate performs one or both of the + following actions: + + - Decompress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in is updated and processing will + resume at this point for the next call of inflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. inflate() provides as much output as possible, until there is + no more input data or no more space in the output buffer (see below about + the flush parameter). + + Before the call of inflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming more + output, and updating the next_* and avail_* values accordingly. The + application can consume the uncompressed output when it wants, for example + when the output buffer is full (avail_out == 0), or after each call of + inflate(). If inflate returns Z_OK and with zero avail_out, it must be + called again after making room in the output buffer because there might be + more output pending. + + The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, Z_FINISH, + Z_BLOCK, or Z_TREES. Z_SYNC_FLUSH requests that inflate() flush as much + output as possible to the output buffer. Z_BLOCK requests that inflate() + stop if and when it gets to the next deflate block boundary. 
When decoding + the zlib or gzip format, this will cause inflate() to return immediately + after the header and before the first block. When doing a raw inflate, + inflate() will go ahead and process the first block, and will return when it + gets to the end of that block, or when it runs out of data. + + The Z_BLOCK option assists in appending to or combining deflate streams. + Also to assist in this, on return inflate() will set strm->data_type to the + number of unused bits in the last byte taken from strm->next_in, plus 64 if + inflate() is currently decoding the last block in the deflate stream, plus + 128 if inflate() returned immediately after decoding an end-of-block code or + decoding the complete header up to just before the first byte of the deflate + stream. The end-of-block will not be indicated until all of the uncompressed + data from that block has been written to strm->next_out. The number of + unused bits may in general be greater than seven, except when bit 7 of + data_type is set, in which case the number of unused bits will be less than + eight. data_type is set as noted here every time inflate() returns for all + flush options, and so can be used to determine the amount of currently + consumed input in bits. + + The Z_TREES option behaves as Z_BLOCK does, but it also returns when the + end of each deflate block header is reached, before any actual data in that + block is decoded. This allows the caller to determine the length of the + deflate block header for later use in random access within a deflate block. + 256 is added to the value of strm->data_type when inflate() returns + immediately after reaching the end of the deflate block header. + + inflate() should normally be called until it returns Z_STREAM_END or an + error. However if all decompression is to be performed in a single step (a + single call of inflate), the parameter flush should be set to Z_FINISH. In + this case all pending input is processed and all pending output is flushed; + avail_out must be large enough to hold all of the uncompressed data for the + operation to complete. (The size of the uncompressed data may have been + saved by the compressor for this purpose.) The use of Z_FINISH is not + required to perform an inflation in one step. However it may be used to + inform inflate that a faster approach can be used for the single inflate() + call. Z_FINISH also informs inflate to not maintain a sliding window if the + stream completes, which reduces inflate's memory footprint. If the stream + does not complete, either because not all of the stream is provided or not + enough output space is provided, then a sliding window will be allocated and + inflate() can be called again to continue the operation as if Z_NO_FLUSH had + been used. + + In this implementation, inflate() always flushes as much output as + possible to the output buffer, and always uses the faster approach on the + first call. So the effects of the flush parameter in this implementation are + on the return value of inflate() as noted below, when inflate() returns early + when Z_BLOCK or Z_TREES is used, and when inflate() avoids the allocation of + memory for a sliding window when Z_FINISH is used. 
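+
+     For illustration only (a rough sketch, not normative zlib text), a
+     typical decompression loop feeds input and drains output until
+     Z_STREAM_END is returned; in, out, CHUNK, read_input() and
+     write_output() are placeholders, not zlib identifiers:
+
+       z_stream s;
+       memset(&s, 0, sizeof(s));          // zalloc/zfree/opaque = Z_NULL
+       if (inflateInit(&s) != Z_OK) return -1;
+       int ret = Z_OK;
+       while (ret != Z_STREAM_END) {
+           if (s.avail_in == 0) {         // refill input when exhausted
+               s.avail_in = read_input(in, CHUNK);
+               s.next_in = in;
+               if (s.avail_in == 0) break;      // premature end of input
+           }
+           s.next_out = out;  s.avail_out = CHUNK;
+           ret = inflate(&s, Z_NO_FLUSH);
+           if (ret != Z_OK && ret != Z_STREAM_END) break;   // error
+           write_output(out, CHUNK - s.avail_out);
+       }
+       inflateEnd(&s);
+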
+ + If a preset dictionary is needed after this call (see inflateSetDictionary + below), inflate sets strm->adler to the Adler-32 checksum of the dictionary + chosen by the compressor and returns Z_NEED_DICT; otherwise it sets + strm->adler to the Adler-32 checksum of all output produced so far (that is, + total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described + below. At the end of the stream, inflate() checks that its computed adler32 + checksum is equal to that saved by the compressor and returns Z_STREAM_END + only if the checksum is correct. + + inflate() can decompress and check either zlib-wrapped or gzip-wrapped + deflate data. The header type is detected automatically, if requested when + initializing with inflateInit2(). Any information contained in the gzip + header is not retained, so applications that need that information should + instead use raw inflate, see inflateInit2() below, or inflateBack() and + perform their own processing of the gzip header and trailer. When processing + gzip-wrapped deflate data, strm->adler32 is set to the CRC-32 of the output + producted so far. The CRC-32 is checked against the gzip trailer. + + inflate() returns Z_OK if some progress has been made (more input processed + or more output produced), Z_STREAM_END if the end of the compressed data has + been reached and all uncompressed output has been produced, Z_NEED_DICT if a + preset dictionary is needed at this point, Z_DATA_ERROR if the input data was + corrupted (input stream not conforming to the zlib format or incorrect check + value), Z_STREAM_ERROR if the stream structure was inconsistent (for example + next_in or next_out was Z_NULL), Z_MEM_ERROR if there was not enough memory, + Z_BUF_ERROR if no progress is possible or if there was not enough room in the + output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and + inflate() can be called again with more input and more output space to + continue decompressing. If Z_DATA_ERROR is returned, the application may + then call inflateSync() to look for a good compression block if a partial + recovery of the data is desired. +*/ + + +ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any pending + output. + + inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state + was inconsistent. In the error case, msg may be set but then points to a + static string (which must not be deallocated). +*/ + + + /* Advanced functions */ + +/* + The following functions are needed only in some special applications. +*/ + +/* +ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm, + int level, + int method, + int windowBits, + int memLevel, + int strategy)); + + This is another version of deflateInit with more compression options. The + fields next_in, zalloc, zfree and opaque must be initialized before by the + caller. + + The method parameter is the compression method. It must be Z_DEFLATED in + this version of the library. + + The windowBits parameter is the base two logarithm of the window size + (the size of the history buffer). It should be in the range 8..15 for this + version of the library. Larger values of this parameter result in better + compression at the expense of memory usage. The default value is 15 if + deflateInit is used instead. + + windowBits can also be -8..-15 for raw deflate. In this case, -windowBits + determines the window size. 
deflate() will then generate raw deflate data + with no zlib header or trailer, and will not compute an adler32 check value. + + windowBits can also be greater than 15 for optional gzip encoding. Add + 16 to windowBits to write a simple gzip header and trailer around the + compressed data instead of a zlib wrapper. The gzip header will have no + file name, no extra data, no comment, no modification time (set to zero), no + header crc, and the operating system will be set to 255 (unknown). If a + gzip stream is being written, strm->adler is a crc32 instead of an adler32. + + The memLevel parameter specifies how much memory should be allocated + for the internal compression state. memLevel=1 uses minimum memory but is + slow and reduces compression ratio; memLevel=9 uses maximum memory for + optimal speed. The default value is 8. See zconf.h for total memory usage + as a function of windowBits and memLevel. + + The strategy parameter is used to tune the compression algorithm. Use the + value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a + filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no + string match), or Z_RLE to limit match distances to one (run-length + encoding). Filtered data consists mostly of small values with a somewhat + random distribution. In this case, the compression algorithm is tuned to + compress them better. The effect of Z_FILTERED is to force more Huffman + coding and less string matching; it is somewhat intermediate between + Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as + fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data. The + strategy parameter only affects the compression ratio but not the + correctness of the compressed output even if it is not set appropriately. + Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler + decoder for special applications. + + deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if any parameter is invalid (such as an invalid + method), or Z_VERSION_ERROR if the zlib library version (zlib_version) is + incompatible with the version assumed by the caller (ZLIB_VERSION). msg is + set to null if there is no error message. deflateInit2 does not perform any + compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the compression dictionary from the given byte sequence + without producing any compressed output. When using the zlib format, this + function must be called immediately after deflateInit, deflateInit2 or + deflateReset, and before any call of deflate. When doing raw deflate, this + function must be called either before any call of deflate, or immediately + after the completion of a deflate block, i.e. after all input has been + consumed and all output has been delivered when using any of the flush + options Z_BLOCK, Z_PARTIAL_FLUSH, Z_SYNC_FLUSH, or Z_FULL_FLUSH. The + compressor and decompressor must use exactly the same dictionary (see + inflateSetDictionary). + + The dictionary should consist of strings (byte sequences) that are likely + to be encountered later in the data to be compressed, with the most commonly + used strings preferably put towards the end of the dictionary. 
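+
+     A brief sketch (illustrative only; dict and s are example names, not
+     zlib identifiers):
+
+       static const Bytef dict[] = "strings expected to recur in the data";
+       deflateInit(&s, Z_BEST_COMPRESSION);
+       deflateSetDictionary(&s, dict, sizeof(dict) - 1);
+       // deflate() is then called as usual; the decompressor must load
+       // the same dictionary when inflate() returns Z_NEED_DICT.
+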
Using a + dictionary is most useful when the data to be compressed is short and can be + predicted with good accuracy; the data can then be compressed better than + with the default empty dictionary. + + Depending on the size of the compression data structures selected by + deflateInit or deflateInit2, a part of the dictionary may in effect be + discarded, for example if the dictionary is larger than the window size + provided in deflateInit or deflateInit2. Thus the strings most likely to be + useful should be put at the end of the dictionary, not at the front. In + addition, the current implementation of deflate will use at most the window + size minus 262 bytes of the provided dictionary. + + Upon return of this function, strm->adler is set to the adler32 value + of the dictionary; the decompressor may later use this value to determine + which dictionary has been used by the compressor. (The adler32 value + applies to the whole dictionary even if only a subset of the dictionary is + actually used by the compressor.) If a raw deflate was requested, then the + adler32 value is not computed and strm->adler is not set. + + deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a + parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is + inconsistent (for example if deflate has already been called for this stream + or if not at a block boundary for raw deflate). deflateSetDictionary does + not perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when several compression strategies will be + tried, for example when there are several ways of pre-processing the input + data with a filter. The streams that will be discarded should then be freed + by calling deflateEnd. Note that deflateCopy duplicates the internal + compression state which can be quite large, so this strategy is slow and can + consume lots of memory. + + deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being Z_NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm)); +/* + This function is equivalent to deflateEnd followed by deflateInit, + but does not free and reallocate all the internal compression state. The + stream will keep the same compression level and any other attributes that + may have been set by deflateInit2. + + deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being Z_NULL). +*/ + +ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm, + int level, + int strategy)); +/* + Dynamically update the compression level and compression strategy. The + interpretation of level and strategy is as in deflateInit2. This can be + used to switch between compression and straight copy of the input data, or + to switch to a different kind of input data requiring a different strategy. + If the compression level is changed, the input available so far is + compressed with the old level (and may be flushed); the new level will take + effect only at the next call of deflate(). + + Before the call of deflateParams, the stream state must be set as for + a call of deflate(), since the currently available input may have to be + compressed and flushed. 
In particular, strm->avail_out must be non-zero. + + deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source + stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR if + strm->avail_out was zero. +*/ + +ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm, + int good_length, + int max_lazy, + int nice_length, + int max_chain)); +/* + Fine tune deflate's internal compression parameters. This should only be + used by someone who understands the algorithm used by zlib's deflate for + searching for the best matching string, and even then only by the most + fanatic optimizer trying to squeeze out the last compressed bit for their + specific input data. Read the deflate.c source code for the meaning of the + max_lazy, good_length, nice_length, and max_chain parameters. + + deflateTune() can be called after deflateInit() or deflateInit2(), and + returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream. + */ + +ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm, + uLong sourceLen)); +/* + deflateBound() returns an upper bound on the compressed size after + deflation of sourceLen bytes. It must be called after deflateInit() or + deflateInit2(), and after deflateSetHeader(), if used. This would be used + to allocate an output buffer for deflation in a single pass, and so would be + called before deflate(). If that first deflate() call is provided the + sourceLen input bytes, an output buffer allocated to the size returned by + deflateBound(), and the flush value Z_FINISH, then deflate() is guaranteed + to return Z_STREAM_END. Note that it is possible for the compressed size to + be larger than the value returned by deflateBound() if flush options other + than Z_FINISH or Z_NO_FLUSH are used. +*/ + +ZEXTERN int ZEXPORT deflatePending OF((z_streamp strm, + unsigned *pending, + int *bits)); +/* + deflatePending() returns the number of bytes and bits of output that have + been generated, but not yet provided in the available output. The bytes not + provided would be due to the available output space having being consumed. + The number of bits of output not provided are between 0 and 7, where they + await more bits to join them in order to fill out a full byte. If pending + or bits are Z_NULL, then those values are not set. + + deflatePending returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. + */ + +ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + deflatePrime() inserts bits in the deflate output stream. The intent + is that this function is used to start off the deflate output with the bits + leftover from a previous deflate stream when appending to it. As such, this + function can only be used for raw deflate, and must be used before the first + deflate() call after a deflateInit2() or deflateReset(). bits must be less + than or equal to 16, and that many of the least significant bits of value + will be inserted in the output. + + deflatePrime returns Z_OK if success, Z_BUF_ERROR if there was not enough + room in the internal buffer to insert the bits, or Z_STREAM_ERROR if the + source stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm, + gz_headerp head)); +/* + deflateSetHeader() provides gzip header information for when a gzip + stream is requested by deflateInit2(). deflateSetHeader() may be called + after deflateInit2() or deflateReset() and before the first call of + deflate(). 
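+
+     For example (an illustrative sketch only; it assumes the stream was
+     opened for gzip output with deflateInit2() and windowBits + 16, and
+     s is an example stream name):
+
+       gz_header head;
+       memset(&head, 0, sizeof(head));    // extra, name, comment = Z_NULL
+       head.text = 1;                     // content is believed to be text
+       head.time = (uLong)time(NULL);     // modification time, <time.h>
+       head.os   = 3;                     // 3 = Unix, per RFC 1952
+       head.name = (Bytef *)"data.txt";   // zero-terminated, as required
+       deflateSetHeader(&s, &head);       // before the first deflate() call
+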
The text, time, os, extra field, name, and comment information + in the provided gz_header structure are written to the gzip header (xflag is + ignored -- the extra flags are set according to the compression level). The + caller must assure that, if not Z_NULL, name and comment are terminated with + a zero byte, and that if extra is not Z_NULL, that extra_len bytes are + available there. If hcrc is true, a gzip header crc is included. Note that + the current versions of the command-line version of gzip (up through version + 1.3.x) do not support header crc's, and will report that it is a "multi-part + gzip file" and give up. + + If deflateSetHeader is not used, the default gzip header has text false, + the time set to zero, and os set to 255, with no extra, name, or comment + fields. The gzip header is returned to the default state by deflateReset(). + + deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm, + int windowBits)); + + This is another version of inflateInit with an extra parameter. The + fields next_in, avail_in, zalloc, zfree and opaque must be initialized + before by the caller. + + The windowBits parameter is the base two logarithm of the maximum window + size (the size of the history buffer). It should be in the range 8..15 for + this version of the library. The default value is 15 if inflateInit is used + instead. windowBits must be greater than or equal to the windowBits value + provided to deflateInit2() while compressing, or it must be equal to 15 if + deflateInit2() was not used. If a compressed stream with a larger window + size is given as input, inflate() will return with the error code + Z_DATA_ERROR instead of trying to allocate a larger window. + + windowBits can also be zero to request that inflate use the window size in + the zlib header of the compressed stream. + + windowBits can also be -8..-15 for raw inflate. In this case, -windowBits + determines the window size. inflate() will then process raw deflate data, + not looking for a zlib or gzip header, not generating a check value, and not + looking for any check values for comparison at the end of the stream. This + is for use with other formats that use the deflate compressed data format + such as zip. Those formats provide their own check values. If a custom + format is developed using the raw deflate format for compressed data, it is + recommended that a check value such as an adler32 or a crc32 be applied to + the uncompressed data as is done in the zlib, gzip, and zip formats. For + most applications, the zlib format should be used as is. Note that comments + above on the use in deflateInit2() applies to the magnitude of windowBits. + + windowBits can also be greater than 15 for optional gzip decoding. Add + 32 to windowBits to enable zlib and gzip decoding with automatic header + detection, or add 16 to decode only the gzip format (the zlib format will + return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is a + crc32 instead of an adler32. + + inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller, or Z_STREAM_ERROR if the parameters are + invalid, such as a null pointer to the structure. msg is set to null if + there is no error message. 
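+
+     As a sketch of the windowBits conventions described above
+     (illustrative calls only; s is an example stream name):
+
+       inflateInit2(&s, 15);         // zlib wrapper, 32K window
+       inflateInit2(&s, -15);        // raw deflate, no header or check value
+       inflateInit2(&s, 15 + 16);    // gzip wrapper only
+       inflateInit2(&s, 15 + 32);    // automatic zlib or gzip detection
+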
inflateInit2 does not perform any decompression + apart from possibly reading the zlib header if present: actual decompression + will be done by inflate(). (So next_in and avail_in may be modified, but + next_out and avail_out are unused and unchanged.) The current implementation + of inflateInit2() does not process any header information -- that is + deferred until inflate() is called. +*/ + +ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the decompression dictionary from the given uncompressed byte + sequence. This function must be called immediately after a call of inflate, + if that call returned Z_NEED_DICT. The dictionary chosen by the compressor + can be determined from the adler32 value returned by that call of inflate. + The compressor and decompressor must use exactly the same dictionary (see + deflateSetDictionary). For raw inflate, this function can be called at any + time to set the dictionary. If the provided dictionary is smaller than the + window and there is already data in the window, then the provided dictionary + will amend what's there. The application must insure that the dictionary + that was used for compression is provided. + + inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a + parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is + inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the + expected one (incorrect adler32 value). inflateSetDictionary does not + perform any decompression: this will be done by subsequent calls of + inflate(). +*/ + +ZEXTERN int ZEXPORT inflateGetDictionary OF((z_streamp strm, + Bytef *dictionary, + uInt *dictLength)); +/* + Returns the sliding dictionary being maintained by inflate. dictLength is + set to the number of bytes in the dictionary, and that many bytes are copied + to dictionary. dictionary must have enough space, where 32768 bytes is + always enough. If inflateGetDictionary() is called with dictionary equal to + Z_NULL, then only the dictionary length is returned, and nothing is copied. + Similary, if dictLength is Z_NULL, then it is not set. + + inflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the + stream state is inconsistent. +*/ + +ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm)); +/* + Skips invalid compressed data until a possible full flush point (see above + for the description of deflate with Z_FULL_FLUSH) can be found, or until all + available input is skipped. No output is provided. + + inflateSync searches for a 00 00 FF FF pattern in the compressed data. + All full flush points have this pattern, but not all occurrences of this + pattern are full flush points. + + inflateSync returns Z_OK if a possible full flush point has been found, + Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point + has been found, or Z_STREAM_ERROR if the stream structure was inconsistent. + In the success case, the application may save the current current value of + total_in which indicates where valid compressed data was found. In the + error case, the application may repeatedly call inflateSync, providing more + input each time, until success or end of the input data. +*/ + +ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when randomly accessing a large stream. 
The + first pass through the stream can periodically record the inflate state, + allowing restarting inflate at those points when randomly accessing the + stream. + + inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being Z_NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm)); +/* + This function is equivalent to inflateEnd followed by inflateInit, + but does not free and reallocate all the internal decompression state. The + stream will keep attributes that may have been set by inflateInit2. + + inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being Z_NULL). +*/ + +ZEXTERN int ZEXPORT inflateReset2 OF((z_streamp strm, + int windowBits)); +/* + This function is the same as inflateReset, but it also permits changing + the wrap and window size requests. The windowBits parameter is interpreted + the same as it is for inflateInit2. + + inflateReset2 returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being Z_NULL), or if + the windowBits parameter is invalid. +*/ + +ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + This function inserts bits in the inflate input stream. The intent is + that this function is used to start inflating at a bit position in the + middle of a byte. The provided bits will be used before any bytes are used + from next_in. This function should only be used with raw inflate, and + should be used before the first inflate() call after inflateInit2() or + inflateReset(). bits must be less than or equal to 16, and that many of the + least significant bits of value will be inserted in the input. + + If bits is negative, then the input stream bit buffer is emptied. Then + inflatePrime() can be called again to put bits in the buffer. This is used + to clear out bits leftover after feeding inflate a block description prior + to feeding inflate codes. + + inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN long ZEXPORT inflateMark OF((z_streamp strm)); +/* + This function returns two values, one in the lower 16 bits of the return + value, and the other in the remaining upper bits, obtained by shifting the + return value down 16 bits. If the upper value is -1 and the lower value is + zero, then inflate() is currently decoding information outside of a block. + If the upper value is -1 and the lower value is non-zero, then inflate is in + the middle of a stored block, with the lower value equaling the number of + bytes from the input remaining to copy. If the upper value is not -1, then + it is the number of bits back from the current bit position in the input of + the code (literal or length/distance pair) currently being processed. In + that case the lower value is the number of bytes already emitted for that + code. + + A code is being processed if inflate is waiting for more input to complete + decoding of the code, or if it has completed decoding but is waiting for + more output space to write the literal or match data. + + inflateMark() is used to mark locations in the input data for random + access, which may be at bit positions, and to note those cases where the + output of a code may span boundaries of random access blocks. 
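+
+     For instance, the two values can be separated as follows (an
+     illustrative decoding of the return value; s is an example name):
+
+       long mark  = inflateMark(&s);
+       long upper = mark >> 16;           // -1, or bits back in the input
+       long lower = mark & 0xffff;        // stored bytes left to copy, or
+                                          // bytes already emitted for the code
+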
The current + location in the input stream can be determined from avail_in and data_type + as noted in the description for the Z_BLOCK flush parameter for inflate. + + inflateMark returns the value noted above or -1 << 16 if the provided + source stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm, + gz_headerp head)); +/* + inflateGetHeader() requests that gzip header information be stored in the + provided gz_header structure. inflateGetHeader() may be called after + inflateInit2() or inflateReset(), and before the first call of inflate(). + As inflate() processes the gzip stream, head->done is zero until the header + is completed, at which time head->done is set to one. If a zlib stream is + being decoded, then head->done is set to -1 to indicate that there will be + no gzip header information forthcoming. Note that Z_BLOCK or Z_TREES can be + used to force inflate() to return immediately after header processing is + complete and before any actual data is decompressed. + + The text, time, xflags, and os fields are filled in with the gzip header + contents. hcrc is set to true if there is a header CRC. (The header CRC + was valid if done is set to one.) If extra is not Z_NULL, then extra_max + contains the maximum number of bytes to write to extra. Once done is true, + extra_len contains the actual extra field length, and extra contains the + extra field, or that field truncated if extra_max is less than extra_len. + If name is not Z_NULL, then up to name_max characters are written there, + terminated with a zero unless the length is greater than name_max. If + comment is not Z_NULL, then up to comm_max characters are written there, + terminated with a zero unless the length is greater than comm_max. When any + of extra, name, or comment are not Z_NULL and the respective field is not + present in the header, then that field is set to Z_NULL to signal its + absence. This allows the use of deflateSetHeader() with the returned + structure to duplicate the header. However if those fields are set to + allocated memory, then the application will need to save those pointers + elsewhere so that they can be eventually freed. + + If inflateGetHeader is not used, then the header information is simply + discarded. The header is always checked for validity, including the header + CRC if present. inflateReset() will reset the process to discard the header + information. The application would need to call inflateGetHeader() again to + retrieve the header from the next gzip stream. + + inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits, + unsigned char FAR *window)); + + Initialize the internal stream state for decompression using inflateBack() + calls. The fields zalloc, zfree and opaque in strm must be initialized + before the call. If zalloc and zfree are Z_NULL, then the default library- + derived memory allocation routines are used. windowBits is the base two + logarithm of the window size, in the range 8..15. window is a caller + supplied buffer of that size. Except for special applications where it is + assured that deflate was used with small window sizes, windowBits must be 15 + and a 32K byte window must be supplied to be able to decompress general + deflate streams. + + See inflateBack() for the usage of these routines. 
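+
+     A minimal setup sketch (illustrative only; window and s are example
+     names):
+
+       unsigned char window[32768];       // 32K window for windowBits = 15
+       z_stream s;
+       memset(&s, 0, sizeof(s));          // zalloc/zfree/opaque = Z_NULL
+       inflateBackInit(&s, 15, window);
+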
+ + inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of + the parameters are invalid, Z_MEM_ERROR if the internal state could not be + allocated, or Z_VERSION_ERROR if the version of the library does not match + the version of the header file. +*/ + +typedef unsigned (*in_func) OF((void FAR *, + z_const unsigned char FAR * FAR *)); +typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned)); + +ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm, + in_func in, void FAR *in_desc, + out_func out, void FAR *out_desc)); +/* + inflateBack() does a raw inflate with a single call using a call-back + interface for input and output. This is potentially more efficient than + inflate() for file i/o applications, in that it avoids copying between the + output and the sliding window by simply making the window itself the output + buffer. inflate() can be faster on modern CPUs when used with large + buffers. inflateBack() trusts the application to not change the output + buffer passed by the output function, at least until inflateBack() returns. + + inflateBackInit() must be called first to allocate the internal state + and to initialize the state with the user-provided window buffer. + inflateBack() may then be used multiple times to inflate a complete, raw + deflate stream with each call. inflateBackEnd() is then called to free the + allocated state. + + A raw deflate stream is one with no zlib or gzip header or trailer. + This routine would normally be used in a utility that reads zip or gzip + files and writes out uncompressed files. The utility would decode the + header and process the trailer on its own, hence this routine expects only + the raw deflate stream to decompress. This is different from the normal + behavior of inflate(), which expects either a zlib or gzip header and + trailer around the deflate stream. + + inflateBack() uses two subroutines supplied by the caller that are then + called by inflateBack() for input and output. inflateBack() calls those + routines until it reads a complete deflate stream and writes out all of the + uncompressed data, or until it encounters an error. The function's + parameters and return types are defined above in the in_func and out_func + typedefs. inflateBack() will call in(in_desc, &buf) which should return the + number of bytes of provided input, and a pointer to that input in buf. If + there is no input available, in() must return zero--buf is ignored in that + case--and inflateBack() will return a buffer error. inflateBack() will call + out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out() + should return zero on success, or non-zero on failure. If out() returns + non-zero, inflateBack() will return with an error. Neither in() nor out() + are permitted to change the contents of the window provided to + inflateBackInit(), which is also the buffer that out() uses to write from. + The length written by out() will be at most the window size. Any non-zero + amount of input may be provided by in(). + + For convenience, inflateBack() can be provided input on the first call by + setting strm->next_in and strm->avail_in. If that input is exhausted, then + in() will be called. Therefore strm->next_in must be initialized before + calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called + immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in + must also be initialized, and then if strm->avail_in is not zero, input will + initially be taken from strm->next_in[0 .. 
strm->avail_in - 1]. + + The in_desc and out_desc parameters of inflateBack() is passed as the + first parameter of in() and out() respectively when they are called. These + descriptors can be optionally used to pass any information that the caller- + supplied in() and out() functions need to do their job. + + On return, inflateBack() will set strm->next_in and strm->avail_in to + pass back any unused input that was provided by the last in() call. The + return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR + if in() or out() returned an error, Z_DATA_ERROR if there was a format error + in the deflate stream (in which case strm->msg is set to indicate the nature + of the error), or Z_STREAM_ERROR if the stream was not properly initialized. + In the case of Z_BUF_ERROR, an input or output error can be distinguished + using strm->next_in which will be Z_NULL only if in() returned an error. If + strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to out() returning + non-zero. (in() will always be called before out(), so strm->next_in is + assured to be defined if out() returns non-zero.) Note that inflateBack() + cannot return Z_OK. +*/ + +ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm)); +/* + All memory allocated by inflateBackInit() is freed. + + inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream + state was inconsistent. +*/ + +ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void)); +/* Return flags indicating compile-time options. + + Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other: + 1.0: size of uInt + 3.2: size of uLong + 5.4: size of voidpf (pointer) + 7.6: size of z_off_t + + Compiler, assembler, and debug options: + 8: DEBUG + 9: ASMV or ASMINF -- use ASM code + 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention + 11: 0 (reserved) + + One-time table building (smaller code, but not thread-safe if true): + 12: BUILDFIXED -- build static block decoding tables when needed + 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed + 14,15: 0 (reserved) + + Library content (indicates missing functionality): + 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking + deflate code when not needed) + 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect + and decode gzip streams (to avoid linking crc code) + 18-19: 0 (reserved) + + Operation variations (changes in library functionality): + 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate + 21: FASTEST -- deflate algorithm with only one, lowest compression level + 22,23: 0 (reserved) + + The sprintf variant used by gzprintf (zero is best): + 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format + 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure! + 26: 0 = returns value, 1 = void -- 1 means inferred string length returned + + Remainder: + 27-31: 0 (reserved) + */ + +#ifndef Z_SOLO + + /* utility functions */ + +/* + The following utility functions are implemented on top of the basic + stream-oriented functions. To simplify the interface, some default options + are assumed (compression level and memory usage, standard memory allocation + functions). The source code of these utility functions can be modified if + you need special options. +*/ + +ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Compresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. 
Upon entry, destLen is the total size + of the destination buffer, which must be at least the value returned by + compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + + compress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer. +*/ + +ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen, + int level)); +/* + Compresses the source buffer into the destination buffer. The level + parameter has the same meaning as in deflateInit. sourceLen is the byte + length of the source buffer. Upon entry, destLen is the total size of the + destination buffer, which must be at least the value returned by + compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + + compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_BUF_ERROR if there was not enough room in the output buffer, + Z_STREAM_ERROR if the level parameter is invalid. +*/ + +ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen)); +/* + compressBound() returns an upper bound on the compressed size after + compress() or compress2() on sourceLen bytes. It would be used before a + compress() or compress2() call to allocate the destination buffer. +*/ + +ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Decompresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total size + of the destination buffer, which must be large enough to hold the entire + uncompressed data. (The size of the uncompressed data must have been saved + previously by the compressor and transmitted to the decompressor by some + mechanism outside the scope of this compression library.) Upon exit, destLen + is the actual size of the uncompressed buffer. + + uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. In + the case where there is not enough room, uncompress() will fill the output + buffer with the uncompressed data up to that point. +*/ + + /* gzip file access functions */ + +/* + This library supports reading and writing files in gzip (.gz) format with + an interface similar to that of stdio, using the functions that start with + "gz". The gzip format is different from the zlib format. gzip is a gzip + wrapper, documented in RFC 1952, wrapped around a deflate stream. +*/ + +typedef struct gzFile_s *gzFile; /* semi-opaque gzip file descriptor */ + +/* +ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode)); + + Opens a gzip (.gz) file for reading or writing. The mode parameter is as + in fopen ("rb" or "wb") but can also include a compression level ("wb9") or + a strategy: 'f' for filtered data as in "wb6f", 'h' for Huffman-only + compression as in "wb1h", 'R' for run-length encoding as in "wb1R", or 'F' + for fixed code compression as in "wb9F". (See the description of + deflateInit2 for more information about the strategy parameter.) 'T' will + request transparent writing or appending with no compression and not using + the gzip format. + + "a" can be used instead of "w" to request that the gzip stream that will + be written be appended to the file. 
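+
+     Example mode strings (illustrative only; the file names are
+     placeholders):
+
+       gzFile g1 = gzopen("in.txt.gz", "rb");     // read
+       gzFile g2 = gzopen("out.txt.gz", "wb9");   // write, level 9
+       gzFile g3 = gzopen("log.txt.gz", "ab");    // append a new gzip stream
+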
"+" will result in an error, since + reading and writing to the same gzip file is not supported. The addition of + "x" when writing will create the file exclusively, which fails if the file + already exists. On systems that support it, the addition of "e" when + reading or writing will set the flag to close the file on an execve() call. + + These functions, as well as gzip, will read and decode a sequence of gzip + streams in a file. The append function of gzopen() can be used to create + such a file. (Also see gzflush() for another way to do this.) When + appending, gzopen does not test whether the file begins with a gzip stream, + nor does it look for the end of the gzip streams to begin appending. gzopen + will simply append a gzip stream to the existing file. + + gzopen can be used to read a file which is not in gzip format; in this + case gzread will directly read from the file without decompression. When + reading, this will be detected automatically by looking for the magic two- + byte gzip header. + + gzopen returns NULL if the file could not be opened, if there was + insufficient memory to allocate the gzFile state, or if an invalid mode was + specified (an 'r', 'w', or 'a' was not provided, or '+' was provided). + errno can be checked to determine if the reason gzopen failed was that the + file could not be opened. +*/ + +ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode)); +/* + gzdopen associates a gzFile with the file descriptor fd. File descriptors + are obtained from calls like open, dup, creat, pipe or fileno (if the file + has been previously opened with fopen). The mode parameter is as in gzopen. + + The next call of gzclose on the returned gzFile will also close the file + descriptor fd, just like fclose(fdopen(fd, mode)) closes the file descriptor + fd. If you want to keep fd open, use fd = dup(fd_keep); gz = gzdopen(fd, + mode);. The duplicated descriptor should be saved to avoid a leak, since + gzdopen does not close fd if it fails. If you are using fileno() to get the + file descriptor from a FILE *, then you will have to use dup() to avoid + double-close()ing the file descriptor. Both gzclose() and fclose() will + close the associated file descriptor, so they need to have different file + descriptors. + + gzdopen returns NULL if there was insufficient memory to allocate the + gzFile state, if an invalid mode was specified (an 'r', 'w', or 'a' was not + provided, or '+' was provided), or if fd is -1. The file descriptor is not + used until the next gz* read, write, seek, or close operation, so gzdopen + will not detect if fd is invalid (unless fd is -1). +*/ + +ZEXTERN int ZEXPORT gzbuffer OF((gzFile file, unsigned size)); +/* + Set the internal buffer size used by this library's functions. The + default buffer size is 8192 bytes. This function must be called after + gzopen() or gzdopen(), and before any other calls that read or write the + file. The buffer memory allocation is always deferred to the first read or + write. Two buffers are allocated, either both of the specified size when + writing, or one of the specified size and the other twice that size when + reading. A larger buffer size of, for example, 64K or 128K bytes will + noticeably increase the speed of decompression (reading). + + The new buffer size also affects the maximum length for gzprintf(). + + gzbuffer() returns 0 on success, or -1 on failure, such as being called + too late. 
+*/ + +ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy)); +/* + Dynamically update the compression level or strategy. See the description + of deflateInit2 for the meaning of these parameters. + + gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not + opened for writing. +*/ + +ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len)); +/* + Reads the given number of uncompressed bytes from the compressed file. If + the input file is not in gzip format, gzread copies the given number of + bytes into the buffer directly from the file. + + After reaching the end of a gzip stream in the input, gzread will continue + to read, looking for another gzip stream. Any number of gzip streams may be + concatenated in the input file, and will all be decompressed by gzread(). + If something other than a gzip stream is encountered after a gzip stream, + that remaining trailing garbage is ignored (and no error is returned). + + gzread can be used to read a gzip file that is being concurrently written. + Upon reaching the end of the input, gzread will return with the available + data. If the error code returned by gzerror is Z_OK or Z_BUF_ERROR, then + gzclearerr can be used to clear the end of file indicator in order to permit + gzread to be tried again. Z_OK indicates that a gzip stream was completed + on the last gzread. Z_BUF_ERROR indicates that the input file ended in the + middle of a gzip stream. Note that gzread does not return -1 in the event + of an incomplete gzip stream. This error is deferred until gzclose(), which + will return Z_BUF_ERROR if the last gzread ended in the middle of a gzip + stream. Alternatively, gzerror can be used before gzclose to detect this + case. + + gzread returns the number of uncompressed bytes actually read, less than + len for end of file, or -1 for error. +*/ + +ZEXTERN int ZEXPORT gzwrite OF((gzFile file, + voidpc buf, unsigned len)); +/* + Writes the given number of uncompressed bytes into the compressed file. + gzwrite returns the number of uncompressed bytes written or 0 in case of + error. +*/ + +ZEXTERN int ZEXPORTVA gzprintf Z_ARG((gzFile file, const char *format, ...)); +/* + Converts, formats, and writes the arguments to the compressed file under + control of the format string, as in fprintf. gzprintf returns the number of + uncompressed bytes actually written, or 0 in case of error. The number of + uncompressed bytes written is limited to 8191, or one less than the buffer + size given to gzbuffer(). The caller should assure that this limit is not + exceeded. If it is exceeded, then gzprintf() will return an error (0) with + nothing written. In this case, there may also be a buffer overflow with + unpredictable consequences, which is possible only if zlib was compiled with + the insecure functions sprintf() or vsprintf() because the secure snprintf() + or vsnprintf() functions were not available. This can be determined using + zlibCompileFlags(). +*/ + +ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s)); +/* + Writes the given null-terminated string to the compressed file, excluding + the terminating null character. + + gzputs returns the number of characters written, or -1 in case of error. +*/ + +ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len)); +/* + Reads bytes from the compressed file until len-1 characters are read, or a + newline character is read and transferred to buf, or an end-of-file + condition is encountered. 
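+
+     For example (an illustrative sketch; line and handle_line() are
+     placeholders, not zlib identifiers):
+
+       char line[1024];
+       while (gzgets(file, line, (int)sizeof(line)) != NULL)
+           handle_line(line);
+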
If any characters are read or if len == 1, the + string is terminated with a null character. If no characters are read due + to an end-of-file or len < 1, then the buffer is left untouched. + + gzgets returns buf which is a null-terminated string, or it returns NULL + for end-of-file or in case of error. If there was an error, the contents at + buf are indeterminate. +*/ + +ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c)); +/* + Writes c, converted to an unsigned char, into the compressed file. gzputc + returns the value that was written, or -1 in case of error. +*/ + +ZEXTERN int ZEXPORT gzgetc OF((gzFile file)); +/* + Reads one byte from the compressed file. gzgetc returns this byte or -1 + in case of end of file or error. This is implemented as a macro for speed. + As such, it does not do all of the checking the other functions do. I.e. + it does not check to see if file is NULL, nor whether the structure file + points to has been clobbered or not. +*/ + +ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file)); +/* + Push one character back onto the stream to be read as the first character + on the next read. At least one character of push-back is allowed. + gzungetc() returns the character pushed, or -1 on failure. gzungetc() will + fail if c is -1, and may fail if a character has been pushed but not read + yet. If gzungetc is used immediately after gzopen or gzdopen, at least the + output buffer size of pushed characters is allowed. (See gzbuffer above.) + The pushed character will be discarded if the stream is repositioned with + gzseek() or gzrewind(). +*/ + +ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush)); +/* + Flushes all pending output into the compressed file. The parameter flush + is as in the deflate() function. The return value is the zlib error number + (see function gzerror below). gzflush is only permitted when writing. + + If the flush parameter is Z_FINISH, the remaining data is written and the + gzip stream is completed in the output. If gzwrite() is called again, a new + gzip stream will be started in the output. gzread() is able to read such + concatented gzip streams. + + gzflush should be called only when strictly necessary because it will + degrade compression if called too often. +*/ + +/* +ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file, + z_off_t offset, int whence)); + + Sets the starting position for the next gzread or gzwrite on the given + compressed file. The offset represents a number of bytes in the + uncompressed data stream. The whence parameter is defined as in lseek(2); + the value SEEK_END is not supported. + + If the file is opened for reading, this function is emulated but can be + extremely slow. If the file is opened for writing, only forward seeks are + supported; gzseek then compresses a sequence of zeroes up to the new + starting position. + + gzseek returns the resulting offset location as measured in bytes from + the beginning of the uncompressed stream, or -1 in case of error, in + particular if the file is opened for writing and the new starting position + would be before the current position. +*/ + +ZEXTERN int ZEXPORT gzrewind OF((gzFile file)); +/* + Rewinds the given file. This function is supported only for reading. + + gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET) +*/ + +/* +ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file)); + + Returns the starting position for the next gzread or gzwrite on the given + compressed file. 
This position represents a number of bytes in the + uncompressed data stream, and is zero when starting, even if appending or + reading a gzip stream from the middle of a file using gzdopen(). + + gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) +*/ + +/* +ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile file)); + + Returns the current offset in the file being read or written. This offset + includes the count of bytes that precede the gzip stream, for example when + appending or when using gzdopen() for reading. When reading, the offset + does not include as yet unused buffered input. This information can be used + for a progress indicator. On error, gzoffset() returns -1. +*/ + +ZEXTERN int ZEXPORT gzeof OF((gzFile file)); +/* + Returns true (1) if the end-of-file indicator has been set while reading, + false (0) otherwise. Note that the end-of-file indicator is set only if the + read tried to go past the end of the input, but came up short. Therefore, + just like feof(), gzeof() may return false even if there is no more data to + read, in the event that the last read request was for the exact number of + bytes remaining in the input file. This will happen if the input file size + is an exact multiple of the buffer size. + + If gzeof() returns true, then the read functions will return no more data, + unless the end-of-file indicator is reset by gzclearerr() and the input file + has grown since the previous end of file was detected. +*/ + +ZEXTERN int ZEXPORT gzdirect OF((gzFile file)); +/* + Returns true (1) if file is being copied directly while reading, or false + (0) if file is a gzip stream being decompressed. + + If the input file is empty, gzdirect() will return true, since the input + does not contain a gzip stream. + + If gzdirect() is used immediately after gzopen() or gzdopen() it will + cause buffers to be allocated to allow reading the file to determine if it + is a gzip file. Therefore if gzbuffer() is used, it should be called before + gzdirect(). + + When writing, gzdirect() returns true (1) if transparent writing was + requested ("wT" for the gzopen() mode), or false (0) otherwise. (Note: + gzdirect() is not needed when writing. Transparent writing must be + explicitly requested, so the application already knows the answer. When + linking statically, using gzdirect() will include all of the zlib code for + gzip file reading and decompression, which may not be desired.) +*/ + +ZEXTERN int ZEXPORT gzclose OF((gzFile file)); +/* + Flushes all pending output if necessary, closes the compressed file and + deallocates the (de)compression state. Note that once file is closed, you + cannot call gzerror with file, since its structures have been deallocated. + gzclose must not be called more than once on the same file, just as free + must not be called more than once on the same allocation. + + gzclose will return Z_STREAM_ERROR if file is not valid, Z_ERRNO on a + file operation error, Z_MEM_ERROR if out of memory, Z_BUF_ERROR if the + last read ended in the middle of a gzip stream, or Z_OK on success. +*/ + +ZEXTERN int ZEXPORT gzclose_r OF((gzFile file)); +ZEXTERN int ZEXPORT gzclose_w OF((gzFile file)); +/* + Same as gzclose(), but gzclose_r() is only for use when reading, and + gzclose_w() is only for use when writing or appending. The advantage to + using these instead of gzclose() is that they avoid linking in zlib + compression or decompression code that is not used when only reading or only + writing respectively. 
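+
+     For instance (illustrative only; the file name is a placeholder):
+
+       gzFile in = gzopen("in.txt.gz", "rb");
+       // ... read from in ...
+       gzclose_r(in);                     // links only the inflate-side code
+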
If gzclose() is used, then both compression and + decompression code will be included the application when linking to a static + zlib library. +*/ + +ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum)); +/* + Returns the error message for the last error which occurred on the given + compressed file. errnum is set to zlib error number. If an error occurred + in the file system and not in the compression library, errnum is set to + Z_ERRNO and the application may consult errno to get the exact error code. + + The application must not modify the returned string. Future calls to + this function may invalidate the previously returned string. If file is + closed, then the string previously returned by gzerror will no longer be + available. + + gzerror() should be used to distinguish errors from end-of-file for those + functions above that do not distinguish those cases in their return values. +*/ + +ZEXTERN void ZEXPORT gzclearerr OF((gzFile file)); +/* + Clears the error and end-of-file flags for file. This is analogous to the + clearerr() function in stdio. This is useful for continuing to read a gzip + file that is being written concurrently. +*/ + +#endif /* !Z_SOLO */ + + /* checksum functions */ + +/* + These functions are not related to compression but are exported + anyway because they might be useful in applications using the compression + library. +*/ + +ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); +/* + Update a running Adler-32 checksum with the bytes buf[0..len-1] and + return the updated checksum. If buf is Z_NULL, this function returns the + required initial value for the checksum. + + An Adler-32 checksum is almost as reliable as a CRC32 but can be computed + much faster. + + Usage example: + + uLong adler = adler32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + adler = adler32(adler, buffer, length); + } + if (adler != original_adler) error(); +*/ + +/* +ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2, + z_off_t len2)); + + Combine two Adler-32 checksums into one. For two sequences of bytes, seq1 + and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for + each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of + seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. Note + that the z_off_t type (like off_t) is a signed integer. If len2 is + negative, the result has no meaning or utility. +*/ + +ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); +/* + Update a running CRC-32 with the bytes buf[0..len-1] and return the + updated CRC-32. If buf is Z_NULL, this function returns the required + initial value for the crc. Pre- and post-conditioning (one's complement) is + performed within this function so it shouldn't be done by the application. + + Usage example: + + uLong crc = crc32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + crc = crc32(crc, buffer, length); + } + if (crc != original_crc) error(); +*/ + +/* +ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2)); + + Combine two CRC-32 check values into one. For two sequences of bytes, + seq1 and seq2 with lengths len1 and len2, CRC-32 check values were + calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 + check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and + len2. 
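   [Editor's note: the usage sketch below is not part of the upstream header;
   it mirrors the "Usage example" style used for adler32()/crc32() above.
   part1/part2 and len1/len2 are placeholder buffers and lengths, e.g. two
   halves of a file that were checksummed independently.]

     uLong crc1 = crc32(0L, Z_NULL, 0);
     uLong crc2 = crc32(0L, Z_NULL, 0);
     uLong crc_all;

     crc1 = crc32(crc1, part1, len1);     /* CRC-32 of the first buffer  */
     crc2 = crc32(crc2, part2, len2);     /* CRC-32 of the second buffer */

     /* CRC-32 of part1 followed by part2, computed without re-reading
        either buffer; only crc1, crc2 and len2 are needed */
     crc_all = crc32_combine(crc1, crc2, len2);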
+*/ + + + /* various hacks, don't look :) */ + +/* deflateInit and inflateInit are macros to allow checking the zlib version + * and the compiler's view of z_stream: + */ +ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method, + int windowBits, int memLevel, + int strategy, const char *version, + int stream_size)); +ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits, + unsigned char FAR *window, + const char *version, + int stream_size)); +#define deflateInit(strm, level) \ + deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream)) +#define inflateInit(strm) \ + inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream)) +#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ + deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ + (strategy), ZLIB_VERSION, (int)sizeof(z_stream)) +#define inflateInit2(strm, windowBits) \ + inflateInit2_((strm), (windowBits), ZLIB_VERSION, \ + (int)sizeof(z_stream)) +#define inflateBackInit(strm, windowBits, window) \ + inflateBackInit_((strm), (windowBits), (window), \ + ZLIB_VERSION, (int)sizeof(z_stream)) + +#ifndef Z_SOLO + +/* gzgetc() macro and its supporting function and exposed data structure. Note + * that the real internal state is much larger than the exposed structure. + * This abbreviated structure exposes just enough for the gzgetc() macro. The + * user should not mess with these exposed elements, since their names or + * behavior could change in the future, perhaps even capriciously. They can + * only be used by the gzgetc() macro. You have been warned. + */ +struct gzFile_s { + unsigned have; + unsigned char *next; + z_off64_t pos; +}; +ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file)); /* backward compatibility */ +#ifdef Z_PREFIX_SET +# undef z_gzgetc +# define z_gzgetc(g) \ + ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : gzgetc(g)) +#else +# define gzgetc(g) \ + ((g)->have ? 
((g)->have--, (g)->pos++, *((g)->next)++) : gzgetc(g)) +#endif + +/* provide 64-bit offset functions if _LARGEFILE64_SOURCE defined, and/or + * change the regular functions to 64 bits if _FILE_OFFSET_BITS is 64 (if + * both are true, the application gets the *64 functions, and the regular + * functions are changed to 64 bits) -- in case these are set on systems + * without large file support, _LFS64_LARGEFILE must also be true + */ +#ifdef Z_LARGE64 + ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *)); + ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int)); + ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile)); + ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile)); + ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off64_t)); + ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off64_t)); +#endif + +#if !defined(ZLIB_INTERNAL) && defined(Z_WANT64) +# ifdef Z_PREFIX_SET +# define z_gzopen z_gzopen64 +# define z_gzseek z_gzseek64 +# define z_gztell z_gztell64 +# define z_gzoffset z_gzoffset64 +# define z_adler32_combine z_adler32_combine64 +# define z_crc32_combine z_crc32_combine64 +# else +# define gzopen gzopen64 +# define gzseek gzseek64 +# define gztell gztell64 +# define gzoffset gzoffset64 +# define adler32_combine adler32_combine64 +# define crc32_combine crc32_combine64 +# endif +# ifndef Z_LARGE64 + ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *)); + ZEXTERN z_off_t ZEXPORT gzseek64 OF((gzFile, z_off_t, int)); + ZEXTERN z_off_t ZEXPORT gztell64 OF((gzFile)); + ZEXTERN z_off_t ZEXPORT gzoffset64 OF((gzFile)); + ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t)); + ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t)); +# endif +#else + ZEXTERN gzFile ZEXPORT gzopen OF((const char *, const char *)); + ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile, z_off_t, int)); + ZEXTERN z_off_t ZEXPORT gztell OF((gzFile)); + ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile)); + ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t)); + ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t)); +#endif + +#else /* Z_SOLO */ + + ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t)); + ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t)); + +#endif /* !Z_SOLO */ + +/* hack for buggy compilers */ +#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL) + struct internal_state {int dummy;}; +#endif + +/* undocumented functions */ +ZEXTERN const char * ZEXPORT zError OF((int)); +ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp)); +ZEXTERN const z_crc_t FAR * ZEXPORT get_crc_table OF((void)); +ZEXTERN int ZEXPORT inflateUndermine OF((z_streamp, int)); +ZEXTERN int ZEXPORT inflateResetKeep OF((z_streamp)); +ZEXTERN int ZEXPORT deflateResetKeep OF((z_streamp)); +#if defined(_WIN32) && !defined(Z_SOLO) +ZEXTERN gzFile ZEXPORT gzopen_w OF((const wchar_t *path, + const char *mode)); +#endif +#if defined(STDC) || defined(Z_HAVE_STDARG_H) +# ifndef Z_SOLO +ZEXTERN int ZEXPORTVA gzvprintf Z_ARG((gzFile file, + const char *format, + va_list va)); +# endif +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* ZLIB_H */ diff --git a/compat/getopt/getopt_long.c b/compat/getopt/getopt_long.c index 90fc0cfaac..a78ec8fc40 100644 --- a/compat/getopt/getopt_long.c +++ b/compat/getopt/getopt_long.c @@ -54,42 +54,39 @@ static const char rcsid[]="$Id: getopt_long.c,v 1.1 2009/10/16 19:50:28 rodney E * POSSIBILITY OF SUCH DAMAGE. 
*/ -#if 0 -#include -#endif #include #include #include #include -#ifdef _WIN32 - -/* Windows needs warnx(). We change the definition though: - * 1. (another) global is defined, opterrmsg, which holds the error message - * 2. errors are always printed out on stderr w/o the program name - * Note that opterrmsg always gets set no matter what opterr is set to. The - * error message will not be printed if opterr is 0 as usual. - */ - -#include -#include -#include - -char opterrmsg[128]; /* last error message is stored here */ - -static void warnx(const char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - if (fmt != NULL) - _vsnprintf(opterrmsg, 128, fmt, ap); - else - opterrmsg[0]='\0'; - va_end(ap); - fprintf(stderr, opterrmsg); - fprintf(stderr, "\n"); -} - +#ifdef _WIN32 + +/* Windows needs warnx(). We change the definition though: + * 1. (another) global is defined, opterrmsg, which holds the error message + * 2. errors are always printed out on stderr w/o the program name + * Note that opterrmsg always gets set no matter what opterr is set to. The + * error message will not be printed if opterr is 0 as usual. + */ + +#include +#include +#include + +char opterrmsg[128]; /* last error message is stored here */ + +static void warnx(const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + if (fmt != NULL) + _vsnprintf(opterrmsg, 128, fmt, ap); + else + opterrmsg[0]='\0'; + va_end(ap); + fprintf(stderr, opterrmsg); + fprintf(stderr, "\n"); +} + #endif /*_WIN32*/ #define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ diff --git a/compat/jansson/.gitignore b/compat/jansson/.gitignore new file mode 100644 index 0000000000..1a81967d00 --- /dev/null +++ b/compat/jansson/.gitignore @@ -0,0 +1,3 @@ +*.h.in~ +libtool +libjansson.a diff --git a/compat/jansson/Makefile.am b/compat/jansson/Makefile.am index ff38e51653..29e974574a 100644 --- a/compat/jansson/Makefile.am +++ b/compat/jansson/Makefile.am @@ -1,18 +1,20 @@ - -noinst_LIBRARIES = libjansson.a - -libjansson_a_SOURCES = \ - config.h \ - dump.c \ - hashtable.c \ - hashtable.h \ - jansson.h \ - jansson_private.h \ - load.c \ - strbuffer.c \ - strbuffer.h \ - utf.c \ - utf.h \ - util.h \ - value.c - + +noinst_LIBRARIES = libjansson.a + +libjansson_a_SOURCES = \ + jansson_private_config.h \ + dump.c \ + error.c \ + hashtable.c hashtable.h \ + jansson.h \ + jansson_config.h \ + jansson_private.h \ + load.c \ + memory.c \ + pack_unpack.c \ + strbuffer.c strbuffer.h \ + strconv.c \ + utf.c utf.h \ + util.h \ + value.c + diff --git a/compat/jansson/Makefile.in b/compat/jansson/Makefile.in deleted file mode 100644 index 010caa2f88..0000000000 --- a/compat/jansson/Makefile.in +++ /dev/null @@ -1,478 +0,0 @@ -# Makefile.in generated by automake 1.11.3 from Makefile.am. -# @configure_input@ - -# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software -# Foundation, Inc. -# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. 
- -@SET_MAKE@ - -VPATH = @srcdir@ -pkgdatadir = $(datadir)/@PACKAGE@ -pkgincludedir = $(includedir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -pkglibexecdir = $(libexecdir)/@PACKAGE@ -am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -target_triplet = @target@ -subdir = compat/jansson -DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -mkinstalldirs = $(install_sh) -d -CONFIG_HEADER = $(top_builddir)/cpuminer-config.h -CONFIG_CLEAN_FILES = -CONFIG_CLEAN_VPATH_FILES = -LIBRARIES = $(noinst_LIBRARIES) -AR = ar -ARFLAGS = cru -libjansson_a_AR = $(AR) $(ARFLAGS) -libjansson_a_LIBADD = -am_libjansson_a_OBJECTS = dump.$(OBJEXT) hashtable.$(OBJEXT) \ - load.$(OBJEXT) strbuffer.$(OBJEXT) utf.$(OBJEXT) \ - value.$(OBJEXT) -libjansson_a_OBJECTS = $(am_libjansson_a_OBJECTS) -DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) -depcomp = $(SHELL) $(top_srcdir)/depcomp -am__depfiles_maybe = depfiles -am__mv = mv -f -COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ - $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -CCLD = $(CC) -LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ -SOURCES = $(libjansson_a_SOURCES) -DIST_SOURCES = $(libjansson_a_SOURCES) -ETAGS = etags -CTAGS = ctags -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -ACLOCAL = @ACLOCAL@ -ALLOCA = @ALLOCA@ -AMTAR = @AMTAR@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -CC = @CC@ -CCAS = @CCAS@ -CCASDEPMODE = @CCASDEPMODE@ -CCASFLAGS = @CCASFLAGS@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CPP = @CPP@ -CPPFLAGS = @CPPFLAGS@ -CUDA_CFLAGS = @CUDA_CFLAGS@ -CUDA_LDFLAGS = @CUDA_LDFLAGS@ -CUDA_LIBS = @CUDA_LIBS@ -CXX = @CXX@ -CXXDEPMODE = @CXXDEPMODE@ -CXXFLAGS = @CXXFLAGS@ -CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -EXEEXT = @EXEEXT@ -GREP = @GREP@ -INSTALL = @INSTALL@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -JANSSON_LIBS = @JANSSON_LIBS@ -LDFLAGS = @LDFLAGS@ -LIBCURL = @LIBCURL@ -LIBCURL_CPPFLAGS = @LIBCURL_CPPFLAGS@ -LIBOBJS = @LIBOBJS@ -LIBS = @LIBS@ -LTLIBOBJS = @LTLIBOBJS@ -MAINT = @MAINT@ -MAKEINFO = @MAKEINFO@ -MKDIR_P = @MKDIR_P@ -NVCC = @NVCC@ -OBJEXT = @OBJEXT@ -OPENMP_CFLAGS = @OPENMP_CFLAGS@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_URL = @PACKAGE_URL@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -PTHREAD_FLAGS = @PTHREAD_FLAGS@ -PTHREAD_LIBS = @PTHREAD_LIBS@ -RANLIB = @RANLIB@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -STRIP = @STRIP@ -VERSION = @VERSION@ -WS2_LIBS = @WS2_LIBS@ -_libcurl_config = @_libcurl_config@ -abs_builddir = @abs_builddir@ -abs_srcdir = @abs_srcdir@ -abs_top_builddir = @abs_top_builddir@ -abs_top_srcdir = @abs_top_srcdir@ 
-ac_ct_CC = @ac_ct_CC@ -ac_ct_CXX = @ac_ct_CXX@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_vendor = @build_vendor@ -builddir = @builddir@ -datadir = @datadir@ -datarootdir = @datarootdir@ -docdir = @docdir@ -dvidir = @dvidir@ -exec_prefix = @exec_prefix@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -htmldir = @htmldir@ -includedir = @includedir@ -infodir = @infodir@ -install_sh = @install_sh@ -libdir = @libdir@ -libexecdir = @libexecdir@ -localedir = @localedir@ -localstatedir = @localstatedir@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -oldincludedir = @oldincludedir@ -pdfdir = @pdfdir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -psdir = @psdir@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -srcdir = @srcdir@ -sysconfdir = @sysconfdir@ -target = @target@ -target_alias = @target_alias@ -target_cpu = @target_cpu@ -target_os = @target_os@ -target_vendor = @target_vendor@ -top_build_prefix = @top_build_prefix@ -top_builddir = @top_builddir@ -top_srcdir = @top_srcdir@ -noinst_LIBRARIES = libjansson.a -libjansson_a_SOURCES = \ - config.h \ - dump.c \ - hashtable.c \ - hashtable.h \ - jansson.h \ - jansson_private.h \ - load.c \ - strbuffer.c \ - strbuffer.h \ - utf.c \ - utf.h \ - util.h \ - value.c - -all: all-am - -.SUFFIXES: -.SUFFIXES: .c .o .obj -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ - *$$dep*) \ - ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ - && { if test -f $@; then exit 0; else break; fi; }; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu compat/jansson/Makefile'; \ - $(am__cd) $(top_srcdir) && \ - $(AUTOMAKE) --gnu compat/jansson/Makefile -.PRECIOUS: Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' 
in \ - *config.status*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ - esac; - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(am__aclocal_m4_deps): - -clean-noinstLIBRARIES: - -test -z "$(noinst_LIBRARIES)" || rm -f $(noinst_LIBRARIES) -libjansson.a: $(libjansson_a_OBJECTS) $(libjansson_a_DEPENDENCIES) $(EXTRA_libjansson_a_DEPENDENCIES) - -rm -f libjansson.a - $(libjansson_a_AR) libjansson.a $(libjansson_a_OBJECTS) $(libjansson_a_LIBADD) - $(RANLIB) libjansson.a - -mostlyclean-compile: - -rm -f *.$(OBJEXT) - -distclean-compile: - -rm -f *.tab.c - -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dump.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hashtable.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/load.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/strbuffer.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/value.Po@am__quote@ - -.c.o: -@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(COMPILE) -c $< - -.c.obj: -@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` -@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` - -ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in files) print i; }; }'`; \ - mkid -fID $$unique -tags: TAGS - -TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - set x; \ - here=`pwd`; \ - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in files) print i; }; }'`; \ - shift; \ - if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - if test $$# -gt 0; then \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - "$$@" $$unique; \ - else \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$unique; \ - fi; \ - fi -ctags: CTAGS -CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ 
- if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ - END { if (nonempty) { for (i in files) print i; }; }'`; \ - test -z "$(CTAGS_ARGS)$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$unique - -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && $(am__cd) $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) "$$here" - -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -distdir: $(DISTFILES) - @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - list='$(DISTFILES)'; \ - dist_files=`for file in $$list; do echo $$file; done | \ - sed -e "s|^$$srcdirstrip/||;t" \ - -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ - case $$dist_files in \ - */*) $(MKDIR_P) `echo "$$dist_files" | \ - sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ - sort -u` ;; \ - esac; \ - for file in $$dist_files; do \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - if test -d $$d/$$file; then \ - dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test -d "$(distdir)/$$file"; then \ - find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ - find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ - fi; \ - cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ - else \ - test -f "$(distdir)/$$file" \ - || cp -p $$d/$$file "$(distdir)/$$file" \ - || exit 1; \ - fi; \ - done -check-am: all-am -check: check-am -all-am: Makefile $(LIBRARIES) -installdirs: -install: install-am -install-exec: install-exec-am -install-data: install-data-am -uninstall: uninstall-am - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-am -install-strip: - if test -z '$(STRIP)'; then \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - install; \ - else \ - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ - fi -mostlyclean-generic: - -clean-generic: - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to rebuild." 
-clean: clean-am - -clean-am: clean-generic clean-noinstLIBRARIES mostlyclean-am - -distclean: distclean-am - -rm -rf ./$(DEPDIR) - -rm -f Makefile -distclean-am: clean-am distclean-compile distclean-generic \ - distclean-tags - -dvi: dvi-am - -dvi-am: - -html: html-am - -html-am: - -info: info-am - -info-am: - -install-data-am: - -install-dvi: install-dvi-am - -install-dvi-am: - -install-exec-am: - -install-html: install-html-am - -install-html-am: - -install-info: install-info-am - -install-info-am: - -install-man: - -install-pdf: install-pdf-am - -install-pdf-am: - -install-ps: install-ps-am - -install-ps-am: - -installcheck-am: - -maintainer-clean: maintainer-clean-am - -rm -rf ./$(DEPDIR) - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-am - -mostlyclean-am: mostlyclean-compile mostlyclean-generic - -pdf: pdf-am - -pdf-am: - -ps: ps-am - -ps-am: - -uninstall-am: - -.MAKE: install-am install-strip - -.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ - clean-noinstLIBRARIES ctags distclean distclean-compile \ - distclean-generic distclean-tags distdir dvi dvi-am html \ - html-am info info-am install install-am install-data \ - install-data-am install-dvi install-dvi-am install-exec \ - install-exec-am install-html install-html-am install-info \ - install-info-am install-man install-pdf install-pdf-am \ - install-ps install-ps-am install-strip installcheck \ - installcheck-am installdirs maintainer-clean \ - maintainer-clean-generic mostlyclean mostlyclean-compile \ - mostlyclean-generic pdf pdf-am ps ps-am tags uninstall \ - uninstall-am - - -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. -.NOEXPORT: diff --git a/compat/jansson/config.h b/compat/jansson/config.h deleted file mode 100644 index f11075a3c5..0000000000 --- a/compat/jansson/config.h +++ /dev/null @@ -1,73 +0,0 @@ -/* config.h. Generated from config.h.in by configure. */ -/* config.h.in. Generated from configure.ac by autoheader. */ - -/* Define to 1 if you have the header file. */ -#define HAVE_DLFCN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_INTTYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MEMORY_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRING_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* Define to the sub-directory in which libtool stores uninstalled libraries. - */ -#define LT_OBJDIR ".libs/" - -/* Name of package */ -#define PACKAGE "jansson" - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "petri@digip.org" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "jansson" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "jansson 1.3" - -/* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "jansson" - -/* Define to the home page for this package. */ -#define PACKAGE_URL "" - -/* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "1.3" - -/* Define to 1 if you have the ANSI C header files. */ -#define STDC_HEADERS 1 - -/* Version number of package */ -#define VERSION "1.3" - -/* Define to `__inline__' or `__inline' if that's what the C compiler - calls it, or to nothing if 'inline' is not supported under any name. */ -#ifndef __cplusplus -/* #undef inline */ -#endif - -/* Define to the type of a signed integer type of width exactly 32 bits if - such a type exists and the standard includes do not define it. */ -/* #undef int32_t */ diff --git a/compat/jansson/configure.ac b/compat/jansson/configure.ac new file mode 100644 index 0000000000..defa1832c4 --- /dev/null +++ b/compat/jansson/configure.ac @@ -0,0 +1,98 @@ +AC_PREREQ([2.60]) +AC_INIT([jansson], [2.6], [petri@digip.org]) + +AC_CONFIG_MACRO_DIR([m4]) + +AM_INIT_AUTOMAKE([1.10 foreign]) +m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) + +AC_CONFIG_SRCDIR([value.c]) +AC_CONFIG_HEADERS([jansson_private_config.h]) + +# Checks for programs. +AC_PROG_CC +AC_PROG_LIBTOOL +AM_CONDITIONAL([GCC], [test x$GCC = xyes]) + +# Checks for libraries. + +# Checks for header files. +AC_CHECK_HEADERS([endian.h fcntl.h locale.h sched.h unistd.h sys/param.h sys/stat.h sys/time.h sys/types.h]) + +# Checks for typedefs, structures, and compiler characteristics. +AC_TYPE_INT32_T +AC_TYPE_UINT32_T +AC_TYPE_LONG_LONG_INT + +AC_C_INLINE +case $ac_cv_c_inline in + yes) json_inline=inline;; + no) json_inline=;; + *) json_inline=$ac_cv_c_inline;; +esac +AC_SUBST([json_inline]) + +# Checks for library functions. +AC_CHECK_FUNCS([close getpid gettimeofday localeconv open read sched_yield strtoll]) + +AC_MSG_CHECKING([for gcc __sync builtins]) +have_sync_builtins=no +AC_TRY_LINK( + [], [unsigned long val; __sync_bool_compare_and_swap(&val, 0, 1);], + [have_sync_builtins=yes], +) +if test "x$have_sync_builtins" = "xyes"; then + AC_DEFINE([HAVE_SYNC_BUILTINS], [1], + [Define to 1 if gcc's __sync builtins are available]) +fi +AC_MSG_RESULT([$have_sync_builtins]) + +AC_MSG_CHECKING([for gcc __atomic builtins]) +have_atomic_builtins=no +AC_TRY_LINK( + [], [char l; unsigned long v; __atomic_test_and_set(&l, __ATOMIC_RELAXED); __atomic_store_n(&v, 1, __ATOMIC_ACQ_REL); __atomic_load_n(&v, __ATOMIC_ACQUIRE);], + [have_atomic_builtins=yes], +) +if test "x$have_atomic_builtins" = "xyes"; then + AC_DEFINE([HAVE_ATOMIC_BUILTINS], [1], + [Define to 1 if gcc's __atomic builtins are available]) +fi +AC_MSG_RESULT([$have_atomic_builtins]) + +case "$ac_cv_type_long_long_int$ac_cv_func_strtoll" in + yesyes) json_have_long_long=1;; + *) json_have_long_long=0;; +esac +AC_SUBST([json_have_long_long]) + +case "$ac_cv_header_locale_h$ac_cv_func_localeconv" in + yesyes) json_have_localeconv=1;; + *) json_have_localeconv=0;; +esac +AC_SUBST([json_have_localeconv]) + +# Features +AC_ARG_ENABLE([urandom], + [AS_HELP_STRING([--disable-urandom], + [Don't use /dev/urandom to seed the hash function])], + [use_urandom=$enableval], [use_urandom=yes]) + +if test "x$use_urandom" = xyes; then +AC_DEFINE([USE_URANDOM], [1], + [Define to 1 if /dev/urandom should be used for seeding the hash function]) +fi + +AC_ARG_ENABLE([windows-cryptoapi], + [AS_HELP_STRING([--disable-windows-cryptoapi], + [Don't use CryptGenRandom to seed the hash function])], + [use_windows_cryptoapi=$enableval], [use_windows_cryptoapi=yes]) + +if test "x$use_windows_cryptoapi" = xyes; then +AC_DEFINE([USE_WINDOWS_CRYPTOAPI], [1], + [Define to 1 if CryptGenRandom should be used for seeding the hash function]) +fi + 
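[Editor's sketch, not jansson source: the two AC_TRY_LINK probes above only
decide whether HAVE_SYNC_BUILTINS / HAVE_ATOMIC_BUILTINS get defined. The
fragment below illustrates how HAVE_SYNC_BUILTINS might then be consumed for a
thread-safe, one-time initialization; hash_seed and seed_init_once are
hypothetical names, not identifiers from this patch.]

static unsigned long hash_seed = 0;

static void seed_init_once(unsigned long fresh_seed)
{
#ifdef HAVE_SYNC_BUILTINS
    /* the first caller atomically swaps 0 -> fresh_seed; later calls are no-ops */
    __sync_bool_compare_and_swap(&hash_seed, 0UL, fresh_seed);
#else
    if (hash_seed == 0)          /* fallback without an atomicity guarantee */
        hash_seed = fresh_seed;
#endif
}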
+AC_CONFIG_FILES([ + Makefile +]) +AC_OUTPUT diff --git a/compat/jansson/dump.c b/compat/jansson/dump.c index a8c9cc646a..45450b118e 100644 --- a/compat/jansson/dump.c +++ b/compat/jansson/dump.c @@ -1,17 +1,20 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * Jansson is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif + #include #include #include #include -#include +#include "jansson.h" #include "jansson_private.h" #include "strbuffer.h" #include "utf.h" @@ -19,21 +22,17 @@ #define MAX_INTEGER_STR_LENGTH 100 #define MAX_REAL_STR_LENGTH 100 -typedef int (*dump_func)(const char *buffer, int size, void *data); - -struct string -{ - char *buffer; - int length; - int size; +struct object_key { + size_t serial; + const char *key; }; -static int dump_to_strbuffer(const char *buffer, int size, void *data) +static int dump_to_strbuffer(const char *buffer, size_t size, void *data) { return strbuffer_append_bytes((strbuffer_t *)data, buffer, size); } -static int dump_to_file(const char *buffer, int size, void *data) +static int dump_to_file(const char *buffer, size_t size, void *data) { FILE *dest = (FILE *)data; if(fwrite(buffer, size, 1, dest) != 1) @@ -41,10 +40,10 @@ static int dump_to_file(const char *buffer, int size, void *data) return 0; } -/* 256 spaces (the maximum indentation size) */ -static char whitespace[] = " "; +/* 32 spaces (the maximum indentation size) */ +static const char whitespace[] = " "; -static int dump_indent(unsigned long flags, int depth, int space, dump_func dump, void *data) +static int dump_indent(size_t flags, int depth, int space, json_dump_callback_t dump, void *data) { if(JSON_INDENT(flags) > 0) { @@ -66,7 +65,7 @@ static int dump_indent(unsigned long flags, int depth, int space, dump_func dump return 0; } -static int dump_string(const char *str, int ascii, dump_func dump, void *data) +static int dump_string(const char *str, json_dump_callback_t dump, void *data, size_t flags) { const char *pos, *end; int32_t codepoint; @@ -91,8 +90,12 @@ static int dump_string(const char *str, int ascii, dump_func dump, void *data) if(codepoint == '\\' || codepoint == '"' || codepoint < 0x20) break; + /* slash */ + if((flags & JSON_ESCAPE_SLASH) && codepoint == '/') + break; + /* non-ASCII */ - if(ascii && codepoint > 0x7F) + if((flags & JSON_ENSURE_ASCII) && codepoint > 0x7F) break; pos = end; @@ -106,7 +109,7 @@ static int dump_string(const char *str, int ascii, dump_func dump, void *data) if(end == pos) break; - /* handle \, ", and control codes */ + /* handle \, /, ", and control codes */ length = 2; switch(codepoint) { @@ -117,6 +120,7 @@ static int dump_string(const char *str, int ascii, dump_func dump, void *data) case '\n': text = "\\n"; break; case '\r': text = "\\r"; break; case '\t': text = "\\t"; break; + case '/': text = "\\/"; break; default: { /* codepoint is in BMP */ @@ -155,20 +159,23 @@ static int dump_string(const char *str, int ascii, dump_func dump, void *data) static int object_key_compare_keys(const void *key1, const void *key2) { - return strcmp((*(const object_key_t **)key1)->key, - (*(const object_key_t **)key2)->key); + return strcmp(((const struct object_key *)key1)->key, + ((const struct object_key *)key2)->key); } static int object_key_compare_serials(const void *key1, const void *key2) { - return (*(const object_key_t **)key1)->serial - - (*(const object_key_t **)key2)->serial; + 
size_t a = ((const struct object_key *)key1)->serial; + size_t b = ((const struct object_key *)key2)->serial; + + return a < b ? -1 : a == b ? 0 : 1; } -static int do_dump(const json_t *json, unsigned long flags, int depth, - dump_func dump, void *data) +static int do_dump(const json_t *json, size_t flags, int depth, + json_dump_callback_t dump, void *data) { - int ascii = flags & JSON_ENSURE_ASCII ? 1 : 0; + if(!json) + return -1; switch(json_typeof(json)) { case JSON_NULL: @@ -185,8 +192,10 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, char buffer[MAX_INTEGER_STR_LENGTH]; int size; - size = snprintf(buffer, MAX_INTEGER_STR_LENGTH, "%d", json_integer_value(json)); - if(size >= MAX_INTEGER_STR_LENGTH) + size = snprintf(buffer, MAX_INTEGER_STR_LENGTH, + "%" JSON_INTEGER_FORMAT, + json_integer_value(json)); + if(size < 0 || size >= MAX_INTEGER_STR_LENGTH) return -1; return dump(buffer, size, data); @@ -196,31 +205,17 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, { char buffer[MAX_REAL_STR_LENGTH]; int size; + double value = json_real_value(json); - size = snprintf(buffer, MAX_REAL_STR_LENGTH, "%.17g", - json_real_value(json)); - if(size >= MAX_REAL_STR_LENGTH) + size = jsonp_dtostr(buffer, MAX_REAL_STR_LENGTH, value); + if(size < 0) return -1; - /* Make sure there's a dot or 'e' in the output. Otherwise - a real is converted to an integer when decoding */ - if(strchr(buffer, '.') == NULL && - strchr(buffer, 'e') == NULL) - { - if(size + 2 >= MAX_REAL_STR_LENGTH) { - /* No space to append ".0" */ - return -1; - } - buffer[size] = '.'; - buffer[size + 1] = '0'; - size += 2; - } - return dump(buffer, size, data); } case JSON_STRING: - return dump_string(json_string_value(json), ascii, dump, data); + return dump_string(json_string_value(json), dump, data, flags); case JSON_ARRAY: { @@ -306,20 +301,20 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, if(flags & JSON_SORT_KEYS || flags & JSON_PRESERVE_ORDER) { - const object_key_t **keys; - unsigned int size; - unsigned int i; + struct object_key *keys; + size_t size, i; int (*cmp_func)(const void *, const void *); size = json_object_size(json); - keys = malloc(size * sizeof(object_key_t *)); + keys = jsonp_malloc(size * sizeof(struct object_key)); if(!keys) goto object_error; i = 0; while(iter) { - keys[i] = jsonp_object_iter_fullkey(iter); + keys[i].serial = hashtable_iter_serial(iter); + keys[i].key = json_object_iter_key(iter); iter = json_object_iter_next((json_t *)json, iter); i++; } @@ -330,22 +325,22 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, else cmp_func = object_key_compare_serials; - qsort((void*)keys, size, sizeof(object_key_t *), cmp_func); + qsort(keys, size, sizeof(struct object_key), cmp_func); for(i = 0; i < size; i++) { const char *key; json_t *value; - key = keys[i]->key; + key = keys[i].key; value = json_object_get(json, key); assert(value); - dump_string(key, ascii, dump, data); + dump_string(key, dump, data, flags); if(dump(separator, separator_length, data) || do_dump(value, flags, depth + 1, dump, data)) { - free((void*)keys); + jsonp_free(keys); goto object_error; } @@ -354,7 +349,7 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, if(dump(",", 1, data) || dump_indent(flags, depth + 1, 1, dump, data)) { - free((void*)keys); + jsonp_free(keys); goto object_error; } } @@ -362,13 +357,13 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, { if(dump_indent(flags, depth, 
0, dump, data)) { - free((void*)keys); + jsonp_free(keys); goto object_error; } } } - free((void*)keys); + jsonp_free(keys); } else { @@ -378,7 +373,7 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, { void *next = json_object_iter_next((json_t *)json, iter); - dump_string(json_object_iter_key(iter), ascii, dump, data); + dump_string(json_object_iter_key(iter), dump, data, flags); if(dump(separator, separator_length, data) || do_dump(json_object_iter_value(iter), flags, depth + 1, dump, data)) @@ -414,38 +409,29 @@ static int do_dump(const json_t *json, unsigned long flags, int depth, } } - -char *json_dumps(const json_t *json, unsigned long flags) +char *json_dumps(const json_t *json, size_t flags) { strbuffer_t strbuff; char *result; - if(!json_is_array(json) && !json_is_object(json)) - return NULL; - if(strbuffer_init(&strbuff)) return NULL; - if(do_dump(json, flags, 0, dump_to_strbuffer, (void *)&strbuff)) { - strbuffer_close(&strbuff); - return NULL; - } + if(json_dump_callback(json, dump_to_strbuffer, (void *)&strbuff, flags)) + result = NULL; + else + result = jsonp_strdup(strbuffer_value(&strbuff)); - result = strdup(strbuffer_value(&strbuff)); strbuffer_close(&strbuff); - return result; } -int json_dumpf(const json_t *json, FILE *output, unsigned long flags) +int json_dumpf(const json_t *json, FILE *output, size_t flags) { - if(!json_is_array(json) && !json_is_object(json)) - return -1; - - return do_dump(json, flags, 0, dump_to_file, (void *)output); + return json_dump_callback(json, dump_to_file, (void *)output, flags); } -int json_dump_file(const json_t *json, const char *path, unsigned long flags) +int json_dump_file(const json_t *json, const char *path, size_t flags) { int result; @@ -458,3 +444,13 @@ int json_dump_file(const json_t *json, const char *path, unsigned long flags) fclose(output); return result; } + +int json_dump_callback(const json_t *json, json_dump_callback_t callback, void *data, size_t flags) +{ + if(!(flags & JSON_ENCODE_ANY)) { + if(!json_is_array(json) && !json_is_object(json)) + return -1; + } + + return do_dump(json, flags, 0, callback, data); +} diff --git a/compat/jansson/error.c b/compat/jansson/error.c new file mode 100644 index 0000000000..a544a59fb1 --- /dev/null +++ b/compat/jansson/error.c @@ -0,0 +1,63 @@ +#include +#include "jansson_private.h" + +void jsonp_error_init(json_error_t *error, const char *source) +{ + if(error) + { + error->text[0] = '\0'; + error->line = -1; + error->column = -1; + error->position = 0; + if(source) + jsonp_error_set_source(error, source); + else + error->source[0] = '\0'; + } +} + +void jsonp_error_set_source(json_error_t *error, const char *source) +{ + size_t length; + + if(!error || !source) + return; + + length = strlen(source); + if(length < JSON_ERROR_SOURCE_LENGTH) + strcpy(error->source, source); + else { + size_t extra = length - JSON_ERROR_SOURCE_LENGTH + 4; + strcpy(error->source, "..."); + strcpy(error->source + 3, source + extra); + } +} + +void jsonp_error_set(json_error_t *error, int line, int column, + size_t position, const char *msg, ...) 
+{ + va_list ap; + + va_start(ap, msg); + jsonp_error_vset(error, line, column, position, msg, ap); + va_end(ap); +} + +void jsonp_error_vset(json_error_t *error, int line, int column, + size_t position, const char *msg, va_list ap) +{ + if(!error) + return; + + if(error->text[0] != '\0') { + /* error already set */ + return; + } + + error->line = line; + error->column = column; + error->position = position; + + vsnprintf(error->text, JSON_ERROR_TEXT_LENGTH, msg, ap); + error->text[JSON_ERROR_TEXT_LENGTH - 1] = '\0'; +} diff --git a/compat/jansson/hashtable.c b/compat/jansson/hashtable.c index 791f9acbe2..a438bbbb53 100644 --- a/compat/jansson/hashtable.c +++ b/compat/jansson/hashtable.c @@ -1,35 +1,46 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * This library is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. */ -#include - #include +#include +#include /* for JSON_INLINE */ +#include "jansson_private.h" /* for container_of() */ #include "hashtable.h" -#ifdef WIN32 -#define inline __inline -#endif - typedef struct hashtable_list list_t; typedef struct hashtable_pair pair_t; typedef struct hashtable_bucket bucket_t; -#define container_of(ptr_, type_, member_) \ - ((type_ *)((char *)ptr_ - (size_t)&((type_ *)0)->member_)) - #define list_to_pair(list_) container_of(list_, pair_t, list) -static inline void list_init(list_t *list) +/* From http://www.cse.yorku.ca/~oz/hash.html */ +static size_t hash_str(const void *ptr) +{ + const char *str = (const char *)ptr; + + size_t hash = 5381; + size_t c; + + while((c = (size_t)*str)) + { + hash = ((hash << 5) + hash) + c; + str++; + } + + return hash; +} + +static JSON_INLINE void list_init(list_t *list) { list->next = list; list->prev = list; } -static inline void list_insert(list_t *list, list_t *node) +static JSON_INLINE void list_insert(list_t *list, list_t *node) { node->next = list; node->prev = list->prev; @@ -37,13 +48,13 @@ static inline void list_insert(list_t *list, list_t *node) list->prev = node; } -static inline void list_remove(list_t *list) +static JSON_INLINE void list_remove(list_t *list) { list->prev->next = list->next; list->next->prev = list->prev; } -static inline int bucket_is_empty(hashtable_t *hashtable, bucket_t *bucket) +static JSON_INLINE int bucket_is_empty(hashtable_t *hashtable, bucket_t *bucket) { return bucket->first == &hashtable->list && bucket->first == bucket->last; } @@ -63,22 +74,21 @@ static void insert_to_bucket(hashtable_t *hashtable, bucket_t *bucket, } } -static unsigned int primes[] = { +static const size_t primes[] = { 5, 13, 23, 53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433, 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319, 201326611, 402653189, 805306457, 1610612741 }; -static const unsigned int num_primes = sizeof(primes) / sizeof(unsigned int); -static inline unsigned int num_buckets(hashtable_t *hashtable) +static JSON_INLINE size_t num_buckets(hashtable_t *hashtable) { return primes[hashtable->num_buckets]; } static pair_t *hashtable_find_pair(hashtable_t *hashtable, bucket_t *bucket, - const void *key, unsigned int hash) + const char *key, size_t hash) { list_t *list; pair_t *pair; @@ -90,7 +100,7 @@ static pair_t *hashtable_find_pair(hashtable_t *hashtable, bucket_t *bucket, while(1) { pair = list_to_pair(list); - if(pair->hash == hash && hashtable->cmp_keys(pair->key, key)) + if(pair->hash == hash && 
strcmp(pair->key, key) == 0) return pair; if(list == bucket->last) @@ -104,11 +114,11 @@ static pair_t *hashtable_find_pair(hashtable_t *hashtable, bucket_t *bucket, /* returns 0 on success, -1 if key was not found */ static int hashtable_do_del(hashtable_t *hashtable, - const void *key, unsigned int hash) + const char *key, size_t hash) { pair_t *pair; bucket_t *bucket; - unsigned int index; + size_t index; index = hash % num_buckets(hashtable); bucket = &hashtable->buckets[index]; @@ -127,13 +137,9 @@ static int hashtable_do_del(hashtable_t *hashtable, bucket->last = pair->list.prev; list_remove(&pair->list); + json_decref(pair->value); - if(hashtable->free_key) - hashtable->free_key(pair->key); - if(hashtable->free_value) - hashtable->free_value(pair->value); - - free(pair); + jsonp_free(pair); hashtable->size--; return 0; @@ -148,11 +154,8 @@ static void hashtable_do_clear(hashtable_t *hashtable) { next = list->next; pair = list_to_pair(list); - if(hashtable->free_key) - hashtable->free_key(pair->key); - if(hashtable->free_value) - hashtable->free_value(pair->value); - free(pair); + json_decref(pair->value); + jsonp_free(pair); } } @@ -160,14 +163,14 @@ static int hashtable_do_rehash(hashtable_t *hashtable) { list_t *list, *next; pair_t *pair; - unsigned int i, index, new_size; + size_t i, index, new_size; - free(hashtable->buckets); + jsonp_free(hashtable->buckets); hashtable->num_buckets++; new_size = num_buckets(hashtable); - hashtable->buckets = malloc(new_size * sizeof(bucket_t)); + hashtable->buckets = jsonp_malloc(new_size * sizeof(bucket_t)); if(!hashtable->buckets) return -1; @@ -191,47 +194,18 @@ static int hashtable_do_rehash(hashtable_t *hashtable) } -hashtable_t *hashtable_create(key_hash_fn hash_key, key_cmp_fn cmp_keys, - free_fn free_key, free_fn free_value) -{ - hashtable_t *hashtable = malloc(sizeof(hashtable_t)); - if(!hashtable) - return NULL; - - if(hashtable_init(hashtable, hash_key, cmp_keys, free_key, free_value)) - { - free(hashtable); - return NULL; - } - - return hashtable; -} - -void hashtable_destroy(hashtable_t *hashtable) -{ - hashtable_close(hashtable); - free(hashtable); -} - -int hashtable_init(hashtable_t *hashtable, - key_hash_fn hash_key, key_cmp_fn cmp_keys, - free_fn free_key, free_fn free_value) +int hashtable_init(hashtable_t *hashtable) { - unsigned int i; + size_t i; hashtable->size = 0; hashtable->num_buckets = 0; /* index to primes[] */ - hashtable->buckets = malloc(num_buckets(hashtable) * sizeof(bucket_t)); + hashtable->buckets = jsonp_malloc(num_buckets(hashtable) * sizeof(bucket_t)); if(!hashtable->buckets) return -1; list_init(&hashtable->list); - hashtable->hash_key = hash_key; - hashtable->cmp_keys = cmp_keys; - hashtable->free_key = free_key; - hashtable->free_value = free_value; - for(i = 0; i < num_buckets(hashtable); i++) { hashtable->buckets[i].first = hashtable->buckets[i].last = @@ -244,42 +218,45 @@ int hashtable_init(hashtable_t *hashtable, void hashtable_close(hashtable_t *hashtable) { hashtable_do_clear(hashtable); - free(hashtable->buckets); + jsonp_free(hashtable->buckets); } -int hashtable_set(hashtable_t *hashtable, void *key, void *value) +int hashtable_set(hashtable_t *hashtable, + const char *key, size_t serial, + json_t *value) { pair_t *pair; bucket_t *bucket; - unsigned int hash, index; + size_t hash, index; /* rehash if the load ratio exceeds 1 */ if(hashtable->size >= num_buckets(hashtable)) if(hashtable_do_rehash(hashtable)) return -1; - hash = hashtable->hash_key(key); + hash = hash_str(key); index = hash % 
num_buckets(hashtable); bucket = &hashtable->buckets[index]; pair = hashtable_find_pair(hashtable, bucket, key, hash); if(pair) { - if(hashtable->free_key) - hashtable->free_key(key); - if(hashtable->free_value) - hashtable->free_value(pair->value); + json_decref(pair->value); pair->value = value; } else { - pair = malloc(sizeof(pair_t)); + /* offsetof(...) returns the size of pair_t without the last, + flexible member. This way, the correct amount is + allocated. */ + pair = jsonp_malloc(offsetof(pair_t, key) + strlen(key) + 1); if(!pair) return -1; - pair->key = key; - pair->value = value; pair->hash = hash; + pair->serial = serial; + strcpy(pair->key, key); + pair->value = value; list_init(&pair->list); insert_to_bucket(hashtable, bucket, &pair->list); @@ -289,13 +266,13 @@ int hashtable_set(hashtable_t *hashtable, void *key, void *value) return 0; } -void *hashtable_get(hashtable_t *hashtable, const void *key) +void *hashtable_get(hashtable_t *hashtable, const char *key) { pair_t *pair; - unsigned int hash; + size_t hash; bucket_t *bucket; - hash = hashtable->hash_key(key); + hash = hash_str(key); bucket = &hashtable->buckets[hash % num_buckets(hashtable)]; pair = hashtable_find_pair(hashtable, bucket, key, hash); @@ -305,15 +282,15 @@ void *hashtable_get(hashtable_t *hashtable, const void *key) return pair->value; } -int hashtable_del(hashtable_t *hashtable, const void *key) +int hashtable_del(hashtable_t *hashtable, const char *key) { - unsigned int hash = hashtable->hash_key(key); + size_t hash = hash_str(key); return hashtable_do_del(hashtable, key, hash); } void hashtable_clear(hashtable_t *hashtable) { - unsigned int i; + size_t i; hashtable_do_clear(hashtable); @@ -332,13 +309,13 @@ void *hashtable_iter(hashtable_t *hashtable) return hashtable_iter_next(hashtable, &hashtable->list); } -void *hashtable_iter_at(hashtable_t *hashtable, const void *key) +void *hashtable_iter_at(hashtable_t *hashtable, const char *key) { pair_t *pair; - unsigned int hash; + size_t hash; bucket_t *bucket; - hash = hashtable->hash_key(key); + hash = hash_str(key); bucket = &hashtable->buckets[hash % num_buckets(hashtable)]; pair = hashtable_find_pair(hashtable, bucket, key, hash); @@ -362,18 +339,22 @@ void *hashtable_iter_key(void *iter) return pair->key; } +size_t hashtable_iter_serial(void *iter) +{ + pair_t *pair = list_to_pair((list_t *)iter); + return pair->serial; +} + void *hashtable_iter_value(void *iter) { pair_t *pair = list_to_pair((list_t *)iter); return pair->value; } -void hashtable_iter_set(hashtable_t *hashtable, void *iter, void *value) +void hashtable_iter_set(void *iter, json_t *value) { pair_t *pair = list_to_pair((list_t *)iter); - if(hashtable->free_value) - hashtable->free_value(pair->value); - + json_decref(pair->value); pair->value = value; } diff --git a/compat/jansson/hashtable.h b/compat/jansson/hashtable.h index 52f8549281..3c9fdc22a4 100644 --- a/compat/jansson/hashtable.h +++ b/compat/jansson/hashtable.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * This library is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. 
@@ -8,20 +8,20 @@ #ifndef HASHTABLE_H #define HASHTABLE_H -typedef unsigned int (*key_hash_fn)(const void *key); -typedef int (*key_cmp_fn)(const void *key1, const void *key2); -typedef void (*free_fn)(void *key); - struct hashtable_list { struct hashtable_list *prev; struct hashtable_list *next; }; +/* "pair" may be a bit confusing a name, but think of it as a + key-value pair. In this case, it just encodes some extra data, + too */ struct hashtable_pair { - void *key; - void *value; - unsigned int hash; + size_t hash; struct hashtable_list list; + json_t *value; + size_t serial; + char key[1]; }; struct hashtable_bucket { @@ -30,60 +30,27 @@ struct hashtable_bucket { }; typedef struct hashtable { - unsigned int size; + size_t size; struct hashtable_bucket *buckets; - unsigned int num_buckets; /* index to primes[] */ + size_t num_buckets; /* index to primes[] */ struct hashtable_list list; - - key_hash_fn hash_key; - key_cmp_fn cmp_keys; /* returns non-zero for equal keys */ - free_fn free_key; - free_fn free_value; } hashtable_t; -/** - * hashtable_create - Create a hashtable object - * - * @hash_key: The key hashing function - * @cmp_keys: The key compare function. Returns non-zero for equal and - * zero for unequal unequal keys - * @free_key: If non-NULL, called for a key that is no longer referenced. - * @free_value: If non-NULL, called for a value that is no longer referenced. - * - * Returns a new hashtable object that should be freed with - * hashtable_destroy when it's no longer used, or NULL on failure (out - * of memory). - */ -hashtable_t *hashtable_create(key_hash_fn hash_key, key_cmp_fn cmp_keys, - free_fn free_key, free_fn free_value); -/** - * hashtable_destroy - Destroy a hashtable object - * - * @hashtable: The hashtable - * - * Destroys a hashtable created with hashtable_create(). - */ -void hashtable_destroy(hashtable_t *hashtable); +#define hashtable_key_to_iter(key_) \ + (&(container_of(key_, struct hashtable_pair, key)->list)) /** * hashtable_init - Initialize a hashtable object * * @hashtable: The (statically allocated) hashtable object - * @hash_key: The key hashing function - * @cmp_keys: The key compare function. Returns non-zero for equal and - * zero for unequal unequal keys - * @free_key: If non-NULL, called for a key that is no longer referenced. - * @free_value: If non-NULL, called for a value that is no longer referenced. * * Initializes a statically allocated hashtable object. The object * should be cleared with hashtable_close when it's no longer used. * * Returns 0 on success, -1 on error (out of memory). */ -int hashtable_init(hashtable_t *hashtable, - key_hash_fn hash_key, key_cmp_fn cmp_keys, - free_fn free_key, free_fn free_value); +int hashtable_init(hashtable_t *hashtable); /** * hashtable_close - Release all resources used by a hashtable object @@ -99,20 +66,19 @@ void hashtable_close(hashtable_t *hashtable); * * @hashtable: The hashtable object * @key: The key + * @serial: For addition order of keys * @value: The value * * If a value with the given key already exists, its value is replaced - * with the new value. - * - * Key and value are "stealed" in the sense that hashtable frees them - * automatically when they are no longer used. The freeing is - * accomplished by calling free_key and free_value functions that were - * supplied to hashtable_new. In case one or both of the free - * functions is NULL, the corresponding item is not "stealed". + * with the new value. 
Value is "stealed" in the sense that hashtable + * doesn't increment its refcount but decreases the refcount when the + * value is no longer needed. * * Returns 0 on success, -1 on failure (out of memory). */ -int hashtable_set(hashtable_t *hashtable, void *key, void *value); +int hashtable_set(hashtable_t *hashtable, + const char *key, size_t serial, + json_t *value); /** * hashtable_get - Get a value associated with a key @@ -122,7 +88,7 @@ int hashtable_set(hashtable_t *hashtable, void *key, void *value); * * Returns value if it is found, or NULL otherwise. */ -void *hashtable_get(hashtable_t *hashtable, const void *key); +void *hashtable_get(hashtable_t *hashtable, const char *key); /** * hashtable_del - Remove a value from the hashtable @@ -132,7 +98,7 @@ void *hashtable_get(hashtable_t *hashtable, const void *key); * * Returns 0 on success, or -1 if the key was not found. */ -int hashtable_del(hashtable_t *hashtable, const void *key); +int hashtable_del(hashtable_t *hashtable, const char *key); /** * hashtable_clear - Clear hashtable @@ -169,7 +135,7 @@ void *hashtable_iter(hashtable_t *hashtable); * Like hashtable_iter() but returns an iterator pointing to a * specific key. */ -void *hashtable_iter_at(hashtable_t *hashtable, const void *key); +void *hashtable_iter_at(hashtable_t *hashtable, const char *key); /** * hashtable_iter_next - Advance an iterator @@ -189,6 +155,13 @@ void *hashtable_iter_next(hashtable_t *hashtable, void *iter); */ void *hashtable_iter_key(void *iter); +/** + * hashtable_iter_serial - Retrieve the serial number pointed to by an iterator + * + * @iter: The iterator + */ +size_t hashtable_iter_serial(void *iter); + /** * hashtable_iter_value - Retrieve the value pointed by an iterator * @@ -202,6 +175,6 @@ void *hashtable_iter_value(void *iter); * @iter: The iterator * @value: The value to set */ -void hashtable_iter_set(hashtable_t *hashtable, void *iter, void *value); +void hashtable_iter_set(void *iter, json_t *value); #endif diff --git a/compat/jansson/jansson.h b/compat/jansson/jansson.h index 781896ce9f..0a072ddb4d 100644 --- a/compat/jansson/jansson.h +++ b/compat/jansson/jansson.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * Jansson is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. @@ -9,23 +9,31 @@ #define JANSSON_H #include +#include /* for size_t */ +#include -#ifdef WIN32 -#define snprintf(...) _snprintf(__VA_ARGS__) -#define strdup(x) _strdup(x) -#endif - -#ifdef WIN32 -#define inline __inline -#endif +#include -#ifndef __cplusplus -#define JSON_INLINE inline -#else -#define JSON_INLINE inline +#ifdef __cplusplus extern "C" { #endif +/* version */ + +#define JANSSON_MAJOR_VERSION 2 +#define JANSSON_MINOR_VERSION 6 +#define JANSSON_MICRO_VERSION 0 + +/* Micro version is omitted if it's 0 */ +#define JANSSON_VERSION "2.6" + +/* Version as a 3-byte hex number, e.g. 0x010201 == 1.2.1. Use this + for numeric comparisons, e.g. #if JANSSON_VERSION_HEX >= ... 
*/ +#define JANSSON_VERSION_HEX ((JANSSON_MAJOR_VERSION << 16) | \ + (JANSSON_MINOR_VERSION << 8) | \ + (JANSSON_MICRO_VERSION << 0)) + + /* types */ typedef enum { @@ -39,11 +47,25 @@ typedef enum { JSON_NULL } json_type; -typedef struct { +typedef struct json_t { json_type type; - unsigned long refcount; + size_t refcount; } json_t; +#ifndef JANSSON_USING_CMAKE /* disabled if using cmake */ +#if JSON_INTEGER_IS_LONG_LONG +#ifdef _WIN32 +#define JSON_INTEGER_FORMAT "I64d" +#else +#define JSON_INTEGER_FORMAT "lld" +#endif +typedef long long json_int_t; +#else +#define JSON_INTEGER_FORMAT "ld" +typedef long json_int_t; +#endif /* JSON_INTEGER_IS_LONG_LONG */ +#endif + #define json_typeof(json) ((json)->type) #define json_is_object(json) (json && json_typeof(json) == JSON_OBJECT) #define json_is_array(json) (json && json_typeof(json) == JSON_ARRAY) @@ -62,16 +84,17 @@ json_t *json_object(void); json_t *json_array(void); json_t *json_string(const char *value); json_t *json_string_nocheck(const char *value); -json_t *json_integer(int value); +json_t *json_integer(json_int_t value); json_t *json_real(double value); json_t *json_true(void); json_t *json_false(void); +#define json_boolean(val) ((val) ? json_true() : json_false()) json_t *json_null(void); static JSON_INLINE json_t *json_incref(json_t *json) { - if(json && json->refcount != (unsigned int)-1) + if(json && json->refcount != (size_t)-1) ++json->refcount; return json; } @@ -82,27 +105,54 @@ void json_delete(json_t *json); static JSON_INLINE void json_decref(json_t *json) { - if(json && json->refcount != (unsigned int)-1 && --json->refcount == 0) + if(json && json->refcount != (size_t)-1 && --json->refcount == 0) json_delete(json); } +/* error reporting */ + +#define JSON_ERROR_TEXT_LENGTH 160 +#define JSON_ERROR_SOURCE_LENGTH 80 + +typedef struct { + int line; + int column; + int position; + char source[JSON_ERROR_SOURCE_LENGTH]; + char text[JSON_ERROR_TEXT_LENGTH]; +} json_error_t; + + /* getters, setters, manipulation */ -unsigned int json_object_size(const json_t *object); +size_t json_object_size(const json_t *object); json_t *json_object_get(const json_t *object, const char *key); int json_object_set_new(json_t *object, const char *key, json_t *value); int json_object_set_new_nocheck(json_t *object, const char *key, json_t *value); int json_object_del(json_t *object, const char *key); int json_object_clear(json_t *object); int json_object_update(json_t *object, json_t *other); +int json_object_update_existing(json_t *object, json_t *other); +int json_object_update_missing(json_t *object, json_t *other); void *json_object_iter(json_t *object); void *json_object_iter_at(json_t *object, const char *key); +void *json_object_key_to_iter(const char *key); void *json_object_iter_next(json_t *object, void *iter); const char *json_object_iter_key(void *iter); json_t *json_object_iter_value(void *iter); int json_object_iter_set_new(json_t *object, void *iter, json_t *value); +#define json_object_foreach(object, key, value) \ + for(key = json_object_iter_key(json_object_iter(object)); \ + key && (value = json_object_iter_value(json_object_key_to_iter(key))); \ + key = json_object_iter_key(json_object_iter_next(object, json_object_key_to_iter(key)))) + +#define json_array_foreach(array, index, value) \ + for(index = 0; \ + index < json_array_size(array) && (value = json_array_get(array, index)); \ + index++) + static JSON_INLINE int json_object_set(json_t *object, const char *key, json_t *value) { @@ -115,25 +165,25 @@ int 
json_object_set_nocheck(json_t *object, const char *key, json_t *value) return json_object_set_new_nocheck(object, key, json_incref(value)); } -static inline +static JSON_INLINE int json_object_iter_set(json_t *object, void *iter, json_t *value) { return json_object_iter_set_new(object, iter, json_incref(value)); } -unsigned int json_array_size(const json_t *array); -json_t *json_array_get(const json_t *array, unsigned int index); -int json_array_set_new(json_t *array, unsigned int index, json_t *value); +size_t json_array_size(const json_t *array); +json_t *json_array_get(const json_t *array, size_t index); +int json_array_set_new(json_t *array, size_t index, json_t *value); int json_array_append_new(json_t *array, json_t *value); -int json_array_insert_new(json_t *array, unsigned int index, json_t *value); -int json_array_remove(json_t *array, unsigned int index); +int json_array_insert_new(json_t *array, size_t index, json_t *value); +int json_array_remove(json_t *array, size_t index); int json_array_clear(json_t *array); int json_array_extend(json_t *array, json_t *other); static JSON_INLINE -int json_array_set(json_t *array, unsigned int index, json_t *value) +int json_array_set(json_t *array, size_t ind, json_t *value) { - return json_array_set_new(array, index, json_incref(value)); + return json_array_set_new(array, ind, json_incref(value)); } static JSON_INLINE @@ -143,22 +193,36 @@ int json_array_append(json_t *array, json_t *value) } static JSON_INLINE -int json_array_insert(json_t *array, unsigned int index, json_t *value) +int json_array_insert(json_t *array, size_t ind, json_t *value) { - return json_array_insert_new(array, index, json_incref(value)); + return json_array_insert_new(array, ind, json_incref(value)); } const char *json_string_value(const json_t *string); -int json_integer_value(const json_t *integer); +json_int_t json_integer_value(const json_t *integer); double json_real_value(const json_t *real); double json_number_value(const json_t *json); int json_string_set(json_t *string, const char *value); int json_string_set_nocheck(json_t *string, const char *value); -int json_integer_set(json_t *integer, int value); +int json_integer_set(json_t *integer, json_int_t value); int json_real_set(json_t *real, double value); +/* pack, unpack */ + +json_t *json_pack(const char *fmt, ...); +json_t *json_pack_ex(json_error_t *error, size_t flags, const char *fmt, ...); +json_t *json_vpack_ex(json_error_t *error, size_t flags, const char *fmt, va_list ap); + +#define JSON_VALIDATE_ONLY 0x1 +#define JSON_STRICT 0x2 + +int json_unpack(json_t *root, const char *fmt, ...); +int json_unpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, ...); +int json_vunpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, va_list ap); + + /* equality */ int json_equal(json_t *value1, json_t *value2); @@ -167,31 +231,48 @@ int json_equal(json_t *value1, json_t *value2); /* copying */ json_t *json_copy(json_t *value); -json_t *json_deep_copy(json_t *value); +json_t *json_deep_copy(const json_t *value); -/* loading, printing */ +/* decoding */ -#define JSON_ERROR_TEXT_LENGTH 160 +#define JSON_REJECT_DUPLICATES 0x1 +#define JSON_DISABLE_EOF_CHECK 0x2 +#define JSON_DECODE_ANY 0x4 +#define JSON_DECODE_INT_AS_REAL 0x8 -typedef struct { - char text[JSON_ERROR_TEXT_LENGTH]; - int line; -} json_error_t; +typedef size_t (*json_load_callback_t)(void *buffer, size_t buflen, void *data); + +json_t *json_loads(const char *input, size_t flags, json_error_t *error); 
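The decoding entry points above now take a size_t flags argument and report failures through the richer json_error_t instead of the old two-field struct. As a minimal sketch of the 2.6-style calling convention (not part of this patch; the input string and key names are purely illustrative):

#include <stdio.h>
#include "jansson.h"

/* Sketch: parse a string with the new flags/json_error_t signature and
   walk the result with the json_object_foreach macro declared above. */
int main(void)
{
    json_error_t error;
    json_t *root, *value;
    const char *key;

    root = json_loads("{\"threads\": 4, \"algo\": \"blake\"}",
                      JSON_REJECT_DUPLICATES, &error);
    if(!root) {
        /* json_error_t now carries line, column, position and text */
        fprintf(stderr, "parse error: %s (line %d, column %d)\n",
                error.text, error.line, error.column);
        return 1;
    }

    json_object_foreach(root, key, value) {
        if(json_is_integer(value))
            printf("%s = %" JSON_INTEGER_FORMAT "\n",
                   key, json_integer_value(value));
        else if(json_is_string(value))
            printf("%s = %s\n", key, json_string_value(value));
    }

    json_decref(root);
    return 0;
}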
+json_t *json_loadb(const char *buffer, size_t buflen, size_t flags, json_error_t *error); +json_t *json_loadf(FILE *input, size_t flags, json_error_t *error); +json_t *json_load_file(const char *path, size_t flags, json_error_t *error); +json_t *json_load_callback(json_load_callback_t callback, void *data, size_t flags, json_error_t *error); + + +/* encoding */ + +#define JSON_INDENT(n) (n & 0x1F) +#define JSON_COMPACT 0x20 +#define JSON_ENSURE_ASCII 0x40 +#define JSON_SORT_KEYS 0x80 +#define JSON_PRESERVE_ORDER 0x100 +#define JSON_ENCODE_ANY 0x200 +#define JSON_ESCAPE_SLASH 0x400 + +typedef int (*json_dump_callback_t)(const char *buffer, size_t size, void *data); + +char *json_dumps(const json_t *json, size_t flags); +int json_dumpf(const json_t *json, FILE *output, size_t flags); +int json_dump_file(const json_t *json, const char *path, size_t flags); +int json_dump_callback(const json_t *json, json_dump_callback_t callback, void *data, size_t flags); -json_t *json_loads(const char *input, json_error_t *error); -json_t *json_loadf(FILE *input, json_error_t *error); -json_t *json_load_file(const char *path, json_error_t *error); +/* custom memory allocation */ -#define JSON_INDENT(n) (n & 0xFF) -#define JSON_COMPACT 0x100 -#define JSON_ENSURE_ASCII 0x200 -#define JSON_SORT_KEYS 0x400 -#define JSON_PRESERVE_ORDER 0x800 +typedef void *(*json_malloc_t)(size_t); +typedef void (*json_free_t)(void *); -char *json_dumps(const json_t *json, unsigned long flags); -int json_dumpf(const json_t *json, FILE *output, unsigned long flags); -int json_dump_file(const json_t *json, const char *path, unsigned long flags); +void json_set_alloc_funcs(json_malloc_t malloc_fn, json_free_t free_fn); #ifdef __cplusplus } diff --git a/compat/jansson/jansson_config.h b/compat/jansson/jansson_config.h new file mode 100644 index 0000000000..0f3c7ca654 --- /dev/null +++ b/compat/jansson/jansson_config.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010-2013 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + * + * + * This file specifies a part of the site-specific configuration for + * Jansson, namely those things that affect the public API in + * jansson.h. + * + * The configure script copies this file to jansson_config.h and + * replaces @var@ substitutions by values that fit your system. If you + * cannot run the configure script, you can do the value substitution + * by hand. + */ + +#ifndef JANSSON_CONFIG_H +#define JANSSON_CONFIG_H +/* If your compiler supports the inline keyword in C, JSON_INLINE is + defined to `inline', otherwise empty. In C++, the inline is always + supported. */ + +#ifdef _MSC_VER +#define inline __inline +#if !defined(HAVE_STRUCT_TIMESPEC) && _MSC_VER >= 1900 +#define HAVE_STRUCT_TIMESPEC +#endif +#endif + +#ifdef __cplusplus +#define JSON_INLINE inline +#else +#define JSON_INLINE inline +#endif + +/* If your compiler supports the `long long` type and the strtoll() + library function, JSON_INTEGER_IS_LONG_LONG is defined to 1, + otherwise to 0. */ +#define JSON_INTEGER_IS_LONG_LONG 1 + +/* If locale.h and localeconv() are available, define to 1, + otherwise to 0. 
*/ +#define JSON_HAVE_LOCALECONV 1 + +#endif diff --git a/compat/jansson/jansson_config.h.in b/compat/jansson/jansson_config.h.in new file mode 100644 index 0000000000..12580a0a8c --- /dev/null +++ b/compat/jansson/jansson_config.h.in @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2010-2014 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + * + * + * This file specifies a part of the site-specific configuration for + * Jansson, namely those things that affect the public API in + * jansson.h. + * + * The configure script copies this file to jansson_config.h and + * replaces @var@ substitutions by values that fit your system. If you + * cannot run the configure script, you can do the value substitution + * by hand. + */ + +#ifndef JANSSON_CONFIG_H +#define JANSSON_CONFIG_H + +/* If your compiler supports the inline keyword in C, JSON_INLINE is + defined to `inline', otherwise empty. In C++, the inline is always + supported. */ +#ifdef __cplusplus +#define JSON_INLINE inline +#else +#define JSON_INLINE @json_inline@ +#endif + +/* If your compiler supports the `long long` type and the strtoll() + library function, JSON_INTEGER_IS_LONG_LONG is defined to 1, + otherwise to 0. */ +#define JSON_INTEGER_IS_LONG_LONG @json_have_long_long@ + +/* If locale.h and localeconv() are available, define to 1, + otherwise to 0. */ +#define JSON_HAVE_LOCALECONV @json_have_localeconv@ + +#endif diff --git a/compat/jansson/jansson_private.h b/compat/jansson/jansson_private.h index 3a3ed75480..b47d4041a0 100644 --- a/compat/jansson/jansson_private.h +++ b/compat/jansson/jansson_private.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * Jansson is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. @@ -8,23 +8,40 @@ #ifndef JANSSON_PRIVATE_H #define JANSSON_PRIVATE_H +#include #include "jansson.h" #include "hashtable.h" +#include "strbuffer.h" #define container_of(ptr_, type_, member_) \ - ((type_ *)((char *)ptr_ - (size_t)&((type_ *)0)->member_)) + ((type_ *)((char *)ptr_ - offsetof(type_, member_))) + +/* On some platforms, max() may already be defined */ +#ifndef max +#define max(a, b) ((a) > (b) ? (a) : (b)) +#endif + +/* va_copy is a C99 feature. In C89 implementations, it's sometimes + available as __va_copy. If not, memcpy() should do the trick. 
*/ +#ifndef va_copy +#ifdef __va_copy +#define va_copy __va_copy +#else +#define va_copy(a, b) memcpy(&(a), &(b), sizeof(va_list)) +#endif +#endif typedef struct { json_t json; hashtable_t hashtable; - unsigned long serial; + size_t serial; int visited; } json_object_t; typedef struct { json_t json; - unsigned int size; - unsigned int entries; + size_t size; + size_t entries; json_t **table; int visited; } json_array_t; @@ -41,7 +58,7 @@ typedef struct { typedef struct { json_t json; - int value; + json_int_t value; } json_integer_t; #define json_to_object(json_) container_of(json_, json_object_t, json) @@ -50,11 +67,27 @@ typedef struct { #define json_to_real(json_) container_of(json_, json_real_t, json) #define json_to_integer(json_) container_of(json_, json_integer_t, json) -typedef struct { - unsigned long serial; - char key[]; -} object_key_t; +void jsonp_error_init(json_error_t *error, const char *source); +void jsonp_error_set_source(json_error_t *error, const char *source); +void jsonp_error_set(json_error_t *error, int line, int column, + size_t position, const char *msg, ...); +void jsonp_error_vset(json_error_t *error, int line, int column, + size_t position, const char *msg, va_list ap); + +/* Locale independent string<->double conversions */ +int jsonp_strtod(strbuffer_t *strbuffer, double *out); +int jsonp_dtostr(char *buffer, size_t size, double value); -const object_key_t *jsonp_object_iter_fullkey(void *iter); +/* Wrappers for custom memory functions */ +void* jsonp_malloc(size_t size); +void jsonp_free(void *ptr); +char *jsonp_strndup(const char *str, size_t length); +char *jsonp_strdup(const char *str); + +/* Windows compatibility */ +#ifdef _WIN32 +#define snprintf _snprintf +#define vsnprintf _vsnprintf +#endif #endif diff --git a/compat/jansson/jansson_private_config.h b/compat/jansson/jansson_private_config.h new file mode 100644 index 0000000000..2eb70a815b --- /dev/null +++ b/compat/jansson/jansson_private_config.h @@ -0,0 +1,140 @@ +/* jansson_private_config.h. Generated from jansson_private_config.h.in by configure. */ +/* jansson_private_config.h.in. Generated from configure.ac by autoheader. */ + +/* Define to 1 if gcc's __atomic builtins are available */ +/* #undef HAVE_ATOMIC_BUILTINS */ + +/* Define to 1 if you have the `close' function. */ +#define HAVE_CLOSE 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_DLFCN_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_ENDIAN_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the `getpid' function. */ +#define HAVE_GETPID 1 + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `localeconv' function. */ +#define HAVE_LOCALECONV 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LOCALE_H 1 + +/* Define to 1 if the system has the type `long long int'. */ +#define HAVE_LONG_LONG_INT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `open' function. */ +#define HAVE_OPEN 1 + +/* Define to 1 if you have the `read' function. */ +#define HAVE_READ 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SCHED_H 1 + +/* Define to 1 if you have the `sched_yield' function. */ +/* #undef HAVE_SCHED_YIELD */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strtoll' function. */ +#define HAVE_STRTOLL 1 + +/* Define to 1 if gcc's __sync builtins are available */ +#define HAVE_SYNC_BUILTINS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#define LT_OBJDIR ".libs/" + +/* Name of package */ +#define PACKAGE "jansson" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "petri@digip.org" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "jansson" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "jansson 2.6" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "jansson" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "2.6" + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define to 1 if /dev/urandom should be used for seeding the hash function */ +#define USE_URANDOM 1 + +/* Define to 1 if CryptGenRandom should be used for seeding the hash function + */ +#define USE_WINDOWS_CRYPTOAPI 1 + +/* Version number of package */ +#define VERSION "2.6" + +/* Define for Solaris 2.5.1 so the uint32_t typedef from , + , or is not used. If the typedef were allowed, the + #define below would cause a syntax error. */ +/* #undef _UINT32_T */ + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the type of a signed integer type of width exactly 32 bits if + such a type exists and the standard includes do not define it. */ +/* #undef int32_t */ + +/* Define to the type of an unsigned integer type of width exactly 32 bits if + such a type exists and the standard includes do not define it. */ +/* #undef uint32_t */ diff --git a/compat/jansson/jansson_private_config.h.in b/compat/jansson/jansson_private_config.h.in new file mode 100644 index 0000000000..7c070847b2 --- /dev/null +++ b/compat/jansson/jansson_private_config.h.in @@ -0,0 +1,139 @@ +/* jansson_private_config.h.in. Generated from configure.ac by autoheader. */ + +/* Define to 1 if gcc's __atomic builtins are available */ +#undef HAVE_ATOMIC_BUILTINS + +/* Define to 1 if you have the `close' function. */ +#undef HAVE_CLOSE + +/* Define to 1 if you have the header file. */ +#undef HAVE_DLFCN_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_ENDIAN_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_FCNTL_H + +/* Define to 1 if you have the `getpid' function. */ +#undef HAVE_GETPID + +/* Define to 1 if you have the `gettimeofday' function. */ +#undef HAVE_GETTIMEOFDAY + +/* Define to 1 if you have the header file. 
*/ +#undef HAVE_INTTYPES_H + +/* Define to 1 if you have the `localeconv' function. */ +#undef HAVE_LOCALECONV + +/* Define to 1 if you have the header file. */ +#undef HAVE_LOCALE_H + +/* Define to 1 if the system has the type `long long int'. */ +#undef HAVE_LONG_LONG_INT + +/* Define to 1 if you have the header file. */ +#undef HAVE_MEMORY_H + +/* Define to 1 if you have the `open' function. */ +#undef HAVE_OPEN + +/* Define to 1 if you have the `read' function. */ +#undef HAVE_READ + +/* Define to 1 if you have the header file. */ +#undef HAVE_SCHED_H + +/* Define to 1 if you have the `sched_yield' function. */ +#undef HAVE_SCHED_YIELD + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDINT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STDLIB_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_STRING_H + +/* Define to 1 if you have the `strtoll' function. */ +#undef HAVE_STRTOLL + +/* Define to 1 if gcc's __sync builtins are available */ +#undef HAVE_SYNC_BUILTINS + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_PARAM_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TIME_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_SYS_TYPES_H + +/* Define to 1 if you have the header file. */ +#undef HAVE_UNISTD_H + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#undef LT_OBJDIR + +/* Name of package */ +#undef PACKAGE + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. */ +#undef PACKAGE_URL + +/* Define to the version of this package. */ +#undef PACKAGE_VERSION + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* Define to 1 if /dev/urandom should be used for seeding the hash function */ +#undef USE_URANDOM + +/* Define to 1 if CryptGenRandom should be used for seeding the hash function + */ +#undef USE_WINDOWS_CRYPTOAPI + +/* Version number of package */ +#undef VERSION + +/* Define for Solaris 2.5.1 so the uint32_t typedef from , + , or is not used. If the typedef were allowed, the + #define below would cause a syntax error. */ +#undef _UINT32_T + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +#undef inline +#endif + +/* Define to the type of a signed integer type of width exactly 32 bits if + such a type exists and the standard includes do not define it. */ +#undef int32_t + +/* Define to the type of an unsigned integer type of width exactly 32 bits if + such a type exists and the standard includes do not define it. */ +#undef uint32_t diff --git a/compat/jansson/load.c b/compat/jansson/load.c index ee56fbe733..45e21d0c1f 100644 --- a/compat/jansson/load.c +++ b/compat/jansson/load.c @@ -1,25 +1,30 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * Jansson is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. 
*/ +#ifndef _GNU_SOURCE #define _GNU_SOURCE -#include +#endif + #include #include #include #include #include -#include #include -#include +#include "jansson.h" #include "jansson_private.h" #include "strbuffer.h" #include "utf.h" +#define STREAM_STATE_OK 0 +#define STREAM_STATE_EOF -1 +#define STREAM_STATE_ERROR -2 + #define TOKEN_INVALID -1 #define TOKEN_EOF 0 #define TOKEN_STRING 256 @@ -29,113 +34,136 @@ #define TOKEN_FALSE 260 #define TOKEN_NULL 261 -/* read one byte from stream, return EOF on end of file */ +/* Locale independent versions of isxxx() functions */ +#define l_isupper(c) ('A' <= (c) && (c) <= 'Z') +#define l_islower(c) ('a' <= (c) && (c) <= 'z') +#define l_isalpha(c) (l_isupper(c) || l_islower(c)) +#define l_isdigit(c) ('0' <= (c) && (c) <= '9') +#define l_isxdigit(c) \ + (l_isdigit(c) || ('A' <= (c) && (c) <= 'F') || ('a' <= (c) && (c) <= 'f')) + +/* Read one byte from stream, convert to unsigned char, then int, and + return. return EOF on end of file. This corresponds to the + behaviour of fgetc(). */ typedef int (*get_func)(void *data); -/* return non-zero if end of file has been reached */ -typedef int (*eof_func)(void *data); - typedef struct { get_func get; - eof_func eof; void *data; - int stream_pos; char buffer[5]; - int buffer_pos; + size_t buffer_pos; + int state; + int line; + int column, last_column; + size_t position; } stream_t; - typedef struct { stream_t stream; strbuffer_t saved_text; int token; - int line, column; union { char *string; - int integer; + json_int_t integer; double real; } value; } lex_t; +#define stream_to_lex(stream) container_of(stream, lex_t, stream) -/*** error reporting ***/ -static void error_init(json_error_t *error) -{ - if(error) - { - error->text[0] = '\0'; - error->line = -1; - } -} +/*** error reporting ***/ static void error_set(json_error_t *error, const lex_t *lex, const char *msg, ...) 
{ va_list ap; - char text[JSON_ERROR_TEXT_LENGTH]; + char msg_text[JSON_ERROR_TEXT_LENGTH]; + char msg_with_context[JSON_ERROR_TEXT_LENGTH]; + + int line = -1, col = -1; + size_t pos = 0; + const char *result = msg_text; - if(!error || error->text[0] != '\0') { - /* error already set */ + if(!error) return; - } va_start(ap, msg); - vsnprintf(text, JSON_ERROR_TEXT_LENGTH, msg, ap); + vsnprintf(msg_text, JSON_ERROR_TEXT_LENGTH, msg, ap); + msg_text[JSON_ERROR_TEXT_LENGTH - 1] = '\0'; va_end(ap); if(lex) { const char *saved_text = strbuffer_value(&lex->saved_text); - error->line = lex->line; + + line = lex->stream.line; + col = lex->stream.column; + pos = lex->stream.position; + if(saved_text && saved_text[0]) { if(lex->saved_text.length <= 20) { - snprintf(error->text, JSON_ERROR_TEXT_LENGTH, - "%s near '%s'", text, saved_text); + snprintf(msg_with_context, JSON_ERROR_TEXT_LENGTH, + "%s near '%s'", msg_text, saved_text); + msg_with_context[JSON_ERROR_TEXT_LENGTH - 1] = '\0'; + result = msg_with_context; } - else - snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text); } else { - snprintf(error->text, JSON_ERROR_TEXT_LENGTH, - "%s near end of file", text); + if(lex->stream.state == STREAM_STATE_ERROR) { + /* No context for UTF-8 decoding errors */ + result = msg_text; + } + else { + snprintf(msg_with_context, JSON_ERROR_TEXT_LENGTH, + "%s near end of file", msg_text); + msg_with_context[JSON_ERROR_TEXT_LENGTH - 1] = '\0'; + result = msg_with_context; + } } } - else - { - error->line = -1; - snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text); - } + + jsonp_error_set(error, line, col, pos, "%s", result); } /*** lexical analyzer ***/ static void -stream_init(stream_t *stream, get_func get, eof_func eof, void *data) +stream_init(stream_t *stream, get_func get, void *data) { stream->get = get; - stream->eof = eof; stream->data = data; - stream->stream_pos = 0; stream->buffer[0] = '\0'; stream->buffer_pos = 0; + + stream->state = STREAM_STATE_OK; + stream->line = 1; + stream->column = 0; + stream->position = 0; } -static char stream_get(stream_t *stream, json_error_t *error) +static int stream_get(stream_t *stream, json_error_t *error) { - char c; + int c; + + if(stream->state != STREAM_STATE_OK) + return stream->state; if(!stream->buffer[stream->buffer_pos]) { - stream->buffer[0] = stream->get(stream->data); - stream->buffer_pos = 0; + c = stream->get(stream->data); + if(c == EOF) { + stream->state = STREAM_STATE_EOF; + return STREAM_STATE_EOF; + } - c = stream->buffer[0]; + stream->buffer[0] = c; + stream->buffer_pos = 0; - if((unsigned char)c >= 0x80 && c != (char)EOF) + if(0x80 <= c && c <= 0xFF) { /* multi-byte UTF-8 sequence */ int i, count; @@ -152,30 +180,47 @@ static char stream_get(stream_t *stream, json_error_t *error) if(!utf8_check_full(stream->buffer, count, NULL)) goto out; - stream->stream_pos += count; stream->buffer[count] = '\0'; } - else { + else stream->buffer[1] = '\0'; - stream->stream_pos++; - } } - return stream->buffer[stream->buffer_pos++]; + c = stream->buffer[stream->buffer_pos++]; -out: - error_set(error, NULL, "unable to decode byte 0x%x at position %d", - (unsigned char)c, stream->stream_pos); + stream->position++; + if(c == '\n') { + stream->line++; + stream->last_column = stream->column; + stream->column = 0; + } + else if(utf8_check_first(c)) { + /* track the Unicode character column, so increment only if + this is the first character of a UTF-8 sequence */ + stream->column++; + } - stream->buffer[0] = EOF; - stream->buffer[1] = '\0'; - 
stream->buffer_pos = 1; + return c; - return EOF; +out: + stream->state = STREAM_STATE_ERROR; + error_set(error, stream_to_lex(stream), "unable to decode byte 0x%x", c); + return STREAM_STATE_ERROR; } -static void stream_unget(stream_t *stream, char c) +static void stream_unget(stream_t *stream, int c) { + if(c == STREAM_STATE_EOF || c == STREAM_STATE_ERROR) + return; + + stream->position--; + if(c == '\n') { + stream->line--; + stream->column = stream->last_column; + } + else if(utf8_check_first(c)) + stream->column--; + assert(stream->buffer_pos > 0); stream->buffer_pos--; assert(stream->buffer[stream->buffer_pos] == c); @@ -187,29 +232,41 @@ static int lex_get(lex_t *lex, json_error_t *error) return stream_get(&lex->stream, error); } -static int lex_eof(lex_t *lex) -{ - return lex->stream.eof(lex->stream.data); -} - -static void lex_save(lex_t *lex, char c) +static void lex_save(lex_t *lex, int c) { strbuffer_append_byte(&lex->saved_text, c); } static int lex_get_save(lex_t *lex, json_error_t *error) { - char c = stream_get(&lex->stream, error); - lex_save(lex, c); + int c = stream_get(&lex->stream, error); + if(c != STREAM_STATE_EOF && c != STREAM_STATE_ERROR) + lex_save(lex, c); return c; } -static void lex_unget_unsave(lex_t *lex, char c) +static void lex_unget(lex_t *lex, int c) { - char d; stream_unget(&lex->stream, c); - d = strbuffer_pop(&lex->saved_text); - assert(c == d); +} + +static void lex_unget_unsave(lex_t *lex, int c) +{ + if(c != STREAM_STATE_EOF && c != STREAM_STATE_ERROR) { + /* Since we treat warnings as errors, when assertions are turned + * off the "d" variable would be set but never used. Which is + * treated as an error by GCC. + */ + #ifndef NDEBUG + char d; + #endif + stream_unget(&lex->stream, c); + #ifndef NDEBUG + d = + #endif + strbuffer_pop(&lex->saved_text); + assert(c == d); + } } static void lex_save_cached(lex_t *lex) @@ -218,6 +275,7 @@ static void lex_save_cached(lex_t *lex) { lex_save(lex, lex->stream.buffer[lex->stream.buffer_pos]); lex->stream.buffer_pos++; + lex->stream.position++; } } @@ -232,11 +290,11 @@ static int32_t decode_unicode_escape(const char *str) for(i = 1; i <= 4; i++) { char c = str[i]; value <<= 4; - if(isdigit(c)) + if(l_isdigit(c)) value += c - '0'; - else if(islower(c)) + else if(l_islower(c)) value += c - 'a' + 10; - else if(isupper(c)) + else if(l_isupper(c)) value += c - 'A' + 10; else assert(0); @@ -247,7 +305,7 @@ static int32_t decode_unicode_escape(const char *str) static void lex_scan_string(lex_t *lex, json_error_t *error) { - char c; + int c; const char *p; char *t; int i; @@ -258,14 +316,15 @@ static void lex_scan_string(lex_t *lex, json_error_t *error) c = lex_get_save(lex, error); while(c != '"') { - if(c == (char)EOF) { - lex_unget_unsave(lex, c); - if(lex_eof(lex)) - error_set(error, lex, "premature end of input"); + if(c == STREAM_STATE_ERROR) + goto out; + + else if(c == STREAM_STATE_EOF) { + error_set(error, lex, "premature end of input"); goto out; } - else if((unsigned char)c <= 0x1F) { + else if(0 <= c && c <= 0x1F) { /* control character */ lex_unget_unsave(lex, c); if(c == '\n') @@ -280,8 +339,7 @@ static void lex_scan_string(lex_t *lex, json_error_t *error) if(c == 'u') { c = lex_get_save(lex, error); for(i = 0; i < 4; i++) { - if(!isxdigit(c)) { - lex_unget_unsave(lex, c); + if(!l_isxdigit(c)) { error_set(error, lex, "invalid escape"); goto out; } @@ -292,7 +350,6 @@ static void lex_scan_string(lex_t *lex, json_error_t *error) c == 'f' || c == 'n' || c == 'r' || c == 't') c = lex_get_save(lex, error); 
else { - lex_unget_unsave(lex, c); error_set(error, lex, "invalid escape"); goto out; } @@ -308,7 +365,7 @@ static void lex_scan_string(lex_t *lex, json_error_t *error) - two \uXXXX escapes (length 12) forming an UTF-16 surrogate pair are converted to 4 bytes */ - lex->value.string = malloc(lex->saved_text.length + 1); + lex->value.string = jsonp_malloc(lex->saved_text.length + 1); if(!lex->value.string) { /* this is not very nice, since TOKEN_INVALID is returned */ goto out; @@ -398,10 +455,22 @@ static void lex_scan_string(lex_t *lex, json_error_t *error) return; out: - free(lex->value.string); + jsonp_free(lex->value.string); } -static int lex_scan_number(lex_t *lex, char c, json_error_t *error) +#ifndef JANSSON_USING_CMAKE /* disabled if using cmake */ +#if JSON_INTEGER_IS_LONG_LONG +#ifdef _MSC_VER /* Microsoft Visual Studio */ +#define json_strtoint _strtoi64 +#else +#define json_strtoint strtoll +#endif +#else +#define json_strtoint strtol +#endif +#endif + +static int lex_scan_number(lex_t *lex, int c, json_error_t *error) { const char *saved_text; char *end; @@ -414,52 +483,55 @@ static int lex_scan_number(lex_t *lex, char c, json_error_t *error) if(c == '0') { c = lex_get_save(lex, error); - if(isdigit(c)) { + if(l_isdigit(c)) { lex_unget_unsave(lex, c); goto out; } } - else if(isdigit(c)) { + else if(l_isdigit(c)) { c = lex_get_save(lex, error); - while(isdigit(c)) + while(l_isdigit(c)) c = lex_get_save(lex, error); } else { - lex_unget_unsave(lex, c); - goto out; + lex_unget_unsave(lex, c); + goto out; } if(c != '.' && c != 'E' && c != 'e') { - long value; + json_int_t value; lex_unget_unsave(lex, c); saved_text = strbuffer_value(&lex->saved_text); - value = strtol(saved_text, &end, 10); - assert(end == saved_text + lex->saved_text.length); - if((value == LONG_MAX && errno == ERANGE) || value > INT_MAX) { - error_set(error, lex, "too big integer"); - goto out; - } - else if((value == LONG_MIN && errno == ERANGE) || value < INT_MIN) { - error_set(error, lex, "too big negative integer"); + errno = 0; + value = json_strtoint(saved_text, &end, 10); + if(errno == ERANGE) { + if(value < 0) + error_set(error, lex, "too big negative integer"); + else + error_set(error, lex, "too big integer"); goto out; } + assert(end == saved_text + lex->saved_text.length); + lex->token = TOKEN_INTEGER; - lex->value.integer = (int)value; + lex->value.integer = value; return 0; } if(c == '.') { c = lex_get(lex, error); - if(!isdigit(c)) + if(!l_isdigit(c)) { + lex_unget(lex, c); goto out; + } lex_save(lex, c); c = lex_get_save(lex, error); - while(isdigit(c)) + while(l_isdigit(c)) c = lex_get_save(lex, error); } @@ -468,23 +540,19 @@ static int lex_scan_number(lex_t *lex, char c, json_error_t *error) if(c == '+' || c == '-') c = lex_get_save(lex, error); - if(!isdigit(c)) { + if(!l_isdigit(c)) { lex_unget_unsave(lex, c); goto out; } c = lex_get_save(lex, error); - while(isdigit(c)) + while(l_isdigit(c)) c = lex_get_save(lex, error); } lex_unget_unsave(lex, c); - saved_text = strbuffer_value(&lex->saved_text); - value = strtod(saved_text, &end); - assert(end == saved_text + lex->saved_text.length); - - if(errno == ERANGE && value != 0) { + if(jsonp_strtod(&lex->saved_text, &value)) { error_set(error, lex, "real number overflow"); goto out; } @@ -499,29 +567,26 @@ static int lex_scan_number(lex_t *lex, char c, json_error_t *error) static int lex_scan(lex_t *lex, json_error_t *error) { - char c; + int c; strbuffer_clear(&lex->saved_text); if(lex->token == TOKEN_STRING) { - free(lex->value.string); + 
jsonp_free(lex->value.string); lex->value.string = NULL; } c = lex_get(lex, error); while(c == ' ' || c == '\t' || c == '\n' || c == '\r') - { - if(c == '\n') - lex->line++; - c = lex_get(lex, error); + + if(c == STREAM_STATE_EOF) { + lex->token = TOKEN_EOF; + goto out; } - if(c == (char)EOF) { - if(lex_eof(lex)) - lex->token = TOKEN_EOF; - else - lex->token = TOKEN_INVALID; + if(c == STREAM_STATE_ERROR) { + lex->token = TOKEN_INVALID; goto out; } @@ -533,17 +598,17 @@ static int lex_scan(lex_t *lex, json_error_t *error) else if(c == '"') lex_scan_string(lex, error); - else if(isdigit(c) || c == '-') { + else if(l_isdigit(c) || c == '-') { if(lex_scan_number(lex, c, error)) goto out; } - else if(isupper(c) || islower(c)) { + else if(l_isalpha(c)) { /* eat up the whole identifier for clearer error messages */ const char *saved_text; c = lex_get_save(lex, error); - while(isupper(c) || islower(c)) + while(l_isalpha(c)) c = lex_get_save(lex, error); lex_unget_unsave(lex, c); @@ -581,31 +646,29 @@ static char *lex_steal_string(lex_t *lex) return result; } -static int lex_init(lex_t *lex, get_func get, eof_func eof, void *data) +static int lex_init(lex_t *lex, get_func get, void *data) { - stream_init(&lex->stream, get, eof, data); + stream_init(&lex->stream, get, data); if(strbuffer_init(&lex->saved_text)) return -1; lex->token = TOKEN_INVALID; - lex->line = 1; - return 0; } static void lex_close(lex_t *lex) { if(lex->token == TOKEN_STRING) - free(lex->value.string); + jsonp_free(lex->value.string); strbuffer_close(&lex->saved_text); } /*** parser ***/ -static json_t *parse_value(lex_t *lex, json_error_t *error); +static json_t *parse_value(lex_t *lex, size_t flags, json_error_t *error); -static json_t *parse_object(lex_t *lex, json_error_t *error) +static json_t *parse_object(lex_t *lex, size_t flags, json_error_t *error) { json_t *object = json_object(); if(!object) @@ -628,28 +691,36 @@ static json_t *parse_object(lex_t *lex, json_error_t *error) if(!key) return NULL; + if(flags & JSON_REJECT_DUPLICATES) { + if(json_object_get(object, key)) { + jsonp_free(key); + error_set(error, lex, "duplicate object key"); + goto error; + } + } + lex_scan(lex, error); if(lex->token != ':') { - free(key); + jsonp_free(key); error_set(error, lex, "':' expected"); goto error; } lex_scan(lex, error); - value = parse_value(lex, error); + value = parse_value(lex, flags, error); if(!value) { - free(key); + jsonp_free(key); goto error; } if(json_object_set_nocheck(object, key, value)) { - free(key); + jsonp_free(key); json_decref(value); goto error; } json_decref(value); - free(key); + jsonp_free(key); lex_scan(lex, error); if(lex->token != ',') @@ -670,7 +741,7 @@ static json_t *parse_object(lex_t *lex, json_error_t *error) return NULL; } -static json_t *parse_array(lex_t *lex, json_error_t *error) +static json_t *parse_array(lex_t *lex, size_t flags, json_error_t *error) { json_t *array = json_array(); if(!array) @@ -681,7 +752,7 @@ static json_t *parse_array(lex_t *lex, json_error_t *error) return array; while(lex->token) { - json_t *elem = parse_value(lex, error); + json_t *elem = parse_value(lex, flags, error); if(!elem) goto error; @@ -710,9 +781,10 @@ static json_t *parse_array(lex_t *lex, json_error_t *error) return NULL; } -static json_t *parse_value(lex_t *lex, json_error_t *error) +static json_t *parse_value(lex_t *lex, size_t flags, json_error_t *error) { json_t *json; + double value; switch(lex->token) { case TOKEN_STRING: { @@ -721,7 +793,15 @@ static json_t *parse_value(lex_t *lex, json_error_t 
*error) } case TOKEN_INTEGER: { - json = json_integer(lex->value.integer); + if (flags & JSON_DECODE_INT_AS_REAL) { + if(jsonp_strtod(&lex->saved_text, &value)) { + error_set(error, lex, "real number overflow"); + return NULL; + } + json = json_real(value); + } else { + json = json_integer(lex->value.integer); + } break; } @@ -743,11 +823,11 @@ static json_t *parse_value(lex_t *lex, json_error_t *error) break; case '{': - json = parse_object(lex, error); + json = parse_object(lex, flags, error); break; case '[': - json = parse_array(lex, error); + json = parse_array(lex, flags, error); break; case TOKEN_INVALID: @@ -765,17 +845,37 @@ static json_t *parse_value(lex_t *lex, json_error_t *error) return json; } -static json_t *parse_json(lex_t *lex, json_error_t *error) +static json_t *parse_json(lex_t *lex, size_t flags, json_error_t *error) { - error_init(error); + json_t *result; lex_scan(lex, error); - if(lex->token != '[' && lex->token != '{') { - error_set(error, lex, "'[' or '{' expected"); + if(!(flags & JSON_DECODE_ANY)) { + if(lex->token != '[' && lex->token != '{') { + error_set(error, lex, "'[' or '{' expected"); + return NULL; + } + } + + result = parse_value(lex, flags, error); + if(!result) return NULL; + + if(!(flags & JSON_DISABLE_EOF_CHECK)) { + lex_scan(lex, error); + if(lex->token != TOKEN_EOF) { + error_set(error, lex, "end of file expected"); + json_decref(result); + return NULL; + } + } + + if(error) { + /* Save the position even though there was no error */ + error->position = lex->stream.position; } - return parse_value(lex, error); + return result; } typedef struct @@ -794,77 +894,120 @@ static int string_get(void *data) else { stream->pos++; - return c; + return (unsigned char)c; } } -static int string_eof(void *data) +json_t *json_loads(const char *string, size_t flags, json_error_t *error) { - string_data_t *stream = (string_data_t *)data; - return (stream->data[stream->pos] == '\0'); + lex_t lex; + json_t *result; + string_data_t stream_data; + + jsonp_error_init(error, ""); + + if (string == NULL) { + error_set(error, NULL, "wrong arguments"); + return NULL; + } + + stream_data.data = string; + stream_data.pos = 0; + + if(lex_init(&lex, string_get, (void *)&stream_data)) + return NULL; + + result = parse_json(&lex, flags, error); + + lex_close(&lex); + return result; +} + +typedef struct +{ + const char *data; + size_t len; + size_t pos; +} buffer_data_t; + +static int buffer_get(void *data) +{ + char c; + buffer_data_t *stream = data; + if(stream->pos >= stream->len) + return EOF; + + c = stream->data[stream->pos]; + stream->pos++; + return (unsigned char)c; } -json_t *json_loads(const char *string, json_error_t *error) +json_t *json_loadb(const char *buffer, size_t buflen, size_t flags, json_error_t *error) { lex_t lex; json_t *result; + buffer_data_t stream_data; - string_data_t stream_data = { - string, - 0 - }; + jsonp_error_init(error, ""); - if(lex_init(&lex, string_get, string_eof, (void *)&stream_data)) + if (buffer == NULL) { + error_set(error, NULL, "wrong arguments"); return NULL; + } - result = parse_json(&lex, error); - if(!result) - goto out; + stream_data.data = buffer; + stream_data.pos = 0; + stream_data.len = buflen; - lex_scan(&lex, error); - if(lex.token != TOKEN_EOF) { - error_set(error, &lex, "end of file expected"); - json_decref(result); - result = NULL; - } + if(lex_init(&lex, buffer_get, (void *)&stream_data)) + return NULL; + + result = parse_json(&lex, flags, error); -out: lex_close(&lex); return result; } -json_t *json_loadf(FILE 
*input, json_error_t *error) +json_t *json_loadf(FILE *input, size_t flags, json_error_t *error) { lex_t lex; + const char *source; json_t *result; - if(lex_init(&lex, (get_func)fgetc, (eof_func)feof, input)) - return NULL; + if(input == stdin) + source = ""; + else + source = ""; - result = parse_json(&lex, error); - if(!result) - goto out; + jsonp_error_init(error, source); - lex_scan(&lex, error); - if(lex.token != TOKEN_EOF) { - error_set(error, &lex, "end of file expected"); - json_decref(result); - result = NULL; + if (input == NULL) { + error_set(error, NULL, "wrong arguments"); + return NULL; } -out: + if(lex_init(&lex, (get_func)fgetc, input)) + return NULL; + + result = parse_json(&lex, flags, error); + lex_close(&lex); return result; } -json_t *json_load_file(const char *path, json_error_t *error) +json_t *json_load_file(const char *path, size_t flags, json_error_t *error) { json_t *result; FILE *fp; - error_init(error); + jsonp_error_init(error, path); + + if (path == NULL) { + error_set(error, NULL, "wrong arguments"); + return NULL; + } - fp = fopen(path, "r"); + fp = fopen(path, "rb"); if(!fp) { error_set(error, NULL, "unable to open %s: %s", @@ -872,8 +1015,63 @@ json_t *json_load_file(const char *path, json_error_t *error) return NULL; } - result = json_loadf(fp, error); + result = json_loadf(fp, flags, error); fclose(fp); return result; } + +#define MAX_BUF_LEN 1024 + +typedef struct +{ + char data[MAX_BUF_LEN]; + size_t len; + size_t pos; + json_load_callback_t callback; + void *arg; +} callback_data_t; + +static int callback_get(void *data) +{ + char c; + callback_data_t *stream = data; + + if(stream->pos >= stream->len) { + stream->pos = 0; + stream->len = stream->callback(stream->data, MAX_BUF_LEN, stream->arg); + if(stream->len == 0 || stream->len == (size_t)-1) + return EOF; + } + + c = stream->data[stream->pos]; + stream->pos++; + return (unsigned char)c; +} + +json_t *json_load_callback(json_load_callback_t callback, void *arg, size_t flags, json_error_t *error) +{ + lex_t lex; + json_t *result; + + callback_data_t stream_data; + + memset(&stream_data, 0, sizeof(stream_data)); + stream_data.callback = callback; + stream_data.arg = arg; + + jsonp_error_init(error, ""); + + if (callback == NULL) { + error_set(error, NULL, "wrong arguments"); + return NULL; + } + + if(lex_init(&lex, (get_func)callback_get, &stream_data)) + return NULL; + + result = parse_json(&lex, flags, error); + + lex_close(&lex); + return result; +} diff --git a/compat/jansson/memory.c b/compat/jansson/memory.c new file mode 100644 index 0000000000..eb6cec542e --- /dev/null +++ b/compat/jansson/memory.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2009-2013 Petri Lehtinen + * Copyright (c) 2011-2012 Basile Starynkevitch + * + * Jansson is free software; you can redistribute it and/or modify it + * under the terms of the MIT license. See LICENSE for details. 
+ */ + +#include +#include + +#include "jansson.h" +#include "jansson_private.h" + +/* memory function pointers */ +static json_malloc_t do_malloc = malloc; +static json_free_t do_free = free; + +void *jsonp_malloc(size_t size) +{ + if(!size) + return NULL; + + return (*do_malloc)(size); +} + +void jsonp_free(void *ptr) +{ + if(!ptr) + return; + + (*do_free)(ptr); +} + +char *jsonp_strdup(const char *str) +{ + char *new_str; + size_t len; + + len = strlen(str); + if(len == (size_t)-1) + return NULL; + + new_str = jsonp_malloc(len + 1); + if(!new_str) + return NULL; + + memcpy(new_str, str, len + 1); + return new_str; +} + +void json_set_alloc_funcs(json_malloc_t malloc_fn, json_free_t free_fn) +{ + do_malloc = malloc_fn; + do_free = free_fn; +} diff --git a/compat/jansson/pack_unpack.c b/compat/jansson/pack_unpack.c new file mode 100644 index 0000000000..0d932f791d --- /dev/null +++ b/compat/jansson/pack_unpack.c @@ -0,0 +1,762 @@ +/* + * Copyright (c) 2009-2013 Petri Lehtinen + * Copyright (c) 2011-2012 Graeme Smecher + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#include +#include "jansson.h" +#include "jansson_private.h" +#include "utf.h" + +typedef struct { + int line; + int column; + size_t pos; + char token; +} token_t; + +typedef struct { + const char *start; + const char *fmt; + token_t prev_token; + token_t token; + token_t next_token; + json_error_t *error; + size_t flags; + int line; + int column; + size_t pos; +} scanner_t; + +#define token(scanner) ((scanner)->token.token) + +static const char * const type_names[] = { + "object", + "array", + "string", + "integer", + "real", + "true", + "false", + "null" +}; + +#define type_name(x) type_names[json_typeof(x)] + +static const char unpack_value_starters[] = "{[siIbfFOon"; + + +static void scanner_init(scanner_t *s, json_error_t *error, + size_t flags, const char *fmt) +{ + s->error = error; + s->flags = flags; + s->fmt = s->start = fmt; + memset(&s->prev_token, 0, sizeof(token_t)); + memset(&s->token, 0, sizeof(token_t)); + memset(&s->next_token, 0, sizeof(token_t)); + s->line = 1; + s->column = 0; + s->pos = 0; +} + +static void next_token(scanner_t *s) +{ + const char *t; + s->prev_token = s->token; + + if(s->next_token.line) { + s->token = s->next_token; + s->next_token.line = 0; + return; + } + + t = s->fmt; + s->column++; + s->pos++; + + /* skip space and ignored chars */ + while(*t == ' ' || *t == '\t' || *t == '\n' || *t == ',' || *t == ':') { + if(*t == '\n') { + s->line++; + s->column = 1; + } + else + s->column++; + + s->pos++; + t++; + } + + s->token.token = *t; + s->token.line = s->line; + s->token.column = s->column; + s->token.pos = s->pos; + + t++; + s->fmt = t; +} + +static void prev_token(scanner_t *s) +{ + s->next_token = s->token; + s->token = s->prev_token; +} + +static void set_error(scanner_t *s, const char *source, const char *fmt, ...) 
+{ + va_list ap; + va_start(ap, fmt); + + jsonp_error_vset(s->error, s->token.line, s->token.column, s->token.pos, + fmt, ap); + + jsonp_error_set_source(s->error, source); + + va_end(ap); +} + +static json_t *pack(scanner_t *s, va_list *ap); + + +/* ours will be set to 1 if jsonp_free() must be called for the result + afterwards */ +static char *read_string(scanner_t *s, va_list *ap, + const char *purpose, int *ours) +{ + char t; + strbuffer_t strbuff; + const char *str; + size_t length; + char *result; + + next_token(s); + t = token(s); + prev_token(s); + + if(t != '#' && t != '+') { + /* Optimize the simple case */ + str = va_arg(*ap, const char *); + + if(!str) { + set_error(s, "", "NULL string argument"); + return NULL; + } + + if(!utf8_check_string(str, -1)) { + set_error(s, "", "Invalid UTF-8 %s", purpose); + return NULL; + } + + *ours = 0; + return (char *)str; + } + + strbuffer_init(&strbuff); + + while(1) { + str = va_arg(*ap, const char *); + if(!str) { + set_error(s, "", "NULL string argument"); + strbuffer_close(&strbuff); + return NULL; + } + + next_token(s); + + if(token(s) == '#') { + length = va_arg(*ap, int); + } + else { + prev_token(s); + length = strlen(str); + } + + if(strbuffer_append_bytes(&strbuff, str, length) == -1) { + set_error(s, "", "Out of memory"); + strbuffer_close(&strbuff); + return NULL; + } + + next_token(s); + if(token(s) != '+') { + prev_token(s); + break; + } + } + + result = strbuffer_steal_value(&strbuff); + + if(!utf8_check_string(result, -1)) { + set_error(s, "", "Invalid UTF-8 %s", purpose); + return NULL; + } + + *ours = 1; + return result; +} + +static json_t *pack_object(scanner_t *s, va_list *ap) +{ + json_t *object = json_object(); + next_token(s); + + while(token(s) != '}') { + char *key; + int ours; + json_t *value; + + if(!token(s)) { + set_error(s, "", "Unexpected end of format string"); + goto error; + } + + if(token(s) != 's') { + set_error(s, "", "Expected format 's', got '%c'", token(s)); + goto error; + } + + key = read_string(s, ap, "object key", &ours); + if(!key) + goto error; + + next_token(s); + + value = pack(s, ap); + if(!value) + goto error; + + if(json_object_set_new_nocheck(object, key, value)) { + if(ours) + jsonp_free(key); + + set_error(s, "", "Unable to add key \"%s\"", key); + goto error; + } + + if(ours) + jsonp_free(key); + + next_token(s); + } + + return object; + +error: + json_decref(object); + return NULL; +} + +static json_t *pack_array(scanner_t *s, va_list *ap) +{ + json_t *array = json_array(); + next_token(s); + + while(token(s) != ']') { + json_t *value; + + if(!token(s)) { + set_error(s, "", "Unexpected end of format string"); + goto error; + } + + value = pack(s, ap); + if(!value) + goto error; + + if(json_array_append_new(array, value)) { + set_error(s, "", "Unable to append to array"); + goto error; + } + + next_token(s); + } + return array; + +error: + json_decref(array); + return NULL; +} + +static json_t *pack(scanner_t *s, va_list *ap) +{ + switch(token(s)) { + case '{': + return pack_object(s, ap); + + case '[': + return pack_array(s, ap); + + case 's': { /* string */ + char *str; + int ours; + json_t *result; + + str = read_string(s, ap, "string", &ours); + if(!str) + return NULL; + + result = json_string_nocheck(str); + if(ours) + jsonp_free(str); + + return result; + } + + case 'n': /* null */ + return json_null(); + + case 'b': /* boolean */ + return va_arg(*ap, int) ? 
json_true() : json_false(); + + case 'i': /* integer from int */ + return json_integer(va_arg(*ap, int)); + + case 'I': /* integer from json_int_t */ + return json_integer(va_arg(*ap, json_int_t)); + + case 'f': /* real */ + return json_real(va_arg(*ap, double)); + + case 'O': /* a json_t object; increments refcount */ + return json_incref(va_arg(*ap, json_t *)); + + case 'o': /* a json_t object; doesn't increment refcount */ + return va_arg(*ap, json_t *); + + default: + set_error(s, "", "Unexpected format character '%c'", + token(s)); + return NULL; + } +} + +static int unpack(scanner_t *s, json_t *root, va_list *ap); + +static int unpack_object(scanner_t *s, json_t *root, va_list *ap) +{ + int ret = -1; + int strict = 0; + + /* Use a set (emulated by a hashtable) to check that all object + keys are accessed. Checking that the correct number of keys + were accessed is not enough, as the same key can be unpacked + multiple times. + */ + hashtable_t key_set; + + if(hashtable_init(&key_set)) { + set_error(s, "", "Out of memory"); + return -1; + } + + if(root && !json_is_object(root)) { + set_error(s, "", "Expected object, got %s", + type_name(root)); + goto out; + } + next_token(s); + + while(token(s) != '}') { + const char *key; + json_t *value; + int opt = 0; + + if(strict != 0) { + set_error(s, "", "Expected '}' after '%c', got '%c'", + (strict == 1 ? '!' : '*'), token(s)); + goto out; + } + + if(!token(s)) { + set_error(s, "", "Unexpected end of format string"); + goto out; + } + + if(token(s) == '!' || token(s) == '*') { + strict = (token(s) == '!' ? 1 : -1); + next_token(s); + continue; + } + + if(token(s) != 's') { + set_error(s, "", "Expected format 's', got '%c'", token(s)); + goto out; + } + + key = va_arg(*ap, const char *); + if(!key) { + set_error(s, "", "NULL object key"); + goto out; + } + + next_token(s); + + if(token(s) == '?') { + opt = 1; + next_token(s); + } + + if(!root) { + /* skipping */ + value = NULL; + } + else { + value = json_object_get(root, key); + if(!value && !opt) { + set_error(s, "", "Object item not found: %s", key); + goto out; + } + } + + if(unpack(s, value, ap)) + goto out; + + hashtable_set(&key_set, key, 0, json_null()); + next_token(s); + } + + if(strict == 0 && (s->flags & JSON_STRICT)) + strict = 1; + + if(root && strict == 1 && key_set.size != json_object_size(root)) { + long diff = (long)json_object_size(root) - (long)key_set.size; + set_error(s, "", "%li object item(s) left unpacked", diff); + goto out; + } + + ret = 0; + +out: + hashtable_close(&key_set); + return ret; +} + +static int unpack_array(scanner_t *s, json_t *root, va_list *ap) +{ + size_t i = 0; + int strict = 0; + + if(root && !json_is_array(root)) { + set_error(s, "", "Expected array, got %s", type_name(root)); + return -1; + } + next_token(s); + + while(token(s) != ']') { + json_t *value; + + if(strict != 0) { + set_error(s, "", "Expected ']' after '%c', got '%c'", + (strict == 1 ? '!' : '*'), + token(s)); + return -1; + } + + if(!token(s)) { + set_error(s, "", "Unexpected end of format string"); + return -1; + } + + if(token(s) == '!' || token(s) == '*') { + strict = (token(s) == '!' ? 
1 : -1); + next_token(s); + continue; + } + + if(!strchr(unpack_value_starters, token(s))) { + set_error(s, "", "Unexpected format character '%c'", + token(s)); + return -1; + } + + if(!root) { + /* skipping */ + value = NULL; + } + else { + value = json_array_get(root, i); + if(!value) { + set_error(s, "", "Array index %lu out of range", + (unsigned long)i); + return -1; + } + } + + if(unpack(s, value, ap)) + return -1; + + next_token(s); + i++; + } + + if(strict == 0 && (s->flags & JSON_STRICT)) + strict = 1; + + if(root && strict == 1 && i != json_array_size(root)) { + long diff = (long)json_array_size(root) - (long)i; + set_error(s, "", "%li array item(s) left unpacked", diff); + return -1; + } + + return 0; +} + +static int unpack(scanner_t *s, json_t *root, va_list *ap) +{ + switch(token(s)) + { + case '{': + return unpack_object(s, root, ap); + + case '[': + return unpack_array(s, root, ap); + + case 's': + if(root && !json_is_string(root)) { + set_error(s, "", "Expected string, got %s", + type_name(root)); + return -1; + } + + if(!(s->flags & JSON_VALIDATE_ONLY)) { + const char **target; + + target = va_arg(*ap, const char **); + if(!target) { + set_error(s, "", "NULL string argument"); + return -1; + } + + if(root) + *target = json_string_value(root); + } + return 0; + + case 'i': + if(root && !json_is_integer(root)) { + set_error(s, "", "Expected integer, got %s", + type_name(root)); + return -1; + } + + if(!(s->flags & JSON_VALIDATE_ONLY)) { + int *target = va_arg(*ap, int*); + if(root) + *target = (int)json_integer_value(root); + } + + return 0; + + case 'I': + if(root && !json_is_integer(root)) { + set_error(s, "", "Expected integer, got %s", + type_name(root)); + return -1; + } + + if(!(s->flags & JSON_VALIDATE_ONLY)) { + json_int_t *target = va_arg(*ap, json_int_t*); + if(root) + *target = json_integer_value(root); + } + + return 0; + + case 'b': + if(root && !json_is_boolean(root)) { + set_error(s, "", "Expected true or false, got %s", + type_name(root)); + return -1; + } + + if(!(s->flags & JSON_VALIDATE_ONLY)) { + int *target = va_arg(*ap, int*); + if(root) + *target = json_is_true(root); + } + + return 0; + + case 'f': + if(root && !json_is_real(root)) { + set_error(s, "", "Expected real, got %s", + type_name(root)); + return -1; + } + + if(!(s->flags & JSON_VALIDATE_ONLY)) { + double *target = va_arg(*ap, double*); + if(root) + *target = json_real_value(root); + } + + return 0; + + case 'F': + if(root && !json_is_number(root)) { + set_error(s, "", "Expected real or integer, got %s", + type_name(root)); + return -1; + } + + if(!(s->flags & JSON_VALIDATE_ONLY)) { + double *target = va_arg(*ap, double*); + if(root) + *target = json_number_value(root); + } + + return 0; + + case 'O': + if(root && !(s->flags & JSON_VALIDATE_ONLY)) + json_incref(root); + /* Fall through */ + + case 'o': + if(!(s->flags & JSON_VALIDATE_ONLY)) { + json_t **target = va_arg(*ap, json_t**); + if(root) + *target = root; + } + + return 0; + + case 'n': + /* Never assign, just validate */ + if(root && !json_is_null(root)) { + set_error(s, "", "Expected null, got %s", + type_name(root)); + return -1; + } + return 0; + + default: + set_error(s, "", "Unexpected format character '%c'", + token(s)); + return -1; + } +} + +json_t *json_vpack_ex(json_error_t *error, size_t flags, + const char *fmt, va_list ap) +{ + scanner_t s; + va_list ap_copy; + json_t *value; + + if(!fmt || !*fmt) { + jsonp_error_init(error, ""); + jsonp_error_set(error, -1, -1, 0, "NULL or empty format string"); + return NULL; + } + 
jsonp_error_init(error, NULL); + + scanner_init(&s, error, flags, fmt); + next_token(&s); + + va_copy(ap_copy, ap); + value = pack(&s, &ap_copy); + va_end(ap_copy); + + if(!value) + return NULL; + + next_token(&s); + if(token(&s)) { + json_decref(value); + set_error(&s, "", "Garbage after format string"); + return NULL; + } + + return value; +} + +json_t *json_pack_ex(json_error_t *error, size_t flags, const char *fmt, ...) +{ + json_t *value; + va_list ap; + + va_start(ap, fmt); + value = json_vpack_ex(error, flags, fmt, ap); + va_end(ap); + + return value; +} + +json_t *json_pack(const char *fmt, ...) +{ + json_t *value; + va_list ap; + + va_start(ap, fmt); + value = json_vpack_ex(NULL, 0, fmt, ap); + va_end(ap); + + return value; +} + +int json_vunpack_ex(json_t *root, json_error_t *error, size_t flags, + const char *fmt, va_list ap) +{ + scanner_t s; + va_list ap_copy; + + if(!root) { + jsonp_error_init(error, ""); + jsonp_error_set(error, -1, -1, 0, "NULL root value"); + return -1; + } + + if(!fmt || !*fmt) { + jsonp_error_init(error, ""); + jsonp_error_set(error, -1, -1, 0, "NULL or empty format string"); + return -1; + } + jsonp_error_init(error, NULL); + + scanner_init(&s, error, flags, fmt); + next_token(&s); + + va_copy(ap_copy, ap); + if(unpack(&s, root, &ap_copy)) { + va_end(ap_copy); + return -1; + } + va_end(ap_copy); + + next_token(&s); + if(token(&s)) { + set_error(&s, "", "Garbage after format string"); + return -1; + } + + return 0; +} + +int json_unpack_ex(json_t *root, json_error_t *error, size_t flags, const char *fmt, ...) +{ + int ret; + va_list ap; + + va_start(ap, fmt); + ret = json_vunpack_ex(root, error, flags, fmt, ap); + va_end(ap); + + return ret; +} + +int json_unpack(json_t *root, const char *fmt, ...) +{ + int ret; + va_list ap; + + va_start(ap, fmt); + ret = json_vunpack_ex(root, NULL, 0, fmt, ap); + va_end(ap); + + return ret; +} diff --git a/compat/jansson/strbuffer.c b/compat/jansson/strbuffer.c index 0019645ed1..8a51a5a266 100644 --- a/compat/jansson/strbuffer.c +++ b/compat/jansson/strbuffer.c @@ -1,25 +1,29 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * Jansson is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. 
*/ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif + #include #include +#include "jansson_private.h" #include "strbuffer.h" -#include "util.h" #define STRBUFFER_MIN_SIZE 16 #define STRBUFFER_FACTOR 2 +#define STRBUFFER_SIZE_MAX ((size_t)-1) int strbuffer_init(strbuffer_t *strbuff) { strbuff->size = STRBUFFER_MIN_SIZE; strbuff->length = 0; - strbuff->value = malloc(strbuff->size); + strbuff->value = jsonp_malloc(strbuff->size); if(!strbuff->value) return -1; @@ -30,7 +34,9 @@ int strbuffer_init(strbuffer_t *strbuff) void strbuffer_close(strbuffer_t *strbuff) { - free(strbuff->value); + if(strbuff->value) + jsonp_free(strbuff->value); + strbuff->size = 0; strbuff->length = 0; strbuff->value = NULL; @@ -50,7 +56,7 @@ const char *strbuffer_value(const strbuffer_t *strbuff) char *strbuffer_steal_value(strbuffer_t *strbuff) { char *result = strbuff->value; - strbuffer_init(strbuff); + strbuff->value = NULL; return result; } @@ -64,16 +70,31 @@ int strbuffer_append_byte(strbuffer_t *strbuff, char byte) return strbuffer_append_bytes(strbuff, &byte, 1); } -int strbuffer_append_bytes(strbuffer_t *strbuff, const char *data, int size) +int strbuffer_append_bytes(strbuffer_t *strbuff, const char *data, size_t size) { - if(strbuff->length + size >= strbuff->size) + if(size >= strbuff->size - strbuff->length) { - strbuff->size = max(strbuff->size * STRBUFFER_FACTOR, - strbuff->length + size + 1); + size_t new_size; + char *new_value; + + /* avoid integer overflow */ + if (strbuff->size > STRBUFFER_SIZE_MAX / STRBUFFER_FACTOR + || size > STRBUFFER_SIZE_MAX - 1 + || strbuff->length > STRBUFFER_SIZE_MAX - 1 - size) + return -1; + + new_size = max(strbuff->size * STRBUFFER_FACTOR, + strbuff->length + size + 1); - strbuff->value = realloc(strbuff->value, strbuff->size); - if(!strbuff->value) + new_value = jsonp_malloc(new_size); + if(!new_value) return -1; + + memcpy(new_value, strbuff->value, strbuff->length); + + jsonp_free(strbuff->value); + strbuff->value = new_value; + strbuff->size = new_size; } memcpy(strbuff->value + strbuff->length, data, size); diff --git a/compat/jansson/strbuffer.h b/compat/jansson/strbuffer.h index 816594a826..b17ea2b667 100644 --- a/compat/jansson/strbuffer.h +++ b/compat/jansson/strbuffer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * Jansson is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. 
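As context for the pack/unpack scanner implemented above, here is a minimal usage sketch of the corresponding public Jansson calls (json_pack, json_unpack_ex, the optional-key '?' marker and the JSON_STRICT flag), assuming only the bundled jansson headers. The key names and values ("host", "port", "debug") are purely illustrative and not taken from this patch.

    #include <stdio.h>
    #include <jansson.h>

    int main(void)
    {
        json_error_t error;

        /* Build {"host": "example.org", "port": 8080, "debug": true}
           using the pack format characters handled above ('s', 'i', 'b'). */
        json_t *cfg = json_pack("{s:s, s:i, s:b}",
                                "host", "example.org",
                                "port", 8080,
                                "debug", 1);
        if(!cfg)
            return 1;

        const char *host;
        int port, debug = 0;

        /* 's?b' marks "debug" as optional; JSON_STRICT (equivalent to a
           trailing '!') makes unpacking fail if any object key is left
           unpacked, which is what the key_set hashtable above tracks. */
        if(json_unpack_ex(cfg, &error, JSON_STRICT, "{s:s, s:i, s?b}",
                          "host", &host, "port", &port, "debug", &debug)) {
            fprintf(stderr, "unpack failed: %s\n", error.text);
            json_decref(cfg);
            return 1;
        }

        printf("%s:%d (debug=%d)\n", host, port, debug);
        json_decref(cfg);
        return 0;
    }

Note that the 's' unpack target receives a borrowed pointer into the JSON string, so it must not be used after the containing json_t is freed.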
@@ -10,8 +10,8 @@ typedef struct { char *value; - int length; /* bytes used */ - int size; /* bytes allocated */ + size_t length; /* bytes used */ + size_t size; /* bytes allocated */ } strbuffer_t; int strbuffer_init(strbuffer_t *strbuff); @@ -20,11 +20,13 @@ void strbuffer_close(strbuffer_t *strbuff); void strbuffer_clear(strbuffer_t *strbuff); const char *strbuffer_value(const strbuffer_t *strbuff); + +/* Steal the value and close the strbuffer */ char *strbuffer_steal_value(strbuffer_t *strbuff); int strbuffer_append(strbuffer_t *strbuff, const char *string); int strbuffer_append_byte(strbuffer_t *strbuff, char byte); -int strbuffer_append_bytes(strbuffer_t *strbuff, const char *data, int size); +int strbuffer_append_bytes(strbuffer_t *strbuff, const char *data, size_t size); char strbuffer_pop(strbuffer_t *strbuff); diff --git a/compat/jansson/strconv.c b/compat/jansson/strconv.c new file mode 100644 index 0000000000..1c2472ef6a --- /dev/null +++ b/compat/jansson/strconv.c @@ -0,0 +1,134 @@ +#include +#include +#include +#include +#include "jansson_private.h" +#include "strbuffer.h" + +/* need jansson_private_config.h to get the correct snprintf */ +#ifdef HAVE_CONFIG_H +#include "jansson_private_config.h" +#endif + +#if JSON_HAVE_LOCALECONV +#include + +/* + - This code assumes that the decimal separator is exactly one + character. + + - If setlocale() is called by another thread between the call to + localeconv() and the call to sprintf() or strtod(), the result may + be wrong. setlocale() is not thread-safe and should not be used + this way. Multi-threaded programs should use uselocale() instead. +*/ + +static void to_locale(strbuffer_t *strbuffer) +{ + const char *point; + char *pos; + + point = localeconv()->decimal_point; + if(*point == '.') { + /* No conversion needed */ + return; + } + + pos = strchr(strbuffer->value, '.'); + if(pos) + *pos = *point; +} + +static void from_locale(char *buffer) +{ + const char *point; + char *pos; + + point = localeconv()->decimal_point; + if(*point == '.') { + /* No conversion needed */ + return; + } + + pos = strchr(buffer, *point); + if(pos) + *pos = '.'; +} +#endif + +int jsonp_strtod(strbuffer_t *strbuffer, double *out) +{ + double value; + char *end; + +#if JSON_HAVE_LOCALECONV + to_locale(strbuffer); +#endif + + errno = 0; + value = strtod(strbuffer->value, &end); + assert(end == strbuffer->value + strbuffer->length); + + if(errno == ERANGE && value != 0) { + /* Overflow */ + return -1; + } + + *out = value; + return 0; +} + +int jsonp_dtostr(char *buffer, size_t size, double value) +{ + int ret; + char *start, *end; + size_t length; + + ret = snprintf(buffer, size, "%.17g", value); + if(ret < 0) + return -1; + + length = (size_t)ret; + if(length >= size) + return -1; + +#if JSON_HAVE_LOCALECONV + from_locale(buffer); +#endif + + /* Make sure there's a dot or 'e' in the output. Otherwise + a real is converted to an integer when decoding */ + if(strchr(buffer, '.') == NULL && + strchr(buffer, 'e') == NULL) + { + if(length + 3 >= size) { + /* No space to append ".0" */ + return -1; + } + buffer[length] = '.'; + buffer[length + 1] = '0'; + buffer[length + 2] = '\0'; + length += 2; + } + + /* Remove leading '+' from positive exponent. 
Also remove leading + zeros from exponents (added by some printf() implementations) */ + start = strchr(buffer, 'e'); + if(start) { + start++; + end = start + 1; + + if(*start == '-') + start++; + + while(*end == '0') + end++; + + if(end != start) { + memmove(start, end, length - (size_t)(end - buffer)); + length -= (size_t)(end - start); + } + } + + return (int)length; +} diff --git a/compat/jansson/utf.c b/compat/jansson/utf.c index 2b644501cb..709694dfaf 100644 --- a/compat/jansson/utf.c +++ b/compat/jansson/utf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * Jansson is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. diff --git a/compat/jansson/utf.h b/compat/jansson/utf.h index 8e95296404..4c8fc575c2 100644 --- a/compat/jansson/utf.h +++ b/compat/jansson/utf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * Jansson is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. @@ -8,14 +8,25 @@ #ifndef UTF_H #define UTF_H -#include +#ifdef HAVE_CONFIG_H +#include "jansson_private_config.h" #ifdef HAVE_INTTYPES_H /* inttypes.h includes stdint.h in a standard environment, so there's no need to include stdint.h separately. If inttypes.h doesn't define int32_t, it's defined in config.h. */ #include -#endif +#endif /* HAVE_INTTYPES_H */ + +#else /* !HAVE_CONFIG_H */ +#ifdef _WIN32 +typedef int int32_t; +#else /* !_WIN32 */ +/* Assume a standard environment */ +#include +#endif /* _WIN32 */ + +#endif /* HAVE_CONFIG_H */ int utf8_encode(int codepoint, char *buffer, int *size); diff --git a/compat/jansson/value.c b/compat/jansson/value.c index 591b89e1fe..1e7697cd01 100644 --- a/compat/jansson/value.c +++ b/compat/jansson/value.c @@ -1,25 +1,33 @@ /* - * Copyright (c) 2009, 2010 Petri Lehtinen + * Copyright (c) 2009-2013 Petri Lehtinen * * Jansson is free software; you can redistribute it and/or modify * it under the terms of the MIT license. See LICENSE for details. */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif -#include - +#include #include #include +#include -#include +#include "jansson.h" #include "hashtable.h" #include "jansson_private.h" #include "utf.h" -#include "util.h" +/* Work around nonstandard isnan() and isinf() implementations */ +#ifndef isnan +static JSON_INLINE int isnan(double x) { return x != x; } +#endif +#ifndef isinf +static JSON_INLINE int isinf(double x) { return !isnan(x) && isnan(x - x); } +#endif -static inline void json_init(json_t *json, json_type type) +static JSON_INLINE void json_init(json_t *json, json_type type) { json->type = type; json->refcount = 1; @@ -28,50 +36,16 @@ static inline void json_init(json_t *json, json_type type) /*** object ***/ -/* This macro just returns a pointer that's a few bytes backwards from - string. This makes it possible to pass a pointer to object_key_t - when only the string inside it is used, without actually creating - an object_key_t instance. 
*/ -#define string_to_key(string) container_of(string, object_key_t, key) - -static unsigned int hash_key(const void *ptr) -{ - const char *str = ((const object_key_t *)ptr)->key; - - unsigned int hash = 5381; - unsigned int c; - - while((c = (unsigned int)*str)) - { - hash = ((hash << 5) + hash) + c; - str++; - } - - return hash; -} - -static int key_equal(const void *ptr1, const void *ptr2) -{ - return strcmp(((const object_key_t *)ptr1)->key, - ((const object_key_t *)ptr2)->key) == 0; -} - -static void value_decref(void *value) -{ - json_decref((json_t *)value); -} - json_t *json_object(void) { - json_object_t *object = malloc(sizeof(json_object_t)); + json_object_t *object = jsonp_malloc(sizeof(json_object_t)); if(!object) return NULL; json_init(&object->json, JSON_OBJECT); - if(hashtable_init(&object->hashtable, hash_key, key_equal, - free, value_decref)) + if(hashtable_init(&object->hashtable)) { - free(object); + jsonp_free(object); return NULL; } @@ -84,15 +58,15 @@ json_t *json_object(void) static void json_delete_object(json_object_t *object) { hashtable_close(&object->hashtable); - free(object); + jsonp_free(object); } -unsigned int json_object_size(const json_t *json) +size_t json_object_size(const json_t *json) { json_object_t *object; if(!json_is_object(json)) - return -1; + return 0; object = json_to_object(json); return object->hashtable.size; @@ -106,32 +80,24 @@ json_t *json_object_get(const json_t *json, const char *key) return NULL; object = json_to_object(json); - return hashtable_get(&object->hashtable, string_to_key(key)); + return hashtable_get(&object->hashtable, key); } int json_object_set_new_nocheck(json_t *json, const char *key, json_t *value) { json_object_t *object; - object_key_t *k; - if(!key || !value) + if(!value) return -1; - if(!json_is_object(json) || json == value) + if(!key || !json_is_object(json) || json == value) { json_decref(value); return -1; } object = json_to_object(json); - k = malloc(sizeof(object_key_t) + strlen(key) + 1); - if(!k) - return -1; - - k->serial = object->serial++; - strcpy(k->key, key); - - if(hashtable_set(&object->hashtable, k, value)) + if(hashtable_set(&object->hashtable, key, object->serial++, value)) { json_decref(value); return -1; @@ -159,7 +125,7 @@ int json_object_del(json_t *json, const char *key) return -1; object = json_to_object(json); - return hashtable_del(&object->hashtable, string_to_key(key)); + return hashtable_del(&object->hashtable, key); } int json_object_clear(json_t *json) @@ -170,30 +136,56 @@ int json_object_clear(json_t *json) return -1; object = json_to_object(json); + hashtable_clear(&object->hashtable); + object->serial = 0; return 0; } int json_object_update(json_t *object, json_t *other) { - void *iter; + const char *key; + json_t *value; if(!json_is_object(object) || !json_is_object(other)) return -1; - iter = json_object_iter(other); - while(iter) { - const char *key; - json_t *value; - - key = json_object_iter_key(iter); - value = json_object_iter_value(iter); - + json_object_foreach(other, key, value) { if(json_object_set_nocheck(object, key, value)) return -1; + } + + return 0; +} + +int json_object_update_existing(json_t *object, json_t *other) +{ + const char *key; + json_t *value; + + if(!json_is_object(object) || !json_is_object(other)) + return -1; + + json_object_foreach(other, key, value) { + if(json_object_get(object, key)) + json_object_set_nocheck(object, key, value); + } + + return 0; +} + +int json_object_update_missing(json_t *object, json_t *other) +{ + const char *key; + 
json_t *value; + + if(!json_is_object(object) || !json_is_object(other)) + return -1; - iter = json_object_iter_next(other, iter); + json_object_foreach(other, key, value) { + if(!json_object_get(object, key)) + json_object_set_nocheck(object, key, value); } return 0; @@ -218,7 +210,7 @@ void *json_object_iter_at(json_t *json, const char *key) return NULL; object = json_to_object(json); - return hashtable_iter_at(&object->hashtable, string_to_key(key)); + return hashtable_iter_at(&object->hashtable, key); } void *json_object_iter_next(json_t *json, void *iter) @@ -232,20 +224,12 @@ void *json_object_iter_next(json_t *json, void *iter) return hashtable_iter_next(&object->hashtable, iter); } -const object_key_t *jsonp_object_iter_fullkey(void *iter) -{ - if(!iter) - return NULL; - - return hashtable_iter_key(iter); -} - const char *json_object_iter_key(void *iter) { if(!iter) return NULL; - return jsonp_object_iter_fullkey(iter)->key; + return hashtable_iter_key(iter); } json_t *json_object_iter_value(void *iter) @@ -258,38 +242,34 @@ json_t *json_object_iter_value(void *iter) int json_object_iter_set_new(json_t *json, void *iter, json_t *value) { - json_object_t *object; - if(!json_is_object(json) || !iter || !value) return -1; - object = json_to_object(json); - hashtable_iter_set(&object->hashtable, iter, value); - + hashtable_iter_set(iter, value); return 0; } +void *json_object_key_to_iter(const char *key) +{ + if(!key) + return NULL; + + return hashtable_key_to_iter(key); +} + static int json_object_equal(json_t *object1, json_t *object2) { - void *iter; + const char *key; + json_t *value1, *value2; if(json_object_size(object1) != json_object_size(object2)) return 0; - iter = json_object_iter(object1); - while(iter) - { - const char *key; - json_t *value1, *value2; - - key = json_object_iter_key(iter); - value1 = json_object_iter_value(iter); + json_object_foreach(object1, key, value1) { value2 = json_object_get(object2, key); if(!json_equal(value1, value2)) return 0; - - iter = json_object_iter_next(object1, iter); } return 1; @@ -298,29 +278,21 @@ static int json_object_equal(json_t *object1, json_t *object2) static json_t *json_object_copy(json_t *object) { json_t *result; - void *iter; + + const char *key; + json_t *value; result = json_object(); if(!result) return NULL; - iter = json_object_iter(object); - while(iter) - { - const char *key; - json_t *value; - - key = json_object_iter_key(iter); - value = json_object_iter_value(iter); + json_object_foreach(object, key, value) json_object_set_nocheck(result, key, value); - iter = json_object_iter_next(object, iter); - } - return result; } -static json_t *json_object_deep_copy(json_t *object) +static json_t *json_object_deep_copy(const json_t *object) { json_t *result; void *iter; @@ -329,17 +301,17 @@ static json_t *json_object_deep_copy(json_t *object) if(!result) return NULL; - iter = json_object_iter(object); - while(iter) - { + /* Cannot use json_object_foreach because object has to be cast + non-const */ + iter = json_object_iter((json_t *)object); + while(iter) { const char *key; - json_t *value; - + const json_t *value; key = json_object_iter_key(iter); value = json_object_iter_value(iter); - json_object_set_new_nocheck(result, key, json_deep_copy(value)); - iter = json_object_iter_next(object, iter); + json_object_set_new_nocheck(result, key, json_deep_copy(value)); + iter = json_object_iter_next((json_t *)object, iter); } return result; @@ -350,7 +322,7 @@ static json_t *json_object_deep_copy(json_t *object) json_t 
*json_array(void) { - json_array_t *array = malloc(sizeof(json_array_t)); + json_array_t *array = jsonp_malloc(sizeof(json_array_t)); if(!array) return NULL; json_init(&array->json, JSON_ARRAY); @@ -358,9 +330,9 @@ json_t *json_array(void) array->entries = 0; array->size = 8; - array->table = malloc(array->size * sizeof(json_t *)); + array->table = jsonp_malloc(array->size * sizeof(json_t *)); if(!array->table) { - free(array); + jsonp_free(array); return NULL; } @@ -371,16 +343,16 @@ json_t *json_array(void) static void json_delete_array(json_array_t *array) { - unsigned int i; + size_t i; for(i = 0; i < array->entries; i++) json_decref(array->table[i]); - free(array->table); - free(array); + jsonp_free(array->table); + jsonp_free(array); } -unsigned int json_array_size(const json_t *json) +size_t json_array_size(const json_t *json) { if(!json_is_array(json)) return 0; @@ -388,7 +360,7 @@ unsigned int json_array_size(const json_t *json) return json_to_array(json)->entries; } -json_t *json_array_get(const json_t *json, unsigned int index) +json_t *json_array_get(const json_t *json, size_t index) { json_array_t *array; if(!json_is_array(json)) @@ -401,7 +373,7 @@ json_t *json_array_get(const json_t *json, unsigned int index) return array->table[index]; } -int json_array_set_new(json_t *json, unsigned int index, json_t *value) +int json_array_set_new(json_t *json, size_t index, json_t *value) { json_array_t *array; @@ -427,24 +399,24 @@ int json_array_set_new(json_t *json, unsigned int index, json_t *value) return 0; } -static void array_move(json_array_t *array, unsigned int dest, - unsigned int src, unsigned int count) +static void array_move(json_array_t *array, size_t dest, + size_t src, size_t count) { memmove(&array->table[dest], &array->table[src], count * sizeof(json_t *)); } -static void array_copy(json_t **dest, unsigned int dpos, - json_t **src, unsigned int spos, - unsigned int count) +static void array_copy(json_t **dest, size_t dpos, + json_t **src, size_t spos, + size_t count) { memcpy(&dest[dpos], &src[spos], count * sizeof(json_t *)); } static json_t **json_array_grow(json_array_t *array, - unsigned int amount, + size_t amount, int copy) { - unsigned int new_size; + size_t new_size; json_t **old_table, **new_table; if(array->entries + amount <= array->size) @@ -453,7 +425,7 @@ static json_t **json_array_grow(json_array_t *array, old_table = array->table; new_size = max(array->size + amount, array->size * 2); - new_table = malloc(new_size * sizeof(json_t *)); + new_table = jsonp_malloc(new_size * sizeof(json_t *)); if(!new_table) return NULL; @@ -462,7 +434,7 @@ static json_t **json_array_grow(json_array_t *array, if(copy) { array_copy(array->table, 0, old_table, 0, array->entries); - free(old_table); + jsonp_free(old_table); return array->table; } @@ -494,7 +466,7 @@ int json_array_append_new(json_t *json, json_t *value) return 0; } -int json_array_insert_new(json_t *json, unsigned int index, json_t *value) +int json_array_insert_new(json_t *json, size_t index, json_t *value) { json_array_t *array; json_t **old_table; @@ -523,7 +495,7 @@ int json_array_insert_new(json_t *json, unsigned int index, json_t *value) array_copy(array->table, 0, old_table, 0, index); array_copy(array->table, index + 1, old_table, index, array->entries - index); - free(old_table); + jsonp_free(old_table); } else array_move(array, index + 1, index, array->entries - index); @@ -534,7 +506,7 @@ int json_array_insert_new(json_t *json, unsigned int index, json_t *value) return 0; } -int 
json_array_remove(json_t *json, unsigned int index) +int json_array_remove(json_t *json, size_t index) { json_array_t *array; @@ -547,7 +519,10 @@ int json_array_remove(json_t *json, unsigned int index) json_decref(array->table[index]); - array_move(array, index, index + 1, array->entries - index); + /* If we're removing the last element, nothing has to be moved */ + if(index < array->entries - 1) + array_move(array, index, index + 1, array->entries - index - 1); + array->entries--; return 0; @@ -556,7 +531,7 @@ int json_array_remove(json_t *json, unsigned int index) int json_array_clear(json_t *json) { json_array_t *array; - unsigned int i; + size_t i; if(!json_is_array(json)) return -1; @@ -572,7 +547,7 @@ int json_array_clear(json_t *json) int json_array_extend(json_t *json, json_t *other_json) { json_array_t *array, *other; - unsigned int i; + size_t i; if(!json_is_array(json) || !json_is_array(other_json)) return -1; @@ -593,7 +568,7 @@ int json_array_extend(json_t *json, json_t *other_json) static int json_array_equal(json_t *array1, json_t *array2) { - unsigned int i, size; + size_t i, size; size = json_array_size(array1); if(size != json_array_size(array2)) @@ -616,7 +591,7 @@ static int json_array_equal(json_t *array1, json_t *array2) static json_t *json_array_copy(json_t *array) { json_t *result; - unsigned int i; + size_t i; result = json_array(); if(!result) @@ -628,10 +603,10 @@ static json_t *json_array_copy(json_t *array) return result; } -static json_t *json_array_deep_copy(json_t *array) +static json_t *json_array_deep_copy(const json_t *array) { json_t *result; - unsigned int i; + size_t i; result = json_array(); if(!result) @@ -652,14 +627,14 @@ json_t *json_string_nocheck(const char *value) if(!value) return NULL; - string = malloc(sizeof(json_string_t)); + string = jsonp_malloc(sizeof(json_string_t)); if(!string) return NULL; json_init(&string->json, JSON_STRING); - string->value = strdup(value); + string->value = jsonp_strdup(value); if(!string->value) { - free(string); + jsonp_free(string); return NULL; } @@ -687,12 +662,15 @@ int json_string_set_nocheck(json_t *json, const char *value) char *dup; json_string_t *string; - dup = strdup(value); + if(!json_is_string(json) || !value) + return -1; + + dup = jsonp_strdup(value); if(!dup) return -1; string = json_to_string(json); - free(string->value); + jsonp_free(string->value); string->value = dup; return 0; @@ -708,8 +686,8 @@ int json_string_set(json_t *json, const char *value) static void json_delete_string(json_string_t *string) { - free(string->value); - free(string); + jsonp_free(string->value); + jsonp_free(string); } static int json_string_equal(json_t *string1, json_t *string2) @@ -717,7 +695,7 @@ static int json_string_equal(json_t *string1, json_t *string2) return strcmp(json_string_value(string1), json_string_value(string2)) == 0; } -static json_t *json_string_copy(json_t *string) +static json_t *json_string_copy(const json_t *string) { return json_string_nocheck(json_string_value(string)); } @@ -725,9 +703,9 @@ static json_t *json_string_copy(json_t *string) /*** integer ***/ -json_t *json_integer(int value) +json_t *json_integer(json_int_t value) { - json_integer_t *integer = malloc(sizeof(json_integer_t)); + json_integer_t *integer = jsonp_malloc(sizeof(json_integer_t)); if(!integer) return NULL; json_init(&integer->json, JSON_INTEGER); @@ -736,7 +714,7 @@ json_t *json_integer(int value) return &integer->json; } -int json_integer_value(const json_t *json) +json_int_t json_integer_value(const json_t *json) 
{ if(!json_is_integer(json)) return 0; @@ -744,7 +722,7 @@ int json_integer_value(const json_t *json) return json_to_integer(json)->value; } -int json_integer_set(json_t *json, int value) +int json_integer_set(json_t *json, json_int_t value) { if(!json_is_integer(json)) return -1; @@ -756,7 +734,7 @@ int json_integer_set(json_t *json, int value) static void json_delete_integer(json_integer_t *integer) { - free(integer); + jsonp_free(integer); } static int json_integer_equal(json_t *integer1, json_t *integer2) @@ -764,7 +742,7 @@ static int json_integer_equal(json_t *integer1, json_t *integer2) return json_integer_value(integer1) == json_integer_value(integer2); } -static json_t *json_integer_copy(json_t *integer) +static json_t *json_integer_copy(const json_t *integer) { return json_integer(json_integer_value(integer)); } @@ -774,7 +752,12 @@ static json_t *json_integer_copy(json_t *integer) json_t *json_real(double value) { - json_real_t *real = malloc(sizeof(json_real_t)); + json_real_t *real; + + if(isnan(value) || isinf(value)) + return NULL; + + real = jsonp_malloc(sizeof(json_real_t)); if(!real) return NULL; json_init(&real->json, JSON_REAL); @@ -793,8 +776,8 @@ double json_real_value(const json_t *json) int json_real_set(json_t *json, double value) { - if(!json_is_real(json)) - return 0; + if(!json_is_real(json) || isnan(value) || isinf(value)) + return -1; json_to_real(json)->value = value; @@ -803,7 +786,7 @@ int json_real_set(json_t *json, double value) static void json_delete_real(json_real_t *real) { - free(real); + jsonp_free(real); } static int json_real_equal(json_t *real1, json_t *real2) @@ -811,7 +794,7 @@ static int json_real_equal(json_t *real1, json_t *real2) return json_real_value(real1) == json_real_value(real2); } -static json_t *json_real_copy(json_t *real) +static json_t *json_real_copy(const json_t *real) { return json_real(json_real_value(real)); } @@ -822,7 +805,7 @@ static json_t *json_real_copy(json_t *real) double json_number_value(const json_t *json) { if(json_is_integer(json)) - return json_integer_value(json); + return (double)json_integer_value(json); else if(json_is_real(json)) return json_real_value(json); else @@ -834,30 +817,21 @@ double json_number_value(const json_t *json) json_t *json_true(void) { - static json_t the_true = { - JSON_TRUE, - (unsigned int)-1 - }; + static json_t the_true = {JSON_TRUE, (size_t)-1}; return &the_true; } json_t *json_false(void) { - static json_t the_false = { - JSON_FALSE, - (unsigned int)-1 - }; + static json_t the_false = {JSON_FALSE, (size_t)-1}; return &the_false; } json_t *json_null(void) { - static json_t the_null = { - JSON_NULL, - (unsigned int)-1 - }; + static json_t the_null = {JSON_NULL, (size_t)-1}; return &the_null; } @@ -946,7 +920,7 @@ json_t *json_copy(json_t *json) return NULL; } -json_t *json_deep_copy(json_t *json) +json_t *json_deep_copy(const json_t *json) { if(!json) return NULL; @@ -970,7 +944,7 @@ json_t *json_deep_copy(json_t *json) return json_real_copy(json); if(json_is_true(json) || json_is_false(json) || json_is_null(json)) - return json; + return (json_t *)json; return NULL; } diff --git a/compat/nvapi/NVAPI.url b/compat/nvapi/NVAPI.url new file mode 100644 index 0000000000..fd7349d9fb --- /dev/null +++ b/compat/nvapi/NVAPI.url @@ -0,0 +1,2 @@ +[InternetShortcut] +URL=https://developer.nvidia.com/nvapi diff --git a/compat/nvapi/amd64/nvapi64.lib b/compat/nvapi/amd64/nvapi64.lib new file mode 100644 index 0000000000..d9c832f808 Binary files /dev/null and b/compat/nvapi/amd64/nvapi64.lib 
differ diff --git a/compat/nvapi/nvHLSLExtns.h b/compat/nvapi/nvHLSLExtns.h new file mode 100644 index 0000000000..0c0110e8d2 --- /dev/null +++ b/compat/nvapi/nvHLSLExtns.h @@ -0,0 +1,402 @@ +////////////////////////// NVIDIA SHADER EXTENSIONS ///////////////// + +// this file is to be #included in the app HLSL shader code to make +// use of nvidia shader extensions + + +#include "nvHLSLExtnsInternal.h" + +//----------------------------------------------------------------------------// +//------------------------- Warp Shuffle Functions ---------------------------// +//----------------------------------------------------------------------------// + +// all functions have variants with width parameter which permits sub-division +// of the warp into segments - for example to exchange data between 4 groups of +// 8 lanes in a SIMD manner. If width is less than warpSize then each subsection +// of the warp behaves as a separate entity with a starting logical lane ID of 0. +// A thread may only exchange data with others in its own subsection. Width must +// have a value which is a power of 2 so that the warp can be subdivided equally; +// results are undefined if width is not a power of 2, or is a number greater +// than warpSize. + +// +// simple variant of SHFL instruction +// returns val from the specified lane +// optional width parameter must be a power of two and width <= 32 +// +int NvShfl(int val, uint srcLane, int width = NV_WARP_SIZE) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = val; // variable to be shuffled + g_NvidiaExt[index].src0u.y = srcLane; // source lane + g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width); + g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL; + + // result is returned as the return value of IncrementCounter on fake UAV slot + return g_NvidiaExt.IncrementCounter(); +} + +// +// Copy from a lane with lower ID relative to caller +// +int NvShflUp(int val, uint delta, int width = NV_WARP_SIZE) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = val; // variable to be shuffled + g_NvidiaExt[index].src0u.y = delta; // relative lane offset + g_NvidiaExt[index].src0u.z = (NV_WARP_SIZE - width) << 8; // minIndex = maxIndex for shfl_up (src2[4:0] is expected to be 0) + g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_UP; + return g_NvidiaExt.IncrementCounter(); +} + +// +// Copy from a lane with higher ID relative to caller +// +int NvShflDown(int val, uint delta, int width = NV_WARP_SIZE) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = val; // variable to be shuffled + g_NvidiaExt[index].src0u.y = delta; // relative lane offset + g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width); + g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_DOWN; + return g_NvidiaExt.IncrementCounter(); +} + +// +// Copy from a lane based on bitwise XOR of own lane ID +// +int NvShflXor(int val, uint laneMask, int width = NV_WARP_SIZE) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = val; // variable to be shuffled + g_NvidiaExt[index].src0u.y = laneMask; // laneMask to be XOR'ed with current laneId to get the source lane id + g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width); + g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_XOR; + return g_NvidiaExt.IncrementCounter(); +} + + +//----------------------------------------------------------------------------// +//----------------------------- Warp Vote Functions---------------------------// 
+//----------------------------------------------------------------------------// + +// returns 0xFFFFFFFF if the predicate is true for any thread in the warp, returns 0 otherwise +uint NvAny(int predicate) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = predicate; + g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_ANY; + return g_NvidiaExt.IncrementCounter(); +} + +// returns 0xFFFFFFFF if the predicate is true for ALL threads in the warp, returns 0 otherwise +uint NvAll(int predicate) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = predicate; + g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_ALL; + return g_NvidiaExt.IncrementCounter(); +} + +// returns a mask of all threads in the warp with bits set for threads that have predicate true +uint NvBallot(int predicate) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = predicate; + g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_BALLOT; + return g_NvidiaExt.IncrementCounter(); +} + + +//----------------------------------------------------------------------------// +//----------------------------- Utility Functions ----------------------------// +//----------------------------------------------------------------------------// + +// returns the lane index of the current thread (thread index in warp) +int NvGetLaneId() +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].opcode = NV_EXTN_OP_GET_LANE_ID; + return g_NvidiaExt.IncrementCounter(); +} + + +//----------------------------------------------------------------------------// +//----------------------------- FP16 Atmoic Functions-------------------------// +//----------------------------------------------------------------------------// + +// The functions below performs atomic operations on two consecutive fp16 +// values in the given raw UAV. 
+// The uint paramater 'fp16x2Val' is treated as two fp16 values byteAddress must be multiple of 4 +// The returned value are the two fp16 values packed into a single uint + +uint NvInterlockedAddFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_ADD); +} + +uint NvInterlockedMinFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_MIN); +} + +uint NvInterlockedMaxFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_MAX); +} + + +// versions of the above functions taking two fp32 values (internally converted to fp16 values) +uint NvInterlockedAddFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val) +{ + return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD); +} + +uint NvInterlockedMinFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val) +{ + return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN); +} + +uint NvInterlockedMaxFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val) +{ + return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX); +} + + +//----------------------------------------------------------------------------// + +// The functions below perform atomic operation on a R16G16_FLOAT UAV at the given address +// the uint paramater 'fp16x2Val' is treated as two fp16 values +// the returned value are the two fp16 values (.x and .y components) packed into a single uint +// Warning: Behaviour of these set of functions is undefined if the UAV is not +// of R16G16_FLOAT format (might result in app crash or TDR) + +uint NvInterlockedAddFp16x2(RWTexture1D uav, uint address, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD); +} + +uint NvInterlockedMinFp16x2(RWTexture1D uav, uint address, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN); +} + +uint NvInterlockedMaxFp16x2(RWTexture1D uav, uint address, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX); +} + +uint NvInterlockedAddFp16x2(RWTexture2D uav, uint2 address, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD); +} + +uint NvInterlockedMinFp16x2(RWTexture2D uav, uint2 address, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN); +} + +uint NvInterlockedMaxFp16x2(RWTexture2D uav, uint2 address, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX); +} + +uint NvInterlockedAddFp16x2(RWTexture3D uav, uint3 address, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD); +} + +uint NvInterlockedMinFp16x2(RWTexture3D uav, uint3 address, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN); +} + +uint NvInterlockedMaxFp16x2(RWTexture3D uav, uint3 address, uint fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX); +} + + +// versions taking two fp32 values (internally converted to fp16) +uint NvInterlockedAddFp16x2(RWTexture1D uav, uint address, float2 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD); +} + +uint NvInterlockedMinFp16x2(RWTexture1D uav, uint address, float2 val) +{ + return 
__NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN); +} + +uint NvInterlockedMaxFp16x2(RWTexture1D uav, uint address, float2 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX); +} + +uint NvInterlockedAddFp16x2(RWTexture2D uav, uint2 address, float2 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD); +} + +uint NvInterlockedMinFp16x2(RWTexture2D uav, uint2 address, float2 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN); +} + +uint NvInterlockedMaxFp16x2(RWTexture2D uav, uint2 address, float2 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX); +} + +uint NvInterlockedAddFp16x2(RWTexture3D uav, uint3 address, float2 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD); +} + +uint NvInterlockedMinFp16x2(RWTexture3D uav, uint3 address, float2 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN); +} + +uint NvInterlockedMaxFp16x2(RWTexture3D uav, uint3 address, float2 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX); +} + + +//----------------------------------------------------------------------------// + +// The functions below perform Atomic operation on a R16G16B16A16_FLOAT UAV at the given address +// the uint2 paramater 'fp16x2Val' is treated as four fp16 values +// i.e, fp16x2Val.x = uav.xy and fp16x2Val.y = uav.yz +// The returned value are the four fp16 values (.xyzw components) packed into uint2 +// Warning: Behaviour of these set of functions is undefined if the UAV is not +// of R16G16B16A16_FLOAT format (might result in app crash or TDR) + +uint2 NvInterlockedAddFp16x4(RWTexture1D uav, uint address, uint2 fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD); +} + +uint2 NvInterlockedMinFp16x4(RWTexture1D uav, uint address, uint2 fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN); +} + +uint2 NvInterlockedMaxFp16x4(RWTexture1D uav, uint address, uint2 fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX); +} + +uint2 NvInterlockedAddFp16x4(RWTexture2D uav, uint2 address, uint2 fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD); +} + +uint2 NvInterlockedMinFp16x4(RWTexture2D uav, uint2 address, uint2 fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN); +} + +uint2 NvInterlockedMaxFp16x4(RWTexture2D uav, uint2 address, uint2 fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX); +} + +uint2 NvInterlockedAddFp16x4(RWTexture3D uav, uint3 address, uint2 fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD); +} + +uint2 NvInterlockedMinFp16x4(RWTexture3D uav, uint3 address, uint2 fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN); +} + +uint2 NvInterlockedMaxFp16x4(RWTexture3D uav, uint3 address, uint2 fp16x2Val) +{ + return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX); +} + +// versions taking four fp32 values (internally converted to fp16) +uint2 NvInterlockedAddFp16x4(RWTexture1D uav, uint address, float4 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD); +} + +uint2 NvInterlockedMinFp16x4(RWTexture1D uav, uint address, float4 val) +{ + return __NvAtomicOpFP16x2(uav, address, 
__fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN); +} + +uint2 NvInterlockedMaxFp16x4(RWTexture1D uav, uint address, float4 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX); +} + +uint2 NvInterlockedAddFp16x4(RWTexture2D uav, uint2 address, float4 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD); +} + +uint2 NvInterlockedMinFp16x4(RWTexture2D uav, uint2 address, float4 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN); +} + +uint2 NvInterlockedMaxFp16x4(RWTexture2D uav, uint2 address, float4 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX); +} + +uint2 NvInterlockedAddFp16x4(RWTexture3D uav, uint3 address, float4 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD); +} + +uint2 NvInterlockedMinFp16x4(RWTexture3D uav, uint3 address, float4 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN); +} + +uint2 NvInterlockedMaxFp16x4(RWTexture3D uav, uint3 address, float4 val) +{ + return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX); +} + + +//----------------------------------------------------------------------------// +//----------------------------- FP32 Atmoic Functions-------------------------// +//----------------------------------------------------------------------------// + +// The functions below performs atomic add on the given UAV treating the value as float +// byteAddress must be multiple of 4 +// The returned value is the value present in memory location before the atomic add + +float NvInterlockedAddFp32(RWByteAddressBuffer uav, uint byteAddress, float val) +{ + return __NvAtomicAddFP32(uav, byteAddress, val); +} + +//----------------------------------------------------------------------------// + +// The functions below perform atomic add on a R32_FLOAT UAV at the given address +// the returned value is the value before performing the atomic add +// Warning: Behaviour of these set of functions is undefined if the UAV is not +// of R32_FLOAT format (might result in app crash or TDR) + +float NvInterlockedAddFp32(RWTexture1D uav, uint address, float val) +{ + return __NvAtomicAddFP32(uav, address, val); +} + +float NvInterlockedAddFp32(RWTexture2D uav, uint2 address, float val) +{ + return __NvAtomicAddFP32(uav, address, val); +} + +float NvInterlockedAddFp32(RWTexture3D uav, uint3 address, float val) +{ + return __NvAtomicAddFP32(uav, address, val); +} + diff --git a/compat/nvapi/nvHLSLExtnsInternal.h b/compat/nvapi/nvHLSLExtnsInternal.h new file mode 100644 index 0000000000..1c2ebf6599 --- /dev/null +++ b/compat/nvapi/nvHLSLExtnsInternal.h @@ -0,0 +1,472 @@ +////////////////////////// NVIDIA SHADER EXTENSIONS ///////////////// +// internal functions +// Functions in this file are not expected to be called by apps directly + +#include "nvShaderExtnEnums.h" + +struct NvShaderExtnStruct +{ + uint opcode; // opcode + uint rid; // resource ID + uint sid; // sampler ID + + uint4 dst1u; // destination operand 1 (for instructions that need extra destination operands) + uint4 padding0[3]; // currently unused + + uint4 src0u; // uint source operand 0 + uint4 src1u; // uint source operand 0 + uint4 src2u; // uint source operand 0 + uint4 dst0u; // uint destination operand + + uint markUavRef; // the next store to UAV is fake and is used only to identify the uav slot + float padding1[28];// struct size: 256 bytes +}; + +// RW 
structured buffer for Nvidia shader extensions + +// Application needs to define NV_SHADER_EXTN_SLOT as a unused slot, which should be +// set using NvAPI_D3D11_SetNvShaderExtnSlot() call before creating the first shader that +// uses nvidia shader extensions. E.g before including this file in shader define it as: +// #define NV_SHADER_EXTN_SLOT u7 + +// Note that other operations to this UAV will be ignored so application +// should bind a null resource +RWStructuredBuffer g_NvidiaExt : register( NV_SHADER_EXTN_SLOT ); + + +//----------------------------------------------------------------------------// +// the exposed SHFL instructions accept a mask parameter in src2 +// To compute lane mask from width of segment: +// minLaneID : currentLaneId & src2[12:8] +// maxLaneID : minLaneId | (src2[4:0] & ~src2[12:8]) +// where [minLaneId, maxLaneId] defines the segment where currentLaneId belongs +// we always set src2[4:0] to 11111 (0x1F), and set src2[12:8] as (32 - width) +int __NvGetShflMaskFromWidth(uint width) +{ + return ((NV_WARP_SIZE - width) << 8) | 0x1F; +} + +//----------------------------------------------------------------------------// + +void __NvReferenceUAVForOp(RWByteAddressBuffer uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav.Store(index, 0); +} + +void __NvReferenceUAVForOp(RWTexture1D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[index] = float2(0,0); +} + +void __NvReferenceUAVForOp(RWTexture2D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint2(index,index)] = float2(0,0); +} + +void __NvReferenceUAVForOp(RWTexture3D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint3(index,index,index)] = float2(0,0); +} + +void __NvReferenceUAVForOp(RWTexture1D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[index] = float4(0,0,0,0); +} + +void __NvReferenceUAVForOp(RWTexture2D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint2(index,index)] = float4(0,0,0,0); +} + +void __NvReferenceUAVForOp(RWTexture3D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint3(index,index,index)] = float4(0,0,0,0); +} + +void __NvReferenceUAVForOp(RWTexture1D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[index] = 0.0f; +} + +void __NvReferenceUAVForOp(RWTexture2D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint2(index,index)] = 0.0f; +} + +void __NvReferenceUAVForOp(RWTexture3D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint3(index,index,index)] = 0.0f; +} + + +void __NvReferenceUAVForOp(RWTexture1D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[index] = uint2(0,0); +} + +void __NvReferenceUAVForOp(RWTexture2D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint2(index,index)] = uint2(0,0); +} + +void __NvReferenceUAVForOp(RWTexture3D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint3(index,index,index)] = uint2(0,0); +} + +void __NvReferenceUAVForOp(RWTexture1D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + 
g_NvidiaExt[index].markUavRef = 1; + uav[index] = uint4(0,0,0,0); +} + +void __NvReferenceUAVForOp(RWTexture2D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint2(index,index)] = uint4(0,0,0,0); +} + +void __NvReferenceUAVForOp(RWTexture3D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint3(index,index,index)] = uint4(0,0,0,0); +} + +void __NvReferenceUAVForOp(RWTexture1D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[index] = 0; +} + +void __NvReferenceUAVForOp(RWTexture2D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint2(index,index)] = 0; +} + +void __NvReferenceUAVForOp(RWTexture3D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint3(index,index,index)] = 0; +} + +void __NvReferenceUAVForOp(RWTexture1D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[index] = int2(0,0); +} + +void __NvReferenceUAVForOp(RWTexture2D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint2(index,index)] = int2(0,0); +} + +void __NvReferenceUAVForOp(RWTexture3D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint3(index,index,index)] = int2(0,0); +} + +void __NvReferenceUAVForOp(RWTexture1D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[index] = int4(0,0,0,0); +} + +void __NvReferenceUAVForOp(RWTexture2D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint2(index,index)] = int4(0,0,0,0); +} + +void __NvReferenceUAVForOp(RWTexture3D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint3(index,index,index)] = int4(0,0,0,0); +} + +void __NvReferenceUAVForOp(RWTexture1D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[index] = 0; +} + +void __NvReferenceUAVForOp(RWTexture2D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint2(index,index)] = 0; +} + +void __NvReferenceUAVForOp(RWTexture3D uav) +{ + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].markUavRef = 1; + uav[uint3(index,index,index)] = 0; +} + +//----------------------------------------------------------------------------// +// ATOMIC op sub-opcodes +#define NV_EXTN_ATOM_ADD 3 +#define NV_EXTN_ATOM_MAX 6 +#define NV_EXTN_ATOM_MIN 7 + +//----------------------------------------------------------------------------// + +// performs Atomic operation on two consecutive fp16 values in the given UAV +// the uint paramater 'fp16x2Val' is treated as two fp16 values +// the passed sub-opcode 'op' should be an immediate constant +// byteAddress must be multiple of 4 +// the returned value are the two fp16 values packed into a single uint +uint __NvAtomicOpFP16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val, uint atomicOpType) +{ + __NvReferenceUAVForOp(uav); + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = byteAddress; + g_NvidiaExt[index].src1u.x = fp16x2Val; + g_NvidiaExt[index].src2u.x = atomicOpType; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC; + + return g_NvidiaExt[index].dst0u.x; +} + 
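The HLSL helpers above only take effect once the application has reserved the magic UAV slot on the host side. Below is a rough C sketch of that setup, using only the NvAPI entry points named in these headers (NvAPI_D3D_IsNvShaderExtnOpCodeSupported, NvAPI_D3D11_SetNvShaderExtnSlot); the exact parameter types should be verified against nvapi.h, and the slot number 7 is an arbitrary example that must match the shader-side NV_SHADER_EXTN_SLOT define.

    #include <stdbool.h>
    #include <d3d11.h>
    #include <nvapi.h>
    #include "nvShaderExtnEnums.h"   /* for NV_EXTN_OP_FP16_ATOMIC */

    /* Must match "#define NV_SHADER_EXTN_SLOT u7" in the shader (assumed here). */
    #define APP_NV_EXTN_SLOT 7

    int enable_nv_shader_extensions(ID3D11Device *device)
    {
        bool fp16_atomics = false;   /* output type as declared in nvapi.h */

        if (NvAPI_Initialize() != NVAPI_OK)
            return -1;

        /* Ask the driver whether the opcode we plan to use is implemented. */
        if (NvAPI_D3D_IsNvShaderExtnOpCodeSupported((IUnknown *)device,
                                                    NV_EXTN_OP_FP16_ATOMIC,
                                                    &fp16_atomics) != NVAPI_OK
            || !fp16_atomics)
            return -1;

        /* Reserve the fake UAV slot before creating any shader that uses the
           extensions; stores to that slot are intercepted by the driver, which
           is why the shader code above writes markUavRef and dummy values. */
        if (NvAPI_D3D11_SetNvShaderExtnSlot((IUnknown *)device,
                                            APP_NV_EXTN_SLOT) != NVAPI_OK)
            return -1;

        return 0;
    }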
+//----------------------------------------------------------------------------// + +// performs Atomic operation on a R16G16_FLOAT UAV at the given address +// the uint paramater 'fp16x2Val' is treated as two fp16 values +// the passed sub-opcode 'op' should be an immediate constant +// the returned value are the two fp16 values (.x and .y components) packed into a single uint +// Warning: Behaviour of these set of functions is undefined if the UAV is not +// of R16G16_FLOAT format (might result in app crash or TDR) + +uint __NvAtomicOpFP16x2(RWTexture1D uav, uint address, uint fp16x2Val, uint atomicOpType) +{ + __NvReferenceUAVForOp(uav); + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = address; + g_NvidiaExt[index].src1u.x = fp16x2Val; + g_NvidiaExt[index].src2u.x = atomicOpType; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC; + + return g_NvidiaExt[index].dst0u.x; +} + +uint __NvAtomicOpFP16x2(RWTexture2D uav, uint2 address, uint fp16x2Val, uint atomicOpType) +{ + __NvReferenceUAVForOp(uav); + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.xy = address; + g_NvidiaExt[index].src1u.x = fp16x2Val; + g_NvidiaExt[index].src2u.x = atomicOpType; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC; + + return g_NvidiaExt[index].dst0u.x; +} + +uint __NvAtomicOpFP16x2(RWTexture3D uav, uint3 address, uint fp16x2Val, uint atomicOpType) +{ + __NvReferenceUAVForOp(uav); + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.xyz = address; + g_NvidiaExt[index].src1u.x = fp16x2Val; + g_NvidiaExt[index].src2u.x = atomicOpType; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC; + + return g_NvidiaExt[index].dst0u.x; +} + +//----------------------------------------------------------------------------// + +// performs Atomic operation on a R16G16B16A16_FLOAT UAV at the given address +// the uint2 paramater 'fp16x2Val' is treated as four fp16 values +// i.e, fp16x2Val.x = uav.xy and fp16x2Val.y = uav.yz +// the passed sub-opcode 'op' should be an immediate constant +// the returned value are the four fp16 values (.xyzw components) packed into uint2 +// Warning: Behaviour of these set of functions is undefined if the UAV is not +// of R16G16B16A16_FLOAT format (might result in app crash or TDR) + +uint2 __NvAtomicOpFP16x2(RWTexture1D uav, uint address, uint2 fp16x2Val, uint atomicOpType) +{ + __NvReferenceUAVForOp(uav); + + // break it down into two fp16x2 atomic ops + uint2 retVal; + + // first op has x-coordinate = x * 2 + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = address * 2; + g_NvidiaExt[index].src1u.x = fp16x2Val.x; + g_NvidiaExt[index].src2u.x = atomicOpType; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC; + retVal.x = g_NvidiaExt[index].dst0u.x; + + // second op has x-coordinate = x * 2 + 1 + index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = address * 2 + 1; + g_NvidiaExt[index].src1u.x = fp16x2Val.y; + g_NvidiaExt[index].src2u.x = atomicOpType; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC; + retVal.y = g_NvidiaExt[index].dst0u.x; + + return retVal; +} + +uint2 __NvAtomicOpFP16x2(RWTexture2D uav, uint2 address, uint2 fp16x2Val, uint atomicOpType) +{ + __NvReferenceUAVForOp(uav); + + // break it down into two fp16x2 atomic ops + uint2 retVal; + + // first op has x-coordinate = x * 2 + uint2 addressTemp = uint2(address.x * 2, address.y); + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.xy = addressTemp; + 
g_NvidiaExt[index].src1u.x = fp16x2Val.x; + g_NvidiaExt[index].src2u.x = atomicOpType; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC; + retVal.x = g_NvidiaExt[index].dst0u.x; + + // second op has x-coordinate = x * 2 + 1 + addressTemp.x++; + index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.xy = addressTemp; + g_NvidiaExt[index].src1u.x = fp16x2Val.y; + g_NvidiaExt[index].src2u.x = atomicOpType; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC; + retVal.y = g_NvidiaExt[index].dst0u.x; + + return retVal; +} + +uint2 __NvAtomicOpFP16x2(RWTexture3D uav, uint3 address, uint2 fp16x2Val, uint atomicOpType) +{ + __NvReferenceUAVForOp(uav); + + // break it down into two fp16x2 atomic ops + uint2 retVal; + + // first op has x-coordinate = x * 2 + uint3 addressTemp = uint3(address.x * 2, address.y, address.z); + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.xyz = addressTemp; + g_NvidiaExt[index].src1u.x = fp16x2Val.x; + g_NvidiaExt[index].src2u.x = atomicOpType; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC; + retVal.x = g_NvidiaExt[index].dst0u.x; + + // second op has x-coordinate = x * 2 + 1 + addressTemp.x++; + index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.xyz = addressTemp; + g_NvidiaExt[index].src1u.x = fp16x2Val.y; + g_NvidiaExt[index].src2u.x = atomicOpType; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC; + retVal.y = g_NvidiaExt[index].dst0u.x; + + return retVal; +} + +uint __fp32x2Tofp16x2(float2 val) +{ + return (f32tof16(val.y)<<16) | f32tof16(val.x) ; +} + +uint2 __fp32x4Tofp16x4(float4 val) +{ + return uint2( (f32tof16(val.y)<<16) | f32tof16(val.x), (f32tof16(val.w)<<16) | f32tof16(val.z) ) ; +} + +// FP32 Atomic functions + +// performs Atomic operation treating the uav as float (fp32) values +// the passed sub-opcode 'op' should be an immediate constant +// byteAddress must be multiple of 4 +float __NvAtomicAddFP32(RWByteAddressBuffer uav, uint byteAddress, float val) +{ + __NvReferenceUAVForOp(uav); + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = byteAddress; + g_NvidiaExt[index].src1u.x = asuint(val); // passing as uint to make it more convinient for the driver to translate + g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC; + + return asfloat(g_NvidiaExt[index].dst0u.x); +} + +float __NvAtomicAddFP32(RWTexture1D uav, uint address, float val) +{ + __NvReferenceUAVForOp(uav); + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.x = address; + g_NvidiaExt[index].src1u.x = asuint(val); + g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC; + + return asfloat(g_NvidiaExt[index].dst0u.x); +} + +float __NvAtomicAddFP32(RWTexture2D uav, uint2 address, float val) +{ + __NvReferenceUAVForOp(uav); + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.xy = address; + g_NvidiaExt[index].src1u.x = asuint(val); + g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC; + + return asfloat(g_NvidiaExt[index].dst0u.x); +} + +float __NvAtomicAddFP32(RWTexture3D uav, uint3 address, float val) +{ + __NvReferenceUAVForOp(uav); + uint index = g_NvidiaExt.IncrementCounter(); + g_NvidiaExt[index].src0u.xyz = address; + g_NvidiaExt[index].src1u.x = asuint(val); + g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD; + g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC; + + return asfloat(g_NvidiaExt[index].dst0u.x); 
+} + diff --git a/compat/nvapi/nvShaderExtnEnums.h b/compat/nvapi/nvShaderExtnEnums.h new file mode 100644 index 0000000000..4293d9464d --- /dev/null +++ b/compat/nvapi/nvShaderExtnEnums.h @@ -0,0 +1,38 @@ +//////////////////////////////////////////////////////////////////////////////// +////////////////////////// NVIDIA SHADER EXTENSIONS //////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +// This file can be included both from HLSL shader code as well as C++ code. +// The app should call NvAPI_D3D_IsNvShaderExtnOpCodeSupported() to +// check for support for every nv shader extension opcode it plans to use + + + +//----------------------------------------------------------------------------// +//---------------------------- NV Shader Extn Version -----------------------// +//----------------------------------------------------------------------------// +#define NV_SHADER_EXTN_VERSION 1 + +//----------------------------------------------------------------------------// +//---------------------------- Misc constants --------------------------------// +//----------------------------------------------------------------------------// +#define NV_WARP_SIZE 32 + + +//----------------------------------------------------------------------------// +//---------------------------- opCode constants ------------------------------// +//----------------------------------------------------------------------------// + + +#define NV_EXTN_OP_SHFL 1 +#define NV_EXTN_OP_SHFL_UP 2 +#define NV_EXTN_OP_SHFL_DOWN 3 +#define NV_EXTN_OP_SHFL_XOR 4 + +#define NV_EXTN_OP_VOTE_ALL 5 +#define NV_EXTN_OP_VOTE_ANY 6 +#define NV_EXTN_OP_VOTE_BALLOT 7 + +#define NV_EXTN_OP_GET_LANE_ID 8 +#define NV_EXTN_OP_FP16_ATOMIC 12 +#define NV_EXTN_OP_FP32_ATOMIC 13 diff --git a/compat/nvapi/nvapi.h b/compat/nvapi/nvapi.h new file mode 100644 index 0000000000..803f84e729 --- /dev/null +++ b/compat/nvapi/nvapi.h @@ -0,0 +1,11760 @@ +#include"nvapi_lite_salstart.h" +#include"nvapi_lite_common.h" +#include"nvapi_lite_sli.h" +#include"nvapi_lite_surround.h" +#include"nvapi_lite_stereo.h" +#include"nvapi_lite_d3dext.h" + /************************************************************************************************************************************\ +|* *| +|* Copyright © 2012 NVIDIA Corporation. All rights reserved. *| +|* *| +|* NOTICE TO USER: *| +|* *| +|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *| +|* *| +|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *| +|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *| +|* Otherwise, you have no rights to use or access this software in any manner. *| +|* *| +|* If not covered by the applicable NVIDIA software license agreement: *| +|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *| +|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *| +|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *| +|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
*| +|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *| +|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *| +|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *| +|* *| +|* U.S. Government End Users. *| +|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *| +|* consisting of "commercial computer software" and "commercial computer software documentation" *| +|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *| +|* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *| +|* all U.S. Government End Users acquire the software with only those rights set forth herein. *| +|* *| +|* Any use of this software in individual and commercial software must include, *| +|* in the user documentation and internal comments to the code, *| +|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *| +|* *| + \************************************************************************************************************************************/ + + +/////////////////////////////////////////////////////////////////////////////// +// +// Date: Jan 27, 2016 +// File: nvapi.h +// +// NvAPI provides an interface to NVIDIA devices. This file contains the +// interface constants, structure definitions and function prototypes. +// +// Target Profile: developer +// Target Platform: windows +// +/////////////////////////////////////////////////////////////////////////////// +#ifndef _NVAPI_H +#define _NVAPI_H + +#pragma pack(push,8) // Make sure we have consistent structure packings + +#ifdef __cplusplus +extern "C" { +#endif +// ==================================================== +// Universal NvAPI Definitions +// ==================================================== +#ifndef _WIN32 +#define __cdecl +#endif + + + +//! @} + +//! \ingroup nvapistatus +#define NVAPI_API_NOT_INTIALIZED NVAPI_API_NOT_INITIALIZED //!< Fix typo in error code + +//! \ingroup nvapistatus +#define NVAPI_INVALID_USER_PRIVILEDGE NVAPI_INVALID_USER_PRIVILEGE //!< Fix typo in error code + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Initialize +// +//! This function initializes the NvAPI library (if not already initialized) but always increments the ref-counter. +//! This must be called before calling other NvAPI_ functions. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_ERROR An error occurred during the initialization process (generic error) +//! \retval NVAPI_LIBRARYNOTFOUND Failed to load the NVAPI support library +//! \retval NVAPI_OK Initialized +//! \sa nvapistatus +//! \ingroup nvapifunctions +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Initialize(); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Unload +// +//! DESCRIPTION: Decrements the ref-counter and when it reaches ZERO, unloads NVAPI library. +//! This must be called in pairs with NvAPI_Initialize. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! Note: By design, it is not mandatory to call NvAPI_Initialize before calling any NvAPI. +//! 
When any NvAPI is called without first calling NvAPI_Initialize, the internal refcounter +//! will be implicitly incremented. In such cases, calling NvAPI_Initialize from a different thread will +//! result in incrementing the refcount again and the user has to call NvAPI_Unload twice to +//! unload the library. However, note that the implicit increment of the refcounter happens only once. +//! If the client wants unload functionality, it is recommended to always call NvAPI_Initialize and NvAPI_Unload in pairs. +//! +//! Unloading NvAPI library is not supported when the library is in a resource locked state. +//! Some functions in the NvAPI library initiates an operation or allocates certain resources +//! and there are corresponding functions available, to complete the operation or free the +//! allocated resources. All such function pairs are designed to prevent unloading NvAPI library. +//! +//! For example, if NvAPI_Unload is called after NvAPI_XXX which locks a resource, it fails with +//! NVAPI_ERROR. Developers need to call the corresponding NvAPI_YYY to unlock the resources, +//! before calling NvAPI_Unload again. +//! +//! \retval ::NVAPI_ERROR One or more resources are locked and hence cannot unload NVAPI library +//! \retval ::NVAPI_OK NVAPI library unloaded +//! +//! \ingroup nvapifunctions +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Unload(); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetErrorMessage +// +//! This function converts an NvAPI error code into a null terminated string. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 80 +//! +//! \param nr The error code to convert +//! \param szDesc The string corresponding to the error code +//! +//! \return NULL terminated string (always, never NULL) +//! \ingroup nvapifunctions +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetErrorMessage(NvAPI_Status nr,NvAPI_ShortString szDesc); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetInterfaceVersionString +// +//! This function returns a string describing the version of the NvAPI library. +//! The contents of the string are human readable. Do not assume a fixed +//! format. +//! +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 80 +//! +//! \param szDesc User readable string giving NvAPI version information +//! +//! \return See \ref nvapistatus for the list of possible return values. +//! \ingroup nvapifunctions +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetInterfaceVersionString(NvAPI_ShortString szDesc); + + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// All display port related data types definition starts +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// This category is intentionally added before the #ifdef. The #endif should also be in the same scope +#ifndef DISPLAYPORT_STRUCTS_DEFINED +#define DISPLAYPORT_STRUCTS_DEFINED + +//! \ingroup dispcontrol +//! Used in NV_DISPLAY_PORT_INFO. 
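The NvAPI_Initialize / NvAPI_GetErrorMessage / NvAPI_GetInterfaceVersionString / NvAPI_Unload entry points documented above form the basic ref-counted lifecycle of the library. A minimal host-side sketch of that lifecycle follows; it assumes the NVAPI headers and import library are on the build path and that NvAPI_ShortString is the char-array typedef pulled in from nvapi_lite_common.h.

#include <stdio.h>
#include "nvapi.h"

/* Minimal sketch: initialize, query the interface version string,
 * translate any failure code to text, and unload in a matched pair. */
int main(void)
{
    NvAPI_Status status = NvAPI_Initialize();
    if (status != NVAPI_OK) {
        NvAPI_ShortString desc = { 0 };
        NvAPI_GetErrorMessage(status, desc);   /* always yields a printable string */
        printf("NvAPI_Initialize failed: %s\n", desc);
        return 1;
    }

    NvAPI_ShortString version = { 0 };
    if (NvAPI_GetInterfaceVersionString(version) == NVAPI_OK)
        printf("NVAPI interface: %s\n", version);   /* human readable, no fixed format */

    NvAPI_Unload();   /* pair every successful NvAPI_Initialize with an NvAPI_Unload */
    return 0;
}

Note that, per the NvAPI_Unload documentation above, relying on the implicit refcount increment makes teardown ambiguous; calling Initialize and Unload explicitly in pairs keeps the count balanced.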
+typedef enum _NV_DP_LINK_RATE +{ + NV_DP_1_62GBPS = 6, + NV_DP_2_70GBPS = 0xA, + NV_DP_5_40GBPS = 0x14, + NV_DP_8_10GBPS = 0x1E +} NV_DP_LINK_RATE; + + +//! \ingroup dispcontrol +//! Used in NV_DISPLAY_PORT_INFO. +typedef enum _NV_DP_LANE_COUNT +{ + NV_DP_1_LANE = 1, + NV_DP_2_LANE = 2, + NV_DP_4_LANE = 4, +} NV_DP_LANE_COUNT; + + +//! \ingroup dispcontrol +//! Used in NV_DISPLAY_PORT_INFO. +typedef enum _NV_DP_COLOR_FORMAT +{ + NV_DP_COLOR_FORMAT_RGB = 0, + NV_DP_COLOR_FORMAT_YCbCr422, + NV_DP_COLOR_FORMAT_YCbCr444, +} NV_DP_COLOR_FORMAT; + + +//! \ingroup dispcontrol +//! Used in NV_DISPLAY_PORT_INFO. +typedef enum _NV_DP_COLORIMETRY +{ + NV_DP_COLORIMETRY_RGB = 0, + NV_DP_COLORIMETRY_YCbCr_ITU601, + NV_DP_COLORIMETRY_YCbCr_ITU709, +} NV_DP_COLORIMETRY; + + +//! \ingroup dispcontrol +//! Used in NV_DISPLAY_PORT_INFO. +typedef enum _NV_DP_DYNAMIC_RANGE +{ + NV_DP_DYNAMIC_RANGE_VESA = 0, + NV_DP_DYNAMIC_RANGE_CEA, +} NV_DP_DYNAMIC_RANGE; + + +//! \ingroup dispcontrol +//! Used in NV_DISPLAY_PORT_INFO. +typedef enum _NV_DP_BPC +{ + NV_DP_BPC_DEFAULT = 0, + NV_DP_BPC_6, + NV_DP_BPC_8, + NV_DP_BPC_10, + NV_DP_BPC_12, + NV_DP_BPC_16, +} NV_DP_BPC; + +#endif //#ifndef DISPLAYPORT_STRUCTS_DEFINED + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// All display port related data types definitions end +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetEDID +// +//! \fn NvAPI_GPU_GetEDID(NvPhysicalGpuHandle hPhysicalGpu, NvU32 displayOutputId, NV_EDID *pEDID) +//! This function returns the EDID data for the specified GPU handle and connection bit mask. +//! displayOutputId should have exactly 1 bit set to indicate a single display. See \ref handles. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 85 +//! +//! \retval NVAPI_INVALID_ARGUMENT pEDID is NULL; displayOutputId has 0 or > 1 bits set +//! \retval NVAPI_OK *pEDID contains valid data. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! \retval NVAPI_DATA_NOT_FOUND The requested display does not contain an EDID. +// +/////////////////////////////////////////////////////////////////////////////// + + +//! \ingroup gpu +//! @{ + +#define NV_EDID_V1_DATA_SIZE 256 + +#define NV_EDID_DATA_SIZE NV_EDID_V1_DATA_SIZE + +typedef struct +{ + NvU32 version; //structure version + NvU8 EDID_Data[NV_EDID_DATA_SIZE]; +} NV_EDID_V1; + +//! Used in NvAPI_GPU_GetEDID() +typedef struct +{ + NvU32 version; //!< Structure version + NvU8 EDID_Data[NV_EDID_DATA_SIZE]; + NvU32 sizeofEDID; +} NV_EDID_V2; + +//! Used in NvAPI_GPU_GetEDID() +typedef struct +{ + NvU32 version; //!< Structure version + NvU8 EDID_Data[NV_EDID_DATA_SIZE]; + NvU32 sizeofEDID; + NvU32 edidId; //!< ID which always returned in a monotonically increasing counter. + //!< Across a split-EDID read we need to verify that all calls returned the same edidId. + //!< This counter is incremented if we get the updated EDID. + NvU32 offset; //!< Which 256-byte page of the EDID we want to read. Start at 0. 
+ //!< If the read succeeds with edidSize > NV_EDID_DATA_SIZE, + //!< call back again with offset+256 until we have read the entire buffer +} NV_EDID_V3; + +typedef NV_EDID_V3 NV_EDID; + +#define NV_EDID_VER1 MAKE_NVAPI_VERSION(NV_EDID_V1,1) +#define NV_EDID_VER2 MAKE_NVAPI_VERSION(NV_EDID_V2,2) +#define NV_EDID_VER3 MAKE_NVAPI_VERSION(NV_EDID_V3,3) +#define NV_EDID_VER NV_EDID_VER3 + +//! @} + +//! \ingroup gpu +NVAPI_INTERFACE NvAPI_GPU_GetEDID(NvPhysicalGpuHandle hPhysicalGpu, NvU32 displayOutputId, NV_EDID *pEDID); + +//! \ingroup gpu +//! Used in NV_GPU_CONNECTOR_DATA +typedef enum _NV_GPU_CONNECTOR_TYPE +{ + NVAPI_GPU_CONNECTOR_VGA_15_PIN = 0x00000000, + NVAPI_GPU_CONNECTOR_TV_COMPOSITE = 0x00000010, + NVAPI_GPU_CONNECTOR_TV_SVIDEO = 0x00000011, + NVAPI_GPU_CONNECTOR_TV_HDTV_COMPONENT = 0x00000013, + NVAPI_GPU_CONNECTOR_TV_SCART = 0x00000014, + NVAPI_GPU_CONNECTOR_TV_COMPOSITE_SCART_ON_EIAJ4120 = 0x00000016, + NVAPI_GPU_CONNECTOR_TV_HDTV_EIAJ4120 = 0x00000017, + NVAPI_GPU_CONNECTOR_PC_POD_HDTV_YPRPB = 0x00000018, + NVAPI_GPU_CONNECTOR_PC_POD_SVIDEO = 0x00000019, + NVAPI_GPU_CONNECTOR_PC_POD_COMPOSITE = 0x0000001A, + NVAPI_GPU_CONNECTOR_DVI_I_TV_SVIDEO = 0x00000020, + NVAPI_GPU_CONNECTOR_DVI_I_TV_COMPOSITE = 0x00000021, + NVAPI_GPU_CONNECTOR_DVI_I = 0x00000030, + NVAPI_GPU_CONNECTOR_DVI_D = 0x00000031, + NVAPI_GPU_CONNECTOR_ADC = 0x00000032, + NVAPI_GPU_CONNECTOR_LFH_DVI_I_1 = 0x00000038, + NVAPI_GPU_CONNECTOR_LFH_DVI_I_2 = 0x00000039, + NVAPI_GPU_CONNECTOR_SPWG = 0x00000040, + NVAPI_GPU_CONNECTOR_OEM = 0x00000041, + NVAPI_GPU_CONNECTOR_DISPLAYPORT_EXTERNAL = 0x00000046, + NVAPI_GPU_CONNECTOR_DISPLAYPORT_INTERNAL = 0x00000047, + NVAPI_GPU_CONNECTOR_DISPLAYPORT_MINI_EXT = 0x00000048, + NVAPI_GPU_CONNECTOR_HDMI_A = 0x00000061, + NVAPI_GPU_CONNECTOR_HDMI_C_MINI = 0x00000063, + NVAPI_GPU_CONNECTOR_LFH_DISPLAYPORT_1 = 0x00000064, + NVAPI_GPU_CONNECTOR_LFH_DISPLAYPORT_2 = 0x00000065, + NVAPI_GPU_CONNECTOR_VIRTUAL_WFD = 0x00000070, + NVAPI_GPU_CONNECTOR_UNKNOWN = 0xFFFFFFFF, +} NV_GPU_CONNECTOR_TYPE; + +//////////////////////////////////////////////////////////////////////////////// +// +// NvAPI_TVOutput Information +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup tvapi +//! 
Used in NV_DISPLAY_TV_OUTPUT_INFO +typedef enum _NV_DISPLAY_TV_FORMAT +{ + NV_DISPLAY_TV_FORMAT_NONE = 0, + NV_DISPLAY_TV_FORMAT_SD_NTSCM = 0x00000001, + NV_DISPLAY_TV_FORMAT_SD_NTSCJ = 0x00000002, + NV_DISPLAY_TV_FORMAT_SD_PALM = 0x00000004, + NV_DISPLAY_TV_FORMAT_SD_PALBDGH = 0x00000008, + NV_DISPLAY_TV_FORMAT_SD_PALN = 0x00000010, + NV_DISPLAY_TV_FORMAT_SD_PALNC = 0x00000020, + NV_DISPLAY_TV_FORMAT_SD_576i = 0x00000100, + NV_DISPLAY_TV_FORMAT_SD_480i = 0x00000200, + NV_DISPLAY_TV_FORMAT_ED_480p = 0x00000400, + NV_DISPLAY_TV_FORMAT_ED_576p = 0x00000800, + NV_DISPLAY_TV_FORMAT_HD_720p = 0x00001000, + NV_DISPLAY_TV_FORMAT_HD_1080i = 0x00002000, + NV_DISPLAY_TV_FORMAT_HD_1080p = 0x00004000, + NV_DISPLAY_TV_FORMAT_HD_720p50 = 0x00008000, + NV_DISPLAY_TV_FORMAT_HD_1080p24 = 0x00010000, + NV_DISPLAY_TV_FORMAT_HD_1080i50 = 0x00020000, + NV_DISPLAY_TV_FORMAT_HD_1080p50 = 0x00040000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp30 = 0x00080000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp30_3840 = NV_DISPLAY_TV_FORMAT_UHD_4Kp30, + NV_DISPLAY_TV_FORMAT_UHD_4Kp25 = 0x00100000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp25_3840 = NV_DISPLAY_TV_FORMAT_UHD_4Kp25, + NV_DISPLAY_TV_FORMAT_UHD_4Kp24 = 0x00200000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp24_3840 = NV_DISPLAY_TV_FORMAT_UHD_4Kp24, + NV_DISPLAY_TV_FORMAT_UHD_4Kp24_SMPTE = 0x00400000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp50_3840 = 0x00800000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp60_3840 = 0x00900000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp30_4096 = 0x00A00000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp25_4096 = 0x00B00000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp24_4096 = 0x00C00000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp50_4096 = 0x00D00000, + NV_DISPLAY_TV_FORMAT_UHD_4Kp60_4096 = 0x00E00000, + + NV_DISPLAY_TV_FORMAT_SD_OTHER = 0x01000000, + NV_DISPLAY_TV_FORMAT_ED_OTHER = 0x02000000, + NV_DISPLAY_TV_FORMAT_HD_OTHER = 0x04000000, + + NV_DISPLAY_TV_FORMAT_ANY = 0x80000000, + +} NV_DISPLAY_TV_FORMAT; + + +//! \ingroup dispcontrol +//! @{ +#define NVAPI_MAX_VIEW_TARGET 2 +#define NVAPI_ADVANCED_MAX_VIEW_TARGET 4 + +#ifndef _NV_TARGET_VIEW_MODE_ +#define _NV_TARGET_VIEW_MODE_ + +//! Used in NvAPI_SetView(). +typedef enum _NV_TARGET_VIEW_MODE +{ + NV_VIEW_MODE_STANDARD = 0, + NV_VIEW_MODE_CLONE = 1, + NV_VIEW_MODE_HSPAN = 2, + NV_VIEW_MODE_VSPAN = 3, + NV_VIEW_MODE_DUALVIEW = 4, + NV_VIEW_MODE_MULTIVIEW = 5, +} NV_TARGET_VIEW_MODE; +#endif + +//! @} + + +// Following definitions are used in NvAPI_SetViewEx. + +//! Scaling modes - used in NvAPI_SetViewEx(). +//! \ingroup dispcontrol +typedef enum _NV_SCALING +{ + NV_SCALING_DEFAULT = 0, //!< No change + + // New Scaling Declarations + NV_SCALING_GPU_SCALING_TO_CLOSEST = 1, //!< Balanced - Full Screen + NV_SCALING_GPU_SCALING_TO_NATIVE = 2, //!< Force GPU - Full Screen + NV_SCALING_GPU_SCANOUT_TO_NATIVE = 3, //!< Force GPU - Centered\No Scaling + NV_SCALING_GPU_SCALING_TO_ASPECT_SCANOUT_TO_NATIVE = 5, //!< Force GPU - Aspect Ratio + NV_SCALING_GPU_SCALING_TO_ASPECT_SCANOUT_TO_CLOSEST = 6, //!< Balanced - Aspect Ratio + NV_SCALING_GPU_SCANOUT_TO_CLOSEST = 7, //!< Balanced - Centered\No Scaling + + // Legacy Declarations + NV_SCALING_MONITOR_SCALING = NV_SCALING_GPU_SCALING_TO_CLOSEST, + NV_SCALING_ADAPTER_SCALING = NV_SCALING_GPU_SCALING_TO_NATIVE, + NV_SCALING_CENTERED = NV_SCALING_GPU_SCANOUT_TO_NATIVE, + NV_SCALING_ASPECT_SCALING = NV_SCALING_GPU_SCALING_TO_ASPECT_SCANOUT_TO_NATIVE, + + NV_SCALING_CUSTOMIZED = 255 //!< For future use +} NV_SCALING; + +//! Rotate modes- used in NvAPI_SetViewEx(). +//! 
\ingroup dispcontrol +typedef enum _NV_ROTATE +{ + NV_ROTATE_0 = 0, + NV_ROTATE_90 = 1, + NV_ROTATE_180 = 2, + NV_ROTATE_270 = 3, + NV_ROTATE_IGNORED = 4, +} NV_ROTATE; + +//! Color formats- used in NvAPI_SetViewEx(). +//! \ingroup dispcontrol +#define NVFORMAT_MAKEFOURCC(ch0, ch1, ch2, ch3) \ + ((NvU32)(NvU8)(ch0) | ((NvU32)(NvU8)(ch1) << 8) | \ + ((NvU32)(NvU8)(ch2) << 16) | ((NvU32)(NvU8)(ch3) << 24 )) + + + +//! Color formats- used in NvAPI_SetViewEx(). +//! \ingroup dispcontrol +typedef enum _NV_FORMAT +{ + NV_FORMAT_UNKNOWN = 0, //!< unknown. Driver will choose one as following value. + NV_FORMAT_P8 = 41, //!< for 8bpp mode + NV_FORMAT_R5G6B5 = 23, //!< for 16bpp mode + NV_FORMAT_A8R8G8B8 = 21, //!< for 32bpp mode + NV_FORMAT_A16B16G16R16F = 113, //!< for 64bpp(floating point) mode. + +} NV_FORMAT; + +// TV standard + +typedef struct +{ + float x; //!< x-coordinate of the viewport top-left point + float y; //!< y-coordinate of the viewport top-left point + float w; //!< Width of the viewport + float h; //!< Height of the viewport +} NV_VIEWPORTF; + + + +//! \ingroup dispcontrol +//! The timing override is not supported yet; must be set to _AUTO. \n + + +typedef enum _NV_TIMING_OVERRIDE +{ + NV_TIMING_OVERRIDE_CURRENT = 0, //!< get the current timing + NV_TIMING_OVERRIDE_AUTO, //!< the timing the driver will use based the current policy + NV_TIMING_OVERRIDE_EDID, //!< EDID timing + NV_TIMING_OVERRIDE_DMT, //!< VESA DMT timing + NV_TIMING_OVERRIDE_DMT_RB, //!< VESA DMT timing with reduced blanking + NV_TIMING_OVERRIDE_CVT, //!< VESA CVT timing + NV_TIMING_OVERRIDE_CVT_RB, //!< VESA CVT timing with reduced blanking + NV_TIMING_OVERRIDE_GTF, //!< VESA GTF timing + NV_TIMING_OVERRIDE_EIA861, //!< EIA 861x pre-defined timing + NV_TIMING_OVERRIDE_ANALOG_TV, //!< analog SD/HDTV timing + NV_TIMING_OVERRIDE_CUST, //!< NV custom timings + NV_TIMING_OVERRIDE_NV_PREDEFINED, //!< NV pre-defined timing (basically the PsF timings) + NV_TIMING_OVERRIDE_NV_PSF = NV_TIMING_OVERRIDE_NV_PREDEFINED, + NV_TIMING_OVERRIDE_NV_ASPR, + NV_TIMING_OVERRIDE_SDI, //!< Override for SDI timing + + NV_TIMING_OVRRIDE_MAX, +}NV_TIMING_OVERRIDE; + + +#ifndef NV_TIMING_STRUCTS_DEFINED +#define NV_TIMING_STRUCTS_DEFINED + +//*********************** +// The Timing Structure +//*********************** +// +//! \ingroup dispcontrol +//! NVIDIA-specific timing extras \n +//! Used in NV_TIMING. +typedef struct tagNV_TIMINGEXT +{ + NvU32 flag; //!< Reserved for NVIDIA hardware-based enhancement, such as double-scan. + NvU16 rr; //!< Logical refresh rate to present + NvU32 rrx1k; //!< Physical vertical refresh rate in 0.001Hz + NvU32 aspect; //!< Display aspect ratio Hi(aspect):horizontal-aspect, Low(aspect):vertical-aspect + NvU16 rep; //!< Bit-wise pixel repetition factor: 0x1:no pixel repetition; 0x2:each pixel repeats twice horizontally,.. + NvU32 status; //!< Timing standard + NvU8 name[40]; //!< Timing name +}NV_TIMINGEXT; + + + +//! \ingroup dispcontrol +//!The very basic timing structure based on the VESA standard: +//! \code +//! |<----------------------------htotal--------------------------->| +//! ---------"active" video-------->|<-------blanking------>|<----- +//! |<-------hvisible-------->|<-hb->|<-hfp->|<-hsw->|<-hbp->|<-hb->| +//! --------- -+-------------------------+ | | | | | +//! A A | | | | | | | +//! : : | | | | | | | +//! : : | | | | | | | +//! :vertical| addressable video | | | | | | +//! : visible| | | | | | | +//! : : | | | | | | | +//! : : | | | | | | | +//! vertical V | | | | | | | +//! 
total --+-------------------------+ | | | | | +//! : vb border | | | | | +//! : -----------------------------------+ | | | | +//! : vfp front porch | | | | +//! : -------------------------------------------+ | | | +//! : vsw sync width | | | +//! : ---------------------------------------------------+ | | +//! : vbp back porch | | +//! : -----------------------------------------------------------+ | +//! V vb border | +//! ---------------------------------------------------------------------------+ +//! \endcode +typedef struct _NV_TIMING +{ + // VESA scan out timing parameters: + NvU16 HVisible; //!< horizontal visible + NvU16 HBorder; //!< horizontal border + NvU16 HFrontPorch; //!< horizontal front porch + NvU16 HSyncWidth; //!< horizontal sync width + NvU16 HTotal; //!< horizontal total + NvU8 HSyncPol; //!< horizontal sync polarity: 1-negative, 0-positive + + NvU16 VVisible; //!< vertical visible + NvU16 VBorder; //!< vertical border + NvU16 VFrontPorch; //!< vertical front porch + NvU16 VSyncWidth; //!< vertical sync width + NvU16 VTotal; //!< vertical total + NvU8 VSyncPol; //!< vertical sync polarity: 1-negative, 0-positive + + NvU16 interlaced; //!< 1-interlaced, 0-progressive + NvU32 pclk; //!< pixel clock in 10 kHz + + //other timing related extras + NV_TIMINGEXT etc; +}NV_TIMING; +#endif //NV_TIMING_STRUCTS_DEFINED + + +//! \addtogroup dispcontrol +//! Timing-related constants +//! @{ +#define NV_TIMING_H_SYNC_POSITIVE 0 +#define NV_TIMING_H_SYNC_NEGATIVE 1 +#define NV_TIMING_H_SYNC_DEFAULT NV_TIMING_H_SYNC_NEGATIVE +// +#define NV_TIMING_V_SYNC_POSITIVE 0 +#define NV_TIMING_V_SYNC_NEGATIVE 1 +#define NV_TIMING_V_SYNC_DEFAULT NV_TIMING_V_SYNC_POSITIVE +// +#define NV_TIMING_PROGRESSIVE 0 +#define NV_TIMING_INTERLACED 1 +#define NV_TIMING_INTERLACED_EXTRA_VBLANK_ON_FIELD2 1 +#define NV_TIMING_INTERLACED_NO_EXTRA_VBLANK_ON_FIELD2 2 +//! @} + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_SetView +// +//! \fn NvAPI_SetView(NvDisplayHandle hNvDisplay, NV_VIEW_TARGET_INFO *pTargetInfo, NV_TARGET_VIEW_MODE targetView) +//! This function lets the caller modify the target display arrangement of the selected source display handle in any nView mode. +//! It can also modify or extend the source display in Dualview mode. +//! \note Maps the selected source to the associated target Ids. +//! \note Display PATH with this API is limited to single GPU. DUALVIEW across GPUs cannot be enabled with this API. +//! +//! \deprecated Do not use this function - it is deprecated in release 290. Instead, use NvAPI_DISP_SetDisplayConfig. +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 90 +//! +//! \param [in] hNvDisplay NVIDIA Display selection. #NVAPI_DEFAULT_HANDLE is not allowed, it has to be a handle enumerated with NvAPI_EnumNVidiaDisplayHandle(). +//! \param [in] pTargetInfo Pointer to array of NV_VIEW_TARGET_INFO, specifying device properties in this view. +//! The first device entry in the array is the physical primary. +//! The device entry with the lowest source id is the desktop primary. +//! \param [in] targetCount Count of target devices specified in pTargetInfo. +//! \param [in] targetView Target view selected from NV_TARGET_VIEW_MODE. +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred +//! \retval NVAPI_INVALID_ARGUMENT Invalid input parameter. +// +/////////////////////////////////////////////////////////////////////////////// + +//! 
\ingroup dispcontrol +//! Used in NvAPI_SetView() and NvAPI_GetView() +typedef struct +{ + NvU32 version; //!< (IN) structure version + NvU32 count; //!< (IN) target count + struct + { + NvU32 deviceMask; //!< (IN/OUT) Device mask + NvU32 sourceId; //!< (IN/OUT) Source ID - values will be based on the number of heads exposed per GPU. + NvU32 bPrimary:1; //!< (OUT) Indicates if this is the GPU's primary view target. This is not the desktop GDI primary. + //!< NvAPI_SetView automatically selects the first target in NV_VIEW_TARGET_INFO index 0 as the GPU's primary view. + NvU32 bInterlaced:1; //!< (IN/OUT) Indicates if the timing being used on this monitor is interlaced. + NvU32 bGDIPrimary:1; //!< (IN/OUT) Indicates if this is the desktop GDI primary. + NvU32 bForceModeSet:1;//!< (IN) Used only on Win7 and higher during a call to NvAPI_SetView(). Turns off optimization & forces OS to set supplied mode. + } target[NVAPI_MAX_VIEW_TARGET]; +} NV_VIEW_TARGET_INFO; + +//! \ingroup dispcontrol +#define NV_VIEW_TARGET_INFO_VER MAKE_NVAPI_VERSION(NV_VIEW_TARGET_INFO,2) + + +//! \ingroup dispcontrol +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 290. Instead, use NvAPI_DISP_SetDisplayConfig.") +NVAPI_INTERFACE NvAPI_SetView(NvDisplayHandle hNvDisplay, NV_VIEW_TARGET_INFO *pTargetInfo, NV_TARGET_VIEW_MODE targetView); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_SetViewEx +// +//! \fn NvAPI_SetViewEx(NvDisplayHandle hNvDisplay, NV_DISPLAY_PATH_INFO *pPathInfo, NV_TARGET_VIEW_MODE displayView) +//! This function lets caller to modify the display arrangement for selected source display handle in any of the nview modes. +//! It also allows to modify or extend the source display in dualview mode. +//! \note Maps the selected source to the associated target Ids. +//! \note Display PATH with this API is limited to single GPU. DUALVIEW across GPUs cannot be enabled with this API. +//! +//! \deprecated Do not use this function - it is deprecated in release 290. Instead, use NvAPI_DISP_SetDisplayConfig. +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 95 +//! +//! \param [in] hNvDisplay NVIDIA Display selection. #NVAPI_DEFAULT_HANDLE is not allowed, it has to be a handle enumerated with +//! NvAPI_EnumNVidiaDisplayHandle(). +//! \param [in] pPathInfo Pointer to array of NV_VIEW_PATH_INFO, specifying device properties in this view. +//! The first device entry in the array is the physical primary. +//! The device entry with the lowest source id is the desktop primary. +//! \param [in] pathCount Count of paths specified in pPathInfo. +//! \param [in] displayView Display view selected from NV_TARGET_VIEW_MODE. +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred +//! \retval NVAPI_INVALID_ARGUMENT Invalid input parameter. +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup dispcontrol +#define NVAPI_MAX_DISPLAY_PATH NVAPI_MAX_VIEW_TARGET + +//! \ingroup dispcontrol +#define NVAPI_ADVANCED_MAX_DISPLAY_PATH NVAPI_ADVANCED_MAX_VIEW_TARGET + + + +//! \ingroup dispcontrol +//! Used in NV_DISPLAY_PATH_INFO. +typedef struct +{ + NvU32 deviceMask; //!< (IN) Device mask + NvU32 sourceId; //!< (IN) Values will be based on the number of heads exposed per GPU(0, 1?) + NvU32 bPrimary:1; //!< (IN/OUT) Indicates if this is the GPU's primary view target. This is not the desktop GDI primary. 
+ //!< NvAPI_SetViewEx() automatically selects the first target in NV_DISPLAY_PATH_INFO index 0 as the GPU's primary view. + NV_GPU_CONNECTOR_TYPE connector; //!< (IN) Specify connector type. For TV only. + + // source mode information + NvU32 width; //!< (IN) Width of the mode + NvU32 height; //!< (IN) Height of the mode + NvU32 depth; //!< (IN) Depth of the mode + NV_FORMAT colorFormat; //!< Color format if it needs to be specified. Not used now. + + //rotation setting of the mode + NV_ROTATE rotation; //!< (IN) Rotation setting. + + // the scaling mode + NV_SCALING scaling; //!< (IN) Scaling setting + + // Timing info + NvU32 refreshRate; //!< (IN) Refresh rate of the mode + NvU32 interlaced:1; //!< (IN) Interlaced mode flag + + NV_DISPLAY_TV_FORMAT tvFormat; //!< (IN) To choose the last TV format set this value to NV_DISPLAY_TV_FORMAT_NONE + + // Windows desktop position + NvU32 posx; //!< (IN/OUT) X-offset of this display on the Windows desktop + NvU32 posy; //!< (IN/OUT) Y-offset of this display on the Windows desktop + NvU32 bGDIPrimary:1; //!< (IN/OUT) Indicates if this is the desktop GDI primary. + + NvU32 bForceModeSet:1;//!< (IN) Used only on Win7 and higher during a call to NvAPI_SetViewEx(). Turns off optimization & forces OS to set supplied mode. + NvU32 bFocusDisplay:1;//!< (IN) If set, this display path should have the focus after the GPU topology change + NvU32 gpuId:24; //!< (IN) the physical display/target Gpu id which is the owner of the scan out (for SLI multimon, display from the slave Gpu) + +} NV_DISPLAY_PATH; + +//! \ingroup dispcontrol +//! Used in NvAPI_SetViewEx() and NvAPI_GetViewEx(). +typedef struct +{ + NvU32 version; //!< (IN) Structure version + NvU32 count; //!< (IN) Path count + NV_DISPLAY_PATH path[NVAPI_MAX_DISPLAY_PATH]; +} NV_DISPLAY_PATH_INFO_V3; + +//! \ingroup dispcontrol +//! Used in NvAPI_SetViewEx() and NvAPI_GetViewEx(). +typedef struct +{ + NvU32 version; //!< (IN) Structure version + NvU32 count; //!< (IN) Path count + NV_DISPLAY_PATH path[NVAPI_ADVANCED_MAX_DISPLAY_PATH]; +} NV_DISPLAY_PATH_INFO; + +//! \addtogroup dispcontrol +//! Macro for constructing the version fields of NV_DISPLAY_PATH_INFO +//! @{ +#define NV_DISPLAY_PATH_INFO_VER NV_DISPLAY_PATH_INFO_VER4 +#define NV_DISPLAY_PATH_INFO_VER4 MAKE_NVAPI_VERSION(NV_DISPLAY_PATH_INFO,4) +#define NV_DISPLAY_PATH_INFO_VER3 MAKE_NVAPI_VERSION(NV_DISPLAY_PATH_INFO,3) +#define NV_DISPLAY_PATH_INFO_VER2 MAKE_NVAPI_VERSION(NV_DISPLAY_PATH_INFO,2) +#define NV_DISPLAY_PATH_INFO_VER1 MAKE_NVAPI_VERSION(NV_DISPLAY_PATH_INFO,1) +//! @} + + +//! \ingroup dispcontrol +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 290. Instead, use NvAPI_DISP_SetDisplayConfig.") +NVAPI_INTERFACE NvAPI_SetViewEx(NvDisplayHandle hNvDisplay, NV_DISPLAY_PATH_INFO *pPathInfo, NV_TARGET_VIEW_MODE displayView); + + + +/////////////////////////////////////////////////////////////////////////////// +// SetDisplayConfig/GetDisplayConfig +/////////////////////////////////////////////////////////////////////////////// +//! \ingroup dispcontrol + +typedef struct _NV_POSITION +{ + NvS32 x; + NvS32 y; +} NV_POSITION; + +//! \ingroup dispcontrol +typedef struct _NV_RESOLUTION +{ + NvU32 width; + NvU32 height; + NvU32 colorDepth; +} NV_RESOLUTION; + +//! \ingroup dispcontrol +typedef struct _NV_DISPLAYCONFIG_PATH_ADVANCED_TARGET_INFO_V1 +{ + NvU32 version; + + // Rotation and Scaling + NV_ROTATE rotation; //!< (IN) rotation setting. + NV_SCALING scaling; //!< (IN) scaling setting. 
+ + // Refresh Rate + NvU32 refreshRate1K; //!< (IN) Non-interlaced Refresh Rate of the mode, multiplied by 1000, 0 = ignored + //!< This is the value which driver reports to the OS. + // Flags + NvU32 interlaced:1; //!< (IN) Interlaced mode flag, ignored if refreshRate == 0 + NvU32 primary:1; //!< (IN) Declares primary display in clone configuration. This is *NOT* GDI Primary. + //!< Only one target can be primary per source. If no primary is specified, the first + //!< target will automatically be primary. +#ifdef NV_PAN_AND_SCAN_DEFINED + NvU32 isPanAndScanTarget:1; //!< Whether on this target Pan and Scan is enabled or has to be enabled. Valid only + //!< when the target is part of clone topology. +#else + NvU32 reservedBit1:1; +#endif + NvU32 disableVirtualModeSupport:1; + NvU32 isPreferredUnscaledTarget:1; + NvU32 reserved:27; + // TV format information + NV_GPU_CONNECTOR_TYPE connector; //!< Specify connector type. For TV only, ignored if tvFormat == NV_DISPLAY_TV_FORMAT_NONE + NV_DISPLAY_TV_FORMAT tvFormat; //!< (IN) to choose the last TV format set this value to NV_DISPLAY_TV_FORMAT_NONE + //!< In case of NvAPI_DISP_GetDisplayConfig(), this field will indicate the currently applied TV format; + //!< if no TV format is applied, this field will have NV_DISPLAY_TV_FORMAT_NONE value. + //!< In case of NvAPI_DISP_SetDisplayConfig(), this field should only be set in case of TVs; + //!< for other displays this field will be ignored and resolution & refresh rate specified in input will be used to apply the TV format. + + // Backend (raster) timing standard + NV_TIMING_OVERRIDE timingOverride; //!< Ignored if timingOverride == NV_TIMING_OVERRIDE_CURRENT + NV_TIMING timing; //!< Scan out timing, valid only if timingOverride == NV_TIMING_OVERRIDE_CUST + //!< The value NV_TIMING::NV_TIMINGEXT::rrx1k is obtained from the EDID. The driver may + //!< tweak this value for HDTV, stereo, etc., before reporting it to the OS. +} NV_DISPLAYCONFIG_PATH_ADVANCED_TARGET_INFO_V1; + +//! \ingroup dispcontrol +typedef NV_DISPLAYCONFIG_PATH_ADVANCED_TARGET_INFO_V1 NV_DISPLAYCONFIG_PATH_ADVANCED_TARGET_INFO; + +//! \ingroup dispcontrol +#define NV_DISPLAYCONFIG_PATH_ADVANCED_TARGET_INFO_VER1 MAKE_NVAPI_VERSION(NV_DISPLAYCONFIG_PATH_ADVANCED_TARGET_INFO_V1,1) + +//! \ingroup dispcontrol +#define NV_DISPLAYCONFIG_PATH_ADVANCED_TARGET_INFO_VER NV_DISPLAYCONFIG_PATH_ADVANCED_TARGET_INFO_VER1 + +//! \ingroup dispcontrol +typedef struct _NV_DISPLAYCONFIG_PATH_TARGET_INFO_V1 +{ + NvU32 displayId; //!< Display ID + NV_DISPLAYCONFIG_PATH_ADVANCED_TARGET_INFO* details; //!< May be NULL if no advanced settings are required. NULL for Non-NVIDIA Display. +} NV_DISPLAYCONFIG_PATH_TARGET_INFO_V1; + +//! \ingroup dispcontrol +typedef struct _NV_DISPLAYCONFIG_PATH_TARGET_INFO_V2 +{ + NvU32 displayId; //!< Display ID + NV_DISPLAYCONFIG_PATH_ADVANCED_TARGET_INFO* details; //!< May be NULL if no advanced settings are required + NvU32 targetId; //!< Windows CCD target ID. Must be present only for non-NVIDIA adapter, for NVIDIA adapter this parameter is ignored. +} NV_DISPLAYCONFIG_PATH_TARGET_INFO_V2; + + +//! \ingroup dispcontrol +//! As version is not defined for this structure, we will be using version of NV_DISPLAYCONFIG_PATH_INFO +typedef NV_DISPLAYCONFIG_PATH_TARGET_INFO_V2 NV_DISPLAYCONFIG_PATH_TARGET_INFO; + + +//! 
\ingroup dispcontrol +typedef enum _NV_DISPLAYCONFIG_SPANNING_ORIENTATION +{ + NV_DISPLAYCONFIG_SPAN_NONE = 0, + NV_DISPLAYCONFIG_SPAN_HORIZONTAL = 1, + NV_DISPLAYCONFIG_SPAN_VERTICAL = 2, +} NV_DISPLAYCONFIG_SPANNING_ORIENTATION; + +//! \ingroup dispcontrol +typedef struct _NV_DISPLAYCONFIG_SOURCE_MODE_INFO_V1 +{ + NV_RESOLUTION resolution; + NV_FORMAT colorFormat; //!< Ignored at present, must be NV_FORMAT_UNKNOWN (0) + NV_POSITION position; //!< Is all positions are 0 or invalid, displays will be automatically + //!< positioned from left to right with GDI Primary at 0,0, and all + //!< other displays in the order of the path array. + NV_DISPLAYCONFIG_SPANNING_ORIENTATION spanningOrientation; //!< Spanning is only supported on XP + NvU32 bGDIPrimary : 1; + NvU32 bSLIFocus : 1; + NvU32 reserved : 30; //!< Must be 0 +} NV_DISPLAYCONFIG_SOURCE_MODE_INFO_V1; + + + +//! \ingroup dispcontrol +typedef struct _NV_DISPLAYCONFIG_PATH_INFO_V1 +{ + NvU32 version; + NvU32 reserved_sourceId; //!< This field is reserved. There is ongoing debate if we need this field. + //!< Identifies sourceIds used by Windows. If all sourceIds are 0, + //!< these will be computed automatically. + NvU32 targetInfoCount; //!< Number of elements in targetInfo array + NV_DISPLAYCONFIG_PATH_TARGET_INFO_V1* targetInfo; + NV_DISPLAYCONFIG_SOURCE_MODE_INFO_V1* sourceModeInfo; //!< May be NULL if mode info is not important +} NV_DISPLAYCONFIG_PATH_INFO_V1; + +//! \ingroup dispcontrol +//! This define is temporary and must be removed once DVS failure is fixed. +#define _NV_DISPLAYCONFIG_PATH_INFO_V2 _NV_DISPLAYCONFIG_PATH_INFO + +//! \ingroup dispcontrol +typedef struct _NV_DISPLAYCONFIG_PATH_INFO_V2 +{ + NvU32 version; + union { + NvU32 sourceId; //!< Identifies sourceId used by Windows CCD. This can be optionally set. + NvU32 reserved_sourceId; //!< Only for compatibility + }; + + NvU32 targetInfoCount; //!< Number of elements in targetInfo array + NV_DISPLAYCONFIG_PATH_TARGET_INFO_V2* targetInfo; + NV_DISPLAYCONFIG_SOURCE_MODE_INFO_V1* sourceModeInfo; //!< May be NULL if mode info is not important + NvU32 IsNonNVIDIAAdapter : 1; //!< True for non-NVIDIA adapter. + NvU32 reserved : 31; //!< Must be 0 + void *pOSAdapterID; //!< Used by Non-NVIDIA adapter for pointer to OS Adapter of LUID + //!< type, type casted to void *. +} NV_DISPLAYCONFIG_PATH_INFO_V2; + +//! \ingroup dispcontrol +#define NV_DISPLAYCONFIG_PATH_INFO_VER1 MAKE_NVAPI_VERSION(NV_DISPLAYCONFIG_PATH_INFO_V1,1) + +//! \ingroup dispcontrol +#define NV_DISPLAYCONFIG_PATH_INFO_VER2 MAKE_NVAPI_VERSION(NV_DISPLAYCONFIG_PATH_INFO_V2,2) + +#ifndef NV_DISPLAYCONFIG_PATH_INFO_VER + +typedef NV_DISPLAYCONFIG_PATH_INFO_V2 NV_DISPLAYCONFIG_PATH_INFO; + +#define NV_DISPLAYCONFIG_PATH_INFO_VER NV_DISPLAYCONFIG_PATH_INFO_VER2 + +typedef NV_DISPLAYCONFIG_SOURCE_MODE_INFO_V1 NV_DISPLAYCONFIG_SOURCE_MODE_INFO; + +#endif + + +//! \ingroup dispcontrol +typedef enum _NV_DISPLAYCONFIG_FLAGS +{ + NV_DISPLAYCONFIG_VALIDATE_ONLY = 0x00000001, + NV_DISPLAYCONFIG_SAVE_TO_PERSISTENCE = 0x00000002, + NV_DISPLAYCONFIG_DRIVER_RELOAD_ALLOWED = 0x00000004, //!< Driver reload is permitted if necessary + NV_DISPLAYCONFIG_FORCE_MODE_ENUMERATION = 0x00000008, //!< Refresh OS mode list. 
+} NV_DISPLAYCONFIG_FLAGS; + + +#define NVAPI_UNICODE_STRING_MAX 2048 +#define NVAPI_BINARY_DATA_MAX 4096 + +typedef NvU16 NvAPI_UnicodeString[NVAPI_UNICODE_STRING_MAX]; +typedef const NvU16 *NvAPI_LPCWSTR; +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetDisplayDriverVersion +//! \fn NvAPI_GetDisplayDriverVersion(NvDisplayHandle hNvDisplay, NV_DISPLAY_DRIVER_VERSION *pVersion) +//! This function returns a struct that describes aspects of the display driver +//! build. +//! +//! \deprecated Do not use this function - it is deprecated in release 290. Instead, use NvAPI_SYS_GetDriverAndBranchVersion. +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 80 +//! +//! \param [in] hNvDisplay NVIDIA display handle. +//! \param [out] pVersion Pointer to NV_DISPLAY_DRIVER_VERSION struc +//! +//! \retval NVAPI_ERROR +//! \retval NVAPI_OK +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup driverapi +//! Used in NvAPI_GetDisplayDriverVersion() +typedef struct +{ + NvU32 version; // Structure version + NvU32 drvVersion; + NvU32 bldChangeListNum; + NvAPI_ShortString szBuildBranchString; + NvAPI_ShortString szAdapterString; +} NV_DISPLAY_DRIVER_VERSION; + +//! \ingroup driverapi +#define NV_DISPLAY_DRIVER_VERSION_VER MAKE_NVAPI_VERSION(NV_DISPLAY_DRIVER_VERSION,1) + + +//! \ingroup driverapi +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 290. Instead, use NvAPI_SYS_GetDriverAndBranchVersion.") +NVAPI_INTERFACE NvAPI_GetDisplayDriverVersion(NvDisplayHandle hNvDisplay, NV_DISPLAY_DRIVER_VERSION *pVersion); + + + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_OGL_ExpertModeSet[Get] +// +//! \name NvAPI_OGL_ExpertModeSet[Get] Functions +//@{ +//! This function configures OpenGL Expert Mode, an API usage feedback and +//! advice reporting mechanism. The effects of this call are +//! applied only to the current context, and are reset to the +//! defaults when the context is destroyed. +//! +//! \note This feature is valid at runtime only when GLExpert +//! functionality has been built into the OpenGL driver +//! installed on the system. All Windows Vista OpenGL +//! drivers provided by NVIDIA have this instrumentation +//! included by default. Windows XP, however, requires a +//! special display driver available with the NVIDIA +//! PerfSDK found at developer.nvidia.com. +//! +//! \note These functions are valid only for the current OpenGL +//! context. Calling these functions prior to creating a +//! context and calling MakeCurrent with it will result +//! in errors and undefined behavior. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \param expertDetailMask Mask made up of NVAPI_OGLEXPERT_DETAIL bits, +//! this parameter specifies the detail level in +//! the feedback stream. +//! +//! \param expertReportMask Mask made up of NVAPI_OGLEXPERT_REPORT bits, +//! this parameter specifies the areas of +//! functional interest. +//! +//! \param expertOutputMask Mask made up of NVAPI_OGLEXPERT_OUTPUT bits, +//! this parameter specifies the feedback output +//! location. +//! +//! \param expertCallback Used in conjunction with OUTPUT_TO_CALLBACK, +//! this is a simple callback function the user +//! may use to obtain the feedback stream. The +//! function will be called once per fully +//! qualified feedback stream extry. +//! +//! 
\retval NVAPI_API_NOT_INTIALIZED NVAPI not initialized +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU found +//! \retval NVAPI_OPENGL_CONTEXT_NOT_CURRENT No NVIDIA OpenGL context +//! which supports GLExpert +//! has been made current +//! \retval NVAPI_ERROR OpenGL driver failed to load properly +//! \retval NVAPI_OK Success +// +/////////////////////////////////////////////////////////////////////////////// + +//! \addtogroup oglapi +//! @{ +#define NVAPI_OGLEXPERT_DETAIL_NONE 0x00000000 +#define NVAPI_OGLEXPERT_DETAIL_ERROR 0x00000001 +#define NVAPI_OGLEXPERT_DETAIL_SWFALLBACK 0x00000002 +#define NVAPI_OGLEXPERT_DETAIL_BASIC_INFO 0x00000004 +#define NVAPI_OGLEXPERT_DETAIL_DETAILED_INFO 0x00000008 +#define NVAPI_OGLEXPERT_DETAIL_PERFORMANCE_WARNING 0x00000010 +#define NVAPI_OGLEXPERT_DETAIL_QUALITY_WARNING 0x00000020 +#define NVAPI_OGLEXPERT_DETAIL_USAGE_WARNING 0x00000040 +#define NVAPI_OGLEXPERT_DETAIL_ALL 0xFFFFFFFF + +#define NVAPI_OGLEXPERT_REPORT_NONE 0x00000000 +#define NVAPI_OGLEXPERT_REPORT_ERROR 0x00000001 +#define NVAPI_OGLEXPERT_REPORT_SWFALLBACK 0x00000002 +#define NVAPI_OGLEXPERT_REPORT_PIPELINE_VERTEX 0x00000004 +#define NVAPI_OGLEXPERT_REPORT_PIPELINE_GEOMETRY 0x00000008 +#define NVAPI_OGLEXPERT_REPORT_PIPELINE_XFB 0x00000010 +#define NVAPI_OGLEXPERT_REPORT_PIPELINE_RASTER 0x00000020 +#define NVAPI_OGLEXPERT_REPORT_PIPELINE_FRAGMENT 0x00000040 +#define NVAPI_OGLEXPERT_REPORT_PIPELINE_ROP 0x00000080 +#define NVAPI_OGLEXPERT_REPORT_PIPELINE_FRAMEBUFFER 0x00000100 +#define NVAPI_OGLEXPERT_REPORT_PIPELINE_PIXEL 0x00000200 +#define NVAPI_OGLEXPERT_REPORT_PIPELINE_TEXTURE 0x00000400 +#define NVAPI_OGLEXPERT_REPORT_OBJECT_BUFFEROBJECT 0x00000800 +#define NVAPI_OGLEXPERT_REPORT_OBJECT_TEXTURE 0x00001000 +#define NVAPI_OGLEXPERT_REPORT_OBJECT_PROGRAM 0x00002000 +#define NVAPI_OGLEXPERT_REPORT_OBJECT_FBO 0x00004000 +#define NVAPI_OGLEXPERT_REPORT_FEATURE_SLI 0x00008000 +#define NVAPI_OGLEXPERT_REPORT_ALL 0xFFFFFFFF + + +#define NVAPI_OGLEXPERT_OUTPUT_TO_NONE 0x00000000 +#define NVAPI_OGLEXPERT_OUTPUT_TO_CONSOLE 0x00000001 +#define NVAPI_OGLEXPERT_OUTPUT_TO_DEBUGGER 0x00000004 +#define NVAPI_OGLEXPERT_OUTPUT_TO_CALLBACK 0x00000008 +#define NVAPI_OGLEXPERT_OUTPUT_TO_ALL 0xFFFFFFFF + +//! @} + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION TYPE: NVAPI_OGLEXPERT_CALLBACK +// +//! DESCRIPTION: Used in conjunction with OUTPUT_TO_CALLBACK, this is a simple +//! callback function the user may use to obtain the feedback +//! stream. The function will be called once per fully qualified +//! feedback stream entry. +//! +//! \param categoryId Contains the bit from the NVAPI_OGLEXPERT_REPORT +//! mask that corresponds to the current message +//! \param messageId Unique ID for the current message +//! \param detailLevel Contains the bit from the NVAPI_OGLEXPERT_DETAIL +//! mask that corresponds to the current message +//! \param objectId Unique ID of the object that corresponds to the +//! current message +//! \param messageStr Text string from the current message +//! +//! \ingroup oglapi +/////////////////////////////////////////////////////////////////////////////// +typedef void (* NVAPI_OGLEXPERT_CALLBACK) (unsigned int categoryId, unsigned int messageId, unsigned int detailLevel, int objectId, const char *messageStr); + + + +//! \ingroup oglapi +//! SUPPORTED OS: Windows XP and higher +//! 
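Putting the detail/report/output masks and the NVAPI_OGLEXPERT_CALLBACK type together, here is a hedged sketch of enabling callback-based GLExpert feedback via the NvAPI_OGL_ExpertModeSet entry point declared just below. Per the notes above, this is only meaningful after an NVIDIA OpenGL context has been created and made current; the function and mask names come from this header, everything else is illustrative.

#include <stdio.h>
#include "nvapi.h"

/* Receives one fully qualified feedback stream entry per invocation. */
static void ExpertCallback(unsigned int categoryId, unsigned int messageId,
                           unsigned int detailLevel, int objectId,
                           const char *messageStr)
{
    printf("[GLExpert] cat=0x%x msg=%u detail=0x%x obj=%d: %s\n",
           categoryId, messageId, detailLevel, objectId, messageStr);
}

/* Route all GLExpert report categories, at full detail, to the callback above. */
static bool EnableGLExpertFeedback()
{
    return NvAPI_OGL_ExpertModeSet(NVAPI_OGLEXPERT_DETAIL_ALL,
                                   NVAPI_OGLEXPERT_REPORT_ALL,
                                   NVAPI_OGLEXPERT_OUTPUT_TO_CALLBACK,
                                   ExpertCallback) == NVAPI_OK;
}

Using NVAPI_OGLEXPERT_OUTPUT_TO_CALLBACK keeps feedback inside the application; OUTPUT_TO_CONSOLE or OUTPUT_TO_DEBUGGER could be substituted if no callback handling is wanted.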
+NVAPI_INTERFACE NvAPI_OGL_ExpertModeSet(NvU32 expertDetailLevel, + NvU32 expertReportMask, + NvU32 expertOutputMask, + NVAPI_OGLEXPERT_CALLBACK expertCallback); + +//! \addtogroup oglapi +//! SUPPORTED OS: Windows XP and higher +//! +NVAPI_INTERFACE NvAPI_OGL_ExpertModeGet(NvU32 *pExpertDetailLevel, + NvU32 *pExpertReportMask, + NvU32 *pExpertOutputMask, + NVAPI_OGLEXPERT_CALLBACK *pExpertCallback); + +//@} +/////////////////////////////////////////////////////////////////////////////// +// +//! \name NvAPI_OGL_ExpertModeDefaultsSet[Get] Functions +//! +//@{ +//! This function configures OpenGL Expert Mode global defaults. These settings +//! apply to any OpenGL application which starts up after these +//! values are applied (i.e. these settings *do not* apply to +//! currently running applications). +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \param expertDetailLevel Value which specifies the detail level in +//! the feedback stream. This is a mask made up +//! of NVAPI_OGLEXPERT_LEVEL bits. +//! +//! \param expertReportMask Mask made up of NVAPI_OGLEXPERT_REPORT bits, +//! this parameter specifies the areas of +//! functional interest. +//! +//! \param expertOutputMask Mask made up of NVAPI_OGLEXPERT_OUTPUT bits, +//! this parameter specifies the feedback output +//! location. Note that using OUTPUT_TO_CALLBACK +//! here is meaningless and has no effect, but +//! using it will not cause an error. +//! +//! \return ::NVAPI_ERROR or ::NVAPI_OK +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup oglapi +//! SUPPORTED OS: Windows XP and higher +//! +NVAPI_INTERFACE NvAPI_OGL_ExpertModeDefaultsSet(NvU32 expertDetailLevel, + NvU32 expertReportMask, + NvU32 expertOutputMask); + +//! \addtogroup oglapi +//! SUPPORTED OS: Windows XP and higher +//! +NVAPI_INTERFACE NvAPI_OGL_ExpertModeDefaultsGet(NvU32 *pExpertDetailLevel, + NvU32 *pExpertReportMask, + NvU32 *pExpertOutputMask); +//@} + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_EnumTCCPhysicalGPUs +// +//! This function returns an array of physical GPU handles that are in TCC Mode. +//! Each handle represents a physical GPU present in the system in TCC Mode. +//! That GPU may not be visible to the OS directly. +//! +//! The array nvGPUHandle will be filled with physical GPU handle values. The returned +//! gpuCount determines how many entries in the array are valid. +//! +//! NOTE: Handles enumerated by this API are only valid for NvAPIs that are tagged as TCC_SUPPORTED +//! If handle is passed to any other API, it will fail with NVAPI_INVALID_HANDLE +//! +//! For WDDM GPU handles please use NvAPI_EnumPhysicalGPUs() +//! +//! SUPPORTED OS: Windows Vista and higher, Mac OS X +//! +//! +//! +//! \param [out] nvGPUHandle Physical GPU array that will contain all TCC Physical GPUs +//! \param [out] pGpuCount count represent the number of valid entries in nvGPUHandle +//! +//! +//! \retval NVAPI_INVALID_ARGUMENT nvGPUHandle or pGpuCount is NULL +//! \retval NVAPI_OK One or more handles were returned +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_EnumTCCPhysicalGPUs( NvPhysicalGpuHandle nvGPUHandle[NVAPI_MAX_PHYSICAL_GPUS], NvU32 *pGpuCount); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_EnumLogicalGPUs +// +//! 
This function returns an array of logical GPU handles. +//! +//! Each handle represents one or more GPUs acting in concert as a single graphics device. +//! +//! At least one GPU must be present in the system and running an NVIDIA display driver. +//! +//! The array nvGPUHandle will be filled with logical GPU handle values. The returned +//! gpuCount determines how many entries in the array are valid. +//! +//! \note All logical GPUs handles get invalidated on a GPU topology change, so the calling +//! application is required to renum the logical GPU handles to get latest physical handle +//! mapping after every GPU topology change activated by a call to NvAPI_SetGpuTopologies(). +//! +//! To detect if SLI rendering is enabled, use NvAPI_D3D_GetCurrentSLIState(). +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT nvGPUHandle or pGpuCount is NULL +//! \retval NVAPI_OK One or more handles were returned +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_EnumLogicalGPUs(NvLogicalGpuHandle nvGPUHandle[NVAPI_MAX_LOGICAL_GPUS], NvU32 *pGpuCount); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetPhysicalGPUsFromDisplay +// +//! This function returns an array of physical GPU handles associated with the specified display. +//! +//! At least one GPU must be present in the system and running an NVIDIA display driver. +//! +//! The array nvGPUHandle will be filled with physical GPU handle values. The returned +//! gpuCount determines how many entries in the array are valid. +//! +//! If the display corresponds to more than one physical GPU, the first GPU returned +//! is the one with the attached active output. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT hNvDisp is not valid; nvGPUHandle or pGpuCount is NULL +//! \retval NVAPI_OK One or more handles were returned +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND no NVIDIA GPU driving a display was found +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetPhysicalGPUsFromDisplay(NvDisplayHandle hNvDisp, NvPhysicalGpuHandle nvGPUHandle[NVAPI_MAX_PHYSICAL_GPUS], NvU32 *pGpuCount); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetPhysicalGPUFromUnAttachedDisplay +// +//! This function returns a physical GPU handle associated with the specified unattached display. +//! The source GPU is a physical render GPU which renders the frame buffer but may or may not drive the scan out. +//! +//! At least one GPU must be present in the system and running an NVIDIA display driver. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT hNvUnAttachedDisp is not valid or pPhysicalGpu is NULL. +//! \retval NVAPI_OK One or more handles were returned +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! 
\ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetPhysicalGPUFromUnAttachedDisplay(NvUnAttachedDisplayHandle hNvUnAttachedDisp, NvPhysicalGpuHandle *pPhysicalGpu); + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetLogicalGPUFromDisplay +// +//! This function returns the logical GPU handle associated with the specified display. +//! At least one GPU must be present in the system and running an NVIDIA display driver. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT hNvDisp is not valid; pLogicalGPU is NULL +//! \retval NVAPI_OK One or more handles were returned +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetLogicalGPUFromDisplay(NvDisplayHandle hNvDisp, NvLogicalGpuHandle *pLogicalGPU); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetLogicalGPUFromPhysicalGPU +// +//! This function returns the logical GPU handle associated with specified physical GPU handle. +//! At least one GPU must be present in the system and running an NVIDIA display driver. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGPU is not valid; pLogicalGPU is NULL +//! \retval NVAPI_OK One or more handles were returned +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetLogicalGPUFromPhysicalGPU(NvPhysicalGpuHandle hPhysicalGPU, NvLogicalGpuHandle *pLogicalGPU); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetPhysicalGPUsFromLogicalGPU +// +//! This function returns the physical GPU handles associated with the specified logical GPU handle. +//! At least one GPU must be present in the system and running an NVIDIA display driver. +//! +//! The array hPhysicalGPU will be filled with physical GPU handle values. The returned +//! gpuCount determines how many entries in the array are valid. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT hLogicalGPU is not valid; hPhysicalGPU is NULL +//! \retval NVAPI_OK One or more handles were returned +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_EXPECTED_LOGICAL_GPU_HANDLE hLogicalGPU was not a logical GPU handle +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetPhysicalGPUsFromLogicalGPU(NvLogicalGpuHandle hLogicalGPU,NvPhysicalGpuHandle hPhysicalGPU[NVAPI_MAX_PHYSICAL_GPUS], NvU32 *pGpuCount); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetGpuCoreCount +// +//! DESCRIPTION: Retrieves the total number of cores defined for a GPU. +//! Returns 0 on architectures that don't define GPU cores. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! TCC_SUPPORTED +//! +//! \retval ::NVAPI_INVALID_ARGUMENT pCount is NULL +//! \retval ::NVAPI_OK *pCount is set +//! 
\retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND no NVIDIA GPU driving a display was found +//! \retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle +//! \retval ::NVAPI_NOT_SUPPORTED API call is not supported on current architecture +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetGpuCoreCount(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pCount); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetAllOutputs +// +//! This function returns set of all GPU-output identifiers as a bitmask. +//! +//! \deprecated Do not use this function - it is deprecated in release 290. Instead, use NvAPI_GPU_GetAllDisplayIds. +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 85 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pOutputsMask is NULL. +//! \retval NVAPI_OK *pOutputsMask contains a set of GPU-output identifiers. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 290. Instead, use NvAPI_GPU_GetAllDisplayIds.") +NVAPI_INTERFACE NvAPI_GPU_GetAllOutputs(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pOutputsMask); + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetConnectedOutputs +// +//! This function is the same as NvAPI_GPU_GetAllOutputs() but returns only the set of GPU output +//! identifiers that are connected to display devices. +//! +//! \deprecated Do not use this function - it is deprecated in release 290. Instead, use NvAPI_GPU_GetConnectedDisplayIds. +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pOutputsMask is NULL. +//! \retval NVAPI_OK *pOutputsMask contains a set of GPU-output identifiers. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 290. Instead, use NvAPI_GPU_GetConnectedDisplayIds.") +NVAPI_INTERFACE NvAPI_GPU_GetConnectedOutputs(NvPhysicalGpuHandle hPhysicalGpu, NvU32 *pOutputsMask); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetConnectedSLIOutputs +// +//! DESCRIPTION: This function is the same as NvAPI_GPU_GetConnectedOutputs() but returns only the set of GPU-output +//! identifiers that can be selected in an SLI configuration. +//! NOTE: This function matches NvAPI_GPU_GetConnectedOutputs() +//! - On systems which are not SLI capable. +//! - If the queried GPU is not part of a valid SLI group. +//! +//! \deprecated Do not use this function - it is deprecated in release 290. Instead, use NvAPI_GPU_GetConnectedDisplayIds. +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 170 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pOutputsMask is NULL +//! 
\retval NVAPI_OK *pOutputsMask contains a set of GPU-output identifiers +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE: hPhysicalGpu was not a physical GPU handle +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 290. Instead, use NvAPI_GPU_GetConnectedDisplayIds.") +NVAPI_INTERFACE NvAPI_GPU_GetConnectedSLIOutputs(NvPhysicalGpuHandle hPhysicalGpu, NvU32 *pOutputsMask); + + + + +//! \ingroup gpu +typedef enum +{ + NV_MONITOR_CONN_TYPE_UNINITIALIZED = 0, + NV_MONITOR_CONN_TYPE_VGA, + NV_MONITOR_CONN_TYPE_COMPONENT, + NV_MONITOR_CONN_TYPE_SVIDEO, + NV_MONITOR_CONN_TYPE_HDMI, + NV_MONITOR_CONN_TYPE_DVI, + NV_MONITOR_CONN_TYPE_LVDS, + NV_MONITOR_CONN_TYPE_DP, + NV_MONITOR_CONN_TYPE_COMPOSITE, + NV_MONITOR_CONN_TYPE_UNKNOWN = -1 +} NV_MONITOR_CONN_TYPE; + + +//! \addtogroup gpu +//! @{ +#define NV_GPU_CONNECTED_IDS_FLAG_UNCACHED NV_BIT(0) //!< Get uncached connected devices +#define NV_GPU_CONNECTED_IDS_FLAG_SLI NV_BIT(1) //!< Get devices such that those can be selected in an SLI configuration +#define NV_GPU_CONNECTED_IDS_FLAG_LIDSTATE NV_BIT(2) //!< Get devices such that to reflect the Lid State +#define NV_GPU_CONNECTED_IDS_FLAG_FAKE NV_BIT(3) //!< Get devices that includes the fake connected monitors +#define NV_GPU_CONNECTED_IDS_FLAG_EXCLUDE_MST NV_BIT(4) //!< Excludes devices that are part of the multi stream topology. + +//! @} + +//! \ingroup gpu +typedef struct _NV_GPU_DISPLAYIDS +{ + NvU32 version; + NV_MONITOR_CONN_TYPE connectorType; //!< out: vga, tv, dvi, hdmi and dp. This is reserved for future use and clients should not rely on this information. Instead get the + //!< GPU connector type from NvAPI_GPU_GetConnectorInfo/NvAPI_GPU_GetConnectorInfoEx + NvU32 displayId; //!< this is a unique identifier for each device + NvU32 isDynamic:1; //!< if bit is set then this display is part of MST topology and it's a dynamic + NvU32 isMultiStreamRootNode:1; //!< if bit is set then this displayID belongs to a multi stream enabled connector(root node). Note that when multi stream is enabled and + //!< a single multi stream capable monitor is connected to it, the monitor will share the display id with the RootNode. + //!< When there is more than one monitor connected in a multi stream topology, then the root node will have a separate displayId. + NvU32 isActive:1; //!< if bit is set then this display is being actively driven + NvU32 isCluster:1; //!< if bit is set then this display is the representative display + NvU32 isOSVisible:1; //!< if bit is set, then this display is reported to the OS + NvU32 isWFD:1; //!< if bit is set, then this display is wireless + NvU32 isConnected:1; //!< if bit is set, then this display is connected + NvU32 reservedInternal:10; //!< Do not use + NvU32 isPhysicallyConnected:1; //!< if bit is set, then this display is a phycially connected display; Valid only when isConnected bit is set + NvU32 reserved: 14; //!< must be zero +} NV_GPU_DISPLAYIDS; + +//! \ingroup gpu +//! 
Macro for constructing the version field of ::_NV_GPU_DISPLAYIDS +#define NV_GPU_DISPLAYIDS_VER1 MAKE_NVAPI_VERSION(NV_GPU_DISPLAYIDS,1) +#define NV_GPU_DISPLAYIDS_VER2 MAKE_NVAPI_VERSION(NV_GPU_DISPLAYIDS,3) + +#define NV_GPU_DISPLAYIDS_VER NV_GPU_DISPLAYIDS_VER2 + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetConnectedDisplayIds +// +//! \code +//! DESCRIPTION: Due to space limitation NvAPI_GPU_GetConnectedOutputs can return maximum 32 devices, but +//! this is no longer true for DPMST. NvAPI_GPU_GetConnectedDisplayIds will return all +//! the connected display devices in the form of displayIds for the associated hPhysicalGpu. +//! This function can accept set of flags to request cached, uncached, sli and lid to get the connected devices. +//! Default value for flags will be cached . +//! HOW TO USE: 1) for each PhysicalGpu, make a call to get the number of connected displayId's +//! using NvAPI_GPU_GetConnectedDisplayIds by passing the pDisplayIds as NULL +//! On call success: +//! 2) Allocate memory based on pDisplayIdCount then make a call NvAPI_GPU_GetConnectedDisplayIds to populate DisplayIds +//! SUPPORTED OS: Windows XP and higher +//! +//! PARAMETERS: hPhysicalGpu (IN) - GPU selection +//! flags (IN) - One or more defines from NV_GPU_CONNECTED_IDS_FLAG_* as valid flags. +//! pDisplayIds (IN/OUT) - Pointer to an NV_GPU_DISPLAYIDS struct, each entry represents a one displayID and its attributes +//! pDisplayIdCount(OUT)- Number of displayId's. +//! +//! RETURN STATUS: NVAPI_INVALID_ARGUMENT: hPhysicalGpu or pDisplayIds or pDisplayIdCount is NULL +//! NVAPI_OK: *pDisplayIds contains a set of GPU-output identifiers +//! NVAPI_NVIDIA_DEVICE_NOT_FOUND: no NVIDIA GPU driving a display was found +//! NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE: hPhysicalGpu was not a physical GPU handle +//! \endcode +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetConnectedDisplayIds(__in NvPhysicalGpuHandle hPhysicalGpu, __inout_ecount_part_opt(*pDisplayIdCount, *pDisplayIdCount) NV_GPU_DISPLAYIDS* pDisplayIds, __inout NvU32* pDisplayIdCount, __in NvU32 flags); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetAllDisplayIds +// +//! DESCRIPTION: This API returns display IDs for all possible outputs on the GPU. +//! For DPMST connector, it will return display IDs for all the video sinks in the topology. \n +//! HOW TO USE: 1. The first call should be made to get the all display ID count. To get the display ID count, send in \n +//! a) hPhysicalGpu - a valid GPU handle(enumerated using NvAPI_EnumPhysicalGPUs()) as input, \n +//! b) pDisplayIds - NULL, as we just want to get the display ID count. \n +//! c) pDisplayIdCount - a valid pointer to NvU32, whose value is set to ZERO. \n +//! If all parameters are correct and this call is successful, this call will return the display ID's count. \n +//! 2. To get the display ID array, make the second call to NvAPI_GPU_GetAllDisplayIds() with \n +//! a) hPhysicalGpu - should be same value which was sent in first call, \n +//! b) pDisplayIds - pointer to the display ID array allocated by caller based on display ID count, \n +//! eg. malloc(sizeof(NV_GPU_DISPLAYIDS) * pDisplayIdCount). \n +//! c) pDisplayIdCount - a valid pointer to NvU32. This indicates for how many display IDs \n +//! the memory is allocated(pDisplayIds) by the caller. \n +//! 
If all parameters are correct and this call is successful, this call will return the display ID array and actual +//! display ID count (which was obtained in the first call to NvAPI_GPU_GetAllDisplayIds). If the input display ID count is +//! less than the actual display ID count, it will overwrite the input and give the pDisplayIdCount as actual count and the +//! API will return NVAPI_INSUFFICIENT_BUFFER. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hPhysicalGpu GPU selection. +//! \param [in,out] DisplayIds Pointer to an array of NV_GPU_DISPLAYIDS structures, each entry represents one displayID +//! and its attributes. +//! \param [in,out] pDisplayIdCount As input, this parameter indicates the number of display's id's for which caller has +//! allocated the memory. As output, it will return the actual number of display IDs. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! +//! \retval NVAPI_INSUFFICIENT_BUFFER When the input buffer(pDisplayIds) is less than the actual number of display IDs, this API +//! will return NVAPI_INSUFFICIENT_BUFFER. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetAllDisplayIds(__in NvPhysicalGpuHandle hPhysicalGpu, __inout_ecount_part_opt(*pDisplayIdCount, *pDisplayIdCount) NV_GPU_DISPLAYIDS* pDisplayIds, __inout NvU32* pDisplayIdCount); + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetConnectedOutputsWithLidState +// +//! This function is similar to NvAPI_GPU_GetConnectedOutputs(), and returns the connected display identifiers that are connected +//! as an output mask but unlike NvAPI_GPU_GetConnectedOutputs() this API "always" reflects the Lid State in the output mask. +//! Thus if you expect the LID close state to be available in the connection mask use this API. +//! - If LID is closed then this API will remove the LID panel from the connected display identifiers. +//! - If LID is open then this API will reflect the LID panel in the connected display identifiers. +//! +//! \note This API should be used on notebook systems and on systems where the LID state is required in the connection +//! output mask. On desktop systems the returned identifiers will match NvAPI_GPU_GetConnectedOutputs(). +//! +//! \deprecated Do not use this function - it is deprecated in release 290. Instead, use NvAPI_GPU_GetConnectedDisplayIds. +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 95 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pOutputsMask is NULL +//! \retval NVAPI_OK *pOutputsMask contains a set of GPU-output identifiers +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 290. Instead, use NvAPI_GPU_GetConnectedDisplayIds.") +NVAPI_INTERFACE NvAPI_GPU_GetConnectedOutputsWithLidState(NvPhysicalGpuHandle hPhysicalGpu, NvU32 *pOutputsMask); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetConnectedSLIOutputsWithLidState +// +//! 
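A minimal sketch (not taken from the NVAPI documentation) of the two-call pattern described above for
+//! NvAPI_GPU_GetConnectedDisplayIds() and NvAPI_GPU_GetAllDisplayIds(); hPhysicalGpu is assumed to be a valid
+//! handle from NvAPI_EnumPhysicalGPUs(), and error handling is trimmed.
+//! \code
+//!     NvU32 count = 0;
+//!     // First call with pDisplayIds == NULL only returns the display ID count
+//!     if (NvAPI_GPU_GetConnectedDisplayIds(hPhysicalGpu, NULL, &count, 0) == NVAPI_OK && count > 0)
+//!     {
+//!         NV_GPU_DISPLAYIDS *ids = (NV_GPU_DISPLAYIDS*) calloc(count, sizeof(NV_GPU_DISPLAYIDS));
+//!         for (NvU32 i = 0; i < count; i++)
+//!             ids[i].version = NV_GPU_DISPLAYIDS_VER;     // struct version set on each entry
+//!         // Second call fills the caller-allocated array
+//!         if (NvAPI_GPU_GetConnectedDisplayIds(hPhysicalGpu, ids, &count, 0) == NVAPI_OK)
+//!         {
+//!             for (NvU32 i = 0; i < count; i++)
+//!                 printf("displayId 0x%08x\n", (unsigned)ids[i].displayId);
+//!         }
+//!         free(ids);
+//!     }
+//! \endcode
+//!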
DESCRIPTION: This function is the same as NvAPI_GPU_GetConnectedOutputsWithLidState() but returns only the set +//! of GPU-output identifiers that can be selected in an SLI configuration. With SLI disabled, +//! this function matches NvAPI_GPU_GetConnectedOutputsWithLidState(). +//! +//! \deprecated Do not use this function - it is deprecated in release 290. Instead, use NvAPI_GPU_GetConnectedDisplayIds. +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 170 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pOutputsMask is NULL +//! \retval NVAPI_OK *pOutputsMask contains a set of GPU-output identifiers +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 290. Instead, use NvAPI_GPU_GetConnectedDisplayIds.") +NVAPI_INTERFACE NvAPI_GPU_GetConnectedSLIOutputsWithLidState(NvPhysicalGpuHandle hPhysicalGpu, NvU32 *pOutputsMask); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetSystemType +// +//! \fn NvAPI_GPU_GetSystemType(NvPhysicalGpuHandle hPhysicalGpu, NV_SYSTEM_TYPE *pSystemType) +//! This function identifies whether the GPU is a notebook GPU or a desktop GPU. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 95 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pOutputsMask is NULL +//! \retval NVAPI_OK *pSystemType contains the GPU system type +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE: hPhysicalGpu was not a physical GPU handle +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup gpu +//! Used in NvAPI_GPU_GetSystemType() +typedef enum +{ + NV_SYSTEM_TYPE_UNKNOWN = 0, + NV_SYSTEM_TYPE_LAPTOP = 1, + NV_SYSTEM_TYPE_DESKTOP = 2, + +} NV_SYSTEM_TYPE; + + + +//! \ingroup gpu +NVAPI_INTERFACE NvAPI_GPU_GetSystemType(NvPhysicalGpuHandle hPhysicalGpu, NV_SYSTEM_TYPE *pSystemType); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetActiveOutputs +// +//! This function is the same as NvAPI_GPU_GetAllOutputs but returns only the set of GPU output +//! identifiers that are actively driving display devices. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 85 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pOutputsMask is NULL. +//! \retval NVAPI_OK *pOutputsMask contains a set of GPU-output identifiers. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetActiveOutputs(NvPhysicalGpuHandle hPhysicalGpu, NvU32 *pOutputsMask); + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_SetEDID +// +//! Thus function sets the EDID data for the specified GPU handle and connection bit mask. +//! 
User can either send (Gpu handle & output id) or only display Id in variable displayOutputId parameter & hPhysicalGpu parameter can be default handle (0). +//! \note The EDID will be cached across the boot session and will be enumerated to the OS in this call. +//! To remove the EDID set sizeofEDID to zero. +//! OS and NVAPI connection status APIs will reflect the newly set or removed EDID dynamically. +//! +//! This feature will NOT be supported on the following boards: +//! - GeForce +//! - Quadro VX +//! - Tesla +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 100 +//! +//! \retval NVAPI_INVALID_ARGUMENT pEDID is NULL; displayOutputId has 0 or > 1 bits set +//! \retval NVAPI_OK *pEDID data was applied to the requested displayOutputId. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE: hPhysicalGpu was not a physical GPU handle. +//! \retval NVAPI_NOT_SUPPORTED For the above mentioned GPUs +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_SetEDID(NvPhysicalGpuHandle hPhysicalGpu, NvU32 displayOutputId, NV_EDID *pEDID); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetOutputType +// +//! \fn NvAPI_GPU_GetOutputType(NvPhysicalGpuHandle hPhysicalGpu, NvU32 outputId, NV_GPU_OUTPUT_TYPE *pOutputType) +//! This function returns the output type for a specific physical GPU handle and outputId (exactly 1 bit set - see \ref handles). +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \Version Earliest supported ForceWare version: 82.61 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu, outputId, or pOutputsMask is NULL; or outputId has > 1 bit set +//! \retval NVAPI_OK *pOutputType contains a NvGpuOutputType value +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup gpu +//! used in NvAPI_GPU_GetOutputType() +typedef enum _NV_GPU_OUTPUT_TYPE +{ + NVAPI_GPU_OUTPUT_UNKNOWN = 0, + NVAPI_GPU_OUTPUT_CRT = 1, //!< CRT display device + NVAPI_GPU_OUTPUT_DFP = 2, //!< Digital Flat Panel display device + NVAPI_GPU_OUTPUT_TV = 3, //!< TV display device +} NV_GPU_OUTPUT_TYPE; + + + + +//! \ingroup gpu +NVAPI_INTERFACE NvAPI_GPU_GetOutputType(NvPhysicalGpuHandle hPhysicalGpu, NvU32 outputId, NV_GPU_OUTPUT_TYPE *pOutputType); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_ValidateOutputCombination +// +//! This function determines if a set of GPU outputs can be active +//! simultaneously. While a GPU may have outputs, typically they cannot +//! all be active at the same time due to internal resource sharing. +//! +//! Given a physical GPU handle and a mask of candidate outputs, this call +//! will return NVAPI_OK if all of the specified outputs can be driven +//! simultaneously. It will return NVAPI_INVALID_COMBINATION if they cannot. +//! +//! Use NvAPI_GPU_GetAllOutputs() to determine which outputs are candidates. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 85 +//! +//! \retval NVAPI_OK Combination of outputs in outputsMask are valid (can be active simultaneously). +//! 
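A short sketch (outside the original documentation) of the check described above: take two candidate outputs
+//! from NvAPI_GPU_GetAllOutputs() and ask whether they can be driven at the same time. hPhysicalGpu is assumed
+//! to be a valid physical GPU handle.
+//! \code
+//!     NvU32 mask = 0;
+//!     if (NvAPI_GPU_GetAllOutputs(hPhysicalGpu, &mask) == NVAPI_OK)
+//!     {
+//!         NvU32 first  = mask & (0u - mask);                        // lowest candidate output bit
+//!         NvU32 second = (mask ^ first) & (0u - (mask ^ first));    // next candidate output bit
+//!         if (first && second &&
+//!             NvAPI_GPU_ValidateOutputCombination(hPhysicalGpu, first | second) == NVAPI_OK)
+//!         {
+//!             printf("the two outputs can be active simultaneously\n");
+//!         }
+//!     }
+//! \endcode
+//!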
\retval NVAPI_INVALID_COMBINATION Combination of outputs in outputsMask are NOT valid. +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or outputsMask does not have at least 2 bits set. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_ValidateOutputCombination(NvPhysicalGpuHandle hPhysicalGpu, NvU32 outputsMask); + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetFullName +// +//! This function retrieves the full GPU name as an ASCII string - for example, "Quadro FX 1400". +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 90 +//! +//! \return NVAPI_ERROR or NVAPI_OK +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetFullName(NvPhysicalGpuHandle hPhysicalGpu, NvAPI_ShortString szName); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetPCIIdentifiers +// +//! This function returns the PCI identifiers associated with this GPU. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 90 +//! +//! \param DeviceId The internal PCI device identifier for the GPU. +//! \param SubSystemId The internal PCI subsystem identifier for the GPU. +//! \param RevisionId The internal PCI device-specific revision identifier for the GPU. +//! \param ExtDeviceId The external PCI device identifier for the GPU. +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or an argument is NULL +//! \retval NVAPI_OK Arguments are populated with PCI identifiers +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetPCIIdentifiers(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pDeviceId,NvU32 *pSubSystemId,NvU32 *pRevisionId,NvU32 *pExtDeviceId); + + + + +//! \ingroup gpu +//! Used in NvAPI_GPU_GetGPUType(). +typedef enum _NV_GPU_TYPE +{ + NV_SYSTEM_TYPE_GPU_UNKNOWN = 0, + NV_SYSTEM_TYPE_IGPU = 1, //!< Integrated GPU + NV_SYSTEM_TYPE_DGPU = 2, //!< Discrete GPU +} NV_GPU_TYPE; + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetGPUType +// +//! DESCRIPTION: This function returns the GPU type (integrated or discrete). +//! See ::NV_GPU_TYPE. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 173 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu +//! \retval NVAPI_OK *pGpuType contains the GPU type +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE: hPhysicalGpu was not a physical GPU handle +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetGPUType(__in NvPhysicalGpuHandle hPhysicalGpu, __inout NV_GPU_TYPE *pGpuType); + + + + +//! \ingroup gpu +//! 
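Illustrative sketch (not part of the NVAPI documentation) combining the identification queries above -
+//! full name, PCI identifiers and GPU type - for one physical GPU handle obtained from enumeration.
+//! \code
+//!     NvAPI_ShortString name = { 0 };
+//!     NvU32 devId = 0, subSysId = 0, revId = 0, extDevId = 0;
+//!     NV_GPU_TYPE gpuType = NV_SYSTEM_TYPE_GPU_UNKNOWN;
+//!     if (NvAPI_GPU_GetFullName(hPhysicalGpu, name) == NVAPI_OK &&
+//!         NvAPI_GPU_GetPCIIdentifiers(hPhysicalGpu, &devId, &subSysId, &revId, &extDevId) == NVAPI_OK &&
+//!         NvAPI_GPU_GetGPUType(hPhysicalGpu, &gpuType) == NVAPI_OK)
+//!     {
+//!         printf("%s (PCI device id 0x%08x), %s GPU\n", name, (unsigned)devId,
+//!                gpuType == NV_SYSTEM_TYPE_DGPU ? "discrete" : "integrated/unknown");
+//!     }
+//! \endcode
+//!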
Used in NvAPI_GPU_GetBusType() +typedef enum _NV_GPU_BUS_TYPE +{ + NVAPI_GPU_BUS_TYPE_UNDEFINED = 0, + NVAPI_GPU_BUS_TYPE_PCI = 1, + NVAPI_GPU_BUS_TYPE_AGP = 2, + NVAPI_GPU_BUS_TYPE_PCI_EXPRESS = 3, + NVAPI_GPU_BUS_TYPE_FPCI = 4, + NVAPI_GPU_BUS_TYPE_AXI = 5, +} NV_GPU_BUS_TYPE; +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetBusType +// +//! This function returns the type of bus associated with this GPU. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 90 +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pBusType is NULL. +//! \retval NVAPI_OK *pBusType contains bus identifier. +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetBusType(NvPhysicalGpuHandle hPhysicalGpu,NV_GPU_BUS_TYPE *pBusType); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetBusId +// +//! DESCRIPTION: Returns the ID of the bus associated with this GPU. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 167 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pBusId is NULL. +//! \retval NVAPI_OK *pBusId contains the bus ID. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetBusId(NvPhysicalGpuHandle hPhysicalGpu, NvU32 *pBusId); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetBusSlotId +// +//! DESCRIPTION: Returns the ID of the bus slot associated with this GPU. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 167 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pBusSlotId is NULL. +//! \retval NVAPI_OK *pBusSlotId contains the bus slot ID. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetBusSlotId(NvPhysicalGpuHandle hPhysicalGpu, NvU32 *pBusSlotId); + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetIRQ +// +//! This function returns the interrupt number associated with this GPU. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 90 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pIRQ is NULL. +//! \retval NVAPI_OK *pIRQ contains interrupt number. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! 
\ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetIRQ(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pIRQ); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetVbiosRevision +// +//! This function returns the revision of the video BIOS associated with this GPU. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 90 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pBiosRevision is NULL. +//! \retval NVAPI_OK *pBiosRevision contains revision number. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetVbiosRevision(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pBiosRevision); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetVbiosOEMRevision +// +//! This function returns the OEM revision of the video BIOS associated with this GPU. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 90 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu or pBiosRevision is NULL +//! \retval NVAPI_OK *pBiosRevision contains revision number +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetVbiosOEMRevision(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pBiosRevision); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetVbiosVersionString +// +//! This function returns the full video BIOS version string in the form of xx.xx.xx.xx.yy where +//! - xx numbers come from NvAPI_GPU_GetVbiosRevision() and +//! - yy comes from NvAPI_GPU_GetVbiosOEMRevision(). +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 90 +//! +//! \retval NVAPI_INVALID_ARGUMENT hPhysicalGpu is NULL. +//! \retval NVAPI_OK szBiosRevision contains version string. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetVbiosVersionString(NvPhysicalGpuHandle hPhysicalGpu,NvAPI_ShortString szBiosRevision); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetAGPAperture +// +//! This function returns the AGP aperture in megabytes. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 90 +//! +//! \retval NVAPI_INVALID_ARGUMENT pSize is NULL. +//! \retval NVAPI_OK Call successful. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! 
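For reference, a minimal sketch (not from the official documentation) of reading the full VBIOS version
+//! string described above for a valid physical GPU handle.
+//! \code
+//!     NvAPI_ShortString vbios = { 0 };
+//!     if (NvAPI_GPU_GetVbiosVersionString(hPhysicalGpu, vbios) == NVAPI_OK)
+//!         printf("VBIOS: %s\n", vbios);
+//! \endcode
+//!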
\ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetAGPAperture(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pSize); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetCurrentAGPRate +// +//! This function returns the current AGP Rate (0 = AGP not present, 1 = 1x, 2 = 2x, etc.). +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 90 +//! +//! \retval NVAPI_INVALID_ARGUMENT pRate is NULL. +//! \retval NVAPI_OK Call successful. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetCurrentAGPRate(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pRate); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetCurrentPCIEDownstreamWidth +// +//! This function returns the number of PCIE lanes being used for the PCIE interface +//! downstream from the GPU. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 90 +//! +//! \retval NVAPI_INVALID_ARGUMENT pWidth is NULL. +//! \retval NVAPI_OK Call successful. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetCurrentPCIEDownstreamWidth(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pWidth); + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetPhysicalFrameBufferSize +// +//! This function returns the physical size of framebuffer in KB. This does NOT include any +//! system RAM that may be dedicated for use by the GPU. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 90 +//! +//! \retval NVAPI_INVALID_ARGUMENT pSize is NULL +//! \retval NVAPI_OK Call successful +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetPhysicalFrameBufferSize(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pSize); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetVirtualFrameBufferSize +// +//! This function returns the virtual size of framebuffer in KB. This includes the physical RAM plus any +//! system RAM that has been dedicated for use by the GPU. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 90 +//! +//! \retval NVAPI_INVALID_ARGUMENT pSize is NULL. +//! \retval NVAPI_OK Call successful. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu was not a physical GPU handle. +//! 
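A small sketch (not part of the original documentation) comparing the physical and virtual framebuffer
+//! sizes described above; both calls report KB, and hPhysicalGpu is a valid handle from enumeration.
+//! \code
+//!     NvU32 physKB = 0, virtKB = 0;
+//!     if (NvAPI_GPU_GetPhysicalFrameBufferSize(hPhysicalGpu, &physKB) == NVAPI_OK &&
+//!         NvAPI_GPU_GetVirtualFrameBufferSize(hPhysicalGpu, &virtKB) == NVAPI_OK)
+//!     {
+//!         printf("FB: %u MB physical, %u MB including dedicated system RAM\n",
+//!                (unsigned)(physKB / 1024), (unsigned)(virtKB / 1024));
+//!     }
+//! \endcode
+//!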
\ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetVirtualFrameBufferSize(NvPhysicalGpuHandle hPhysicalGpu,NvU32 *pSize); + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetQuadroStatus +// +//! This function retrieves the Quadro status for the GPU (1 if Quadro, 0 if GeForce) +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 80 +//! +//! \return NVAPI_ERROR or NVAPI_OK +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetQuadroStatus(NvPhysicalGpuHandle hPhysicalGpu, NvU32 *pStatus); + + + + +//! \ingroup gpu +typedef struct _NV_BOARD_INFO +{ + NvU32 version; //!< structure version + NvU8 BoardNum[16]; //!< Board Serial Number + +}NV_BOARD_INFO_V1; + +#define NV_BOARD_INFO_VER1 MAKE_NVAPI_VERSION(NV_BOARD_INFO_V1,1) +#ifndef NV_BOARD_INFO_VER +//! \ingroup gpu +typedef NV_BOARD_INFO_V1 NV_BOARD_INFO; +//! \ingroup gpu +//! \ingroup gpu +#define NV_BOARD_INFO_VER NV_BOARD_INFO_VER1 +#endif + +//! SUPPORTED OS: Windows XP and higher +//! +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetBoardInfo +// +//! DESCRIPTION: This API Retrieves the Board information (a unique GPU Board Serial Number) stored in the InfoROM. +//! +//! \param [in] hPhysicalGpu Physical GPU Handle. +//! \param [in,out] NV_BOARD_INFO Board Information. +//! +//! TCC_SUPPORTED +//! +//! \retval ::NVAPI_OK completed request +//! \retval ::NVAPI_ERROR miscellaneous error occurred +//! \retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE handle passed is not a physical GPU handle +//! \retval ::NVAPI_API_NOT_INTIALIZED NVAPI not initialized +//! \retval ::NVAPI_INVALID_POINTER pBoardInfo is NULL +//! \retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION the version of the INFO struct is not supported +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetBoardInfo(NvPhysicalGpuHandle hPhysicalGpu, NV_BOARD_INFO *pBoardInfo); + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// GPU Clock Control +// +// These APIs allow the user to get and set individual clock domains +// on a per-GPU basis. +// +/////////////////////////////////////////////////////////////////////////////// + + +//! \ingroup gpuclock +//! @{ +#define NVAPI_MAX_GPU_CLOCKS 32 +#define NVAPI_MAX_GPU_PUBLIC_CLOCKS 32 +#define NVAPI_MAX_GPU_PERF_CLOCKS 32 +#define NVAPI_MAX_GPU_PERF_VOLTAGES 16 +#define NVAPI_MAX_GPU_PERF_PSTATES 16 +//! @} + +//! \ingroup gpuclock +typedef enum _NV_GPU_PUBLIC_CLOCK_ID +{ + NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS = 0, + NVAPI_GPU_PUBLIC_CLOCK_MEMORY = 4, + NVAPI_GPU_PUBLIC_CLOCK_PROCESSOR = 7, + NVAPI_GPU_PUBLIC_CLOCK_VIDEO = 8, + NVAPI_GPU_PUBLIC_CLOCK_UNDEFINED = NVAPI_MAX_GPU_PUBLIC_CLOCKS, +} NV_GPU_PUBLIC_CLOCK_ID; + + +//! \ingroup gpuclock +typedef enum _NV_GPU_PERF_VOLTAGE_INFO_DOMAIN_ID +{ + NVAPI_GPU_PERF_VOLTAGE_INFO_DOMAIN_CORE = 0, + NVAPI_GPU_PERF_VOLTAGE_INFO_DOMAIN_UNDEFINED = NVAPI_MAX_GPU_PERF_VOLTAGES, +} NV_GPU_PERF_VOLTAGE_INFO_DOMAIN_ID; + + + +//! \ingroup gpuclock +//! Used in NvAPI_GPU_GetAllClockFrequencies() +typedef struct +{ + NvU32 version; //!< Structure version + NvU32 reserved; //!< These bits are reserved for future use. 
+ struct + { + NvU32 bIsPresent:1; //!< Set if this domain is present on this GPU + NvU32 reserved:31; //!< These bits are reserved for future use. + NvU32 frequency; //!< Clock frequency (kHz) + }domain[NVAPI_MAX_GPU_PUBLIC_CLOCKS]; +} NV_GPU_CLOCK_FREQUENCIES_V1; + +//! \ingroup gpuclock +//! Used in NvAPI_GPU_GetAllClockFrequencies() +typedef enum +{ + NV_GPU_CLOCK_FREQUENCIES_CURRENT_FREQ = 0, + NV_GPU_CLOCK_FREQUENCIES_BASE_CLOCK = 1, + NV_GPU_CLOCK_FREQUENCIES_BOOST_CLOCK = 2, + NV_GPU_CLOCK_FREQUENCIES_CLOCK_TYPE_NUM = 3 +} NV_GPU_CLOCK_FREQUENCIES_CLOCK_TYPE; + +//! \ingroup gpuclock +//! Used in NvAPI_GPU_GetAllClockFrequencies() +typedef struct +{ + NvU32 version; //!< Structure version + NvU32 ClockType:2; //!< One of NV_GPU_CLOCK_FREQUENCIES_CLOCK_TYPE. Used to specify the type of clock to be returned. + NvU32 reserved:22; //!< These bits are reserved for future use. Must be set to 0. + NvU32 reserved1:8; //!< These bits are reserved. + struct + { + NvU32 bIsPresent:1; //!< Set if this domain is present on this GPU + NvU32 reserved:31; //!< These bits are reserved for future use. + NvU32 frequency; //!< Clock frequency (kHz) + }domain[NVAPI_MAX_GPU_PUBLIC_CLOCKS]; +} NV_GPU_CLOCK_FREQUENCIES_V2; + +//! \ingroup gpuclock +//! Used in NvAPI_GPU_GetAllClockFrequencies() +typedef NV_GPU_CLOCK_FREQUENCIES_V2 NV_GPU_CLOCK_FREQUENCIES; + +//! \addtogroup gpuclock +//! @{ +#define NV_GPU_CLOCK_FREQUENCIES_VER_1 MAKE_NVAPI_VERSION(NV_GPU_CLOCK_FREQUENCIES_V1,1) +#define NV_GPU_CLOCK_FREQUENCIES_VER_2 MAKE_NVAPI_VERSION(NV_GPU_CLOCK_FREQUENCIES_V2,2) +#define NV_GPU_CLOCK_FREQUENCIES_VER_3 MAKE_NVAPI_VERSION(NV_GPU_CLOCK_FREQUENCIES_V2,3) +#define NV_GPU_CLOCK_FREQUENCIES_VER NV_GPU_CLOCK_FREQUENCIES_VER_3 +//! @} + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetAllClockFrequencies +// +//! This function retrieves the NV_GPU_CLOCK_FREQUENCIES structure for the specified physical GPU. +//! +//! For each clock domain: +//! - bIsPresent is set for each domain that is present on the GPU +//! - frequency is the domain's clock freq in kHz +//! +//! Each domain's info is indexed in the array. For example: +//! clkFreqs.domain[NVAPI_GPU_PUBLIC_CLOCK_MEMORY] holds the info for the MEMORY domain. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 295 +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, +//! they are listed below. +//! \retval NVAPI_INVALID_ARGUMENT pClkFreqs is NULL. +//! \ingroup gpuclock +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetAllClockFrequencies(__in NvPhysicalGpuHandle hPhysicalGPU, __inout NV_GPU_CLOCK_FREQUENCIES *pClkFreqs); + + +//! \addtogroup gpupstate +//! @{ + +typedef enum _NV_GPU_PERF_PSTATE_ID +{ + NVAPI_GPU_PERF_PSTATE_P0 = 0, + NVAPI_GPU_PERF_PSTATE_P1, + NVAPI_GPU_PERF_PSTATE_P2, + NVAPI_GPU_PERF_PSTATE_P3, + NVAPI_GPU_PERF_PSTATE_P4, + NVAPI_GPU_PERF_PSTATE_P5, + NVAPI_GPU_PERF_PSTATE_P6, + NVAPI_GPU_PERF_PSTATE_P7, + NVAPI_GPU_PERF_PSTATE_P8, + NVAPI_GPU_PERF_PSTATE_P9, + NVAPI_GPU_PERF_PSTATE_P10, + NVAPI_GPU_PERF_PSTATE_P11, + NVAPI_GPU_PERF_PSTATE_P12, + NVAPI_GPU_PERF_PSTATE_P13, + NVAPI_GPU_PERF_PSTATE_P14, + NVAPI_GPU_PERF_PSTATE_P15, + NVAPI_GPU_PERF_PSTATE_UNDEFINED = NVAPI_MAX_GPU_PERF_PSTATES, + NVAPI_GPU_PERF_PSTATE_ALL, + +} NV_GPU_PERF_PSTATE_ID; + +//! @} + + + +//! 
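Illustrative sketch (not part of the NVAPI documentation) of the NvAPI_GPU_GetAllClockFrequencies() call
+//! documented above: query the current graphics and memory clocks, reported in kHz. hPhysicalGpu is assumed
+//! to be a valid physical GPU handle.
+//! \code
+//!     NV_GPU_CLOCK_FREQUENCIES clkFreqs = { 0 };
+//!     clkFreqs.version   = NV_GPU_CLOCK_FREQUENCIES_VER;
+//!     clkFreqs.ClockType = NV_GPU_CLOCK_FREQUENCIES_CURRENT_FREQ;
+//!     if (NvAPI_GPU_GetAllClockFrequencies(hPhysicalGpu, &clkFreqs) == NVAPI_OK)
+//!     {
+//!         if (clkFreqs.domain[NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS].bIsPresent)
+//!             printf("core: %u kHz\n", (unsigned)clkFreqs.domain[NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS].frequency);
+//!         if (clkFreqs.domain[NVAPI_GPU_PUBLIC_CLOCK_MEMORY].bIsPresent)
+//!             printf("mem:  %u kHz\n", (unsigned)clkFreqs.domain[NVAPI_GPU_PUBLIC_CLOCK_MEMORY].frequency);
+//!     }
+//! \endcode
+//!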
\ingroup gpupstate +//! Used in NvAPI_GPU_GetPstatesInfoEx() +typedef struct +{ + NvU32 version; + NvU32 flags; //!< - bit 0 indicates if perfmon is enabled or not + //!< - bit 1 indicates if dynamic Pstate is capable or not + //!< - bit 2 indicates if dynamic Pstate is enable or not + //!< - all other bits must be set to 0 + NvU32 numPstates; //!< The number of available p-states + NvU32 numClocks; //!< The number of clock domains supported by each P-State + struct + { + NV_GPU_PERF_PSTATE_ID pstateId; //!< ID of the p-state. + NvU32 flags; //!< - bit 0 indicates if the PCIE limit is GEN1 or GEN2 + //!< - bit 1 indicates if the Pstate is overclocked or not + //!< - bit 2 indicates if the Pstate is overclockable or not + //!< - all other bits must be set to 0 + struct + { + NV_GPU_PUBLIC_CLOCK_ID domainId; //!< ID of the clock domain + NvU32 flags; //!< Reserved. Must be set to 0 + NvU32 freq; //!< Clock frequency in kHz + + } clocks[NVAPI_MAX_GPU_PERF_CLOCKS]; + } pstates[NVAPI_MAX_GPU_PERF_PSTATES]; + +} NV_GPU_PERF_PSTATES_INFO_V1; + + +//! \ingroup gpupstate +typedef struct +{ + NvU32 version; + NvU32 flags; //!< - bit 0 indicates if perfmon is enabled or not + //!< - bit 1 indicates if dynamic Pstate is capable or not + //!< - bit 2 indicates if dynamic Pstate is enable or not + //!< - all other bits must be set to 0 + NvU32 numPstates; //!< The number of available p-states + NvU32 numClocks; //!< The number of clock domains supported by each P-State + NvU32 numVoltages; + struct + { + NV_GPU_PERF_PSTATE_ID pstateId; //!< ID of the p-state. + NvU32 flags; //!< - bit 0 indicates if the PCIE limit is GEN1 or GEN2 + //!< - bit 1 indicates if the Pstate is overclocked or not + //!< - bit 2 indicates if the Pstate is overclockable or not + //!< - all other bits must be set to 0 + struct + { + NV_GPU_PUBLIC_CLOCK_ID domainId; + NvU32 flags; //!< bit 0 indicates if this clock is overclockable + //!< all other bits must be set to 0 + NvU32 freq; + + } clocks[NVAPI_MAX_GPU_PERF_CLOCKS]; + struct + { + NV_GPU_PERF_VOLTAGE_INFO_DOMAIN_ID domainId; //!< ID of the voltage domain, containing flags and mvolt info + NvU32 flags; //!< Reserved for future use. Must be set to 0 + NvU32 mvolt; //!< Voltage in mV + + } voltages[NVAPI_MAX_GPU_PERF_VOLTAGES]; + + } pstates[NVAPI_MAX_GPU_PERF_PSTATES]; //!< Valid index range is 0 to numVoltages-1 + +} NV_GPU_PERF_PSTATES_INFO_V2; + +//! \ingroup gpupstate +typedef NV_GPU_PERF_PSTATES_INFO_V2 NV_GPU_PERF_PSTATES_INFO; + + +//! \ingroup gpupstate +//! @{ + +//! Macro for constructing the version field of NV_GPU_PERF_PSTATES_INFO_V1 +#define NV_GPU_PERF_PSTATES_INFO_VER1 MAKE_NVAPI_VERSION(NV_GPU_PERF_PSTATES_INFO_V1,1) + +//! Macro for constructing the version field of NV_GPU_PERF_PSTATES_INFO_V2 +#define NV_GPU_PERF_PSTATES_INFO_VER2 MAKE_NVAPI_VERSION(NV_GPU_PERF_PSTATES_INFO_V2,2) + +//! Macro for constructing the version field of NV_GPU_PERF_PSTATES_INFO_V2 +#define NV_GPU_PERF_PSTATES_INFO_VER3 MAKE_NVAPI_VERSION(NV_GPU_PERF_PSTATES_INFO_V2,3) + +//! Macro for constructing the version field of NV_GPU_PERF_PSTATES_INFO +#define NV_GPU_PERF_PSTATES_INFO_VER NV_GPU_PERF_PSTATES_INFO_VER3 + +//! @} + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetPstatesInfoEx +// +//! DESCRIPTION: This API retrieves all performance states (P-States) information. This is the same as +//! NvAPI_GPU_GetPstatesInfo(), but supports an input flag for various options. +//! +//! 
P-States are GPU active/executing performance capability and power consumption states. +//! +//! P-States ranges from P0 to P15, with P0 being the highest performance/power state, and +//! P15 being the lowest performance/power state. Each P-State, if available, maps to a +//! performance level. Not all P-States are available on a given system. The definitions +//! of each P-State are currently as follows: \n +//! - P0/P1 - Maximum 3D performance +//! - P2/P3 - Balanced 3D performance-power +//! - P8 - Basic HD video playback +//! - P10 - DVD playback +//! - P12 - Minimum idle power consumption +//! +//! \deprecated Do not use this function - it is deprecated in release 304. Instead, use NvAPI_GPU_GetPstates20. +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \param [in] hPhysicalGPU GPU selection. +//! \param [out] pPerfPstatesInfo P-States information retrieved, as detailed below: \n +//! - flags is reserved for future use. +//! - numPstates is the number of available P-States +//! - numClocks is the number of clock domains supported by each P-State +//! - pstates has valid index range from 0 to numPstates - 1 +//! - pstates[i].pstateId is the ID of the P-State, +//! containing the following info: +//! - pstates[i].flags containing the following info: +//! - bit 0 indicates if the PCIE limit is GEN1 or GEN2 +//! - bit 1 indicates if the Pstate is overclocked or not +//! - bit 2 indicates if the Pstate is overclockable or not +//! - pstates[i].clocks has valid index range from 0 to numClocks -1 +//! - pstates[i].clocks[j].domainId is the public ID of the clock domain, +//! containing the following info: +//! - pstates[i].clocks[j].flags containing the following info: +//! bit 0 indicates if the clock domain is overclockable or not +//! - pstates[i].clocks[j].freq is the clock frequency in kHz +//! - pstates[i].voltages has a valid index range from 0 to numVoltages - 1 +//! - pstates[i].voltages[j].domainId is the ID of the voltage domain, +//! containing the following info: +//! - pstates[i].voltages[j].flags is reserved for future use. +//! - pstates[i].voltages[j].mvolt is the voltage in mV +//! inputFlags(IN) - This can be used to select various options: +//! - if bit 0 is set, pPerfPstatesInfo would contain the default settings +//! instead of the current, possibily overclocked settings. +//! - if bit 1 is set, pPerfPstatesInfo would contain the maximum clock +//! frequencies instead of the nominal frequencies. +//! - if bit 2 is set, pPerfPstatesInfo would contain the minimum clock +//! frequencies instead of the nominal frequencies. +//! - all other bits must be set to 0. +//! +//! \retval ::NVAPI_OK Completed request +//! \retval ::NVAPI_ERROR Miscellaneous error occurred +//! \retval ::NVAPI_HANDLE_INVALIDATED Handle passed has been invalidated (see user guide) +//! \retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE Handle passed is not a physical GPU handle +//! \retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION The version of the NV_GPU_PERF_PSTATES struct is not supported +//! +//! \ingroup gpupstate +/////////////////////////////////////////////////////////////////////////////// +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 304. Instead, use NvAPI_GPU_GetPstates20.") +NVAPI_INTERFACE NvAPI_GPU_GetPstatesInfoEx(NvPhysicalGpuHandle hPhysicalGpu, NV_GPU_PERF_PSTATES_INFO *pPerfPstatesInfo, NvU32 inputFlags); + + +//! \addtogroup gpupstate +//! 
@{ + +#define NVAPI_MAX_GPU_PSTATE20_PSTATES 16 +#define NVAPI_MAX_GPU_PSTATE20_CLOCKS 8 +#define NVAPI_MAX_GPU_PSTATE20_BASE_VOLTAGES 4 + +//! Used to identify clock type +typedef enum +{ + //! Clock domains that use single frequency value within given pstate + NVAPI_GPU_PERF_PSTATE20_CLOCK_TYPE_SINGLE = 0, + + //! Clock domains that allow range of frequency values within given pstate + NVAPI_GPU_PERF_PSTATE20_CLOCK_TYPE_RANGE, +} NV_GPU_PERF_PSTATE20_CLOCK_TYPE_ID; + +//! Used to describe both voltage and frequency deltas +typedef struct +{ + //! Value of parameter delta (in respective units [kHz, uV]) + NvS32 value; + + struct + { + //! Min value allowed for parameter delta (in respective units [kHz, uV]) + NvS32 min; + + //! Max value allowed for parameter delta (in respective units [kHz, uV]) + NvS32 max; + } valueRange; +} NV_GPU_PERF_PSTATES20_PARAM_DELTA; + +//! Used to describe single clock entry +typedef struct +{ + //! ID of the clock domain + NV_GPU_PUBLIC_CLOCK_ID domainId; + + //! Clock type ID + NV_GPU_PERF_PSTATE20_CLOCK_TYPE_ID typeId; + NvU32 bIsEditable:1; + + //! These bits are reserved for future use (must be always 0) + NvU32 reserved:31; + + //! Current frequency delta from nominal settings in (kHz) + NV_GPU_PERF_PSTATES20_PARAM_DELTA freqDelta_kHz; + + //! Clock domain type dependant information + union + { + struct + { + //! Clock frequency within given pstate in (kHz) + NvU32 freq_kHz; + } single; + + struct + { + //! Min clock frequency within given pstate in (kHz) + NvU32 minFreq_kHz; + + //! Max clock frequency within given pstate in (kHz) + NvU32 maxFreq_kHz; + + //! Voltage domain ID and value range in (uV) required for this clock + NV_GPU_PERF_VOLTAGE_INFO_DOMAIN_ID domainId; + NvU32 minVoltage_uV; + NvU32 maxVoltage_uV; + } range; + } data; +} NV_GPU_PSTATE20_CLOCK_ENTRY_V1; + +//! Used to describe single base voltage entry +typedef struct +{ + //! ID of the voltage domain + NV_GPU_PERF_VOLTAGE_INFO_DOMAIN_ID domainId; + NvU32 bIsEditable:1; + + //! These bits are reserved for future use (must be always 0) + NvU32 reserved:31; + + //! Current base voltage settings in [uV] + NvU32 volt_uV; + + NV_GPU_PERF_PSTATES20_PARAM_DELTA voltDelta_uV; // Current base voltage delta from nominal settings in [uV] +} NV_GPU_PSTATE20_BASE_VOLTAGE_ENTRY_V1; + +//! Used in NvAPI_GPU_GetPstates20() interface call. + +typedef struct +{ + //! Version info of the structure (NV_GPU_PERF_PSTATES20_INFO_VER) + NvU32 version; + + NvU32 bIsEditable:1; + + //! These bits are reserved for future use (must be always 0) + NvU32 reserved:31; + + //! Number of populated pstates + NvU32 numPstates; + + //! Number of populated clocks (per pstate) + NvU32 numClocks; + + //! Number of populated base voltages (per pstate) + NvU32 numBaseVoltages; + + //! Performance state (P-State) settings + //! Valid index range is 0 to numPstates-1 + struct + { + //! ID of the P-State + NV_GPU_PERF_PSTATE_ID pstateId; + + NvU32 bIsEditable:1; + + //! These bits are reserved for future use (must be always 0) + NvU32 reserved:31; + + //! Array of clock entries + //! Valid index range is 0 to numClocks-1 + NV_GPU_PSTATE20_CLOCK_ENTRY_V1 clocks[NVAPI_MAX_GPU_PSTATE20_CLOCKS]; + + //! Array of baseVoltage entries + //! Valid index range is 0 to numBaseVoltages-1 + NV_GPU_PSTATE20_BASE_VOLTAGE_ENTRY_V1 baseVoltages[NVAPI_MAX_GPU_PSTATE20_BASE_VOLTAGES]; + } pstates[NVAPI_MAX_GPU_PSTATE20_PSTATES]; +} NV_GPU_PERF_PSTATES20_INFO_V1; + +//! Used in NvAPI_GPU_GetPstates20() interface call. 
+ +typedef struct _NV_GPU_PERF_PSTATES20_INFO_V2 +{ + //! Version info of the structure (NV_GPU_PERF_PSTATES20_INFO_VER) + NvU32 version; + + NvU32 bIsEditable:1; + + //! These bits are reserved for future use (must be always 0) + NvU32 reserved:31; + + //! Number of populated pstates + NvU32 numPstates; + + //! Number of populated clocks (per pstate) + NvU32 numClocks; + + //! Number of populated base voltages (per pstate) + NvU32 numBaseVoltages; + + //! Performance state (P-State) settings + //! Valid index range is 0 to numPstates-1 + struct + { + //! ID of the P-State + NV_GPU_PERF_PSTATE_ID pstateId; + + NvU32 bIsEditable:1; + + //! These bits are reserved for future use (must be always 0) + NvU32 reserved:31; + + //! Array of clock entries + //! Valid index range is 0 to numClocks-1 + NV_GPU_PSTATE20_CLOCK_ENTRY_V1 clocks[NVAPI_MAX_GPU_PSTATE20_CLOCKS]; + + //! Array of baseVoltage entries + //! Valid index range is 0 to numBaseVoltages-1 + NV_GPU_PSTATE20_BASE_VOLTAGE_ENTRY_V1 baseVoltages[NVAPI_MAX_GPU_PSTATE20_BASE_VOLTAGES]; + } pstates[NVAPI_MAX_GPU_PSTATE20_PSTATES]; + + //! OV settings - Please refer to NVIDIA over-volting recommendation to understand impact of this functionality + //! Valid index range is 0 to numVoltages-1 + struct + { + //! Number of populated voltages + NvU32 numVoltages; + + //! Array of voltage entries + //! Valid index range is 0 to numVoltages-1 + NV_GPU_PSTATE20_BASE_VOLTAGE_ENTRY_V1 voltages[NVAPI_MAX_GPU_PSTATE20_BASE_VOLTAGES]; + } ov; +} NV_GPU_PERF_PSTATES20_INFO_V2; + +typedef NV_GPU_PERF_PSTATES20_INFO_V2 NV_GPU_PERF_PSTATES20_INFO; + +//! Macro for constructing the version field of NV_GPU_PERF_PSTATES20_INFO_V1 +#define NV_GPU_PERF_PSTATES20_INFO_VER1 MAKE_NVAPI_VERSION(NV_GPU_PERF_PSTATES20_INFO_V1,1) + +//! Macro for constructing the version field of NV_GPU_PERF_PSTATES20_INFO_V2 +#define NV_GPU_PERF_PSTATES20_INFO_VER2 MAKE_NVAPI_VERSION(NV_GPU_PERF_PSTATES20_INFO_V2,2) + +//! Macro for constructing the version field of NV_GPU_PERF_PSTATES20_INFO_V2 +#define NV_GPU_PERF_PSTATES20_INFO_VER3 MAKE_NVAPI_VERSION(NV_GPU_PERF_PSTATES20_INFO_V2,3) + +//! Macro for constructing the version field of NV_GPU_PERF_PSTATES20_INFO +#define NV_GPU_PERF_PSTATES20_INFO_VER NV_GPU_PERF_PSTATES20_INFO_VER3 + +//! @} + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetPstates20 +// +//! DESCRIPTION: This API retrieves all performance states (P-States) 2.0 information. +//! +//! P-States are GPU active/executing performance capability states. +//! They range from P0 to P15, with P0 being the highest performance state, +//! and P15 being the lowest performance state. Each P-State, if available, +//! maps to a performance level. Not all P-States are available on a given system. +//! The definition of each P-States are currently as follow: +//! - P0/P1 - Maximum 3D performance +//! - P2/P3 - Balanced 3D performance-power +//! - P8 - Basic HD video playback +//! - P10 - DVD playback +//! - P12 - Minimum idle power consumption +//! +//! TCC_SUPPORTED +//! +//! \since Release: 295 +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hPhysicalGPU GPU selection +//! \param [out] pPstatesInfo P-States information retrieved, as documented in declaration above +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, +//! they are listed below. +//! +//! 
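A minimal sketch (outside the official documentation) of reading the P-State 2.0 table described above,
+//! printing the graphics clock delta for each populated P-State; hPhysicalGpu comes from NvAPI_EnumPhysicalGPUs().
+//! \code
+//!     NV_GPU_PERF_PSTATES20_INFO ps20 = { 0 };
+//!     ps20.version = NV_GPU_PERF_PSTATES20_INFO_VER;
+//!     if (NvAPI_GPU_GetPstates20(hPhysicalGpu, &ps20) == NVAPI_OK)
+//!     {
+//!         for (NvU32 i = 0; i < ps20.numPstates; i++)
+//!             for (NvU32 j = 0; j < ps20.numClocks; j++)
+//!                 if (ps20.pstates[i].clocks[j].domainId == NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS)
+//!                     printf("P%d graphics clock delta: %d kHz\n",
+//!                            (int)ps20.pstates[i].pstateId,
+//!                            (int)ps20.pstates[i].clocks[j].freqDelta_kHz.value);
+//!     }
+//! \endcode
+//!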
\ingroup gpupstate +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetPstates20(__in NvPhysicalGpuHandle hPhysicalGpu, __inout NV_GPU_PERF_PSTATES20_INFO *pPstatesInfo); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetCurrentPstate +// +//! DESCRIPTION: This function retrieves the current performance state (P-State). +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 165 +//! +//! TCC_SUPPORTED +//! +//! \param [in] hPhysicalGPU GPU selection +//! \param [out] pCurrentPstate The ID of the current P-State of the GPU - see \ref NV_GPU_PERF_PSTATES. +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred. +//! \retval NVAPI_HANDLE_INVALIDATED Handle passed has been invalidated (see user guide). +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE Handle passed is not a physical GPU handle. +//! \retval NVAPI_NOT_SUPPORTED P-States is not supported on this setup. +//! +//! \ingroup gpupstate +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetCurrentPstate(NvPhysicalGpuHandle hPhysicalGpu, NV_GPU_PERF_PSTATE_ID *pCurrentPstate); + + + + +//! \ingroup gpupstate +#define NVAPI_MAX_GPU_UTILIZATIONS 8 + + + +//! \ingroup gpupstate +//! Used in NvAPI_GPU_GetDynamicPstatesInfoEx(). +typedef struct +{ + NvU32 version; //!< Structure version + NvU32 flags; //!< bit 0 indicates if the dynamic Pstate is enabled or not + struct + { + NvU32 bIsPresent:1; //!< Set if this utilization domain is present on this GPU + NvU32 percentage; //!< Percentage of time where the domain is considered busy in the last 1 second interval + } utilization[NVAPI_MAX_GPU_UTILIZATIONS]; +} NV_GPU_DYNAMIC_PSTATES_INFO_EX; + +//! \ingroup gpupstate +//! Macro for constructing the version field of NV_GPU_DYNAMIC_PSTATES_INFO_EX +#define NV_GPU_DYNAMIC_PSTATES_INFO_EX_VER MAKE_NVAPI_VERSION(NV_GPU_DYNAMIC_PSTATES_INFO_EX,1) + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetDynamicPstatesInfoEx +// +//! DESCRIPTION: This API retrieves the NV_GPU_DYNAMIC_PSTATES_INFO_EX structure for the specified physical GPU. +//! Each domain's info is indexed in the array. For example: +//! - pDynamicPstatesInfo->utilization[NVAPI_GPU_UTILIZATION_DOMAIN_GPU] holds the info for the GPU domain. \p +//! There are currently 4 domains for which GPU utilization and dynamic P-State thresholds can be retrieved: +//! graphic engine (GPU), frame buffer (FB), video engine (VID), and bus interface (BUS). +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 185 +//! +//! \retval ::NVAPI_OK +//! \retval ::NVAPI_ERROR +//! \retval ::NVAPI_INVALID_ARGUMENT pDynamicPstatesInfo is NULL +//! \retval ::NVAPI_HANDLE_INVALIDATED +//! \retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE +//! \retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION The version of the INFO struct is not supported +//! +//! 
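A short sketch (not from the official documentation) of polling the utilization domains described above;
+//! index 0 is assumed here to be the graphics engine (GPU) domain, matching the NVAPI_GPU_UTILIZATION_DOMAIN_GPU
+//! usage mentioned above, and hPhysicalGpu is a valid physical GPU handle.
+//! \code
+//!     NV_GPU_DYNAMIC_PSTATES_INFO_EX dyn = { 0 };
+//!     dyn.version = NV_GPU_DYNAMIC_PSTATES_INFO_EX_VER;
+//!     if (NvAPI_GPU_GetDynamicPstatesInfoEx(hPhysicalGpu, &dyn) == NVAPI_OK)
+//!     {
+//!         if (dyn.utilization[0].bIsPresent)      // assumed: index 0 = graphics engine (GPU) domain
+//!             printf("GPU busy: %u%%\n", (unsigned)dyn.utilization[0].percentage);
+//!     }
+//! \endcode
+//!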
\ingroup gpupstate +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetDynamicPstatesInfoEx(NvPhysicalGpuHandle hPhysicalGpu, NV_GPU_DYNAMIC_PSTATES_INFO_EX *pDynamicPstatesInfoEx); + +/////////////////////////////////////////////////////////////////////////////////// +// Thermal API +// Provides ability to get temperature levels from the various thermal sensors associated with the GPU + +//! \ingroup gputhermal +#define NVAPI_MAX_THERMAL_SENSORS_PER_GPU 3 + +//! \ingroup gputhermal +//! Used in NV_GPU_THERMAL_SETTINGS +typedef enum +{ + NVAPI_THERMAL_TARGET_NONE = 0, + NVAPI_THERMAL_TARGET_GPU = 1, //!< GPU core temperature requires NvPhysicalGpuHandle + NVAPI_THERMAL_TARGET_MEMORY = 2, //!< GPU memory temperature requires NvPhysicalGpuHandle + NVAPI_THERMAL_TARGET_POWER_SUPPLY = 4, //!< GPU power supply temperature requires NvPhysicalGpuHandle + NVAPI_THERMAL_TARGET_BOARD = 8, //!< GPU board ambient temperature requires NvPhysicalGpuHandle + NVAPI_THERMAL_TARGET_VCD_BOARD = 9, //!< Visual Computing Device Board temperature requires NvVisualComputingDeviceHandle + NVAPI_THERMAL_TARGET_VCD_INLET = 10, //!< Visual Computing Device Inlet temperature requires NvVisualComputingDeviceHandle + NVAPI_THERMAL_TARGET_VCD_OUTLET = 11, //!< Visual Computing Device Outlet temperature requires NvVisualComputingDeviceHandle + + NVAPI_THERMAL_TARGET_ALL = 15, + NVAPI_THERMAL_TARGET_UNKNOWN = -1, +} NV_THERMAL_TARGET; + +//! \ingroup gputhermal +//! Used in NV_GPU_THERMAL_SETTINGS +typedef enum +{ + NVAPI_THERMAL_CONTROLLER_NONE = 0, + NVAPI_THERMAL_CONTROLLER_GPU_INTERNAL, + NVAPI_THERMAL_CONTROLLER_ADM1032, + NVAPI_THERMAL_CONTROLLER_MAX6649, + NVAPI_THERMAL_CONTROLLER_MAX1617, + NVAPI_THERMAL_CONTROLLER_LM99, + NVAPI_THERMAL_CONTROLLER_LM89, + NVAPI_THERMAL_CONTROLLER_LM64, + NVAPI_THERMAL_CONTROLLER_ADT7473, + NVAPI_THERMAL_CONTROLLER_SBMAX6649, + NVAPI_THERMAL_CONTROLLER_VBIOSEVT, + NVAPI_THERMAL_CONTROLLER_OS, + NVAPI_THERMAL_CONTROLLER_UNKNOWN = -1, +} NV_THERMAL_CONTROLLER; + +//! \ingroup gputhermal +//! Used in NvAPI_GPU_GetThermalSettings() +typedef struct +{ + NvU32 version; //!< structure version + NvU32 count; //!< number of associated thermal sensors + struct + { + NV_THERMAL_CONTROLLER controller; //!< internal, ADM1032, MAX6649... + NvU32 defaultMinTemp; //!< The min default temperature value of the thermal sensor in degree Celsius + NvU32 defaultMaxTemp; //!< The max default temperature value of the thermal sensor in degree Celsius + NvU32 currentTemp; //!< The current temperature value of the thermal sensor in degree Celsius + NV_THERMAL_TARGET target; //!< Thermal sensor targeted @ GPU, memory, chipset, powersupply, Visual Computing Device, etc. + } sensor[NVAPI_MAX_THERMAL_SENSORS_PER_GPU]; + +} NV_GPU_THERMAL_SETTINGS_V1; + +//! \ingroup gputhermal +typedef struct +{ + NvU32 version; //!< structure version + NvU32 count; //!< number of associated thermal sensors + struct + { + NV_THERMAL_CONTROLLER controller; //!< internal, ADM1032, MAX6649... 
+ NvS32 defaultMinTemp; //!< Minimum default temperature value of the thermal sensor in degree Celsius + NvS32 defaultMaxTemp; //!< Maximum default temperature value of the thermal sensor in degree Celsius + NvS32 currentTemp; //!< Current temperature value of the thermal sensor in degree Celsius + NV_THERMAL_TARGET target; //!< Thermal sensor targeted - GPU, memory, chipset, powersupply, Visual Computing Device, etc + } sensor[NVAPI_MAX_THERMAL_SENSORS_PER_GPU]; + +} NV_GPU_THERMAL_SETTINGS_V2; + +//! \ingroup gputhermal +typedef NV_GPU_THERMAL_SETTINGS_V2 NV_GPU_THERMAL_SETTINGS; + +//! \ingroup gputhermal +//! @{ + +//! Macro for constructing the version field of NV_GPU_THERMAL_SETTINGS_V1 +#define NV_GPU_THERMAL_SETTINGS_VER_1 MAKE_NVAPI_VERSION(NV_GPU_THERMAL_SETTINGS_V1,1) + +//! Macro for constructing the version field of NV_GPU_THERMAL_SETTINGS_V2 +#define NV_GPU_THERMAL_SETTINGS_VER_2 MAKE_NVAPI_VERSION(NV_GPU_THERMAL_SETTINGS_V2,2) + +//! Macro for constructing the version field of NV_GPU_THERMAL_SETTINGS +#define NV_GPU_THERMAL_SETTINGS_VER NV_GPU_THERMAL_SETTINGS_VER_2 +//! @} + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetThermalSettings +// +//! This function retrieves the thermal information of all thermal sensors or specific thermal sensor associated with the selected GPU. +//! Thermal sensors are indexed 0 to NVAPI_MAX_THERMAL_SENSORS_PER_GPU-1. +//! +//! - To retrieve specific thermal sensor info, set the sensorIndex to the required thermal sensor index. +//! - To retrieve info for all sensors, set sensorIndex to NVAPI_THERMAL_TARGET_ALL. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! TCC_SUPPORTED +//! +//! \since Release: 85 +//! +//! \param [in] hPhysicalGPU GPU selection. +//! \param [in] sensorIndex Explicit thermal sensor index selection. +//! \param [out] pThermalSettings Array of thermal settings. +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred. +//! \retval NVAPI_INVALID_ARGUMENT pThermalInfo is NULL. +//! \retval NVAPI_HANDLE_INVALIDATED Handle passed has been invalidated (see user guide). +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE Handle passed is not a physical GPU handle. +//! \retval NVAPI_INCOMPATIBLE_STRUCT_VERSION The version of the INFO struct is not supported. +//! \ingroup gputhermal +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetThermalSettings(NvPhysicalGpuHandle hPhysicalGpu, NvU32 sensorIndex, NV_GPU_THERMAL_SETTINGS *pThermalSettings); + + +/////////////////////////////////////////////////////////////////////////////////// +// I2C API +// Provides ability to read or write data using I2C protocol. +// These APIs allow I2C access only to DDC monitors + + +//! \addtogroup i2capi +//! @{ +#define NVAPI_MAX_SIZEOF_I2C_DATA_BUFFER 4096 +#define NVAPI_MAX_SIZEOF_I2C_REG_ADDRESS 4 +#define NVAPI_DISPLAY_DEVICE_MASK_MAX 24 +#define NVAPI_I2C_SPEED_DEPRECATED 0xFFFF + +typedef enum +{ + NVAPI_I2C_SPEED_DEFAULT, //!< Set i2cSpeedKhz to I2C_SPEED_DEFAULT if default I2C speed is to be chosen, ie.use the current frequency setting. + NVAPI_I2C_SPEED_3KHZ, + NVAPI_I2C_SPEED_10KHZ, + NVAPI_I2C_SPEED_33KHZ, + NVAPI_I2C_SPEED_100KHZ, + NVAPI_I2C_SPEED_200KHZ, + NVAPI_I2C_SPEED_400KHZ, +} NV_I2C_SPEED; + +//! Used in NvAPI_I2CRead() and NvAPI_I2CWrite() +typedef struct +{ + NvU32 version; //!< The structure version. 
+ NvU32 displayMask; //!< The Display Mask of the concerned display. + NvU8 bIsDDCPort; //!< This flag indicates either the DDC port (TRUE) or the communication port + //!< (FALSE) of the concerned display. + NvU8 i2cDevAddress; //!< The address of the I2C slave. The address should be shifted left by one. For + //!< example, the I2C address 0x50, often used for reading EDIDs, would be stored + //!< here as 0xA0. This matches the position within the byte sent by the master, as + //!< the last bit is reserved to specify the read or write direction. + NvU8* pbI2cRegAddress; //!< The I2C target register address. May be NULL, which indicates no register + //!< address should be sent. + NvU32 regAddrSize; //!< The size in bytes of target register address. If pbI2cRegAddress is NULL, this + //!< field must be 0. + NvU8* pbData; //!< The buffer of data which is to be read or written (depending on the command). + NvU32 cbSize; //!< The size of the data buffer, pbData, to be read or written. + NvU32 i2cSpeed; //!< The target speed of the transaction (between 28Kbps to 40Kbps; not guaranteed). +} NV_I2C_INFO_V1; + +//! Used in NvAPI_I2CRead() and NvAPI_I2CWrite() +typedef struct +{ + NvU32 version; //!< The structure version. + NvU32 displayMask; //!< The Display Mask of the concerned display. + NvU8 bIsDDCPort; //!< This flag indicates either the DDC port (TRUE) or the communication port + //!< (FALSE) of the concerned display. + NvU8 i2cDevAddress; //!< The address of the I2C slave. The address should be shifted left by one. For + //!< example, the I2C address 0x50, often used for reading EDIDs, would be stored + //!< here as 0xA0. This matches the position within the byte sent by the master, as + //!< the last bit is reserved to specify the read or write direction. + NvU8* pbI2cRegAddress; //!< The I2C target register address. May be NULL, which indicates no register + //!< address should be sent. + NvU32 regAddrSize; //!< The size in bytes of target register address. If pbI2cRegAddress is NULL, this + //!< field must be 0. + NvU8* pbData; //!< The buffer of data which is to be read or written (depending on the command). + NvU32 cbSize; //!< The size of the data buffer, pbData, to be read or written. + NvU32 i2cSpeed; //!< Deprecated, Must be set to NVAPI_I2C_SPEED_DEPRECATED. + NV_I2C_SPEED i2cSpeedKhz; //!< The target speed of the transaction in (kHz) (Chosen from the enum NV_I2C_SPEED). +} NV_I2C_INFO_V2; + +//! Used in NvAPI_I2CRead() and NvAPI_I2CWrite() +typedef struct +{ + NvU32 version; //!< The structure version. + NvU32 displayMask; //!< The Display Mask of the concerned display. + NvU8 bIsDDCPort; //!< This flag indicates either the DDC port (TRUE) or the communication port + //!< (FALSE) of the concerned display. + NvU8 i2cDevAddress; //!< The address of the I2C slave. The address should be shifted left by one. For + //!< example, the I2C address 0x50, often used for reading EDIDs, would be stored + //!< here as 0xA0. This matches the position within the byte sent by the master, as + //!< the last bit is reserved to specify the read or write direction. + NvU8* pbI2cRegAddress; //!< The I2C target register address. May be NULL, which indicates no register + //!< address should be sent. + NvU32 regAddrSize; //!< The size in bytes of target register address. If pbI2cRegAddress is NULL, this + //!< field must be 0. + NvU8* pbData; //!< The buffer of data which is to be read or written (depending on the command). 
+ NvU32 cbSize; //!< The size of the data buffer, pbData, to be read or written. + NvU32 i2cSpeed; //!< Deprecated, Must be set to NVAPI_I2C_SPEED_DEPRECATED. + NV_I2C_SPEED i2cSpeedKhz; //!< The target speed of the transaction in (kHz) (Chosen from the enum NV_I2C_SPEED). + NvU8 portId; //!< The portid on which device is connected (remember to set bIsPortIdSet if this value is set) + //!< Optional for pre-Kepler + NvU32 bIsPortIdSet; //!< set this flag on if and only if portid value is set +} NV_I2C_INFO_V3; + +typedef NV_I2C_INFO_V3 NV_I2C_INFO; + +#define NV_I2C_INFO_VER3 MAKE_NVAPI_VERSION(NV_I2C_INFO_V3,3) +#define NV_I2C_INFO_VER2 MAKE_NVAPI_VERSION(NV_I2C_INFO_V2,2) +#define NV_I2C_INFO_VER1 MAKE_NVAPI_VERSION(NV_I2C_INFO_V1,1) + +#define NV_I2C_INFO_VER NV_I2C_INFO_VER3 +//! @} + +/***********************************************************************************/ + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_I2CRead +// +//! This function reads the data buffer from the I2C port. +//! The I2C request must be for a DDC port: pI2cInfo->bIsDDCPort = 1. +//! +//! A data buffer size larger than 16 bytes may be rejected if a register address is specified. In such a case, +//! NVAPI_ARGUMENT_EXCEED_MAX_SIZE would be returned. +//! +//! If a register address is specified (i.e. regAddrSize is positive), then the transaction will be performed in +//! the combined format described in the I2C specification. The register address will be written, followed by +//! reading into the data buffer. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 85 +//! +//! \param [in] hPhysicalGPU GPU selection. +//! \param [out] NV_I2C_INFO *pI2cInfo The I2C data input structure +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred. +//! \retval NVAPI_HANDLE_INVALIDATED Handle passed has been invalidated (see user guide). +//! \retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE Handle passed is not a physical GPU handle. +//! \retval NVAPI_INCOMPATIBLE_STRUCT_VERSION Structure version is not supported. +//! \retval NVAPI_INVALID_ARGUMENT - argument does not meet specified requirements +//! \retval NVAPI_ARGUMENT_EXCEED_MAX_SIZE - an argument exceeds the maximum +//! +//! \ingroup i2capi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_I2CRead(NvPhysicalGpuHandle hPhysicalGpu, NV_I2C_INFO *pI2cInfo); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_I2CWrite +// +//! This function writes the data buffer to the I2C port. +//! +//! The I2C request must be for a DDC port: pI2cInfo->bIsDDCPort = 1. +//! +//! A data buffer size larger than 16 bytes may be rejected if a register address is specified. In such a case, +//! NVAPI_ARGUMENT_EXCEED_MAX_SIZE would be returned. +//! +//! If a register address is specified (i.e. regAddrSize is positive), then the register address will be written +//! and the data buffer will immediately follow without a restart. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 85 +//! +//! \param [in] hPhysicalGPU GPU selection. +//! \param [in] pI2cInfo The I2C data input structure +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred. +//! \retval NVAPI_HANDLE_INVALIDATED Handle passed has been invalidated (see user guide). +//! 
\retval NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE Handle passed is not a physical GPU handle. +//! \retval NVAPI_INCOMPATIBLE_STRUCT_VERSION Structure version is not supported. +//! \retval NVAPI_INVALID_ARGUMENT Argument does not meet specified requirements +//! \retval NVAPI_ARGUMENT_EXCEED_MAX_SIZE Argument exceeds the maximum +//! +//! \ingroup i2capi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_I2CWrite(NvPhysicalGpuHandle hPhysicalGpu, NV_I2C_INFO *pI2cInfo); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_WorkstationFeatureSetup +// +//! \fn NvAPI_GPU_WorkstationFeatureSetup(NvPhysicalGpuHandle hPhysicalGpu, NvU32 featureEnableMask, NvU32 featureDisableMask) +//! DESCRIPTION: This API configures the driver for a set of workstation features. +//! The driver can allocate the memory resources accordingly. +//! +//! SUPPORTED OS: Windows 7 +//! +//! +//! \param [in] hPhysicalGpu Physical GPU Handle of the display adapter to be configured. GPU handles may be retrieved +//! using NvAPI_EnumPhysicalGPUs. A value of NULL is permitted and applies the same operation +//! to all GPU handles enumerated by NvAPI_EnumPhysicalGPUs. +//! \param [in] featureEnableMask Mask of features the caller requests to enable for use +//! \param [in] featureDisableMask Mask of features the caller requests to disable +//! +//! As a general rule, features in the enable and disable masks are expected to be disjoint, although the disable +//! mask has precedence and a feature flagged in both masks will be disabled. +//! +//! \retval ::NVAPI_OK configuration request succeeded +//! \retval ::NVAPI_ERROR configuration request failed +//! \retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu is not a physical GPU handle. +//! \retval ::NVAPI_GPU_WORKSTATION_FEATURE_INCOMPLETE requested feature set does not have all resources allocated for completeness. +//! \retval ::NVAPI_NO_IMPLEMENTATION only implemented for Win7 +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup gpu +typedef enum +{ + NVAPI_GPU_WORKSTATION_FEATURE_MASK_SWAPGROUP = 0x00000001, + NVAPI_GPU_WORKSTATION_FEATURE_MASK_STEREO = 0x00000010, + NVAPI_GPU_WORKSTATION_FEATURE_MASK_WARPING = 0x00000100, + NVAPI_GPU_WORKSTATION_FEATURE_MASK_PIXINTENSITY = 0x00000200, + NVAPI_GPU_WORKSTATION_FEATURE_MASK_GRAYSCALE = 0x00000400, + NVAPI_GPU_WORKSTATION_FEATURE_MASK_BPC10 = 0x00001000 +} NVAPI_GPU_WORKSTATION_FEATURE_MASK; + +//! \ingroup gpu +NVAPI_INTERFACE NvAPI_GPU_WorkstationFeatureSetup(__in NvPhysicalGpuHandle hPhysicalGpu, __in NvU32 featureEnableMask, __in NvU32 featureDisableMask); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_WorkstationFeatureQuery +// +//! DESCRIPTION: This API queries the current set of workstation features. +//! +//! SUPPORTED OS: Windows 7 +//! +//! +//! \param [in] hPhysicalGpu Physical GPU Handle of the display adapter to be configured. GPU handles may be retrieved +//! using NvAPI_EnumPhysicalGPUs. +//! \param [out] pConfiguredFeatureMask Mask of features requested for use by client drivers +//! \param [out] pConsistentFeatureMask Mask of features that have all resources allocated for completeness. +//! +//! \retval ::NVAPI_OK configuration request succeeded +//! \retval ::NVAPI_ERROR configuration request failed +//! 
\retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE hPhysicalGpu is not a physical GPU handle. +//! \retval ::NVAPI_NO_IMPLEMENTATION only implemented for Win7 +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup gpu +NVAPI_INTERFACE NvAPI_GPU_WorkstationFeatureQuery(__in NvPhysicalGpuHandle hPhysicalGpu, __out_opt NvU32 *pConfiguredFeatureMask, __out_opt NvU32 *pConsistentFeatureMask); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetHDCPSupportStatus +// +//! \fn NvAPI_GPU_GetHDCPSupportStatus(NvPhysicalGpuHandle hPhysicalGpu, NV_GPU_GET_HDCP_SUPPORT_STATUS *pGetHDCPSupportStatus) +//! DESCRIPTION: This function returns a GPU's HDCP support status. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 175 +//! +//! \retval ::NVAPI_OK +//! \retval ::NVAPI_ERROR +//! \retval ::NVAPI_INVALID_ARGUMENT +//! \retval ::NVAPI_HANDLE_INVALIDATED +//! \retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE +//! \retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION +// +//////////////////////////////////////////////////////////////////////////////// + + +//! \addtogroup gpu +//! @{ + + +//! HDCP fuse states - used in NV_GPU_GET_HDCP_SUPPORT_STATUS +typedef enum _NV_GPU_HDCP_FUSE_STATE +{ + NV_GPU_HDCP_FUSE_STATE_UNKNOWN = 0, + NV_GPU_HDCP_FUSE_STATE_DISABLED = 1, + NV_GPU_HDCP_FUSE_STATE_ENABLED = 2, +} NV_GPU_HDCP_FUSE_STATE; + + +//! HDCP key sources - used in NV_GPU_GET_HDCP_SUPPORT_STATUS +typedef enum _NV_GPU_HDCP_KEY_SOURCE +{ + NV_GPU_HDCP_KEY_SOURCE_UNKNOWN = 0, + NV_GPU_HDCP_KEY_SOURCE_NONE = 1, + NV_GPU_HDCP_KEY_SOURCE_CRYPTO_ROM = 2, + NV_GPU_HDCP_KEY_SOURCE_SBIOS = 3, + NV_GPU_HDCP_KEY_SOURCE_I2C_ROM = 4, + NV_GPU_HDCP_KEY_SOURCE_FUSES = 5, +} NV_GPU_HDCP_KEY_SOURCE; + + +//! HDCP key source states - used in NV_GPU_GET_HDCP_SUPPORT_STATUS +typedef enum _NV_GPU_HDCP_KEY_SOURCE_STATE +{ + NV_GPU_HDCP_KEY_SOURCE_STATE_UNKNOWN = 0, + NV_GPU_HDCP_KEY_SOURCE_STATE_ABSENT = 1, + NV_GPU_HDCP_KEY_SOURCE_STATE_PRESENT = 2, +} NV_GPU_HDCP_KEY_SOURCE_STATE; + + +//! HDPC support status - used in NvAPI_GPU_GetHDCPSupportStatus() +typedef struct +{ + NvU32 version; //! Structure version constucted by macro #NV_GPU_GET_HDCP_SUPPORT_STATUS + NV_GPU_HDCP_FUSE_STATE hdcpFuseState; //! GPU's HDCP fuse state + NV_GPU_HDCP_KEY_SOURCE hdcpKeySource; //! GPU's HDCP key source + NV_GPU_HDCP_KEY_SOURCE_STATE hdcpKeySourceState; //! GPU's HDCP key source state +} NV_GPU_GET_HDCP_SUPPORT_STATUS; + + +//! Macro for constructing the version for structure NV_GPU_GET_HDCP_SUPPORT_STATUS +#define NV_GPU_GET_HDCP_SUPPORT_STATUS_VER MAKE_NVAPI_VERSION(NV_GPU_GET_HDCP_SUPPORT_STATUS,1) + + +//! @} + + +//! \ingroup gpu +NVAPI_INTERFACE NvAPI_GPU_GetHDCPSupportStatus(NvPhysicalGpuHandle hPhysicalGpu, NV_GPU_GET_HDCP_SUPPORT_STATUS *pGetHDCPSupportStatus); + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetTachReading +// +//! DESCRIPTION: This API retrieves the fan speed tachometer reading for the specified physical GPU. +//! +//! HOW TO USE: +//! - NvU32 Value = 0; +//! - ret = NvAPI_GPU_GetTachReading(hPhysicalGpu, &Value); +//! - On call success: +//! - Value contains the tachometer reading +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! TCC_SUPPORTED +//! +//! \param [in] hPhysicalGpu GPU selection. +//! \param [out] pValue Pointer to a variable to get the tachometer reading +//! +//! 
\retval ::NVAPI_OK - completed request +//! \retval ::NVAPI_ERROR - miscellaneous error occurred +//! \retval ::NVAPI_NOT_SUPPORTED - functionality not supported +//! \retval ::NVAPI_API_NOT_INTIALIZED - nvapi not initialized +//! \retval ::NVAPI_INVALID_ARGUMENT - invalid argument passed +//! \retval ::NVAPI_HANDLE_INVALIDATED - handle passed has been invalidated (see user guide) +//! \retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE - handle passed is not a physical GPU handle +//! +//! \ingroup gpucooler +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetTachReading(NvPhysicalGpuHandle hPhysicalGPU, NvU32 *pValue); + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetECCStatusInfo +// +//! \fn NvAPI_GPU_GetECCStatusInfo(NvPhysicalGpuHandle hPhysicalGpu, +//! NV_GPU_ECC_STATUS_INFO *pECCStatusInfo); +//! DESCRIPTION: This function returns ECC memory status information. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! TCC_SUPPORTED +//! +//! \param [in] hPhysicalGpu A handle identifying the physical GPU for which ECC +//! status information is to be retrieved. +//! \param [out] pECCStatusInfo A pointer to an ECC status structure. +//! +//! \retval ::NVAPI_OK The request was completed successfully. +//! \retval ::NVAPI_ERROR An unknown error occurred. +//! \retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE The provided GPU handle is not a physical GPU handle. +//! \retval ::NVAPI_INVALID_HANDLE The provided GPU handle is invalid. +//! \retval ::NVAPI_HANDLE_INVALIDATED The provided GPU handle is no longer valid. +//! \retval ::NVAPI_INVALID_POINTER An invalid argument pointer was provided. +//! \retval ::NVAPI_NOT_SUPPORTED The request is not supported. +//! \retval ::NVAPI_API_NOT_INTIALIZED NvAPI was not yet initialized. +// +/////////////////////////////////////////////////////////////////////////////// + +//! \addtogroup gpuecc +//! Used in NV_GPU_ECC_STATUS_INFO. +typedef enum _NV_ECC_CONFIGURATION +{ + NV_ECC_CONFIGURATION_NOT_SUPPORTED = 0, + NV_ECC_CONFIGURATION_DEFERRED, //!< Changes require a POST to take effect + NV_ECC_CONFIGURATION_IMMEDIATE, //!< Changes can optionally be made to take effect immediately +} NV_ECC_CONFIGURATION; + +//! \ingroup gpuecc +//! Used in NvAPI_GPU_GetECCStatusInfo(). +typedef struct +{ + NvU32 version; //!< Structure version + NvU32 isSupported : 1; //!< ECC memory feature support + NV_ECC_CONFIGURATION configurationOptions; //!< Supported ECC memory feature configuration options + NvU32 isEnabled : 1; //!< Active ECC memory setting +} NV_GPU_ECC_STATUS_INFO; + +//! \ingroup gpuecc +//! Macro for constructing the version field of NV_GPU_ECC_STATUS_INFO +#define NV_GPU_ECC_STATUS_INFO_VER MAKE_NVAPI_VERSION(NV_GPU_ECC_STATUS_INFO,1) + +//! \ingroup gpuecc +NVAPI_INTERFACE NvAPI_GPU_GetECCStatusInfo(NvPhysicalGpuHandle hPhysicalGpu, + NV_GPU_ECC_STATUS_INFO *pECCStatusInfo); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetECCErrorInfo +// +//! \fn NvAPI_GPU_GetECCErrorInfo(NvPhysicalGpuHandle hPhysicalGpu, +//! NV_GPU_ECC_ERROR_INFO *pECCErrorInfo); +//! +//! DESCRIPTION: This function returns ECC memory error information. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! TCC_SUPPORTED +//! +//! \param [in] hPhysicalGpu A handle identifying the physical GPU for +//! which ECC error information is to be +//! retrieved. +//! 
\param [out] pECCErrorInfo A pointer to an ECC error structure. +//! +//! \retval ::NVAPI_OK The request was completed successfully. +//! \retval ::NVAPI_ERROR An unknown error occurred. +//! \retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE The provided GPU handle is not a physical GPU handle. +//! \retval ::NVAPI_INVALID_ARGUMENT incorrect param value +//! \retval ::NVAPI_INVALID_POINTER An invalid argument pointer was provided. +//! \retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION structure version is not supported, initialize to NV_GPU_ECC_ERROR_INFO_VER. +//! \retval ::NVAPI_HANDLE_INVALIDATED The provided GPU handle is no longer valid. +//! \retval ::NVAPI_NOT_SUPPORTED The request is not supported. +//! \retval ::NVAPI_API_NOT_INTIALIZED NvAPI was not yet initialized. +// +/////////////////////////////////////////////////////////////////////////////// + + +//! \ingroup gpuecc +//! Used in NvAPI_GPU_GetECCErrorInfo()/ +typedef struct +{ + NvU32 version; //!< Structure version + struct { + NvU64 singleBitErrors; //!< Number of single-bit ECC errors detected since last boot + NvU64 doubleBitErrors; //!< Number of double-bit ECC errors detected since last boot + } current; + struct { + NvU64 singleBitErrors; //!< Number of single-bit ECC errors detected since last counter reset + NvU64 doubleBitErrors; //!< Number of double-bit ECC errors detected since last counter reset + } aggregate; +} NV_GPU_ECC_ERROR_INFO; + +//! \ingroup gpuecc +//! Macro for constructing the version field of NV_GPU_ECC_ERROR_INFO +#define NV_GPU_ECC_ERROR_INFO_VER MAKE_NVAPI_VERSION(NV_GPU_ECC_ERROR_INFO,1) + +//! \ingroup gpuecc +NVAPI_INTERFACE NvAPI_GPU_GetECCErrorInfo(NvPhysicalGpuHandle hPhysicalGpu, + NV_GPU_ECC_ERROR_INFO *pECCErrorInfo); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_ResetECCErrorInfo +// +//! DESCRIPTION: This function resets ECC memory error counters. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! TCC_SUPPORTED +//! +//! \param [in] hPhysicalGpu A handle identifying the physical GPU for +//! which ECC error information is to be +//! cleared. +//! \param [in] bResetCurrent Reset the current ECC error counters. +//! \param [in] bResetAggregate Reset the aggregate ECC error counters. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! +//! \retval ::NVAPI_INVALID_USER_PRIVILEGE - The caller does not have administrative privileges +//! +//! \ingroup gpuecc +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_ResetECCErrorInfo(NvPhysicalGpuHandle hPhysicalGpu, NvU8 bResetCurrent, + NvU8 bResetAggregate); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetECCConfigurationInfo +// +//! \fn NvAPI_GPU_GetECCConfigurationInfo(NvPhysicalGpuHandle hPhysicalGpu, +//! NV_GPU_ECC_CONFIGURATION_INFO *pECCConfigurationInfo); +//! DESCRIPTION: This function returns ECC memory configuration information. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! TCC_SUPPORTED +//! +//! \param [in] hPhysicalGpu A handle identifying the physical GPU for +//! which ECC configuration information +//! is to be retrieved. +//! \param [out] pECCConfigurationInfo A pointer to an ECC +//! configuration structure. +//! +//! \retval ::NVAPI_OK The request was completed successfully. 
+//! \retval ::NVAPI_ERROR An unknown error occurred. +//! \retval ::NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE The provided GPU handle is not a physical GPU handle. +//! \retval ::NVAPI_INVALID_HANDLE The provided GPU handle is invalid. +//! \retval ::NVAPI_HANDLE_INVALIDATED The provided GPU handle is no longer valid. +//! \retval ::NVAPI_INVALID_POINTER An invalid argument pointer was provided. +//! \retval ::NVAPI_NOT_SUPPORTED The request is not supported. +//! \retval ::NVAPI_API_NOT_INTIALIZED NvAPI was not yet initialized. +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup gpuecc +//! Used in NvAPI_GPU_GetECCConfigurationInfo(). +typedef struct +{ + NvU32 version; //! Structure version + NvU32 isEnabled : 1; //! Current ECC configuration stored in non-volatile memory + NvU32 isEnabledByDefault : 1; //! Factory default ECC configuration (static) +} NV_GPU_ECC_CONFIGURATION_INFO; + +//! \ingroup gpuecc +//! Macro for consstructing the verion field of NV_GPU_ECC_CONFIGURATION_INFO +#define NV_GPU_ECC_CONFIGURATION_INFO_VER MAKE_NVAPI_VERSION(NV_GPU_ECC_CONFIGURATION_INFO,1) + +//! \ingroup gpuecc +NVAPI_INTERFACE NvAPI_GPU_GetECCConfigurationInfo(NvPhysicalGpuHandle hPhysicalGpu, + NV_GPU_ECC_CONFIGURATION_INFO *pECCConfigurationInfo); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_SetECCConfiguration +// +//! DESCRIPTION: This function updates the ECC memory configuration setting. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! TCC_SUPPORTED +//! +//! \param [in] hPhysicalGpu A handle identifying the physical GPU for +//! which to update the ECC configuration +//! setting. +//! \param [in] bEnable The new ECC configuration setting. +//! \param [in] bEnableImmediately Request that the new setting take effect immediately. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! +//! \retval ::NVAPI_INVALID_CONFIGURATION - Possibly SLI is enabled. Disable SLI and retry. +//! \retval ::NVAPI_INVALID_USER_PRIVILEGE - The caller does not have administrative privileges +//! +//! \ingroup gpuecc +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_SetECCConfiguration(NvPhysicalGpuHandle hPhysicalGpu, NvU8 bEnable, + NvU8 bEnableImmediately); + + + +//! \ingroup gpu +typedef struct +{ + NvU32 version; //!< version of this structure + NvU32 width; //!< width of the input texture + NvU32 height; //!< height of the input texture + float* blendingTexture; //!< array of floating values building an intensity RGB texture +} NV_SCANOUT_INTENSITY_DATA_V1; + +//! \ingroup gpu +typedef struct +{ + NvU32 version; //!< version of this structure + NvU32 width; //!< width of the input texture + NvU32 height; //!< height of the input texture + float* blendingTexture; //!< array of floating values building an intensity RGB texture + float* offsetTexture; //!< array of floating values building an offset texture + NvU32 offsetTexChannels; //!< number of channels per pixel in the offset texture +} NV_SCANOUT_INTENSITY_DATA_V2; + +typedef NV_SCANOUT_INTENSITY_DATA_V2 NV_SCANOUT_INTENSITY_DATA; + +//! 
\ingroup gpu +#define NV_SCANOUT_INTENSITY_DATA_VER1 MAKE_NVAPI_VERSION(NV_SCANOUT_INTENSITY_DATA_V1, 1) +#define NV_SCANOUT_INTENSITY_DATA_VER2 MAKE_NVAPI_VERSION(NV_SCANOUT_INTENSITY_DATA_V2, 2) +#define NV_SCANOUT_INTENSITY_DATA_VER NV_SCANOUT_INTENSITY_DATA_VER2 + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GPU_SetScanoutIntensity +// +//! DESCRIPTION: This API enables and sets up per-pixel intensity feature on the specified display. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] displayId combined physical display and GPU identifier of the display to apply the intensity control. +//! \param [in] scanoutIntensityData the intensity texture info. +//! \param [out] pbSticky(OUT) indicates whether the settings will be kept over a reboot. +//! +//! \retval ::NVAPI_INVALID_ARGUMENT Invalid input parameters. +//! \retval ::NVAPI_API_NOT_INITIALIZED NvAPI not initialized. +//! \retval ::NVAPI_NOT_SUPPORTED Interface not supported by the driver used, or only supported on selected GPUs +//! \retval ::NVAPI_INVALID_ARGUMENT Invalid input data. +//! \retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION NV_SCANOUT_INTENSITY_DATA structure version mismatch. +//! \retval ::NVAPI_OK Feature enabled. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_SetScanoutIntensity(NvU32 displayId, NV_SCANOUT_INTENSITY_DATA* scanoutIntensityData, int *pbSticky); + + +//! \ingroup gpu +typedef struct _NV_SCANOUT_INTENSITY_STATE_DATA +{ + NvU32 version; //!< version of this structure + NvU32 bEnabled; //!< intensity is enabled or not +} NV_SCANOUT_INTENSITY_STATE_DATA; + +//! \ingroup gpu +#define NV_SCANOUT_INTENSITY_STATE_VER MAKE_NVAPI_VERSION(NV_SCANOUT_INTENSITY_STATE_DATA, 1) + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GPU_GetScanoutIntensityState +// +//! DESCRIPTION: This API queries current state of the intensity feature on the specified display. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] displayId combined physical display and GPU identifier of the display to query the configuration. +//! \param [in,out] scanoutIntensityStateData intensity state data. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! +//! \retval ::NVAPI_INVALID_ARGUMENT Invalid input parameters. +//! \retval ::NVAPI_API_NOT_INITIALIZED NvAPI not initialized. +//! \retval ::NVAPI_NOT_SUPPORTED Interface not supported by the driver used, or only supported on selected GPUs. +//! \retval ::NVAPI_OK Feature enabled. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetScanoutIntensityState(__in NvU32 displayId, __inout NV_SCANOUT_INTENSITY_STATE_DATA* scanoutIntensityStateData); + + +//! \ingroup gpu +typedef enum +{ + NV_GPU_WARPING_VERTICE_FORMAT_TRIANGLESTRIP_XYUVRQ = 0, + NV_GPU_WARPING_VERTICE_FORMAT_TRIANGLES_XYUVRQ = 1, +} NV_GPU_WARPING_VERTICE_FORMAT; + +//! 
\ingroup gpu +typedef struct +{ + NvU32 version; //!< version of this structure + float* vertices; //!< width of the input texture + NV_GPU_WARPING_VERTICE_FORMAT vertexFormat; //!< format of the input vertices + int numVertices; //!< number of the input vertices + NvSBox* textureRect; //!< rectangle in desktop coordinates describing the source area for the warping +} NV_SCANOUT_WARPING_DATA; + +//! \ingroup gpu +#define NV_SCANOUT_WARPING_VER MAKE_NVAPI_VERSION(NV_SCANOUT_WARPING_DATA, 1) + + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GPU_SetScanoutWarping +// +//! DESCRIPTION: This API enables and sets up the warping feature on the specified display. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] displayId Combined physical display and GPU identifier of the display to apply the intensity control +//! \param [in] scanoutWarpingData The warping data info +//! \param [out] pbSticky Indicates whether the settings will be kept over a reboot. +//! +//! \retval ::NVAPI_INVALID_ARGUMENT Invalid input parameters. +//! \retval ::NVAPI_API_NOT_INITIALIZED NvAPI not initialized. +//! \retval ::NVAPI_NOT_SUPPORTED Interface not supported by the driver used, or only supported on selected GPUs +//! \retval ::NVAPI_INVALID_ARGUMENT Invalid input data. +//! \retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION NV_SCANOUT_WARPING_DATA structure version mismatch. +//! \retval ::NVAPI_OK Feature enabled. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// + +NVAPI_INTERFACE NvAPI_GPU_SetScanoutWarping(NvU32 displayId, NV_SCANOUT_WARPING_DATA* scanoutWarpingData, int* piMaxNumVertices, int* pbSticky); + + +//! \ingroup gpu +typedef struct _NV_SCANOUT_WARPING_STATE_DATA +{ + NvU32 version; //!< version of this structure + NvU32 bEnabled; //!< warping is enabled or not +} NV_SCANOUT_WARPING_STATE_DATA; + +//! \ingroup gpu +#define NV_SCANOUT_WARPING_STATE_VER MAKE_NVAPI_VERSION(NV_SCANOUT_WARPING_STATE_DATA, 1) + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GPU_GetScanoutWarpingState +// +//! DESCRIPTION: This API queries current state of the warping feature on the specified display. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] displayId combined physical display and GPU identifier of the display to query the configuration. +//! \param [in,out] scanoutWarpingStateData warping state data. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! +//! \retval ::NVAPI_INVALID_ARGUMENT Invalid input parameters. +//! \retval ::NVAPI_API_NOT_INITIALIZED NvAPI not initialized. +//! \retval ::NVAPI_NOT_SUPPORTED Interface not supported by the driver used, or only supported on selected GPUs. +//! \retval ::NVAPI_OK Feature enabled. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetScanoutWarpingState(__in NvU32 displayId, __inout NV_SCANOUT_WARPING_STATE_DATA* scanoutWarpingStateData); + +typedef enum +{ + NV_GPU_SCANOUT_COMPOSITION_PARAMETER_WARPING_RESAMPLING_METHOD = 0 +} NV_GPU_SCANOUT_COMPOSITION_PARAMETER; + +//! 
This enum defines a collection of possible scanout composition values that can be used to configure +//! possible scanout composition settings. (Currently the only parameter defined is the WARPING_RESAMPLING_METHOD). +typedef enum +{ + NV_GPU_SCANOUT_COMPOSITION_PARAMETER_SET_TO_DEFAULT = 0, // Set parameter to default value. + // WARPING_RESAMPLING_METHOD section: + NV_GPU_SCANOUT_COMPOSITION_PARAMETER_VALUE_WARPING_RESAMPLING_METHOD_BILINEAR = 0x100, + NV_GPU_SCANOUT_COMPOSITION_PARAMETER_VALUE_WARPING_RESAMPLING_METHOD_BICUBIC_TRIANGULAR = 0x101, + NV_GPU_SCANOUT_COMPOSITION_PARAMETER_VALUE_WARPING_RESAMPLING_METHOD_BICUBIC_BELL_SHAPED = 0x102, + NV_GPU_SCANOUT_COMPOSITION_PARAMETER_VALUE_WARPING_RESAMPLING_METHOD_BICUBIC_BSPLINE = 0x103, + NV_GPU_SCANOUT_COMPOSITION_PARAMETER_VALUE_WARPING_RESAMPLING_METHOD_BICUBIC_ADAPTIVE_TRIANGULAR = 0x104, + NV_GPU_SCANOUT_COMPOSITION_PARAMETER_VALUE_WARPING_RESAMPLING_METHOD_BICUBIC_ADAPTIVE_BELL_SHAPED = 0x105, + NV_GPU_SCANOUT_COMPOSITION_PARAMETER_VALUE_WARPING_RESAMPLING_METHOD_BICUBIC_ADAPTIVE_BSPLINE = 0x106 +} NV_GPU_SCANOUT_COMPOSITION_PARAMETER_VALUE; + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GPU_SetScanoutCompositionParameter +// +//! DESCRIPTION: This API sets various parameters that configure the scanout composition feature on the specified display. +//! (currently there is only one configurable parameter defined: WARPING_RESAMPLING_METHOD, +//! but this function is designed to support the addition of parameters as needed.) +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] displayId Combined physical display and GPU identifier of the display to apply the intensity control +//! \param [in] parameter The scanout composition parameter to be set +//! \param [in] parameterValue The data to be set for the specified parameter +//! \param [in] pContainer Additional container for data associated with the specified parameter +//! +//! \retval ::NVAPI_INVALID_ARGUMENT Invalid input parameters. +//! \retval ::NVAPI_API_NOT_INITIALIZED NvAPI not initialized. +//! \retval ::NVAPI_NOT_SUPPORTED Interface not supported by the driver used, or only supported on selected GPUs +//! \retval ::NVAPI_INVALID_ARGUMENT Invalid input data. +//! \retval ::NVAPI_OK Feature enabled. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// + +NVAPI_INTERFACE NvAPI_GPU_SetScanoutCompositionParameter(NvU32 displayId, NV_GPU_SCANOUT_COMPOSITION_PARAMETER parameter, + NV_GPU_SCANOUT_COMPOSITION_PARAMETER_VALUE parameterValue, float *pContainer); + + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GPU_GetScanoutCompositionParameter +// +//! DESCRIPTION: This API queries current state of one of the various scanout composition parameters on the specified display. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] displayId combined physical display and GPU identifier of the display to query the configuration. +//! \param [in] parameter scanout composition parameter to by queried. +//! \param [out] parameterData scanout composition parameter data. +//! \param [out] pContainer Additional container for returning data associated with the specified parameter +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! 
specific meaning for this API, they are listed below. +//! +//! \retval ::NVAPI_INVALID_ARGUMENT Invalid input parameters. +//! \retval ::NVAPI_API_NOT_INITIALIZED NvAPI not initialized. +//! \retval ::NVAPI_NOT_SUPPORTED Interface not supported by the driver used, or only supported on selected GPUs. +//! \retval ::NVAPI_OK Feature enabled. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetScanoutCompositionParameter(__in NvU32 displayId, __in NV_GPU_SCANOUT_COMPOSITION_PARAMETER parameter, + __out NV_GPU_SCANOUT_COMPOSITION_PARAMETER_VALUE *parameterData, __out float *pContainer); + + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GPU_GetScanoutConfiguration +// +//! DESCRIPTION: This API queries the desktop and scanout portion of the specified display. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] displayId combined physical display and GPU identifier of the display to query the configuration. +//! \param [in,out] desktopRect desktop area of the display in desktop coordinates. +//! \param [in,out] scanoutRect scanout area of the display relative to desktopRect. +//! +//! \retval ::NVAPI_INVALID_ARGUMENT Invalid input parameters. +//! \retval ::NVAPI_API_NOT_INITIALIZED NvAPI not initialized. +//! \retval ::NVAPI_NOT_SUPPORTED Interface not supported by the driver used, or only supported on selected GPUs. +//! \retval ::NVAPI_OK Feature enabled. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetScanoutConfiguration(NvU32 displayId, NvSBox* desktopRect, NvSBox* scanoutRect); + + + +//! \ingroup gpu +//! Used in NvAPI_GPU_GetScanoutConfigurationEx(). +typedef struct _NV_SCANOUT_INFORMATION +{ + NvU32 version; //!< Structure version, needs to be initialized with NV_SCANOUT_INFORMATION_VER. + + NvSBox sourceDesktopRect; //!< Operating system display device rect in desktop coordinates displayId is scanning out from. + NvSBox sourceViewportRect; //!< Area inside the sourceDesktopRect which is scanned out to the display. + NvSBox targetViewportRect; //!< Area inside the rect described by targetDisplayWidth/Height sourceViewportRect is scanned out to. + NvU32 targetDisplayWidth; //!< Horizontal size of the active resolution scanned out to the display. + NvU32 targetDisplayHeight; //!< Vertical size of the active resolution scanned out to the display. + NvU32 cloneImportance; //!< If targets are cloned views of the sourceDesktopRect the cloned targets have an importance assigned (0:primary,1 secondary,...). + NV_ROTATE sourceToTargetRotation; //!< Rotation performed between the sourceViewportRect and the targetViewportRect. +} NV_SCANOUT_INFORMATION; + +#define NV_SCANOUT_INFORMATION_VER MAKE_NVAPI_VERSION(NV_SCANOUT_INFORMATION,1) + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GPU_GetScanoutConfigurationEx +// +//! DESCRIPTION: This API queries the desktop and scanout portion of the specified display. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! \since Release: 331 +//! +//! \param [in] displayId combined physical display and GPU identifier of the display to query the configuration. +//! 
\param [in,out] pScanoutInformation desktop area to displayId mapping information. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetScanoutConfigurationEx(__in NvU32 displayId, __inout NV_SCANOUT_INFORMATION *pScanoutInformation); + +//! Used in NvAPI_GPU_GetPerfDecreaseInfo. +//! Bit masks for knowing the exact reason for performance decrease +typedef enum _NVAPI_GPU_PERF_DECREASE +{ + NV_GPU_PERF_DECREASE_NONE = 0, //!< No Slowdown detected + NV_GPU_PERF_DECREASE_REASON_THERMAL_PROTECTION = 0x00000001, //!< Thermal slowdown/shutdown/POR thermal protection + NV_GPU_PERF_DECREASE_REASON_POWER_CONTROL = 0x00000002, //!< Power capping / pstate cap + NV_GPU_PERF_DECREASE_REASON_AC_BATT = 0x00000004, //!< AC->BATT event + NV_GPU_PERF_DECREASE_REASON_API_TRIGGERED = 0x00000008, //!< API triggered slowdown + NV_GPU_PERF_DECREASE_REASON_INSUFFICIENT_POWER = 0x00000010, //!< Power connector missing + NV_GPU_PERF_DECREASE_REASON_UNKNOWN = 0x80000000, //!< Unknown reason +} NVAPI_GPU_PERF_DECREASE; + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetPerfDecreaseInfo +// +//! DESCRIPTION: This function retrieves - in NvU32 variable - reasons for the current performance decrease. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! \param [in] hPhysicalGPU (IN) - GPU for which performance decrease is to be evaluated. +//! \param [out] pPerfDecrInfo (OUT) - Pointer to a NvU32 variable containing performance decrease info +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetPerfDecreaseInfo(__in NvPhysicalGpuHandle hPhysicalGpu, __inout NvU32 *pPerfDecrInfo); + +//! \ingroup gpu +typedef enum _NV_GPU_ILLUMINATION_ATTRIB +{ + NV_GPU_IA_LOGO_BRIGHTNESS = 0, + NV_GPU_IA_SLI_BRIGHTNESS = 1, +} NV_GPU_ILLUMINATION_ATTRIB; + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_QueryIlluminationSupport +// +//! \fn NvAPI_GPU_QueryIlluminationSupport(__inout NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM *pIlluminationSupportInfo) +//! DESCRIPTION: This function reports if the specified illumination attribute is supported. +//! +//! \note Only a single GPU can manage an given attribute on a given HW element, +//! regardless of how many are attatched. I.E. only one GPU will be used to control +//! the brightness of the LED on an SLI bridge, regardless of how many are physicaly attached. +//! You should enumerate thru the GPUs with this call to determine which GPU is managing the attribute. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! \since Version: 300.05 +//! +//! \param [in] hPhysicalGpu Physical GPU handle +//! \param Attribute An enumeration value specifying the Illumination attribute to be querried +//! \param [out] pSupported A boolean indicating if the attribute is supported. +//! +//! \return See \ref nvapistatus for the list of possible return values. +// +////////////////////////////////////////////////////////////////////////////// + +//! 
\ingroup gpu +typedef struct _NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM_V1 { + + // IN + NvU32 version; //!< Version of this structure + NvPhysicalGpuHandle hPhysicalGpu; //!< The handle of the GPU that you are checking for the specified attribute. + //!< note that this is the GPU that is managing the attribute. + //!< Only a single GPU can manage an given attribute on a given HW element, + //!< regardless of how many are attatched. + //!< I.E. only one GPU will be used to control the brightness of the LED on an SLI bridge, + //!< regardless of how many are physicaly attached. + //!< You enumerate thru the GPUs with this call to determine which GPU is managing the attribute. + NV_GPU_ILLUMINATION_ATTRIB Attribute; //!< An enumeration value specifying the Illumination attribute to be querried. + //!< refer to enum \ref NV_GPU_ILLUMINATION_ATTRIB. + + // OUT + NvU32 bSupported; //!< A boolean indicating if the attribute is supported. + +} NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM_V1; + +//! \ingroup gpu +typedef NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM_V1 NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM; +//! \ingroup gpu +#define NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM_VER_1 MAKE_NVAPI_VERSION(NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM_V1,1) +//! \ingroup gpu +#define NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM_VER NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM_VER_1 + +//! \ingroup gpu +NVAPI_INTERFACE NvAPI_GPU_QueryIlluminationSupport(__inout NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM *pIlluminationSupportInfo); + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetIllumination +// +//! \fn NvAPI_GPU_GetIllumination(NV_GPU_GET_ILLUMINATION_PARM *pIlluminationInfo) +//! DESCRIPTION: This function reports value of the specified illumination attribute. +//! +//! \note Only a single GPU can manage an given attribute on a given HW element, +//! regardless of how many are attatched. I.E. only one GPU will be used to control +//! the brightness of the LED on an SLI bridge, regardless of how many are physicaly attached. +//! You should enumerate thru the GPUs with the \ref NvAPI_GPU_QueryIlluminationSupport call to +//! determine which GPU is managing the attribute. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! \since Version: 300.05 +//! +//! \param [in] hPhysicalGpu Physical GPU handle +//! \param Attribute An enumeration value specifying the Illumination attribute to be querried +//! \param [out] Value A DWORD containing the current value for the specified attribute. +//! This is specified as a percentage of the full range of the attribute +//! (0-100; 0 = off, 100 = full brightness) +//! +//! \return See \ref nvapistatus for the list of possible return values. Return values of special interest are: +//! NVAPI_INVALID_ARGUMENT The specified attibute is not known to the driver. +//! NVAPI_NOT_SUPPORTED: The specified attribute is not supported on the specified GPU +// +////////////////////////////////////////////////////////////////////////////// + +//! \ingroup gpu +typedef struct _NV_GPU_GET_ILLUMINATION_PARM_V1 { + + // IN + NvU32 version; //!< Version of this structure + NvPhysicalGpuHandle hPhysicalGpu; //!< The handle of the GPU that you are checking for the specified attribute. + //!< Note that this is the GPU that is managing the attribute. + //!< Only a single GPU can manage an given attribute on a given HW element, + //!< regardless of how many are attatched. + //!< I.E. 
only one GPU will be used to control the brightness of the LED on an SLI bridge, + //!< regardless of how many are physicaly attached. + //!< You enumerate thru the GPUs with this call to determine which GPU is managing the attribute. + NV_GPU_ILLUMINATION_ATTRIB Attribute; //!< An enumeration value specifying the Illumination attribute to be querried. + //!< refer to enum \ref NV_GPU_ILLUMINATION_ATTRIB. + + // OUT + NvU32 Value; //!< A DWORD that will contain the current value of the specified attribute. + //! This is specified as a percentage of the full range of the attribute + //! (0-100; 0 = off, 100 = full brightness) + +} NV_GPU_GET_ILLUMINATION_PARM_V1; + +//! \ingroup gpu +typedef NV_GPU_GET_ILLUMINATION_PARM_V1 NV_GPU_GET_ILLUMINATION_PARM; +//! \ingroup gpu +#define NV_GPU_GET_ILLUMINATION_PARM_VER_1 MAKE_NVAPI_VERSION(NV_GPU_GET_ILLUMINATION_PARM_V1,1) +//! \ingroup gpu +#define NV_GPU_GET_ILLUMINATION_PARM_VER NV_GPU_GET_ILLUMINATION_PARM_VER_1 + +//! \ingroup gpu +NVAPI_INTERFACE NvAPI_GPU_GetIllumination(NV_GPU_GET_ILLUMINATION_PARM *pIlluminationInfo); + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_SetIllumination +// +//! \fn NvAPI_GPU_SetIllumination(NV_GPU_SET_ILLUMINATION_PARM *pIlluminationInfo) +//! DESCRIPTION: This function sets the value of the specified illumination attribute. +//! +//! \note Only a single GPU can manage an given attribute on a given HW element, +//! regardless of how many are attatched. I.E. only one GPU will be used to control +//! the brightness of the LED on an SLI bridge, regardless of how many are physicaly attached. +//! You should enumerate thru the GPUs with the \ref NvAPI_GPU_QueryIlluminationSupport call to +//! determine which GPU is managing the attribute. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! \since Version: 300.05 +//! +//! \param [in] hPhysicalGpu Physical GPU handle +//! \param Attribute An enumeration value specifying the Illumination attribute to be set +//! \param Value The new value for the specified attribute. +//! This should be specified as a percentage of the full range of the attribute +//! (0-100; 0 = off, 100 = full brightness) +//! If a value is specified outside this range, NVAPI_INVALID_ARGUMENT will be returned. +//! +//! \return See \ref nvapistatus for the list of possible return values. Return values of special interest are: +//! NVAPI_INVALID_ARGUMENT The specified attibute is not known to the driver, or the specified value is out of range. +//! NVAPI_NOT_SUPPORTED The specified attribute is not supported on the specified GPU. +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup gpu +typedef struct _NV_GPU_SET_ILLUMINATION_PARM_V1 { + + // IN + NvU32 version; //!< Version of this structure + NvPhysicalGpuHandle hPhysicalGpu; //!< The handle of the GPU that you are checking for the specified attribute. + //!< Note that this is the GPU that is managing the attribute. + //!< Only a single GPU can manage an given attribute on a given HW element, + //!< regardless of how many are attatched. + //!< I.E. only one GPU will be used to control the brightness of the LED on an SLI bridge, + //!< regardless of how many are physicaly attached. + //!< You enumerate thru the GPUs with this call to determine which GPU is managing the attribute. + NV_GPU_ILLUMINATION_ATTRIB Attribute; //!< An enumeration value specifying the Illumination attribute to be querried. 
+ //!< refer to enum \ref NV_GPU_ILLUMINATION_ATTRIB. + NvU32 Value; //!< A DWORD containing the new value for the specified attribute. + //!< This should be specified as a percentage of the full range of the attribute + //!< (0-100; 0 = off, 100 = full brightness) + //!< If a value is specified outside this range, NVAPI_INVALID_ARGUMENT will be returned. + + // OUT + +} NV_GPU_SET_ILLUMINATION_PARM_V1; + +//! \ingroup gpu +typedef NV_GPU_SET_ILLUMINATION_PARM_V1 NV_GPU_SET_ILLUMINATION_PARM; +//! \ingroup gpu +#define NV_GPU_SET_ILLUMINATION_PARM_VER_1 MAKE_NVAPI_VERSION(NV_GPU_SET_ILLUMINATION_PARM_V1,1) +//! \ingroup gpu +#define NV_GPU_SET_ILLUMINATION_PARM_VER NV_GPU_SET_ILLUMINATION_PARM_VER_1 + +//! \ingroup gpu +NVAPI_INTERFACE NvAPI_GPU_SetIllumination(NV_GPU_SET_ILLUMINATION_PARM *pIlluminationInfo); + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_EnumNvidiaDisplayHandle +// +//! This function returns the handle of the NVIDIA display specified by the enum +//! index (thisEnum). The client should keep enumerating until it +//! returns NVAPI_END_ENUMERATION. +//! +//! Note: Display handles can get invalidated on a modeset, so the calling applications need to +//! renum the handles after every modeset. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \param [in] thisEnum The index of the NVIDIA display. +//! \param [out] pNvDispHandle Pointer to the NVIDIA display handle. +//! +//! \retval NVAPI_INVALID_ARGUMENT Either the handle pointer is NULL or enum index too big +//! \retval NVAPI_OK Return a valid NvDisplayHandle based on the enum index +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA device found in the system +//! \retval NVAPI_END_ENUMERATION No more display device to enumerate +//! \ingroup disphandle +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_EnumNvidiaDisplayHandle(NvU32 thisEnum, NvDisplayHandle *pNvDispHandle); + + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_EnumNvidiaUnAttachedDisplayHandle +// +//! This function returns the handle of the NVIDIA unattached display specified by the enum +//! index (thisEnum). The client should keep enumerating until it +//! returns error. +//! Note: Display handles can get invalidated on a modeset, so the calling applications need to +//! renum the handles after every modeset. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \param [in] thisEnum The index of the NVIDIA display. +//! \param [out] pNvUnAttachedDispHandle Pointer to the NVIDIA display handle of the unattached display. +//! +//! \retval NVAPI_INVALID_ARGUMENT Either the handle pointer is NULL or enum index too big +//! \retval NVAPI_OK Return a valid NvDisplayHandle based on the enum index +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA device found in the system +//! \retval NVAPI_END_ENUMERATION No more display device to enumerate. +//! \ingroup disphandle +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_EnumNvidiaUnAttachedDisplayHandle(NvU32 thisEnum, NvUnAttachedDisplayHandle *pNvUnAttachedDispHandle); + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_CreateDisplayFromUnAttachedDisplay +// +//! 
This function converts the unattached display handle to an active attached display handle. +//! +//! At least one GPU must be present in the system and running an NVIDIA display driver. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT hNvUnAttachedDisp is not valid or pNvDisplay is NULL. +//! \retval NVAPI_OK One or more handles were returned +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_CreateDisplayFromUnAttachedDisplay(NvUnAttachedDisplayHandle hNvUnAttachedDisp, NvDisplayHandle *pNvDisplay); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetAssociatedNVidiaDisplayHandle +// +//! This function returns the handle of the NVIDIA display that is associated +//! with the given display "name" (such as "\\.\DISPLAY1"). +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT Either argument is NULL +//! \retval NVAPI_OK *pNvDispHandle is now valid +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA device maps to that display name +//! \ingroup disphandle +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetAssociatedNvidiaDisplayHandle(const char *szDisplayName, NvDisplayHandle *pNvDispHandle); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DISP_GetAssociatedUnAttachedNvidiaDisplayHandle +// +//! DESCRIPTION: This function returns the handle of an unattached NVIDIA display that is +//! associated with the given display name (such as "\\DISPLAY1"). +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! \retval ::NVAPI_INVALID_ARGUMENT Either argument is NULL. +//! \retval ::NVAPI_OK *pNvUnAttachedDispHandle is now valid. +//! \retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA device maps to that display name. +//! +//! \ingroup disphandle +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DISP_GetAssociatedUnAttachedNvidiaDisplayHandle(const char *szDisplayName, NvUnAttachedDisplayHandle *pNvUnAttachedDispHandle); + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetAssociatedNVidiaDisplayName +// +//! For a given NVIDIA display handle, this function returns a string (such as "\\.\DISPLAY1") to identify the display. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT Either argument is NULL +//! \retval NVAPI_OK *pNvDispHandle is now valid +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA device maps to that display name +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetAssociatedNvidiaDisplayName(NvDisplayHandle NvDispHandle, NvAPI_ShortString szDisplayName); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetUnAttachedAssociatedDisplayName +// +//! This function returns the display name given, for example, "\\DISPLAY1", using the unattached NVIDIA display handle +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! 
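+//! A short enumeration sketch (an editor-added illustration, not part of the original NVAPI header) combining
+//! the unattached-display enumeration described above with this name query, which is declared just below.
+//! It assumes NvAPI_Initialize() has already succeeded.
+//! \code
+//! NvUnAttachedDisplayHandle hUnattached = NULL;
+//! NvAPI_ShortString name;
+//! NvU32 i;
+//! for (i = 0; NvAPI_EnumNvidiaUnAttachedDisplayHandle(i, &hUnattached) == NVAPI_OK; i++)
+//! {
+//!     if (NvAPI_GetUnAttachedAssociatedDisplayName(hUnattached, name) == NVAPI_OK)
+//!     {
+//!         // "name" now holds a display name such as "\\DISPLAY2"
+//!     }
+//! }
+//! \endcode
+//!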
\since Release: 95 +//! +//! \retval NVAPI_INVALID_ARGUMENT Either argument is NULL +//! \retval NVAPI_OK *pNvDispHandle is now valid +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA device maps to that display name +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetUnAttachedAssociatedDisplayName(NvUnAttachedDisplayHandle hNvUnAttachedDisp, NvAPI_ShortString szDisplayName); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_EnableHWCursor +// +//! This function enables hardware cursor support +//! +//! SUPPORTED OS: Windows XP +//! +//! +//! +//! \since Release: 80 +//! +//! \return NVAPI_ERROR or NVAPI_OK +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_EnableHWCursor(NvDisplayHandle hNvDisplay); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DisableHWCursor +// +//! This function disables hardware cursor support +//! +//! SUPPORTED OS: Windows XP +//! +//! +//! \since Release: 80 +//! +//! \return NVAPI_ERROR or NVAPI_OK +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DisableHWCursor(NvDisplayHandle hNvDisplay); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetVBlankCounter +// +//! This function gets the V-blank counter +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 80 +//! +//! \return NVAPI_ERROR or NVAPI_OK +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetVBlankCounter(NvDisplayHandle hNvDisplay, NvU32 *pCounter); + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_SetRefreshRateOverride +// +//! This function overrides the refresh rate on the given display/outputsMask. +//! The new refresh rate can be applied right away in this API call or deferred to be applied with the +//! next OS modeset. The override is good for only one modeset (regardless whether it's deferred or immediate). +//! +//! +//! SUPPORTED OS: Windows XP +//! +//! +//! \since Release: 80 +//! +//! \param [in] hNvDisplay The NVIDIA display handle. It can be NVAPI_DEFAULT_HANDLE or a handle +//! enumerated from NvAPI_EnumNVidiaDisplayHandle(). +//! \param [in] outputsMask A set of bits that identify all target outputs which are associated with the NVIDIA +//! display handle to apply the refresh rate override. When SLI is enabled, the +//! outputsMask only applies to the GPU that is driving the display output. +//! \param [in] refreshRate The override value. "0.0" means cancel the override. +//! \param [in] bSetDeferred +//! - "0": Apply the refresh rate override immediately in this API call.\p +//! - "1": Apply refresh rate at the next OS modeset. +//! +//! \retval NVAPI_INVALID_ARGUMENT hNvDisplay or outputsMask is invalid +//! \retval NVAPI_OK The refresh rate override is correct set +//! \retval NVAPI_ERROR The operation failed +//! 
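+//! A brief usage sketch (an editor-added illustration, not part of the original NVAPI header): force a 60 Hz
+//! override on the outputs of the first enumerated display and apply it immediately. It assumes NvAPI_Initialize()
+//! has already succeeded; the 60.0f value and the use of NvAPI_GetAssociatedDisplayOutputId() (declared just
+//! below) to obtain outputsMask are illustrative choices.
+//! \code
+//! NvDisplayHandle hDisp = NULL;
+//! NvU32 outputsMask = 0;
+//! if (NvAPI_EnumNvidiaDisplayHandle(0, &hDisp) == NVAPI_OK &&
+//!     NvAPI_GetAssociatedDisplayOutputId(hDisp, &outputsMask) == NVAPI_OK)
+//! {
+//!     // bSetDeferred = 0 applies the override in this call; pass 0.0f later to cancel it.
+//!     NvAPI_Status status = NvAPI_SetRefreshRateOverride(hDisp, outputsMask, 60.0f, 0);
+//! }
+//! \endcode
+//!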
\ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_SetRefreshRateOverride(NvDisplayHandle hNvDisplay, NvU32 outputsMask, float refreshRate, NvU32 bSetDeferred); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetAssociatedDisplayOutputId +// +//! This function gets the active outputId associated with the display handle. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 90 +//! +//! \param [in] hNvDisplay NVIDIA Display selection. It can be NVAPI_DEFAULT_HANDLE or a handle enumerated from NvAPI_EnumNVidiaDisplayHandle(). +//! \param [out] outputId The active display output ID associated with the selected display handle hNvDisplay. +//! The outputid will have only one bit set. In the case of Clone or Span mode, this will indicate the +//! display outputId of the primary display that the GPU is driving. See \ref handles. +//! +//! \retval NVAPI_OK Call successful. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_EXPECTED_DISPLAY_HANDLE hNvDisplay is not a valid display handle. +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetAssociatedDisplayOutputId(NvDisplayHandle hNvDisplay, NvU32 *pOutputId); + + +//! \ingroup dispcontrol +//! Used in NvAPI_GetDisplayPortInfo(). +typedef struct _NV_DISPLAY_PORT_INFO_V1 +{ + NvU32 version; //!< Structure version + NvU32 dpcd_ver; //!< DPCD version of the monitor + NV_DP_LINK_RATE maxLinkRate; //!< Maximum supported link rate + NV_DP_LANE_COUNT maxLaneCount; //!< Maximum supported lane count + NV_DP_LINK_RATE curLinkRate; //!< Current link rate + NV_DP_LANE_COUNT curLaneCount; //!< Current lane count + NV_DP_COLOR_FORMAT colorFormat; //!< Current color format + NV_DP_DYNAMIC_RANGE dynamicRange; //!< Dynamic range + NV_DP_COLORIMETRY colorimetry; //!< Ignored in RGB space + NV_DP_BPC bpc; //!< Current bit-per-component + NvU32 isDp : 1; //!< If the monitor is driven by a DisplayPort + NvU32 isInternalDp : 1; //!< If the monitor is driven by an NV Dp transmitter + NvU32 isColorCtrlSupported : 1; //!< If the color format change is supported + NvU32 is6BPCSupported : 1; //!< If 6 bpc is supported + NvU32 is8BPCSupported : 1; //!< If 8 bpc is supported + NvU32 is10BPCSupported : 1; //!< If 10 bpc is supported + NvU32 is12BPCSupported : 1; //!< If 12 bpc is supported + NvU32 is16BPCSupported : 1; //!< If 16 bpc is supported + NvU32 isYCrCb422Supported : 1; //!< If YCrCb422 is supported + NvU32 isYCrCb444Supported : 1; //!< If YCrCb444 is supported + NvU32 isRgb444SupportedOnCurrentMode : 1; //!< If Rgb444 is supported on the current mode + NvU32 isYCbCr444SupportedOnCurrentMode : 1; //!< If YCbCr444 is supported on the current mode + NvU32 isYCbCr422SupportedOnCurrentMode : 1; //!< If YCbCr422 is support on the current mode + NvU32 is6BPCSupportedOnCurrentMode : 1; // if 6 bpc is supported On Current Mode + NvU32 is8BPCSupportedOnCurrentMode : 1; // if 8 bpc is supported On Current Mode + NvU32 is10BPCSupportedOnCurrentMode : 1; // if 10 bpc is supported On Current Mode + NvU32 is12BPCSupportedOnCurrentMode : 1; // if 12 bpc is supported On Current Mode + NvU32 is16BPCSupportedOnCurrentMode : 1; // if 16 bpc is supported On Current Mode + NvU32 isMonxvYCC601Capable : 1; // if xvYCC 601 extended colorimetry is supported + NvU32 isMonxvYCC709Capable : 1; // if xvYCC 
709 extended colorimetry is supported + NvU32 isMonsYCC601Capable : 1; // if sYCC601 extended colorimetry is supported + NvU32 isMonAdobeYCC601Capable : 1; // if AdobeYCC601 extended colorimetry is supported + NvU32 isMonAdobeRGBCapable : 1; // if AdobeRGB extended colorimetry is supported + NvU32 isMonBT2020RGBCapable : 1; // if BT2020 RGB extended colorimetry is supported + NvU32 isMonBT2020YCCCapable : 1; // if BT2020 Y'CbCr extended colorimetry is supported + NvU32 isMonBT2020cYCCCapable : 1; // if BT2020 cYCbCr (constant luminance) extended colorimetry is supported + + NvU32 reserved : 6; //!< reserved + } NV_DISPLAY_PORT_INFO_V1; + + typedef NV_DISPLAY_PORT_INFO_V1 NV_DISPLAY_PORT_INFO; + +//! Macro for constructing the version field of NV_DISPLAY_PORT_INFO. +#define NV_DISPLAY_PORT_INFO_VER1 MAKE_NVAPI_VERSION(NV_DISPLAY_PORT_INFO,1) +#define NV_DISPLAY_PORT_INFO_VER2 MAKE_NVAPI_VERSION(NV_DISPLAY_PORT_INFO,2) +#define NV_DISPLAY_PORT_INFO_VER NV_DISPLAY_PORT_INFO_VER2 + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GetDisplayPortInfo +// +//! \fn NvAPI_GetDisplayPortInfo(__in_opt NvDisplayHandle hNvDisplay, __in NvU32 outputId, __inout NV_DISPLAY_PORT_INFO *pInfo) +//! DESCRIPTION: This function returns the current DisplayPort-related information on the specified device (monitor). +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 165 +//! +//! \param [in] hvDisplay NVIDIA Display selection. It can be NVAPI_DEFAULT_HANDLE or a handle enumerated from NvAPI_EnumNVidiaDisplayHandle(). +//! This parameter is ignored when the outputId is a NvAPI displayId. +//! \param [in] outputId This can either be the connection bit mask or the NvAPI displayId. When the legacy connection bit mask is passed, +//! it should have exactly 1 bit set to indicate a single display. If it's "0" then the default outputId from +//! NvAPI_GetAssociatedDisplayOutputId() will be used. See \ref handles. +//! \param [out] pInfo The DisplayPort information +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred +//! \retval NVAPI_INVALID_ARGUMENT Invalid input parameter. +// +/////////////////////////////////////////////////////////////////////////////// +//! \ingroup dispcontrol +NVAPI_INTERFACE NvAPI_GetDisplayPortInfo(__in_opt NvDisplayHandle hNvDisplay, __in NvU32 outputId, __inout NV_DISPLAY_PORT_INFO *pInfo); + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_SetDisplayPort +// +//! \fn NvAPI_SetDisplayPort(NvDisplayHandle hNvDisplay, NvU32 outputId, NV_DISPLAY_PORT_CONFIG *pCfg) +//! DESCRIPTION: This function sets up DisplayPort-related configurations. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 165 +//! +//! \param [in] hNvDisplay NVIDIA display handle. It can be NVAPI_DEFAULT_HANDLE or a handle enumerated from +//! NvAPI_EnumNVidiaDisplayHandle(). +//! \param [in] outputId This display output ID, when it's "0" it means the default outputId generated from the return of +//! NvAPI_GetAssociatedDisplayOutputId(). See \ref handles. +//! \param [in] pCfg The display port config structure. If pCfg is NULL, it means to use the driver's default value to setup. +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred +//! 
\retval NVAPI_INVALID_ARGUMENT Invalid input parameter +/////////////////////////////////////////////////////////////////////////////// + + +//! \ingroup dispcontrol +//! DisplayPort configuration settings - used in NvAPI_SetDisplayPort(). +typedef struct +{ + NvU32 version; //!< Structure version - 2 is the latest + NV_DP_LINK_RATE linkRate; //!< Link rate + NV_DP_LANE_COUNT laneCount; //!< Lane count + NV_DP_COLOR_FORMAT colorFormat; //!< Color format to set + NV_DP_DYNAMIC_RANGE dynamicRange; //!< Dynamic range + NV_DP_COLORIMETRY colorimetry; //!< Ignored in RGB space + NV_DP_BPC bpc; //!< Bit-per-component + NvU32 isHPD : 1; //!< If the control panel is making this call due to HPD + NvU32 isSetDeferred : 1; //!< Requires an OS modeset to finalize the setup if set + NvU32 isChromaLpfOff : 1; //!< Force the chroma low_pass_filter to be off + NvU32 isDitherOff : 1; //!< Force to turn off dither + NvU32 testLinkTrain : 1; //!< If testing mode, skip validation + NvU32 testColorChange : 1; //!< If testing mode, skip validation + +} NV_DISPLAY_PORT_CONFIG; + +//! \addtogroup dispcontrol +//! @{ +//! Macro for constructing the version field of NV_DISPLAY_PORT_CONFIG +#define NV_DISPLAY_PORT_CONFIG_VER MAKE_NVAPI_VERSION(NV_DISPLAY_PORT_CONFIG,2) +//! Macro for constructing the version field of NV_DISPLAY_PORT_CONFIG +#define NV_DISPLAY_PORT_CONFIG_VER_1 MAKE_NVAPI_VERSION(NV_DISPLAY_PORT_CONFIG,1) +//! Macro for constructing the version field of NV_DISPLAY_PORT_CONFIG +#define NV_DISPLAY_PORT_CONFIG_VER_2 MAKE_NVAPI_VERSION(NV_DISPLAY_PORT_CONFIG,2) +//! @} + + +//! \ingroup dispcontrol +NVAPI_INTERFACE NvAPI_SetDisplayPort(NvDisplayHandle hNvDisplay, NvU32 outputId, NV_DISPLAY_PORT_CONFIG *pCfg); + + + + +//! \ingroup dispcontrol +//! Used in NvAPI_GetHDMISupportInfo(). +typedef struct _NV_HDMI_SUPPORT_INFO_V1 +{ + NvU32 version; //!< Structure version + + NvU32 isGpuHDMICapable : 1; //!< If the GPU can handle HDMI + NvU32 isMonUnderscanCapable : 1; //!< If the monitor supports underscan + NvU32 isMonBasicAudioCapable : 1; //!< If the monitor supports basic audio + NvU32 isMonYCbCr444Capable : 1; //!< If YCbCr 4:4:4 is supported + NvU32 isMonYCbCr422Capable : 1; //!< If YCbCr 4:2:2 is supported + NvU32 isMonxvYCC601Capable : 1; //!< If xvYCC 601 is supported + NvU32 isMonxvYCC709Capable : 1; //!< If xvYCC 709 is supported + NvU32 isMonHDMI : 1; //!< If the monitor is HDMI (with IEEE's HDMI registry ID) + NvU32 reserved : 24; //!< Reserved. 
+ + NvU32 EDID861ExtRev; //!< Revision number of the EDID 861 extension + } NV_HDMI_SUPPORT_INFO_V1; + +typedef struct _NV_HDMI_SUPPORT_INFO_V2 +{ + NvU32 version; //!< Structure version + + NvU32 isGpuHDMICapable : 1; //!< If the GPU can handle HDMI + NvU32 isMonUnderscanCapable : 1; //!< If the monitor supports underscan + NvU32 isMonBasicAudioCapable : 1; //!< If the monitor supports basic audio + NvU32 isMonYCbCr444Capable : 1; //!< If YCbCr 4:4:4 is supported + NvU32 isMonYCbCr422Capable : 1; //!< If YCbCr 4:2:2 is supported + NvU32 isMonxvYCC601Capable : 1; //!< If xvYCC extended colorimetry 601 is supported + NvU32 isMonxvYCC709Capable : 1; //!< If xvYCC extended colorimetry 709 is supported + NvU32 isMonHDMI : 1; //!< If the monitor is HDMI (with IEEE's HDMI registry ID) + NvU32 isMonsYCC601Capable : 1; //!< if sYCC601 extended colorimetry is supported + NvU32 isMonAdobeYCC601Capable : 1; //!< if AdobeYCC601 extended colorimetry is supported + NvU32 isMonAdobeRGBCapable : 1; //!< if AdobeRGB extended colorimetry is supported + NvU32 reserved : 21; //!< Reserved. + + NvU32 EDID861ExtRev; //!< Revision number of the EDID 861 extension + } NV_HDMI_SUPPORT_INFO_V2; + +#define NV_HDMI_SUPPORT_INFO_VER1 MAKE_NVAPI_VERSION(NV_HDMI_SUPPORT_INFO_V1, 1) +#define NV_HDMI_SUPPORT_INFO_VER2 MAKE_NVAPI_VERSION(NV_HDMI_SUPPORT_INFO_V2, 2) + + + +#ifndef NV_HDMI_SUPPORT_INFO_VER + +typedef NV_HDMI_SUPPORT_INFO_V2 NV_HDMI_SUPPORT_INFO; +#define NV_HDMI_SUPPORT_INFO_VER NV_HDMI_SUPPORT_INFO_VER2 + +#endif + + +//! SUPPORTED OS: Windows Vista and higher +//! +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GetHDMISupportInfo +// +//! \fn NvAPI_GetHDMISupportInfo(__in_opt NvDisplayHandle hNvDisplay, __in NvU32 outputId, __inout NV_HDMI_SUPPORT_INFO *pInfo) +//! This API returns the current infoframe data on the specified device(monitor). +//! +//! \since Release: 95 +//! +//! \param [in] hvDisplay NVIDIA Display selection. It can be NVAPI_DEFAULT_HANDLE or a handle enumerated from NvAPI_EnumNVidiaDisplayHandle(). +//! This parameter is ignored when the outputId is a NvAPI displayId. +//! \param [in] outputId This can either be the connection bit mask or the NvAPI displayId. When the legacy connection bit mask is passed, +//! it should have exactly 1 bit set to indicate a single display. If it's "0" then the default outputId from +//! NvAPI_GetAssociatedDisplayOutputId() will be used. See \ref handles. +//! \param [out] pInfo The monitor and GPU's HDMI support info +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred +//! \retval NVAPI_INVALID_ARGUMENT Invalid input parameter. +/////////////////////////////////////////////////////////////////////////////// + + +//! \ingroup dispcontrol +NVAPI_INTERFACE NvAPI_GetHDMISupportInfo(__in_opt NvDisplayHandle hNvDisplay, __in NvU32 outputId, __inout NV_HDMI_SUPPORT_INFO *pInfo); + + +//! \ingroup dispcontrol + +typedef enum +{ + NV_INFOFRAME_CMD_GET_DEFAULT = 0, //!< Returns the fields in the infoframe with values set by the manufacturer - NVIDIA/OEM. + NV_INFOFRAME_CMD_RESET, //!< Sets the fields in the infoframe to auto, and infoframe to the default infoframe for use in a set. + NV_INFOFRAME_CMD_GET, //!< Get the current infoframe state. + NV_INFOFRAME_CMD_SET, //!< Set the current infoframe state (flushed to the monitor), the values are one time and do not persist. 
+ NV_INFOFRAME_CMD_GET_OVERRIDE, //!< Get the override infoframe state, non-override fields will be set to value = AUTO, overridden fields will have the current override values. + NV_INFOFRAME_CMD_SET_OVERRIDE, //!< Set the override infoframe state, non-override fields will be set to value = AUTO, other values indicate override; persist across modeset/reboot + NV_INFOFRAME_CMD_GET_PROPERTY, //!< get properties associated with infoframe (each of the infoframe type will have properties) + NV_INFOFRAME_CMD_SET_PROPERTY, //!< set properties associated with infoframe +} NV_INFOFRAME_CMD; + + +typedef enum +{ + NV_INFOFRAME_PROPERTY_MODE_AUTO = 0, //!< Driver determines whether to send infoframes. + NV_INFOFRAME_PROPERTY_MODE_ENABLE, //!< Driver always sends infoframe. + NV_INFOFRAME_PROPERTY_MODE_DISABLE, //!< Driver never sends infoframe. + NV_INFOFRAME_PROPERTY_MODE_ALLOW_OVERRIDE, //!< Driver only sends infoframe when client requests it via infoframe escape call. +} NV_INFOFRAME_PROPERTY_MODE; + + +//! Returns whether the current monitor is in blacklist or force this monitor to be in blacklist. +typedef enum +{ + NV_INFOFRAME_PROPERTY_BLACKLIST_FALSE = 0, + NV_INFOFRAME_PROPERTY_BLACKLIST_TRUE, +} NV_INFOFRAME_PROPERTY_BLACKLIST; + +typedef struct +{ + NvU32 mode : 4; + NvU32 blackList : 2; + NvU32 reserved : 10; + NvU32 version : 8; + NvU32 length : 8; +} NV_INFOFRAME_PROPERTY; + +//! Byte1 related +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_SCANINFO_NODATA = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_SCANINFO_OVERSCAN, + NV_INFOFRAME_FIELD_VALUE_AVI_SCANINFO_UNDERSCAN, + NV_INFOFRAME_FIELD_VALUE_AVI_SCANINFO_FUTURE, + NV_INFOFRAME_FIELD_VALUE_AVI_SCANINFO_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AVI_SCANINFO; + + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_BARDATA_NOT_PRESENT = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_BARDATA_VERTICAL_PRESENT, + NV_INFOFRAME_FIELD_VALUE_AVI_BARDATA_HORIZONTAL_PRESENT, + NV_INFOFRAME_FIELD_VALUE_AVI_BARDATA_BOTH_PRESENT, + NV_INFOFRAME_FIELD_VALUE_AVI_BARDATA_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AVI_BARDATA; + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_AFI_ABSENT = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_AFI_PRESENT, + NV_INFOFRAME_FIELD_VALUE_AVI_AFI_AUTO = 3 +} NV_INFOFRAME_FIELD_VALUE_AVI_ACTIVEFORMATINFO; + + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_COLORFORMAT_RGB = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_COLORFORMAT_YCbCr422, + NV_INFOFRAME_FIELD_VALUE_AVI_COLORFORMAT_YCbCr444, + NV_INFOFRAME_FIELD_VALUE_AVI_COLORFORMAT_FUTURE, + NV_INFOFRAME_FIELD_VALUE_AVI_COLORFORMAT_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AVI_COLORFORMAT; + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_F17_FALSE = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_F17_TRUE, + NV_INFOFRAME_FIELD_VALUE_AVI_F17_AUTO = 3 +} NV_INFOFRAME_FIELD_VALUE_AVI_F17; + +//! 
Byte2 related +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_NO_AFD = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_RESERVE01, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_RESERVE02, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_RESERVE03, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_LETTERBOX_GT16x9, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_RESERVE05, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_RESERVE06, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_RESERVE07, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_EQUAL_CODEDFRAME = 8, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_CENTER_4x3, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_CENTER_16x9, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_CENTER_14x9, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_RESERVE12, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_4x3_ON_14x9, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_16x9_ON_14x9, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_16x9_ON_4x3, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION_AUTO = 31, +} NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOACTIVEPORTION; + + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOCODEDFRAME_NO_DATA = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOCODEDFRAME_4x3, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOCODEDFRAME_16x9, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOCODEDFRAME_FUTURE, + NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOCODEDFRAME_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AVI_ASPECTRATIOCODEDFRAME; + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_COLORIMETRY_NO_DATA = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_COLORIMETRY_SMPTE_170M, + NV_INFOFRAME_FIELD_VALUE_AVI_COLORIMETRY_ITUR_BT709, + NV_INFOFRAME_FIELD_VALUE_AVI_COLORIMETRY_USE_EXTENDED_COLORIMETRY, + NV_INFOFRAME_FIELD_VALUE_AVI_COLORIMETRY_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AVI_COLORIMETRY; + +//! 
Byte 3 related +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_NONUNIFORMPICTURESCALING_NO_DATA = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_NONUNIFORMPICTURESCALING_HORIZONTAL, + NV_INFOFRAME_FIELD_VALUE_AVI_NONUNIFORMPICTURESCALING_VERTICAL, + NV_INFOFRAME_FIELD_VALUE_AVI_NONUNIFORMPICTURESCALING_BOTH, + NV_INFOFRAME_FIELD_VALUE_AVI_NONUNIFORMPICTURESCALING_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AVI_NONUNIFORMPICTURESCALING; + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_RGBQUANTIZATION_DEFAULT = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_RGBQUANTIZATION_LIMITED_RANGE, + NV_INFOFRAME_FIELD_VALUE_AVI_RGBQUANTIZATION_FULL_RANGE, + NV_INFOFRAME_FIELD_VALUE_AVI_RGBQUANTIZATION_RESERVED, + NV_INFOFRAME_FIELD_VALUE_AVI_RGBQUANTIZATION_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AVI_RGBQUANTIZATION; + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_EXTENDEDCOLORIMETRY_XVYCC601 = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_EXTENDEDCOLORIMETRY_XVYCC709, + NV_INFOFRAME_FIELD_VALUE_AVI_EXTENDEDCOLORIMETRY_SYCC601, + NV_INFOFRAME_FIELD_VALUE_AVI_EXTENDEDCOLORIMETRY_ADOBEYCC601, + NV_INFOFRAME_FIELD_VALUE_AVI_EXTENDEDCOLORIMETRY_ADOBERGB, + NV_INFOFRAME_FIELD_VALUE_AVI_EXTENDEDCOLORIMETRY_RESERVED05, + NV_INFOFRAME_FIELD_VALUE_AVI_EXTENDEDCOLORIMETRY_RESERVED06, + NV_INFOFRAME_FIELD_VALUE_AVI_EXTENDEDCOLORIMETRY_RESERVED07, + NV_INFOFRAME_FIELD_VALUE_AVI_EXTENDEDCOLORIMETRY_AUTO = 15 +} NV_INFOFRAME_FIELD_VALUE_AVI_EXTENDEDCOLORIMETRY; + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_ITC_VIDEO_CONTENT = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_ITC_ITCONTENT, + NV_INFOFRAME_FIELD_VALUE_AVI_ITC_AUTO = 3 +} NV_INFOFRAME_FIELD_VALUE_AVI_ITC; + +//! Byte 4 related +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_NONE = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_X02, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_X03, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_X04, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_X05, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_X06, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_X07, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_X08, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_X09, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_X10, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_RESERVED10, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_RESERVED11, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_RESERVED12, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_RESERVED13, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_RESERVED14, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_RESERVED15, + NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION_AUTO = 31 +} NV_INFOFRAME_FIELD_VALUE_AVI_PIXELREPETITION; + + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_CONTENTTYPE_GRAPHICS = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_CONTENTTYPE_PHOTO, + NV_INFOFRAME_FIELD_VALUE_AVI_CONTENTTYPE_CINEMA, + NV_INFOFRAME_FIELD_VALUE_AVI_CONTENTTYPE_GAME, + NV_INFOFRAME_FIELD_VALUE_AVI_CONTENTTYPE_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AVI_CONTENTTYPE; + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AVI_YCCQUANTIZATION_LIMITED_RANGE = 0, + NV_INFOFRAME_FIELD_VALUE_AVI_YCCQUANTIZATION_FULL_RANGE, + NV_INFOFRAME_FIELD_VALUE_AVI_YCCQUANTIZATION_RESERVED02, + NV_INFOFRAME_FIELD_VALUE_AVI_YCCQUANTIZATION_RESERVED03, + NV_INFOFRAME_FIELD_VALUE_AVI_YCCQUANTIZATION_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AVI_YCCQUANTIZATION; + +//! 
Adding an Auto bit to each field +typedef struct +{ + NvU32 vic : 8; + NvU32 pixelRepeat : 5; + NvU32 colorSpace : 3; + NvU32 colorimetry : 3; + NvU32 extendedColorimetry : 4; + NvU32 rgbQuantizationRange : 3; + NvU32 yccQuantizationRange : 3; + NvU32 itContent : 2; + NvU32 contentTypes : 3; + NvU32 scanInfo : 3; + NvU32 activeFormatInfoPresent : 2; + NvU32 activeFormatAspectRatio : 5; + NvU32 picAspectRatio : 3; + NvU32 nonuniformScaling : 3; + NvU32 barInfo : 3; + NvU32 top_bar : 17; + NvU32 bottom_bar : 17; + NvU32 left_bar : 17; + NvU32 right_bar : 17; + NvU32 Future17 : 2; + NvU32 Future47 : 2; +} NV_INFOFRAME_VIDEO; + +//! Byte 1 related +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELCOUNT_IN_HEADER = 0, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELCOUNT_2, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELCOUNT_3, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELCOUNT_4, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELCOUNT_5, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELCOUNT_6, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELCOUNT_7, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELCOUNT_8, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELCOUNT_AUTO = 15 +} NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELCOUNT; + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_IN_HEADER = 0, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_PCM, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_AC3, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_MPEG1, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_MP3, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_MPEG2, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_AACLC, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_DTS, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_ATRAC, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_DSD, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_EAC3, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_DTSHD, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_MLP, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_DST, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_WMAPRO, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_USE_CODING_EXTENSION_TYPE, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE_AUTO = 31 +} NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGTYPE; + +//! Byte 2 related +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLESIZE_IN_HEADER = 0, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLESIZE_16BITS, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLESIZE_20BITS, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLESIZE_24BITS, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLESIZE_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLESIZE; + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLEFREQUENCY_IN_HEADER = 0, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLEFREQUENCY_32000HZ, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLEFREQUENCY_44100HZ, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLEFREQUENCY_48000HZ, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLEFREQUENCY_88200KHZ, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLEFREQUENCY_96000KHZ, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLEFREQUENCY_176400KHZ, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLEFREQUENCY_192000KHZ, + NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLEFREQUENCY_AUTO = 15 +} NV_INFOFRAME_FIELD_VALUE_AUDIO_SAMPLEFREQUENCY; + + + +//! 
Byte 3 related +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_USE_CODING_TYPE = 0, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_HEAAC, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_HEAACV2, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_MPEGSURROUND, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE04, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE05, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE06, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE07, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE08, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE09, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE10, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE11, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE12, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE13, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE14, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE15, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE16, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE17, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE18, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE19, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE20, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE21, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE22, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE23, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE24, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE25, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE26, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE27, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE28, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE29, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE30, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_RESERVE31, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE_AUTO = 63 +} NV_INFOFRAME_FIELD_VALUE_AUDIO_CODINGEXTENSIONTYPE; + + +//! 
Byte 4 related +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_X_X_X_X_FR_FL =0, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_X_X_X_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_X_X_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_X_X_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_X_RC_X_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_X_RC_X_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_X_RC_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_X_RC_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_RR_RL_X_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_RR_RL_X_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_X_RR_RL_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_RC_RR_RL_X_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_RC_RR_RL_X_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_RC_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_RC_RR_RL_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_RRC_RLC_RR_RL_X_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_RRC_RLC_RR_RL_X_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_RRC_RLC_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_RRC_RLC_RR_RL_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_X_X_X_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_X_X_X_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_X_X_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_X_X_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_X_RC_X_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_X_RC_X_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_X_RC_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_X_RC_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_RR_RL_X_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_RR_RL_X_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRC_FLC_RR_RL_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_FCH_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_X_FCH_RR_RL_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_TC_X_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_TC_X_RR_RL_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRH_FLH_RR_RL_X_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRH_FLH_RR_RL_X_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRW_FLW_RR_RL_X_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRW_FLW_RR_RL_X_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_TC_RC_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_TC_RC_RR_RL_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FCH_RC_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FCH_RC_RR_RL_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_TC_FCH_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_TC_FCH_RR_RL_FC_LFE_FR_FL, + 
NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRH_FLH_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRH_FLH_RR_RL_FC_LFE_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRW_FLW_RR_RL_FC_X_FR_FL, + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_FRW_FLW_RR_RL_FC_LFE_FR_FL = 0X31, + // all other values should default to auto + NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION_AUTO = 0x1FF +} NV_INFOFRAME_FIELD_VALUE_AUDIO_CHANNELALLOCATION; + +//! Byte 5 related +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AUDIO_LFEPLAYBACKLEVEL_NO_DATA = 0, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LFEPLAYBACKLEVEL_0DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LFEPLAYBACKLEVEL_PLUS10DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LFEPLAYBACKLEVEL_RESERVED03, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LFEPLAYBACKLEVEL_AUTO = 7 +} NV_INFOFRAME_FIELD_VALUE_AUDIO_LFEPLAYBACKLEVEL; + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_0DB = 0, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_1DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_2DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_3DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_4DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_5DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_6DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_7DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_8DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_9DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_10DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_11DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_12DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_13DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_14DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_15DB, + NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES_AUTO = 31 +} NV_INFOFRAME_FIELD_VALUE_AUDIO_LEVELSHIFTVALUES; + + +typedef enum +{ + NV_INFOFRAME_FIELD_VALUE_AUDIO_DOWNMIX_PERMITTED = 0, + NV_INFOFRAME_FIELD_VALUE_AUDIO_DOWNMIX_PROHIBITED, + NV_INFOFRAME_FIELD_VALUE_AUDIO_DOWNMIX_AUTO = 3 +} NV_INFOFRAME_FIELD_VALUE_AUDIO_DOWNMIX; + +typedef struct +{ + NvU32 codingType : 5; + NvU32 codingExtensionType : 6; + NvU32 sampleSize : 3; + NvU32 sampleRate : 4; + NvU32 channelCount : 4; + NvU32 speakerPlacement : 9; + NvU32 downmixInhibit : 2; + NvU32 lfePlaybackLevel : 3; + NvU32 levelShift : 5; + NvU32 Future12 : 2; + NvU32 Future2x : 4; + NvU32 Future3x : 4; + NvU32 Future52 : 2; + NvU32 Future6 : 9; + NvU32 Future7 : 9; + NvU32 Future8 : 9; + NvU32 Future9 : 9; + NvU32 Future10 : 9; +} NV_INFOFRAME_AUDIO; + +typedef struct +{ + NvU32 version; //!< version of this structure + NvU16 size; //!< size of this structure + NvU8 cmd; //!< The actions to perform from NV_INFOFRAME_CMD + NvU8 type; //!< type of infoframe + + union + { + NV_INFOFRAME_PROPERTY property; //!< This is NVIDIA-specific and corresponds to the property cmds and associated infoframe. + NV_INFOFRAME_AUDIO audio; + NV_INFOFRAME_VIDEO video; + } infoframe; +} NV_INFOFRAME_DATA; + +//! Macro for constructing the version field of ::NV_INFOFRAME_DATA +#define NV_INFOFRAME_DATA_VER MAKE_NVAPI_VERSION(NV_INFOFRAME_DATA,1) + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_Disp_InfoFrameControl +// +//! DESCRIPTION: This API controls the InfoFrame values. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] displayId Monitor Identifier +//! 
\param [in,out] pInfoframeData Contains data corresponding to InfoFrame +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Disp_InfoFrameControl(__in NvU32 displayId, __inout NV_INFOFRAME_DATA *pInfoframeData); + + + + + + +//! \ingroup dispcontrol +//! @{ +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_Disp_ColorControl +// +//! \fn NvAPI_Disp_ColorControl(NvU32 displayId, NV_COLOR_DATA *pColorData) +//! DESCRIPTION: This API controls the Color values. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] displayId Monitor Identifier +//! \param [in,out] pColorData Contains data corresponding to color information +//! +//! \return RETURN STATUS: +//! ::NVAPI_OK, +//! ::NVAPI_ERROR, +//! ::NVAPI_INVALID_ARGUMENT +// +/////////////////////////////////////////////////////////////////////////////// + +typedef enum +{ + NV_COLOR_CMD_GET = 1, + NV_COLOR_CMD_SET, + NV_COLOR_CMD_IS_SUPPORTED_COLOR, + NV_COLOR_CMD_GET_DEFAULT +} NV_COLOR_CMD; + +//! See Table 14 of CEA-861E. Not all of this is supported by the GPU. +typedef enum +{ + NV_COLOR_FORMAT_RGB = 0, + NV_COLOR_FORMAT_YUV422, + NV_COLOR_FORMAT_YUV444, + NV_COLOR_FORMAT_YUV420, + + NV_COLOR_FORMAT_DEFAULT = 0xFE, + NV_COLOR_FORMAT_AUTO = 0xFF +} NV_COLOR_FORMAT; + + + +typedef enum +{ + NV_COLOR_COLORIMETRY_RGB = 0, + NV_COLOR_COLORIMETRY_YCC601, + NV_COLOR_COLORIMETRY_YCC709, + NV_COLOR_COLORIMETRY_XVYCC601, + NV_COLOR_COLORIMETRY_XVYCC709, + NV_COLOR_COLORIMETRY_SYCC601, + NV_COLOR_COLORIMETRY_ADOBEYCC601, + NV_COLOR_COLORIMETRY_ADOBERGB, + NV_COLOR_COLORIMETRY_BT2020RGB, + NV_COLOR_COLORIMETRY_BT2020YCC, + NV_COLOR_COLORIMETRY_BT2020cYCC, + + NV_COLOR_COLORIMETRY_DEFAULT = 0xFE, + NV_COLOR_COLORIMETRY_AUTO = 0xFF +} NV_COLOR_COLORIMETRY; + +typedef enum _NV_DYNAMIC_RANGE +{ + NV_DYNAMIC_RANGE_VESA = 0x0, + NV_DYNAMIC_RANGE_CEA = 0x1, + + NV_DYNAMIC_RANGE_AUTO = 0xFF +} NV_DYNAMIC_RANGE; + +typedef enum _NV_BPC +{ + NV_BPC_DEFAULT = 0, + NV_BPC_6 = 1, + NV_BPC_8 = 2, + NV_BPC_10 = 3, + NV_BPC_12 = 4, + NV_BPC_16 = 5, +} NV_BPC; + +typedef struct _NV_COLOR_DATA_V1 +{ + NvU32 version; //!< Version of this structure + NvU16 size; //!< Size of this structure + NvU8 cmd; + struct + { + NvU8 colorFormat; //!< One of NV_COLOR_FORMAT enum values. + NvU8 colorimetry; //!< One of NV_COLOR_COLORIMETRY enum values. + } data; +} NV_COLOR_DATA_V1; + +typedef struct _NV_COLOR_DATA_V2 +{ + NvU32 version; //!< Version of this structure + NvU16 size; //!< Size of this structure + NvU8 cmd; + struct + { + NvU8 colorFormat; //!< One of NV_COLOR_FORMAT enum values. + NvU8 colorimetry; //!< One of NV_COLOR_COLORIMETRY enum values. + NvU8 dynamicRange; //!< One of NV_DYNAMIC_RANGE enum values. + } data; +} NV_COLOR_DATA_V2; + +typedef struct _NV_COLOR_DATA_V3 +{ + NvU32 version; //!< Version of this structure + NvU16 size; //!< Size of this structure + NvU8 cmd; + struct + { + NvU8 colorFormat; //!< One of NV_COLOR_FORMAT enum values. + NvU8 colorimetry; //!< One of NV_COLOR_COLORIMETRY enum values. + NvU8 dynamicRange; //!< One of NV_DYNAMIC_RANGE enum values. + NV_BPC bpc; //!< One of NV_BPC enum values. 
+ } data; +} NV_COLOR_DATA_V3; + +typedef NV_COLOR_DATA_V3 NV_COLOR_DATA; + +#define NV_COLOR_DATA_VER1 MAKE_NVAPI_VERSION(NV_COLOR_DATA_V1, 1) +#define NV_COLOR_DATA_VER2 MAKE_NVAPI_VERSION(NV_COLOR_DATA_V2, 2) +#define NV_COLOR_DATA_VER3 MAKE_NVAPI_VERSION(NV_COLOR_DATA_V3, 3) +#define NV_COLOR_DATA_VER NV_COLOR_DATA_VER3 + +NVAPI_INTERFACE NvAPI_Disp_ColorControl(NvU32 displayId, NV_COLOR_DATA *pColorData); + +//! @} + +//! \ingroup dispcontrol +//! Used in NvAPI_DISP_GetTiming(). +typedef struct +{ + NvU32 isInterlaced : 4; //!< To retrieve interlaced/progressive timing + NvU32 reserved0 : 12; + union + { + NvU32 tvFormat : 8; //!< The actual analog HD/SDTV format. Used when the timing type is + //! NV_TIMING_OVERRIDE_ANALOG_TV and width==height==rr==0. + NvU32 ceaId : 8; //!< The EIA/CEA 861B/D predefined short timing descriptor ID. + //! Used when the timing type is NV_TIMING_OVERRIDE_EIA861 + //! and width==height==rr==0. + NvU32 nvPsfId : 8; //!< The NV predefined PsF format Id. + //! Used when the timing type is NV_TIMING_OVERRIDE_NV_PREDEFINED. + }; + NvU32 scaling : 8; //!< Define preferred scaling +}NV_TIMING_FLAG; + +//! \ingroup dispcontrol +//! Used in NvAPI_DISP_GetTiming(). +typedef struct _NV_TIMING_INPUT +{ + NvU32 version; //!< (IN) structure version + + NvU32 width; //!< Visible horizontal size + NvU32 height; //!< Visible vertical size + float rr; //!< Timing refresh rate + + NV_TIMING_FLAG flag; //!< Flag containing additional info for timing calculation. + + NV_TIMING_OVERRIDE type; //!< Timing type(formula) to use for calculating the timing +}NV_TIMING_INPUT; + +#define NV_TIMING_INPUT_VER MAKE_NVAPI_VERSION(NV_TIMING_INPUT,1) + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_DISP_GetTiming +// +//! DESCRIPTION: This function calculates the timing from the visible width/height/refresh-rate and timing type info. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 313 +//! +//! +//! \param [in] displayId Display ID of the display. +//! \param [in] timingInput Inputs used for calculating the timing. +//! \param [out] pTiming Pointer to the NV_TIMING structure. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DISP_GetTiming( __in NvU32 displayId,__in NV_TIMING_INPUT *timingInput, __out NV_TIMING *pTiming); + + + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_DISP_GetMonitorCapabilities +// +//! \fn NvAPI_DISP_GetMonitorCapabilities(NvU32 displayId, NV_MONITOR_CAPABILITIES *pMonitorCapabilities) +//! DESCRIPTION: This API returns the Monitor capabilities +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] displayId Monitor Identifier +//! \param [out] pMonitorCapabilities The monitor support info +//! +//! \return ::NVAPI_OK, +//! ::NVAPI_ERROR, +//! ::NVAPI_INVALID_ARGUMENT +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup dispcontrol +//! @{ + + +//! 
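+//! An illustrative capability query (an editor-added sketch, not part of the original NVAPI header) for
+//! NvAPI_DISP_GetMonitorCapabilities(), whose parameter structure and declaration follow below. It assumes
+//! NvAPI has been initialized and that displayId is already known; filling infoType before the call is
+//! inferred from the structure layout rather than stated explicitly in this header.
+//! \code
+//! NV_MONITOR_CAPABILITIES caps = {0};
+//! caps.version  = NV_MONITOR_CAPABILITIES_VER;
+//! caps.infoType = NV_MONITOR_CAPS_TYPE_HDMI_VSDB;
+//! if (NvAPI_DISP_GetMonitorCapabilities(displayId, &caps) == NVAPI_OK && caps.bIsValidInfo)
+//! {
+//!     // e.g. caps.data.vsdb.supportDeepColor30bits reports whether 30-bit deep color is advertised
+//! }
+//! \endcode
+//!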
HDMI-related and extended CAPs +typedef enum +{ + // hdmi related caps + NV_MONITOR_CAPS_TYPE_HDMI_VSDB = 0x1000, + NV_MONITOR_CAPS_TYPE_HDMI_VCDB = 0x1001, +} NV_MONITOR_CAPS_TYPE; + + + +typedef struct _NV_MONITOR_CAPS_VCDB +{ + NvU8 quantizationRangeYcc : 1; + NvU8 quantizationRangeRgb : 1; + NvU8 scanInfoPreferredVideoFormat : 2; + NvU8 scanInfoITVideoFormats : 2; + NvU8 scanInfoCEVideoFormats : 2; +} NV_MONITOR_CAPS_VCDB; + + +//! See NvAPI_DISP_GetMonitorCapabilities(). +typedef struct _NV_MONITOR_CAPS_VSDB +{ + // byte 1 + NvU8 sourcePhysicalAddressB : 4; //!< Byte 1 + NvU8 sourcePhysicalAddressA : 4; //!< Byte 1 + // byte 2 + NvU8 sourcePhysicalAddressD : 4; //!< Byte 2 + NvU8 sourcePhysicalAddressC : 4; //!< Byte 2 + // byte 3 + NvU8 supportDualDviOperation : 1; //!< Byte 3 + NvU8 reserved6 : 2; //!< Byte 3 + NvU8 supportDeepColorYCbCr444 : 1; //!< Byte 3 + NvU8 supportDeepColor30bits : 1; //!< Byte 3 + NvU8 supportDeepColor36bits : 1; //!< Byte 3 + NvU8 supportDeepColor48bits : 1; //!< Byte 3 + NvU8 supportAI : 1; //!< Byte 3 + // byte 4 + NvU8 maxTmdsClock; //!< Bye 4 + // byte 5 + NvU8 cnc0SupportGraphicsTextContent : 1; //!< Byte 5 + NvU8 cnc1SupportPhotoContent : 1; //!< Byte 5 + NvU8 cnc2SupportCinemaContent : 1; //!< Byte 5 + NvU8 cnc3SupportGameContent : 1; //!< Byte 5 + NvU8 reserved8 : 1; //!< Byte 5 + NvU8 hasVicEntries : 1; //!< Byte 5 + NvU8 hasInterlacedLatencyField : 1; //!< Byte 5 + NvU8 hasLatencyField : 1; //!< Byte 5 + // byte 6 + NvU8 videoLatency; //!< Byte 6 + // byte 7 + NvU8 audioLatency; //!< Byte 7 + // byte 8 + NvU8 interlacedVideoLatency; //!< Byte 8 + // byte 9 + NvU8 interlacedAudioLatency; //!< Byte 9 + // byte 10 + NvU8 reserved13 : 7; //!< Byte 10 + NvU8 has3dEntries : 1; //!< Byte 10 + // byte 11 + NvU8 hdmi3dLength : 5; //!< Byte 11 + NvU8 hdmiVicLength : 3; //!< Byte 11 + // Remaining bytes + NvU8 hdmi_vic[7]; //!< Keeping maximum length for 3 bits + NvU8 hdmi_3d[31]; //!< Keeping maximum length for 5 bits +} NV_MONITOR_CAPS_VSDB; + + +//! See NvAPI_DISP_GetMonitorCapabilities(). +typedef struct _NV_MONITOR_CAPABILITIES_V1 +{ + NvU32 version; + NvU16 size; + NvU32 infoType; + NvU32 connectorType; //!< Out: VGA, TV, DVI, HDMI, DP + NvU8 bIsValidInfo : 1; //!< Boolean : Returns invalid if requested info is not present such as VCDB not present + union { + NV_MONITOR_CAPS_VSDB vsdb; + NV_MONITOR_CAPS_VCDB vcdb; + } data; +} NV_MONITOR_CAPABILITIES_V1; + +typedef NV_MONITOR_CAPABILITIES_V1 NV_MONITOR_CAPABILITIES; + +//! Macro for constructing the version field of ::NV_MONITOR_CAPABILITIES_V1 +#define NV_MONITOR_CAPABILITIES_VER1 MAKE_NVAPI_VERSION(NV_MONITOR_CAPABILITIES_V1,1) +#define NV_MONITOR_CAPABILITIES_VER NV_MONITOR_CAPABILITIES_VER1 + +//! @} + +//! SUPPORTED OS: Windows Vista and higher +//! +//! \ingroup dispcontrol +NVAPI_INTERFACE NvAPI_DISP_GetMonitorCapabilities(__in NvU32 displayId, __inout NV_MONITOR_CAPABILITIES *pMonitorCapabilities); + +//! \ingroup dispcontrol +typedef struct _NV_MONITOR_COLOR_DATA +{ + NvU32 version; +// We are only supporting DP monitors for now. We need to extend this to HDMI panels as well + NV_DP_COLOR_FORMAT colorFormat; //!< One of the supported color formats + NV_DP_BPC backendBitDepths; //!< One of the supported bit depths +} NV_MONITOR_COLOR_CAPS_V1; + +typedef NV_MONITOR_COLOR_CAPS_V1 NV_MONITOR_COLOR_CAPS; + +//! 
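+//! An illustrative sketch (editor-added, not part of the original NVAPI header) of the two-call pattern
+//! described in the NvAPI_DISP_GetMonitorColorCapabilities() documentation below: query the element count
+//! first, allocate, then query the data. malloc/free, the displayId variable, and setting each element's
+//! version field are assumptions; NV_MONITOR_COLOR_CAPS_VER is defined just below this comment.
+//! \code
+//! NvU32 count = 0;
+//! if (NvAPI_DISP_GetMonitorColorCapabilities(displayId, NULL, &count) == NVAPI_OK && count > 0)
+//! {
+//!     NV_MONITOR_COLOR_CAPS *pCaps = (NV_MONITOR_COLOR_CAPS *)malloc(count * sizeof(NV_MONITOR_COLOR_CAPS));
+//!     NvU32 i;
+//!     for (i = 0; i < count; i++)
+//!         pCaps[i].version = NV_MONITOR_COLOR_CAPS_VER;
+//!     if (NvAPI_DISP_GetMonitorColorCapabilities(displayId, pCaps, &count) == NVAPI_OK)
+//!     {
+//!         // pCaps[0..count-1] now describe the supported colorFormat / backendBitDepths pairs
+//!     }
+//!     free(pCaps);
+//! }
+//! \endcode
+//!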
\ingroup dispcontrol +#define NV_MONITOR_COLOR_CAPS_VER1 MAKE_NVAPI_VERSION(NV_MONITOR_COLOR_CAPS_V1,1) +#define NV_MONITOR_COLOR_CAPS_VER NV_MONITOR_COLOR_CAPS_VER1 + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_DISP_GetMonitorColorCapabilities +// +//! DESCRIPTION: This API returns all the color formats and bit depth values supported by a given DP monitor. +//! +//! USAGE: Sequence of calls which caller should make to get the information. +//! 1. First call NvAPI_DISP_GetMonitorColorCapabilities() with pMonitorColorCapabilities as NULL to get the count. +//! 2. Allocate memory for color caps(NV_MONITOR_COLOR_CAPS) array. +//! 3. Call NvAPI_DISP_GetMonitorColorCapabilities() again with the pointer to the memory allocated to get all the +//! color capabilities. +//! +//! Note : +//! 1. pColorCapsCount should never be NULL, else the API will fail with NVAPI_INVALID_ARGUMENT. +//! 2. *pColorCapsCount returned from the API will always be the actual count in any/every call. +//! 3. Memory size to be allocated should be (*pColorCapsCount * sizeof(NV_MONITOR_COLOR_CAPS)). +//! 4. If the memory allocated is less than what is required to return all the timings, this API will return the +//! amount of information which can fit in user provided buffer and API will return NVAPI_INSUFFICIENT_BUFFER. +//! 5. If the caller specifies a greater value for *pColorCapsCount in second call to NvAPI_DISP_GetMonitorColorCapabilities() +//! than what was returned from first call, the API will return only the actual number of elements in the color +//! capabilities array and the extra buffer will remain unused. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] displayId Monitor Identifier +//! \param [in, out] pMonitorColorCapabilities The monitor color capabilities information +//! \param [in, out] pColorCapsCount - During input, the number of elements allocated for the pMonitorColorCapabilities pointer +//! - During output, the actual number of color data elements the monitor supports +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! +//! \retval NVAPI_INSUFFICIENT_BUFFER The input buffer size is not sufficient to hold the total contents. In this case +//! *pColorCapsCount will hold the required amount of elements. +//! \retval NVAPI_INVALID_DISPLAY_ID The input monitor is either not connected or is not a DP panel. +//! +//! \ingroup dispcontrol +//! +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DISP_GetMonitorColorCapabilities(__in NvU32 displayId, __inout_ecount_part_opt(*pColorCapsCount, *pColorCapsCount) NV_MONITOR_COLOR_CAPS *pMonitorColorCapabilities, __inout NvU32 *pColorCapsCount); + +//! \ingroup dispcontrol +//! Used in NvAPI_DISP_EnumCustomDisplay() and NvAPI_DISP_TryCustomDisplay(). +typedef struct +{ + NvU32 version; + + // the source mode information + NvU32 width; //!< Source surface(source mode) width + NvU32 height; //!< Source surface(source mode) height + NvU32 depth; //!< Source surface color depth."0" means all 8/16/32bpp + NV_FORMAT colorFormat; //!< Color format (optional) + + NV_VIEWPORTF srcPartition; //!< For multimon support, should be set to (0,0,1.0,1.0) for now. 
+
+ float xRatio; //!< Horizontal scaling ratio
+ float yRatio; //!< Vertical scaling ratio
+
+ NV_TIMING timing; //!< Timing used to program TMDS/DAC/LVDS/HDMI/TVEncoder, etc.
+ NvU32 hwModeSetOnly : 1; //!< If set, it means a hardware modeset without OS update
+
+}NV_CUSTOM_DISPLAY;
+
+//! \ingroup dispcontrol
+//! Used in NV_CUSTOM_DISPLAY.
+#define NV_CUSTOM_DISPLAY_VER MAKE_NVAPI_VERSION(NV_CUSTOM_DISPLAY,1)
+
+///////////////////////////////////////////////////////////////////////////////
+// FUNCTION NAME: NvAPI_DISP_EnumCustomDisplay
+//
+//! DESCRIPTION: This API enumerates the custom timing specified by the enum index.
+//! The client should keep enumerating until it returns NVAPI_END_ENUMERATION.
+//!
+//! SUPPORTED OS: Windows XP and higher
+//!
+//!
+//! \since Release: 313
+//!
+//! \param [in] displayId Display ID of the display.
+//! \param [in] index Enum index
+//! \param [inout] pCustDisp Pointer to the NV_CUSTOM_DISPLAY structure
+//!
+//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with
+//! specific meaning for this API, they are listed below.
+//! \retval NVAPI_INVALID_DISPLAY_ID: Custom Timing is not supported on the Display, whose display id is passed
+//!
+//! \ingroup dispcontrol
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_DISP_EnumCustomDisplay( __in NvU32 displayId, __in NvU32 index, __inout NV_CUSTOM_DISPLAY *pCustDisp);
+
+///////////////////////////////////////////////////////////////////////////////
+// FUNCTION NAME: NvAPI_DISP_TryCustomDisplay
+//
+//! DESCRIPTION: This API is used to set up a custom display without saving the configuration on multiple displays.
+//!
+//! \note
+//! All the members of srcPartition, present in NV_CUSTOM_DISPLAY structure, should have their range in (0.0,1.0).
+//! In clone mode the timings can be applied to both the target monitors but only one target at a time. \n
+//! For the secondary target the applied timings work under the following conditions:
+//! - If the secondary monitor EDID supports the selected timing, OR
+//! - If the selected custom timings can be scaled by the secondary monitor for the selected source resolution on the primary, OR
+//! - If the selected custom timings match the existing source resolution on the primary.
+//! Setting up a custom display on non-active but connected monitors is supported only for Win7 and above.
+//!
+//! SUPPORTED OS: Windows XP, Windows 7 and higher
+//!
+//!
+//! \since Release: 313
+//!
+//!
+//! \param [in] pDisplayIds Array of the target Display IDs - See \ref handles.
+//! \param [in] count Total number of the incoming Display IDs and corresponding NV_CUSTOM_DISPLAY structure. This is for the multi-head support.
+//! \param [in] pCustDisp Pointer to the NV_CUSTOM_DISPLAY structure array.
+//!
+//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with
+//! specific meaning for this API, they are listed below.
+//! \retval NVAPI_INVALID_DISPLAY_ID: Custom Timing is not supported on the Display, whose display id is passed
+//!
+//!
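+//! A minimal trial/keep/revert sketch (an editor-added illustration, not part of the original NVAPI header)
+//! for a single display. It assumes NvAPI has been initialized, that displayId and an NV_TIMING value named
+//! timing (for example from NvAPI_DISP_GetTiming()) have been prepared by the caller, and that userAccepted
+//! is an application-level confirmation flag; the NV_VIEWPORTF member names used for srcPartition are not
+//! shown in this excerpt and are assumed.
+//! \code
+//! NV_CUSTOM_DISPLAY cd = {0};
+//! cd.version = NV_CUSTOM_DISPLAY_VER;
+//! cd.width   = 1920;
+//! cd.height  = 1080;
+//! cd.depth   = 32;
+//! cd.xRatio  = 1.0f;
+//! cd.yRatio  = 1.0f;
+//! cd.srcPartition.x = 0.0f; cd.srcPartition.y = 0.0f;  // full source surface, as recommended above
+//! cd.srcPartition.w = 1.0f; cd.srcPartition.h = 1.0f;
+//! cd.timing  = timing;                                 // previously calculated timing
+//!
+//! NvU32 ids[1] = { displayId };
+//! if (NvAPI_DISP_TryCustomDisplay(ids, 1, &cd) == NVAPI_OK)
+//! {
+//!     if (userAccepted)
+//!         NvAPI_DISP_SaveCustomDisplay(ids, 1, 0, 1);   // persist, bound to this monitor's EDID
+//!     else
+//!         NvAPI_DISP_RevertCustomDisplayTrial(ids, 1);  // restore the previous configuration
+//! }
+//! \endcode
+//! NvAPI_DISP_SaveCustomDisplay() and NvAPI_DISP_RevertCustomDisplayTrial() are declared further below.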
+//! \ingroup dispcontrol
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_DISP_TryCustomDisplay( __in_ecount(count) NvU32 *pDisplayIds, __in NvU32 count, __in_ecount(count) NV_CUSTOM_DISPLAY *pCustDisp);
+
+///////////////////////////////////////////////////////////////////////////////
+// FUNCTION NAME: NvAPI_DISP_DeleteCustomDisplay
+//
+//! DESCRIPTION: This function deletes the custom display configuration from the registry for all the displays whose display IDs are passed.
+//!
+//! SUPPORTED OS: Windows XP and higher
+//!
+//!
+//! \since Release: 313
+//!
+//!
+//! \param [in] pDisplayIds Array of Display IDs from which the custom display configuration is to be deleted.
+//! \param [in] count Total number of the incoming Display IDs. This is for the multi-head support.
+//! \param [in] pCustDisp Pointer to the NV_CUSTOM_DISPLAY structure
+//!
+//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with
+//! specific meaning for this API, they are listed below.
+//! \retval NVAPI_INVALID_DISPLAY_ID: Custom Timing is not supported on the Display, whose display id is passed
+//!
+//! \ingroup dispcontrol
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_DISP_DeleteCustomDisplay( __in_ecount(count) NvU32 *pDisplayIds, __in NvU32 count, __in NV_CUSTOM_DISPLAY *pCustDisp);
+
+///////////////////////////////////////////////////////////////////////////////
+// FUNCTION NAME: NvAPI_DISP_SaveCustomDisplay
+//
+//! DESCRIPTION: This function saves the current hardware display configuration on the specified Display IDs as a custom display configuration.
+//! This function should be called right after NvAPI_DISP_TryCustomDisplay() to save the custom display from the current
+//! hardware context. This function will not do anything if the custom display configuration is not tested on the hardware.
+//!
+//! SUPPORTED OS: Windows XP and higher
+//!
+//!
+//! \since Release: 313
+//!
+//!
+//! \param [in] pDisplayIds Array of Display IDs on which the custom display configuration is to be saved.
+//! \param [in] count Total number of the incoming Display IDs. This is for the multi-head support.
+//! \param [in] isThisOutputIdOnly If set, the saved custom display will only be applied on the monitor with the same outputId (see \ref handles).
+//! \param [in] isThisMonitorIdOnly If set, the saved custom display will only be applied on the monitor with the same EDID ID or
+//! the same TV connector in case of analog TV.
+//!
+//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with
+//! specific meaning for this API, they are listed below.
+//! \retval NVAPI_INVALID_DISPLAY_ID: Custom Timing is not supported on the Display, whose display id is passed
+//!
+//! \ingroup dispcontrol
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_DISP_SaveCustomDisplay( __in_ecount(count) NvU32 *pDisplayIds, __in NvU32 count, __in NvU32 isThisOutputIdOnly, __in NvU32 isThisMonitorIdOnly);
+
+///////////////////////////////////////////////////////////////////////////////
+// FUNCTION NAME: NvAPI_DISP_RevertCustomDisplayTrial
+//
+//! DESCRIPTION: This API is used to restore the display configuration that was changed by calling NvAPI_DISP_TryCustomDisplay(). This function
+//!
must be called only after a custom display configuration is tested on the hardware, using NvAPI_DISP_TryCustomDisplay(), +//! otherwise no action is taken. On Vista, NvAPI_DISP_RevertCustomDisplayTrial should be called with an active display that +//! was affected during the NvAPI_DISP_TryCustomDisplay() call, per GPU. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 313 +//! +//! +//! \param [in] pDisplayIds Pointer to display Id, of an active display. +//! \param [in] count Total number of incoming Display IDs. For future use only. Currently it is expected to be passed as 1. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DISP_RevertCustomDisplayTrial( __in_ecount(count) NvU32* pDisplayIds, __in NvU32 count); + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GetView +// +//! This API lets caller retrieve the target display arrangement for selected source display handle. +//! \note Display PATH with this API is limited to single GPU. DUALVIEW across GPUs will be returned as STANDARD VIEW. +//! Use NvAPI_SYS_GetDisplayTopologies() to query views across GPUs. +//! +//! \deprecated Do not use this function - it is deprecated in release 290. Instead, use NvAPI_DISP_GetDisplayConfig. +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 85 +//! +//! \param [in] hNvDisplay NVIDIA Display selection. It can be #NVAPI_DEFAULT_HANDLE or a handle enumerated from +//! NvAPI_EnumNVidiaDisplayHandle(). +//! \param [out] pTargets User allocated storage to retrieve an array of NV_VIEW_TARGET_INFO. Can be NULL to retrieve +//! the targetCount. +//! \param [in,out] targetMaskCount Count of target device mask specified in pTargetMask. +//! \param [out] targetView Target view selected from NV_TARGET_VIEW_MODE. +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred +//! \retval NVAPI_INVALID_ARGUMENT Invalid input parameter. +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 290. Instead, use NvAPI_DISP_GetDisplayConfig.") +NVAPI_INTERFACE NvAPI_GetView(NvDisplayHandle hNvDisplay, NV_VIEW_TARGET_INFO *pTargets, NvU32 *pTargetMaskCount, NV_TARGET_VIEW_MODE *pTargetView); + + + + + + + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GetViewEx +// +//! DESCRIPTION: This API lets caller retrieve the target display arrangement for selected source display handle. +//! \note Display PATH with this API is limited to single GPU. DUALVIEW across GPUs will be returned as STANDARD VIEW. +//! Use NvAPI_SYS_GetDisplayTopologies() to query views across GPUs. +//! +//! \deprecated Do not use this function - it is deprecated in release 290. Instead, use NvAPI_DISP_GetDisplayConfig. +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 165 +//! +//! \param [in] hNvDisplay NVIDIA Display selection. #NVAPI_DEFAULT_HANDLE is not allowed, it has to be a handle enumerated with +//! NvAPI_EnumNVidiaDisplayHandle(). +//! \param [in,out] pPathInfo Count field should be set to NVAPI_MAX_DISPLAY_PATH. 
Can be NULL to retrieve just the pathCount. +//! \param [in,out] pPathCount Number of elements in array pPathInfo->path. +//! \param [out] pTargetViewMode Display view selected from NV_TARGET_VIEW_MODE. +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_API_NOT_INTIALIZED NVAPI not initialized +//! \retval NVAPI_ERROR Miscellaneous error occurred +//! \retval NVAPI_INVALID_ARGUMENT Invalid input parameter. +//! \retval NVAPI_EXPECTED_DISPLAY_HANDLE hNvDisplay is not a valid display handle. +//! +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +__nvapi_deprecated_function("Do not use this function - it is deprecated in release 290. Instead, use NvAPI_DISP_GetDisplayConfig.") +NVAPI_INTERFACE NvAPI_GetViewEx(NvDisplayHandle hNvDisplay, NV_DISPLAY_PATH_INFO *pPathInfo, NvU32 *pPathCount, NV_TARGET_VIEW_MODE *pTargetViewMode); + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_GetSupportedViews +// +//! This API lets caller enumerate all the supported NVIDIA display views - nView and Dualview modes. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 85 +//! +//! \param [in] hNvDisplay NVIDIA Display selection. It can be #NVAPI_DEFAULT_HANDLE or a handle enumerated from +//! NvAPI_EnumNVidiaDisplayHandle(). +//! \param [out] pTargetViews Array of supported views. Can be NULL to retrieve the pViewCount first. +//! \param [in,out] pViewCount Count of supported views. +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Miscellaneous error occurred +//! \retval NVAPI_INVALID_ARGUMENT Invalid input parameter. +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetSupportedViews(NvDisplayHandle hNvDisplay, NV_TARGET_VIEW_MODE *pTargetViews, NvU32 *pViewCount); + + +//! SUPPORTED OS: Windows XP and higher +//! +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DISP_GetDisplayIdByDisplayName +// +//! DESCRIPTION: This API retrieves the Display Id of a given display by +//! display name. The display must be active to retrieve the +//! displayId. In the case of clone mode or Surround gaming, +//! the primary or top-left display will be returned. +//! +//! \param [in] displayName Name of display (Eg: "\\DISPLAY1" to +//! retrieve the displayId for. +//! \param [out] displayId Display ID of the requested display. +//! +//! retval ::NVAPI_OK: Capabilties have been returned. +//! retval ::NVAPI_INVALID_ARGUMENT: One or more args passed in are invalid. +//! retval ::NVAPI_API_NOT_INTIALIZED: The NvAPI API needs to be initialized first +//! retval ::NVAPI_NO_IMPLEMENTATION: This entrypoint not available +//! retval ::NVAPI_ERROR: Miscellaneous error occurred +//! +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DISP_GetDisplayIdByDisplayName(const char *displayName, NvU32* displayId); + + + + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_DISP_GetDisplayConfig +// +//! DESCRIPTION: This API lets caller retrieve the current global display +//! configuration. +//! USAGE: The caller might have to call this three times to fetch all the required configuration details as follows: +//! 
First Pass: Caller should Call NvAPI_DISP_GetDisplayConfig() with pathInfo set to NULL to fetch pathInfoCount. +//! Second Pass: Allocate memory for pathInfo with respect to the number of pathInfoCount(from First Pass) to fetch +//! targetInfoCount. If sourceModeInfo is needed allocate memory or it can be initialized to NULL. +//! Third Pass(Optional, only required if target information is required): Allocate memory for targetInfo with respect +//! to number of targetInfoCount(from Second Pass). +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in,out] pathInfoCount Number of elements in pathInfo array, returns number of valid topologies, this cannot be null. +//! \param [in,out] pathInfo Array of path information +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. If there are return error codes with +//! specific meaning for this API, they are listed below. +//! +//! \retval NVAPI_INVALID_ARGUMENT - Invalid input parameter. Following can be the reason for this return value: +//! -# pathInfoCount is NULL. +//! -# *pathInfoCount is 0 and pathInfo is not NULL. +//! -# *pathInfoCount is not 0 and pathInfo is NULL. +//! \retval NVAPI_DEVICE_BUSY - ModeSet has not yet completed. Please wait and call it again. +//! +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DISP_GetDisplayConfig(__inout NvU32 *pathInfoCount, __out_ecount_full_opt(*pathInfoCount) NV_DISPLAYCONFIG_PATH_INFO *pathInfo); + + + + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_DISP_SetDisplayConfig +// +// +//! DESCRIPTION: This API lets caller apply a global display configuration +//! across multiple GPUs. +//! +//! If all sourceIds are zero, then NvAPI will pick up sourceId's based on the following criteria : +//! - If user provides sourceModeInfo then we are trying to assign 0th sourceId always to GDIPrimary. +//! This is needed since active windows always moves along with 0th sourceId. +//! - For rest of the paths, we are incrementally assigning the sourceId per adapter basis. +//! - If user doesn't provide sourceModeInfo then NVAPI just picks up some default sourceId's in incremental order. +//! Note : NVAPI will not intelligently choose the sourceIDs for any configs that does not need a modeset. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] pathInfoCount Number of supplied elements in pathInfo +//! \param [in] pathInfo Array of path information +//! \param [in] flags Flags for applying settings +//! +//! \retval ::NVAPI_OK - completed request +//! \retval ::NVAPI_API_NOT_INTIALIZED - NVAPI not initialized +//! \retval ::NVAPI_ERROR - miscellaneous error occurred +//! \retval ::NVAPI_INVALID_ARGUMENT - Invalid input parameter. +//! +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DISP_SetDisplayConfig(__in NvU32 pathInfoCount, __in_ecount(pathInfoCount) NV_DISPLAYCONFIG_PATH_INFO* pathInfo, __in NvU32 flags); + + + + + +//////////////////////////////////////////////////////////////////////////////////////// +// +// MOSAIC allows a multi display target output scanout on a single source. 
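+
+// Illustrative sketch for the display configuration entry points above (NvAPI_DISP_GetDisplayConfig /
+// NvAPI_DISP_SetDisplayConfig), not part of the original header. It assumes the NV_DISPLAYCONFIG_PATH_INFO
+// version macro defined earlier in this file; error handling, the optional third pass and <stdlib.h> are omitted:
+//
+//     NvU32 pathCount = 0;
+//     if (NvAPI_DISP_GetDisplayConfig(&pathCount, NULL) == NVAPI_OK && pathCount != 0)   // first pass: count
+//     {
+//         NV_DISPLAYCONFIG_PATH_INFO* paths =
+//             (NV_DISPLAYCONFIG_PATH_INFO*)calloc(pathCount, sizeof(NV_DISPLAYCONFIG_PATH_INFO));
+//         for (NvU32 i = 0; i < pathCount; i++)
+//             paths[i].version = NV_DISPLAYCONFIG_PATH_INFO_VER;
+//         NvAPI_DISP_GetDisplayConfig(&pathCount, paths);                                // second pass: paths
+//         // Optional third pass: allocate paths[i].targetInfo from paths[i].targetInfoCount and call again,
+//         // then apply the (possibly edited) configuration with NvAPI_DISP_SetDisplayConfig(pathCount, paths, 0);
+//         free(paths);
+//     }
+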
+// +// SAMPLE of MOSAIC 1x4 topo with 8 pixel horizontal overlap +// +//+-------------------------++-------------------------++-------------------------++-------------------------+ +//| || || || | +//| || || || | +//| || || || | +//| DVI1 || DVI2 || DVI3 || DVI4 | +//| || || || | +//| || || || | +//| || || || | +//| || || || | +//+-------------------------++-------------------------++-------------------------++-------------------------+ + + +//! \addtogroup mosaicapi +//! @{ + +#define NVAPI_MAX_MOSAIC_DISPLAY_ROWS 8 +#define NVAPI_MAX_MOSAIC_DISPLAY_COLUMNS 8 +// +// These bits are used to describe the validity of a topo. +// +#define NV_MOSAIC_TOPO_VALIDITY_VALID 0x00000000 //!< The topology is valid +#define NV_MOSAIC_TOPO_VALIDITY_MISSING_GPU 0x00000001 //!< Not enough SLI GPUs were found to fill the entire + //! topology. hPhysicalGPU will be 0 for these. +#define NV_MOSAIC_TOPO_VALIDITY_MISSING_DISPLAY 0x00000002 //!< Not enough displays were found to fill the entire + //! topology. displayOutputId will be 0 for these. +#define NV_MOSAIC_TOPO_VALIDITY_MIXED_DISPLAY_TYPES 0x00000004 //!< The topoogy is only possible with displays of the same + //! NV_GPU_OUTPUT_TYPE. Check displayOutputIds to make + //! sure they are all CRTs, or all DFPs. + + +// +//! This structure defines the topology details. +typedef struct +{ + NvU32 version; //!< Version of this structure + NvLogicalGpuHandle hLogicalGPU; //!< Logical GPU for this topology + NvU32 validityMask; //!< 0 means topology is valid with the current hardware. + //! If not 0, inspect bits against NV_MOSAIC_TOPO_VALIDITY_*. + NvU32 rowCount; //!< Number of displays in a row + NvU32 colCount; //!< Number of displays in a column + + struct + { + NvPhysicalGpuHandle hPhysicalGPU; //!< Physical GPU to be used in the topology (0 if GPU missing) + NvU32 displayOutputId; //!< Connected display target (0 if no display connected) + NvS32 overlapX; //!< Pixels of overlap on left of target: (+overlap, -gap) + NvS32 overlapY; //!< Pixels of overlap on top of target: (+overlap, -gap) + + } gpuLayout[NVAPI_MAX_MOSAIC_DISPLAY_ROWS][NVAPI_MAX_MOSAIC_DISPLAY_COLUMNS]; + +} NV_MOSAIC_TOPO_DETAILS; + +//! Macro for constructing te vesion field of NV_MOSAIC_TOPO_DETAILS +#define NVAPI_MOSAIC_TOPO_DETAILS_VER MAKE_NVAPI_VERSION(NV_MOSAIC_TOPO_DETAILS,1) + + +// +//! These values refer to the different types of Mosaic topologies that are possible. When +//! getting the supported Mosaic topologies, you can specify one of these types to narrow down +//! the returned list to only those that match the given type. +typedef enum +{ + NV_MOSAIC_TOPO_TYPE_ALL, //!< All mosaic topologies + NV_MOSAIC_TOPO_TYPE_BASIC, //!< Basic Mosaic topologies + NV_MOSAIC_TOPO_TYPE_PASSIVE_STEREO, //!< Passive Stereo topologies + NV_MOSAIC_TOPO_TYPE_SCALED_CLONE, //!< Not supported at this time + NV_MOSAIC_TOPO_TYPE_PASSIVE_STEREO_SCALED_CLONE, //!< Not supported at this time + NV_MOSAIC_TOPO_TYPE_MAX, //!< Always leave this at end of the enum +} NV_MOSAIC_TOPO_TYPE; + + +// +//! This is a complete list of supported Mosaic topologies. +//! +//! Using a "Basic" topology combines multiple monitors to create a single desktop. +//! +//! Using a "Passive" topology combines multiples monitors to create a passive stereo desktop. +//! In passive stereo, two identical topologies combine - one topology is used for the right eye and the other identical //! topology (targeting different displays) is used for the left eye. \n +//! 
NOTE: common\inc\nvEscDef.h shadows a couple PASSIVE_STEREO enums. If this +//! enum list changes and effects the value of NV_MOSAIC_TOPO_BEGIN_PASSIVE_STEREO +//! please update the corresponding value in nvEscDef.h +typedef enum +{ + NV_MOSAIC_TOPO_NONE, + + // 'BASIC' topos start here + // + // The result of using one of these Mosaic topos is that multiple monitors + // will combine to create a single desktop. + // + NV_MOSAIC_TOPO_BEGIN_BASIC, + NV_MOSAIC_TOPO_1x2_BASIC = NV_MOSAIC_TOPO_BEGIN_BASIC, + NV_MOSAIC_TOPO_2x1_BASIC, + NV_MOSAIC_TOPO_1x3_BASIC, + NV_MOSAIC_TOPO_3x1_BASIC, + NV_MOSAIC_TOPO_1x4_BASIC, + NV_MOSAIC_TOPO_4x1_BASIC, + NV_MOSAIC_TOPO_2x2_BASIC, + NV_MOSAIC_TOPO_2x3_BASIC, + NV_MOSAIC_TOPO_2x4_BASIC, + NV_MOSAIC_TOPO_3x2_BASIC, + NV_MOSAIC_TOPO_4x2_BASIC, + NV_MOSAIC_TOPO_1x5_BASIC, + NV_MOSAIC_TOPO_1x6_BASIC, + NV_MOSAIC_TOPO_7x1_BASIC, + + // Add padding for 10 more entries. 6 will be enough room to specify every + // possible topology with 8 or fewer displays, so this gives us a little + // extra should we need it. + NV_MOSAIC_TOPO_END_BASIC = NV_MOSAIC_TOPO_7x1_BASIC + 9, + + // 'PASSIVE_STEREO' topos start here + // + // The result of using one of these Mosaic topos is that multiple monitors + // will combine to create a single PASSIVE STEREO desktop. What this means is + // that there will be two topos that combine to create the overall desktop. + // One topo will be used for the left eye, and the other topo (of the + // same rows x cols), will be used for the right eye. The difference between + // the two topos is that different GPUs and displays will be used. + // + NV_MOSAIC_TOPO_BEGIN_PASSIVE_STEREO, // value shadowed in nvEscDef.h + NV_MOSAIC_TOPO_1x2_PASSIVE_STEREO = NV_MOSAIC_TOPO_BEGIN_PASSIVE_STEREO, + NV_MOSAIC_TOPO_2x1_PASSIVE_STEREO, + NV_MOSAIC_TOPO_1x3_PASSIVE_STEREO, + NV_MOSAIC_TOPO_3x1_PASSIVE_STEREO, + NV_MOSAIC_TOPO_1x4_PASSIVE_STEREO, + NV_MOSAIC_TOPO_4x1_PASSIVE_STEREO, + NV_MOSAIC_TOPO_2x2_PASSIVE_STEREO, + NV_MOSAIC_TOPO_END_PASSIVE_STEREO = NV_MOSAIC_TOPO_2x2_PASSIVE_STEREO + 4, + + + // + // Total number of topos. Always leave this at the end of the enumeration. + // + NV_MOSAIC_TOPO_MAX //! Total number of topologies. + +} NV_MOSAIC_TOPO; + + +// +//! This is a "topology brief" structure. It tells you what you need to know about +//! a topology at a high level. A list of these is returned when you query for the +//! supported Mosaic information. +//! +//! If you need more detailed information about the topology, call +//! NvAPI_Mosaic_GetTopoGroup() with the topology value from this structure. +typedef struct +{ + NvU32 version; //!< Version of this structure + NV_MOSAIC_TOPO topo; //!< The topology + NvU32 enabled; //!< 1 if topo is enabled, else 0 + NvU32 isPossible; //!< 1 if topo *can* be enabled, else 0 + +} NV_MOSAIC_TOPO_BRIEF; + +//! Macro for constructing the version field of NV_MOSAIC_TOPO_BRIEF +#define NVAPI_MOSAIC_TOPO_BRIEF_VER MAKE_NVAPI_VERSION(NV_MOSAIC_TOPO_BRIEF,1) + + +// +//! 
Basic per-display settings that are used in setting/getting the Mosaic mode +typedef struct _NV_MOSAIC_DISPLAY_SETTING_V1 +{ + NvU32 version; //!< Version of this structure + NvU32 width; //!< Per-display width + NvU32 height; //!< Per-display height + NvU32 bpp; //!< Bits per pixel + NvU32 freq; //!< Display frequency +} NV_MOSAIC_DISPLAY_SETTING_V1; + +typedef struct NV_MOSAIC_DISPLAY_SETTING_V2 +{ + NvU32 version; //!< Version of this structure + NvU32 width; //!< Per-display width + NvU32 height; //!< Per-display height + NvU32 bpp; //!< Bits per pixel + NvU32 freq; //!< Display frequency + NvU32 rrx1k; //!< Display frequency in x1k +} NV_MOSAIC_DISPLAY_SETTING_V2; + +typedef NV_MOSAIC_DISPLAY_SETTING_V2 NV_MOSAIC_DISPLAY_SETTING; + +//! Macro for constructing the version field of NV_MOSAIC_DISPLAY_SETTING +#define NVAPI_MOSAIC_DISPLAY_SETTING_VER1 MAKE_NVAPI_VERSION(NV_MOSAIC_DISPLAY_SETTING_V1,1) +#define NVAPI_MOSAIC_DISPLAY_SETTING_VER2 MAKE_NVAPI_VERSION(NV_MOSAIC_DISPLAY_SETTING_V2,2) +#define NVAPI_MOSAIC_DISPLAY_SETTING_VER NVAPI_MOSAIC_DISPLAY_SETTING_VER2 + + +// +// Set a reasonable max number of display settings to support +// so arrays are bound. +// +#define NV_MOSAIC_DISPLAY_SETTINGS_MAX 40 //!< Set a reasonable maximum number of display settings to support + //! so arrays are bound. + + +// +//! This structure is used to contain a list of supported Mosaic topologies +//! along with the display settings that can be used. +typedef struct _NV_MOSAIC_SUPPORTED_TOPO_INFO_V1 +{ + NvU32 version; //!< Version of this structure + NvU32 topoBriefsCount; //!< Number of topologies in below array + NV_MOSAIC_TOPO_BRIEF topoBriefs[NV_MOSAIC_TOPO_MAX]; //!< List of supported topologies with only brief details + NvU32 displaySettingsCount; //!< Number of display settings in below array + NV_MOSAIC_DISPLAY_SETTING_V1 displaySettings[NV_MOSAIC_DISPLAY_SETTINGS_MAX]; //!< List of per display settings possible + +} NV_MOSAIC_SUPPORTED_TOPO_INFO_V1; + +typedef struct _NV_MOSAIC_SUPPORTED_TOPO_INFO_V2 +{ + NvU32 version; //!< Version of this structure + NvU32 topoBriefsCount; //!< Number of topologies in below array + NV_MOSAIC_TOPO_BRIEF topoBriefs[NV_MOSAIC_TOPO_MAX]; //!< List of supported topologies with only brief details + NvU32 displaySettingsCount; //!< Number of display settings in below array + NV_MOSAIC_DISPLAY_SETTING_V2 displaySettings[NV_MOSAIC_DISPLAY_SETTINGS_MAX]; //!< List of per display settings possible + +} NV_MOSAIC_SUPPORTED_TOPO_INFO_V2; + +typedef NV_MOSAIC_SUPPORTED_TOPO_INFO_V2 NV_MOSAIC_SUPPORTED_TOPO_INFO; + +//! Macro forconstructing the version field of NV_MOSAIC_SUPPORTED_TOPO_INFO +#define NVAPI_MOSAIC_SUPPORTED_TOPO_INFO_VER1 MAKE_NVAPI_VERSION(NV_MOSAIC_SUPPORTED_TOPO_INFO_V1,1) +#define NVAPI_MOSAIC_SUPPORTED_TOPO_INFO_VER2 MAKE_NVAPI_VERSION(NV_MOSAIC_SUPPORTED_TOPO_INFO_V2,2) +#define NVAPI_MOSAIC_SUPPORTED_TOPO_INFO_VER NVAPI_MOSAIC_SUPPORTED_TOPO_INFO_VER2 + + +// +// Indices to use to access the topos array within the mosaic topology +#define NV_MOSAIC_TOPO_IDX_DEFAULT 0 + +#define NV_MOSAIC_TOPO_IDX_LEFT_EYE 0 +#define NV_MOSAIC_TOPO_IDX_RIGHT_EYE 1 +#define NV_MOSAIC_TOPO_NUM_EYES 2 + + +// +//! This defines the maximum number of topos that can be in a topo group. +//! At this time, it is set to 2 because our largest topo group (passive +//! stereo) only needs 2 topos (left eye and right eye). +//! +//! If a new topo group with more than 2 topos is added above, then this +//! number will also have to be incremented. 
+#define NV_MOSAIC_MAX_TOPO_PER_TOPO_GROUP 2 + + +// +//! This structure defines a group of topologies that work together to create one +//! overall layout. All of the supported topologies are represented with this +//! structure. +//! +//! For example, a 'Passive Stereo' topology would be represented with this +//! structure, and would have separate topology details for the left and right eyes. +//! The count would be 2. A 'Basic' topology is also represented by this structure, +//! with a count of 1. +//! +//! The structure is primarily used internally, but is exposed to applications in a +//! read-only fashion because there are some details in it that might be useful +//! (like the number of rows/cols, or connected display information). A user can +//! get the filled-in structure by calling NvAPI_Mosaic_GetTopoGroup(). +//! +//! You can then look at the detailed values within the structure. There are no +//! entrypoints which take this structure as input (effectively making it read-only). +typedef struct +{ + NvU32 version; //!< Version of this structure + NV_MOSAIC_TOPO_BRIEF brief; //!< The brief details of this topo + NvU32 count; //!< Number of topos in array below + NV_MOSAIC_TOPO_DETAILS topos[NV_MOSAIC_MAX_TOPO_PER_TOPO_GROUP]; + +} NV_MOSAIC_TOPO_GROUP; + +//! Macro for constructing the version field of NV_MOSAIC_TOPO_GROUP +#define NVAPI_MOSAIC_TOPO_GROUP_VER MAKE_NVAPI_VERSION(NV_MOSAIC_TOPO_GROUP,1) + +//! @} + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Mosaic_GetSupportedTopoInfo +// +//! DESCRIPTION: This API returns information on the topologies and display resolutions +//! supported by Mosaic mode. +//! +//! NOTE: Not all topologies returned can be set immediately. +//! See 'OUT' Notes below. +//! +//! Once you get the list of supported topologies, you can call +//! NvAPI_Mosaic_GetTopoGroup() with one of the Mosaic topologies if you need +//! more information about it. +//! +//! 'IN' Notes: pSupportedTopoInfo->version must be set before calling this function. +//! If the specified version is not supported by this implementation, +//! an error will be returned (NVAPI_INCOMPATIBLE_STRUCT_VERSION). +//! +//! 'OUT' Notes: Some of the topologies returned might not be valid for one reason or +//! another. It could be due to mismatched or missing displays. It +//! could also be because the required number of GPUs is not found. +//! At a high level, you can see if the topology is valid and can be enabled +//! by looking at the pSupportedTopoInfo->topoBriefs[xxx].isPossible flag. +//! If this is true, the topology can be enabled. If it +//! is false, you can find out why it cannot be enabled by getting the +//! details of the topology via NvAPI_Mosaic_GetTopoGroup(). From there, +//! look at the validityMask of the individual topologies. The bits can +//! be tested against the NV_MOSAIC_TOPO_VALIDITY_* bits. +//! +//! It is possible for this function to return NVAPI_OK with no topologies +//! listed in the return structure. If this is the case, it means that +//! the current hardware DOES support Mosaic, but with the given configuration +//! no valid topologies were found. This most likely means that SLI was not +//! enabled for the hardware. Once enabled, you should see valid topologies +//! returned from this function. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! +//! \param [in,out] pSupportedTopoInfo Information about what topologies and display resolutions +//! 
are supported for Mosaic. +//! \param [in] type The type of topologies the caller is interested in +//! getting. See NV_MOSAIC_TOPO_TYPE for possible values. +//! +//! \retval ::NVAPI_OK No errors in returning supported topologies. +//! \retval ::NVAPI_NOT_SUPPORTED Mosaic is not supported with the existing hardware. +//! \retval ::NVAPI_INVALID_ARGUMENT One or more arguments passed in are invalid. +//! \retval ::NVAPI_API_NOT_INTIALIZED The NvAPI API needs to be initialized first. +//! \retval ::NVAPI_NO_IMPLEMENTATION This entrypoint not available. +//! \retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION The version of the structure passed in is not +// compatible with this entry point. +//! \retval ::NVAPI_ERROR: Miscellaneous error occurred. +//! +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Mosaic_GetSupportedTopoInfo(NV_MOSAIC_SUPPORTED_TOPO_INFO *pSupportedTopoInfo, NV_MOSAIC_TOPO_TYPE type); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Mosaic_GetTopoGroup +// +//! DESCRIPTION: This API returns a structure filled with the details +//! of the specified Mosaic topology. +//! +//! If the pTopoBrief passed in matches the current topology, +//! then information in the brief and group structures +//! will reflect what is current. Thus the brief would have +//! the current 'enable' status, and the group would have the +//! current overlap values. If there is no match, then the +//! returned brief has an 'enable' status of FALSE (since it +//! is obviously not enabled), and the overlap values will be 0. +//! +//! 'IN' Notes: pTopoGroup->version must be set before calling this function. +//! If the specified version is not supported by this implementation, +//! an error will be returned (NVAPI_INCOMPATIBLE_STRUCT_VERSION). +//! +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! \param [in] pTopoBrief The topology for getting the details +//! This must be one of the topology briefs +//! returned from NvAPI_Mosaic_GetSupportedTopoInfo(). +//! \param [in,out] pTopoGroup The topology details matching the brief +//! +//! \retval ::NVAPI_OK Details were retrieved successfully. +//! \retval ::NVAPI_NOT_SUPPORTED Mosaic is not supported with the existing hardware. +//! \retval ::NVAPI_INVALID_ARGUMENT One or more argumentss passed in are invalid. +//! \retval ::NVAPI_API_NOT_INTIALIZED The NvAPI API needs to be initialized first. +//! \retval ::NVAPI_NO_IMPLEMENTATION This entrypoint not available. +//! \retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION The version of the structure passed in is not +// compatible with this entry point. +//! \retval ::NVAPI_ERROR: Miscellaneous error occurred. +//! +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Mosaic_GetTopoGroup(NV_MOSAIC_TOPO_BRIEF *pTopoBrief, NV_MOSAIC_TOPO_GROUP *pTopoGroup); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Mosaic_GetOverlapLimits +// +//! DESCRIPTION: This API returns the X and Y overlap limits required if +//! the given Mosaic topology and display settings are to be used. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! \param [in] pTopoBrief The topology for getting limits +//! This must be one of the topo briefs +//! returned from NvAPI_Mosaic_GetSupportedTopoInfo(). 
+//! \param [in] pDisplaySetting The display settings for getting the limits. +//! This must be one of the settings +//! returned from NvAPI_Mosaic_GetSupportedTopoInfo(). +//! \param [out] pMinOverlapX X overlap minimum +//! \param [out] pMaxOverlapX X overlap maximum +//! \param [out] pMinOverlapY Y overlap minimum +//! \param [out] pMaxOverlapY Y overlap maximum +//! +//! \retval ::NVAPI_OK Details were retrieved successfully. +//! \retval ::NVAPI_NOT_SUPPORTED Mosaic is not supported with the existing hardware. +//! \retval ::NVAPI_INVALID_ARGUMENT One or more argumentss passed in are invalid. +//! \retval ::NVAPI_API_NOT_INTIALIZED The NvAPI API needs to be initialized first. +//! \retval ::NVAPI_NO_IMPLEMENTATION This entrypoint not available. +//! \retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION The version of the structure passed in is not +//! compatible with this entry point. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred. +//! +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Mosaic_GetOverlapLimits(NV_MOSAIC_TOPO_BRIEF *pTopoBrief, NV_MOSAIC_DISPLAY_SETTING *pDisplaySetting, NvS32 *pMinOverlapX, NvS32 *pMaxOverlapX, NvS32 *pMinOverlapY, NvS32 *pMaxOverlapY); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Mosaic_SetCurrentTopo +// +//! DESCRIPTION: This API sets the Mosaic topology and performs a mode switch +//! using the given display settings. +//! +//! If NVAPI_OK is returned, the current Mosaic topology was set +//! correctly. Any other status returned means the +//! topology was not set, and remains what it was before this +//! function was called. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! \param [in] pTopoBrief The topology to set. This must be one of the topologies returned from +//! NvAPI_Mosaic_GetSupportedTopoInfo(), and it must have an isPossible value of 1. +//! \param [in] pDisplaySetting The per display settings to be used in the Mosaic mode. This must be one of the +//! settings returned from NvAPI_Mosaic_GetSupportedTopoInfo(). +//! \param [in] overlapX The pixel overlap to use between horizontal displays (use positive a number for +//! overlap, or a negative number to create a gap.) If the overlap is out of bounds +//! for what is possible given the topo and display setting, the overlap will be clamped. +//! \param [in] overlapY The pixel overlap to use between vertical displays (use positive a number for +//! overlap, or a negative number to create a gap.) If the overlap is out of bounds for +//! what is possible given the topo and display setting, the overlap will be clamped. +//! \param [in] enable If 1, the topology being set will also be enabled, meaning that the mode set will +//! occur. \n +//! If 0, you don't want to be in Mosaic mode right now, but want to set the current +//! Mosaic topology so you can enable it later with NvAPI_Mosaic_EnableCurrentTopo(). +//! +//! \retval ::NVAPI_OK The Mosaic topology was set. +//! \retval ::NVAPI_NOT_SUPPORTED Mosaic is not supported with the existing hardware. +//! \retval ::NVAPI_INVALID_ARGUMENT One or more argumentss passed in are invalid. +//! \retval ::NVAPI_TOPO_NOT_POSSIBLE The topology passed in is not currently possible. +//! \retval ::NVAPI_API_NOT_INTIALIZED The NvAPI API needs to be initialized first. +//! \retval ::NVAPI_NO_IMPLEMENTATION This entrypoint not available. +//! 
\retval ::NVAPI_INCOMPATIBLE_STRUCT_VERSION The version of the structure passed in is not +//! compatible with this entrypoint. +//! \retval ::NVAPI_MODE_CHANGE_FAILED There was an error changing the display mode. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred. +//! +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Mosaic_SetCurrentTopo(NV_MOSAIC_TOPO_BRIEF *pTopoBrief, NV_MOSAIC_DISPLAY_SETTING *pDisplaySetting, NvS32 overlapX, NvS32 overlapY, NvU32 enable); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Mosaic_GetCurrentTopo +// +//! DESCRIPTION: This API returns information for the current Mosaic topology. +//! This includes topology, display settings, and overlap values. +//! +//! You can call NvAPI_Mosaic_GetTopoGroup() with the topology +//! if you require more information. +//! +//! If there isn't a current topology, then pTopoBrief->topo will +//! be NV_MOSAIC_TOPO_NONE. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! \param [out] pTopoBrief The current Mosaic topology +//! \param [out] pDisplaySetting The current per-display settings +//! \param [out] pOverlapX The pixel overlap between horizontal displays +//! \param [out] pOverlapY The pixel overlap between vertical displays +//! +//! \retval ::NVAPI_OK Success getting current info. +//! \retval ::NVAPI_NOT_SUPPORTED Mosaic is not supported with the existing hardware. +//! \retval ::NVAPI_INVALID_ARGUMENT One or more argumentss passed in are invalid. +//! \retval ::NVAPI_API_NOT_INTIALIZED The NvAPI API needs to be initialized first. +//! \retval ::NVAPI_NO_IMPLEMENTATION This entry point not available. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred. +//! +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Mosaic_GetCurrentTopo(NV_MOSAIC_TOPO_BRIEF *pTopoBrief, NV_MOSAIC_DISPLAY_SETTING *pDisplaySetting, NvS32 *pOverlapX, NvS32 *pOverlapY); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Mosaic_EnableCurrentTopo +// +//! DESCRIPTION: This API enables or disables the current Mosaic topology +//! based on the setting of the incoming 'enable' parameter. +//! +//! An "enable" setting enables the current (previously set) Mosaic topology. +//! Note that when the current Mosaic topology is retrieved, it must have an isPossible value of 1 or +//! an error will occur. +//! +//! A "disable" setting disables the current Mosaic topology. +//! The topology information will persist, even across reboots. +//! To re-enable the Mosaic topology, call this function +//! again with the enable parameter set to 1. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! \param [in] enable 1 to enable the current Mosaic topo, 0 to disable it. +//! +//! \retval ::NVAPI_OK The Mosaic topo was enabled/disabled. +//! \retval ::NVAPI_NOT_SUPPORTED Mosaic is not supported with the existing hardware. +//! \retval ::NVAPI_INVALID_ARGUMENT One or more arguments passed in are invalid. +//! \retval ::NVAPI_TOPO_NOT_POSSIBLE The current topology is not currently possible. +//! \retval ::NVAPI_MODE_CHANGE_FAILED There was an error changing the display mode. +//! \retval ::NVAPI_ERROR: Miscellaneous error occurred. +//! +//! 
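+//! Usage sketch (illustrative, not from the original header; error handling omitted): pick the first
+//! topology reported as possible by NvAPI_Mosaic_GetSupportedTopoInfo() and apply it with
+//! NvAPI_Mosaic_SetCurrentTopo(); a topology that was set earlier can then be toggled with this function.
+//! \code
+//!     NV_MOSAIC_SUPPORTED_TOPO_INFO info = {0};
+//!     info.version = NVAPI_MOSAIC_SUPPORTED_TOPO_INFO_VER;
+//!     if (NvAPI_Mosaic_GetSupportedTopoInfo(&info, NV_MOSAIC_TOPO_TYPE_BASIC) == NVAPI_OK &&
+//!         info.displaySettingsCount != 0)
+//!     {
+//!         for (NvU32 i = 0; i < info.topoBriefsCount; i++)
+//!         {
+//!             if (!info.topoBriefs[i].isPossible)
+//!                 continue;
+//!             // Set and enable the topology with no overlap or gap between displays:
+//!             NvAPI_Mosaic_SetCurrentTopo(&info.topoBriefs[i], &info.displaySettings[0], 0, 0, 1);
+//!             break;
+//!         }
+//!     }
+//!     // Later, leave Mosaic mode without discarding the stored topology:
+//!     // NvAPI_Mosaic_EnableCurrentTopo(0);
+//! \endcode
+//!
+//! 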
\ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Mosaic_EnableCurrentTopo(NvU32 enable); + +//! \ingroup mosaicapi +//! @{ +typedef struct _NV_MOSAIC_GRID_TOPO_DISPLAY_V1 +{ + NvU32 displayId; //!< DisplayID of the display + NvS32 overlapX; //!< (+overlap, -gap) + NvS32 overlapY; //!< (+overlap, -gap) + NV_ROTATE rotation; //!< Rotation of display + NvU32 cloneGroup; //!< Reserved, must be 0 +} NV_MOSAIC_GRID_TOPO_DISPLAY_V1; + +typedef enum _NV_PIXEL_SHIFT_TYPE +{ + NV_PIXEL_SHIFT_TYPE_NO_PIXEL_SHIFT = 0, //!< No pixel shift will be applied to this display. + NV_PIXEL_SHIFT_TYPE_2x2_TOP_LEFT_PIXELS = 1, //!< This display will be used to scanout top left pixels in 2x2 PixelShift configuration + NV_PIXEL_SHIFT_TYPE_2x2_BOTTOM_RIGHT_PIXELS = 2, //!< This display will be used to scanout bottom right pixels in 2x2 PixelShift configuration +} NV_PIXEL_SHIFT_TYPE; + +typedef struct _NV_MOSAIC_GRID_TOPO_DISPLAY_V2 +{ + NvU32 version; //!< Version of this structure + + NvU32 displayId; //!< DisplayID of the display + NvS32 overlapX; //!< (+overlap, -gap) + NvS32 overlapY; //!< (+overlap, -gap) + NV_ROTATE rotation; //!< Rotation of display + NvU32 cloneGroup; //!< Reserved, must be 0 + NV_PIXEL_SHIFT_TYPE pixelShiftType; //!< Type of the pixel shift enabled display +} NV_MOSAIC_GRID_TOPO_DISPLAY_V2; + +#ifndef NV_MOSAIC_GRID_TOPO_DISPLAY_VER + +typedef NV_MOSAIC_GRID_TOPO_DISPLAY_V1 NV_MOSAIC_GRID_TOPO_DISPLAY; + +#endif + +typedef struct _NV_MOSAIC_GRID_TOPO_V1 +{ + NvU32 version; //!< Version of this structure + NvU32 rows; //!< Number of rows + NvU32 columns; //!< Number of columns + NvU32 displayCount; //!< Number of display details + NvU32 applyWithBezelCorrect : 1; //!< When enabling and doing the modeset, do we switch to the bezel-corrected resolution + NvU32 immersiveGaming : 1; //!< Enable as immersive gaming instead of Mosaic SLI (for Quadro-boards only) + NvU32 baseMosaic : 1; //!< Enable as Base Mosaic (Panoramic) instead of Mosaic SLI (for NVS and Quadro-boards only) + NvU32 driverReloadAllowed : 1; //!< If necessary, reloading the driver is permitted (for Vista and above only). Will not be persisted. Value undefined on get. + NvU32 acceleratePrimaryDisplay : 1; //!< Enable SLI acceleration on the primary display while in single-wide mode (For Immersive Gaming only). Will not be persisted. Value undefined on get. + NvU32 reserved : 27; //!< Reserved, must be 0 + NV_MOSAIC_GRID_TOPO_DISPLAY_V1 displays[NV_MOSAIC_MAX_DISPLAYS]; //!< Displays are done as [(row * columns) + column] + NV_MOSAIC_DISPLAY_SETTING_V1 displaySettings; //!< Display settings +} NV_MOSAIC_GRID_TOPO_V1; + +typedef struct _NV_MOSAIC_GRID_TOPO_V2 +{ + NvU32 version; //!< Version of this structure + NvU32 rows; //!< Number of rows + NvU32 columns; //!< Number of columns + NvU32 displayCount; //!< Number of display details + NvU32 applyWithBezelCorrect : 1; //!< When enabling and doing the modeset, do we switch to the bezel-corrected resolution + NvU32 immersiveGaming : 1; //!< Enable as immersive gaming instead of Mosaic SLI (for Quadro-boards only) + NvU32 baseMosaic : 1; //!< Enable as Base Mosaic (Panoramic) instead of Mosaic SLI (for NVS and Quadro-boards only) + NvU32 driverReloadAllowed : 1; //!< If necessary, reloading the driver is permitted (for Vista and above only). Will not be persisted. Value undefined on get. 
+ NvU32 acceleratePrimaryDisplay : 1; //!< Enable SLI acceleration on the primary display while in single-wide mode (For Immersive Gaming only). Will not be persisted. Value undefined on get. + NvU32 pixelShift : 1; //!< Enable Pixel shift + NvU32 reserved : 26; //!< Reserved, must be 0 + NV_MOSAIC_GRID_TOPO_DISPLAY_V2 displays[NV_MOSAIC_MAX_DISPLAYS]; //!< Displays are done as [(row * columns) + column] + NV_MOSAIC_DISPLAY_SETTING_V1 displaySettings; //!< Display settings +} NV_MOSAIC_GRID_TOPO_V2; + +//! Macro for constructing the version field of ::NV_MOSAIC_GRID_TOPO +#define NV_MOSAIC_GRID_TOPO_VER1 MAKE_NVAPI_VERSION(NV_MOSAIC_GRID_TOPO_V1,1) +#define NV_MOSAIC_GRID_TOPO_VER2 MAKE_NVAPI_VERSION(NV_MOSAIC_GRID_TOPO_V2,2) +#ifndef NV_MOSAIC_GRID_TOPO_VER + +typedef NV_MOSAIC_GRID_TOPO_V2 NV_MOSAIC_GRID_TOPO; + +//! Macro for constructing the version field of ::NV_MOSAIC_GRID_TOPO +#define NV_MOSAIC_GRID_TOPO_VER NV_MOSAIC_GRID_TOPO_VER2 + +#endif + +//! @} + +//! since Release R290 + +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_DISPLAY_ON_INVALID_GPU NV_BIT(0) +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_DISPLAY_ON_WRONG_CONNECTOR NV_BIT(1) +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_NO_COMMON_TIMINGS NV_BIT(2) +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_NO_EDID_AVAILABLE NV_BIT(3) +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_MISMATCHED_OUTPUT_TYPE NV_BIT(4) +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_NO_DISPLAY_CONNECTED NV_BIT(5) +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_NO_GPU_TOPOLOGY NV_BIT(6) +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_NOT_SUPPORTED NV_BIT(7) +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_NO_SLI_BRIDGE NV_BIT(8) +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_ECC_ENABLED NV_BIT(9) +#define NV_MOSAIC_DISPLAYCAPS_PROBLEM_GPU_TOPOLOGY_NOT_SUPPORTED NV_BIT(10) + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Mosaic_SetDisplayGrids +// +//! DESCRIPTION: Sets a new display topology, replacing any existing topologies +//! that use the same displays. +//! +//! This function will look for an SLI configuration that will +//! allow the display topology to work. +//! +//! To revert to a single display, specify that display as a 1x1 +//! grid. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] pGridTopologies The topology details to set. +//! \param [in] gridCount The number of elements in the pGridTopologies array. +//! \param [in] setTopoFlags Zero or more of the NVAPI_MOSAIC_SETDISPLAYTOPO_FLAG_* +//! flags. +//! +//! +//! \retval ::NVAPI_OK Capabilities have been returned. +//! \retval ::NVAPI_INVALID_ARGUMENT One or more args passed in are invalid. +//! \retval ::NVAPI_API_NOT_INTIALIZED The NvAPI API needs to be initialized first +//! \retval ::NVAPI_NO_IMPLEMENTATION This entrypoint not available +//! \retval ::NVAPI_NO_ACTIVE_SLI_TOPOLOGY No matching GPU topologies could be found. +//! \retval ::NVAPI_TOPO_NOT_POSSIBLE One or more of the display grids are not valid. +//! \retval ::NVAPI_ERROR Miscellaneous error occurred +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// + + +//! Do not change the current GPU topology. If the NO_DRIVER_RELOAD bit is not +//! specified, then it may still require a driver reload. +#define NV_MOSAIC_SETDISPLAYTOPO_FLAG_CURRENT_GPU_TOPOLOGY NV_BIT(0) + +//! Do not allow a driver reload. That is, stick with the same master GPU as well as the +//! same SLI configuration. +#define NV_MOSAIC_SETDISPLAYTOPO_FLAG_NO_DRIVER_RELOAD NV_BIT(1) + +//! 
When choosing a GPU topology, choose the topology with the best performance. +//! Without this flag, it will choose the topology that uses the smallest number +//! of GPU's. +#define NV_MOSAIC_SETDISPLAYTOPO_FLAG_MAXIMIZE_PERFORMANCE NV_BIT(2) + +//! Do not return an error if no configuration will work with all of the grids. +#define NV_MOSAIC_SETDISPLAYTOPO_FLAG_ALLOW_INVALID NV_BIT(3) + +NVAPI_INTERFACE NvAPI_Mosaic_SetDisplayGrids(__in_ecount(gridCount) NV_MOSAIC_GRID_TOPO *pGridTopologies, __in NvU32 gridCount, __in NvU32 setTopoFlags); + + +//! \ingroup mosaicapi +//! Indicates that a display's position in the grid is sub-optimal. +#define NV_MOSAIC_DISPLAYTOPO_WARNING_DISPLAY_POSITION NV_BIT(0) + +//! \ingroup mosaicapi +//! Indicates that SetDisplaySettings would need to perform a driver reload. +#define NV_MOSAIC_DISPLAYTOPO_WARNING_DRIVER_RELOAD_REQUIRED NV_BIT(1) + +//! \ingroup mosaicapi +typedef struct +{ + NvU32 version; + NvU32 errorFlags; //!< (OUT) Any of the NV_MOSAIC_DISPLAYTOPO_ERROR_* flags. + NvU32 warningFlags; //!< (OUT) Any of the NV_MOSAIC_DISPLAYTOPO_WARNING_* flags. + + NvU32 displayCount; //!< (OUT) The number of valid entries in the displays array. + struct + { + NvU32 displayId; //!< (OUT) The DisplayID of this display. + NvU32 errorFlags; //!< (OUT) Any of the NV_MOSAIC_DISPLAYCAPS_PROBLEM_* flags. + NvU32 warningFlags; //!< (OUT) Any of the NV_MOSAIC_DISPLAYTOPO_WARNING_* flags. + + NvU32 supportsRotation : 1; //!< (OUT) This display can be rotated + NvU32 reserved : 31; //!< (OUT) reserved + } displays[NVAPI_MAX_DISPLAYS]; +} NV_MOSAIC_DISPLAY_TOPO_STATUS; + +//! \ingroup mosaicapi +#define NV_MOSAIC_DISPLAY_TOPO_STATUS_VER MAKE_NVAPI_VERSION(NV_MOSAIC_DISPLAY_TOPO_STATUS,1) + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Mosaic_ValidateDisplayGrids +// +//! DESCRIPTION: Determines if a list of grid topologies is valid. It will choose an SLI +//! configuration in the same way that NvAPI_Mosaic_SetDisplayGrids() does. +//! +//! On return, each element in the pTopoStatus array will contain any errors or +//! warnings about each grid topology. If any error flags are set, then the topology +//! is not valid. If any warning flags are set, then the topology is valid, but +//! sub-optimal. +//! +//! If the ALLOW_INVALID flag is set, then it will continue to validate the grids +//! even if no SLI configuration will allow all of the grids. In this case, a grid +//! grid with no matching GPU topology will have the error +//! flags NO_GPU_TOPOLOGY or NOT_SUPPORTED set. +//! +//! If the ALLOW_INVALID flag is not set and no matching SLI configuration is +//! found, then it will skip the rest of the validation and return +//! NVAPI_NO_ACTIVE_SLI_TOPOLOGY. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] setTopoFlags Zero or more of the NVAPI_MOSAIC_SETDISPLAYTOPO_FLAG_* +//! flags. +//! \param [in] pGridTopologies The array of grid topologies to verify. +//! \param [in,out] pTopoStatus The array of problems and warnings with each grid topology. +//! \param [in] gridCount The number of elements in the pGridTopologies and +//! pTopoStatus arrays. +//! +//! +//! \retval ::NVAPI_OK: Capabilities have been returned. +//! \retval ::NVAPI_INVALID_ARGUMENT: One or more args passed in are invalid. +//! \retval ::NVAPI_API_NOT_INTIALIZED: The NvAPI API needs to be initialized first +//! \retval ::NVAPI_NO_IMPLEMENTATION: This entrypoint not available +//! 
\retval ::NVAPI_NO_ACTIVE_SLI_TOPOLOGY:  No matching GPU topologies could be found.
+//! \retval ::NVAPI_ERROR:                   Miscellaneous error occurred
+//!
+//! \ingroup mosaicapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_Mosaic_ValidateDisplayGrids(__in NvU32 setTopoFlags,
+                __in_ecount(gridCount) NV_MOSAIC_GRID_TOPO *pGridTopologies,
+                __inout_ecount_full(gridCount) NV_MOSAIC_DISPLAY_TOPO_STATUS *pTopoStatus,
+                __in NvU32 gridCount);
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME:   NvAPI_Mosaic_EnumDisplayModes
+//
+//! DESCRIPTION:    Determines the set of available display modes for a given grid topology.
+//!
+//! SUPPORTED OS:  Windows 7 and higher
+//!
+//!
+//! \param [in]      pGridTopology      The grid topology to use.
+//! \param [in,out]  pDisplaySettings   A pointer to an array of display settings to populate,
+//!                                     or NULL to find out the total number of available modes.
+//! \param [in,out]  pDisplayCount      If pDisplaySettings is not NULL, then pDisplayCount
+//!                                     should point to the number of elements in the
+//!                                     pDisplaySettings array. On return, it will contain the
+//!                                     number of modes that were actually returned. If
+//!                                     pDisplaySettings is NULL, then pDisplayCount will receive
+//!                                     the total number of modes that are available.
+//!
+//!
+//! \retval ::NVAPI_OK                   Capabilities have been returned.
+//! \retval ::NVAPI_INVALID_ARGUMENT     One or more args passed in are invalid.
+//! \retval ::NVAPI_API_NOT_INITIALIZED  The NvAPI API needs to be initialized first
+//! \retval ::NVAPI_NO_IMPLEMENTATION    This entrypoint not available
+//! \retval ::NVAPI_ERROR                Miscellaneous error occurred
+//!
+//! \ingroup mosaicapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_Mosaic_EnumDisplayModes(__in NV_MOSAIC_GRID_TOPO *pGridTopology,
+                __inout_ecount_part_opt(*pDisplayCount, *pDisplayCount) NV_MOSAIC_DISPLAY_SETTING *pDisplaySettings,
+                __inout NvU32 *pDisplayCount);
+
+
+//! SUPPORTED OS: Windows 7 and higher
+//!
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME:   NvAPI_Mosaic_EnumDisplayGrids
+//
+//! DESCRIPTION:    Enumerates the current active grid topologies. This includes Mosaic, IG, and
+//!                 Panoramic topologies, as well as single displays.
+//!
+//!                 If pGridTopologies is NULL, then pGridCount will be set to the number of active
+//!                 grid topologies.
+//!
+//!                 If pGridTopologies is not NULL, then pGridCount contains the maximum number of
+//!                 grid topologies to return. On return, pGridCount will be set to the number of
+//!                 grid topologies that were returned.
+//!
+//! \param [out]     pGridTopologies  The list of active grid topologies.
+//! \param [in,out]  pGridCount       A pointer to the number of grid topologies returned.
+//!
+//! \retval ::NVAPI_OK                   Capabilities have been returned.
+//! \retval ::NVAPI_END_ENUMERATION      There are no more topologies to return.
+//! \retval ::NVAPI_INVALID_ARGUMENT     One or more args passed in are invalid.
+//! \retval ::NVAPI_API_NOT_INITIALIZED  The NvAPI API needs to be initialized first
+//! \retval ::NVAPI_NO_IMPLEMENTATION    This entrypoint not available
+//! \retval ::NVAPI_ERROR                Miscellaneous error occurred
+//!
+//! 
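+//! Usage sketch (illustrative, not from the original header; error handling and <stdlib.h> omitted): the
+//! usual two-call pattern, first querying the number of active grids and then fetching their descriptions.
+//! \code
+//!     NvU32 gridCount = 0;
+//!     if (NvAPI_Mosaic_EnumDisplayGrids(NULL, &gridCount) == NVAPI_OK && gridCount != 0)
+//!     {
+//!         NV_MOSAIC_GRID_TOPO* grids = (NV_MOSAIC_GRID_TOPO*)calloc(gridCount, sizeof(NV_MOSAIC_GRID_TOPO));
+//!         for (NvU32 i = 0; i < gridCount; i++)
+//!             grids[i].version = NV_MOSAIC_GRID_TOPO_VER;
+//!         NvAPI_Mosaic_EnumDisplayGrids(grids, &gridCount);
+//!         // grids[0..gridCount-1] now describe the active grid topologies (rows, columns, displays, ...).
+//!         free(grids);
+//!     }
+//! \endcode
+//!
+//! 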
\ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Mosaic_EnumDisplayGrids(__inout_ecount_part_opt(*pGridCount, *pGridCount) NV_MOSAIC_GRID_TOPO *pGridTopologies, + __inout NvU32 *pGridCount); + + +//////////////////////////////////////////////////////////////////////////////////////// +// +// ########################################################################### +// DELME_RUSS - DELME_RUSS - DELME_RUSS - DELME_RUSS - DELME_RUSS - DELME_RUSS +// +// Below is the Phase 1 Mosaic stuff, the Phase 2 stuff above is what will remain +// once Phase 2 is complete. For a small amount of time, the two will co-exist. As +// soon as apps (nvapichk, NvAPITestMosaic, and CPL) are updated to use the Phase 2 +// entrypoints, the code below will be deleted. +// +// DELME_RUSS - DELME_RUSS - DELME_RUSS - DELME_RUSS - DELME_RUSS - DELME_RUSS +// ########################################################################### +// +// Supported topos 1x4, 4x1 and 2x2 to start with. +// +// Selected scan out targets can be one per GPU or more than one on the same GPU. +// +// SAMPLE of MOSAIC 1x4 SCAN OUT TOPO with 8 pixel horizontal overlap +// +//+-------------------------++-------------------------++-------------------------++-------------------------+ +//| || || || | +//| || || || | +//| || || || | +//| DVI1 || DVI2 || DVI3 || DVI4 | +//| || || || | +//| || || || | +//| || || || | +//| || || || | +//+-------------------------++-------------------------++-------------------------++-------------------------+ + + +//! \addtogroup mosaicapi +//! @{ + +//! Used in NV_MOSAIC_TOPOLOGY. +#define NVAPI_MAX_MOSAIC_DISPLAY_ROWS 8 + +//! Used in NV_MOSAIC_TOPOLOGY. +#define NVAPI_MAX_MOSAIC_DISPLAY_COLUMNS 8 + +//! Used in NV_MOSAIC_TOPOLOGY. +#define NVAPI_MAX_MOSAIC_TOPOS 16 + +//! Used in NvAPI_GetCurrentMosaicTopology() and NvAPI_SetCurrentMosaicTopology(). +typedef struct +{ + NvU32 version; //!< Version number of the mosaic topology + NvU32 rowCount; //!< Horizontal display count + NvU32 colCount; //!< Vertical display count + + struct + { + NvPhysicalGpuHandle hPhysicalGPU; //!< Physical GPU to be used in the topology + NvU32 displayOutputId; //!< Connected display target + NvS32 overlapX; //!< Pixels of overlap on the left of target: (+overlap, -gap) + NvS32 overlapY; //!< Pixels of overlap on the top of target: (+overlap, -gap) + + } gpuLayout[NVAPI_MAX_MOSAIC_DISPLAY_ROWS][NVAPI_MAX_MOSAIC_DISPLAY_COLUMNS]; + +} NV_MOSAIC_TOPOLOGY; + +//! Used in NV_MOSAIC_TOPOLOGY. +#define NVAPI_MOSAIC_TOPOLOGY_VER MAKE_NVAPI_VERSION(NV_MOSAIC_TOPOLOGY,1) + +//! Used in NvAPI_GetSupportedMosaicTopologies(). +typedef struct +{ + NvU32 version; + NvU32 totalCount; //!< Count of valid topologies + NV_MOSAIC_TOPOLOGY topos[NVAPI_MAX_MOSAIC_TOPOS]; //!< Maximum number of topologies + +} NV_MOSAIC_SUPPORTED_TOPOLOGIES; + +//! Used in NV_MOSAIC_SUPPORTED_TOPOLOGIES. +#define NVAPI_MOSAIC_SUPPORTED_TOPOLOGIES_VER MAKE_NVAPI_VERSION(NV_MOSAIC_SUPPORTED_TOPOLOGIES,1) + +//!@} + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetSupportedMosaicTopologies +// +//! DESCRIPTION: This API returns all valid Mosaic topologies. +//! +//! SUPPORTED OS: Windows XP +//! +//! +//! \since Release: 177 +//! +//! \param [out] pMosaicTopos An array of valid Mosaic topologies. +//! +//! \retval NVAPI_OK Call succeeded; 1 or more topologies were returned +//! 
\retval NVAPI_INVALID_ARGUMENT One or more arguments are invalid +//! \retval NVAPI_MIXED_TARGET_TYPES Mosaic topology is only possible with all targets of the same NV_GPU_OUTPUT_TYPE. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_NOT_SUPPORTED Mosaic is not supported with GPUs on this system. +//! \retval NVAPI_NO_ACTIVE_SLI_TOPOLOGY SLI is not enabled, yet needs to be, in order for this function to succeed. +//! +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetSupportedMosaicTopologies(NV_MOSAIC_SUPPORTED_TOPOLOGIES *pMosaicTopos); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetCurrentMosaicTopology +// +//! DESCRIPTION: This API gets the current Mosaic topology. +//! +//! SUPPORTED OS: Windows XP +//! +//! +//! \since Release: 177 +//! +//! \param [out] pMosaicTopo The current Mosaic topology +//! \param [out] pEnabled TRUE if returned topology is currently enabled, else FALSE +//! +//! \retval NVAPI_OK Call succeeded +//! \retval NVAPI_INVALID_ARGUMENT One or more arguments are invalid +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_NOT_SUPPORTED Mosaic is not supported with GPUs on this system. +//! \retval NVAPI_NO_ACTIVE_SLI_TOPOLOGY SLI is not enabled, yet needs to be, in order for this function to succeed. +//! +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GetCurrentMosaicTopology(NV_MOSAIC_TOPOLOGY *pMosaicTopo, NvU32 *pEnabled); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_SetCurrentMosaicTopology +// +//! DESCRIPTION: This API sets the Mosaic topology, and enables it so that the +//! Mosaic display settings are enumerated upon request. +//! +//! SUPPORTED OS: Windows XP +//! +//! +//! \since Release: 177 +//! +//! \param [in] pMosaicTopo A valid Mosaic topology +//! +//! \retval NVAPI_OK Call succeeded +//! \retval NVAPI_INVALID_ARGUMENT One or more arguments are invalid +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \retval NVAPI_NOT_SUPPORTED Mosaic is not supported with GPUs on this system. +//! \retval NVAPI_NO_ACTIVE_SLI_TOPOLOGY SLI is not enabled, yet needs to be, in order for this function to succeed. +//! +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_SetCurrentMosaicTopology(NV_MOSAIC_TOPOLOGY *pMosaicTopo); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_EnableCurrentMosaicTopology +// +//! DESCRIPTION: This API enables or disables the current Mosaic topology. +//! When enabling, the last Mosaic topology will be set. +//! +//! - If enabled, enumeration of display settings will include valid Mosaic resolutions. +//! - If disabled, enumeration of display settings will not include Mosaic resolutions. +//! +//! SUPPORTED OS: Windows XP +//! +//! +//! \since Release: 177 +//! +//! \param [in] enable TRUE to enable the Mosaic Topology, FALSE to disable it. +//! +//! \retval NVAPI_OK Call succeeded +//! \retval NVAPI_INVALID_ARGUMENT One or more arguments are invalid +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! 
\retval NVAPI_NOT_SUPPORTED Mosaic is not supported with GPUs on this system. +//! \retval NVAPI_NO_ACTIVE_SLI_TOPOLOGY SLI is not enabled, yet needs to be, in order for this function to succeed. +//! +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_EnableCurrentMosaicTopology(NvU32 enable); + + +#define NVAPI_MAX_GSYNC_DEVICES 4 + + +// Sync Display APIs + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GSync_EnumSyncDevices +// +//! DESCRIPTION: This API returns an array of Sync device handles. A Sync device handle represents a +//! single Sync device on the system. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \since Release: 313 +//! +//! \param [out] nvGSyncHandles- The caller provides an array of handles, which must contain at least +//! NVAPI_MAX_GSYNC_DEVICES elements. The API will zero out the entire array and then fill in one +//! or more handles. If an error occurs, the array is invalid. +//! \param [out] *gsyncCount- The caller provides the storage space. NvAPI_GSync_EnumSyncDevices +//! sets *gsyncCount to indicate how many of the elements in the nvGSyncHandles[] array are valid. +//! If an error occurs, *gsyncCount will be set to zero. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, they are listed below. +//! \retval ::NVAPI_INVALID_ARGUMENT nvGSyncHandles or gsyncCount is NULL. +//! \retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND The queried Graphics system does not have any Sync Device. +//! +//! \ingroup gsyncapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GSync_EnumSyncDevices(__out NvGSyncDeviceHandle nvGSyncHandles[NVAPI_MAX_GSYNC_DEVICES], __out NvU32 *gsyncCount); + + + +// GSync boardId values +#define NVAPI_GSYNC_BOARD_ID_P358 856 //!< GSync board ID 0x358, see NV_GSYNC_CAPABILITIES +#define NVAPI_GSYNC_BOARD_ID_P2060 8288 //!< GSync board ID 0x2060, see NV_GSYNC_CAPABILITIES + + +//! Used in NvAPI_GSync_QueryCapabilities(). +typedef struct _NV_GSYNC_CAPABILITIES_V1 +{ + NvU32 version; //!< Version of the structure + NvU32 boardId; //!< Board ID + NvU32 revision; //!< FPGA Revision + NvU32 capFlags; //!< Capabilities of the Sync board. Reserved for future use +} NV_GSYNC_CAPABILITIES_V1; + +typedef NV_GSYNC_CAPABILITIES_V1 NV_GSYNC_CAPABILITIES; + + +//! \ingroup gsyncapi +//! Macro for constructing the version field of NV_GSYNC_CAPABILITIES. +#define NV_GSYNC_CAPABILITIES_VER1 MAKE_NVAPI_VERSION(NV_GSYNC_CAPABILITIES_V1,1) +#define NV_GSYNC_CAPABILITIES_VER NV_GSYNC_CAPABILITIES_VER1 + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GSync_QueryCapabilities +// +//! DESCRIPTION: This API returns the capabilities of the Sync device. +//! +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \since Release: 313 +//! +//! \param [in] hNvGSyncDevice- The handle for a Sync device for which the capabilities will be queried. +//! \param [inout] *pNvGSyncCapabilities- The caller provides the storage space. NvAPI_GSync_QueryCapabilities() sets +//! *pNvGSyncCapabilities to the version and capabilities details of the Sync device +//! If an error occurs, *pNvGSyncCapabilities will be set to NULL. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! 
If there are return error codes with specific meaning for this API, they are listed below.
+//! \retval ::NVAPI_INVALID_ARGUMENT hNvGSyncDevice is NULL.
+//! \retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND The queried Graphics system does not have any Sync Device.
+//!
+//! \ingroup gsyncapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_GSync_QueryCapabilities(__in NvGSyncDeviceHandle hNvGSyncDevice, __inout NV_GSYNC_CAPABILITIES *pNvGSyncCapabilities);
+
+
+
+//! Connector values for a GPU. Used in NV_GSYNC_GPU.
+typedef enum _NVAPI_GSYNC_GPU_TOPOLOGY_CONNECTOR
+{
+    NVAPI_GSYNC_GPU_TOPOLOGY_CONNECTOR_NONE = 0,
+    NVAPI_GSYNC_GPU_TOPOLOGY_CONNECTOR_PRIMARY = 1,
+    NVAPI_GSYNC_GPU_TOPOLOGY_CONNECTOR_SECONDARY = 2,
+    NVAPI_GSYNC_GPU_TOPOLOGY_CONNECTOR_TERTIARY = 3,
+    NVAPI_GSYNC_GPU_TOPOLOGY_CONNECTOR_QUARTERNARY = 4,
+} NVAPI_GSYNC_GPU_TOPOLOGY_CONNECTOR;
+
+//! Display sync states. Used in NV_GSYNC_DISPLAY.
+typedef enum _NVAPI_GSYNC_DISPLAY_SYNC_STATE
+{
+    NVAPI_GSYNC_DISPLAY_SYNC_STATE_UNSYNCED = 0,
+    NVAPI_GSYNC_DISPLAY_SYNC_STATE_SLAVE = 1,
+    NVAPI_GSYNC_DISPLAY_SYNC_STATE_MASTER = 2,
+} NVAPI_GSYNC_DISPLAY_SYNC_STATE;
+
+typedef struct _NV_GSYNC_GPU
+{
+    NvU32 version;                                 //!< Version of the structure
+    NvPhysicalGpuHandle hPhysicalGpu;              //!< GPU handle
+    NVAPI_GSYNC_GPU_TOPOLOGY_CONNECTOR connector;  //!< Indicates which connector on the device the GPU is connected to.
+    NvPhysicalGpuHandle hProxyPhysicalGpu;         //!< GPU through which hPhysicalGpu is connected to the Sync device (if not directly connected)
+                                                   //!< - this is NULL otherwise
+    NvU32 isSynced : 1;                            //!< Whether this GPU is sync'd or not.
+    NvU32 reserved : 31;                           //!< Should be set to ZERO
+} NV_GSYNC_GPU;
+
+typedef struct _NV_GSYNC_DISPLAY
+{
+    NvU32 version;                                 //!< Version of the structure
+    NvU32 displayId;                               //!< Display identifier for the display. The GPU to which it is connected can be retrieved from NvAPI_SYS_GetPhysicalGpuFromDisplayId
+    NvU32 isMasterable : 1;                        //!< Can this display be the master? (Read only)
+    NvU32 reserved : 31;                           //!< Should be set to ZERO
+    NVAPI_GSYNC_DISPLAY_SYNC_STATE syncState;      //!< Is this display slave/master
+                                                   //!< (Retrieved with topology or set by caller for enable/disable sync)
+} NV_GSYNC_DISPLAY;
+
+#define NV_GSYNC_DISPLAY_VER MAKE_NVAPI_VERSION(NV_GSYNC_DISPLAY,1)
+#define NV_GSYNC_GPU_VER     MAKE_NVAPI_VERSION(NV_GSYNC_GPU,1)
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME:   NvAPI_GSync_GetTopology
+//
+//! DESCRIPTION:    This API returns the topology for the specified Sync device.
+//!
+//!
+//! SUPPORTED OS:  Windows 7 and higher
+//!
+//!
+//! \since Release: 313
+//!
+//! \param [in]       hNvGSyncDevice-     The caller provides the handle for a Sync device for which the topology will be queried.
+//! \param [in, out]  gsyncGpuCount-      Returns the number of GPUs connected to the Sync device
+//! \param [in, out]  gsyncGPUs-          Returns info about the GPUs connected to the Sync device
+//! \param [in, out]  gsyncDisplayCount-  Returns the number of active displays that belong to the Sync device
+//! \param [in, out]  gsyncDisplays-      Returns info about all active displays that belong to the Sync device
+//!
+//! HOW TO USE: 1) Make a call to get the number of GPUs connected OR displays synced through the Sync device
+//!                by passing gsyncGPUs OR gsyncDisplays as NULL, respectively. Both gsyncGpuCount and gsyncDisplayCount can be retrieved in the same call by passing
+//!                both gsyncGPUs and gsyncDisplays as NULL.
+//!                On call success:
+//!
2) Allocate memory based on gsyncGpuCount (for gsyncGPUs) and/or gsyncDisplayCount (for gsyncDisplays), then make a second call to populate gsyncGPUs and/or gsyncDisplays respectively.
+//!
+//! \return  This API can return any of the error codes enumerated in #NvAPI_Status.
+//!          If there are return error codes with specific meaning for this API, they are listed below.
+//! \retval ::NVAPI_INVALID_ARGUMENT        hNvGSyncDevice is NULL.
+//! \retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND The queried Graphics system does not have any Sync Device.
+//! \retval ::NVAPI_INSUFFICIENT_BUFFER     When the actual number of GPUs/displays in the topology exceeds the number of elements allocated for SyncGPUs/SyncDisplays respectively.
+//!
+//! \ingroup gsyncapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_GSync_GetTopology(__in NvGSyncDeviceHandle hNvGSyncDevice, __inout_opt NvU32 *gsyncGpuCount, __inout_ecount_part_opt(*gsyncGpuCount, *gsyncGpuCount) NV_GSYNC_GPU *gsyncGPUs,
+                                        __inout_opt NvU32 *gsyncDisplayCount, __inout_ecount_part_opt(*gsyncDisplayCount, *gsyncDisplayCount) NV_GSYNC_DISPLAY *gsyncDisplays);
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME:   NvAPI_GSync_SetSyncStateSettings
+//
+//! DESCRIPTION:    Sets a new sync state for the displays in the system.
+//!
+//!
+//! SUPPORTED OS:  Windows 7 and higher
+//!
+//!
+//! \since Release: 313
+//!
+//! \param [in]  gsyncDisplayCount-  The number of displays in gsyncDisplays.
+//! \param [in]  pGsyncDisplays-     The caller provides the structure containing all displays that need to be synchronized in the system.
+//!                                  The displays that are not part of pGsyncDisplays will be un-synchronized.
+//! \param [in]  flags-              Reserved for future use.
+//!
+//!
+//! \return  This API can return any of the error codes enumerated in #NvAPI_Status.
+//!          If there are return error codes with specific meaning for this API, they are listed below.
+//!
+//! \retval ::NVAPI_INVALID_ARGUMENT        If the display topology or count is not valid.
+//! \retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND The queried Graphics system does not have any Sync Device.
+//! \retval ::NVAPI_INVALID_SYNC_TOPOLOGY   1. If any mosaic grid is partial.
+//!                                         2. If the applied timing (HVisible/VVisible/refreshRate) of any display is different.
+//!                                         3. If there is an across-GPU mosaic grid in the system that is not a part of pGsyncDisplays.
+//!
+//! \ingroup gsyncapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_GSync_SetSyncStateSettings(__in NvU32 gsyncDisplayCount, __in_ecount(gsyncDisplayCount) NV_GSYNC_DISPLAY *pGsyncDisplays, __in NvU32 flags);
+
+
+//! \ingroup gsyncapi
+
+//! Source signal edge to be used for output pulse. See NV_GSYNC_CONTROL_PARAMS.
+typedef enum _NVAPI_GSYNC_POLARITY
+{
+    NVAPI_GSYNC_POLARITY_RISING_EDGE  = 0,
+    NVAPI_GSYNC_POLARITY_FALLING_EDGE = 1,
+    NVAPI_GSYNC_POLARITY_BOTH_EDGES   = 2,
+} NVAPI_GSYNC_POLARITY;
+
+//! Used in NV_GSYNC_CONTROL_PARAMS.
+typedef enum _NVAPI_GSYNC_VIDEO_MODE
+{
+    NVAPI_GSYNC_VIDEO_MODE_NONE         = 0,
+    NVAPI_GSYNC_VIDEO_MODE_TTL          = 1,
+    NVAPI_GSYNC_VIDEO_MODE_NTSCPALSECAM = 2,
+    NVAPI_GSYNC_VIDEO_MODE_HDTV         = 3,
+    NVAPI_GSYNC_VIDEO_MODE_COMPOSITE    = 4,
+} NVAPI_GSYNC_VIDEO_MODE;
+
+//! Used in NV_GSYNC_CONTROL_PARAMS.
+typedef enum _NVAPI_GSYNC_SYNC_SOURCE
+{
+    NVAPI_GSYNC_SYNC_SOURCE_VSYNC     = 0,
+    NVAPI_GSYNC_SYNC_SOURCE_HOUSESYNC = 1,
+} NVAPI_GSYNC_SYNC_SOURCE;
+
+//! Used in NV_GSYNC_CONTROL_PARAMS.
+typedef struct _NV_GSYNC_DELAY +{ + NvU32 version; //!< Version of the structure + NvU32 numLines; //!< delay to be induced in number of horizontal lines. + NvU32 numPixels; //!< delay to be induced in number of pixels. + NvU32 maxLines; //!< maximum number of lines supported at current display mode to induce delay. Updated by NvAPI_GSync_GetControlParameters(). Read only. + NvU32 minPixels; //!< minimum number of pixels required at current display mode to induce delay. Updated by NvAPI_GSync_GetControlParameters(). Read only. +} NV_GSYNC_DELAY; + +#define NV_GSYNC_DELAY_VER MAKE_NVAPI_VERSION(NV_GSYNC_DELAY,1) + +//! Used in NvAPI_GSync_GetControlParameters() and NvAPI_GSync_SetControlParameters(). +typedef struct _NV_GSYNC_CONTROL_PARAMS +{ + NvU32 version; //!< Version of the structure + NVAPI_GSYNC_POLARITY polarity; //!< Leading edge / Falling edge / both + NVAPI_GSYNC_VIDEO_MODE vmode; //!< None, TTL, NTSCPALSECAM, HDTV + NvU32 interval; //!< Number of pulses to wait between framelock signal generation + NVAPI_GSYNC_SYNC_SOURCE source; //!< VSync/House sync + NvU32 interlaceMode:1; //!< interlace mode for a Sync device + NvU32 reserved:31; //!< should be set zero + NV_GSYNC_DELAY syncSkew; //!< The time delay between the frame sync signal and the GPUs signal. + NV_GSYNC_DELAY startupDelay; //!< Sync start delay for master. +} NV_GSYNC_CONTROL_PARAMS; + +#define NV_GSYNC_CONTROL_PARAMS_VER MAKE_NVAPI_VERSION(NV_GSYNC_CONTROL_PARAMS,1) + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GSync_GetControlParameters +// +//! DESCRIPTION: This API queries for sync control parameters as defined in NV_GSYNC_CONTROL_PARAMS. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \since Release: 313 +//! +//! \param [in] hNvGSyncDevice- The caller provides the handle of the Sync device for which to get parameters +//! \param [inout] *pGsyncControls- The caller provides the storage space. NvAPI_GSync_GetControlParameters() populates *pGsyncControls with values. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, they are listed below. +//! \retval ::NVAPI_INVALID_ARGUMENT hNvGSyncDevice is NULL. +//! \retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND The queried Graphics system does not have any Sync Device. +//! +//! \ingroup gsyncapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GSync_GetControlParameters(__in NvGSyncDeviceHandle hNvGSyncDevice, __inout NV_GSYNC_CONTROL_PARAMS *pGsyncControls); + + + +////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GSync_SetControlParameters +// +//! DESCRIPTION: This API sets control parameters as defined in NV_SYNC_CONTROL_PARAMS. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \since Release: 313 +//! +//! \param [in] hNvGSyncDevice- The caller provides the handle of the Sync device for which to get parameters +//! \param [inout] *pGsyncControls- The caller provides NV_GSYNC_CONTROL_PARAMS. skew and startDelay will be updated to the applied values. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, they are listed below. +//! \retval ::NVAPI_INVALID_ARGUMENT hNvGSyncDevice is NULL. +//! 
\retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND The queried Graphics system does not have any Sync Device. +//! \retval ::NVAPI_SYNC_MASTER_NOT_FOUND Control Parameters can only be set if there is a Sync Master enabled on the Gsync card. +//! +//! \ingroup gsyncapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GSync_SetControlParameters(__in NvGSyncDeviceHandle hNvGSyncDevice, __inout NV_GSYNC_CONTROL_PARAMS *pGsyncControls); + + + + +//! Used in NvAPI_GSync_AdjustSyncDelay() +typedef enum _NVAPI_GSYNC_DELAY_TYPE +{ + NVAPI_GSYNC_DELAY_TYPE_UNKNOWN = 0, + NVAPI_GSYNC_DELAY_TYPE_SYNC_SKEW = 1, + NVAPI_GSYNC_DELAY_TYPE_STARTUP = 2 +} NVAPI_GSYNC_DELAY_TYPE; + +////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GSync_AdjustSyncDelay +// +//! DESCRIPTION: This API adjusts the skew and startDelay to the closest possible values. Use this API before calling NvAPI_GSync_SetControlParameters for skew or startDelay. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \since Release: 319 +//! +//! \param [in] hNvGSyncDevice- The caller provides the handle of the Sync device for which to get parameters +//! \param [in] delayType- Specifies whether the delay is syncSkew or startupDelay. +//! \param [inout] *pGsyncDelay- The caller provides NV_GSYNC_DELAY. skew and startDelay will be adjusted and updated to the closest values. +//! \param [out] *syncSteps- This parameter is optional. It returns the sync delay in unit steps. If 0, it means either the NV_GSYNC_DELAY::numPixels is less than NV_GSYNC_DELAY::minPixels or NV_GSYNC_DELAY::numOfLines exceeds the NV_GSYNC_DELAY::maxLines. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, they are listed below. +//! +//! \ingroup gsyncapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GSync_AdjustSyncDelay(__in NvGSyncDeviceHandle hNvGSyncDevice, __in NVAPI_GSYNC_DELAY_TYPE delayType, __inout NV_GSYNC_DELAY *pGsyncDelay, __out_opt NvU32* syncSteps); + + + +//! Used in NvAPI_GSync_GetSyncStatus(). +typedef struct _NV_GSYNC_STATUS +{ + NvU32 version; //!< Version of the structure + NvU32 bIsSynced; //!< Is timing in sync? + NvU32 bIsStereoSynced; //!< Does the phase of the timing signal from the GPU = the phase of the master sync signal? + NvU32 bIsSyncSignalAvailable; //!< Is the sync signal available? +} NV_GSYNC_STATUS; + +//! Macro for constructing the version field for NV_GSYNC_STATUS. +#define NV_GSYNC_STATUS_VER MAKE_NVAPI_VERSION(NV_GSYNC_STATUS,1) + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GSync_GetSyncStatus +// +//! DESCRIPTION: This API queries the sync status of a GPU - timing, stereosync and sync signal availability. +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \since Release: 313 +//! +//! \param [in] hNvGSyncDevice- Handle of the Sync device +//! \param [in] hPhysicalGpu- GPU to be queried for sync status. +//! \param [out] *status- The caller provides the storage space. NvAPI_GSync_GetSyncStatus() populates *status with +//! values - timing, stereosync and signal availability. On error, *status is set to NULL. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! 
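+//! \par Usage sketch (illustrative, not from the original NVAPI documentation; hSyncDevice and
+//! hGpu are assumed to have been obtained earlier from NvAPI_GSync_EnumSyncDevices() and
+//! NvAPI_GSync_GetTopology() respectively):
+//! \code
+//! NV_GSYNC_STATUS syncStatus = { 0 };
+//! syncStatus.version = NV_GSYNC_STATUS_VER;   // the version field must be set before the call
+//! if (NvAPI_GSync_GetSyncStatus(hSyncDevice, hGpu, &syncStatus) == NVAPI_OK)
+//! {
+//!     // syncStatus.bIsSynced, bIsStereoSynced and bIsSyncSignalAvailable are now valid
+//! }
+//! \endcode
+//!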
If there are return error codes with specific meaning for this API, they are listed below. +//! \retval ::NVAPI_INVALID_ARGUMENT hNvGSyncDevice is NULL / SyncTarget is NULL. +//! \retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND The queried Graphics system does not have any G-Sync Device. +//! +//! \ingroup gsyncapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GSync_GetSyncStatus(__in NvGSyncDeviceHandle hNvGSyncDevice, __in NvPhysicalGpuHandle hPhysicalGpu, __inout NV_GSYNC_STATUS *status); + + +//! \ingroup gsyncapi + +#define NVAPI_MAX_RJ45_PER_GSYNC 2 + +//! Used in NV_GSYNC_STATUS_PARAMS. +typedef enum _NVAPI_GSYNC_RJ45_IO +{ + NVAPI_GSYNC_RJ45_OUTPUT = 0, + NVAPI_GSYNC_RJ45_INPUT = 1, + NVAPI_GSYNC_RJ45_UNUSED = 2 //!< This field is used to notify that the framelock is not actually present. + +} NVAPI_GSYNC_RJ45_IO; + +//! \ingroup gsyncapi +//! Used in NvAPI_GSync_GetStatusParameters(). +typedef struct _NV_GSYNC_STATUS_PARAMS +{ + NvU32 version; + NvU32 refreshRate; //!< The refresh rate + NVAPI_GSYNC_RJ45_IO RJ45_IO[NVAPI_MAX_RJ45_PER_GSYNC]; //!< Configured as input / output + NvU32 RJ45_Ethernet[NVAPI_MAX_RJ45_PER_GSYNC]; //!< Connected to ethernet hub? [ERRONEOUSLY CONNECTED!] + NvU32 houseSyncIncoming; //!< Incoming house sync frequency in Hz + NvU32 bHouseSync; //!< Is house sync connected? +} NV_GSYNC_STATUS_PARAMS; + + +//! \ingroup gsyncapi +//! Macro for constructing the version field of NV_GSYNC_STATUS_PARAMS +#define NV_GSYNC_STATUS_PARAMS_VER MAKE_NVAPI_VERSION(NV_GSYNC_STATUS_PARAMS,1) + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GSync_GetStatusParameters +// +//! DESCRIPTION: This API queries for sync status parameters as defined in NV_GSYNC_STATUS_PARAMS. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 313 +//! +//! \param [in] hNvGSyncDevice The caller provides the handle of the GSync device for which to get parameters +//! \param [out] *pStatusParams The caller provides the storage space. NvAPI_GSync_GetStatusParameters populates *pStatusParams with +//! values. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, they are listed below. +//! \retval ::NVAPI_INVALID_ARGUMENT hNvGSyncDevice is NULL / pStatusParams is NULL. +//! \retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND The queried Graphics system does not have any GSync Device. +//! +//! \ingroup gsyncapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GSync_GetStatusParameters(NvGSyncDeviceHandle hNvGSyncDevice, NV_GSYNC_STATUS_PARAMS *pStatusParams); + +//! @} + + + + + + + +#if defined(_D3D9_H_) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D9_RegisterResource +// +//! DESCRIPTION: This API binds a resource (surface/texture) so that it can be retrieved +//! internally by NVAPI. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! \param [in] pResource surface/texture +//! +//! \return ::NVAPI_OK, ::NVAPI_ERROR +//! +//! 
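+//! \par Usage sketch (illustrative, not from the original NVAPI documentation; pSurface is
+//! assumed to be an existing IDirect3DSurface9*, which derives from IDirect3DResource9):
+//! \code
+//! if (NvAPI_D3D9_RegisterResource(pSurface) == NVAPI_OK)
+//! {
+//!     // ... use the surface with NVAPI entry points that require registered resources ...
+//!     NvAPI_D3D9_UnregisterResource(pSurface);   // unbind the resource again after use
+//! }
+//! \endcode
+//!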
\ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D9_RegisterResource(IDirect3DResource9* pResource); +#endif //defined(_D3D9_H_) +#if defined(_D3D9_H_) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D9_UnregisterResource +// +//! DESCRIPTION: This API unbinds a resource (surface/texture) after use. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] pResource surface/texture +//! +//! \return ::NVAPI_OK, ::NVAPI_ERROR +//! +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D9_UnregisterResource(IDirect3DResource9* pResource); + +#endif //defined(_D3D9_H_) + + + +#if defined(_D3D9_H_) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D9_AliasSurfaceAsTexture +// +//! \fn NvAPI_D3D9_AliasSurfaceAsTexture(IDirect3DDevice9* pDev, +//! IDirect3DSurface9* pSurface, +//! IDirect3DTexture9 **ppTexture, +//! DWORD dwFlag); +//! DESCRIPTION: Create a texture that is an alias of a surface registered with NvAPI. The +//! new texture can be bound with IDirect3DDevice9::SetTexture(). Note that the texture must +//! be unbound before drawing to the surface again. +//! Unless the USE_SUPER flag is passed, MSAA surfaces will be resolved before +//! being used as a texture. MSAA depth buffers are resolved with a point filter, +//! and non-depth MSAA surfaces are resolved with a linear filter. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] pDev The D3D device that owns the objects +//! \param [in] pSurface Pointer to a surface that has been registered with NvAPI +//! to which a texture alias is to be provided +//! \param [out] ppTexture Fill with the texture created +//! \param [in] dwFlag NVAPI_ALIAS_SURFACE_FLAG to describe how to handle the texture +//! +//! \retval ::NVAPI_OK completed request +//! \retval ::NVAPI_INVALID_POINTER A null pointer was passed as an argument +//! \retval ::NVAPI_INVALID_ARGUMENT One of the arguments was invalid, probably dwFlag. +//! \retval ::NVAPI_UNREGISTERED_RESOURCE pSurface has not been registered with NvAPI +//! \retval ::NVAPI_ERROR error occurred +// +/////////////////////////////////////////////////////////////////////////////// + + +//! \ingroup dx +//! See NvAPI_D3D9_AliasSurfaceAsTexture(). +typedef enum { + NVAPI_ALIAS_SURFACE_FLAG_NONE = 0x00000000, + NVAPI_ALIAS_SURFACE_FLAG_USE_SUPER = 0x00000001, //!< Use the surface's msaa buffer directly as a texture, rather than resolving. (This is much slower, but potentially has higher quality.) + NVAPI_ALIAS_SURFACE_FLAG_MASK = 0x00000001 +} NVAPI_ALIAS_SURFACE_FLAG; + + +//! \ingroup dx +NVAPI_INTERFACE NvAPI_D3D9_AliasSurfaceAsTexture(IDirect3DDevice9* pDev, + IDirect3DSurface9* pSurface, + IDirect3DTexture9 **ppTexture, + DWORD dwFlag); +#endif //defined(_D3D9_H_) +#if defined(_D3D9_H_) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D9_StretchRectEx +// +//! DESCRIPTION: This API copies the contents of the source resource to the destination +//! resource. This function can convert +//! between a wider range of surfaces than +//! IDirect3DDevice9::StretchRect. For example, it can copy +//! from a depth/stencil surface to a texture. +//! +//! The source and destination resources *must* be registered +//! 
with NvAPI before being used with NvAPI_D3D9_StretchRectEx(). +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] pDevice The D3D device that owns the objects. +//! \param [in] pSourceResource Pointer to the source resource. +//! \param [in] pSrcRect Defines the rectangle on the source to copy from. If NULL, copy from the entire resource. +//! \param [in] pDestResource Pointer to the destination resource. +//! \param [in] pDstRect Defines the rectangle on the destination to copy to. If NULL, copy to the entire resource. +//! \param [in] Filter Choose a filtering method: D3DTEXF_NONE, D3DTEXF_POINT, D3DTEXF_LINEAR. +//! +//! \retval ::NVAPI_OK completed request +//! \retval ::NVAPI_INVALID_POINTER An invalid pointer was passed as an argument (probably NULL) +//! \retval ::NVAPI_INVALID_ARGUMENT One of the arguments was invalid +//! \retval ::NVAPI_UNREGISTERED_RESOURCE a resource was passed in without being registered +//! \retval ::NVAPI_ERROR error occurred +//! +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D9_StretchRectEx(IDirect3DDevice9 * pDevice, + IDirect3DResource9 * pSourceResource, + CONST RECT * pSourceRect, + IDirect3DResource9 * pDestResource, + CONST RECT * pDestRect, + D3DTEXTUREFILTERTYPE Filter); + +#endif //defined(_D3D9_H_) +#if defined(_D3D9_H_) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D9_ClearRT +// +//! DESCRIPTION: This API Clears the currently bound render target(s) with the +//! given color +//! +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] pDevice The D3D device that owns the objects. +//! \param [in] dwNumRects The no of rectangles to clear. If 0, clear the entire surface (clipped to viewport) +//! \param [in] pRects Defines the rectangles to clear. Should be NULL if dwNumRects == 0 +//! \param [in] r red component of the clear color +//! \param [in] g green component of the clear color +//! \param [in] b blue component of the clear color +//! \param [in] a alpha component of the clear color +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D9_ClearRT(IDirect3DDevice9 * pDevice, + NvU32 dwNumRects, + CONST RECT * pRects, + float r, float g, float b, float a); +#endif //if defined(_D3D9_H_) + + + + + + + + + + +#if defined(_D3D9_H_) && defined(__cplusplus) +//! SUPPORTED OS: Windows XP and higher +//! + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D9_GetSurfaceHandle +// +//! This function gets the handle of a given surface. This handle uniquely +//! identifies the surface through all NvAPI entries. +//! +//! +//! \since Release: 313 +//! +//! \param [in] pSurface Surface to be identified +//! \param [out] pHandle Will be filled by the return handle +//! +//! \return An int which could be an NvAPI status or DX HRESULT code +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D9_GetSurfaceHandle(IDirect3DSurface9 *pSurface, + NVDX_ObjectHandle *pHandle); + +#endif //defined(_D3D9_H_) && defined(__cplusplus) + +#if defined(_D3D9_H_) && defined(__cplusplus) +//! SUPPORTED OS: Windows Vista and higher +//! +//! \addtogroup dxvidcontrol +//! 
@{ + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION_NAME: NvAPI_D3D9_VideoSetStereoInfo +// +//! \fn NvAPI_D3D9_VideoSetStereoInfo(IDirect3DDevice9 *pDev, +//! NV_DX_VIDEO_STEREO_INFO *pStereoInfo); +//! \code +//! DESCRIPTION: This api specifies the stereo format of a surface, so that the +//! surface could be used for stereo video processing or compositing. +//! In particular, this api could be used to link the left and right +//! views of a decoded picture. +//! +//! \since Release: 313 +//! +//! INPUT: pDev - The device on which the stereo surface will be used +//! pStereoInfo - The stereo format of the surface +//! +//! RETURN STATUS: an int which could be an NvAPI status or DX HRESULT code +//! \endcode +/////////////////////////////////////////////////////////////////////////////// + +#ifndef NV_STEREO_VIDEO_FORMAT_DEFINE +#define NV_STEREO_VIDEO_FORMAT_DEFINE + + +typedef enum _NV_STEREO_VIDEO_FORMAT +{ + NV_STEREO_VIDEO_FORMAT_NOT_STEREO = 0, + + NV_STEREO_VIDEO_FORMAT_SIDE_BY_SIDE_LR = 1, + NV_STEREO_VIDEO_FORMAT_SIDE_BY_SIDE_RL = 2, + NV_STEREO_VIDEO_FORMAT_TOP_BOTTOM_LR = 3, + NV_STEREO_VIDEO_FORMAT_TOP_BOTTOM_RL = 4, + NV_STEREO_VIDEO_FORMAT_ROW_INTERLEAVE_LR = 5, + NV_STEREO_VIDEO_FORMAT_ROW_INTERLEAVE_RL = 6, + NV_STEREO_VIDEO_FORMAT_TWO_FRAMES_LR = 7, + NV_STEREO_VIDEO_FORMAT_MONO_PLUS_OFFSET = 8, + + NV_STEREO_VIDEO_FORMAT_LAST = 9, +} NV_STEREO_VIDEO_FORMAT; + +#endif // NV_STEREO_VIDEO_FORMAT_DEFINE + + +typedef struct _NV_DX_VIDEO_STEREO_INFO { + NvU32 dwVersion; //!< Must be NV_DX_VIDEO_STEREO_INFO_VER + NVDX_ObjectHandle hSurface; //!< The surface whose stereo format is to be set + NVDX_ObjectHandle hLinkedSurface; //!< The linked surface (must be valid when eFormat==NV_STEREO_VIDEO_FORMAT_TWO_FRAMES_LR) + NV_STEREO_VIDEO_FORMAT eFormat; //!< Stereo format of the surface + NvS32 sViewOffset; //!< Signed offset of each view (positive offset indicating left view is shifted left) + BOOL bStereoEnable; //!< Whether stereo rendering should be enabled (if FALSE, only left view will be used) +} NV_DX_VIDEO_STEREO_INFO; + +//! Macro for constructing the version field of ::NV_DX_VIDEO_STEREO_INFO +#define NV_DX_VIDEO_STEREO_INFO_VER MAKE_NVAPI_VERSION(NV_DX_VIDEO_STEREO_INFO,1) + +NVAPI_INTERFACE NvAPI_D3D9_VideoSetStereoInfo(IDirect3DDevice9 *pDev, + NV_DX_VIDEO_STEREO_INFO *pStereoInfo); + +//! @} +#endif //defined(_D3D9_H_) && defined(__cplusplus) + + + + + + +#if defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__)) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D11_IsNvShaderExtnOpCodeSupported +// +//! DESCRIPTION: This function checks if a nv HLSL shader extension opcode is +//! supported on current hardware. List of opcodes is in nvShaderExtnEnums.h +//! To use Nvidia HLSL extensions the application must include nvHLSLExtns.h +//! in the hlsl shader code. See nvHLSLExtns.h for more details on supported opcodes. +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] pDev The device on which to query for support, +//! should be a ID3D11Device+ device +//! \param [in] opCode the opcode to check +//! \param [out] pSupported true if supported, false otherwise +//! +//! RETURN STATUS: This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, they are listed below. +//! \retval :: NVAPI_OK if the call succeeded +//! +//! 
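+//! \par Usage sketch (illustrative, not from the original NVAPI documentation; pDevice is an
+//! existing ID3D11Device* and opCode is one of the values listed in nvShaderExtnEnums.h):
+//! \code
+//! bool supported = false;
+//! if (NvAPI_D3D11_IsNvShaderExtnOpCodeSupported(pDevice, opCode, &supported) == NVAPI_OK
+//!     && supported)
+//! {
+//!     // Reserve a fake UAV slot for the HLSL extensions (u7 here, chosen by the application),
+//!     // then create shaders that include nvHLSLExtns.h and bind a null UAV to that slot.
+//!     NvAPI_D3D11_SetNvShaderExtnSlot(pDevice, 7);
+//! }
+//! \endcode
+//!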
\ingroup dx
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_D3D11_IsNvShaderExtnOpCodeSupported(__in IUnknown *pDev,
+                                                          __in NvU32 opCode,
+                                                          __out bool *pSupported);
+
+#endif //defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__))
+
+#if defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__))
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME: NvAPI_D3D11_SetNvShaderExtnSlot
+//
+//! DESCRIPTION: This function sets the fake UAV slot that is used by Nvidia HLSL
+//!              shader extensions. All createShader calls made to the driver after
+//!              setting this slot would treat writes/reads to this UAV in a
+//!              different way. Applications are expected to bind a null UAV to this slot.
+//!              The same slot is used for all shader stages.
+//!              To disable shader extensions the app may set this UAV slot
+//!              to some value that is bigger than the max allowed slot index,
+//!              e.g., 128 or 0xFFFFFFFF.
+//!              To use Nvidia HLSL extensions the application must include nvHLSLExtns.h
+//!              in the HLSL shader code. See nvHLSLExtns.h for more details.
+//!
+//! SUPPORTED OS: Windows Vista and higher
+//!
+//!
+//! \param [in]  pDev        The device for which to set the extension slot;
+//!                          should be an ID3D11Device+ device
+//! \param [in]  uavSlot     the UAV slot to use
+//!
+//! RETURN STATUS: This API can return any of the error codes enumerated in #NvAPI_Status.
+//!                If there are return error codes with specific meaning for this API, they are listed below.
+//! \retval ::NVAPI_OK : success, the uavSlot was set successfully
+//!
+//! \ingroup dx
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_D3D11_SetNvShaderExtnSlot(__in IUnknown *pDev,
+                                                __in NvU32 uavSlot);
+
+#endif //defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__))
+
+
+#if defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__))
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME: NvAPI_D3D11_BeginUAVOverlapEx
+//
+//! DESCRIPTION: Causes the driver to skip synchronization that is normally needed when accessing UAVs.
+//!              Applications must use this with caution, otherwise this might cause data hazards when
+//!              multiple draw calls/compute shader launches are accessing the same memory locations.
+//!
+//! SUPPORTED OS: Windows Vista and higher
+//!
+//!
+//! \param [in]  *pDeviceOrContext    pointer to D3D11 device, or D3D11 device context
+//! \param [in]  insertWFIFlags       bit fields to indicate which WFI would be inserted (gfx / compute / both).
+//!
+//! RETURN STATUS: This API can return any of the error codes enumerated in #NvAPI_Status.
+//!                If there are return error codes with specific meaning for this API, they are listed below.
+//!
+//!
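+//! \par Usage sketch (illustrative, not from the original NVAPI documentation; pContext is an
+//! existing ID3D11DeviceContext*, groupsX is an application-chosen dispatch size, and the two
+//! dispatches are known by the application to touch disjoint regions of the shared UAV):
+//! \code
+//! NvAPI_D3D11_BeginUAVOverlapEx(pContext, NVAPI_D3D_BEGIN_UAV_OVERLAP_NO_WFI);
+//! pContext->Dispatch(groupsX, 1, 1);   // pass 1, writes one region of the UAV
+//! pContext->Dispatch(groupsX, 1, 1);   // pass 2, writes a different region; may overlap pass 1
+//! NvAPI_D3D11_EndUAVOverlap(pContext); // restore normal UAV synchronization
+//! \endcode
+//!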
\ingroup dx +/////////////////////////////////////////////////////////////////////////////// +typedef enum _NVAPI_D3D11_INSERTWFI_FLAG +{ + NVAPI_D3D_BEGIN_UAV_OVERLAP_NO_WFI = 0x00000000, //!< no WFI + NVAPI_D3D_BEGIN_UAV_OVERLAP_GFX_WFI = 0x00000001, //!< (bit 0) force graphics WFI + NVAPI_D3D_BEGIN_UAV_OVERLAP_COMP_WFI = 0x00000002, //!< (bit 1) force compute WFI +} NVAPI_D3D11_INSERTWFI_FLAG; + +NVAPI_INTERFACE NvAPI_D3D11_BeginUAVOverlapEx(__in IUnknown *pDeviceOrContext, __in NvU32 insertWFIFlags); + +#endif //defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__)) + +#if defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__)) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D11_BeginUAVOverlap +// +//! DESCRIPTION: Causes the driver to skip synchronization that is normally needed when accessing UAVs. +//! Applications must use this with caution otherwise this might cause data hazards when +//! multiple draw calls/compute shader launches are accessing same memory locations +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] *pDeviceOrContext pointer to D3D11 device, or D3D11 device context +//! +//! RETURN STATUS: This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, they are listed below. +//! +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D11_BeginUAVOverlap(__in IUnknown *pDeviceOrContext); + +#endif //defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__)) + +#if defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__)) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D11_EndUAVOverlap +// +//! DESCRIPTION: Re-enables driver synchronization between calls that access same UAVs +//! See NvAPI_D3D_BeginUAVOverlap for more details. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] *pDeviceOrContext pointer to D3D11 device, or D3D11 device context +//! +//! RETURN STATUS: This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, they are listed below. +//! +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D11_EndUAVOverlap(__in IUnknown *pDeviceOrContext); + +#endif //defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__)) + +#if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d10_1_h__) || defined(__d3d11_h__) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D_SetFPSIndicatorState +// +//! DESCRIPTION: Display an overlay that tracks the number of times the app presents per second, or, +//! the number of frames-per-second (FPS) +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] bool Whether or not to enable the fps indicator. +//! +//! \return ::NVAPI_OK, +//! ::NVAPI_ERROR +//! +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D_SetFPSIndicatorState(IUnknown *pDev, NvU8 doEnable); + +#endif //if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d10_1_h__) || defined(__d3d11_h__) + +//! SUPPORTED OS: Windows Vista and higher +//! 
+#if defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__) || defined(__d3d12_h__))
+
+enum NVAPI_QUAD_FILLMODE
+{
+    NVAPI_QUAD_FILLMODE_DISABLED = 0,
+    NVAPI_QUAD_FILLMODE_BBOX = 1,
+    NVAPI_QUAD_FILLMODE_FULL_VIEWPORT = 2,
+};
+
+#endif //defined(__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__) || defined(__d3d12_h__))
+
+//! SUPPORTED OS: Windows Vista and higher
+//!
+#if defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__))
+
+typedef struct NvAPI_D3D11_RASTERIZER_DESC_EX
+{
+    // D3D11_RASTERIZER_DESC member variables
+    D3D11_FILL_MODE FillMode;
+    D3D11_CULL_MODE CullMode;
+    BOOL FrontCounterClockwise;
+    INT DepthBias;
+    FLOAT DepthBiasClamp;
+    FLOAT SlopeScaledDepthBias;
+    BOOL DepthClipEnable;
+    BOOL ScissorEnable;
+    BOOL MultisampleEnable;
+    BOOL AntialiasedLineEnable;
+
+    // NvAPI_D3D11_RASTERIZER_DESC_EX specific member variables
+    NvU32 ForcedSampleCount;     //!< In TIR mode it needs to match N, in non-TIR it needs to match the RT sample count. Ignored if ForcePerSampleInterlock is set
+    NvU8 SamplePositionsX[16];   //!< x positions of the programmable sample positions
+    NvU8 SamplePositionsY[16];   //!< y positions of the programmable sample positions
+    // ... (the remaining members of NvAPI_D3D11_RASTERIZER_DESC_EX, its closing brace, and the
+    //      ID3D11MultiGPUDevice interface declaration are truncated in this hunk) ...
+
+#define FENCE_SYNCHRONIZATION_START(pMultiGPUDevice, hFence, Value, srcGpu, dstGpu) \
+    pMultiGPUDevice->SetFence(dstGpu, hFence, Value); \
+    pMultiGPUDevice->WaitForFence(1 << (srcGpu), hFence, Value); \
+    Value++;
+
+#define FENCE_SYNCHRONIZATION_END(pMultiGPUDevice, hFence, Value, srcGpu, dstGpu) \
+    pMultiGPUDevice->SetFence(srcGpu, hFence, Value); \
+    pMultiGPUDevice->WaitForFence(1 << (dstGpu), hFence, Value); \
+    Value++;
+
+//! PresentCompositingConfig method flags.
+#define NVAPI_PRESENT_COMPOSITING_CONFIG_FLAG_USE_VIDEO_BRIDGE         0x01
+#define NVAPI_PRESENT_COMPOSITING_CONFIG_FLAG_CLEAR_OUTBANDS           0x02
+#define NVAPI_PRESENT_COMPOSITING_CONFIG_FLAG_GET_VIDEO_BRIDGE_STATUS  0x80000000
+
+#define NVAPI_VIDEO_BRIDGE_STATUS_AVAILABLE      0
+#define NVAPI_VIDEO_BRIDGE_STATUS_NOT_AVAILABLE  1
+#define NVAPI_VIDEO_BRIDGE_STATUS_FAILED_ACCESS  2
+#define NVAPI_VIDEO_BRIDGE_STATUS_UNKNOWN        3
+
+#define NVAPI_ALL_GPUS 0
+typedef ID3D11MultiGPUDevice_V1 ID3D11MultiGPUDevice;
+
+#define ID3D11MultiGPUDevice_VER1 MAKE_NVAPI_VERSION(ID3D11MultiGPUDevice_V1, 1)
+#define ID3D11MultiGPUDevice_VER2 MAKE_NVAPI_VERSION(ID3D11MultiGPUDevice_V1, 2)
+#define ID3D11MultiGPUDevice_VER  ID3D11MultiGPUDevice_VER2
+
+#define ALL_GPUS 0
+
+//! \ingroup dx
+NVAPI_INTERFACE NvAPI_D3D11_CreateMultiGPUDevice(__in ID3D11Device *pDevice, __in ULONG version, __out ULONG *currentVersion, __out ID3D11MultiGPUDevice **ppD3D11MultiGPUDevice, __in UINT maxGpus=ALL_GPUS);
+
+#endif //defined(__cplusplus) && defined(__d3d11_h__)
+
+#if defined (__cplusplus) && (defined(__d3d10_h__) || defined(__d3d10_1_h__) || defined(__d3d11_h__))
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME: NvAPI_D3D_RegisterDevice
+//
+//! DESCRIPTION: Tells NvAPI about a D3D device. This must be called prior to using any DX1x
+//!              deferred-context calls.
+//!
+//! SUPPORTED OS: Windows Vista and higher
+//!
+//!
+//! \param [in]  pDev  The ID3D10Device or ID3D11Device to use.
+//!
+//! RETURN STATUS: This API can return any of the error codes enumerated in #NvAPI_Status.
+//!                If there are return error codes with specific meaning for this API, they are listed below.
+//!
+//!
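+//! \par Usage sketch (illustrative, not from the original NVAPI documentation; pDevice is an
+//! existing ID3D11Device* created by the application):
+//! \code
+//! // Register the device once, right after creation, before making any NvAPI call on a
+//! // deferred context (e.g. NvAPI_D3D11_MultiDrawInstancedIndirect on a deferred context).
+//! NvAPI_D3D_RegisterDevice(pDevice);
+//! \endcode
+//!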
\ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D_RegisterDevice(__in IUnknown *pDev); + +#endif //if defined(__cplusplus) && (defined(__d3d10_h__) || defined(__d3d10_1_h__) || defined(__d3d11_h__)) + + + +#if defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__)) + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D11_MultiDrawInstancedIndirect +// +//! DESCRIPTION: Extension of DrawInstancedIndirect that takes a draw count in. The effect of this function is to loop over +//! that draw count and perform the DrawInstancedIndirect operation each time, incrementing the buffer offset +//! by the supplied stride each time. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] *pDevContext11 Pointer to D3D11 device context (IC or DC) +//! \param [in] drawCount Do DrawInstancedIndirect operation this many times +//! \param [in] *pBuffer ID3D11Buffer that contains the command parameters +//! \param [in] alignedByteOffsetForArgs Start in pBuffer of the command parameters +//! \param [in] alignedByteStrideForArgs Stride of the command parameters - must be >= 4 * sizeof(NvU32) +//! +//! RETURN STATUS: This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, they are listed below. +//! +//! \retval NVAPI_D3D_DEVICE_NOT_REGISTERED When MultiDraw is called on a deferred context, and the device has not yet +//! been registered (NvAPI_D3D_RegisterDevice), this error is returned. +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// + +NVAPI_INTERFACE NvAPI_D3D11_MultiDrawInstancedIndirect(__in ID3D11DeviceContext *pDevContext11, + __in NvU32 drawCount, + __in ID3D11Buffer *pBuffer, + __in NvU32 alignedByteOffsetForArgs, + __in NvU32 alignedByteStrideForArgs); + +#endif //defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__)) + + +#if defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__)) + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D11_MultiDrawIndexedInstancedIndirect +// +//! DESCRIPTION: Extension of DrawIndexedInstancedIndirect that takes a draw count in. The effect of this function is to loop over +//! that draw count and perform the DrawIndexedInstancedIndirect operation each time, incrementing the buffer offset +//! by the supplied stride each time. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] *pDevContext11 Pointer to D3D11 device context (IC or DC) +//! \param [in] drawCount Do DrawIndexedInstancedIndirect operation this many times +//! \param [in] *pBuffer ID3D11Buffer that contains the command parameters +//! \param [in] alignedByteOffsetForArgs Start in pBuffer of the command parameters +//! \param [in] alignedByteStrideForArgs Stride of the command parameters - must be >= 5 * sizeof(NvU32) +//! +//! RETURN STATUS: This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, they are listed below. +//! +//! \retval NVAPI_D3D_DEVICE_NOT_REGISTERED When MultiDraw is called on a deferred context, and the device has not yet +//! been registered (NvAPI_D3D_RegisterDevice), this error is returned. +//! 
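+//! \par Usage sketch (illustrative, not from the original NVAPI documentation; pContext is an
+//! ID3D11DeviceContext*, drawCount is the number of draws, and pArgsBuffer is an
+//! application-created ID3D11Buffer holding drawCount packed argument records of five NvU32
+//! values each, i.e. the DrawIndexedInstancedIndirect argument layout):
+//! \code
+//! const NvU32 argStride = 5 * sizeof(NvU32);   // one indexed-instanced indirect record per draw
+//! NvAPI_D3D11_MultiDrawIndexedInstancedIndirect(pContext,
+//!                                               drawCount,    // loop this many times
+//!                                               pArgsBuffer,
+//!                                               0,            // byte offset of the first record
+//!                                               argStride);   // must be >= 5 * sizeof(NvU32)
+//! \endcode
+//!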
\ingroup dx +/////////////////////////////////////////////////////////////////////////////// + +NVAPI_INTERFACE NvAPI_D3D11_MultiDrawIndexedInstancedIndirect(__in ID3D11DeviceContext *pDevContext11, + __in NvU32 drawCount, + __in ID3D11Buffer *pBuffer, + __in NvU32 alignedByteOffsetForArgs, + __in NvU32 alignedByteStrideForArgs); + +#endif //defined (__cplusplus) && (defined(__d3d11_h__) || defined(__d3d11_1_h__)) + +//! SUPPORTED OS: Windows 7 and higher +//! +#if defined (__cplusplus) && ( defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d10_1_h__) ||defined(__d3d11_h__) ) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D_ImplicitSLIControl +// +//! This function enables/disables the SLI rendering mode. It has to be called prior to D3D device creation. Once this function is called with DISABLE_IMPLICIT_SLI +//! parameter all subsequently created devices will be forced to run in a single gpu mode until the same function is called with ENABLE_IMPLICIT_SLI parameter. The enable +//! call will force all subsequently created devices to run in default implicit SLI mode being determined by an application profile or a global control panel SLI setting. +//! This NvAPI call is supported in all DX10+ versions of the driver. It is supported on all Windows versions. +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Error occurred +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup dx +typedef enum _IMPLICIT_SLI_CONTROL +{ + DISABLE_IMPLICIT_SLI = 0, + ENABLE_IMPLICIT_SLI = 1, +} IMPLICIT_SLI_CONTROL; + +//! \ingroup dx +NVAPI_INTERFACE NvAPI_D3D_ImplicitSLIControl(__in IMPLICIT_SLI_CONTROL implicitSLIControl); + +#endif //defined (__cplusplus) && ( defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d10_1_h__) ||defined(__d3d11_h__) ) + +//! SUPPORTED OS: Windows Vista and higher +//! +#if defined(__cplusplus) && ( defined(__d3d10_h__) || defined(__d3d10_1_h__) || defined(__d3d11_h__) ) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D1x_GetLowLatencySupport +// +//! DESCRIPTION: Query support for low latency nodes +//! +//! +//! \param [in] adapterId The adapter ID that specifies the GPU to query. +//! \param [out] pIsLowLatencySupported Returns true if and only if low latency nodes are supported. +//! +//! \retval NVAPI_OK Call succeeded. +//! \retval NVAPI_ERROR Call failed. +//! \retval NVAPI_INVALID_ARGUMENT One or more arguments are invalid. +//! \retval NVAPI_INVALID_POINTER A NULL pointer was passed +//! +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D1x_GetLowLatencySupport(__in LUID pAdapterId, + __out BOOL *pIsLowLatencySupported); + +#endif //defined(__cplusplus) && ( defined(__d3d10_h__) || defined(__d3d10_1_h__) || defined(__d3d11_h__) ) + +///////////////////////////////////////////////////////////////////////// +// Video Input Output (VIO) API +///////////////////////////////////////////////////////////////////////// + + + +//! \ingroup vidio +//! Unique identifier for VIO owner (process identifier or NVVIOOWNERID_NONE) +typedef NvU32 NVVIOOWNERID; + + +//! \addtogroup vidio +//! @{ + + +#define NVVIOOWNERID_NONE 0 //!< Unregistered ownerId + + +//! 
Owner type for device +typedef enum _NVVIOOWNERTYPE +{ + NVVIOOWNERTYPE_NONE , //!< No owner for the device + NVVIOOWNERTYPE_APPLICATION , //!< Application owns the device + NVVIOOWNERTYPE_DESKTOP , //!< Desktop transparent mode owns the device (not applicable for video input) +}NVVIOOWNERTYPE; + +// Access rights for NvAPI_VIO_Open() + +//! Read access (not applicable for video output) +#define NVVIO_O_READ 0x00000000 + +//! Write exclusive access (not applicable for video input) +#define NVVIO_O_WRITE_EXCLUSIVE 0x00010001 + +//! +#define NVVIO_VALID_ACCESSRIGHTS (NVVIO_O_READ | \ + NVVIO_O_WRITE_EXCLUSIVE ) + + +//! VIO_DATA.ulOwnerID high-bit is set only if device has been initialized by VIOAPI +//! examined at NvAPI_GetCapabilities|NvAPI_VIO_Open to determine if settings need to be applied from registry or POR state read +#define NVVIO_OWNERID_INITIALIZED 0x80000000 + +//! VIO_DATA.ulOwnerID next-bit is set only if device is currently in exclusive write access mode from NvAPI_VIO_Open() +#define NVVIO_OWNERID_EXCLUSIVE 0x40000000 + +//! VIO_DATA.ulOwnerID lower bits are: +//! NVGVOOWNERTYPE_xxx enumerations indicating use context +#define NVVIO_OWNERID_TYPEMASK 0x0FFFFFFF //!< mask for NVVIOOWNERTYPE_xxx + + +//! @} + +//--------------------------------------------------------------------- +// Enumerations +//--------------------------------------------------------------------- + + +//! \addtogroup vidio +//! @{ + +//! Video signal format and resolution +typedef enum _NVVIOSIGNALFORMAT +{ + NVVIOSIGNALFORMAT_NONE, //!< Invalid signal format + NVVIOSIGNALFORMAT_487I_59_94_SMPTE259_NTSC, //!< 01 487i 59.94Hz (SMPTE259) NTSC + NVVIOSIGNALFORMAT_576I_50_00_SMPTE259_PAL, //!< 02 576i 50.00Hz (SMPTE259) PAL + NVVIOSIGNALFORMAT_1035I_60_00_SMPTE260, //!< 03 1035i 60.00Hz (SMPTE260) + NVVIOSIGNALFORMAT_1035I_59_94_SMPTE260, //!< 04 1035i 59.94Hz (SMPTE260) + NVVIOSIGNALFORMAT_1080I_50_00_SMPTE295, //!< 05 1080i 50.00Hz (SMPTE295) + NVVIOSIGNALFORMAT_1080I_60_00_SMPTE274, //!< 06 1080i 60.00Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080I_59_94_SMPTE274, //!< 07 1080i 59.94Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080I_50_00_SMPTE274, //!< 08 1080i 50.00Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080P_30_00_SMPTE274, //!< 09 1080p 30.00Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080P_29_97_SMPTE274, //!< 10 1080p 29.97Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080P_25_00_SMPTE274, //!< 11 1080p 25.00Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080P_24_00_SMPTE274, //!< 12 1080p 24.00Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080P_23_976_SMPTE274, //!< 13 1080p 23.976Hz (SMPTE274) + NVVIOSIGNALFORMAT_720P_60_00_SMPTE296, //!< 14 720p 60.00Hz (SMPTE296) + NVVIOSIGNALFORMAT_720P_59_94_SMPTE296, //!< 15 720p 59.94Hz (SMPTE296) + NVVIOSIGNALFORMAT_720P_50_00_SMPTE296, //!< 16 720p 50.00Hz (SMPTE296) + NVVIOSIGNALFORMAT_1080I_48_00_SMPTE274, //!< 17 1080I 48.00Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080I_47_96_SMPTE274, //!< 18 1080I 47.96Hz (SMPTE274) + NVVIOSIGNALFORMAT_720P_30_00_SMPTE296, //!< 19 720p 30.00Hz (SMPTE296) + NVVIOSIGNALFORMAT_720P_29_97_SMPTE296, //!< 20 720p 29.97Hz (SMPTE296) + NVVIOSIGNALFORMAT_720P_25_00_SMPTE296, //!< 21 720p 25.00Hz (SMPTE296) + NVVIOSIGNALFORMAT_720P_24_00_SMPTE296, //!< 22 720p 24.00Hz (SMPTE296) + NVVIOSIGNALFORMAT_720P_23_98_SMPTE296, //!< 23 720p 23.98Hz (SMPTE296) + NVVIOSIGNALFORMAT_2048P_30_00_SMPTE372, //!< 24 2048p 30.00Hz (SMPTE372) + NVVIOSIGNALFORMAT_2048P_29_97_SMPTE372, //!< 25 2048p 29.97Hz (SMPTE372) + NVVIOSIGNALFORMAT_2048I_60_00_SMPTE372, //!< 26 2048i 60.00Hz (SMPTE372) + 
NVVIOSIGNALFORMAT_2048I_59_94_SMPTE372, //!< 27 2048i 59.94Hz (SMPTE372) + NVVIOSIGNALFORMAT_2048P_25_00_SMPTE372, //!< 28 2048p 25.00Hz (SMPTE372) + NVVIOSIGNALFORMAT_2048I_50_00_SMPTE372, //!< 29 2048i 50.00Hz (SMPTE372) + NVVIOSIGNALFORMAT_2048P_24_00_SMPTE372, //!< 30 2048p 24.00Hz (SMPTE372) + NVVIOSIGNALFORMAT_2048P_23_98_SMPTE372, //!< 31 2048p 23.98Hz (SMPTE372) + NVVIOSIGNALFORMAT_2048I_48_00_SMPTE372, //!< 32 2048i 48.00Hz (SMPTE372) + NVVIOSIGNALFORMAT_2048I_47_96_SMPTE372, //!< 33 2048i 47.96Hz (SMPTE372) + + NVVIOSIGNALFORMAT_1080PSF_25_00_SMPTE274, //!< 34 1080PsF 25.00Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080PSF_29_97_SMPTE274, //!< 35 1080PsF 29.97Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080PSF_30_00_SMPTE274, //!< 36 1080PsF 30.00Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080PSF_24_00_SMPTE274, //!< 37 1080PsF 24.00Hz (SMPTE274) + NVVIOSIGNALFORMAT_1080PSF_23_98_SMPTE274, //!< 38 1080PsF 23.98Hz (SMPTE274) + + NVVIOSIGNALFORMAT_1080P_50_00_SMPTE274_3G_LEVEL_A, //!< 39 1080P 50.00Hz (SMPTE274) 3G Level A + NVVIOSIGNALFORMAT_1080P_59_94_SMPTE274_3G_LEVEL_A, //!< 40 1080P 59.94Hz (SMPTE274) 3G Level A + NVVIOSIGNALFORMAT_1080P_60_00_SMPTE274_3G_LEVEL_A, //!< 41 1080P 60.00Hz (SMPTE274) 3G Level A + + NVVIOSIGNALFORMAT_1080P_60_00_SMPTE274_3G_LEVEL_B, //!< 42 1080p 60.00Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_1080I_60_00_SMPTE274_3G_LEVEL_B, //!< 43 1080i 60.00Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_2048I_60_00_SMPTE372_3G_LEVEL_B, //!< 44 2048i 60.00Hz (SMPTE372) 3G Level B + NVVIOSIGNALFORMAT_1080P_50_00_SMPTE274_3G_LEVEL_B, //!< 45 1080p 50.00Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_1080I_50_00_SMPTE274_3G_LEVEL_B, //!< 46 1080i 50.00Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_2048I_50_00_SMPTE372_3G_LEVEL_B, //!< 47 2048i 50.00Hz (SMPTE372) 3G Level B + NVVIOSIGNALFORMAT_1080P_30_00_SMPTE274_3G_LEVEL_B, //!< 48 1080p 30.00Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_2048P_30_00_SMPTE372_3G_LEVEL_B, //!< 49 2048p 30.00Hz (SMPTE372) 3G Level B + NVVIOSIGNALFORMAT_1080P_25_00_SMPTE274_3G_LEVEL_B, //!< 50 1080p 25.00Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_2048P_25_00_SMPTE372_3G_LEVEL_B, //!< 51 2048p 25.00Hz (SMPTE372) 3G Level B + NVVIOSIGNALFORMAT_1080P_24_00_SMPTE274_3G_LEVEL_B, //!< 52 1080p 24.00Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_2048P_24_00_SMPTE372_3G_LEVEL_B, //!< 53 2048p 24.00Hz (SMPTE372) 3G Level B + NVVIOSIGNALFORMAT_1080I_48_00_SMPTE274_3G_LEVEL_B, //!< 54 1080i 48.00Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_2048I_48_00_SMPTE372_3G_LEVEL_B, //!< 55 2048i 48.00Hz (SMPTE372) 3G Level B + NVVIOSIGNALFORMAT_1080P_59_94_SMPTE274_3G_LEVEL_B, //!< 56 1080p 59.94Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_1080I_59_94_SMPTE274_3G_LEVEL_B, //!< 57 1080i 59.94Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_2048I_59_94_SMPTE372_3G_LEVEL_B, //!< 58 2048i 59.94Hz (SMPTE372) 3G Level B + NVVIOSIGNALFORMAT_1080P_29_97_SMPTE274_3G_LEVEL_B, //!< 59 1080p 29.97Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_2048P_29_97_SMPTE372_3G_LEVEL_B, //!< 60 2048p 29.97Hz (SMPTE372) 3G Level B + NVVIOSIGNALFORMAT_1080P_23_98_SMPTE274_3G_LEVEL_B, //!< 61 1080p 29.98Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_2048P_23_98_SMPTE372_3G_LEVEL_B, //!< 62 2048p 29.98Hz (SMPTE372) 3G Level B + NVVIOSIGNALFORMAT_1080I_47_96_SMPTE274_3G_LEVEL_B, //!< 63 1080i 47.96Hz (SMPTE274) 3G Level B + NVVIOSIGNALFORMAT_2048I_47_96_SMPTE372_3G_LEVEL_B, //!< 64 2048i 47.96Hz (SMPTE372) 3G Level B + + NVVIOSIGNALFORMAT_END //!< 65 To indicate end of signal format list + +}NVVIOSIGNALFORMAT; + +//! 
SMPTE standards format +typedef enum _NVVIOVIDEOSTANDARD +{ + NVVIOVIDEOSTANDARD_SMPTE259 , //!< SMPTE259 + NVVIOVIDEOSTANDARD_SMPTE260 , //!< SMPTE260 + NVVIOVIDEOSTANDARD_SMPTE274 , //!< SMPTE274 + NVVIOVIDEOSTANDARD_SMPTE295 , //!< SMPTE295 + NVVIOVIDEOSTANDARD_SMPTE296 , //!< SMPTE296 + NVVIOVIDEOSTANDARD_SMPTE372 , //!< SMPTE372 +}NVVIOVIDEOSTANDARD; + +//! HD or SD video type +typedef enum _NVVIOVIDEOTYPE +{ + NVVIOVIDEOTYPE_SD , //!< Standard-definition (SD) + NVVIOVIDEOTYPE_HD , //!< High-definition (HD) +}NVVIOVIDEOTYPE; + +//! Interlace mode +typedef enum _NVVIOINTERLACEMODE +{ + NVVIOINTERLACEMODE_PROGRESSIVE , //!< Progressive (p) + NVVIOINTERLACEMODE_INTERLACE , //!< Interlace (i) + NVVIOINTERLACEMODE_PSF , //!< Progressive Segment Frame (psf) +}NVVIOINTERLACEMODE; + +//! Video data format +typedef enum _NVVIODATAFORMAT +{ + NVVIODATAFORMAT_UNKNOWN = -1 , //!< Invalid DataFormat + NVVIODATAFORMAT_R8G8B8_TO_YCRCB444 , //!< R8:G8:B8 => YCrCb (4:4:4) + NVVIODATAFORMAT_R8G8B8A8_TO_YCRCBA4444 , //!< R8:G8:B8:A8 => YCrCbA (4:4:4:4) + NVVIODATAFORMAT_R8G8B8Z10_TO_YCRCBZ4444 , //!< R8:G8:B8:Z10 => YCrCbZ (4:4:4:4) + NVVIODATAFORMAT_R8G8B8_TO_YCRCB422 , //!< R8:G8:B8 => YCrCb (4:2:2) + NVVIODATAFORMAT_R8G8B8A8_TO_YCRCBA4224 , //!< R8:G8:B8:A8 => YCrCbA (4:2:2:4) + NVVIODATAFORMAT_R8G8B8Z10_TO_YCRCBZ4224 , //!< R8:G8:B8:Z10 => YCrCbZ (4:2:2:4) + NVVIODATAFORMAT_X8X8X8_444_PASSTHRU , //!< R8:G8:B8 => RGB (4:4:4) + NVVIODATAFORMAT_X8X8X8A8_4444_PASSTHRU , //!< R8:G8:B8:A8 => RGBA (4:4:4:4) + NVVIODATAFORMAT_X8X8X8Z10_4444_PASSTHRU , //!< R8:G8:B8:Z10 => RGBZ (4:4:4:4) + NVVIODATAFORMAT_X10X10X10_444_PASSTHRU , //!< Y10:CR10:CB10 => YCrCb (4:4:4) + NVVIODATAFORMAT_X10X8X8_444_PASSTHRU , //!< Y10:CR8:CB8 => YCrCb (4:4:4) + NVVIODATAFORMAT_X10X8X8A10_4444_PASSTHRU , //!< Y10:CR8:CB8:A10 => YCrCbA (4:4:4:4) + NVVIODATAFORMAT_X10X8X8Z10_4444_PASSTHRU , //!< Y10:CR8:CB8:Z10 => YCrCbZ (4:4:4:4) + NVVIODATAFORMAT_DUAL_R8G8B8_TO_DUAL_YCRCB422 , //!< R8:G8:B8 + R8:G8:B8 => YCrCb (4:2:2 + 4:2:2) + NVVIODATAFORMAT_DUAL_X8X8X8_TO_DUAL_422_PASSTHRU , //!< Y8:CR8:CB8 + Y8:CR8:CB8 => YCrCb (4:2:2 + 4:2:2) + NVVIODATAFORMAT_R10G10B10_TO_YCRCB422 , //!< R10:G10:B10 => YCrCb (4:2:2) + NVVIODATAFORMAT_R10G10B10_TO_YCRCB444 , //!< R10:G10:B10 => YCrCb (4:4:4) + NVVIODATAFORMAT_X12X12X12_444_PASSTHRU , //!< X12:X12:X12 => XXX (4:4:4) + NVVIODATAFORMAT_X12X12X12_422_PASSTHRU , //!< X12:X12:X12 => XXX (4:2:2) + NVVIODATAFORMAT_Y10CR10CB10_TO_YCRCB422 , //!< Y10:CR10:CB10 => YCrCb (4:2:2) + NVVIODATAFORMAT_Y8CR8CB8_TO_YCRCB422 , //!< Y8:CR8:CB8 => YCrCb (4:2:2) + NVVIODATAFORMAT_Y10CR8CB8A10_TO_YCRCBA4224 , //!< Y10:CR8:CB8:A10 => YCrCbA (4:2:2:4) + NVVIODATAFORMAT_R10G10B10_TO_RGB444 , //!< R10:G10:B10 => RGB (4:4:4) + NVVIODATAFORMAT_R12G12B12_TO_YCRCB444 , //!< R12:G12:B12 => YCrCb (4:4:4) + NVVIODATAFORMAT_R12G12B12_TO_YCRCB422 , //!< R12:G12:B12 => YCrCb (4:2:2) +}NVVIODATAFORMAT; + +//! Video output area +typedef enum _NVVIOOUTPUTAREA +{ + NVVIOOUTPUTAREA_FULLSIZE , //!< Output to entire video resolution (full size) + NVVIOOUTPUTAREA_SAFEACTION , //!< Output to centered 90% of video resolution (safe action) + NVVIOOUTPUTAREA_SAFETITLE , //!< Output to centered 80% of video resolution (safe title) +}NVVIOOUTPUTAREA; + +//! Synchronization source +typedef enum _NVVIOSYNCSOURCE +{ + NVVIOSYNCSOURCE_SDISYNC , //!< SDI Sync (Digital input) + NVVIOSYNCSOURCE_COMPSYNC , //!< COMP Sync (Composite input) +}NVVIOSYNCSOURCE; + +//! 
Composite synchronization type
+typedef enum _NVVIOCOMPSYNCTYPE
+{
+    NVVIOCOMPSYNCTYPE_AUTO ,      //!< Auto-detect
+    NVVIOCOMPSYNCTYPE_BILEVEL ,   //!< Bi-level signal
+    NVVIOCOMPSYNCTYPE_TRILEVEL ,  //!< Tri-level signal
+}NVVIOCOMPSYNCTYPE;
+
+//! Video input output status
+typedef enum _NVVIOINPUTOUTPUTSTATUS
+{
+    NVINPUTOUTPUTSTATUS_OFF ,     //!< Not in use
+    NVINPUTOUTPUTSTATUS_ERROR ,   //!< Error detected
+    NVINPUTOUTPUTSTATUS_SDI_SD ,  //!< SDI (standard-definition)
+    NVINPUTOUTPUTSTATUS_SDI_HD ,  //!< SDI (high-definition)
+}NVVIOINPUTOUTPUTSTATUS;
+
+//! Synchronization input status
+typedef enum _NVVIOSYNCSTATUS
+{
+    NVVIOSYNCSTATUS_OFF ,         //!< Sync not detected
+    NVVIOSYNCSTATUS_ERROR ,       //!< Error detected
+    NVVIOSYNCSTATUS_SYNCLOSS ,    //!< Genlock in use, format mismatch with output
+    NVVIOSYNCSTATUS_COMPOSITE ,   //!< Composite sync
+    NVVIOSYNCSTATUS_SDI_SD ,      //!< SDI sync (standard-definition)
+    NVVIOSYNCSTATUS_SDI_HD ,      //!< SDI sync (high-definition)
+}NVVIOSYNCSTATUS;
+
+//! Video Capture Status
+typedef enum _NVVIOCAPTURESTATUS
+{
+    NVVIOSTATUS_STOPPED ,         //!< Capture is stopped
+    NVVIOSTATUS_RUNNING ,         //!< Capture is running
+    NVVIOSTATUS_ERROR ,           //!< Capture error detected
+}NVVIOCAPTURESTATUS;
+
+//! Status type (input or output)
+typedef enum _NVVIOSTATUSTYPE
+{
+    NVVIOSTATUSTYPE_IN ,          //!< Input Status
+    NVVIOSTATUSTYPE_OUT ,         //!< Output Status
+}NVVIOSTATUSTYPE;
+
+
+//! Assumption: a maximum of 4 SDI input and 4 SDI output cards are supported on a system
+#define NVAPI_MAX_VIO_DEVICES                 8
+
+//! 4 physical jacks supported on each SDI input card.
+#define NVAPI_MAX_VIO_JACKS                   4
+
+
+//! Each physical jack on an SDI input card can have
+//! two "channels" in the case of "3G" VideoFormats, as specified
+//! by SMPTE 425; for non-3G VideoFormats, only the first channel within
+//! a physical jack is valid.
+#define NVAPI_MAX_VIO_CHANNELS_PER_JACK       2
+
+//! 4 Streams, 1 per physical jack
+#define NVAPI_MAX_VIO_STREAMS                 4
+
+#define NVAPI_MIN_VIO_STREAMS                 1
+
+//! SDI input supports a max of 2 links per stream
+#define NVAPI_MAX_VIO_LINKS_PER_STREAM        2
+
+
+#define NVAPI_MAX_FRAMELOCK_MAPPING_MODES     20
+
+//! Min number of capture images
+#define NVAPI_GVI_MIN_RAW_CAPTURE_IMAGES      1
+
+//! Max number of capture images
+#define NVAPI_GVI_MAX_RAW_CAPTURE_IMAGES      32
+
+//! Default number of capture images
+#define NVAPI_GVI_DEFAULT_RAW_CAPTURE_IMAGES  5
+
+
+
+// Data Signal notification events. These need an event handler in RM.
+// Register/Unregister and PopEvent NVAPIs are already available.
+
+//! Device configuration
+typedef enum _NVVIOCONFIGTYPE
+{
+    NVVIOCONFIGTYPE_IN ,          //!< Input configuration
+    NVVIOCONFIGTYPE_OUT ,         //!< Output configuration
+}NVVIOCONFIGTYPE;
+
+typedef enum _NVVIOCOLORSPACE
+{
+    NVVIOCOLORSPACE_UNKNOWN,
+    NVVIOCOLORSPACE_YCBCR,
+    NVVIOCOLORSPACE_YCBCRA,
+    NVVIOCOLORSPACE_YCBCRD,
+    NVVIOCOLORSPACE_GBR,
+    NVVIOCOLORSPACE_GBRA,
+    NVVIOCOLORSPACE_GBRD,
+} NVVIOCOLORSPACE;
+
+//!
Component sampling +typedef enum _NVVIOCOMPONENTSAMPLING +{ + NVVIOCOMPONENTSAMPLING_UNKNOWN, + NVVIOCOMPONENTSAMPLING_4444, + NVVIOCOMPONENTSAMPLING_4224, + NVVIOCOMPONENTSAMPLING_444, + NVVIOCOMPONENTSAMPLING_422 +} NVVIOCOMPONENTSAMPLING; + +typedef enum _NVVIOBITSPERCOMPONENT +{ + NVVIOBITSPERCOMPONENT_UNKNOWN, + NVVIOBITSPERCOMPONENT_8, + NVVIOBITSPERCOMPONENT_10, + NVVIOBITSPERCOMPONENT_12, +} NVVIOBITSPERCOMPONENT; + +typedef enum _NVVIOLINKID +{ + NVVIOLINKID_UNKNOWN, + NVVIOLINKID_A, + NVVIOLINKID_B, + NVVIOLINKID_C, + NVVIOLINKID_D +} NVVIOLINKID; + + +typedef enum _NVVIOANCPARITYCOMPUTATION +{ + NVVIOANCPARITYCOMPUTATION_AUTO, + NVVIOANCPARITYCOMPUTATION_ON, + NVVIOANCPARITYCOMPUTATION_OFF +} NVVIOANCPARITYCOMPUTATION; + + + +//! @} + + +//--------------------------------------------------------------------- +// Structures +//--------------------------------------------------------------------- + +//! \addtogroup vidio +//! @{ + + +//! Supports Serial Digital Interface (SDI) output +#define NVVIOCAPS_VIDOUT_SDI 0x00000001 + +//! Supports Internal timing source +#define NVVIOCAPS_SYNC_INTERNAL 0x00000100 + +//! Supports Genlock timing source +#define NVVIOCAPS_SYNC_GENLOCK 0x00000200 + +//! Supports Serial Digital Interface (SDI) synchronization input +#define NVVIOCAPS_SYNCSRC_SDI 0x00001000 + +//! Supports Composite synchronization input +#define NVVIOCAPS_SYNCSRC_COMP 0x00002000 + +//! Supports Desktop transparent mode +#define NVVIOCAPS_OUTPUTMODE_DESKTOP 0x00010000 + +//! Supports OpenGL application mode +#define NVVIOCAPS_OUTPUTMODE_OPENGL 0x00020000 + +//! Supports Serial Digital Interface (SDI) input +#define NVVIOCAPS_VIDIN_SDI 0x00100000 + +//! Supports Packed ANC +#define NVVIOCAPS_PACKED_ANC_SUPPORTED 0x00200000 + +//! Supports ANC audio blanking +#define NVVIOCAPS_AUDIO_BLANKING_SUPPORTED 0x00400000 + +//! SDI-class interface: SDI output with two genlock inputs +#define NVVIOCLASS_SDI 0x00000001 + +//! Device capabilities +typedef struct _NVVIOCAPS +{ + NvU32 version; //!< Structure version + NvAPI_String adapterName; //!< Graphics adapter name + NvU32 adapterClass; //!< Graphics adapter classes (NVVIOCLASS_SDI mask) + NvU32 adapterCaps; //!< Graphics adapter capabilities (NVVIOCAPS_* mask) + NvU32 dipSwitch; //!< On-board DIP switch settings bits + NvU32 dipSwitchReserved; //!< On-board DIP switch settings reserved bits + NvU32 boardID; //!< Board ID + //! Driver version + struct // + { + NvU32 majorVersion; //!< Major version. For GVI, majorVersion contains MajorVersion(HIWORD) And MinorVersion(LOWORD) + NvU32 minorVersion; //!< Minor version. For GVI, minorVersion contains Revison(HIWORD) And Build(LOWORD) + } driver; // + //! Firmware version + struct + { + NvU32 majorVersion; //!< Major version. In version 2, for both GVI and GVO, majorVersion contains MajorVersion(HIWORD) And MinorVersion(LOWORD) + NvU32 minorVersion; //!< Minor version. In version 2, for both GVI and GVO, minorVersion contains Revison(HIWORD) And Build(LOWORD) + } firmWare; // + NVVIOOWNERID ownerId; //!< Unique identifier for owner of video output (NVVIOOWNERID_INVALID if free running) + NVVIOOWNERTYPE ownerType; //!< Owner type (OpenGL application or Desktop mode) +} NVVIOCAPS; + +//! Macro for constructing the version field of NVVIOCAPS +#define NVVIOCAPS_VER1 MAKE_NVAPI_VERSION(NVVIOCAPS,1) +#define NVVIOCAPS_VER2 MAKE_NVAPI_VERSION(NVVIOCAPS,2) +#define NVVIOCAPS_VER NVVIOCAPS_VER2 + +//! 
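+// Example (illustrative sketch): inspecting an NVVIOCAPS record once it has been filled in by the
+// NvAPI capability query for the SDI device (the query entry point itself is not part of this
+// excerpt, so it is only assumed here). The version unpacking follows the HIWORD/LOWORD convention
+// documented above; error handling and the actual query call are omitted.
+//
+//     NVVIOCAPS caps;
+//     memset(&caps, 0, sizeof(caps));
+//     caps.version = NVVIOCAPS_VER;
+//     // ... have the driver fill in 'caps' for the SDI device ...
+//     if ((caps.adapterClass & NVVIOCLASS_SDI) && (caps.adapterCaps & NVVIOCAPS_VIDOUT_SDI))
+//     {
+//         // For GVI devices the driver major/minor pair is packed into driver.majorVersion.
+//         NvU32 drvMajor = (caps.driver.majorVersion >> 16) & 0xFFFF;  // HIWORD
+//         NvU32 drvMinor =  caps.driver.majorVersion        & 0xFFFF;  // LOWORD
+//     }
+//
+//!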
Input channel status +typedef struct _NVVIOCHANNELSTATUS +{ + NvU32 smpte352; //!< 4-byte SMPTE 352 video payload identifier + NVVIOSIGNALFORMAT signalFormat; //!< Signal format + NVVIOBITSPERCOMPONENT bitsPerComponent; //!< Bits per component + NVVIOCOMPONENTSAMPLING samplingFormat; //!< Sampling format + NVVIOCOLORSPACE colorSpace; //!< Color space + NVVIOLINKID linkID; //!< Link ID +} NVVIOCHANNELSTATUS; + +//! Input device status +typedef struct _NVVIOINPUTSTATUS +{ + NVVIOCHANNELSTATUS vidIn[NVAPI_MAX_VIO_JACKS][NVAPI_MAX_VIO_CHANNELS_PER_JACK]; //!< Video input status per channel within a jack + NVVIOCAPTURESTATUS captureStatus; //!< status of video capture +} NVVIOINPUTSTATUS; + +//! Output device status +typedef struct _NVVIOOUTPUTSTATUS +{ + NVVIOINPUTOUTPUTSTATUS vid1Out; //!< Video 1 output status + NVVIOINPUTOUTPUTSTATUS vid2Out; //!< Video 2 output status + NVVIOSYNCSTATUS sdiSyncIn; //!< SDI sync input status + NVVIOSYNCSTATUS compSyncIn; //!< Composite sync input status + NvU32 syncEnable; //!< Sync enable (TRUE if using syncSource) + NVVIOSYNCSOURCE syncSource; //!< Sync source + NVVIOSIGNALFORMAT syncFormat; //!< Sync format + NvU32 frameLockEnable; //!< Framelock enable flag + NvU32 outputVideoLocked; //!< Output locked status + NvU32 dataIntegrityCheckErrorCount; //!< Data integrity check error count + NvU32 dataIntegrityCheckEnabled; //!< Data integrity check status enabled + NvU32 dataIntegrityCheckFailed; //!< Data integrity check status failed + NvU32 uSyncSourceLocked; //!< genlocked to framelocked to ref signal + NvU32 uPowerOn; //!< TRUE: indicates there is sufficient power +} NVVIOOUTPUTSTATUS; + +//! Video device status. +typedef struct _NVVIOSTATUS +{ + NvU32 version; //!< Structure version + NVVIOSTATUSTYPE nvvioStatusType; //!< Input or Output status + union + { + NVVIOINPUTSTATUS inStatus; //!< Input device status + NVVIOOUTPUTSTATUS outStatus; //!< Output device status + }vioStatus; +} NVVIOSTATUS; + +//! Macro for constructingthe version field of NVVIOSTATUS +#define NVVIOSTATUS_VER MAKE_NVAPI_VERSION(NVVIOSTATUS,1) + +//! Output region +typedef struct _NVVIOOUTPUTREGION +{ + NvU32 x; //!< Horizontal origin in pixels + NvU32 y; //!< Vertical origin in pixels + NvU32 width; //!< Width of region in pixels + NvU32 height; //!< Height of region in pixels +} NVVIOOUTPUTREGION; + +//! Gamma ramp (8-bit index) +typedef struct _NVVIOGAMMARAMP8 +{ + NvU16 uRed[256]; //!< Red channel gamma ramp (8-bit index, 16-bit values) + NvU16 uGreen[256]; //!< Green channel gamma ramp (8-bit index, 16-bit values) + NvU16 uBlue[256]; //!< Blue channel gamma ramp (8-bit index, 16-bit values) +} NVVIOGAMMARAMP8; + +//! Gamma ramp (10-bit index) +typedef struct _NVVIOGAMMARAMP10 +{ + NvU16 uRed[1024]; //!< Red channel gamma ramp (10-bit index, 16-bit values) + NvU16 uGreen[1024]; //!< Green channel gamma ramp (10-bit index, 16-bit values) + NvU16 uBlue[1024]; //!< Blue channel gamma ramp (10-bit index, 16-bit values) +} NVVIOGAMMARAMP10; + + +//! Sync delay +typedef struct _NVVIOSYNCDELAY +{ + NvU32 version; //!< Structure version + NvU32 horizontalDelay; //!< Horizontal delay in pixels + NvU32 verticalDelay; //!< Vertical delay in lines +} NVVIOSYNCDELAY; + +//! Macro for constructing the version field of NVVIOSYNCDELAY +#define NVVIOSYNCDELAY_VER MAKE_NVAPI_VERSION(NVVIOSYNCDELAY,1) + + +//! 
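+// Example (illustrative sketch): walking a filled-in NVVIOSTATUS. Which union member is valid is
+// selected by nvvioStatusType; for an input device every jack/channel slot carries its own
+// NVVIOCHANNELSTATUS, and only channel 0 of a jack is meaningful for non-3G formats (see the
+// NVAPI_MAX_VIO_CHANNELS_PER_JACK note above). How 'status' gets populated is outside this excerpt.
+//
+//     const NVVIOSTATUS *pStatus = &status;   // assumed already populated, version = NVVIOSTATUS_VER
+//     if (pStatus->nvvioStatusType == NVVIOSTATUSTYPE_IN)
+//     {
+//         for (NvU32 jack = 0; jack < NVAPI_MAX_VIO_JACKS; jack++)
+//             for (NvU32 ch = 0; ch < NVAPI_MAX_VIO_CHANNELS_PER_JACK; ch++)
+//             {
+//                 const NVVIOCHANNELSTATUS *pChan = &pStatus->vioStatus.inStatus.vidIn[jack][ch];
+//                 // pChan->signalFormat, bitsPerComponent, samplingFormat, colorSpace describe the signal.
+//             }
+//     }
+//     else // NVVIOSTATUSTYPE_OUT
+//     {
+//         // pStatus->vioStatus.outStatus: vid1Out/vid2Out, sdiSyncIn/compSyncIn, lock and power flags.
+//     }
+//
+//!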
Video mode information +typedef struct _NVVIOVIDEOMODE +{ + NvU32 horizontalPixels; //!< Horizontal resolution (in pixels) + NvU32 verticalLines; //!< Vertical resolution for frame (in lines) + float fFrameRate; //!< Frame rate + NVVIOINTERLACEMODE interlaceMode; //!< Interlace mode + NVVIOVIDEOSTANDARD videoStandard; //!< SMPTE standards format + NVVIOVIDEOTYPE videoType; //!< HD or SD signal classification +} NVVIOVIDEOMODE; + +//! Signal format details +typedef struct _NVVIOSIGNALFORMATDETAIL +{ + NVVIOSIGNALFORMAT signalFormat; //!< Signal format enumerated value + NVVIOVIDEOMODE videoMode; //!< Video mode for signal format +}NVVIOSIGNALFORMATDETAIL; + + +//! R8:G8:B8 +#define NVVIOBUFFERFORMAT_R8G8B8 0x00000001 + +//! R8:G8:B8:Z24 +#define NVVIOBUFFERFORMAT_R8G8B8Z24 0x00000002 + +//! R8:G8:B8:A8 +#define NVVIOBUFFERFORMAT_R8G8B8A8 0x00000004 + +//! R8:G8:B8:A8:Z24 +#define NVVIOBUFFERFORMAT_R8G8B8A8Z24 0x00000008 + +//! R16FP:G16FP:B16FP +#define NVVIOBUFFERFORMAT_R16FPG16FPB16FP 0x00000010 + +//! R16FP:G16FP:B16FP:Z24 +#define NVVIOBUFFERFORMAT_R16FPG16FPB16FPZ24 0x00000020 + +//! R16FP:G16FP:B16FP:A16FP +#define NVVIOBUFFERFORMAT_R16FPG16FPB16FPA16FP 0x00000040 + +//! R16FP:G16FP:B16FP:A16FP:Z24 +#define NVVIOBUFFERFORMAT_R16FPG16FPB16FPA16FPZ24 0x00000080 + + + +//! Data format details +typedef struct _NVVIODATAFORMATDETAIL +{ + NVVIODATAFORMAT dataFormat; //!< Data format enumerated value + NvU32 vioCaps; //!< Data format capabilities (NVVIOCAPS_* mask) +}NVVIODATAFORMATDETAIL; + +//! Colorspace conversion +typedef struct _NVVIOCOLORCONVERSION +{ + NvU32 version; //!< Structure version + float colorMatrix[3][3]; //!< Output[n] = + float colorOffset[3]; //!< Input[0] * colorMatrix[n][0] + + float colorScale[3]; //!< Input[1] * colorMatrix[n][1] + + //!< Input[2] * colorMatrix[n][2] + + //!< OutputRange * colorOffset[n] + //!< where OutputRange is the standard magnitude of + //!< Output[n][n] and colorMatrix and colorOffset + //!< values are within the range -1.0 to +1.0 + NvU32 compositeSafe; //!< compositeSafe constrains luminance range when using composite output +} NVVIOCOLORCONVERSION; + +//! macro for constructing the version field of _NVVIOCOLORCONVERSION. +#define NVVIOCOLORCONVERSION_VER MAKE_NVAPI_VERSION(NVVIOCOLORCONVERSION,1) + +//! Gamma correction +typedef struct _NVVIOGAMMACORRECTION +{ + NvU32 version; //!< Structure version + NvU32 vioGammaCorrectionType; //!< Gamma correction type (8-bit or 10-bit) + //! Gamma correction: + union + { + NVVIOGAMMARAMP8 gammaRamp8; //!< Gamma ramp (8-bit index, 16-bit values) + NVVIOGAMMARAMP10 gammaRamp10; //!< Gamma ramp (10-bit index, 16-bit values) + }gammaRamp; + float fGammaValueR; //!< Red Gamma value within gamma ranges. 0.5 - 6.0 + float fGammaValueG; //!< Green Gamma value within gamma ranges. 0.5 - 6.0 + float fGammaValueB; //!< Blue Gamma value within gamma ranges. 0.5 - 6.0 +} NVVIOGAMMACORRECTION; + +//! Macro for constructing thevesion field of _NVVIOGAMMACORRECTION +#define NVVIOGAMMACORRECTION_VER MAKE_NVAPI_VERSION(NVVIOGAMMACORRECTION,1) + +//! 
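+// Example (illustrative sketch): an identity colorspace conversion built from the formula spelled
+// out in the NVVIOCOLORCONVERSION comments above (Output[n] = Input[0]*colorMatrix[n][0] +
+// Input[1]*colorMatrix[n][1] + Input[2]*colorMatrix[n][2] + OutputRange*colorOffset[n]). Treating
+// unit scale as pass-through is an assumption, since colorScale is not fully specified above.
+//
+//     NVVIOCOLORCONVERSION csc;
+//     memset(&csc, 0, sizeof(csc));
+//     csc.version = NVVIOCOLORCONVERSION_VER;
+//     for (int n = 0; n < 3; n++)
+//     {
+//         for (int i = 0; i < 3; i++)
+//             csc.colorMatrix[n][i] = (n == i) ? 1.0f : 0.0f;   // unit diagonal
+//         csc.colorOffset[n] = 0.0f;                            // no offset
+//         csc.colorScale[n]  = 1.0f;                            // assumed pass-through scale
+//     }
+//     csc.compositeSafe = 0;  // do not constrain the luminance range for composite output
+//
+//!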
Maximum number of ranges per channel +#define MAX_NUM_COMPOSITE_RANGE 2 + + +typedef struct _NVVIOCOMPOSITERANGE +{ + NvU32 uRange; + NvU32 uEnabled; + NvU32 uMin; + NvU32 uMax; +} NVVIOCOMPOSITERANGE; + + + +// Device configuration (fields masks indicating NVVIOCONFIG fields to use for NvAPI_VIO_GetConfig/NvAPI_VIO_SetConfig() ) +// +#define NVVIOCONFIG_SIGNALFORMAT 0x00000001 //!< fields: signalFormat +#define NVVIOCONFIG_DATAFORMAT 0x00000002 //!< fields: dataFormat +#define NVVIOCONFIG_OUTPUTREGION 0x00000004 //!< fields: outputRegion +#define NVVIOCONFIG_OUTPUTAREA 0x00000008 //!< fields: outputArea +#define NVVIOCONFIG_COLORCONVERSION 0x00000010 //!< fields: colorConversion +#define NVVIOCONFIG_GAMMACORRECTION 0x00000020 //!< fields: gammaCorrection +#define NVVIOCONFIG_SYNCSOURCEENABLE 0x00000040 //!< fields: syncSource and syncEnable +#define NVVIOCONFIG_SYNCDELAY 0x00000080 //!< fields: syncDelay +#define NVVIOCONFIG_COMPOSITESYNCTYPE 0x00000100 //!< fields: compositeSyncType +#define NVVIOCONFIG_FRAMELOCKENABLE 0x00000200 //!< fields: EnableFramelock +#define NVVIOCONFIG_422FILTER 0x00000400 //!< fields: bEnable422Filter +#define NVVIOCONFIG_COMPOSITETERMINATE 0x00000800 //!< fields: bCompositeTerminate (Not supported on Quadro FX 4000 SDI) +#define NVVIOCONFIG_DATAINTEGRITYCHECK 0x00001000 //!< fields: bEnableDataIntegrityCheck (Not supported on Quadro FX 4000 SDI) +#define NVVIOCONFIG_CSCOVERRIDE 0x00002000 //!< fields: colorConversion override +#define NVVIOCONFIG_FLIPQUEUELENGTH 0x00004000 //!< fields: flipqueuelength control +#define NVVIOCONFIG_ANCTIMECODEGENERATION 0x00008000 //!< fields: bEnableANCTimeCodeGeneration +#define NVVIOCONFIG_COMPOSITE 0x00010000 //!< fields: bEnableComposite +#define NVVIOCONFIG_ALPHAKEYCOMPOSITE 0x00020000 //!< fields: bEnableAlphaKeyComposite +#define NVVIOCONFIG_COMPOSITE_Y 0x00040000 //!< fields: compRange +#define NVVIOCONFIG_COMPOSITE_CR 0x00080000 //!< fields: compRange +#define NVVIOCONFIG_COMPOSITE_CB 0x00100000 //!< fields: compRange +#define NVVIOCONFIG_FULL_COLOR_RANGE 0x00200000 //!< fields: bEnableFullColorRange +#define NVVIOCONFIG_RGB_DATA 0x00400000 //!< fields: bEnableRGBData +#define NVVIOCONFIG_RESERVED_SDIOUTPUTENABLE 0x00800000 //!< fields: bEnableSDIOutput +#define NVVIOCONFIG_STREAMS 0x01000000 //!< fields: streams +#define NVVIOCONFIG_ANC_PARITY_COMPUTATION 0x02000000 //!< fields: ancParityComputation +#define NVVIOCONFIG_ANC_AUDIO_REPEAT 0x04000000 //!< fields: enableAudioBlanking + + +// Don't forget to update NVVIOCONFIG_VALIDFIELDS in nvapi.spec when NVVIOCONFIG_ALLFIELDS changes. 
+#define NVVIOCONFIG_ALLFIELDS ( NVVIOCONFIG_SIGNALFORMAT | \ + NVVIOCONFIG_DATAFORMAT | \ + NVVIOCONFIG_OUTPUTREGION | \ + NVVIOCONFIG_OUTPUTAREA | \ + NVVIOCONFIG_COLORCONVERSION | \ + NVVIOCONFIG_GAMMACORRECTION | \ + NVVIOCONFIG_SYNCSOURCEENABLE | \ + NVVIOCONFIG_SYNCDELAY | \ + NVVIOCONFIG_COMPOSITESYNCTYPE | \ + NVVIOCONFIG_FRAMELOCKENABLE | \ + NVVIOCONFIG_422FILTER | \ + NVVIOCONFIG_COMPOSITETERMINATE | \ + NVVIOCONFIG_DATAINTEGRITYCHECK | \ + NVVIOCONFIG_CSCOVERRIDE | \ + NVVIOCONFIG_FLIPQUEUELENGTH | \ + NVVIOCONFIG_ANCTIMECODEGENERATION | \ + NVVIOCONFIG_COMPOSITE | \ + NVVIOCONFIG_ALPHAKEYCOMPOSITE | \ + NVVIOCONFIG_COMPOSITE_Y | \ + NVVIOCONFIG_COMPOSITE_CR | \ + NVVIOCONFIG_COMPOSITE_CB | \ + NVVIOCONFIG_FULL_COLOR_RANGE | \ + NVVIOCONFIG_RGB_DATA | \ + NVVIOCONFIG_RESERVED_SDIOUTPUTENABLE | \ + NVVIOCONFIG_STREAMS | \ + NVVIOCONFIG_ANC_PARITY_COMPUTATION | \ + NVVIOCONFIG_ANC_AUDIO_REPEAT ) + +#define NVVIOCONFIG_VALIDFIELDS ( NVVIOCONFIG_SIGNALFORMAT | \ + NVVIOCONFIG_DATAFORMAT | \ + NVVIOCONFIG_OUTPUTREGION | \ + NVVIOCONFIG_OUTPUTAREA | \ + NVVIOCONFIG_COLORCONVERSION | \ + NVVIOCONFIG_GAMMACORRECTION | \ + NVVIOCONFIG_SYNCSOURCEENABLE | \ + NVVIOCONFIG_SYNCDELAY | \ + NVVIOCONFIG_COMPOSITESYNCTYPE | \ + NVVIOCONFIG_FRAMELOCKENABLE | \ + NVVIOCONFIG_RESERVED_SDIOUTPUTENABLE | \ + NVVIOCONFIG_422FILTER | \ + NVVIOCONFIG_COMPOSITETERMINATE | \ + NVVIOCONFIG_DATAINTEGRITYCHECK | \ + NVVIOCONFIG_CSCOVERRIDE | \ + NVVIOCONFIG_FLIPQUEUELENGTH | \ + NVVIOCONFIG_ANCTIMECODEGENERATION | \ + NVVIOCONFIG_COMPOSITE | \ + NVVIOCONFIG_ALPHAKEYCOMPOSITE | \ + NVVIOCONFIG_COMPOSITE_Y | \ + NVVIOCONFIG_COMPOSITE_CR | \ + NVVIOCONFIG_COMPOSITE_CB | \ + NVVIOCONFIG_FULL_COLOR_RANGE | \ + NVVIOCONFIG_RGB_DATA | \ + NVVIOCONFIG_RESERVED_SDIOUTPUTENABLE | \ + NVVIOCONFIG_STREAMS | \ + NVVIOCONFIG_ANC_PARITY_COMPUTATION | \ + NVVIOCONFIG_ANC_AUDIO_REPEAT) + +#define NVVIOCONFIG_DRIVERFIELDS ( NVVIOCONFIG_OUTPUTREGION | \ + NVVIOCONFIG_OUTPUTAREA | \ + NVVIOCONFIG_COLORCONVERSION | \ + NVVIOCONFIG_FLIPQUEUELENGTH) + +#define NVVIOCONFIG_GAMMAFIELDS ( NVVIOCONFIG_GAMMACORRECTION ) + +#define NVVIOCONFIG_RMCTRLFIELDS ( NVVIOCONFIG_SIGNALFORMAT | \ + NVVIOCONFIG_DATAFORMAT | \ + NVVIOCONFIG_SYNCSOURCEENABLE | \ + NVVIOCONFIG_COMPOSITESYNCTYPE | \ + NVVIOCONFIG_FRAMELOCKENABLE | \ + NVVIOCONFIG_422FILTER | \ + NVVIOCONFIG_COMPOSITETERMINATE | \ + NVVIOCONFIG_DATAINTEGRITYCHECK | \ + NVVIOCONFIG_COMPOSITE | \ + NVVIOCONFIG_ALPHAKEYCOMPOSITE | \ + NVVIOCONFIG_COMPOSITE_Y | \ + NVVIOCONFIG_COMPOSITE_CR | \ + NVVIOCONFIG_COMPOSITE_CB) + +#define NVVIOCONFIG_RMSKEWFIELDS ( NVVIOCONFIG_SYNCDELAY ) + +#define NVVIOCONFIG_ALLOWSDIRUNNING_FIELDS ( NVVIOCONFIG_DATAINTEGRITYCHECK | \ + NVVIOCONFIG_SYNCDELAY | \ + NVVIOCONFIG_CSCOVERRIDE | \ + NVVIOCONFIG_ANCTIMECODEGENERATION | \ + NVVIOCONFIG_COMPOSITE | \ + NVVIOCONFIG_ALPHAKEYCOMPOSITE | \ + NVVIOCONFIG_COMPOSITE_Y | \ + NVVIOCONFIG_COMPOSITE_CR | \ + NVVIOCONFIG_COMPOSITE_CB | \ + NVVIOCONFIG_ANC_PARITY_COMPUTATION) + + + #define NVVIOCONFIG_RMMODESET_FIELDS ( NVVIOCONFIG_SIGNALFORMAT | \ + NVVIOCONFIG_DATAFORMAT | \ + NVVIOCONFIG_SYNCSOURCEENABLE | \ + NVVIOCONFIG_FRAMELOCKENABLE | \ + NVVIOCONFIG_COMPOSITESYNCTYPE | \ + NVVIOCONFIG_ANC_AUDIO_REPEAT) + + +//! Output device configuration +// No members can be deleted from below structure. Only add new members at the +// end of the structure. 
+typedef struct _NVVIOOUTPUTCONFIG_V1
+{
+    NVVIOSIGNALFORMAT    signalFormat;          //!< Signal format for video output
+    NVVIODATAFORMAT      dataFormat;            //!< Data format for video output
+    NVVIOOUTPUTREGION    outputRegion;          //!< Region for video output (Desktop mode)
+    NVVIOOUTPUTAREA      outputArea;            //!< Usable resolution for video output (safe area)
+    NVVIOCOLORCONVERSION colorConversion;       //!< Color conversion.
+    NVVIOGAMMACORRECTION gammaCorrection;
+    NvU32                syncEnable;            //!< Sync enable (TRUE to use syncSource)
+    NVVIOSYNCSOURCE      syncSource;            //!< Sync source
+    NVVIOSYNCDELAY       syncDelay;             //!< Sync delay
+    NVVIOCOMPSYNCTYPE    compositeSyncType;     //!< Composite sync type
+    NvU32                frameLockEnable;       //!< Flag indicating whether framelock was on/off
+    NvU32                psfSignalFormat;       //!< Indicates whether contained format is PSF Signal format
+    NvU32                enable422Filter;       //!< Enables/Disables 4:2:2 filter
+    NvU32                compositeTerminate;    //!< Composite termination
+    NvU32                enableDataIntegrityCheck; //!< Enable data integrity check: true - enable, false - disable
+    NvU32                cscOverride;           //!< Use provided CSC color matrix to overwrite
+    NvU32                flipQueueLength;       //!< Number of buffers used for the internal flip queue
+    NvU32                enableANCTimeCodeGeneration; //!< Enable SDI ANC time code generation
+    NvU32                enableComposite;       //!< Enable composite
+    NvU32                enableAlphaKeyComposite; //!< Enable Alpha key composite
+    NVVIOCOMPOSITERANGE  compRange;             //!< Composite ranges
+    NvU8                 reservedData[256];     //!< Indicates last stored SDI output state TRUE-ON / FALSE-OFF
+    NvU32                enableFullColorRange;  //!< Flag indicating Full Color Range
+    NvU32                enableRGBData;         //!< Indicates data is in RGB format
+} NVVIOOUTPUTCONFIG_V1;
+
+typedef struct _NVVIOOUTPUTCONFIG_V2
+{
+    NVVIOSIGNALFORMAT    signalFormat;          //!< Signal format for video output
+    NVVIODATAFORMAT      dataFormat;            //!< Data format for video output
+    NVVIOOUTPUTREGION    outputRegion;          //!< Region for video output (Desktop mode)
+    NVVIOOUTPUTAREA      outputArea;            //!< Usable resolution for video output (safe area)
+    NVVIOCOLORCONVERSION colorConversion;       //!< Color conversion.
+ NVVIOGAMMACORRECTION gammaCorrection; + NvU32 syncEnable; //!< Sync enable (TRUE to use syncSource) + NVVIOSYNCSOURCE syncSource; //!< Sync source + NVVIOSYNCDELAY syncDelay; //!< Sync delay + NVVIOCOMPSYNCTYPE compositeSyncType; //!< Composite sync type + NvU32 frameLockEnable; //!< Flag indicating whether framelock was on/off + NvU32 psfSignalFormat; //!< Indicates whether contained format is PSF Signal format + NvU32 enable422Filter; //!< Enables/Disables 4:2:2 filter + NvU32 compositeTerminate; //!< Composite termination + NvU32 enableDataIntegrityCheck; //!< Enable data integrity check: true - enable, false - disable + NvU32 cscOverride; //!< Use provided CSC color matrix to overwrite + NvU32 flipQueueLength; //!< Number of buffers used for the internal flip queue + NvU32 enableANCTimeCodeGeneration; //!< Enable SDI ANC time code generation + NvU32 enableComposite; //!< Enable composite + NvU32 enableAlphaKeyComposite; //!< Enable Alpha key composite + NVVIOCOMPOSITERANGE compRange; //!< Composite ranges + NvU8 reservedData[256]; //!< Indicates last stored SDI output state TRUE-ON / FALSE-OFF + NvU32 enableFullColorRange; //!< Flag indicating Full Color Range + NvU32 enableRGBData; //!< Indicates data is in RGB format + NVVIOANCPARITYCOMPUTATION ancParityComputation; //!< Enable HW ANC parity bit computation (auto/on/off) +} NVVIOOUTPUTCONFIG_V2; + +typedef struct _NVVIOOUTPUTCONFIG_V3 +{ + NVVIOSIGNALFORMAT signalFormat; //!< Signal format for video output + NVVIODATAFORMAT dataFormat; //!< Data format for video output + NVVIOOUTPUTREGION outputRegion; //!< Region for video output (Desktop mode) + NVVIOOUTPUTAREA outputArea; //!< Usable resolution for video output (safe area) + NVVIOCOLORCONVERSION colorConversion; //!< Color conversion. + NVVIOGAMMACORRECTION gammaCorrection; + NvU32 syncEnable; //!< Sync enable (TRUE to use syncSource) + NVVIOSYNCSOURCE syncSource; //!< Sync source + NVVIOSYNCDELAY syncDelay; //!< Sync delay + NVVIOCOMPSYNCTYPE compositeSyncType; //!< Composite sync type + NvU32 frameLockEnable; //!< Flag indicating whether framelock was on/off + NvU32 psfSignalFormat; //!< Indicates whether contained format is PSF Signal format + NvU32 enable422Filter; //!< Enables/Disables 4:2:2 filter + NvU32 compositeTerminate; //!< Composite termination + NvU32 enableDataIntegrityCheck; //!< Enable data integrity check: true - enable, false - disable + NvU32 cscOverride; //!< Use provided CSC color matrix to overwrite + NvU32 flipQueueLength; //!< Number of buffers used for the internal flip queue + NvU32 enableANCTimeCodeGeneration; //!< Enable SDI ANC time code generation + NvU32 enableComposite; //!< Enable composite + NvU32 enableAlphaKeyComposite; //!< Enable Alpha key composite + NVVIOCOMPOSITERANGE compRange; //!< Composite ranges + NvU8 reservedData[256]; //!< Indicates last stored SDI output state TRUE-ON / FALSE-OFF + NvU32 enableFullColorRange; //!< Flag indicating Full Color Range + NvU32 enableRGBData; //!< Indicates data is in RGB format + NVVIOANCPARITYCOMPUTATION ancParityComputation; //!< Enable HW ANC parity bit computation (auto/on/off) + NvU32 enableAudioBlanking; //!< Enable HANC audio blanking on repeat frames +} NVVIOOUTPUTCONFIG_V3; + +//! 
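+// Example (illustrative sketch): filling the output-related members that NvAPI_VIO_SetConfig() /
+// NvAPI_VIO_GetConfig() are told to look at via the NVVIOCONFIG_* field masks defined above. The
+// enclosing NVVIOCONFIG structure is declared a little further below; the signal format value is
+// left as a placeholder since the NVVIOSIGNALFORMAT enumerants are declared elsewhere in the header.
+//
+//     NVVIOOUTPUTCONFIG_V3 outCfg;
+//     memset(&outCfg, 0, sizeof(outCfg));
+//     outCfg.signalFormat = /* desired NVVIOSIGNALFORMAT */;
+//     outCfg.dataFormat   = NVVIODATAFORMAT_X8X8X8A8_4444_PASSTHRU;  // RGBA passthrough
+//     outCfg.outputArea   = NVVIOOUTPUTAREA_SAFEACTION;              // centered 90% of the resolution
+//     outCfg.syncEnable   = 1;
+//     outCfg.syncSource   = NVVIOSYNCSOURCE_SDISYNC;
+//     NvU32 fields = NVVIOCONFIG_SIGNALFORMAT | NVVIOCONFIG_DATAFORMAT |
+//                    NVVIOCONFIG_OUTPUTAREA   | NVVIOCONFIG_SYNCSOURCEENABLE;
+//
+//!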
Stream configuration +typedef struct _NVVIOSTREAM +{ + NvU32 bitsPerComponent; //!< Bits per component + NVVIOCOMPONENTSAMPLING sampling; //!< Sampling + NvU32 expansionEnable; //!< Enable/disable 4:2:2->4:4:4 expansion + NvU32 numLinks; //!< Number of active links + struct + { + NvU32 jack; //!< This stream's link[i] will use the specified (0-based) channel within the + NvU32 channel; //!< specified (0-based) jack + } links[NVAPI_MAX_VIO_LINKS_PER_STREAM]; +} NVVIOSTREAM; + +//! Input device configuration +typedef struct _NVVIOINPUTCONFIG +{ + NvU32 numRawCaptureImages; //!< numRawCaptureImages is the number of frames to keep in the capture queue. + //!< must be between NVAPI_GVI_MIN_RAW_CAPTURE_IMAGES and NVAPI_GVI_MAX_RAW_CAPTURE_IMAGES, + NVVIOSIGNALFORMAT signalFormat; //!< Signal format. + //!< Please note that both numRawCaptureImages and signalFormat should be set together. + NvU32 numStreams; //!< Number of active streams. + NVVIOSTREAM streams[NVAPI_MAX_VIO_STREAMS]; //!< Stream configurations + NvU32 bTestMode; //!< This attribute controls the GVI test mode. + //!< Possible values 0/1. When testmode enabled, the + //!< GVI device will generate fake data as quickly as possible. +} NVVIOINPUTCONFIG; + +typedef struct _NVVIOCONFIG_V1 +{ + NvU32 version; //!< Structure version + NvU32 fields; //!< Caller sets to NVVIOCONFIG_* mask for fields to use + NVVIOCONFIGTYPE nvvioConfigType; //!< Input or Output configuration + union + { + NVVIOINPUTCONFIG inConfig; //!< Input device configuration + NVVIOOUTPUTCONFIG_V1 outConfig; //!< Output device configuration + }vioConfig; +} NVVIOCONFIG_V1; + + +typedef struct _NVVIOCONFIG_V2 +{ + NvU32 version; //!< Structure version + NvU32 fields; //!< Caller sets to NVVIOCONFIG_* mask for fields to use + NVVIOCONFIGTYPE nvvioConfigType; //!< Input or Output configuration + union + { + NVVIOINPUTCONFIG inConfig; //!< Input device configuration + NVVIOOUTPUTCONFIG_V2 outConfig; //!< Output device configuration + }vioConfig; +} NVVIOCONFIG_V2; + +typedef struct _NVVIOCONFIG_V3 +{ + NvU32 version; //!< Structure version + NvU32 fields; //!< Caller sets to NVVIOCONFIG_* mask for fields to use + NVVIOCONFIGTYPE nvvioConfigType; //!< Input or Output configuration + union + { + NVVIOINPUTCONFIG inConfig; //!< Input device configuration + NVVIOOUTPUTCONFIG_V3 outConfig; //!< Output device configuration + }vioConfig; +} NVVIOCONFIG_V3; +typedef NVVIOOUTPUTCONFIG_V3 NVVIOOUTPUTCONFIG; +typedef NVVIOCONFIG_V3 NVVIOCONFIG; + +#define NVVIOCONFIG_VER1 MAKE_NVAPI_VERSION(NVVIOCONFIG_V1,1) +#define NVVIOCONFIG_VER2 MAKE_NVAPI_VERSION(NVVIOCONFIG_V2,2) +#define NVVIOCONFIG_VER3 MAKE_NVAPI_VERSION(NVVIOCONFIG_V3,3) +#define NVVIOCONFIG_VER NVVIOCONFIG_VER3 + + +typedef struct +{ + NvPhysicalGpuHandle hPhysicalGpu; //!< Handle to Physical GPU (This could be NULL for GVI device if its not binded) + NvVioHandle hVioHandle; //!Create Stereo Handle->InitActivation->Reset Device +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! \since Release: 302 +//! +//! \param [in] stereoHandle Stereo handle corresponding to the device interface. +//! \param [in] bDelayed Use delayed activation +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, +//! they are listed below. +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED - Stereo part of NVAPI not initialized. +//! +//! 
\ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// + +//! \addtogroup stereoapi +//! @{ + +//! InitActivation Flags +typedef enum _NVAPI_STEREO_INIT_ACTIVATION_FLAGS +{ + NVAPI_STEREO_INIT_ACTIVATION_IMMEDIATE = 0X00, + NVAPI_STEREO_INIT_ACTIVATION_DELAYED = 0x01, +} NVAPI_STEREO_INIT_ACTIVATION_FLAGS; + +NVAPI_INTERFACE NvAPI_Stereo_InitActivation(__in StereoHandle hStereoHandle, __in NVAPI_STEREO_INIT_ACTIVATION_FLAGS flags); + +//! @} + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_Trigger_Activation +// +//! DESCRIPTION: This API allows an application to trigger creation of a stereo desktop, +//! in case the creation was stopped on application launch. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! \since Release: 302 +//! +//! \param [in] stereoHandle Stereo handle that corresponds to the device interface. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, +//! they are listed below. +//! \retval ::NVAPI_STEREO_INIT_ACTIVATION_NOT_DONE - Stereo InitActivation not called. +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED - Stereo part of NVAPI not initialized. +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_Trigger_Activation(__in StereoHandle hStereoHandle); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_CapturePngImage +// +//! DESCRIPTION: This API captures the current stereo image in PNG stereo format. +//! Only the last capture call per flip will be effective. +//! +//! WHEN TO USE: After the stereo handle for the device interface is created via successfull call to the appropriate NvAPI_Stereo_CreateHandleFrom() function. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [in] stereoHandle Stereo handle corresponding to the device interface. +//! +//! \retval ::NVAPI_OK Image captured. +//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE Device interface is not valid. Create again, then attach again. +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_CapturePngImage(StereoHandle stereoHandle); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_ReverseStereoBlitControl +// +//! DESCRIPTION: This API turns on/off reverse stereo blit. +//! +//! HOW TO USE: Use after the stereo handle for the device interface is created via successfull call to the appropriate +//! NvAPI_Stereo_CreateHandleFrom() function. +//! After reversed stereo blit control is turned on, blits from the stereo surface will +//! produce the right-eye image in the left side of the destination surface and the left-eye +//! image in the right side of the destination surface. +//! +//! In DirectX 9, the destination surface must be created as the render target, and StretchRect must be used. +//! Conditions: +//! - DstWidth == 2*SrcWidth +//! - DstHeight == SrcHeight +//! - Src surface is the stereo surface. +//! - SrcRect must be {0,0,SrcWidth,SrcHeight} +//! 
- DstRect must be {0,0,DstWidth,DstHeight}
+//!
+//! In DirectX 10, ResourceCopyRegion must be used.
+//! Conditions:
+//! - DstWidth == 2*SrcWidth
+//! - DstHeight == SrcHeight
+//! - dstX == 0,
+//! - dstY == 0,
+//! - dstZ == 0,
+//! - SrcBox: left=top=front==0; right==SrcWidth; bottom==SrcHeight; back==1;
+//!
+//! SUPPORTED OS: Windows Vista and higher
+//!
+//!
+//! \since Release: 185
+//!
+//! \param [in] stereoHandle  Stereo handle corresponding to the device interface.
+//! \param [in] TurnOn        != 0 : Turns on \n
+//!                           == 0 : Turns off
+//!
+//!
+//! \retval ::NVAPI_OK Reverse stereo blit control was set successfully.
+//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE Device interface is not valid. Create again, then attach again.
+//! \retval ::NVAPI_API_NOT_INTIALIZED
+//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized.
+//! \retval ::NVAPI_ERROR
+//!
+//! \ingroup stereoapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_Stereo_ReverseStereoBlitControl(StereoHandle hStereoHandle, NvU8 TurnOn);
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME: NvAPI_Stereo_SetNotificationMessage
+//
+//! DESCRIPTION: This API sets up a notification message that the stereo driver uses to notify the application
+//!              when the user changes the stereo driver state.
+//!
+//!              When the user changes the stereo state (Activated or Deactivated, separation or conversion)
+//!              the stereo driver posts a defined message with the following parameters:
+//!
+//!              lParam is the current conversion. (Actual conversion is *(float*)&lParam )
+//!
+//!              wParam == MAKEWPARAM(l, h) where
+//!              - l == 0 if stereo is deactivated
+//!              - l == 1 if stereo is activated
+//!              - h is the current separation. (Actual separation is float(h*100.f/0xFFFF) )
+//!
+//!              Call this API with NULL hWnd to prohibit notification.
+//!
+//! WHEN TO USE: Use after the stereo handle for the device interface is created via a successful call to the appropriate
+//!              NvAPI_Stereo_CreateHandleFrom() function.
+//!
+//!
+//! SUPPORTED OS: Windows Vista and higher
+//!
+//!
+//! \since Release: 180
+//!
+//!
+//! \param [in] stereoHandle  Stereo handle corresponding to the device interface.
+//! \param [in] hWnd          Window HWND that will be notified when the user changes the stereo driver state.
+//!                           Actual HWND must be cast to an NvU64.
+//! \param [in] messageID     MessageID of the message that will be posted to hWnd
+//!
+//! \retval ::NVAPI_OK Notification set.
+//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE Device interface is not valid. Create again, then attach again.
+//! \retval ::NVAPI_API_NOT_INTIALIZED
+//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized.
+//! \retval ::NVAPI_ERROR
+//!
+//! \ingroup stereoapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_Stereo_SetNotificationMessage(StereoHandle hStereoHandle, NvU64 hWnd,NvU64 messageID);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+//! \ingroup stereoapi
+#define NVAPI_STEREO_QUADBUFFERED_API_VERSION 0x2
+
+//!
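+// Example (illustrative sketch): decoding the notification described for
+// NvAPI_Stereo_SetNotificationMessage() inside a window procedure. WM_APP_STEREO_NOTIFY stands for
+// whatever application-defined message ID was registered with that call; it is not an NVAPI name.
+//
+//     case WM_APP_STEREO_NOTIFY:
+//     {
+//         bool  stereoOn   = (LOWORD(wParam) == 1);             // l: 0 = deactivated, 1 = activated
+//         float separation = HIWORD(wParam) * 100.0f / 0xFFFF;  // h, rescaled as documented above
+//         float conversion = *(float*)&lParam;                  // lParam carries the conversion bits
+//         break;
+//     }
+//
+//!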
\ingroup stereoapi + typedef enum _NV_StereoSwapChainMode + { + NVAPI_STEREO_SWAPCHAIN_DEFAULT = 0, + NVAPI_STEREO_SWAPCHAIN_STEREO = 1, + NVAPI_STEREO_SWAPCHAIN_MONO = 2, + } NV_STEREO_SWAPCHAIN_MODE; + +#if defined(__d3d10_h__) || defined(__d3d10_1_h__) || defined(__d3d11_h__) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D1x_CreateSwapChain +// +//! DESCRIPTION: This API allows the user to create a mono or a stereo swap chain. +//! +//! NOTE: NvAPI_D3D1x_CreateSwapChain is a wrapper of the method IDXGIFactory::CreateSwapChain which +//! additionally notifies the D3D driver of the mode in which stereo mode the swap chain is to be +//! created. +//! +//! \since Release: 285 +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] hStereoHandle Stereo handle that corresponds to the device interface. +//! A pointer to the device that will write 2D images to the swap chain. +//! \param [in] pDesc A pointer to the swap-chain description (DXGI_SWAP_CHAIN_DESC). This parameter cannot be NULL. +//! \param [out] ppSwapChain A pointer to the swap chain created. +//! \param [in] mode The stereo mode fot the swap chain. +//! NVAPI_STEREO_SWAPCHAIN_DEFAULT +//! NVAPI_STEREO_SWAPCHAIN_STEREO +//! NVAPI_STEREO_SWAPCHAIN_MONO +//! +//! \retval ::NVAPI_OK The swap chain was created successfully. +//! \retval ::NVAPI_ERROR The operation failed. +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D1x_CreateSwapChain(StereoHandle hStereoHandle, + DXGI_SWAP_CHAIN_DESC* pDesc, + IDXGISwapChain** ppSwapChain, + NV_STEREO_SWAPCHAIN_MODE mode); + +#endif //if defined(__d3d10_h__) || defined(__d3d10_1_h__) || defined(__d3d11_h__) + + +#if defined(_D3D9_H_) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D9_CreateSwapChain +// +//! DESCRIPTION: This API allows the user to create a mono or a stereo swap chain. +//! +//! NOTE: NvAPI_D3D9_CreateSwapChain is a wrapper of the method IDirect3DDevice9::CreateAdditionalSwapChain which +//! additionally notifies the D3D driver if the swap chain creation mode must be stereo or mono. +//! +//! +//! \since Release: 285 +//! +//! SUPPORTED OS: Windows 7 and higher +//! +//! +//! \param [in] hStereoHandle Stereo handle that corresponds to the device interface. +//! \param [in, out] pPresentationParameters A pointer to the swap-chain description (DXGI). This parameter cannot be NULL. +//! \param [out] ppSwapChain A pointer to the swap chain created. +//! \param [in] mode The stereo mode for the swap chain. +//! NVAPI_STEREO_SWAPCHAIN_DEFAULT +//! NVAPI_STEREO_SWAPCHAIN_STEREO +//! NVAPI_STEREO_SWAPCHAIN_MONO +//! +//! \retval ::NVAPI_OK The swap chain creation was successful +//! \retval ::NVAPI_ERROR The operation failed. +//! +//!\ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D9_CreateSwapChain(StereoHandle hStereoHandle, + D3DPRESENT_PARAMETERS *pPresentationParameters, + IDirect3DSwapChain9 **ppSwapChain, + NV_STEREO_SWAPCHAIN_MODE mode); +#endif //if defined(_D3D9_H_) + + + + + +//! \addtogroup drsapi +//! 
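+// Example (illustrative sketch): asking the driver for a stereo swap chain with the
+// NvAPI_D3D1x_CreateSwapChain wrapper documented above. 'desc' is a normally filled
+// DXGI_SWAP_CHAIN_DESC and 'hStereoHandle' was obtained from the appropriate
+// NvAPI_Stereo_CreateHandleFrom() function; error handling is abbreviated.
+//
+//     DXGI_SWAP_CHAIN_DESC desc = { /* usual DXGI settings */ };
+//     IDXGISwapChain *pSwapChain = NULL;
+//     NvAPI_Status status = NvAPI_D3D1x_CreateSwapChain(hStereoHandle, &desc, &pSwapChain,
+//                                                       NVAPI_STEREO_SWAPCHAIN_STEREO);
+//     if (status != NVAPI_OK)
+//     {
+//         // fall back to the plain IDXGIFactory::CreateSwapChain path
+//     }
+//
+//!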
@{ + + +// GPU Profile APIs + +NV_DECLARE_HANDLE(NvDRSSessionHandle); +NV_DECLARE_HANDLE(NvDRSProfileHandle); + +#define NVAPI_DRS_GLOBAL_PROFILE ((NvDRSProfileHandle) -1) + +#define NVAPI_SETTING_MAX_VALUES 100 + +typedef enum _NVDRS_SETTING_TYPE +{ + NVDRS_DWORD_TYPE, + NVDRS_BINARY_TYPE, + NVDRS_STRING_TYPE, + NVDRS_WSTRING_TYPE +} NVDRS_SETTING_TYPE; + +typedef enum _NVDRS_SETTING_LOCATION +{ + NVDRS_CURRENT_PROFILE_LOCATION, + NVDRS_GLOBAL_PROFILE_LOCATION, + NVDRS_BASE_PROFILE_LOCATION, + NVDRS_DEFAULT_PROFILE_LOCATION +} NVDRS_SETTING_LOCATION; + + +typedef struct _NVDRS_GPU_SUPPORT +{ + NvU32 geforce : 1; + NvU32 quadro : 1; + NvU32 nvs : 1; + NvU32 reserved4 : 1; + NvU32 reserved5 : 1; + NvU32 reserved6 : 1; + NvU32 reserved7 : 1; + NvU32 reserved8 : 1; + NvU32 reserved9 : 1; + NvU32 reserved10 : 1; + NvU32 reserved11 : 1; + NvU32 reserved12 : 1; + NvU32 reserved13 : 1; + NvU32 reserved14 : 1; + NvU32 reserved15 : 1; + NvU32 reserved16 : 1; + NvU32 reserved17 : 1; + NvU32 reserved18 : 1; + NvU32 reserved19 : 1; + NvU32 reserved20 : 1; + NvU32 reserved21 : 1; + NvU32 reserved22 : 1; + NvU32 reserved23 : 1; + NvU32 reserved24 : 1; + NvU32 reserved25 : 1; + NvU32 reserved26 : 1; + NvU32 reserved27 : 1; + NvU32 reserved28 : 1; + NvU32 reserved29 : 1; + NvU32 reserved30 : 1; + NvU32 reserved31 : 1; + NvU32 reserved32 : 1; +} NVDRS_GPU_SUPPORT; + +//! Enum to decide on the datatype of setting value. +typedef struct _NVDRS_BINARY_SETTING +{ + NvU32 valueLength; //!< valueLength should always be in number of bytes. + NvU8 valueData[NVAPI_BINARY_DATA_MAX]; +} NVDRS_BINARY_SETTING; + +typedef struct _NVDRS_SETTING_VALUES +{ + NvU32 version; //!< Structure Version + NvU32 numSettingValues; //!< Total number of values available in a setting. + NVDRS_SETTING_TYPE settingType; //!< Type of setting value. + union //!< Setting can hold either DWORD or Binary value or string. Not mixed types. + { + NvU32 u32DefaultValue; //!< Accessing default DWORD value of this setting. + NVDRS_BINARY_SETTING binaryDefaultValue; //!< Accessing default Binary value of this setting. + //!< Must be allocated by caller with valueLength specifying buffer size, or only valueLength will be filled in. + NvAPI_UnicodeString wszDefaultValue; //!< Accessing default unicode string value of this setting. + }; + union //!< Setting values can be of either DWORD, Binary values or String type, + { //!< NOT mixed types. + NvU32 u32Value; //!< All possible DWORD values for a setting + NVDRS_BINARY_SETTING binaryValue; //!< All possible Binary values for a setting + NvAPI_UnicodeString wszValue; //!< Accessing current unicode string value of this setting. + }settingValues[NVAPI_SETTING_MAX_VALUES]; +} NVDRS_SETTING_VALUES; + +//! Macro for constructing the version field of ::_NVDRS_SETTING_VALUES +#define NVDRS_SETTING_VALUES_VER MAKE_NVAPI_VERSION(NVDRS_SETTING_VALUES,1) + +typedef struct _NVDRS_SETTING_V1 +{ + NvU32 version; //!< Structure Version + NvAPI_UnicodeString settingName; //!< String name of setting + NvU32 settingId; //!< 32 bit setting Id + NVDRS_SETTING_TYPE settingType; //!< Type of setting value. + NVDRS_SETTING_LOCATION settingLocation; //!< Describes where the value in CurrentValue comes from. + NvU32 isCurrentPredefined; //!< It is different than 0 if the currentValue is a predefined Value, + //!< 0 if the currentValue is a user value. + NvU32 isPredefinedValid; //!< It is different than 0 if the PredefinedValue union contains a valid value. + union //!< Setting can hold either DWORD or Binary value or string. 
Not mixed types. + { + NvU32 u32PredefinedValue; //!< Accessing default DWORD value of this setting. + NVDRS_BINARY_SETTING binaryPredefinedValue; //!< Accessing default Binary value of this setting. + //!< Must be allocated by caller with valueLength specifying buffer size, + //!< or only valueLength will be filled in. + NvAPI_UnicodeString wszPredefinedValue; //!< Accessing default unicode string value of this setting. + }; + union //!< Setting can hold either DWORD or Binary value or string. Not mixed types. + { + NvU32 u32CurrentValue; //!< Accessing current DWORD value of this setting. + NVDRS_BINARY_SETTING binaryCurrentValue; //!< Accessing current Binary value of this setting. + //!< Must be allocated by caller with valueLength specifying buffer size, + //!< or only valueLength will be filled in. + NvAPI_UnicodeString wszCurrentValue; //!< Accessing current unicode string value of this setting. + }; +} NVDRS_SETTING_V1; + +//! Macro for constructing the version field of ::_NVDRS_SETTING +#define NVDRS_SETTING_VER1 MAKE_NVAPI_VERSION(NVDRS_SETTING_V1, 1) + +typedef NVDRS_SETTING_V1 NVDRS_SETTING; +#define NVDRS_SETTING_VER NVDRS_SETTING_VER1 + +typedef struct _NVDRS_APPLICATION_V1 +{ + NvU32 version; //!< Structure Version + NvU32 isPredefined; //!< Is the application userdefined/predefined + NvAPI_UnicodeString appName; //!< String name of the Application + NvAPI_UnicodeString userFriendlyName; //!< UserFriendly name of the Application + NvAPI_UnicodeString launcher; //!< Indicates the name (if any) of the launcher that starts the application +} NVDRS_APPLICATION_V1; + +typedef struct _NVDRS_APPLICATION_V2 +{ + NvU32 version; //!< Structure Version + NvU32 isPredefined; //!< Is the application userdefined/predefined + NvAPI_UnicodeString appName; //!< String name of the Application + NvAPI_UnicodeString userFriendlyName; //!< UserFriendly name of the Application + NvAPI_UnicodeString launcher; //!< Indicates the name (if any) of the launcher that starts the Application + NvAPI_UnicodeString fileInFolder; //!< Select this application only if this file is found. + //!< When specifying multiple files, separate them using the ':' character. +} NVDRS_APPLICATION_V2; + +typedef struct _NVDRS_APPLICATION_V3 +{ + NvU32 version; //!< Structure Version + NvU32 isPredefined; //!< Is the application userdefined/predefined + NvAPI_UnicodeString appName; //!< String name of the Application + NvAPI_UnicodeString userFriendlyName; //!< UserFriendly name of the Application + NvAPI_UnicodeString launcher; //!< Indicates the name (if any) of the launcher that starts the Application + NvAPI_UnicodeString fileInFolder; //!< Select this application only if this file is found. + //!< When specifying multiple files, separate them using the ':' character. + NvU32 isMetro:1; //!< Windows 8 style app + NvU32 isCommandLine:1; //!< Command line parsing for the application name + NvU32 reserved:30; //!< Reserved. Should be 0. 
+} NVDRS_APPLICATION_V3; + +#define NVDRS_APPLICATION_VER_V1 MAKE_NVAPI_VERSION(NVDRS_APPLICATION_V1,1) +#define NVDRS_APPLICATION_VER_V2 MAKE_NVAPI_VERSION(NVDRS_APPLICATION_V2,2) +#define NVDRS_APPLICATION_VER_V3 MAKE_NVAPI_VERSION(NVDRS_APPLICATION_V3,3) + +typedef NVDRS_APPLICATION_V3 NVDRS_APPLICATION; +#define NVDRS_APPLICATION_VER NVDRS_APPLICATION_VER_V3 + +typedef struct _NVDRS_PROFILE_V1 +{ + NvU32 version; //!< Structure Version + NvAPI_UnicodeString profileName; //!< String name of the Profile + NVDRS_GPU_SUPPORT gpuSupport; //!< This read-only flag indicates the profile support on either + //!< Quadro, or Geforce, or both. + NvU32 isPredefined; //!< Is the Profile user-defined, or predefined + NvU32 numOfApps; //!< Total number of applications that belong to this profile. Read-only + NvU32 numOfSettings; //!< Total number of settings applied for this Profile. Read-only +} NVDRS_PROFILE_V1; + +typedef NVDRS_PROFILE_V1 NVDRS_PROFILE; + +//! Macro for constructing the version field of ::NVDRS_PROFILE +#define NVDRS_PROFILE_VER1 MAKE_NVAPI_VERSION(NVDRS_PROFILE_V1,1) +#define NVDRS_PROFILE_VER NVDRS_PROFILE_VER1 + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_CreateSession +// +//! DESCRIPTION: This API allocates memory and initializes the session. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [out] *phSession Return pointer to the session handle. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR: For miscellaneous errors. +// +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_CreateSession(NvDRSSessionHandle *phSession); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_DestroySession +// +//! DESCRIPTION: This API frees the allocation: cleanup of NvDrsSession. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +// +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_DestroySession(NvDRSSessionHandle hSession); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_LoadSettings +// +//! DESCRIPTION: This API loads and parses the settings data. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +// +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_LoadSettings(NvDRSSessionHandle hSession); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_SaveSettings +// +//! DESCRIPTION: This API saves the settings data to the system. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +// +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_SaveSettings(NvDRSSessionHandle hSession); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_LoadSettingsFromFile +// +//! 
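+// Example (illustrative sketch): the usual DRS session lifecycle around the calls declared above
+// (and mirrored by the file-based variants documented next). The "modify" step would use the
+// profile/setting functions that follow later in this header; error handling is abbreviated.
+//
+//     NvDRSSessionHandle hSession = 0;
+//     if (NvAPI_DRS_CreateSession(&hSession) == NVAPI_OK)
+//     {
+//         if (NvAPI_DRS_LoadSettings(hSession) == NVAPI_OK)
+//         {
+//             // ... look up or edit profiles/settings here ...
+//             NvAPI_DRS_SaveSettings(hSession);   // persist any changes
+//         }
+//         NvAPI_DRS_DestroySession(hSession);     // always release the session
+//     }
+//
+//!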
DESCRIPTION: This API loads settings from the given file path. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle +//! \param [in] fileName Binary File Name/Path +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +// +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_LoadSettingsFromFile(NvDRSSessionHandle hSession, NvAPI_UnicodeString fileName); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_SaveSettingsToFile +// +//! DESCRIPTION: This API saves settings to the given file path. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] fileName Binary File Name/Path +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +// +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_SaveSettingsToFile(NvDRSSessionHandle hSession, NvAPI_UnicodeString fileName); + +//! @} + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_CreateProfile +// +//! DESCRIPTION: This API creates an empty profile. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] *pProfileInfo Input pointer to NVDRS_PROFILE. +//! \param [in] *phProfile Returns pointer to profile handle. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_CreateProfile(NvDRSSessionHandle hSession, NVDRS_PROFILE *pProfileInfo, NvDRSProfileHandle *phProfile); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_DeleteProfile +// +//! DESCRIPTION: This API deletes a profile or sets it back to a predefined value. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! +//! \retval ::NVAPI_OK SUCCESS if the profile is found +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_DeleteProfile(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_SetCurrentGlobalProfile +// +//! DESCRIPTION: This API sets the current global profile in the driver. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] wszGlobalProfileName Input current Global profile name. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_SetCurrentGlobalProfile(NvDRSSessionHandle hSession, NvAPI_UnicodeString wszGlobalProfileName); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_GetCurrentGlobalProfile +// +//! 
DESCRIPTION: This API returns the handle to the current global profile. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [out] *phProfile Returns current Global profile handle. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_GetCurrentGlobalProfile(NvDRSSessionHandle hSession, NvDRSProfileHandle *phProfile); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_GetProfileInfo +// +//! DESCRIPTION: This API gets information about the given profile. User needs to specify the name of the Profile. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! \param [out] *pProfileInfo Return the profile info. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_GetProfileInfo(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NVDRS_PROFILE *pProfileInfo); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_SetProfileInfo +// +//! DESCRIPTION: Specifies flags for a given profile. Currently only the NVDRS_GPU_SUPPORT is +//! used to update the profile. Neither the name, number of settings or applications +//! or other profile information can be changed with this function. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! \param [in] *pProfileInfo Input the new profile info. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_SetProfileInfo(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NVDRS_PROFILE *pProfileInfo); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_FindProfileByName +// +//! DESCRIPTION: This API finds a profile in the current session. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] profileName Input profileName. +//! \param [out] phProfile Input profile handle. +//! +//! \retval ::NVAPI_OK SUCCESS if the profile is found +//! \retval ::NVAPI_PROFILE_NOT_FOUND if profile is not found +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_FindProfileByName(NvDRSSessionHandle hSession, NvAPI_UnicodeString profileName, NvDRSProfileHandle* phProfile); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_EnumProfiles +// +//! DESCRIPTION: This API enumerates through all the profiles in the session. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] index Input the index for enumeration. +//! \param [out] *phProfile Returns profile handle. 
+//!
+//! RETURN STATUS: NVAPI_OK: SUCCESS if the profile is found
+//!                NVAPI_ERROR: For miscellaneous errors.
+//!                NVAPI_END_ENUMERATION: index exceeds the total number of available Profiles in DB.
+//!
+//! \ingroup drsapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_DRS_EnumProfiles(NvDRSSessionHandle hSession, NvU32 index, NvDRSProfileHandle *phProfile);
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME: NvAPI_DRS_GetNumProfiles
+//
+//! DESCRIPTION: This API obtains the number of profiles in the current session object.
+//!
+//! SUPPORTED OS: Windows XP and higher
+//!
+//!
+//! \param [in]  hSession     Input to the session handle.
+//! \param [out] *numProfiles Returns count of profiles in the current hSession.
+//!
+//! \retval ::NVAPI_OK SUCCESS
+//! \retval ::NVAPI_API_NOT_INTIALIZED Failed to initialize.
+//! \retval ::NVAPI_INVALID_ARGUMENT Invalid Arguments.
+//!
+//! \ingroup drsapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_DRS_GetNumProfiles(NvDRSSessionHandle hSession, NvU32 *numProfiles);
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME: NvAPI_DRS_CreateApplication
+//
+//! DESCRIPTION: This API adds an executable name to a profile.
+//!
+//! SUPPORTED OS: Windows XP and higher
+//!
+//!
+//! \param [in] hSession      Input to the session handle.
+//! \param [in] hProfile      Input profile handle.
+//! \param [in] *pApplication Input NVDRS_APPLICATION struct with the executable name to be added.
+//!
+//! \retval ::NVAPI_OK SUCCESS
+//! \retval ::NVAPI_ERROR For miscellaneous errors.
+//!
+//! \ingroup drsapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_DRS_CreateApplication(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NVDRS_APPLICATION *pApplication);
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME: NvAPI_DRS_DeleteApplicationEx
+//
+//! DESCRIPTION: This API removes an executable from a profile.
+//!
+//! SUPPORTED OS: Windows XP and higher
+//!
+//!
+//! \param [in] hSession - Input to the session handle.
+//! \param [in] hProfile - Input profile handle.
+//! \param [in] *pApp    - Input all the information about the application to be removed.
+//!
+//! \retval ::NVAPI_OK SUCCESS
+//! \retval ::NVAPI_ERROR For miscellaneous errors.
+//! \retval ::NVAPI_EXECUTABLE_PATH_IS_AMBIGUOUS If the path provided could refer to two different executables,
+//!                                              this error will be returned.
+//!
+//! \ingroup drsapi
+///////////////////////////////////////////////////////////////////////////////
+NVAPI_INTERFACE NvAPI_DRS_DeleteApplicationEx(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NVDRS_APPLICATION *pApp);
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// FUNCTION NAME: NvAPI_DRS_DeleteApplication
+//
+//! DESCRIPTION: This API removes an executable name from a profile.
+//!
+//! SUPPORTED OS: Windows XP and higher
+//!
+//!
+//! \param [in] hSession Input to the session handle.
+//! \param [in] hProfile Input profile handle.
+//! \param [in] appName  Input the executable name to be removed.
+//!
+//! \retval ::NVAPI_OK SUCCESS
+//! \retval ::NVAPI_ERROR For miscellaneous errors.
+//!
\retval ::NVAPI_EXECUTABLE_PATH_IS_AMBIGUOUS If the path provided could refer to two different executables, +//! this error will be returned +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_DeleteApplication(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NvAPI_UnicodeString appName); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_GetApplicationInfo +// +//! DESCRIPTION: This API gets information about the given application. The input application name +//! must match exactly what the Profile has stored for the application. +//! This function is better used to retrieve application information from a previous +//! enumeration. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! \param [in] appName Input application name. +//! \param [out] *pApplication Returns NVDRS_APPLICATION struct with all the attributes. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, +//! they are listed below. +//! \retval ::NVAPI_EXECUTABLE_PATH_IS_AMBIGUOUS The application name could not +// single out only one executable. +//! \retval ::NVAPI_EXECUTABLE_NOT_FOUND No application with that name is found on the profile. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_GetApplicationInfo(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NvAPI_UnicodeString appName, NVDRS_APPLICATION *pApplication); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_EnumApplications +// +//! DESCRIPTION: This API enumerates all the applications in a given profile from the starting index to the maximum length. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! \param [in] startIndex Indicates starting index for enumeration. +//! \param [in,out] *appCount Input maximum length of the passed in arrays. Returns the actual length. +//! \param [out] *pApplication Returns NVDRS_APPLICATION struct with all the attributes. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! \retval ::NVAPI_END_ENUMERATION startIndex exceeds the total appCount. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_EnumApplications(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NvU32 startIndex, NvU32 *appCount, NVDRS_APPLICATION *pApplication); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_FindApplicationByName +// +//! DESCRIPTION: This API searches the application and the associated profile for the given application name. +//! If a fully qualified path is provided, this function will always return the profile +//! the driver will apply upon running the application (on the path provided). +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the hSession handle +//! \param [in] appName Input appName. For best results, provide a fully qualified path of the type +//! 
c:/Folder1/Folder2/App.exe +//! \param [out] *phProfile Returns profile handle. +//! \param [in,out] *pApplication Returns NVDRS_APPLICATION struct pointer. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! If there are return error codes with specific meaning for this API, +//! they are listed below: +//! \retval ::NVAPI_APPLICATION_NOT_FOUND If App not found +//! \retval ::NVAPI_EXECUTABLE_PATH_IS_AMBIGUOUS If the input appName was not fully qualified, this error might return in the case of multiple matches +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_FindApplicationByName(__in NvDRSSessionHandle hSession, __in NvAPI_UnicodeString appName, __out NvDRSProfileHandle *phProfile, __inout NVDRS_APPLICATION *pApplication); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_SetSetting +// +//! DESCRIPTION: This API adds/modifies a setting to a profile. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! \param [in] *pSetting Input NVDRS_SETTING struct pointer. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_SetSetting(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NVDRS_SETTING *pSetting); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_GetSetting +// +//! DESCRIPTION: This API gets information about the given setting. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! \param [in] settingId Input settingId. +//! \param [out] *pSetting Returns all the setting info +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_GetSetting(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NvU32 settingId, NVDRS_SETTING *pSetting); + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_EnumSettings +// +//! DESCRIPTION: This API enumerates all the settings of a given profile from startIndex to the maximum length. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! \param [in] startIndex Indicates starting index for enumeration. +//! \param [in,out] *settingsCount Input max length of the passed in arrays, Returns the actual length. +//! \param [out] *pSetting Returns all the settings info. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! \retval ::NVAPI_END_ENUMERATION startIndex exceeds the total appCount. +//! +//! 
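+//! A usage sketch (illustrative only): walk every profile in a session and
+//! fetch its settings with the enumeration calls above. The session is assumed
+//! to have been created and loaded with the session APIs earlier in this header,
+//! NVDRS_SETTING / NVDRS_SETTING_VER are as defined there, and error handling
+//! is trimmed for brevity.
+//! \code
+//!   void dump_profiles(NvDRSSessionHandle hSession)
+//!   {
+//!       NvU32 nProfiles = 0;
+//!       if (NvAPI_DRS_GetNumProfiles(hSession, &nProfiles) != NVAPI_OK)
+//!           return;
+//!       NVDRS_SETTING *settings = (NVDRS_SETTING*) calloc(64, sizeof(NVDRS_SETTING));
+//!       if (!settings) return;
+//!       for (NvU32 i = 0; i < nProfiles; i++) {
+//!           NvDRSProfileHandle hProfile;
+//!           if (NvAPI_DRS_EnumProfiles(hSession, i, &hProfile) != NVAPI_OK)
+//!               break;                 // NVAPI_END_ENUMERATION past the last index
+//!           NvU32 count = 64;          // in: array capacity, out: settings returned
+//!           for (NvU32 s = 0; s < 64; s++)
+//!               settings[s].version = NVDRS_SETTING_VER;
+//!           if (NvAPI_DRS_EnumSettings(hSession, hProfile, 0, &count, settings) == NVAPI_OK) {
+//!               // settings[0..count-1] now describe this profile
+//!           }
+//!       }
+//!       free(settings);
+//!   }
+//! \endcode
+//!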
\ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_EnumSettings(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NvU32 startIndex, NvU32 *settingsCount, NVDRS_SETTING *pSetting); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_EnumAvailableSettingIds +// +//! DESCRIPTION: This API enumerates all the Ids of all the settings recognized by NVAPI. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [out] pSettingIds User-provided array of length *pMaxCount that NVAPI will fill with IDs. +//! \param [in,out] pMaxCount Input max length of the passed in array, Returns the actual length. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! NVAPI_END_ENUMERATION: the provided pMaxCount is not enough to hold all settingIds. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_EnumAvailableSettingIds(NvU32 *pSettingIds, NvU32 *pMaxCount); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_EnumAvailableSettingValues +// +//! DESCRIPTION: This API enumerates all available setting values for a given setting. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] settingId Input settingId. +//! \param [in,out] maxNumCount Input max length of the passed in arrays, Returns the actual length. +//! \param [out] *pSettingValues Returns all available setting values and its count. +//! +//! \retval ::NVAPI_OK SUCCESS +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_EnumAvailableSettingValues(NvU32 settingId, NvU32 *pMaxNumValues, NVDRS_SETTING_VALUES *pSettingValues); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_GetSettingIdFromName +// +//! DESCRIPTION: This API gets the binary ID of a setting given the setting name. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] settingName Input Unicode settingName. +//! \param [out] *pSettingId Returns corresponding settingId. +//! +//! \retval ::NVAPI_OK SUCCESS if the profile is found +//! \retval ::NVAPI_PROFILE_NOT_FOUND if profile is not found +//! \retval ::NVAPI_SETTING_NOT_FOUND if setting is not found +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_GetSettingIdFromName(NvAPI_UnicodeString settingName, NvU32 *pSettingId); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_GetSettingNameFromId +// +//! DESCRIPTION: This API gets the setting name given the binary ID. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] settingId Input settingId. +//! \param [in] *pSettingName Returns corresponding Unicode settingName. +//! +//! \retval ::NVAPI_OK SUCCESS if the profile is found +//! \retval ::NVAPI_PROFILE_NOT_FOUND if profile is not found +//! \retval ::NVAPI_SETTING_NOT_FOUND if setting is not found +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! 
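+//! A usage sketch (illustrative only): a setting ID, for example one obtained
+//! from NvAPI_DRS_GetSettingIdFromName() above, is used to read a DWORD setting
+//! from a profile and overwrite it. NVDRS_SETTING and NVDRS_DWORD_TYPE are as
+//! defined earlier in this header; error handling is trimmed.
+//! \code
+//!   NvAPI_Status set_dword_setting(NvDRSSessionHandle hSession,
+//!                                  NvDRSProfileHandle hProfile,
+//!                                  NvU32 settingId, NvU32 newValue)
+//!   {
+//!       NVDRS_SETTING setting = { 0 };
+//!       setting.version = NVDRS_SETTING_VER;
+//!       NvAPI_Status ret = NvAPI_DRS_GetSetting(hSession, hProfile, settingId, &setting);
+//!       if (ret != NVAPI_OK && ret != NVAPI_SETTING_NOT_FOUND)
+//!           return ret;
+//!       setting.version         = NVDRS_SETTING_VER;
+//!       setting.settingId       = settingId;
+//!       setting.settingType     = NVDRS_DWORD_TYPE;
+//!       setting.u32CurrentValue = newValue;
+//!       ret = NvAPI_DRS_SetSetting(hSession, hProfile, &setting);
+//!       // NvAPI_DRS_SaveSettings(hSession) would then persist the change
+//!       return ret;
+//!   }
+//! \endcode
+//!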
\ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_GetSettingNameFromId(NvU32 settingId, NvAPI_UnicodeString *pSettingName); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_DeleteProfileSetting +// +//! DESCRIPTION: This API deletes a setting or sets it back to predefined value. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! \param [in] settingId Input settingId to be deleted. +//! +//! \retval ::NVAPI_OK SUCCESS if the profile is found +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_DeleteProfileSetting(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NvU32 settingId); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_RestoreAllDefaults +// +//! DESCRIPTION: This API restores the whole system to predefined(default) values. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! +//! \retval ::NVAPI_OK SUCCESS if the profile is found +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_RestoreAllDefaults(NvDRSSessionHandle hSession); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_RestoreProfileDefault +// +//! DESCRIPTION: This API restores the given profile to predefined(default) values. +//! Any and all user specified modifications will be removed. +//! If the whole profile was set by the user, the profile will be removed. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! +//! \retval ::NVAPI_OK SUCCESS if the profile is found +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! \retval ::NVAPI_PROFILE_REMOVED SUCCESS, and the hProfile is no longer valid. +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_RestoreProfileDefault(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_RestoreProfileDefaultSetting +// +//! DESCRIPTION: This API restores the given profile setting to predefined(default) values. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] hProfile Input profile handle. +//! \param [in] settingId Input settingId. +//! +//! \retval ::NVAPI_OK SUCCESS if the profile is found +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_RestoreProfileDefaultSetting(NvDRSSessionHandle hSession, NvDRSProfileHandle hProfile, NvU32 settingId); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DRS_GetBaseProfile +// +//! 
DESCRIPTION: Returns the handle to the current global profile. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hSession Input to the session handle. +//! \param [in] *phProfile Returns Base profile handle. +//! +//! \retval ::NVAPI_OK SUCCESS if the profile is found +//! \retval ::NVAPI_ERROR For miscellaneous errors. +//! +//! \ingroup drsapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DRS_GetBaseProfile(NvDRSSessionHandle hSession, NvDRSProfileHandle *phProfile); + + + + +//! \addtogroup sysgeneral +//! @{ + +typedef struct +{ + NvU32 version; //!< structure version + NvU32 vendorId; //!< Chipset vendor identification + NvU32 deviceId; //!< Chipset device identification + NvAPI_ShortString szVendorName; //!< Chipset vendor Name + NvAPI_ShortString szChipsetName; //!< Chipset device Name + NvU32 flags; //!< Chipset info flags - obsolete + NvU32 subSysVendorId; //!< Chipset subsystem vendor identification + NvU32 subSysDeviceId; //!< Chipset subsystem device identification + NvAPI_ShortString szSubSysVendorName; //!< subsystem vendor Name + NvU32 HBvendorId; //!< Host bridge vendor identification + NvU32 HBdeviceId; //!< Host bridge device identification + NvU32 HBsubSysVendorId; //!< Host bridge subsystem vendor identification + NvU32 HBsubSysDeviceId; //!< Host bridge subsystem device identification + +} NV_CHIPSET_INFO_v4; + +typedef struct +{ + NvU32 version; //!< structure version + NvU32 vendorId; //!< vendor ID + NvU32 deviceId; //!< device ID + NvAPI_ShortString szVendorName; //!< vendor Name + NvAPI_ShortString szChipsetName; //!< device Name + NvU32 flags; //!< Chipset info flags - obsolete + NvU32 subSysVendorId; //!< subsystem vendor ID + NvU32 subSysDeviceId; //!< subsystem device ID + NvAPI_ShortString szSubSysVendorName; //!< subsystem vendor Name +} NV_CHIPSET_INFO_v3; + +typedef enum +{ + NV_CHIPSET_INFO_HYBRID = 0x00000001, +} NV_CHIPSET_INFO_FLAGS; + +typedef struct +{ + NvU32 version; //!< structure version + NvU32 vendorId; //!< vendor ID + NvU32 deviceId; //!< device ID + NvAPI_ShortString szVendorName; //!< vendor Name + NvAPI_ShortString szChipsetName; //!< device Name + NvU32 flags; //!< Chipset info flags +} NV_CHIPSET_INFO_v2; + +typedef struct +{ + NvU32 version; //structure version + NvU32 vendorId; //vendor ID + NvU32 deviceId; //device ID + NvAPI_ShortString szVendorName; //vendor Name + NvAPI_ShortString szChipsetName; //device Name +} NV_CHIPSET_INFO_v1; + +#define NV_CHIPSET_INFO_VER_1 MAKE_NVAPI_VERSION(NV_CHIPSET_INFO_v1,1) +#define NV_CHIPSET_INFO_VER_2 MAKE_NVAPI_VERSION(NV_CHIPSET_INFO_v2,2) +#define NV_CHIPSET_INFO_VER_3 MAKE_NVAPI_VERSION(NV_CHIPSET_INFO_v3,3) +#define NV_CHIPSET_INFO_VER_4 MAKE_NVAPI_VERSION(NV_CHIPSET_INFO_v4,4) + +#define NV_CHIPSET_INFO NV_CHIPSET_INFO_v4 +#define NV_CHIPSET_INFO_VER NV_CHIPSET_INFO_VER_4 + +//! @} + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_SYS_GetChipSetInfo +// +//! This function returns information about the system's chipset. +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \since Release: 95 +//! +//! \retval NVAPI_INVALID_ARGUMENT pChipSetInfo is NULL. +//! \retval NVAPI_OK *pChipSetInfo is now set. +//! \retval NVAPI_INCOMPATIBLE_STRUCT_VERSION NV_CHIPSET_INFO version not compatible with driver. +//! 
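+//! A minimal calling sketch (illustrative; assumes NvAPI_Initialize() has
+//! already succeeded) showing the usual versioned-struct pattern: zero the
+//! struct, set its .version, then call.
+//! \code
+//!   NV_CHIPSET_INFO chipset = { 0 };
+//!   chipset.version = NV_CHIPSET_INFO_VER;
+//!   if (NvAPI_SYS_GetChipSetInfo(&chipset) == NVAPI_OK) {
+//!       // chipset.szVendorName / chipset.szChipsetName hold the names,
+//!       // chipset.vendorId / chipset.deviceId the PCI identifiers
+//!   }
+//! \endcode
+//!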
\ingroup sysgeneral +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_SYS_GetChipSetInfo(NV_CHIPSET_INFO *pChipSetInfo); + + +//! \ingroup sysgeneral +//! Lid and dock information - used in NvAPI_GetLidDockInfo() +typedef struct +{ + NvU32 version; //! Structure version, constructed from the macro #NV_LID_DOCK_PARAMS_VER + NvU32 currentLidState; + NvU32 currentDockState; + NvU32 currentLidPolicy; + NvU32 currentDockPolicy; + NvU32 forcedLidMechanismPresent; + NvU32 forcedDockMechanismPresent; +}NV_LID_DOCK_PARAMS; + + +//! ingroup sysgeneral +#define NV_LID_DOCK_PARAMS_VER MAKE_NVAPI_VERSION(NV_LID_DOCK_PARAMS,1) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GetLidDockInfo +// +//! DESCRIPTION: This function returns the current lid and dock information. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 177 +//! +//! \retval ::NVAPI_OK +//! \retval ::NVAPI_ERROR +//! \retval ::NVAPI_NOT_SUPPORTED +//! \retval ::NVAPI_HANDLE_INVALIDATED +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! +//! \ingroup sysgeneral +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_SYS_GetLidAndDockInfo(NV_LID_DOCK_PARAMS *pLidAndDock); + + + + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_SYS_GetDisplayIdFromGpuAndOutputId +// +//! DESCRIPTION: This API converts a Physical GPU handle and output ID to a +//! display ID. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] hPhysicalGpu Handle to the physical GPU +//! \param [in] outputId Connected display output ID on the +//! target GPU - must only have one bit set +//! \param [out] displayId Pointer to an NvU32 which contains +//! the display ID +//! +//! \retval ::NVAPI_OK - completed request +//! \retval ::NVAPI_API_NOT_INTIALIZED - NVAPI not initialized +//! \retval ::NVAPI_ERROR - miscellaneous error occurred +//! \retval ::NVAPI_INVALID_ARGUMENT - Invalid input parameter. +//! +//! \ingroup sysgeneral +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_SYS_GetDisplayIdFromGpuAndOutputId(NvPhysicalGpuHandle hPhysicalGpu, NvU32 outputId, NvU32* displayId); + + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_SYS_GetGpuAndOutputIdFromDisplayId +// +//! DESCRIPTION: This API converts a display ID to a Physical GPU handle and output ID. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [in] displayId Display ID of display to retrieve +//! GPU and outputId for +//! \param [out] hPhysicalGpu Handle to the physical GPU +//! \param [out] outputId ) Connected display output ID on the +//! target GPU will only have one bit set. +//! +//! \retval ::NVAPI_OK +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_ID_OUT_OF_RANGE The DisplayId corresponds to a +//! display which is not within the +//! normal outputId range. +//! \retval ::NVAPI_ERROR +//! \retval ::NVAPI_INVALID_ARGUMENT +//! +//! \ingroup sysgeneral +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_SYS_GetGpuAndOutputIdFromDisplayId(NvU32 displayId, NvPhysicalGpuHandle *hPhysicalGpu, NvU32 *outputId); + + +/////////////////////////////////////////////////////////////////////////////// +// FUNCTION NAME: NvAPI_SYS_GetPhysicalGpuFromDisplayId +// +//! 
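+//! A brief usage sketch (illustrative; the displayId is a hypothetical value
+//! that would normally come from a display enumeration call or from
+//! NvAPI_SYS_GetDisplayIdFromGpuAndOutputId above):
+//! \code
+//!   NvU32 displayId = 0;                  // hypothetical, obtained elsewhere
+//!   NvPhysicalGpuHandle hGpu = 0;
+//!   if (NvAPI_SYS_GetPhysicalGpuFromDisplayId(displayId, &hGpu) == NVAPI_OK) {
+//!       // hGpu now addresses the GPU driving that display
+//!   }
+//! \endcode
+//!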
\code +//! DESCRIPTION: This API retrieves the Physical GPU handle of the connected display +//! +//! \since Release: 313 +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! PARAMETERS: displayId(IN) - Display ID of display to retrieve +//! GPU handle +//! hPhysicalGpu(OUT) - Handle to the physical GPU +//! +//! RETURN STATUS: +//! NVAPI_OK - completed request +//! NVAPI_API_NOT_INTIALIZED - NVAPI not initialized +//! NVAPI_ERROR - miscellaneous error occurred +//! NVAPI_INVALID_ARGUMENT - Invalid input parameter. +//! \endcode +//! \ingroup sysgeneral +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_SYS_GetPhysicalGpuFromDisplayId(NvU32 displayId, NvPhysicalGpuHandle *hPhysicalGpu); + + + + +#ifdef __cplusplus +}; //extern "C" { + +#endif + +#pragma pack(pop) + +#endif // _NVAPI_H + +#include"nvapi_lite_salend.h" diff --git a/compat/nvapi/nvapi_ccminer.h b/compat/nvapi/nvapi_ccminer.h new file mode 100644 index 0000000000..5bba84bc2d --- /dev/null +++ b/compat/nvapi/nvapi_ccminer.h @@ -0,0 +1,371 @@ +#pragma once + +#define NVAPI_INTERNAL +#include "nvapi.h" + +NvAPI_Status nvapi_dll_init(); + +typedef struct { + NvU32 version; + NvU8 valid; + NvU8 count; + struct + { + NvU32 pstate; // Assumption + NvU32 unknown1[2]; + NvU32 min_power; + NvU32 unknown2[2]; + NvU32 def_power; + NvU32 unknown3[2]; + NvU32 max_power; + NvU32 unknown4; // 0 + } entries[4]; +} NVAPI_GPU_POWER_INFO; +#define NVAPI_GPU_POWER_INFO_VER MAKE_NVAPI_VERSION(NVAPI_GPU_POWER_INFO, 1) + +typedef struct { + NvU32 version; + NvU32 flags; + struct { + NvU32 unknown1; + NvU32 unknown2; + NvU32 power; // percent * 1000 + NvU32 unknown4; + } entries[4]; +} NVAPI_GPU_POWER_STATUS; +#define NVAPI_GPU_POWER_STATUS_VER MAKE_NVAPI_VERSION(NVAPI_GPU_POWER_STATUS, 1) + +typedef struct { + NvU32 version; + NvU32 count; + struct { + NvU32 unknown1; + NvU32 unknown2; + NvU32 power; // unsure ?? 85536 to 95055 on 1080, 104825+ on 970 + NvU32 unknown4; + } entries[4]; +} NVAPI_GPU_POWER_TOPO; +#define NVAPI_GPU_POWER_TOPO_VER MAKE_NVAPI_VERSION(NVAPI_GPU_POWER_TOPO, 1) + +typedef struct { + NvU32 version; + NvU32 flags; + struct { + NvU32 controller; + NvU32 unknown; + NvS32 min_temp; + NvS32 def_temp; + NvS32 max_temp; + NvU32 defaultFlags; + } entries[4]; +} NVAPI_GPU_THERMAL_INFO; +#define NVAPI_GPU_THERMAL_INFO_VER MAKE_NVAPI_VERSION(NVAPI_GPU_THERMAL_INFO, 2) + +typedef struct { + NvU32 version; + NvU32 flags; + struct { + NvU32 controller; + NvU32 value; + NvU32 flags; + } entries[4]; +} NVAPI_GPU_THERMAL_LIMIT; +#define NVAPI_GPU_THERMAL_LIMIT_VER MAKE_NVAPI_VERSION(NVAPI_GPU_THERMAL_LIMIT, 2) + +// Maxwell gpu core voltage reading +typedef struct { + NvU32 version; + NvU32 flags; + NvU32 count; // unsure + NvU32 unknown; + NvU32 value_uV; + NvU32 buf1[30]; +} NVAPI_VOLT_STATUS; // 140 bytes (1-008c) +#define NVAPI_VOLT_STATUS_VER MAKE_NVAPI_VERSION(NVAPI_VOLT_STATUS, 1) + +// Pascal gpu core voltage reading +typedef struct { + NvU32 version; + NvU32 flags; + NvU32 nul[8]; + NvU32 value_uV; + NvU32 buf1[8]; +} NVAPI_VOLTAGE_STATUS; // 76 bytes (1-004c) +#define NVAPI_VOLTAGE_STATUS_VER MAKE_NVAPI_VERSION(NVAPI_VOLTAGE_STATUS, 1) + +typedef struct { + NvU32 version; + NvU32 numClocks; // unsure + NvU32 nul[8]; + struct { + NvU32 a; + NvU32 clockType; + NvU32 c; + NvU32 d; + NvU32 e; + NvU32 f; + NvU32 g; + NvU32 h; + NvU32 i; + NvU32 j; + NvS32 rangeMax; + NvS32 rangeMin; + NvS32 tempMax; // ? 
unsure + NvU32 n; + NvU32 o; + NvU32 p; + NvU32 q; + NvU32 r; + } entries[32]; // NVAPI_MAX_GPU_CLOCKS ? +} NVAPI_CLOCKS_RANGE; // 2344 bytes +#define NVAPI_CLOCKS_RANGE_VER MAKE_NVAPI_VERSION(NVAPI_CLOCKS_RANGE, 1) + +// seems to return a clock table mask +typedef struct { + NvU32 version; + NvU32 mask[4]; // 80 bits mask + NvU32 buf0[8]; + struct { + NvU32 a; + NvU32 b; + NvU32 c; + NvU32 d; + NvU32 memDelta; // 1 for mem + NvU32 gpuDelta; // 1 for gpu + } clocks[80 + 23]; + NvU32 buf1[916]; +} NVAPI_CLOCK_MASKS; // 6188 bytes +#define NVAPI_CLOCK_MASKS_VER MAKE_NVAPI_VERSION(NVAPI_CLOCK_MASKS, 1) + +// contains the gpu/mem clocks deltas +typedef struct { + NvU32 version; + NvU32 mask[4]; // 80 bits mask (could be 8x 32bits) + NvU32 buf0[12]; + struct { + NvU32 a; + NvU32 b; + NvU32 c; + NvU32 d; + NvU32 e; + NvS32 freqDelta; // 84000 = +84MHz + NvU32 g; + NvU32 h; + NvU32 i; + } gpuDeltas[80]; + NvU32 memFilled[23]; // maybe only 4 max + NvS32 memDeltas[23]; + NvU32 buf1[1529]; +} NVAPI_CLOCK_TABLE; // 9248 bytes +#define NVAPI_CLOCK_TABLE_VER MAKE_NVAPI_VERSION(NVAPI_CLOCK_TABLE, 1) + +typedef struct { + NvU32 version; + NvU32 mask[4]; // 80 bits mask + NvU32 buf0[12]; + struct { + NvU32 a; // 0 + NvU32 freq_kHz; + NvU32 volt_uV; + NvU32 d; + NvU32 e; + NvU32 f; + NvU32 g; + } gpuEntries[80]; + struct { + NvU32 a; // 1 for idle values ? + NvU32 freq_kHz; + NvU32 volt_uV; + NvU32 d; + NvU32 e; + NvU32 f; + NvU32 g; + } memEntries[23]; + NvU32 buf1[1064]; +} NVAPI_VFP_CURVE; // 7208 bytes (1-1c28) +#define NVAPI_VFP_CURVE_VER MAKE_NVAPI_VERSION(NVAPI_VFP_CURVE, 1) + +typedef struct { + NvU32 version; + NvS32 percent; + NvU32 pad[8]; +} NVAPI_VOLTBOOST_PERCENT; // 40 bytes (1-0028) +#define NVAPI_VOLTBOOST_PERCENT_VER MAKE_NVAPI_VERSION(NVAPI_VOLTBOOST_PERCENT, 1) + +typedef struct { + NvU32 version; + NvU32 flags; + NvU32 filled; // 1 + struct { + NvU32 volt_uV; + NvU32 unknown; + } entries[128]; + // some empty tables then... + NvU32 buf1[3888]; +} NVAPI_VOLTAGES_TABLE; // 16588 bytes (1-40cc) +#define NVAPI_VOLTAGES_TABLE_VER MAKE_NVAPI_VERSION(NVAPI_VOLTAGES_TABLE, 1) + +typedef struct { + NvU32 version; + NvU32 val1; // 7 + NvU32 val2; // 0x3F (63.) + NvU32 pad[16]; +} NVAPI_GPU_PERF_INFO; // 76 bytes (1-004c) +#define NVAPI_GPU_PERF_INFO_VER MAKE_NVAPI_VERSION(NVAPI_GPU_PERF_INFO, 1) + +typedef struct { + NvU32 version; + NvU32 flags; // 0 + NvU64 timeRef; // increment with time + NvU64 val1; // seen 1 4 5 while mining, 16 else + NvU64 val2; // seen 7 and 3 + NvU64 values[3]; // increment with time + NvU32 pad[326]; // empty +} NVAPI_GPU_PERF_STATUS; // 1360 bytes (1-0550) +#define NVAPI_GPU_PERF_STATUS_VER MAKE_NVAPI_VERSION(NVAPI_GPU_PERF_STATUS, 1) + +typedef struct { + NvU32 version; + NvU32 val1; // 4 + NvU32 val2; // 2 or 0 + NvU32 val3; // 2 + NvU32 val4; // 3 + NV_GPU_PERF_PSTATE_ID pStateId; + NvU32 val6; // 0 or 2 + NvU32 val7; // 4 + NvU32 val8; // 0 + NvU32 memFreq1; // 405000. + NvU32 memFreq2; // 405000. + NvU32 memFreqMin;// 101250. + NvU32 memFreqMax;// 486000. + NvU32 zeros[3]; + NvU32 gpuFreq1; // 696000. Unsure about those + NvU32 gpuFreq2; // 696000. + NvU32 gpuFreqMin;// 174000. + NvU32 gpuFreqMax;// 658000. 
+ NvU32 pad[2697]; +} NVAPI_GPU_PERF_CLOCKS; // 10868 bytes (2-2a74) +#define NVAPI_GPU_PERF_CLOCKS_VER MAKE_NVAPI_VERSION(NVAPI_GPU_PERF_CLOCKS, 2) + +typedef struct { + NvU32 version; + NvU32 level; + NvU32 count; + NvU32 pad[339]; // (4-0558) +} NVAPI_COOLER_SETTINGS; +#define NVAPI_COOLER_SETTINGS_VER MAKE_NVAPI_VERSION(NVAPI_COOLER_SETTINGS, 4) + +typedef struct { + NvU32 version; + NvU32 level; // 0 = auto ? + NvU32 count; // 1 + NvU32 pad[38]; // (1-00a4) +} NVAPI_COOLER_LEVEL; +#define NVAPI_COOLER_LEVEL_VER MAKE_NVAPI_VERSION(NVAPI_COOLER_LEVEL, 1) + +NvAPI_Status NvAPI_DLL_GetInterfaceVersionString(NvAPI_ShortString string); + +NvAPI_Status NvAPI_DLL_PerfPoliciesGetInfo(NvPhysicalGpuHandle, NVAPI_GPU_PERF_INFO*); // 409D9841 1-004c +NvAPI_Status NvAPI_DLL_PerfPoliciesGetStatus(NvPhysicalGpuHandle, NVAPI_GPU_PERF_STATUS*); // 3D358A0C 1-0550 + +NvAPI_Status NvAPI_DLL_ClientPowerPoliciesGetInfo(NvPhysicalGpuHandle, NVAPI_GPU_POWER_INFO*); +NvAPI_Status NvAPI_DLL_ClientPowerPoliciesGetStatus(NvPhysicalGpuHandle, NVAPI_GPU_POWER_STATUS*); +NvAPI_Status NvAPI_DLL_ClientPowerPoliciesSetStatus(NvPhysicalGpuHandle, NVAPI_GPU_POWER_STATUS*); +NvAPI_Status NvAPI_DLL_ClientPowerTopologyGetStatus(NvPhysicalGpuHandle, NVAPI_GPU_POWER_TOPO*); // EDCF624E 1-0048 + +NvAPI_Status NvAPI_DLL_ClientThermalPoliciesGetInfo(NvPhysicalGpuHandle, NVAPI_GPU_THERMAL_INFO*); +NvAPI_Status NvAPI_DLL_ClientThermalPoliciesGetLimit(NvPhysicalGpuHandle, NVAPI_GPU_THERMAL_LIMIT*); +NvAPI_Status NvAPI_DLL_ClientThermalPoliciesSetLimit(NvPhysicalGpuHandle, NVAPI_GPU_THERMAL_LIMIT*); + +// Pascal GTX only +NvAPI_Status NvAPI_DLL_GetClockBoostRanges(NvPhysicalGpuHandle, NVAPI_CLOCKS_RANGE*); +NvAPI_Status NvAPI_DLL_GetClockBoostMask(NvPhysicalGpuHandle, NVAPI_CLOCK_MASKS*); // 0x507B4B59 +NvAPI_Status NvAPI_DLL_GetClockBoostTable(NvPhysicalGpuHandle, NVAPI_CLOCK_TABLE*); // 0x23F1B133 +NvAPI_Status NvAPI_DLL_SetClockBoostTable(NvPhysicalGpuHandle, NVAPI_CLOCK_TABLE*); // 0x0733E009 +NvAPI_Status NvAPI_DLL_GetVFPCurve(NvPhysicalGpuHandle, NVAPI_VFP_CURVE*); // 0x21537AD4 +NvAPI_Status NvAPI_DLL_GetCurrentVoltage(NvPhysicalGpuHandle, NVAPI_VOLTAGE_STATUS*); // 0x465F9BCF 1-004c +NvAPI_Status NvAPI_DLL_GetCoreVoltageBoostPercent(NvPhysicalGpuHandle, NVAPI_VOLTBOOST_PERCENT*); +NvAPI_Status NvAPI_DLL_SetCoreVoltageBoostPercent(NvPhysicalGpuHandle, NVAPI_VOLTBOOST_PERCENT*); + +// Maxwell only +NvAPI_Status NvAPI_DLL_GetVoltageDomainsStatus(NvPhysicalGpuHandle, NVAPI_VOLT_STATUS*); // 0xC16C7E2C +NvAPI_Status NvAPI_DLL_GetVoltages(NvPhysicalGpuHandle, NVAPI_VOLTAGES_TABLE*); // 0x7D656244 1-40CC +NvAPI_Status NvAPI_DLL_GetVoltageStep(NvPhysicalGpuHandle, NVAPI_VOLT_STATUS*); // 0x28766157 1-008C unsure of the name + +NvAPI_Status NvAPI_DLL_GetCoolerSettings(NvPhysicalGpuHandle, uint32_t, NVAPI_COOLER_SETTINGS*); // 0xDA141340 4-0558 +NvAPI_Status NvAPI_DLL_SetCoolerLevels(NvPhysicalGpuHandle, uint32_t, NVAPI_COOLER_LEVEL*); // 0x891FA0AE 1-00A4 +NvAPI_Status NvAPI_DLL_RestoreCoolerSettings(NvPhysicalGpuHandle, NVAPI_COOLER_SETTINGS*, uint32_t); + +NvAPI_Status NvAPI_DLL_GetSerialNumber(NvPhysicalGpuHandle, NvAPI_ShortString serial); + +NvAPI_Status NvAPI_DLL_GetPerfClocks(NvPhysicalGpuHandle, uint32_t num, NVAPI_GPU_PERF_CLOCKS* pClocks); // 2-2A74 +//NvAPI_Status NvAPI_DLL_SetPerfClocks(NvPhysicalGpuHandle, uint32_t num, NVAPI_GPU_PERF_CLOCKS* pClocks); // error + +//invalid.. 
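+
+/*
+  Illustrative usage sketch (assumptions marked below, error handling trimmed):
+  raise the board power limit and pin the fan with the private entry points
+  declared above, the way a miner typically does.  Power values are stored as
+  percent * 1000, within the min/max advertised by NVAPI_GPU_POWER_INFO, and
+  each struct follows the usual pattern: zero it, set .version, then call.
+
+    static void limit_gpu(NvPhysicalGpuHandle gpu)   // handle from NvAPI_EnumPhysicalGPUs()
+    {
+        NVAPI_GPU_POWER_STATUS pol = { 0 };
+        pol.version = NVAPI_GPU_POWER_STATUS_VER;
+        if (NvAPI_DLL_ClientPowerPoliciesGetStatus(gpu, &pol) == NVAPI_OK) {
+            pol.entries[0].power = 80 * 1000;        // 80% of the default TDP
+            NvAPI_DLL_ClientPowerPoliciesSetStatus(gpu, &pol);
+        }
+
+        NVAPI_COOLER_LEVEL cooler = { 0 };
+        cooler.version = NVAPI_COOLER_LEVEL_VER;
+        cooler.count = 1;
+        cooler.level = 60;                           // 60% fan, 0 = auto (per the comment above)
+        NvAPI_DLL_SetCoolerLevels(gpu, 0, &cooler);  // second arg: cooler index (assumption)
+    }
+*/
+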
+//NvAPI_Status NvAPI_DLL_GetPstateClientLimits(NvPhysicalGpuHandle, NV_GPU_PERF_PSTATE_ID, uint32_t* pLimits); +//NvAPI_Status NvAPI_DLL_SetPstateClientLimits(NvPhysicalGpuHandle, NV_GPU_PERF_PSTATE_ID, uint32_t* pLimits); + +NvAPI_Status NvAPI_DLL_SetPstates20v1(NvPhysicalGpuHandle handle, NV_GPU_PERF_PSTATES20_INFO_V1 *pSet); +NvAPI_Status NvAPI_DLL_SetPstates20v2(NvPhysicalGpuHandle handle, NV_GPU_PERF_PSTATES20_INFO_V2 *pSet); + +NvAPI_Status NvAPI_DLL_Unload(); + +#define NV_ASSERT(x) { NvAPI_Status ret = x; if(ret != NVAPI_OK) return ret; } + +// to reduce stack size, allow to reuse a mem buffer +#define NV_INIT_STRUCT_ON(TYPE, var, mem) { \ + var = (TYPE*) mem; \ + memset(var, 0, sizeof(TYPE)); \ + var->version = TYPE##_VER; \ +} + +// alloc a struct, need free(var) +#define NV_INIT_STRUCT_ALLOC(TYPE, var) { \ + var = (TYPE*) calloc(1, TYPE##_VER & 0xFFFF); \ + if (var) var->version = TYPE##_VER; \ +} + +//! Used in NvAPI_I2CReadEx() +typedef struct +{ + NvU32 version; + NvU32 displayMask; // Display Mask of the concerned display. + NvU8 bIsDDCPort; // indicates either the DDC port (TRUE) or the communication port (FALSE) of the concerned display. + NvU8 i2cDevAddress; // address of the I2C slave. The address should be shifted left by one. 0x50 -> 0xA0. + NvU8* pbI2cRegAddress; // I2C target register address. May be NULL, which indicates no register address should be sent. + NvU32 regAddrSize; // size in bytes of target register address. If pbI2cRegAddress is NULL, this field must be 0. + NvU8* pbData; // buffer of data which is to be read or written (depending on the command). + NvU32 cbRead; // bytes to read ??? seems required on write too + NvU32 cbSize; // full size of the data buffer, pbData, to be read or written. + NV_I2C_SPEED i2cSpeedKhz; // target speed of the transaction in (kHz) (Chosen from the enum NV_I2C_SPEED). 
+ NvU8 portId; // portid on which device is connected (remember to set bIsPortIdSet if this value is set) + NvU32 bIsPortIdSet; // set this flag on if and only if portid value is set + +} NV_I2C_INFO_EX; +#define NV_I2C_INFO_EX_VER MAKE_NVAPI_VERSION(NV_I2C_INFO_EX,3) +/* +sample evga x64 call (struct of 0x40 bytes) +ReadEx +$ ==> 40 00 03 00 00 00 00 00 00 40 00 00 00 00 00 00 +$+10 58 F9 2B 00 00 00 00 00 01 00 00 00 00 00 00 00 +$+20 C0 F9 2B 00 00 00 00 00 02 00 00 00 FF FF 00 00 +$+30 00 00 00 00 02 00 00 00 01 00 00 00 00 00 00 00 + +$ ==> 40 00 03 00 00 00 00 00 00 10 00 00 00 00 00 00 +$+10 68 F9 2B 00 00 00 00 00 01 00 00 00 00 00 00 00 +$+20 C0 F9 2B 00 00 00 00 00 01 00 00 00 FF FF 00 00 +$+30 00 00 00 00 01 00 00 00 01 00 00 00 00 00 00 00 +00000000002BF968 > 75 83 CF 3F 01 00 00 00 +00000000002BF9C0 > 0 + +WriteEx +$ ==> 40 00 03 00 00 00 00 00 00 8C 00 00 00 00 00 00 +$+10 30 F9 2B 00 00 00 00 00 01 00 00 00 00 00 00 00 +$+20 38 F9 2B 00 00 00 00 00 02 00 00 00 FF FF 00 00 +$+30 00 00 00 00 01 00 00 00 01 00 00 00 00 00 00 00 +00000000002BF930 > D1 00 00 00 00 00 00 00 +00000000002BF938 > 38 00 00 00 00 00 00 00 +*/ + +NvAPI_Status NvAPI_DLL_I2CReadEx(NvPhysicalGpuHandle, NV_I2C_INFO_EX*, NvU32*); +NvAPI_Status NvAPI_DLL_I2CWriteEx(NvPhysicalGpuHandle, NV_I2C_INFO_EX*, NvU32*); diff --git a/compat/nvapi/nvapi_lite_common.h b/compat/nvapi/nvapi_lite_common.h new file mode 100644 index 0000000000..e31463afdd --- /dev/null +++ b/compat/nvapi/nvapi_lite_common.h @@ -0,0 +1,541 @@ + /************************************************************************************************************************************\ +|* *| +|* Copyright © 2012 NVIDIA Corporation. All rights reserved. *| +|* *| +|* NOTICE TO USER: *| +|* *| +|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *| +|* *| +|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *| +|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *| +|* Otherwise, you have no rights to use or access this software in any manner. *| +|* *| +|* If not covered by the applicable NVIDIA software license agreement: *| +|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *| +|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *| +|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *| +|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. *| +|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *| +|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *| +|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *| +|* *| +|* U.S. Government End Users. *| +|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *| +|* consisting of "commercial computer software" and "commercial computer software documentation" *| +|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *| +|* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *| +|* all U.S. Government End Users acquire the software with only those rights set forth herein. 
*| +|* *| +|* Any use of this software in individual and commercial software must include, *| +|* in the user documentation and internal comments to the code, *| +|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *| +|* *| + \************************************************************************************************************************************/ +#pragma once +#include"nvapi_lite_salstart.h" +#pragma pack(push,8) +#ifdef __cplusplus +extern "C" { +#endif +#if (defined(WIN32) || defined(_WIN32)) && defined(_MSC_VER) && (_MSC_VER > 1399) && !defined(NVAPI_INTERNAL) && !defined(NVAPI_DEPRECATED_OLD) +#ifndef __nvapi_deprecated_function +#define __nvapi_deprecated_function(message) __declspec(deprecated(message)) +#endif +#ifndef __nvapi_deprecated_datatype +#define __nvapi_deprecated_datatype(FirstRelease) __declspec(deprecated("Do not use this data type - it is deprecated in release " #FirstRelease ".")) +#endif +#else +#ifndef __nvapi_deprecated_function +#define __nvapi_deprecated_function(message) +#endif +#ifndef __nvapi_deprecated_datatype +#define __nvapi_deprecated_datatype(FirstRelease) +#endif +#endif + + +/* 64-bit types for compilers that support them, plus some obsolete variants */ +#if defined(__GNUC__) || defined(__arm) || defined(__IAR_SYSTEMS_ICC__) || defined(__ghs__) || defined(_WIN64) +typedef unsigned long long NvU64; /* 0 to 18446744073709551615 */ +typedef long long NvS64; /* -9223372036854775808 to 9223372036854775807 */ +#else +typedef unsigned __int64 NvU64; /* 0 to 18446744073709551615 */ +typedef __int64 NvS64; /* -9223372036854775808 to 9223372036854775807 */ +#endif + +// mac os 32-bit still needs this +#if (defined(macintosh) || defined(__APPLE__)) && !defined(__LP64__) +typedef signed long NvS32; /* -2147483648 to 2147483647 */ +#else +typedef signed int NvS32; /* -2147483648 to 2147483647 */ +#endif + +#ifndef __unix +// mac os 32-bit still needs this +#if ( (defined(macintosh) && defined(__LP64__) && (__NVAPI_RESERVED0__)) || \ + (!defined(macintosh) && defined(__NVAPI_RESERVED0__)) ) +typedef unsigned int NvU32; /* 0 to 4294967295 */ +#else +typedef unsigned long NvU32; /* 0 to 4294967295 */ +#endif +#else +typedef unsigned int NvU32; /* 0 to 4294967295 */ +#endif + +typedef unsigned long temp_NvU32; /* 0 to 4294967295 */ +typedef signed short NvS16; +typedef unsigned short NvU16; +typedef unsigned char NvU8; +typedef signed char NvS8; + +/* Boolean type */ +typedef NvU8 NvBool; +#define NV_TRUE ((NvBool)(0 == 0)) +#define NV_FALSE ((NvBool)(0 != 0)) + + +typedef struct _NV_RECT +{ + NvU32 left; + NvU32 top; + NvU32 right; + NvU32 bottom; +} NV_RECT; + + + +#define NV_DECLARE_HANDLE(name) struct name##__ { int unused; }; typedef struct name##__ *name + +//! \addtogroup nvapihandles +//! NVAPI Handles - These handles are retrieved from various calls and passed in to others in NvAPI +//! These are meant to be opaque types. Do not assume they correspond to indices, HDCs, +//! display indexes or anything else. +//! +//! Most handles remain valid until a display re-configuration (display mode set) or GPU +//! reconfiguration (going into or out of SLI modes) occurs. If NVAPI_HANDLE_INVALIDATED +//! is received by an app, it should discard all handles, and re-enumerate them. +//! 
@{ +NV_DECLARE_HANDLE(NvDisplayHandle); //!< Display Device driven by NVIDIA GPU(s) (an attached display) +NV_DECLARE_HANDLE(NvMonitorHandle); //!< Monitor handle +NV_DECLARE_HANDLE(NvUnAttachedDisplayHandle); //!< Unattached Display Device driven by NVIDIA GPU(s) +NV_DECLARE_HANDLE(NvLogicalGpuHandle); //!< One or more physical GPUs acting in concert (SLI) +NV_DECLARE_HANDLE(NvPhysicalGpuHandle); //!< A single physical GPU +NV_DECLARE_HANDLE(NvEventHandle); //!< A handle to an event registration instance +NV_DECLARE_HANDLE(NvVisualComputingDeviceHandle); //!< A handle to a Visual Computing Device +NV_DECLARE_HANDLE(NvHICHandle); //!< A handle to a Host Interface Card +NV_DECLARE_HANDLE(NvGSyncDeviceHandle); //!< A handle to a Sync device +NV_DECLARE_HANDLE(NvVioHandle); //!< A handle to an SDI device +NV_DECLARE_HANDLE(NvTransitionHandle); //!< A handle to address a single transition request +NV_DECLARE_HANDLE(NvAudioHandle); //!< NVIDIA HD Audio Device +NV_DECLARE_HANDLE(Nv3DVPContextHandle); //!< A handle for a 3D Vision Pro (3DVP) context +NV_DECLARE_HANDLE(Nv3DVPTransceiverHandle); //!< A handle for a 3DVP RF transceiver +NV_DECLARE_HANDLE(Nv3DVPGlassesHandle); //!< A handle for a pair of 3DVP RF shutter glasses + +typedef void* StereoHandle; //!< A stereo handle, that corresponds to the device interface + +NV_DECLARE_HANDLE(NvSourceHandle); //!< Unique source handle on the system +NV_DECLARE_HANDLE(NvTargetHandle); //!< Unique target handle on the system +NV_DECLARE_HANDLE(NVDX_SwapChainHandle); //!< DirectX SwapChain objects +static const NVDX_SwapChainHandle NVDX_SWAPCHAIN_NONE = 0; +//! @} + +//! \ingroup nvapihandles +//! @{ +#define NVAPI_DEFAULT_HANDLE 0 +#define NV_BIT(x) (1 << (x)) +//! @} + + + +//! \addtogroup nvapitypes +//! @{ +#define NVAPI_GENERIC_STRING_MAX 4096 +#define NVAPI_LONG_STRING_MAX 256 +#define NVAPI_SHORT_STRING_MAX 64 + + +typedef struct +{ + NvS32 sX; + NvS32 sY; + NvS32 sWidth; + NvS32 sHeight; +} NvSBox; + +#ifndef NvGUID_Defined +#define NvGUID_Defined + +typedef struct +{ + NvU32 data1; + NvU16 data2; + NvU16 data3; + NvU8 data4[8]; +} NvGUID, NvLUID; + +#endif //#ifndef NvGUID_Defined + + +#define NVAPI_MAX_PHYSICAL_GPUS 64 +#define NVAPI_MAX_PHYSICAL_BRIDGES 100 +#define NVAPI_PHYSICAL_GPUS 32 +#define NVAPI_MAX_LOGICAL_GPUS 64 +#define NVAPI_MAX_AVAILABLE_GPU_TOPOLOGIES 256 +#define NVAPI_MAX_AVAILABLE_SLI_GROUPS 256 +#define NVAPI_MAX_GPU_TOPOLOGIES NVAPI_MAX_PHYSICAL_GPUS +#define NVAPI_MAX_GPU_PER_TOPOLOGY 8 +#define NVAPI_MAX_DISPLAY_HEADS 2 +#define NVAPI_ADVANCED_DISPLAY_HEADS 4 +#define NVAPI_MAX_DISPLAYS NVAPI_PHYSICAL_GPUS * NVAPI_ADVANCED_DISPLAY_HEADS +#define NVAPI_MAX_ACPI_IDS 16 +#define NVAPI_MAX_VIEW_MODES 8 +#define NV_MAX_HEADS 4 //!< Maximum heads, each with NVAPI_DESKTOP_RES resolution +#define NVAPI_MAX_HEADS_PER_GPU 32 + +#define NV_MAX_HEADS 4 //!< Maximum number of heads, each with #NVAPI_DESKTOP_RES resolution +#define NV_MAX_VID_STREAMS 4 //!< Maximum number of input video streams, each with a #NVAPI_VIDEO_SRC_INFO +#define NV_MAX_VID_PROFILES 4 //!< Maximum number of output video profiles supported + +#define NVAPI_SYSTEM_MAX_DISPLAYS NVAPI_MAX_PHYSICAL_GPUS * NV_MAX_HEADS + +#define NVAPI_SYSTEM_MAX_HWBCS 128 +#define NVAPI_SYSTEM_HWBC_INVALID_ID 0xffffffff +#define NVAPI_MAX_AUDIO_DEVICES 16 + + +typedef char NvAPI_String[NVAPI_GENERIC_STRING_MAX]; +typedef char NvAPI_LongString[NVAPI_LONG_STRING_MAX]; +typedef char NvAPI_ShortString[NVAPI_SHORT_STRING_MAX]; +//! 
@} + + +// ========================================================================================= +//! NvAPI Version Definition \n +//! Maintain per structure specific version define using the MAKE_NVAPI_VERSION macro. \n +//! Usage: #define NV_GENLOCK_STATUS_VER MAKE_NVAPI_VERSION(NV_GENLOCK_STATUS, 1) +//! \ingroup nvapitypes +// ========================================================================================= +#define MAKE_NVAPI_VERSION(typeName,ver) (NvU32)(sizeof(typeName) | ((ver)<<16)) + +//! \ingroup nvapitypes +#define GET_NVAPI_VERSION(ver) (NvU32)((ver)>>16) + +//! \ingroup nvapitypes +#define GET_NVAPI_SIZE(ver) (NvU32)((ver) & 0xffff) + + +// ==================================================== +//! NvAPI Status Values +//! All NvAPI functions return one of these codes. +//! \ingroup nvapistatus +// ==================================================== + + +typedef enum _NvAPI_Status +{ + NVAPI_OK = 0, //!< Success. Request is completed. + NVAPI_ERROR = -1, //!< Generic error + NVAPI_LIBRARY_NOT_FOUND = -2, //!< NVAPI support library cannot be loaded. + NVAPI_NO_IMPLEMENTATION = -3, //!< not implemented in current driver installation + NVAPI_API_NOT_INITIALIZED = -4, //!< NvAPI_Initialize has not been called (successfully) + NVAPI_INVALID_ARGUMENT = -5, //!< The argument/parameter value is not valid or NULL. + NVAPI_NVIDIA_DEVICE_NOT_FOUND = -6, //!< No NVIDIA display driver, or NVIDIA GPU driving a display, was found. + NVAPI_END_ENUMERATION = -7, //!< No more items to enumerate + NVAPI_INVALID_HANDLE = -8, //!< Invalid handle + NVAPI_INCOMPATIBLE_STRUCT_VERSION = -9, //!< An argument's structure version is not supported + NVAPI_HANDLE_INVALIDATED = -10, //!< The handle is no longer valid (likely due to GPU or display re-configuration) + NVAPI_OPENGL_CONTEXT_NOT_CURRENT = -11, //!< No NVIDIA OpenGL context is current (but needs to be) + NVAPI_INVALID_POINTER = -14, //!< An invalid pointer, usually NULL, was passed as a parameter + NVAPI_NO_GL_EXPERT = -12, //!< OpenGL Expert is not supported by the current drivers + NVAPI_INSTRUMENTATION_DISABLED = -13, //!< OpenGL Expert is supported, but driver instrumentation is currently disabled + NVAPI_NO_GL_NSIGHT = -15, //!< OpenGL does not support Nsight + + NVAPI_EXPECTED_LOGICAL_GPU_HANDLE = -100, //!< Expected a logical GPU handle for one or more parameters + NVAPI_EXPECTED_PHYSICAL_GPU_HANDLE = -101, //!< Expected a physical GPU handle for one or more parameters + NVAPI_EXPECTED_DISPLAY_HANDLE = -102, //!< Expected an NV display handle for one or more parameters + NVAPI_INVALID_COMBINATION = -103, //!< The combination of parameters is not valid. + NVAPI_NOT_SUPPORTED = -104, //!< Requested feature is not supported in the selected GPU + NVAPI_PORTID_NOT_FOUND = -105, //!< No port ID was found for the I2C transaction + NVAPI_EXPECTED_UNATTACHED_DISPLAY_HANDLE = -106, //!< Expected an unattached display handle as one of the input parameters. + NVAPI_INVALID_PERF_LEVEL = -107, //!< Invalid perf level + NVAPI_DEVICE_BUSY = -108, //!< Device is busy; request not fulfilled + NVAPI_NV_PERSIST_FILE_NOT_FOUND = -109, //!< NV persist file is not found + NVAPI_PERSIST_DATA_NOT_FOUND = -110, //!< NV persist data is not found + NVAPI_EXPECTED_TV_DISPLAY = -111, //!< Expected a TV output display + NVAPI_EXPECTED_TV_DISPLAY_ON_DCONNECTOR = -112, //!< Expected a TV output on the D Connector - HDTV_EIAJ4120. + NVAPI_NO_ACTIVE_SLI_TOPOLOGY = -113, //!< SLI is not active on this device. 
+ NVAPI_SLI_RENDERING_MODE_NOTALLOWED = -114, //!< Setup of SLI rendering mode is not possible right now. + NVAPI_EXPECTED_DIGITAL_FLAT_PANEL = -115, //!< Expected a digital flat panel. + NVAPI_ARGUMENT_EXCEED_MAX_SIZE = -116, //!< Argument exceeds the expected size. + NVAPI_DEVICE_SWITCHING_NOT_ALLOWED = -117, //!< Inhibit is ON due to one of the flags in NV_GPU_DISPLAY_CHANGE_INHIBIT or SLI active. + NVAPI_TESTING_CLOCKS_NOT_SUPPORTED = -118, //!< Testing of clocks is not supported. + NVAPI_UNKNOWN_UNDERSCAN_CONFIG = -119, //!< The specified underscan config is from an unknown source (e.g. INF) + NVAPI_TIMEOUT_RECONFIGURING_GPU_TOPO = -120, //!< Timeout while reconfiguring GPUs + NVAPI_DATA_NOT_FOUND = -121, //!< Requested data was not found + NVAPI_EXPECTED_ANALOG_DISPLAY = -122, //!< Expected an analog display + NVAPI_NO_VIDLINK = -123, //!< No SLI video bridge is present + NVAPI_REQUIRES_REBOOT = -124, //!< NVAPI requires a reboot for the settings to take effect + NVAPI_INVALID_HYBRID_MODE = -125, //!< The function is not supported with the current Hybrid mode. + NVAPI_MIXED_TARGET_TYPES = -126, //!< The target types are not all the same + NVAPI_SYSWOW64_NOT_SUPPORTED = -127, //!< The function is not supported from 32-bit on a 64-bit system. + NVAPI_IMPLICIT_SET_GPU_TOPOLOGY_CHANGE_NOT_ALLOWED = -128, //!< There is no implicit GPU topology active. Use NVAPI_SetHybridMode to change topology. + NVAPI_REQUEST_USER_TO_CLOSE_NON_MIGRATABLE_APPS = -129, //!< Prompt the user to close all non-migratable applications. + NVAPI_OUT_OF_MEMORY = -130, //!< Could not allocate sufficient memory to complete the call. + NVAPI_WAS_STILL_DRAWING = -131, //!< The previous operation that is transferring information to or from this surface is incomplete. + NVAPI_FILE_NOT_FOUND = -132, //!< The file was not found. + NVAPI_TOO_MANY_UNIQUE_STATE_OBJECTS = -133, //!< There are too many unique instances of a particular type of state object. + NVAPI_INVALID_CALL = -134, //!< The method call is invalid. For example, a method's parameter may not be a valid pointer. + NVAPI_D3D10_1_LIBRARY_NOT_FOUND = -135, //!< d3d10_1.dll cannot be loaded. + NVAPI_FUNCTION_NOT_FOUND = -136, //!< Couldn't find the function in the loaded DLL. + NVAPI_INVALID_USER_PRIVILEGE = -137, //!< Current User is not Admin. + NVAPI_EXPECTED_NON_PRIMARY_DISPLAY_HANDLE = -138, //!< The handle corresponds to GDIPrimary. + NVAPI_EXPECTED_COMPUTE_GPU_HANDLE = -139, //!< Setting Physx GPU requires that the GPU is compute-capable. + NVAPI_STEREO_NOT_INITIALIZED = -140, //!< The Stereo part of NVAPI failed to initialize completely. Check if the stereo driver is installed. + NVAPI_STEREO_REGISTRY_ACCESS_FAILED = -141, //!< Access to stereo-related registry keys or values has failed. + NVAPI_STEREO_REGISTRY_PROFILE_TYPE_NOT_SUPPORTED = -142, //!< The given registry profile type is not supported. + NVAPI_STEREO_REGISTRY_VALUE_NOT_SUPPORTED = -143, //!< The given registry value is not supported. + NVAPI_STEREO_NOT_ENABLED = -144, //!< Stereo is not enabled and the function needed it to execute completely. + NVAPI_STEREO_NOT_TURNED_ON = -145, //!< Stereo is not turned on and the function needed it to execute completely. + NVAPI_STEREO_INVALID_DEVICE_INTERFACE = -146, //!< Invalid device interface. + NVAPI_STEREO_PARAMETER_OUT_OF_RANGE = -147, //!< Separation percentage or JPEG image capture quality is out of [0-100] range. + NVAPI_STEREO_FRUSTUM_ADJUST_MODE_NOT_SUPPORTED = -148, //!< The given frustum adjust mode is not supported. 
+ NVAPI_TOPO_NOT_POSSIBLE = -149, //!< The mosaic topology is not possible given the current state of the hardware. + NVAPI_MODE_CHANGE_FAILED = -150, //!< An attempt to do a display resolution mode change has failed. + NVAPI_D3D11_LIBRARY_NOT_FOUND = -151, //!< d3d11.dll/d3d11_beta.dll cannot be loaded. + NVAPI_INVALID_ADDRESS = -152, //!< Address is outside of valid range. + NVAPI_STRING_TOO_SMALL = -153, //!< The pre-allocated string is too small to hold the result. + NVAPI_MATCHING_DEVICE_NOT_FOUND = -154, //!< The input does not match any of the available devices. + NVAPI_DRIVER_RUNNING = -155, //!< Driver is running. + NVAPI_DRIVER_NOTRUNNING = -156, //!< Driver is not running. + NVAPI_ERROR_DRIVER_RELOAD_REQUIRED = -157, //!< A driver reload is required to apply these settings. + NVAPI_SET_NOT_ALLOWED = -158, //!< Intended setting is not allowed. + NVAPI_ADVANCED_DISPLAY_TOPOLOGY_REQUIRED = -159, //!< Information can't be returned due to "advanced display topology". + NVAPI_SETTING_NOT_FOUND = -160, //!< Setting is not found. + NVAPI_SETTING_SIZE_TOO_LARGE = -161, //!< Setting size is too large. + NVAPI_TOO_MANY_SETTINGS_IN_PROFILE = -162, //!< There are too many settings for a profile. + NVAPI_PROFILE_NOT_FOUND = -163, //!< Profile is not found. + NVAPI_PROFILE_NAME_IN_USE = -164, //!< Profile name is duplicated. + NVAPI_PROFILE_NAME_EMPTY = -165, //!< Profile name is empty. + NVAPI_EXECUTABLE_NOT_FOUND = -166, //!< Application not found in the Profile. + NVAPI_EXECUTABLE_ALREADY_IN_USE = -167, //!< Application already exists in the other profile. + NVAPI_DATATYPE_MISMATCH = -168, //!< Data Type mismatch + NVAPI_PROFILE_REMOVED = -169, //!< The profile passed as parameter has been removed and is no longer valid. + NVAPI_UNREGISTERED_RESOURCE = -170, //!< An unregistered resource was passed as a parameter. + NVAPI_ID_OUT_OF_RANGE = -171, //!< The DisplayId corresponds to a display which is not within the normal outputId range. + NVAPI_DISPLAYCONFIG_VALIDATION_FAILED = -172, //!< Display topology is not valid so the driver cannot do a mode set on this configuration. + NVAPI_DPMST_CHANGED = -173, //!< Display Port Multi-Stream topology has been changed. + NVAPI_INSUFFICIENT_BUFFER = -174, //!< Input buffer is insufficient to hold the contents. + NVAPI_ACCESS_DENIED = -175, //!< No access to the caller. + NVAPI_MOSAIC_NOT_ACTIVE = -176, //!< The requested action cannot be performed without Mosaic being enabled. + NVAPI_SHARE_RESOURCE_RELOCATED = -177, //!< The surface is relocated away from video memory. + NVAPI_REQUEST_USER_TO_DISABLE_DWM = -178, //!< The user should disable DWM before calling NvAPI. + NVAPI_D3D_DEVICE_LOST = -179, //!< D3D device status is D3DERR_DEVICELOST or D3DERR_DEVICENOTRESET - the user has to reset the device. + NVAPI_INVALID_CONFIGURATION = -180, //!< The requested action cannot be performed in the current state. + NVAPI_STEREO_HANDSHAKE_NOT_DONE = -181, //!< Call failed as stereo handshake not completed. + NVAPI_EXECUTABLE_PATH_IS_AMBIGUOUS = -182, //!< The path provided was too short to determine the correct NVDRS_APPLICATION + NVAPI_DEFAULT_STEREO_PROFILE_IS_NOT_DEFINED = -183, //!< Default stereo profile is not currently defined + NVAPI_DEFAULT_STEREO_PROFILE_DOES_NOT_EXIST = -184, //!< Default stereo profile does not exist + NVAPI_CLUSTER_ALREADY_EXISTS = -185, //!< A cluster is already defined with the given configuration. 
+ NVAPI_DPMST_DISPLAY_ID_EXPECTED = -186, //!< The input display id is not that of a multi stream enabled connector or a display device in a multi stream topology + NVAPI_INVALID_DISPLAY_ID = -187, //!< The input display id is not valid or the monitor associated to it does not support the current operation + NVAPI_STREAM_IS_OUT_OF_SYNC = -188, //!< While playing secure audio stream, stream goes out of sync + NVAPI_INCOMPATIBLE_AUDIO_DRIVER = -189, //!< Older audio driver version than required + NVAPI_VALUE_ALREADY_SET = -190, //!< Value already set, setting again not allowed. + NVAPI_TIMEOUT = -191, //!< Requested operation timed out + NVAPI_GPU_WORKSTATION_FEATURE_INCOMPLETE = -192, //!< The requested workstation feature set has incomplete driver internal allocation resources + NVAPI_STEREO_INIT_ACTIVATION_NOT_DONE = -193, //!< Call failed because InitActivation was not called. + NVAPI_SYNC_NOT_ACTIVE = -194, //!< The requested action cannot be performed without Sync being enabled. + NVAPI_SYNC_MASTER_NOT_FOUND = -195, //!< The requested action cannot be performed without Sync Master being enabled. + NVAPI_INVALID_SYNC_TOPOLOGY = -196, //!< Invalid displays passed in the NV_GSYNC_DISPLAY pointer. + NVAPI_ECID_SIGN_ALGO_UNSUPPORTED = -197, //!< The specified signing algorithm is not supported. Either an incorrect value was entered or the current installed driver/hardware does not support the input value. + NVAPI_ECID_KEY_VERIFICATION_FAILED = -198, //!< The encrypted public key verification has failed. + NVAPI_FIRMWARE_OUT_OF_DATE = -199, //!< The device's firmware is out of date. + NVAPI_FIRMWARE_REVISION_NOT_SUPPORTED = -200, //!< The device's firmware is not supported. + NVAPI_LICENSE_CALLER_AUTHENTICATION_FAILED = -201, //!< The caller is not authorized to modify the License. + NVAPI_D3D_DEVICE_NOT_REGISTERED = -202, //!< The user tried to use a deferred context without registering the device first + NVAPI_RESOURCE_NOT_ACQUIRED = -203, //!< Head or SourceId was not reserved for the VR Display before doing the Modeset. + NVAPI_TIMING_NOT_SUPPORTED = -204, //!< Provided timing is not supported. + NVAPI_HDCP_ENCRYPTION_FAILED = -205, //!< HDCP Encryption Failed for the device. Would be applicable when the device is HDCP Capable. + NVAPI_PCLK_LIMITATION_FAILED = -206, //!< Provided mode is over sink device pclk limitation. + NVAPI_NO_CONNECTOR_FOUND = -207, //!< No connector on GPU found. +} NvAPI_Status; + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_SYS_GetDriverAndBranchVersion +// +//! DESCRIPTION: This API returns display driver version and driver-branch string. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \param [out] pDriverVersion Contains the driver version after successful return. +//! \param [out] szBuildBranchString Contains the driver-branch string after successful return. +//! +//! \retval ::NVAPI_INVALID_ARGUMENT: either pDriverVersion is NULL or enum index too big +//! \retval ::NVAPI_OK - completed request +//! \retval ::NVAPI_API_NOT_INTIALIZED - NVAPI not initialized +//! \retval ::NVAPI_ERROR - miscellaneous error occurred +//! +//! \ingroup driverapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_SYS_GetDriverAndBranchVersion(NvU32* pDriverVersion, NvAPI_ShortString szBuildBranchString); +//! \ingroup driverapi +//! Used in NvAPI_GPU_GetMemoryInfo(). 
+typedef struct +{ + NvU32 version; //!< Version info + NvU32 dedicatedVideoMemory; //!< Size(in kb) of the physical framebuffer. + NvU32 availableDedicatedVideoMemory; //!< Size(in kb) of the available physical framebuffer for allocating video memory surfaces. + NvU32 systemVideoMemory; //!< Size(in kb) of system memory the driver allocates at load time. + NvU32 sharedSystemMemory; //!< Size(in kb) of shared system memory that driver is allowed to commit for surfaces across all allocations. + +} NV_DISPLAY_DRIVER_MEMORY_INFO_V1; + + +//! \ingroup driverapi +//! Used in NvAPI_GPU_GetMemoryInfo(). +typedef struct +{ + NvU32 version; //!< Version info + NvU32 dedicatedVideoMemory; //!< Size(in kb) of the physical framebuffer. + NvU32 availableDedicatedVideoMemory; //!< Size(in kb) of the available physical framebuffer for allocating video memory surfaces. + NvU32 systemVideoMemory; //!< Size(in kb) of system memory the driver allocates at load time. + NvU32 sharedSystemMemory; //!< Size(in kb) of shared system memory that driver is allowed to commit for surfaces across all allocations. + NvU32 curAvailableDedicatedVideoMemory; //!< Size(in kb) of the current available physical framebuffer for allocating video memory surfaces. + +} NV_DISPLAY_DRIVER_MEMORY_INFO_V2; + +//! \ingroup driverapi +//! Used in NvAPI_GPU_GetMemoryInfo(). +typedef struct +{ + NvU32 version; //!< Version info + NvU32 dedicatedVideoMemory; //!< Size(in kb) of the physical framebuffer. + NvU32 availableDedicatedVideoMemory; //!< Size(in kb) of the available physical framebuffer for allocating video memory surfaces. + NvU32 systemVideoMemory; //!< Size(in kb) of system memory the driver allocates at load time. + NvU32 sharedSystemMemory; //!< Size(in kb) of shared system memory that driver is allowed to commit for surfaces across all allocations. + NvU32 curAvailableDedicatedVideoMemory; //!< Size(in kb) of the current available physical framebuffer for allocating video memory surfaces. + NvU32 dedicatedVideoMemoryEvictionsSize; //!< Size(in kb) of the total size of memory released as a result of the evictions. + NvU32 dedicatedVideoMemoryEvictionCount; //!< Indicates the number of eviction events that caused an allocation to be removed from dedicated video memory to free GPU + //!< video memory to make room for other allocations. +} NV_DISPLAY_DRIVER_MEMORY_INFO_V3; + +//! \ingroup driverapi +typedef NV_DISPLAY_DRIVER_MEMORY_INFO_V3 NV_DISPLAY_DRIVER_MEMORY_INFO; + +//! \ingroup driverapi +//! Macro for constructing the version field of NV_DISPLAY_DRIVER_MEMORY_INFO_V1 +#define NV_DISPLAY_DRIVER_MEMORY_INFO_VER_1 MAKE_NVAPI_VERSION(NV_DISPLAY_DRIVER_MEMORY_INFO_V1,1) + +//! \ingroup driverapi +//! Macro for constructing the version field of NV_DISPLAY_DRIVER_MEMORY_INFO_V2 +#define NV_DISPLAY_DRIVER_MEMORY_INFO_VER_2 MAKE_NVAPI_VERSION(NV_DISPLAY_DRIVER_MEMORY_INFO_V2,2) + +//! \ingroup driverapi +//! Macro for constructing the version field of NV_DISPLAY_DRIVER_MEMORY_INFO_V3 +#define NV_DISPLAY_DRIVER_MEMORY_INFO_VER_3 MAKE_NVAPI_VERSION(NV_DISPLAY_DRIVER_MEMORY_INFO_V3,3) + +//! \ingroup driverapi +#define NV_DISPLAY_DRIVER_MEMORY_INFO_VER NV_DISPLAY_DRIVER_MEMORY_INFO_VER_3 + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_GPU_GetMemoryInfo +// +//! DESCRIPTION: This function retrieves the available driver memory footprint for the specified GPU. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! TCC_SUPPORTED +//! +//! 
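+//! A minimal usage sketch (illustrative; GPUs are enumerated first and error
+//! handling is trimmed):
+//! \code
+//!   NvPhysicalGpuHandle gpus[NVAPI_MAX_PHYSICAL_GPUS];
+//!   NvU32 gpuCount = 0;
+//!   if (NvAPI_EnumPhysicalGPUs(gpus, &gpuCount) == NVAPI_OK) {
+//!       for (NvU32 i = 0; i < gpuCount; i++) {
+//!           NV_DISPLAY_DRIVER_MEMORY_INFO mem = { 0 };
+//!           mem.version = NV_DISPLAY_DRIVER_MEMORY_INFO_VER;
+//!           if (NvAPI_GPU_GetMemoryInfo(gpus[i], &mem) == NVAPI_OK) {
+//!               // mem.dedicatedVideoMemory and
+//!               // mem.curAvailableDedicatedVideoMemory are reported in kB
+//!           }
+//!       }
+//!   }
+//! \endcode
+//!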
\since Release: 177 +//! +//! \param [in] hPhysicalGpu Handle of the physical GPU for which the memory information is to be extracted. +//! \param [out] pMemoryInfo The memory footprint available in the driver. See NV_DISPLAY_DRIVER_MEMORY_INFO. +//! +//! \retval NVAPI_INVALID_ARGUMENT pMemoryInfo is NULL. +//! \retval NVAPI_OK Call successful. +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found. +//! \retval NVAPI_INCOMPATIBLE_STRUCT_VERSION NV_DISPLAY_DRIVER_MEMORY_INFO structure version mismatch. +//! +//! \ingroup driverapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_GPU_GetMemoryInfo(NvPhysicalGpuHandle hPhysicalGpu, NV_DISPLAY_DRIVER_MEMORY_INFO *pMemoryInfo); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_EnumPhysicalGPUs +// +//! This function returns an array of physical GPU handles. +//! Each handle represents a physical GPU present in the system. +//! That GPU may be part of an SLI configuration, or may not be visible to the OS directly. +//! +//! At least one GPU must be present in the system and running an NVIDIA display driver. +//! +//! The array nvGPUHandle will be filled with physical GPU handle values. The returned +//! gpuCount determines how many entries in the array are valid. +//! +//! \note In drivers older than 105.00, all physical GPU handles get invalidated on a +//! modeset. So the calling applications need to renum the handles after every modeset.\n +//! With drivers 105.00 and up, all physical GPU handles are constant. +//! Physical GPU handles are constant as long as the GPUs are not physically moved and +//! the SBIOS VGA order is unchanged. +//! +//! For GPU handles in TCC MODE please use NvAPI_EnumTCCPhysicalGPUs() +//! +//! SUPPORTED OS: Windows XP and higher, Mac OS X +//! +//! +//! \par Introduced in +//! \since Release: 80 +//! +//! \retval NVAPI_INVALID_ARGUMENT nvGPUHandle or pGpuCount is NULL +//! \retval NVAPI_OK One or more handles were returned +//! \retval NVAPI_NVIDIA_DEVICE_NOT_FOUND No NVIDIA GPU driving a display was found +//! \ingroup gpu +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_EnumPhysicalGPUs(NvPhysicalGpuHandle nvGPUHandle[NVAPI_MAX_PHYSICAL_GPUS], NvU32 *pGpuCount); +#if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) + +NV_DECLARE_HANDLE(NVDX_ObjectHandle); // DX Objects +static const NVDX_ObjectHandle NVDX_OBJECT_NONE = 0; + +#endif //if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) +#if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D_GetObjectHandleForResource +// +//! DESCRIPTION: This API gets a handle to a resource. +//! +//! \param [in] pDev The ID3D11Device, ID3D10Device or IDirect3DDevice9 to use +//! \param [in] pResource The ID3D10Resource, ID3D10Resource or IDirect3DResource9 from which +//! we want the NvAPI handle +//! \param [out] pHandle A handle to the resource +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! \return ::NVAPI_OK if the handle was populated. +//! +//! 
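A hedged sketch combining NvAPI_EnumPhysicalGPUs() and NvAPI_GPU_GetMemoryInfo() as declared above, not part of the NVIDIA header: the helper name print_gpu_memory and the "nvapi.h" include are assumptions, and NvAPI_Initialize() is assumed to have succeeded beforehand.

#include <cstdio>
#include "nvapi.h"

static void print_gpu_memory(void)
{
    NvPhysicalGpuHandle gpus[NVAPI_MAX_PHYSICAL_GPUS] = {};
    NvU32 count = 0;

    if (NvAPI_EnumPhysicalGPUs(gpus, &count) != NVAPI_OK)
        return;   // e.g. NVAPI_NVIDIA_DEVICE_NOT_FOUND

    for (NvU32 i = 0; i < count; i++) {
        NV_DISPLAY_DRIVER_MEMORY_INFO mem = {};
        // the version field is mandatory; without it the driver returns
        // NVAPI_INCOMPATIBLE_STRUCT_VERSION
        mem.version = NV_DISPLAY_DRIVER_MEMORY_INFO_VER;
        if (NvAPI_GPU_GetMemoryInfo(gpus[i], &mem) == NVAPI_OK) {
            // all sizes are reported in kB (see the struct comments above)
            printf("GPU %u: %u of %u kB dedicated memory currently available\n",
                   (unsigned)i,
                   (unsigned)mem.curAvailableDedicatedVideoMemory,
                   (unsigned)mem.availableDedicatedVideoMemory);
        }
    }
}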
\ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D_GetObjectHandleForResource( + IUnknown *pDevice, + IUnknown *pResource, + NVDX_ObjectHandle *pHandle); + + +#endif //if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) + +#include"nvapi_lite_salend.h" +#ifdef __cplusplus +} +#endif +#pragma pack(pop) diff --git a/compat/nvapi/nvapi_lite_d3dext.h b/compat/nvapi/nvapi_lite_d3dext.h new file mode 100644 index 0000000000..75e21f33e7 --- /dev/null +++ b/compat/nvapi/nvapi_lite_d3dext.h @@ -0,0 +1,188 @@ + /************************************************************************************************************************************\ +|* *| +|* Copyright © 2012 NVIDIA Corporation. All rights reserved. *| +|* *| +|* NOTICE TO USER: *| +|* *| +|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *| +|* *| +|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *| +|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *| +|* Otherwise, you have no rights to use or access this software in any manner. *| +|* *| +|* If not covered by the applicable NVIDIA software license agreement: *| +|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *| +|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *| +|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *| +|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. *| +|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *| +|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *| +|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *| +|* *| +|* U.S. Government End Users. *| +|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *| +|* consisting of "commercial computer software" and "commercial computer software documentation" *| +|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *| +|* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *| +|* all U.S. Government End Users acquire the software with only those rights set forth herein. *| +|* *| +|* Any use of this software in individual and commercial software must include, *| +|* in the user documentation and internal comments to the code, *| +|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *| +|* *| + \************************************************************************************************************************************/ +#pragma once +#include"nvapi_lite_salstart.h" +#include"nvapi_lite_common.h" +#pragma pack(push,8) +#ifdef __cplusplus +extern "C" { +#endif +#if defined(__cplusplus) && (defined(__d3d10_h__) || defined(__d3d10_1_h__) || defined(__d3d11_h__)) +//! \ingroup dx +//! 
D3D_FEATURE_LEVEL supported - used in NvAPI_D3D11_CreateDevice() and NvAPI_D3D11_CreateDeviceAndSwapChain() +typedef enum +{ + NVAPI_DEVICE_FEATURE_LEVEL_NULL = -1, + NVAPI_DEVICE_FEATURE_LEVEL_10_0 = 0, + NVAPI_DEVICE_FEATURE_LEVEL_10_0_PLUS = 1, + NVAPI_DEVICE_FEATURE_LEVEL_10_1 = 2, + NVAPI_DEVICE_FEATURE_LEVEL_11_0 = 3, +} NVAPI_DEVICE_FEATURE_LEVEL; + +#endif //defined(__cplusplus) && (defined(__d3d10_h__) || defined(__d3d10_1_h__) || defined(__d3d11_h__)) +#if defined(__cplusplus) && defined(__d3d11_h__) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D11_CreateDevice +// +//! DESCRIPTION: This function tries to create a DirectX 11 device. If the call fails (if we are running +//! on pre-DirectX 11 hardware), depending on the type of hardware it will try to create a DirectX 10.1 OR DirectX 10.0+ +//! OR DirectX 10.0 device. The function call is the same as D3D11CreateDevice(), but with an extra +//! argument (D3D_FEATURE_LEVEL supported by the device) that the function fills in. This argument +//! can contain -1 (NVAPI_DEVICE_FEATURE_LEVEL_NULL), if the requested featureLevel is less than DirecX 10.0. +//! +//! NOTE: When NvAPI_D3D11_CreateDevice is called with 10+ feature level we have an issue on few set of +//! tesla hardware (G80/G84/G86/G92/G94/G96) which does not support all feature level 10+ functionality +//! e.g. calling driver with mismatch between RenderTarget and Depth Buffer. App developers should +//! take into consideration such limitation when using NVAPI on such tesla hardwares. +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 185 +//! +//! \param [in] pAdapter +//! \param [in] DriverType +//! \param [in] Software +//! \param [in] Flags +//! \param [in] *pFeatureLevels +//! \param [in] FeatureLevels +//! \param [in] SDKVersion +//! \param [in] **ppDevice +//! \param [in] *pFeatureLevel +//! \param [in] **ppImmediateContext +//! \param [in] *pSupportedLevel D3D_FEATURE_LEVEL supported +//! +//! \return NVAPI_OK if the createDevice call succeeded. +//! +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D11_CreateDevice(IDXGIAdapter* pAdapter, + D3D_DRIVER_TYPE DriverType, + HMODULE Software, + UINT Flags, + CONST D3D_FEATURE_LEVEL *pFeatureLevels, + UINT FeatureLevels, + UINT SDKVersion, + ID3D11Device **ppDevice, + D3D_FEATURE_LEVEL *pFeatureLevel, + ID3D11DeviceContext **ppImmediateContext, + NVAPI_DEVICE_FEATURE_LEVEL *pSupportedLevel); + + +#endif //defined(__cplusplus) && defined(__d3d11_h__) +#if defined(__cplusplus) && defined(__d3d11_h__) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D11_CreateDeviceAndSwapChain +// +//! DESCRIPTION: This function tries to create a DirectX 11 device and swap chain. If the call fails (if we are +//! running on pre=DirectX 11 hardware), depending on the type of hardware it will try to create a DirectX 10.1 OR +//! DirectX 10.0+ OR DirectX 10.0 device. The function call is the same as D3D11CreateDeviceAndSwapChain, +//! but with an extra argument (D3D_FEATURE_LEVEL supported by the device) that the function fills +//! in. This argument can contain -1 (NVAPI_DEVICE_FEATURE_LEVEL_NULL), if the requested featureLevel +//! is less than DirectX 10.0. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 185 +//! +//! \param [in] pAdapter +//! \param [in] DriverType +//! 
\param [in] Software +//! \param [in] Flags +//! \param [in] *pFeatureLevels +//! \param [in] FeatureLevels +//! \param [in] SDKVersion +//! \param [in] *pSwapChainDesc +//! \param [in] **ppSwapChain +//! \param [in] **ppDevice +//! \param [in] *pFeatureLevel +//! \param [in] **ppImmediateContext +//! \param [in] *pSupportedLevel D3D_FEATURE_LEVEL supported +//! +//!return NVAPI_OK if the createDevice with swap chain call succeeded. +//! +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D11_CreateDeviceAndSwapChain(IDXGIAdapter* pAdapter, + D3D_DRIVER_TYPE DriverType, + HMODULE Software, + UINT Flags, + CONST D3D_FEATURE_LEVEL *pFeatureLevels, + UINT FeatureLevels, + UINT SDKVersion, + CONST DXGI_SWAP_CHAIN_DESC *pSwapChainDesc, + IDXGISwapChain **ppSwapChain, + ID3D11Device **ppDevice, + D3D_FEATURE_LEVEL *pFeatureLevel, + ID3D11DeviceContext **ppImmediateContext, + NVAPI_DEVICE_FEATURE_LEVEL *pSupportedLevel); + + + +#endif //defined(__cplusplus) && defined(__d3d11_h__) +#if defined(__cplusplus) && defined(__d3d11_h__) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D11_SetDepthBoundsTest +// +//! DESCRIPTION: This function enables/disables the depth bounds test +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] pDeviceOrContext The device or device context to set depth bounds test +//! \param [in] bEnable Enable(non-zero)/disable(zero) the depth bounds test +//! \param [in] fMinDepth The minimum depth for depth bounds test +//! \param [in] fMaxDepth The maximum depth for depth bounds test +//! The valid values for fMinDepth and fMaxDepth +//! are such that 0 <= fMinDepth <= fMaxDepth <= 1 +//! +//! \return ::NVAPI_OK if the depth bounds test was correcly enabled or disabled +//! +//! \ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D11_SetDepthBoundsTest(IUnknown* pDeviceOrContext, + NvU32 bEnable, + float fMinDepth, + float fMaxDepth); + +#endif //defined(__cplusplus) && defined(__d3d11_h__) + +#include"nvapi_lite_salend.h" +#ifdef __cplusplus +} +#endif +#pragma pack(pop) diff --git a/compat/nvapi/nvapi_lite_salend.h b/compat/nvapi/nvapi_lite_salend.h new file mode 100644 index 0000000000..287f988797 --- /dev/null +++ b/compat/nvapi/nvapi_lite_salend.h @@ -0,0 +1,816 @@ + /************************************************************************************************************************************\ +|* *| +|* Copyright © 2012 NVIDIA Corporation. All rights reserved. *| +|* *| +|* NOTICE TO USER: *| +|* *| +|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *| +|* *| +|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *| +|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *| +|* Otherwise, you have no rights to use or access this software in any manner. *| +|* *| +|* If not covered by the applicable NVIDIA software license agreement: *| +|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *| +|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *| +|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *| +|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
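An illustrative end-to-end sketch of the two D3D11 entry points declared above (NvAPI_D3D11_CreateDevice and NvAPI_D3D11_SetDepthBoundsTest), not part of the NVIDIA header: the helper name create_device_with_depth_bounds, the "nvapi.h" include and the 0.25/0.75 bounds are assumptions, and COM Release() calls are omitted for brevity.

#include <d3d11.h>   // must come first so the NvAPI_D3D11_* declarations above are compiled in
#include "nvapi.h"

static bool create_device_with_depth_bounds(void)
{
    ID3D11Device *pDevice = NULL;
    ID3D11DeviceContext *pContext = NULL;
    D3D_FEATURE_LEVEL level;
    NVAPI_DEVICE_FEATURE_LEVEL nvLevel = NVAPI_DEVICE_FEATURE_LEVEL_NULL;

    // same arguments as D3D11CreateDevice(), plus the trailing NVAPI level out-parameter
    NvAPI_Status ret = NvAPI_D3D11_CreateDevice(
        NULL,                       // pAdapter: default adapter
        D3D_DRIVER_TYPE_HARDWARE,
        NULL,                       // Software
        0,                          // Flags
        NULL, 0,                    // pFeatureLevels/FeatureLevels: runtime defaults
        D3D11_SDK_VERSION,
        &pDevice, &level, &pContext,
        &nvLevel);                  // set to NVAPI_DEVICE_FEATURE_LEVEL_NULL below feature level 10.0
    if (ret != NVAPI_OK || nvLevel == NVAPI_DEVICE_FEATURE_LEVEL_NULL)
        return false;

    // bEnable is non-zero to enable; the bounds must satisfy 0 <= fMinDepth <= fMaxDepth <= 1
    return NvAPI_D3D11_SetDepthBoundsTest(pDevice, 1, 0.25f, 0.75f) == NVAPI_OK;
}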
*| +|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *| +|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *| +|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *| +|* *| +|* U.S. Government End Users. *| +|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *| +|* consisting of "commercial computer software" and "commercial computer software documentation" *| +|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *| +|* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *| +|* all U.S. Government End Users acquire the software with only those rights set forth herein. *| +|* *| +|* Any use of this software in individual and commercial software must include, *| +|* in the user documentation and internal comments to the code, *| +|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *| +|* *| + \************************************************************************************************************************************/ +#ifndef __NVAPI_EMPTY_SAL +#ifdef __nvapi_undef__ecount + #undef __ecount + #undef __nvapi_undef__ecount +#endif +#ifdef __nvapi_undef__bcount + #undef __bcount + #undef __nvapi_undef__bcount +#endif +#ifdef __nvapi_undef__in + #undef __in + #undef __nvapi_undef__in +#endif +#ifdef __nvapi_undef__in_ecount + #undef __in_ecount + #undef __nvapi_undef__in_ecount +#endif +#ifdef __nvapi_undef__in_bcount + #undef __in_bcount + #undef __nvapi_undef__in_bcount +#endif +#ifdef __nvapi_undef__in_z + #undef __in_z + #undef __nvapi_undef__in_z +#endif +#ifdef __nvapi_undef__in_ecount_z + #undef __in_ecount_z + #undef __nvapi_undef__in_ecount_z +#endif +#ifdef __nvapi_undef__in_bcount_z + #undef __in_bcount_z + #undef __nvapi_undef__in_bcount_z +#endif +#ifdef __nvapi_undef__in_nz + #undef __in_nz + #undef __nvapi_undef__in_nz +#endif +#ifdef __nvapi_undef__in_ecount_nz + #undef __in_ecount_nz + #undef __nvapi_undef__in_ecount_nz +#endif +#ifdef __nvapi_undef__in_bcount_nz + #undef __in_bcount_nz + #undef __nvapi_undef__in_bcount_nz +#endif +#ifdef __nvapi_undef__out + #undef __out + #undef __nvapi_undef__out +#endif +#ifdef __nvapi_undef__out_ecount + #undef __out_ecount + #undef __nvapi_undef__out_ecount +#endif +#ifdef __nvapi_undef__out_bcount + #undef __out_bcount + #undef __nvapi_undef__out_bcount +#endif +#ifdef __nvapi_undef__out_ecount_part + #undef __out_ecount_part + #undef __nvapi_undef__out_ecount_part +#endif +#ifdef __nvapi_undef__out_bcount_part + #undef __out_bcount_part + #undef __nvapi_undef__out_bcount_part +#endif +#ifdef __nvapi_undef__out_ecount_full + #undef __out_ecount_full + #undef __nvapi_undef__out_ecount_full +#endif +#ifdef __nvapi_undef__out_bcount_full + #undef __out_bcount_full + #undef __nvapi_undef__out_bcount_full +#endif +#ifdef __nvapi_undef__out_z + #undef __out_z + #undef __nvapi_undef__out_z +#endif +#ifdef __nvapi_undef__out_z_opt + #undef __out_z_opt + #undef __nvapi_undef__out_z_opt +#endif +#ifdef __nvapi_undef__out_ecount_z + #undef __out_ecount_z + #undef __nvapi_undef__out_ecount_z +#endif +#ifdef __nvapi_undef__out_bcount_z + #undef __out_bcount_z + #undef __nvapi_undef__out_bcount_z +#endif +#ifdef __nvapi_undef__out_ecount_part_z + #undef __out_ecount_part_z + #undef 
__nvapi_undef__out_ecount_part_z +#endif +#ifdef __nvapi_undef__out_bcount_part_z + #undef __out_bcount_part_z + #undef __nvapi_undef__out_bcount_part_z +#endif +#ifdef __nvapi_undef__out_ecount_full_z + #undef __out_ecount_full_z + #undef __nvapi_undef__out_ecount_full_z +#endif +#ifdef __nvapi_undef__out_bcount_full_z + #undef __out_bcount_full_z + #undef __nvapi_undef__out_bcount_full_z +#endif +#ifdef __nvapi_undef__out_nz + #undef __out_nz + #undef __nvapi_undef__out_nz +#endif +#ifdef __nvapi_undef__out_nz_opt + #undef __out_nz_opt + #undef __nvapi_undef__out_nz_opt +#endif +#ifdef __nvapi_undef__out_ecount_nz + #undef __out_ecount_nz + #undef __nvapi_undef__out_ecount_nz +#endif +#ifdef __nvapi_undef__out_bcount_nz + #undef __out_bcount_nz + #undef __nvapi_undef__out_bcount_nz +#endif +#ifdef __nvapi_undef__inout + #undef __inout + #undef __nvapi_undef__inout +#endif +#ifdef __nvapi_undef__inout_ecount + #undef __inout_ecount + #undef __nvapi_undef__inout_ecount +#endif +#ifdef __nvapi_undef__inout_bcount + #undef __inout_bcount + #undef __nvapi_undef__inout_bcount +#endif +#ifdef __nvapi_undef__inout_ecount_part + #undef __inout_ecount_part + #undef __nvapi_undef__inout_ecount_part +#endif +#ifdef __nvapi_undef__inout_bcount_part + #undef __inout_bcount_part + #undef __nvapi_undef__inout_bcount_part +#endif +#ifdef __nvapi_undef__inout_ecount_full + #undef __inout_ecount_full + #undef __nvapi_undef__inout_ecount_full +#endif +#ifdef __nvapi_undef__inout_bcount_full + #undef __inout_bcount_full + #undef __nvapi_undef__inout_bcount_full +#endif +#ifdef __nvapi_undef__inout_z + #undef __inout_z + #undef __nvapi_undef__inout_z +#endif +#ifdef __nvapi_undef__inout_ecount_z + #undef __inout_ecount_z + #undef __nvapi_undef__inout_ecount_z +#endif +#ifdef __nvapi_undef__inout_bcount_z + #undef __inout_bcount_z + #undef __nvapi_undef__inout_bcount_z +#endif +#ifdef __nvapi_undef__inout_nz + #undef __inout_nz + #undef __nvapi_undef__inout_nz +#endif +#ifdef __nvapi_undef__inout_ecount_nz + #undef __inout_ecount_nz + #undef __nvapi_undef__inout_ecount_nz +#endif +#ifdef __nvapi_undef__inout_bcount_nz + #undef __inout_bcount_nz + #undef __nvapi_undef__inout_bcount_nz +#endif +#ifdef __nvapi_undef__ecount_opt + #undef __ecount_opt + #undef __nvapi_undef__ecount_opt +#endif +#ifdef __nvapi_undef__bcount_opt + #undef __bcount_opt + #undef __nvapi_undef__bcount_opt +#endif +#ifdef __nvapi_undef__in_opt + #undef __in_opt + #undef __nvapi_undef__in_opt +#endif +#ifdef __nvapi_undef__in_ecount_opt + #undef __in_ecount_opt + #undef __nvapi_undef__in_ecount_opt +#endif +#ifdef __nvapi_undef__in_bcount_opt + #undef __in_bcount_opt + #undef __nvapi_undef__in_bcount_opt +#endif +#ifdef __nvapi_undef__in_z_opt + #undef __in_z_opt + #undef __nvapi_undef__in_z_opt +#endif +#ifdef __nvapi_undef__in_ecount_z_opt + #undef __in_ecount_z_opt + #undef __nvapi_undef__in_ecount_z_opt +#endif +#ifdef __nvapi_undef__in_bcount_z_opt + #undef __in_bcount_z_opt + #undef __nvapi_undef__in_bcount_z_opt +#endif +#ifdef __nvapi_undef__in_nz_opt + #undef __in_nz_opt + #undef __nvapi_undef__in_nz_opt +#endif +#ifdef __nvapi_undef__in_ecount_nz_opt + #undef __in_ecount_nz_opt + #undef __nvapi_undef__in_ecount_nz_opt +#endif +#ifdef __nvapi_undef__in_bcount_nz_opt + #undef __in_bcount_nz_opt + #undef __nvapi_undef__in_bcount_nz_opt +#endif +#ifdef __nvapi_undef__out_opt + #undef __out_opt + #undef __nvapi_undef__out_opt +#endif +#ifdef __nvapi_undef__out_ecount_opt + #undef __out_ecount_opt + #undef __nvapi_undef__out_ecount_opt 
+#endif +#ifdef __nvapi_undef__out_bcount_opt + #undef __out_bcount_opt + #undef __nvapi_undef__out_bcount_opt +#endif +#ifdef __nvapi_undef__out_ecount_part_opt + #undef __out_ecount_part_opt + #undef __nvapi_undef__out_ecount_part_opt +#endif +#ifdef __nvapi_undef__out_bcount_part_opt + #undef __out_bcount_part_opt + #undef __nvapi_undef__out_bcount_part_opt +#endif +#ifdef __nvapi_undef__out_ecount_full_opt + #undef __out_ecount_full_opt + #undef __nvapi_undef__out_ecount_full_opt +#endif +#ifdef __nvapi_undef__out_bcount_full_opt + #undef __out_bcount_full_opt + #undef __nvapi_undef__out_bcount_full_opt +#endif +#ifdef __nvapi_undef__out_ecount_z_opt + #undef __out_ecount_z_opt + #undef __nvapi_undef__out_ecount_z_opt +#endif +#ifdef __nvapi_undef__out_bcount_z_opt + #undef __out_bcount_z_opt + #undef __nvapi_undef__out_bcount_z_opt +#endif +#ifdef __nvapi_undef__out_ecount_part_z_opt + #undef __out_ecount_part_z_opt + #undef __nvapi_undef__out_ecount_part_z_opt +#endif +#ifdef __nvapi_undef__out_bcount_part_z_opt + #undef __out_bcount_part_z_opt + #undef __nvapi_undef__out_bcount_part_z_opt +#endif +#ifdef __nvapi_undef__out_ecount_full_z_opt + #undef __out_ecount_full_z_opt + #undef __nvapi_undef__out_ecount_full_z_opt +#endif +#ifdef __nvapi_undef__out_bcount_full_z_opt + #undef __out_bcount_full_z_opt + #undef __nvapi_undef__out_bcount_full_z_opt +#endif +#ifdef __nvapi_undef__out_ecount_nz_opt + #undef __out_ecount_nz_opt + #undef __nvapi_undef__out_ecount_nz_opt +#endif +#ifdef __nvapi_undef__out_bcount_nz_opt + #undef __out_bcount_nz_opt + #undef __nvapi_undef__out_bcount_nz_opt +#endif +#ifdef __nvapi_undef__inout_opt + #undef __inout_opt + #undef __nvapi_undef__inout_opt +#endif +#ifdef __nvapi_undef__inout_ecount_opt + #undef __inout_ecount_opt + #undef __nvapi_undef__inout_ecount_opt +#endif +#ifdef __nvapi_undef__inout_bcount_opt + #undef __inout_bcount_opt + #undef __nvapi_undef__inout_bcount_opt +#endif +#ifdef __nvapi_undef__inout_ecount_part_opt + #undef __inout_ecount_part_opt + #undef __nvapi_undef__inout_ecount_part_opt +#endif +#ifdef __nvapi_undef__inout_bcount_part_opt + #undef __inout_bcount_part_opt + #undef __nvapi_undef__inout_bcount_part_opt +#endif +#ifdef __nvapi_undef__inout_ecount_full_opt + #undef __inout_ecount_full_opt + #undef __nvapi_undef__inout_ecount_full_opt +#endif +#ifdef __nvapi_undef__inout_bcount_full_opt + #undef __inout_bcount_full_opt + #undef __nvapi_undef__inout_bcount_full_opt +#endif +#ifdef __nvapi_undef__inout_z_opt + #undef __inout_z_opt + #undef __nvapi_undef__inout_z_opt +#endif +#ifdef __nvapi_undef__inout_ecount_z_opt + #undef __inout_ecount_z_opt + #undef __nvapi_undef__inout_ecount_z_opt +#endif +#ifdef __nvapi_undef__inout_ecount_z_opt + #undef __inout_ecount_z_opt + #undef __nvapi_undef__inout_ecount_z_opt +#endif +#ifdef __nvapi_undef__inout_bcount_z_opt + #undef __inout_bcount_z_opt + #undef __nvapi_undef__inout_bcount_z_opt +#endif +#ifdef __nvapi_undef__inout_nz_opt + #undef __inout_nz_opt + #undef __nvapi_undef__inout_nz_opt +#endif +#ifdef __nvapi_undef__inout_ecount_nz_opt + #undef __inout_ecount_nz_opt + #undef __nvapi_undef__inout_ecount_nz_opt +#endif +#ifdef __nvapi_undef__inout_bcount_nz_opt + #undef __inout_bcount_nz_opt + #undef __nvapi_undef__inout_bcount_nz_opt +#endif +#ifdef __nvapi_undef__deref_ecount + #undef __deref_ecount + #undef __nvapi_undef__deref_ecount +#endif +#ifdef __nvapi_undef__deref_bcount + #undef __deref_bcount + #undef __nvapi_undef__deref_bcount +#endif +#ifdef __nvapi_undef__deref_out 
+ #undef __deref_out + #undef __nvapi_undef__deref_out +#endif +#ifdef __nvapi_undef__deref_out_ecount + #undef __deref_out_ecount + #undef __nvapi_undef__deref_out_ecount +#endif +#ifdef __nvapi_undef__deref_out_bcount + #undef __deref_out_bcount + #undef __nvapi_undef__deref_out_bcount +#endif +#ifdef __nvapi_undef__deref_out_ecount_part + #undef __deref_out_ecount_part + #undef __nvapi_undef__deref_out_ecount_part +#endif +#ifdef __nvapi_undef__deref_out_bcount_part + #undef __deref_out_bcount_part + #undef __nvapi_undef__deref_out_bcount_part +#endif +#ifdef __nvapi_undef__deref_out_ecount_full + #undef __deref_out_ecount_full + #undef __nvapi_undef__deref_out_ecount_full +#endif +#ifdef __nvapi_undef__deref_out_bcount_full + #undef __deref_out_bcount_full + #undef __nvapi_undef__deref_out_bcount_full +#endif +#ifdef __nvapi_undef__deref_out_z + #undef __deref_out_z + #undef __nvapi_undef__deref_out_z +#endif +#ifdef __nvapi_undef__deref_out_ecount_z + #undef __deref_out_ecount_z + #undef __nvapi_undef__deref_out_ecount_z +#endif +#ifdef __nvapi_undef__deref_out_bcount_z + #undef __deref_out_bcount_z + #undef __nvapi_undef__deref_out_bcount_z +#endif +#ifdef __nvapi_undef__deref_out_nz + #undef __deref_out_nz + #undef __nvapi_undef__deref_out_nz +#endif +#ifdef __nvapi_undef__deref_out_ecount_nz + #undef __deref_out_ecount_nz + #undef __nvapi_undef__deref_out_ecount_nz +#endif +#ifdef __nvapi_undef__deref_out_bcount_nz + #undef __deref_out_bcount_nz + #undef __nvapi_undef__deref_out_bcount_nz +#endif +#ifdef __nvapi_undef__deref_inout + #undef __deref_inout + #undef __nvapi_undef__deref_inout +#endif +#ifdef __nvapi_undef__deref_inout_z + #undef __deref_inout_z + #undef __nvapi_undef__deref_inout_z +#endif +#ifdef __nvapi_undef__deref_inout_ecount + #undef __deref_inout_ecount + #undef __nvapi_undef__deref_inout_ecount +#endif +#ifdef __nvapi_undef__deref_inout_bcount + #undef __deref_inout_bcount + #undef __nvapi_undef__deref_inout_bcount +#endif +#ifdef __nvapi_undef__deref_inout_ecount_part + #undef __deref_inout_ecount_part + #undef __nvapi_undef__deref_inout_ecount_part +#endif +#ifdef __nvapi_undef__deref_inout_bcount_part + #undef __deref_inout_bcount_part + #undef __nvapi_undef__deref_inout_bcount_part +#endif +#ifdef __nvapi_undef__deref_inout_ecount_full + #undef __deref_inout_ecount_full + #undef __nvapi_undef__deref_inout_ecount_full +#endif +#ifdef __nvapi_undef__deref_inout_bcount_full + #undef __deref_inout_bcount_full + #undef __nvapi_undef__deref_inout_bcount_full +#endif +#ifdef __nvapi_undef__deref_inout_z + #undef __deref_inout_z + #undef __nvapi_undef__deref_inout_z +#endif +#ifdef __nvapi_undef__deref_inout_ecount_z + #undef __deref_inout_ecount_z + #undef __nvapi_undef__deref_inout_ecount_z +#endif +#ifdef __nvapi_undef__deref_inout_bcount_z + #undef __deref_inout_bcount_z + #undef __nvapi_undef__deref_inout_bcount_z +#endif +#ifdef __nvapi_undef__deref_inout_nz + #undef __deref_inout_nz + #undef __nvapi_undef__deref_inout_nz +#endif +#ifdef __nvapi_undef__deref_inout_ecount_nz + #undef __deref_inout_ecount_nz + #undef __nvapi_undef__deref_inout_ecount_nz +#endif +#ifdef __nvapi_undef__deref_inout_bcount_nz + #undef __deref_inout_bcount_nz + #undef __nvapi_undef__deref_inout_bcount_nz +#endif +#ifdef __nvapi_undef__deref_ecount_opt + #undef __deref_ecount_opt + #undef __nvapi_undef__deref_ecount_opt +#endif +#ifdef __nvapi_undef__deref_bcount_opt + #undef __deref_bcount_opt + #undef __nvapi_undef__deref_bcount_opt +#endif +#ifdef __nvapi_undef__deref_out_opt + 
#undef __deref_out_opt + #undef __nvapi_undef__deref_out_opt +#endif +#ifdef __nvapi_undef__deref_out_ecount_opt + #undef __deref_out_ecount_opt + #undef __nvapi_undef__deref_out_ecount_opt +#endif +#ifdef __nvapi_undef__deref_out_bcount_opt + #undef __deref_out_bcount_opt + #undef __nvapi_undef__deref_out_bcount_opt +#endif +#ifdef __nvapi_undef__deref_out_ecount_part_opt + #undef __deref_out_ecount_part_opt + #undef __nvapi_undef__deref_out_ecount_part_opt +#endif +#ifdef __nvapi_undef__deref_out_bcount_part_opt + #undef __deref_out_bcount_part_opt + #undef __nvapi_undef__deref_out_bcount_part_opt +#endif +#ifdef __nvapi_undef__deref_out_ecount_full_opt + #undef __deref_out_ecount_full_opt + #undef __nvapi_undef__deref_out_ecount_full_opt +#endif +#ifdef __nvapi_undef__deref_out_bcount_full_opt + #undef __deref_out_bcount_full_opt + #undef __nvapi_undef__deref_out_bcount_full_opt +#endif +#ifdef __nvapi_undef__deref_out_z_opt + #undef __deref_out_z_opt + #undef __nvapi_undef__deref_out_z_opt +#endif +#ifdef __nvapi_undef__deref_out_ecount_z_opt + #undef __deref_out_ecount_z_opt + #undef __nvapi_undef__deref_out_ecount_z_opt +#endif +#ifdef __nvapi_undef__deref_out_bcount_z_opt + #undef __deref_out_bcount_z_opt + #undef __nvapi_undef__deref_out_bcount_z_opt +#endif +#ifdef __nvapi_undef__deref_out_nz_opt + #undef __deref_out_nz_opt + #undef __nvapi_undef__deref_out_nz_opt +#endif +#ifdef __nvapi_undef__deref_out_ecount_nz_opt + #undef __deref_out_ecount_nz_opt + #undef __nvapi_undef__deref_out_ecount_nz_opt +#endif +#ifdef __nvapi_undef__deref_out_bcount_nz_opt + #undef __deref_out_bcount_nz_opt + #undef __nvapi_undef__deref_out_bcount_nz_opt +#endif +#ifdef __nvapi_undef__deref_inout_opt + #undef __deref_inout_opt + #undef __nvapi_undef__deref_inout_opt +#endif +#ifdef __nvapi_undef__deref_inout_ecount_opt + #undef __deref_inout_ecount_opt + #undef __nvapi_undef__deref_inout_ecount_opt +#endif +#ifdef __nvapi_undef__deref_inout_bcount_opt + #undef __deref_inout_bcount_opt + #undef __nvapi_undef__deref_inout_bcount_opt +#endif +#ifdef __nvapi_undef__deref_inout_ecount_part_opt + #undef __deref_inout_ecount_part_opt + #undef __nvapi_undef__deref_inout_ecount_part_opt +#endif +#ifdef __nvapi_undef__deref_inout_bcount_part_opt + #undef __deref_inout_bcount_part_opt + #undef __nvapi_undef__deref_inout_bcount_part_opt +#endif +#ifdef __nvapi_undef__deref_inout_ecount_full_opt + #undef __deref_inout_ecount_full_opt + #undef __nvapi_undef__deref_inout_ecount_full_opt +#endif +#ifdef __nvapi_undef__deref_inout_bcount_full_opt + #undef __deref_inout_bcount_full_opt + #undef __nvapi_undef__deref_inout_bcount_full_opt +#endif +#ifdef __nvapi_undef__deref_inout_z_opt + #undef __deref_inout_z_opt + #undef __nvapi_undef__deref_inout_z_opt +#endif +#ifdef __nvapi_undef__deref_inout_ecount_z_opt + #undef __deref_inout_ecount_z_opt + #undef __nvapi_undef__deref_inout_ecount_z_opt +#endif +#ifdef __nvapi_undef__deref_inout_bcount_z_opt + #undef __deref_inout_bcount_z_opt + #undef __nvapi_undef__deref_inout_bcount_z_opt +#endif +#ifdef __nvapi_undef__deref_inout_nz_opt + #undef __deref_inout_nz_opt + #undef __nvapi_undef__deref_inout_nz_opt +#endif +#ifdef __nvapi_undef__deref_inout_ecount_nz_opt + #undef __deref_inout_ecount_nz_opt + #undef __nvapi_undef__deref_inout_ecount_nz_opt +#endif +#ifdef __nvapi_undef__deref_inout_bcount_nz_opt + #undef __deref_inout_bcount_nz_opt + #undef __nvapi_undef__deref_inout_bcount_nz_opt +#endif +#ifdef __nvapi_undef__deref_opt_ecount + #undef __deref_opt_ecount + #undef 
__nvapi_undef__deref_opt_ecount +#endif +#ifdef __nvapi_undef__deref_opt_bcount + #undef __deref_opt_bcount + #undef __nvapi_undef__deref_opt_bcount +#endif +#ifdef __nvapi_undef__deref_opt_out + #undef __deref_opt_out + #undef __nvapi_undef__deref_opt_out +#endif +#ifdef __nvapi_undef__deref_opt_out_z + #undef __deref_opt_out_z + #undef __nvapi_undef__deref_opt_out_z +#endif +#ifdef __nvapi_undef__deref_opt_out_ecount + #undef __deref_opt_out_ecount + #undef __nvapi_undef__deref_opt_out_ecount +#endif +#ifdef __nvapi_undef__deref_opt_out_bcount + #undef __deref_opt_out_bcount + #undef __nvapi_undef__deref_opt_out_bcount +#endif +#ifdef __nvapi_undef__deref_opt_out_ecount_part + #undef __deref_opt_out_ecount_part + #undef __nvapi_undef__deref_opt_out_ecount_part +#endif +#ifdef __nvapi_undef__deref_opt_out_bcount_part + #undef __deref_opt_out_bcount_part + #undef __nvapi_undef__deref_opt_out_bcount_part +#endif +#ifdef __nvapi_undef__deref_opt_out_ecount_full + #undef __deref_opt_out_ecount_full + #undef __nvapi_undef__deref_opt_out_ecount_full +#endif +#ifdef __nvapi_undef__deref_opt_out_bcount_full + #undef __deref_opt_out_bcount_full + #undef __nvapi_undef__deref_opt_out_bcount_full +#endif +#ifdef __nvapi_undef__deref_opt_inout + #undef __deref_opt_inout + #undef __nvapi_undef__deref_opt_inout +#endif +#ifdef __nvapi_undef__deref_opt_inout_ecount + #undef __deref_opt_inout_ecount + #undef __nvapi_undef__deref_opt_inout_ecount +#endif +#ifdef __nvapi_undef__deref_opt_inout_bcount + #undef __deref_opt_inout_bcount + #undef __nvapi_undef__deref_opt_inout_bcount +#endif +#ifdef __nvapi_undef__deref_opt_inout_ecount_part + #undef __deref_opt_inout_ecount_part + #undef __nvapi_undef__deref_opt_inout_ecount_part +#endif +#ifdef __nvapi_undef__deref_opt_inout_bcount_part + #undef __deref_opt_inout_bcount_part + #undef __nvapi_undef__deref_opt_inout_bcount_part +#endif +#ifdef __nvapi_undef__deref_opt_inout_ecount_full + #undef __deref_opt_inout_ecount_full + #undef __nvapi_undef__deref_opt_inout_ecount_full +#endif +#ifdef __nvapi_undef__deref_opt_inout_bcount_full + #undef __deref_opt_inout_bcount_full + #undef __nvapi_undef__deref_opt_inout_bcount_full +#endif +#ifdef __nvapi_undef__deref_opt_inout_z + #undef __deref_opt_inout_z + #undef __nvapi_undef__deref_opt_inout_z +#endif +#ifdef __nvapi_undef__deref_opt_inout_ecount_z + #undef __deref_opt_inout_ecount_z + #undef __nvapi_undef__deref_opt_inout_ecount_z +#endif +#ifdef __nvapi_undef__deref_opt_inout_bcount_z + #undef __deref_opt_inout_bcount_z + #undef __nvapi_undef__deref_opt_inout_bcount_z +#endif +#ifdef __nvapi_undef__deref_opt_inout_nz + #undef __deref_opt_inout_nz + #undef __nvapi_undef__deref_opt_inout_nz +#endif +#ifdef __nvapi_undef__deref_opt_inout_ecount_nz + #undef __deref_opt_inout_ecount_nz + #undef __nvapi_undef__deref_opt_inout_ecount_nz +#endif +#ifdef __nvapi_undef__deref_opt_inout_bcount_nz + #undef __deref_opt_inout_bcount_nz + #undef __nvapi_undef__deref_opt_inout_bcount_nz +#endif +#ifdef __nvapi_undef__deref_opt_ecount_opt + #undef __deref_opt_ecount_opt + #undef __nvapi_undef__deref_opt_ecount_opt +#endif +#ifdef __nvapi_undef__deref_opt_bcount_opt + #undef __deref_opt_bcount_opt + #undef __nvapi_undef__deref_opt_bcount_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_opt + #undef __deref_opt_out_opt + #undef __nvapi_undef__deref_opt_out_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_ecount_opt + #undef __deref_opt_out_ecount_opt + #undef __nvapi_undef__deref_opt_out_ecount_opt +#endif +#ifdef 
__nvapi_undef__deref_opt_out_bcount_opt + #undef __deref_opt_out_bcount_opt + #undef __nvapi_undef__deref_opt_out_bcount_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_ecount_part_opt + #undef __deref_opt_out_ecount_part_opt + #undef __nvapi_undef__deref_opt_out_ecount_part_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_bcount_part_opt + #undef __deref_opt_out_bcount_part_opt + #undef __nvapi_undef__deref_opt_out_bcount_part_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_ecount_full_opt + #undef __deref_opt_out_ecount_full_opt + #undef __nvapi_undef__deref_opt_out_ecount_full_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_bcount_full_opt + #undef __deref_opt_out_bcount_full_opt + #undef __nvapi_undef__deref_opt_out_bcount_full_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_z_opt + #undef __deref_opt_out_z_opt + #undef __nvapi_undef__deref_opt_out_z_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_ecount_z_opt + #undef __deref_opt_out_ecount_z_opt + #undef __nvapi_undef__deref_opt_out_ecount_z_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_bcount_z_opt + #undef __deref_opt_out_bcount_z_opt + #undef __nvapi_undef__deref_opt_out_bcount_z_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_nz_opt + #undef __deref_opt_out_nz_opt + #undef __nvapi_undef__deref_opt_out_nz_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_ecount_nz_opt + #undef __deref_opt_out_ecount_nz_opt + #undef __nvapi_undef__deref_opt_out_ecount_nz_opt +#endif +#ifdef __nvapi_undef__deref_opt_out_bcount_nz_opt + #undef __deref_opt_out_bcount_nz_opt + #undef __nvapi_undef__deref_opt_out_bcount_nz_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_opt + #undef __deref_opt_inout_opt + #undef __nvapi_undef__deref_opt_inout_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_ecount_opt + #undef __deref_opt_inout_ecount_opt + #undef __nvapi_undef__deref_opt_inout_ecount_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_bcount_opt + #undef __deref_opt_inout_bcount_opt + #undef __nvapi_undef__deref_opt_inout_bcount_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_ecount_part_opt + #undef __deref_opt_inout_ecount_part_opt + #undef __nvapi_undef__deref_opt_inout_ecount_part_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_bcount_part_opt + #undef __deref_opt_inout_bcount_part_opt + #undef __nvapi_undef__deref_opt_inout_bcount_part_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_ecount_full_opt + #undef __deref_opt_inout_ecount_full_opt + #undef __nvapi_undef__deref_opt_inout_ecount_full_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_bcount_full_opt + #undef __deref_opt_inout_bcount_full_opt + #undef __nvapi_undef__deref_opt_inout_bcount_full_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_z_opt + #undef __deref_opt_inout_z_opt + #undef __nvapi_undef__deref_opt_inout_z_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_ecount_z_opt + #undef __deref_opt_inout_ecount_z_opt + #undef __nvapi_undef__deref_opt_inout_ecount_z_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_bcount_z_opt + #undef __deref_opt_inout_bcount_z_opt + #undef __nvapi_undef__deref_opt_inout_bcount_z_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_nz_opt + #undef __deref_opt_inout_nz_opt + #undef __nvapi_undef__deref_opt_inout_nz_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_ecount_nz_opt + #undef __deref_opt_inout_ecount_nz_opt + #undef __nvapi_undef__deref_opt_inout_ecount_nz_opt +#endif +#ifdef __nvapi_undef__deref_opt_inout_bcount_nz_opt + #undef __deref_opt_inout_bcount_nz_opt + #undef __nvapi_undef__deref_opt_inout_bcount_nz_opt 
+#endif +#ifdef __nvapi_success + #undef __success + #undef __nvapi_success +#endif +#ifdef __nvapi__Ret_notnull_ + #undef __nvapi__Ret_notnull_ + #undef _Ret_notnull_ +#endif +#ifdef __nvapi__Post_writable_byte_size_ + #undef __nvapi__Post_writable_byte_size_ + #undef _Post_writable_byte_size_ +#endif +#ifdef __nvapi_Outptr_ + #undef __nvapi_Outptr_ + #undef _Outptr_ +#endif + +#endif // __NVAPI_EMPTY_SAL diff --git a/compat/nvapi/nvapi_lite_salstart.h b/compat/nvapi/nvapi_lite_salstart.h new file mode 100644 index 0000000000..8f493a4b49 --- /dev/null +++ b/compat/nvapi/nvapi_lite_salstart.h @@ -0,0 +1,821 @@ + /************************************************************************************************************************************\ +|* *| +|* Copyright © 2012 NVIDIA Corporation. All rights reserved. *| +|* *| +|* NOTICE TO USER: *| +|* *| +|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *| +|* *| +|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *| +|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *| +|* Otherwise, you have no rights to use or access this software in any manner. *| +|* *| +|* If not covered by the applicable NVIDIA software license agreement: *| +|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *| +|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *| +|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *| +|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. *| +|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *| +|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *| +|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *| +|* *| +|* U.S. Government End Users. *| +|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *| +|* consisting of "commercial computer software" and "commercial computer software documentation" *| +|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *| +|* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *| +|* all U.S. Government End Users acquire the software with only those rights set forth herein. *| +|* *| +|* Any use of this software in individual and commercial software must include, *| +|* in the user documentation and internal comments to the code, *| +|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. 
*| +|* *| + \************************************************************************************************************************************/ + +// ==================================================== +// SAL related support +// ==================================================== + +#ifndef __ecount + #define __nvapi_undef__ecount + #define __ecount(size) +#endif +#ifndef __bcount + #define __nvapi_undef__bcount + #define __bcount(size) +#endif +#ifndef __in + #define __nvapi_undef__in + #define __in +#endif +#ifndef __in_ecount + #define __nvapi_undef__in_ecount + #define __in_ecount(size) +#endif +#ifndef __in_bcount + #define __nvapi_undef__in_bcount + #define __in_bcount(size) +#endif +#ifndef __in_z + #define __nvapi_undef__in_z + #define __in_z +#endif +#ifndef __in_ecount_z + #define __nvapi_undef__in_ecount_z + #define __in_ecount_z(size) +#endif +#ifndef __in_bcount_z + #define __nvapi_undef__in_bcount_z + #define __in_bcount_z(size) +#endif +#ifndef __in_nz + #define __nvapi_undef__in_nz + #define __in_nz +#endif +#ifndef __in_ecount_nz + #define __nvapi_undef__in_ecount_nz + #define __in_ecount_nz(size) +#endif +#ifndef __in_bcount_nz + #define __nvapi_undef__in_bcount_nz + #define __in_bcount_nz(size) +#endif +#ifndef __out + #define __nvapi_undef__out + #define __out +#endif +#ifndef __out_ecount + #define __nvapi_undef__out_ecount + #define __out_ecount(size) +#endif +#ifndef __out_bcount + #define __nvapi_undef__out_bcount + #define __out_bcount(size) +#endif +#ifndef __out_ecount_part + #define __nvapi_undef__out_ecount_part + #define __out_ecount_part(size,length) +#endif +#ifndef __out_bcount_part + #define __nvapi_undef__out_bcount_part + #define __out_bcount_part(size,length) +#endif +#ifndef __out_ecount_full + #define __nvapi_undef__out_ecount_full + #define __out_ecount_full(size) +#endif +#ifndef __out_bcount_full + #define __nvapi_undef__out_bcount_full + #define __out_bcount_full(size) +#endif +#ifndef __out_z + #define __nvapi_undef__out_z + #define __out_z +#endif +#ifndef __out_z_opt + #define __nvapi_undef__out_z_opt + #define __out_z_opt +#endif +#ifndef __out_ecount_z + #define __nvapi_undef__out_ecount_z + #define __out_ecount_z(size) +#endif +#ifndef __out_bcount_z + #define __nvapi_undef__out_bcount_z + #define __out_bcount_z(size) +#endif +#ifndef __out_ecount_part_z + #define __nvapi_undef__out_ecount_part_z + #define __out_ecount_part_z(size,length) +#endif +#ifndef __out_bcount_part_z + #define __nvapi_undef__out_bcount_part_z + #define __out_bcount_part_z(size,length) +#endif +#ifndef __out_ecount_full_z + #define __nvapi_undef__out_ecount_full_z + #define __out_ecount_full_z(size) +#endif +#ifndef __out_bcount_full_z + #define __nvapi_undef__out_bcount_full_z + #define __out_bcount_full_z(size) +#endif +#ifndef __out_nz + #define __nvapi_undef__out_nz + #define __out_nz +#endif +#ifndef __out_nz_opt + #define __nvapi_undef__out_nz_opt + #define __out_nz_opt +#endif +#ifndef __out_ecount_nz + #define __nvapi_undef__out_ecount_nz + #define __out_ecount_nz(size) +#endif +#ifndef __out_bcount_nz + #define __nvapi_undef__out_bcount_nz + #define __out_bcount_nz(size) +#endif +#ifndef __inout + #define __nvapi_undef__inout + #define __inout +#endif +#ifndef __inout_ecount + #define __nvapi_undef__inout_ecount + #define __inout_ecount(size) +#endif +#ifndef __inout_bcount + #define __nvapi_undef__inout_bcount + #define __inout_bcount(size) +#endif +#ifndef __inout_ecount_part + #define __nvapi_undef__inout_ecount_part + #define 
__inout_ecount_part(size,length) +#endif +#ifndef __inout_bcount_part + #define __nvapi_undef__inout_bcount_part + #define __inout_bcount_part(size,length) +#endif +#ifndef __inout_ecount_full + #define __nvapi_undef__inout_ecount_full + #define __inout_ecount_full(size) +#endif +#ifndef __inout_bcount_full + #define __nvapi_undef__inout_bcount_full + #define __inout_bcount_full(size) +#endif +#ifndef __inout_z + #define __nvapi_undef__inout_z + #define __inout_z +#endif +#ifndef __inout_ecount_z + #define __nvapi_undef__inout_ecount_z + #define __inout_ecount_z(size) +#endif +#ifndef __inout_bcount_z + #define __nvapi_undef__inout_bcount_z + #define __inout_bcount_z(size) +#endif +#ifndef __inout_nz + #define __nvapi_undef__inout_nz + #define __inout_nz +#endif +#ifndef __inout_ecount_nz + #define __nvapi_undef__inout_ecount_nz + #define __inout_ecount_nz(size) +#endif +#ifndef __inout_bcount_nz + #define __nvapi_undef__inout_bcount_nz + #define __inout_bcount_nz(size) +#endif +#ifndef __ecount_opt + #define __nvapi_undef__ecount_opt + #define __ecount_opt(size) +#endif +#ifndef __bcount_opt + #define __nvapi_undef__bcount_opt + #define __bcount_opt(size) +#endif +#ifndef __in_opt + #define __nvapi_undef__in_opt + #define __in_opt +#endif +#ifndef __in_ecount_opt + #define __nvapi_undef__in_ecount_opt + #define __in_ecount_opt(size) +#endif +#ifndef __in_bcount_opt + #define __nvapi_undef__in_bcount_opt + #define __in_bcount_opt(size) +#endif +#ifndef __in_z_opt + #define __nvapi_undef__in_z_opt + #define __in_z_opt +#endif +#ifndef __in_ecount_z_opt + #define __nvapi_undef__in_ecount_z_opt + #define __in_ecount_z_opt(size) +#endif +#ifndef __in_bcount_z_opt + #define __nvapi_undef__in_bcount_z_opt + #define __in_bcount_z_opt(size) +#endif +#ifndef __in_nz_opt + #define __nvapi_undef__in_nz_opt + #define __in_nz_opt +#endif +#ifndef __in_ecount_nz_opt + #define __nvapi_undef__in_ecount_nz_opt + #define __in_ecount_nz_opt(size) +#endif +#ifndef __in_bcount_nz_opt + #define __nvapi_undef__in_bcount_nz_opt + #define __in_bcount_nz_opt(size) +#endif +#ifndef __out_opt + #define __nvapi_undef__out_opt + #define __out_opt +#endif +#ifndef __out_ecount_opt + #define __nvapi_undef__out_ecount_opt + #define __out_ecount_opt(size) +#endif +#ifndef __out_bcount_opt + #define __nvapi_undef__out_bcount_opt + #define __out_bcount_opt(size) +#endif +#ifndef __out_ecount_part_opt + #define __nvapi_undef__out_ecount_part_opt + #define __out_ecount_part_opt(size,length) +#endif +#ifndef __out_bcount_part_opt + #define __nvapi_undef__out_bcount_part_opt + #define __out_bcount_part_opt(size,length) +#endif +#ifndef __out_ecount_full_opt + #define __nvapi_undef__out_ecount_full_opt + #define __out_ecount_full_opt(size) +#endif +#ifndef __out_bcount_full_opt + #define __nvapi_undef__out_bcount_full_opt + #define __out_bcount_full_opt(size) +#endif +#ifndef __out_ecount_z_opt + #define __nvapi_undef__out_ecount_z_opt + #define __out_ecount_z_opt(size) +#endif +#ifndef __out_bcount_z_opt + #define __nvapi_undef__out_bcount_z_opt + #define __out_bcount_z_opt(size) +#endif +#ifndef __out_ecount_part_z_opt + #define __nvapi_undef__out_ecount_part_z_opt + #define __out_ecount_part_z_opt(size,length) +#endif +#ifndef __out_bcount_part_z_opt + #define __nvapi_undef__out_bcount_part_z_opt + #define __out_bcount_part_z_opt(size,length) +#endif +#ifndef __out_ecount_full_z_opt + #define __nvapi_undef__out_ecount_full_z_opt + #define __out_ecount_full_z_opt(size) +#endif +#ifndef __out_bcount_full_z_opt + #define 
__nvapi_undef__out_bcount_full_z_opt + #define __out_bcount_full_z_opt(size) +#endif +#ifndef __out_ecount_nz_opt + #define __nvapi_undef__out_ecount_nz_opt + #define __out_ecount_nz_opt(size) +#endif +#ifndef __out_bcount_nz_opt + #define __nvapi_undef__out_bcount_nz_opt + #define __out_bcount_nz_opt(size) +#endif +#ifndef __inout_opt + #define __nvapi_undef__inout_opt + #define __inout_opt +#endif +#ifndef __inout_ecount_opt + #define __nvapi_undef__inout_ecount_opt + #define __inout_ecount_opt(size) +#endif +#ifndef __inout_bcount_opt + #define __nvapi_undef__inout_bcount_opt + #define __inout_bcount_opt(size) +#endif +#ifndef __inout_ecount_part_opt + #define __nvapi_undef__inout_ecount_part_opt + #define __inout_ecount_part_opt(size,length) +#endif +#ifndef __inout_bcount_part_opt + #define __nvapi_undef__inout_bcount_part_opt + #define __inout_bcount_part_opt(size,length) +#endif +#ifndef __inout_ecount_full_opt + #define __nvapi_undef__inout_ecount_full_opt + #define __inout_ecount_full_opt(size) +#endif +#ifndef __inout_bcount_full_opt + #define __nvapi_undef__inout_bcount_full_opt + #define __inout_bcount_full_opt(size) +#endif +#ifndef __inout_z_opt + #define __nvapi_undef__inout_z_opt + #define __inout_z_opt +#endif +#ifndef __inout_ecount_z_opt + #define __nvapi_undef__inout_ecount_z_opt + #define __inout_ecount_z_opt(size) +#endif +#ifndef __inout_ecount_z_opt + #define __nvapi_undef__inout_ecount_z_opt + #define __inout_ecount_z_opt(size) +#endif +#ifndef __inout_bcount_z_opt + #define __nvapi_undef__inout_bcount_z_opt + #define __inout_bcount_z_opt(size) +#endif +#ifndef __inout_nz_opt + #define __nvapi_undef__inout_nz_opt + #define __inout_nz_opt +#endif +#ifndef __inout_ecount_nz_opt + #define __nvapi_undef__inout_ecount_nz_opt + #define __inout_ecount_nz_opt(size) +#endif +#ifndef __inout_bcount_nz_opt + #define __nvapi_undef__inout_bcount_nz_opt + #define __inout_bcount_nz_opt(size) +#endif +#ifndef __deref_ecount + #define __nvapi_undef__deref_ecount + #define __deref_ecount(size) +#endif +#ifndef __deref_bcount + #define __nvapi_undef__deref_bcount + #define __deref_bcount(size) +#endif +#ifndef __deref_out + #define __nvapi_undef__deref_out + #define __deref_out +#endif +#ifndef __deref_out_ecount + #define __nvapi_undef__deref_out_ecount + #define __deref_out_ecount(size) +#endif +#ifndef __deref_out_bcount + #define __nvapi_undef__deref_out_bcount + #define __deref_out_bcount(size) +#endif +#ifndef __deref_out_ecount_part + #define __nvapi_undef__deref_out_ecount_part + #define __deref_out_ecount_part(size,length) +#endif +#ifndef __deref_out_bcount_part + #define __nvapi_undef__deref_out_bcount_part + #define __deref_out_bcount_part(size,length) +#endif +#ifndef __deref_out_ecount_full + #define __nvapi_undef__deref_out_ecount_full + #define __deref_out_ecount_full(size) +#endif +#ifndef __deref_out_bcount_full + #define __nvapi_undef__deref_out_bcount_full + #define __deref_out_bcount_full(size) +#endif +#ifndef __deref_out_z + #define __nvapi_undef__deref_out_z + #define __deref_out_z +#endif +#ifndef __deref_out_ecount_z + #define __nvapi_undef__deref_out_ecount_z + #define __deref_out_ecount_z(size) +#endif +#ifndef __deref_out_bcount_z + #define __nvapi_undef__deref_out_bcount_z + #define __deref_out_bcount_z(size) +#endif +#ifndef __deref_out_nz + #define __nvapi_undef__deref_out_nz + #define __deref_out_nz +#endif +#ifndef __deref_out_ecount_nz + #define __nvapi_undef__deref_out_ecount_nz + #define __deref_out_ecount_nz(size) +#endif +#ifndef 
__deref_out_bcount_nz + #define __nvapi_undef__deref_out_bcount_nz + #define __deref_out_bcount_nz(size) +#endif +#ifndef __deref_inout + #define __nvapi_undef__deref_inout + #define __deref_inout +#endif +#ifndef __deref_inout_z + #define __nvapi_undef__deref_inout_z + #define __deref_inout_z +#endif +#ifndef __deref_inout_ecount + #define __nvapi_undef__deref_inout_ecount + #define __deref_inout_ecount(size) +#endif +#ifndef __deref_inout_bcount + #define __nvapi_undef__deref_inout_bcount + #define __deref_inout_bcount(size) +#endif +#ifndef __deref_inout_ecount_part + #define __nvapi_undef__deref_inout_ecount_part + #define __deref_inout_ecount_part(size,length) +#endif +#ifndef __deref_inout_bcount_part + #define __nvapi_undef__deref_inout_bcount_part + #define __deref_inout_bcount_part(size,length) +#endif +#ifndef __deref_inout_ecount_full + #define __nvapi_undef__deref_inout_ecount_full + #define __deref_inout_ecount_full(size) +#endif +#ifndef __deref_inout_bcount_full + #define __nvapi_undef__deref_inout_bcount_full + #define __deref_inout_bcount_full(size) +#endif +#ifndef __deref_inout_z + #define __nvapi_undef__deref_inout_z + #define __deref_inout_z +#endif +#ifndef __deref_inout_ecount_z + #define __nvapi_undef__deref_inout_ecount_z + #define __deref_inout_ecount_z(size) +#endif +#ifndef __deref_inout_bcount_z + #define __nvapi_undef__deref_inout_bcount_z + #define __deref_inout_bcount_z(size) +#endif +#ifndef __deref_inout_nz + #define __nvapi_undef__deref_inout_nz + #define __deref_inout_nz +#endif +#ifndef __deref_inout_ecount_nz + #define __nvapi_undef__deref_inout_ecount_nz + #define __deref_inout_ecount_nz(size) +#endif +#ifndef __deref_inout_bcount_nz + #define __nvapi_undef__deref_inout_bcount_nz + #define __deref_inout_bcount_nz(size) +#endif +#ifndef __deref_ecount_opt + #define __nvapi_undef__deref_ecount_opt + #define __deref_ecount_opt(size) +#endif +#ifndef __deref_bcount_opt + #define __nvapi_undef__deref_bcount_opt + #define __deref_bcount_opt(size) +#endif +#ifndef __deref_out_opt + #define __nvapi_undef__deref_out_opt + #define __deref_out_opt +#endif +#ifndef __deref_out_ecount_opt + #define __nvapi_undef__deref_out_ecount_opt + #define __deref_out_ecount_opt(size) +#endif +#ifndef __deref_out_bcount_opt + #define __nvapi_undef__deref_out_bcount_opt + #define __deref_out_bcount_opt(size) +#endif +#ifndef __deref_out_ecount_part_opt + #define __nvapi_undef__deref_out_ecount_part_opt + #define __deref_out_ecount_part_opt(size,length) +#endif +#ifndef __deref_out_bcount_part_opt + #define __nvapi_undef__deref_out_bcount_part_opt + #define __deref_out_bcount_part_opt(size,length) +#endif +#ifndef __deref_out_ecount_full_opt + #define __nvapi_undef__deref_out_ecount_full_opt + #define __deref_out_ecount_full_opt(size) +#endif +#ifndef __deref_out_bcount_full_opt + #define __nvapi_undef__deref_out_bcount_full_opt + #define __deref_out_bcount_full_opt(size) +#endif +#ifndef __deref_out_z_opt + #define __nvapi_undef__deref_out_z_opt + #define __deref_out_z_opt +#endif +#ifndef __deref_out_ecount_z_opt + #define __nvapi_undef__deref_out_ecount_z_opt + #define __deref_out_ecount_z_opt(size) +#endif +#ifndef __deref_out_bcount_z_opt + #define __nvapi_undef__deref_out_bcount_z_opt + #define __deref_out_bcount_z_opt(size) +#endif +#ifndef __deref_out_nz_opt + #define __nvapi_undef__deref_out_nz_opt + #define __deref_out_nz_opt +#endif +#ifndef __deref_out_ecount_nz_opt + #define __nvapi_undef__deref_out_ecount_nz_opt + #define __deref_out_ecount_nz_opt(size) +#endif 
+#ifndef __deref_out_bcount_nz_opt + #define __nvapi_undef__deref_out_bcount_nz_opt + #define __deref_out_bcount_nz_opt(size) +#endif +#ifndef __deref_inout_opt + #define __nvapi_undef__deref_inout_opt + #define __deref_inout_opt +#endif +#ifndef __deref_inout_ecount_opt + #define __nvapi_undef__deref_inout_ecount_opt + #define __deref_inout_ecount_opt(size) +#endif +#ifndef __deref_inout_bcount_opt + #define __nvapi_undef__deref_inout_bcount_opt + #define __deref_inout_bcount_opt(size) +#endif +#ifndef __deref_inout_ecount_part_opt + #define __nvapi_undef__deref_inout_ecount_part_opt + #define __deref_inout_ecount_part_opt(size,length) +#endif +#ifndef __deref_inout_bcount_part_opt + #define __nvapi_undef__deref_inout_bcount_part_opt + #define __deref_inout_bcount_part_opt(size,length) +#endif +#ifndef __deref_inout_ecount_full_opt + #define __nvapi_undef__deref_inout_ecount_full_opt + #define __deref_inout_ecount_full_opt(size) +#endif +#ifndef __deref_inout_bcount_full_opt + #define __nvapi_undef__deref_inout_bcount_full_opt + #define __deref_inout_bcount_full_opt(size) +#endif +#ifndef __deref_inout_z_opt + #define __nvapi_undef__deref_inout_z_opt + #define __deref_inout_z_opt +#endif +#ifndef __deref_inout_ecount_z_opt + #define __nvapi_undef__deref_inout_ecount_z_opt + #define __deref_inout_ecount_z_opt(size) +#endif +#ifndef __deref_inout_bcount_z_opt + #define __nvapi_undef__deref_inout_bcount_z_opt + #define __deref_inout_bcount_z_opt(size) +#endif +#ifndef __deref_inout_nz_opt + #define __nvapi_undef__deref_inout_nz_opt + #define __deref_inout_nz_opt +#endif +#ifndef __deref_inout_ecount_nz_opt + #define __nvapi_undef__deref_inout_ecount_nz_opt + #define __deref_inout_ecount_nz_opt(size) +#endif +#ifndef __deref_inout_bcount_nz_opt + #define __nvapi_undef__deref_inout_bcount_nz_opt + #define __deref_inout_bcount_nz_opt(size) +#endif +#ifndef __deref_opt_ecount + #define __nvapi_undef__deref_opt_ecount + #define __deref_opt_ecount(size) +#endif +#ifndef __deref_opt_bcount + #define __nvapi_undef__deref_opt_bcount + #define __deref_opt_bcount(size) +#endif +#ifndef __deref_opt_out + #define __nvapi_undef__deref_opt_out + #define __deref_opt_out +#endif +#ifndef __deref_opt_out_z + #define __nvapi_undef__deref_opt_out_z + #define __deref_opt_out_z +#endif +#ifndef __deref_opt_out_ecount + #define __nvapi_undef__deref_opt_out_ecount + #define __deref_opt_out_ecount(size) +#endif +#ifndef __deref_opt_out_bcount + #define __nvapi_undef__deref_opt_out_bcount + #define __deref_opt_out_bcount(size) +#endif +#ifndef __deref_opt_out_ecount_part + #define __nvapi_undef__deref_opt_out_ecount_part + #define __deref_opt_out_ecount_part(size,length) +#endif +#ifndef __deref_opt_out_bcount_part + #define __nvapi_undef__deref_opt_out_bcount_part + #define __deref_opt_out_bcount_part(size,length) +#endif +#ifndef __deref_opt_out_ecount_full + #define __nvapi_undef__deref_opt_out_ecount_full + #define __deref_opt_out_ecount_full(size) +#endif +#ifndef __deref_opt_out_bcount_full + #define __nvapi_undef__deref_opt_out_bcount_full + #define __deref_opt_out_bcount_full(size) +#endif +#ifndef __deref_opt_inout + #define __nvapi_undef__deref_opt_inout + #define __deref_opt_inout +#endif +#ifndef __deref_opt_inout_ecount + #define __nvapi_undef__deref_opt_inout_ecount + #define __deref_opt_inout_ecount(size) +#endif +#ifndef __deref_opt_inout_bcount + #define __nvapi_undef__deref_opt_inout_bcount + #define __deref_opt_inout_bcount(size) +#endif +#ifndef __deref_opt_inout_ecount_part + #define 
__nvapi_undef__deref_opt_inout_ecount_part + #define __deref_opt_inout_ecount_part(size,length) +#endif +#ifndef __deref_opt_inout_bcount_part + #define __nvapi_undef__deref_opt_inout_bcount_part + #define __deref_opt_inout_bcount_part(size,length) +#endif +#ifndef __deref_opt_inout_ecount_full + #define __nvapi_undef__deref_opt_inout_ecount_full + #define __deref_opt_inout_ecount_full(size) +#endif +#ifndef __deref_opt_inout_bcount_full + #define __nvapi_undef__deref_opt_inout_bcount_full + #define __deref_opt_inout_bcount_full(size) +#endif +#ifndef __deref_opt_inout_z + #define __nvapi_undef__deref_opt_inout_z + #define __deref_opt_inout_z +#endif +#ifndef __deref_opt_inout_ecount_z + #define __nvapi_undef__deref_opt_inout_ecount_z + #define __deref_opt_inout_ecount_z(size) +#endif +#ifndef __deref_opt_inout_bcount_z + #define __nvapi_undef__deref_opt_inout_bcount_z + #define __deref_opt_inout_bcount_z(size) +#endif +#ifndef __deref_opt_inout_nz + #define __nvapi_undef__deref_opt_inout_nz + #define __deref_opt_inout_nz +#endif +#ifndef __deref_opt_inout_ecount_nz + #define __nvapi_undef__deref_opt_inout_ecount_nz + #define __deref_opt_inout_ecount_nz(size) +#endif +#ifndef __deref_opt_inout_bcount_nz + #define __nvapi_undef__deref_opt_inout_bcount_nz + #define __deref_opt_inout_bcount_nz(size) +#endif +#ifndef __deref_opt_ecount_opt + #define __nvapi_undef__deref_opt_ecount_opt + #define __deref_opt_ecount_opt(size) +#endif +#ifndef __deref_opt_bcount_opt + #define __nvapi_undef__deref_opt_bcount_opt + #define __deref_opt_bcount_opt(size) +#endif +#ifndef __deref_opt_out_opt + #define __nvapi_undef__deref_opt_out_opt + #define __deref_opt_out_opt +#endif +#ifndef __deref_opt_out_ecount_opt + #define __nvapi_undef__deref_opt_out_ecount_opt + #define __deref_opt_out_ecount_opt(size) +#endif +#ifndef __deref_opt_out_bcount_opt + #define __nvapi_undef__deref_opt_out_bcount_opt + #define __deref_opt_out_bcount_opt(size) +#endif +#ifndef __deref_opt_out_ecount_part_opt + #define __nvapi_undef__deref_opt_out_ecount_part_opt + #define __deref_opt_out_ecount_part_opt(size,length) +#endif +#ifndef __deref_opt_out_bcount_part_opt + #define __nvapi_undef__deref_opt_out_bcount_part_opt + #define __deref_opt_out_bcount_part_opt(size,length) +#endif +#ifndef __deref_opt_out_ecount_full_opt + #define __nvapi_undef__deref_opt_out_ecount_full_opt + #define __deref_opt_out_ecount_full_opt(size) +#endif +#ifndef __deref_opt_out_bcount_full_opt + #define __nvapi_undef__deref_opt_out_bcount_full_opt + #define __deref_opt_out_bcount_full_opt(size) +#endif +#ifndef __deref_opt_out_z_opt + #define __nvapi_undef__deref_opt_out_z_opt + #define __deref_opt_out_z_opt +#endif +#ifndef __deref_opt_out_ecount_z_opt + #define __nvapi_undef__deref_opt_out_ecount_z_opt + #define __deref_opt_out_ecount_z_opt(size) +#endif +#ifndef __deref_opt_out_bcount_z_opt + #define __nvapi_undef__deref_opt_out_bcount_z_opt + #define __deref_opt_out_bcount_z_opt(size) +#endif +#ifndef __deref_opt_out_nz_opt + #define __nvapi_undef__deref_opt_out_nz_opt + #define __deref_opt_out_nz_opt +#endif +#ifndef __deref_opt_out_ecount_nz_opt + #define __nvapi_undef__deref_opt_out_ecount_nz_opt + #define __deref_opt_out_ecount_nz_opt(size) +#endif +#ifndef __deref_opt_out_bcount_nz_opt + #define __nvapi_undef__deref_opt_out_bcount_nz_opt + #define __deref_opt_out_bcount_nz_opt(size) +#endif +#ifndef __deref_opt_inout_opt + #define __nvapi_undef__deref_opt_inout_opt + #define __deref_opt_inout_opt +#endif +#ifndef __deref_opt_inout_ecount_opt + 
#define __nvapi_undef__deref_opt_inout_ecount_opt + #define __deref_opt_inout_ecount_opt(size) +#endif +#ifndef __deref_opt_inout_bcount_opt + #define __nvapi_undef__deref_opt_inout_bcount_opt + #define __deref_opt_inout_bcount_opt(size) +#endif +#ifndef __deref_opt_inout_ecount_part_opt + #define __nvapi_undef__deref_opt_inout_ecount_part_opt + #define __deref_opt_inout_ecount_part_opt(size,length) +#endif +#ifndef __deref_opt_inout_bcount_part_opt + #define __nvapi_undef__deref_opt_inout_bcount_part_opt + #define __deref_opt_inout_bcount_part_opt(size,length) +#endif +#ifndef __deref_opt_inout_ecount_full_opt + #define __nvapi_undef__deref_opt_inout_ecount_full_opt + #define __deref_opt_inout_ecount_full_opt(size) +#endif +#ifndef __deref_opt_inout_bcount_full_opt + #define __nvapi_undef__deref_opt_inout_bcount_full_opt + #define __deref_opt_inout_bcount_full_opt(size) +#endif +#ifndef __deref_opt_inout_z_opt + #define __nvapi_undef__deref_opt_inout_z_opt + #define __deref_opt_inout_z_opt +#endif +#ifndef __deref_opt_inout_ecount_z_opt + #define __nvapi_undef__deref_opt_inout_ecount_z_opt + #define __deref_opt_inout_ecount_z_opt(size) +#endif +#ifndef __deref_opt_inout_bcount_z_opt + #define __nvapi_undef__deref_opt_inout_bcount_z_opt + #define __deref_opt_inout_bcount_z_opt(size) +#endif +#ifndef __deref_opt_inout_nz_opt + #define __nvapi_undef__deref_opt_inout_nz_opt + #define __deref_opt_inout_nz_opt +#endif +#ifndef __deref_opt_inout_ecount_nz_opt + #define __nvapi_undef__deref_opt_inout_ecount_nz_opt + #define __deref_opt_inout_ecount_nz_opt(size) +#endif +#ifndef __deref_opt_inout_bcount_nz_opt + #define __nvapi_undef__deref_opt_inout_bcount_nz_opt + #define __deref_opt_inout_bcount_nz_opt(size) +#endif +#ifndef __success + #define __nvapi_success + #define __success(epxr) +#endif +#ifndef _Ret_notnull_ + #define __nvapi__Ret_notnull_ + #define _Ret_notnull_ +#endif +#ifndef _Post_writable_byte_size_ + #define __nvapi__Post_writable_byte_size_ + #define _Post_writable_byte_size_(n) +#endif +#ifndef _Outptr_ + #define __nvapi_Outptr_ + #define _Outptr_ +#endif + + +#define NVAPI_INTERFACE extern __success(return == NVAPI_OK) NvAPI_Status __cdecl diff --git a/compat/nvapi/nvapi_lite_sli.h b/compat/nvapi/nvapi_lite_sli.h new file mode 100644 index 0000000000..b4fd661261 --- /dev/null +++ b/compat/nvapi/nvapi_lite_sli.h @@ -0,0 +1,225 @@ + /************************************************************************************************************************************\ +|* *| +|* Copyright © 2012 NVIDIA Corporation. All rights reserved. *| +|* *| +|* NOTICE TO USER: *| +|* *| +|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *| +|* *| +|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *| +|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *| +|* Otherwise, you have no rights to use or access this software in any manner. *| +|* *| +|* If not covered by the applicable NVIDIA software license agreement: *| +|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *| +|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *| +|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *| +|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
*| +|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *| +|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *| +|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *| +|* *| +|* U.S. Government End Users. *| +|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *| +|* consisting of "commercial computer software" and "commercial computer software documentation" *| +|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *| +|* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *| +|* all U.S. Government End Users acquire the software with only those rights set forth herein. *| +|* *| +|* Any use of this software in individual and commercial software must include, *| +|* in the user documentation and internal comments to the code, *| +|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *| +|* *| + \************************************************************************************************************************************/ + +#pragma once +#include"nvapi_lite_salstart.h" +#include"nvapi_lite_common.h" +#pragma pack(push,8) +#ifdef __cplusplus +extern "C" { +#endif +//----------------------------------------------------------------------------- +// DirectX APIs +//----------------------------------------------------------------------------- + + +//! \ingroup dx +//! Used in NvAPI_D3D10_GetCurrentSLIState(), and NvAPI_D3D_GetCurrentSLIState(). +typedef struct +{ + NvU32 version; //!< Structure version + NvU32 maxNumAFRGroups; //!< [OUT] The maximum possible value of numAFRGroups + NvU32 numAFRGroups; //!< [OUT] The number of AFR groups enabled in the system + NvU32 currentAFRIndex; //!< [OUT] The AFR group index for the frame currently being rendered + NvU32 nextFrameAFRIndex; //!< [OUT] What the AFR group index will be for the next frame (i.e. after calling Present) + NvU32 previousFrameAFRIndex; //!< [OUT] The AFR group index that was used for the previous frame (~0 if more than one frame has not been rendered yet) + NvU32 bIsCurAFRGroupNew; //!< [OUT] Boolean: Is this frame the first time running on the current AFR group + +} NV_GET_CURRENT_SLI_STATE; + +//! \ingroup dx +#define NV_GET_CURRENT_SLI_STATE_VER MAKE_NVAPI_VERSION(NV_GET_CURRENT_SLI_STATE,1) +#if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D_GetCurrentSLIState +// +//! DESCRIPTION: This function returns the current SLI state for the specified device. The structure +//! contains the number of AFR groups, the current AFR group index, +//! and what the AFR group index will be for the next frame. \p +//! pDevice can be either a IDirect3DDevice9 or ID3D10Device pointer. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 173 +//! +//! \retval NVAPI_OK Completed request +//! \retval NVAPI_ERROR Error occurred +//! +//! 
\ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D_GetCurrentSLIState(IUnknown *pDevice, NV_GET_CURRENT_SLI_STATE *pSliState); +#endif //if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) +#if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D_SetResourceHint +// +//! \fn NvAPI_D3D_SetResourceHint(IUnknown *pDev, NVDX_ObjectHandle obj, +//! NVAPI_D3D_SETRESOURCEHINT_CATEGORY dwHintCategory, +//! NvU32 dwHintName, +//! NvU32 *pdwHintValue) +//! +//! DESCRIPTION: This is a general purpose function for passing down various resource +//! related hints to the driver. Hints are divided into categories +//! and types within each category. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! \param [in] pDev The ID3D10Device or IDirect3DDevice9 that is a using the resource +//! \param [in] obj Previously obtained HV resource handle +//! \param [in] dwHintCategory Category of the hints +//! \param [in] dwHintName A hint within this category +//! \param [in] *pdwHintValue Pointer to location containing hint value +//! +//! \return an int which could be an NvAPI status or DX HRESULT code +//! +//! \retval ::NVAPI_OK +//! \retval ::NVAPI_INVALID_ARGUMENT +//! \retval ::NVAPI_INVALID_CALL It is illegal to change a hint dynamically when the resource is already bound. +// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + +//! \ingroup dx +//! Valid categories for NvAPI_D3D_SetResourceHint() +typedef enum _NVAPI_D3D_SETRESOURCEHINT_CATEGORY +{ + NVAPI_D3D_SRH_CATEGORY_SLI = 1 +} NVAPI_D3D_SETRESOURCEHINT_CATEGORY; + + +// +// NVAPI_D3D_SRH_SLI_APP_CONTROLLED_INTERFRAME_CONTENT_SYNC: +// NVAPI_D3D_SRH_SLI_ASK_FOR_BROADCAST_USING: + + +//! \ingroup dx +//! Types of SLI hints; \n +//! NVAPI_D3D_SRH_SLI_APP_CONTROLLED_INTERFRAME_CONTENT_SYNC: Valid values : 0 or 1 \n +//! Default value: 0 \n +//! Explanation: If the value is 1, the driver will not track any rendering operations that would mark this resource as dirty, +//! avoiding any form of synchronization across frames rendered in parallel in multiple GPUs in AFR mode. +//! +//! NVAPI_D3D_SRH_SLI_ASK_FOR_BROADCAST_USAGE: Valid values : 0 or 1 \n +//! Default value: 0 \n +//! Explanation: If the value is 1, the driver will try to perform operations which involved target resource in broadcast, +//! where its possible. Hint is static and must be set before resource starts using. +typedef enum _NVAPI_D3D_SETRESOURCEHINT_SLI +{ + NVAPI_D3D_SRH_SLI_APP_CONTROLLED_INTERFRAME_CONTENT_SYNC = 1, + NVAPI_D3D_SRH_SLI_ASK_FOR_BROADCAST_USAGE = 2 +} NVAPI_D3D_SETRESOURCEHINT_SLI; + +//! \ingroup dx +NVAPI_INTERFACE NvAPI_D3D_SetResourceHint(IUnknown *pDev, NVDX_ObjectHandle obj, + NVAPI_D3D_SETRESOURCEHINT_CATEGORY dwHintCategory, + NvU32 dwHintName, + NvU32 *pdwHintValue); +#endif //defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) +#if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D_BeginResourceRendering +// +//! \fn NvAPI_D3D_BeginResourceRendering(IUnknown *pDev, NVDX_ObjectHandle obj, NvU32 Flags) +//! DESCRIPTION: This function tells the driver that the resource will begin to receive updates. 
It must be used in combination with NvAPI_D3D_EndResourceRendering(). +//! The primary use of this function is allow the driver to initiate early inter-frame synchronization of resources while running in AFR SLI mode. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! \param [in] pDev The ID3D10Device or IDirect3DDevice9 that is a using the resource +//! \param [in] obj Previously obtained HV resource handle +//! \param [in] Flags The flags for functionality applied to resource while being used. +//! +//! \retval ::NVAPI_OK Function succeeded, if used properly and driver can initiate proper sync'ing of the resources. +//! \retval ::NVAPI_INVALID_ARGUMENT Bad argument(s) or invalid flag values +//! \retval ::NVAPI_INVALID_CALL Mismatched begin/end calls +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup dx +//! Used in NvAPI_D3D_BeginResourceRendering(). +typedef enum _NVAPI_D3D_RESOURCERENDERING_FLAG +{ + NVAPI_D3D_RR_FLAG_DEFAULTS = 0x00000000, //!< All bits set to 0 are defaults. + NVAPI_D3D_RR_FLAG_FORCE_DISCARD_CONTENT = 0x00000001, //!< (bit 0) The flag forces to discard previous content of the resource regardless of the NvApiHints_Sli_Disable_InterframeSync hint + NVAPI_D3D_RR_FLAG_FORCE_KEEP_CONTENT = 0x00000002, //!< (bit 1) The flag forces to respect previous content of the resource regardless of the NvApiHints_Sli_Disable_InterframeSync hint + NVAPI_D3D_RR_FLAG_MULTI_FRAME = 0x00000004 //!< (bit 2) The flag hints the driver that content will be used for many frames. If not specified then the driver assumes that content is used only on the next frame +} NVAPI_D3D_RESOURCERENDERING_FLAG; + +//! \ingroup dx +NVAPI_INTERFACE NvAPI_D3D_BeginResourceRendering(IUnknown *pDev, NVDX_ObjectHandle obj, NvU32 Flags); + +#endif //defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) +#if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_D3D_EndResourceRendering +// +//! DESCRIPTION: This function tells the driver that the resource is done receiving updates. It must be used in combination with +//! NvAPI_D3D_BeginResourceRendering(). +//! The primary use of this function is allow the driver to initiate early inter-frame syncs of resources while running in AFR SLI mode. +//! +//! SUPPORTED OS: Windows XP and higher +//! +//! +//! \since Release: 185 +//! +//! \param [in] pDev The ID3D10Device or IDirect3DDevice9 thatis a using the resource +//! \param [in] obj Previously obtained HV resource handle +//! \param [in] Flags Reserved, must be zero +// +//! \retval ::NVAPI_OK Function succeeded, if used properly and driver can initiate proper sync'ing of the resources. +//! \retval ::NVAPI_INVALID_ARGUMENT Bad argument(s) or invalid flag values +//! \retval ::NVAPI_INVALID_CALL Mismatched begin/end calls +//! +//! 
\ingroup dx +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_D3D_EndResourceRendering(IUnknown *pDev, NVDX_ObjectHandle obj, NvU32 Flags); +#endif //if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) + +#include"nvapi_lite_salend.h" +#ifdef __cplusplus +} +#endif +#pragma pack(pop) diff --git a/compat/nvapi/nvapi_lite_stereo.h b/compat/nvapi/nvapi_lite_stereo.h new file mode 100644 index 0000000000..6b733f69e4 --- /dev/null +++ b/compat/nvapi/nvapi_lite_stereo.h @@ -0,0 +1,600 @@ + /************************************************************************************************************************************\ +|* *| +|* Copyright © 2012 NVIDIA Corporation. All rights reserved. *| +|* *| +|* NOTICE TO USER: *| +|* *| +|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *| +|* *| +|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *| +|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *| +|* Otherwise, you have no rights to use or access this software in any manner. *| +|* *| +|* If not covered by the applicable NVIDIA software license agreement: *| +|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *| +|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *| +|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *| +|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. *| +|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *| +|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *| +|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *| +|* *| +|* U.S. Government End Users. *| +|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *| +|* consisting of "commercial computer software" and "commercial computer software documentation" *| +|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *| +|* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *| +|* all U.S. Government End Users acquire the software with only those rights set forth herein. *| +|* *| +|* Any use of this software in individual and commercial software must include, *| +|* in the user documentation and internal comments to the code, *| +|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *| +|* *| + \************************************************************************************************************************************/ + +#pragma once +#include"nvapi_lite_salstart.h" +#include"nvapi_lite_common.h" +#pragma pack(push,8) +#ifdef __cplusplus +extern "C" { +#endif +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_Enable +// +//! DESCRIPTION: This APU enables stereo mode in the registry. +//! Calls to this function affect the entire system. +//! If stereo is not enabled, then calls to functions that require that stereo is enabled have no effect, +//! and will return the appropriate error code. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! 
\since Release: 180 +//! +//! \retval ::NVAPI_OK Stereo is now enabled. +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_Enable(void); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_Disable +// +//! DESCRIPTION: This API disables stereo mode in the registry. +//! Calls to this function affect the entire system. +//! If stereo is not enabled, then calls to functions that require that stereo is enabled have no effect, +//! and will return the appropriate error code. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \retval ::NVAPI_OK Stereo is now disabled. +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_Disable(void); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_IsEnabled +// +//! DESCRIPTION: This API checks if stereo mode is enabled in the registry. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [out] pIsStereoEnabled Address where the result of the inquiry will be placed. +//! +//! \retval ::NVAPI_OK Check was sucessfully completed and result reflects current state of stereo availability. +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_IsEnabled(NvU8 *pIsStereoEnabled); +#if defined(_D3D9_H_) || defined(__d3d10_h__) || defined(__d3d11_h__) + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_CreateHandleFromIUnknown +// +//! DESCRIPTION: This API creates a stereo handle that is used in subsequent calls related to a given device interface. +//! This must be called before any other NvAPI_Stereo_ function for that handle. +//! Multiple devices can be used at one time using multiple calls to this function (one per each device). +//! +//! HOW TO USE: After the Direct3D device is created, create the stereo handle. +//! On call success: +//! -# Use all other NvAPI_Stereo_ functions that have stereo handle as first parameter. +//! -# After the device interface that corresponds to the the stereo handle is destroyed, +//! the application should call NvAPI_DestroyStereoHandle() for that stereo handle. +//! +//! WHEN TO USE: After the stereo handle for the device interface is created via successfull call to the appropriate NvAPI_Stereo_CreateHandleFrom() function. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [in] pDevice Pointer to IUnknown interface that is IDirect3DDevice9* in DX9, ID3D10Device*. +//! \param [out] pStereoHandle Pointer to the newly created stereo handle. +//! +//! \retval ::NVAPI_OK Stereo handle is created for given device interface. +//! \retval ::NVAPI_INVALID_ARGUMENT Provided device interface is invalid. +//! 
\retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_CreateHandleFromIUnknown(IUnknown *pDevice, StereoHandle *pStereoHandle); + +#endif // defined(_D3D9_H_) || defined(__d3d10_h__) +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_DestroyHandle +// +//! DESCRIPTION: This API destroys the stereo handle created with one of the NvAPI_Stereo_CreateHandleFrom() functions. +//! This should be called after the device corresponding to the handle has been destroyed. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [in] stereoHandle Stereo handle that is to be destroyed. +//! +//! \retval ::NVAPI_OK Stereo handle is destroyed. +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_DestroyHandle(StereoHandle stereoHandle); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_Activate +// +//! DESCRIPTION: This API activates stereo for the device interface corresponding to the given stereo handle. +//! Activating stereo is possible only if stereo was enabled previously in the registry. +//! If stereo is not activated, then calls to functions that require that stereo is activated have no effect, +//! and will return the appropriate error code. +//! +//! WHEN TO USE: After the stereo handle for the device interface is created via successfull call to the appropriate NvAPI_Stereo_CreateHandleFrom() function. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [in] stereoHandle Stereo handle corresponding to the device interface. +//! +//! \retval ::NVAPI_OK Stereo is turned on. +//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE Device interface is not valid. Create again, then attach again. +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_Activate(StereoHandle stereoHandle); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_Deactivate +// +//! DESCRIPTION: This API deactivates stereo for the given device interface. +//! If stereo is not activated, then calls to functions that require that stereo is activated have no effect, +//! and will return the appropriate error code. +//! +//! WHEN TO USE: After the stereo handle for the device interface is created via successfull call to the appropriate NvAPI_Stereo_CreateHandleFrom() function. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [in] stereoHandle Stereo handle that corresponds to the device interface. +//! +//! \retval ::NVAPI_OK Stereo is turned off. +//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE Device interface is not valid. Create again, then attach again. +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! 
\retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_Deactivate(StereoHandle stereoHandle); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_IsActivated +// +//! DESCRIPTION: This API checks if stereo is activated for the given device interface. +//! +//! WHEN TO USE: After the stereo handle for the device interface is created via successfull call to the appropriate NvAPI_Stereo_CreateHandleFrom() function. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [in] stereoHandle Stereo handle that corresponds to the device interface. +//! \param [in] pIsStereoOn Address where result of the inquiry will be placed. +//! +//! \retval ::NVAPI_OK - Check was sucessfully completed and result reflects current state of stereo (on/off). +//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE - Device interface is not valid. Create again, then attach again. +//! \retval ::NVAPI_API_NOT_INTIALIZED - NVAPI not initialized. +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED - Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR - Something is wrong (generic error). +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_IsActivated(StereoHandle stereoHandle, NvU8 *pIsStereoOn); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_GetSeparation +// +//! DESCRIPTION: This API gets current separation value (in percents). +//! +//! WHEN TO USE: After the stereo handle for the device interface is created via successfull call to the appropriate NvAPI_Stereo_CreateHandleFrom() function. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [in] stereoHandle Stereo handle that corresponds to the device interface. +//! \param [out] pSeparationPercentage Address of @c float type variable to store current separation percentage in. +//! +//! \retval ::NVAPI_OK Retrieval of separation percentage was successfull. +//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE Device interface is not valid. Create again, then attach again. +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_GetSeparation(StereoHandle stereoHandle, float *pSeparationPercentage); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_SetSeparation +// +//! DESCRIPTION: This API sets separation to given percentage. +//! +//! WHEN TO USE: After the stereo handle for the device interface is created via successfull call to appropriate NvAPI_Stereo_CreateHandleFrom() function. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [in] stereoHandle Stereo handle that corresponds to the device interface. +//! \param [in] newSeparationPercentage New value for separation percentage. +//! +//! \retval ::NVAPI_OK Setting of separation percentage was successfull. +//! 
\retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE Device interface is not valid. Create again, then attach again. +//! \retval ::NVAPI_API_NOT_INTIALIZED NVAPI not initialized. +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_STEREO_PARAMETER_OUT_OF_RANGE Given separation percentage is out of [0..100] range. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_SetSeparation(StereoHandle stereoHandle, float newSeparationPercentage); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_GetConvergence +// +//! DESCRIPTION: This API gets the current convergence value. +//! +//! WHEN TO USE: After the stereo handle for the device interface is created via successfull call to the appropriate NvAPI_Stereo_CreateHandleFrom() function. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [in] stereoHandle Stereo handle that corresponds to the device interface. +//! \param [out] pConvergence Address of @c float type variable to store current convergence value in. +//! +//! \retval ::NVAPI_OK Retrieval of convergence value was successfull. +//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE Device interface is not valid. Create again, then attach again. +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_GetConvergence(StereoHandle stereoHandle, float *pConvergence); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_SetConvergence +// +//! DESCRIPTION: This API sets convergence to the given value. +//! +//! WHEN TO USE: After the stereo handle for the device interface is created via successfull call to the appropriate NvAPI_Stereo_CreateHandleFrom() function. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \since Release: 180 +//! +//! \param [in] stereoHandle Stereo handle that corresponds to the device interface. +//! \param [in] newConvergence New value for convergence. +//! +//! \retval ::NVAPI_OK Setting of convergence value was successfull. +//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE Device interface is not valid. Create again, then attach again. +//! \retval ::NVAPI_API_NOT_INTIALIZED +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_SetConvergence(StereoHandle stereoHandle, float newConvergence); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_SetActiveEye +// +//! \fn NvAPI_Stereo_SetActiveEye(StereoHandle hStereoHandle, NV_STEREO_ACTIVE_EYE StereoEye); +//! DESCRIPTION: This API sets the back buffer to left or right in Direct stereo mode. +//! +//! HOW TO USE: After the stereo handle for device interface is created via successfull call to appropriate +//! NvAPI_Stereo_CreateHandleFrom function. +//! +//! \since Release: 285 +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! 
\param [in] stereoHandle Stereo handle that corresponds to the device interface. +//! \param [in] StereoEye Defines active eye in Direct stereo mode +//! +//! \retval ::NVAPI_OK - Active eye is set. +//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE - Device interface is not valid. Create again, then attach again. +//! \retval ::NVAPI_API_NOT_INTIALIZED - NVAPI not initialized. +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED - Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_INVALID_ARGUMENT - StereoEye parameter has not allowed value. +//! \retval ::NVAPI_SET_NOT_ALLOWED - Current stereo mode is not Direct +//! \retval ::NVAPI_ERROR - Something is wrong (generic error). +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup stereoapi +typedef enum _NV_StereoActiveEye +{ + NVAPI_STEREO_EYE_RIGHT = 1, + NVAPI_STEREO_EYE_LEFT = 2, + NVAPI_STEREO_EYE_MONO = 3, +} NV_STEREO_ACTIVE_EYE; + +//! \ingroup stereoapi +NVAPI_INTERFACE NvAPI_Stereo_SetActiveEye(StereoHandle hStereoHandle, NV_STEREO_ACTIVE_EYE StereoEye); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_SetDriverMode +// +//! \fn NvAPI_Stereo_SetDriverMode( NV_STEREO_DRIVER_MODE mode ); +//! DESCRIPTION: This API sets the 3D stereo driver mode: Direct or Automatic +//! +//! HOW TO USE: This API must be called before the device is created. +//! Applies to DirectX 9 and higher. +//! +//! \since Release: 285 +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] mode Defines the 3D stereo driver mode: Direct or Automatic +//! +//! \retval ::NVAPI_OK Active eye is set. +//! \retval ::NVAPI_API_NOT_INTIALIZED NVAPI not initialized. +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_INVALID_ARGUMENT mode parameter has not allowed value. +//! \retval ::NVAPI_ERROR Something is wrong (generic error). +// +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup stereoapi +typedef enum _NV_StereoDriverMode +{ + NVAPI_STEREO_DRIVER_MODE_AUTOMATIC = 0, + NVAPI_STEREO_DRIVER_MODE_DIRECT = 2, +} NV_STEREO_DRIVER_MODE; + +//! \ingroup stereoapi +NVAPI_INTERFACE NvAPI_Stereo_SetDriverMode( NV_STEREO_DRIVER_MODE mode ); + +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_GetEyeSeparation +// +//! DESCRIPTION: This API returns eye separation as a ratio of /. +//! +//! HOW TO USE: After the stereo handle for device interface is created via successfull call to appropriate API. Applies only to DirectX 9 and up. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [in] stereoHandle Stereo handle that corresponds to the device interface. +//! \param [out] pSeparation Eye separation. +//! +//! \retval ::NVAPI_OK Active eye is set. +//! \retval ::NVAPI_STEREO_INVALID_DEVICE_INTERFACE Device interface is not valid. Create again, then attach again. +//! \retval ::NVAPI_API_NOT_INTIALIZED NVAPI not initialized. +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR (generic error). +//! +//! 
\ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_GetEyeSeparation(StereoHandle hStereoHandle, float *pSeparation ); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_IsWindowedModeSupported +// +//! DESCRIPTION: This API returns availability of windowed mode stereo +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! \param [out] bSupported(OUT) != 0 - supported, \n +//! == 0 - is not supported +//! +//! +//! \retval ::NVAPI_OK Retrieval of frustum adjust mode was successfull. +//! \retval ::NVAPI_API_NOT_INTIALIZED NVAPI not initialized. +//! \retval ::NVAPI_STEREO_NOT_INITIALIZED Stereo part of NVAPI not initialized. +//! \retval ::NVAPI_ERROR Something is wrong (generic error). +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_IsWindowedModeSupported(NvU8* bSupported); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_SetSurfaceCreationMode +// +//! \function NvAPI_Stereo_SetSurfaceCreationMode(StereoHandle hStereoHandle, NVAPI_STEREO_SURFACECREATEMODE creationMode) +//! \param [in] hStereoHandle Stereo handle that corresponds to the device interface. +//! \param [in] creationMode New surface creation mode for this device interface. +//! +//! \since Release: 285 +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! DESCRIPTION: This API sets surface creation mode for this device interface. +//! +//! WHEN TO USE: After the stereo handle for device interface is created via successful call to appropriate NvAPI_Stereo_CreateHandleFrom function. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! There are no return error codes with specific meaning for this API. +//! +/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup stereoapi +typedef enum _NVAPI_STEREO_SURFACECREATEMODE +{ + NVAPI_STEREO_SURFACECREATEMODE_AUTO, //!< Use driver registry profile settings for surface creation mode. + NVAPI_STEREO_SURFACECREATEMODE_FORCESTEREO, //!< Always create stereo surfaces. + NVAPI_STEREO_SURFACECREATEMODE_FORCEMONO //!< Always create mono surfaces. +} NVAPI_STEREO_SURFACECREATEMODE; + +//! \ingroup stereoapi +NVAPI_INTERFACE NvAPI_Stereo_SetSurfaceCreationMode(__in StereoHandle hStereoHandle, __in NVAPI_STEREO_SURFACECREATEMODE creationMode); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_GetSurfaceCreationMode +// +//! \function NvAPI_Stereo_GetSurfaceCreationMode(StereoHandle hStereoHandle, NVAPI_STEREO_SURFACECREATEMODE* pCreationMode) +//! \param [in] hStereoHandle Stereo handle that corresponds to the device interface. +//! \param [out] pCreationMode The current creation mode for this device interface. +//! +//! \since Release: 295 +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! DESCRIPTION: This API gets surface creation mode for this device interface. +//! +//! WHEN TO USE: After the stereo handle for device interface is created via successful call to appropriate NvAPI_Stereo_CreateHandleFrom function. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! There are no return error codes with specific meaning for this API. +//! 
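Taken together, the stereo entry points declared above follow one lifecycle: enable stereo system-wide, create a StereoHandle from the application's Direct3D device, activate it, adjust separation and convergence through the handle, and destroy the handle when the device goes away. A hedged sketch of that sequence (illustration only, not part of the imported header); it assumes a D3D header was included before the NVAPI headers so the IUnknown-based calls are visible, that the library was already initialised via NvAPI_Initialize() from the full nvapi.h, and that pD3DDevice is a live IDirect3DDevice9* or ID3D10Device* owned by the application:

    #include <d3d9.h>                  /* any of d3d9/d3d10/d3d11 exposes these calls */
    #include "nvapi_lite_stereo.h"     /* header added by this patch */

    static void stereo_demo(IUnknown *pD3DDevice)
    {
        NvU8 enabled = 0;
        if (NvAPI_Stereo_IsEnabled(&enabled) != NVAPI_OK || !enabled)
            NvAPI_Stereo_Enable();                       /* system-wide registry switch */

        StereoHandle hStereo = NULL;
        if (NvAPI_Stereo_CreateHandleFromIUnknown(pD3DDevice, &hStereo) != NVAPI_OK)
            return;

        if (NvAPI_Stereo_Activate(hStereo) == NVAPI_OK) {
            float sep = 0.0f;
            NvAPI_Stereo_GetSeparation(hStereo, &sep);   /* current value, in percent */
            NvAPI_Stereo_SetSeparation(hStereo, 15.0f);  /* must stay within [0..100] */
            NvAPI_Stereo_SetConvergence(hStereo, 1.5f);  /* arbitrary example value */
            NvAPI_Stereo_Deactivate(hStereo);
        }

        NvAPI_Stereo_DestroyHandle(hStereo);             /* before the device is released */
    }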
+/////////////////////////////////////////////////////////////////////////////// + +//! \ingroup stereoapi +NVAPI_INTERFACE NvAPI_Stereo_GetSurfaceCreationMode(__in StereoHandle hStereoHandle, __in NVAPI_STEREO_SURFACECREATEMODE* pCreationMode); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_Debug_WasLastDrawStereoized +// +//! \param [in] hStereoHandle Stereo handle that corresponds to the device interface. +//! \param [out] pWasStereoized Address where result of the inquiry will be placed. +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! DESCRIPTION: This API checks if the last draw call was stereoized. It is a very expensive to call and should be used for debugging purpose *only*. +//! +//! WHEN TO USE: After the stereo handle for device interface is created via successful call to appropriate NvAPI_Stereo_CreateHandleFrom function. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! There are no return error codes with specific meaning for this API. +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_Debug_WasLastDrawStereoized(__in StereoHandle hStereoHandle, __out NvU8 *pWasStereoized); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_SetDefaultProfile +// +//! +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! DESCRIPTION: This API defines the stereo profile used by the driver in case the application has no associated profile. +//! +//! WHEN TO USE: To take effect, this API must be called before D3D device is created. Calling once a device has been created will not affect the current device. +//! +//! \param [in] szProfileName Default profile name. +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! Error codes specific to this API are described below. +//! +//! \retval NVAPI_SUCCESS - Default stereo profile name has been copied into szProfileName. +//! \retval NVAPI_INVALID_ARGUMENT - szProfileName == NULL. +//! \retval NVAPI_DEFAULT_STEREO_PROFILE_DOES_NOT_EXIST - Default stereo profile does not exist +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_SetDefaultProfile(__in const char* szProfileName); +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Stereo_GetDefaultProfile +// +//! SUPPORTED OS: Windows Vista and higher +//! +//! +//! DESCRIPTION: This API retrieves the current default stereo profile. +//! +//! After call cbSizeOut contain 0 if default profile is not set required buffer size cbSizeOut. +//! To get needed buffer size this function can be called with szProfileName==0 and cbSizeIn == 0. +//! +//! WHEN TO USE: This API can be called at any time. +//! +//! +//! \param [in] cbSizeIn Size of buffer allocated for default stereo profile name. +//! \param [out] szProfileName Default stereo profile name. +//! \param [out] pcbSizeOut Required buffer size. +//! # ==0 - there is no default stereo profile name currently set +//! # !=0 - size of buffer required for currently set default stereo profile name including trailing '0'. +//! +//! +//! \return This API can return any of the error codes enumerated in #NvAPI_Status. +//! Error codes specific to this API are described below. +//! +//! 
\retval NVAPI_SUCCESS - Default stereo profile name has been copied into szProfileName. +//! \retval NVAPI_DEFAULT_STEREO_PROFILE_IS_NOT_DEFINED - There is no default stereo profile set at this time. +//! \retval NVAPI_INVALID_ARGUMENT - pcbSizeOut == 0 or cbSizeIn >= *pcbSizeOut && szProfileName == 0 +//! \retval NVAPI_INSUFFICIENT_BUFFER - cbSizeIn < *pcbSizeOut +//! +//! \ingroup stereoapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Stereo_GetDefaultProfile( __in NvU32 cbSizeIn, __out_bcount_part_opt(cbSizeIn, *pcbSizeOut) char* szProfileName, __out NvU32 *pcbSizeOut); + +#include"nvapi_lite_salend.h" +#ifdef __cplusplus +} +#endif +#pragma pack(pop) diff --git a/compat/nvapi/nvapi_lite_surround.h b/compat/nvapi/nvapi_lite_surround.h new file mode 100644 index 0000000000..d7576d1dd7 --- /dev/null +++ b/compat/nvapi/nvapi_lite_surround.h @@ -0,0 +1,105 @@ + /************************************************************************************************************************************\ +|* *| +|* Copyright © 2012 NVIDIA Corporation. All rights reserved. *| +|* *| +|* NOTICE TO USER: *| +|* *| +|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *| +|* *| +|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *| +|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *| +|* Otherwise, you have no rights to use or access this software in any manner. *| +|* *| +|* If not covered by the applicable NVIDIA software license agreement: *| +|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *| +|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *| +|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *| +|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. *| +|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *| +|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *| +|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *| +|* *| +|* U.S. Government End Users. *| +|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *| +|* consisting of "commercial computer software" and "commercial computer software documentation" *| +|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *| +|* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *| +|* all U.S. Government End Users acquire the software with only those rights set forth herein. *| +|* *| +|* Any use of this software in individual and commercial software must include, *| +|* in the user documentation and internal comments to the code, *| +|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *| +|* *| + \************************************************************************************************************************************/ + +#pragma once +#include"nvapi_lite_salstart.h" +#include"nvapi_lite_common.h" +#pragma pack(push,8) +#ifdef __cplusplus +extern "C" { +#endif +//! SUPPORTED OS: Windows XP and higher +//! 
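NvAPI_Stereo_GetDefaultProfile(), imported just above, uses a two-call size negotiation: called with cbSizeIn == 0 and a NULL buffer it only reports the required length (0 meaning no default profile is set), and a second call with an adequately sized buffer retrieves the name. A sketch of that pattern (illustration only, not part of the imported header; assumes the NVAPI headers added by this patch are on the include path and the library is initialised):

    #include <stdlib.h>
    #include "nvapi_lite_stereo.h"

    static char *query_default_stereo_profile(void)
    {
        NvU32 needed = 0;

        /* 1st call: cbSizeIn == 0 and a NULL buffer only report the required size;
         * the header documents that needed == 0 means no default profile is set. */
        NvAPI_Stereo_GetDefaultProfile(0, NULL, &needed);
        if (needed == 0)
            return NULL;

        char *name = (char *) malloc(needed);    /* size already includes the '\0' */
        if (!name)
            return NULL;

        /* 2nd call: an adequately sized buffer retrieves the profile name. */
        if (NvAPI_Stereo_GetDefaultProfile(needed, name, &needed) != NVAPI_OK) {
            free(name);                          /* e.g. NVAPI_INSUFFICIENT_BUFFER */
            return NULL;
        }
        return name;                             /* caller frees */
    }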
+/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_DISP_GetGDIPrimaryDisplayId +// +//! DESCRIPTION: This API returns the Display ID of the GDI Primary. +//! +//! \param [out] displayId Display ID of the GDI Primary display. +//! +//! \retval ::NVAPI_OK: Capabilties have been returned. +//! \retval ::NVAPI_NVIDIA_DEVICE_NOT_FOUND: GDI Primary not on an NVIDIA GPU. +//! \retval ::NVAPI_INVALID_ARGUMENT: One or more args passed in are invalid. +//! \retval ::NVAPI_API_NOT_INTIALIZED: The NvAPI API needs to be initialized first +//! \retval ::NVAPI_NO_IMPLEMENTATION: This entrypoint not available +//! \retval ::NVAPI_ERROR: Miscellaneous error occurred +//! +//! \ingroup dispcontrol +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_DISP_GetGDIPrimaryDisplayId(NvU32* displayId); +#define NV_MOSAIC_MAX_DISPLAYS (64) +//! SUPPORTED OS: Windows Vista and higher +//! +/////////////////////////////////////////////////////////////////////////////// +// +// FUNCTION NAME: NvAPI_Mosaic_GetDisplayViewportsByResolution +// +//! DESCRIPTION: This API returns the viewports that would be applied on +//! the requested display. +//! +//! \param [in] displayId Display ID of a single display in the active +//! mosaic topology to query. +//! \param [in] srcWidth Width of full display topology. If both +//! width and height are 0, the current +//! resolution is used. +//! \param [in] srcHeight Height of full display topology. If both +//! width and height are 0, the current +//! resolution is used. +//! \param [out] viewports Array of NV_RECT viewports which represent +//! the displays as identified in +//! NvAPI_Mosaic_EnumGridTopologies. If the +//! requested resolution is a single-wide +//! resolution, only viewports[0] will +//! contain the viewport details, regardless +//! of which display is driving the display. +//! \param [out] bezelCorrected Returns 1 if the requested resolution is +//! bezel corrected. May be NULL. +//! +//! \retval ::NVAPI_OK Capabilties have been returned. +//! \retval ::NVAPI_INVALID_ARGUMENT One or more args passed in are invalid. +//! \retval ::NVAPI_API_NOT_INTIALIZED The NvAPI API needs to be initialized first +//! \retval ::NVAPI_MOSAIC_NOT_ACTIVE The display does not belong to an active Mosaic Topology +//! \retval ::NVAPI_NO_IMPLEMENTATION This entrypoint not available +//! \retval ::NVAPI_ERROR Miscellaneous error occurred +//! +//! \ingroup mosaicapi +/////////////////////////////////////////////////////////////////////////////// +NVAPI_INTERFACE NvAPI_Mosaic_GetDisplayViewportsByResolution(NvU32 displayId, NvU32 srcWidth, NvU32 srcHeight, NV_RECT viewports[NV_MOSAIC_MAX_DISPLAYS], NvU8* bezelCorrected); + +#include"nvapi_lite_salend.h" +#ifdef __cplusplus +} +#endif +#pragma pack(pop) diff --git a/compat/nvapi/x86/nvapi.lib b/compat/nvapi/x86/nvapi.lib new file mode 100644 index 0000000000..38d85961c9 Binary files /dev/null and b/compat/nvapi/x86/nvapi.lib differ diff --git a/compat/pthreads/pthread.h b/compat/pthreads/pthread.h new file mode 100644 index 0000000000..c8ee7780ae --- /dev/null +++ b/compat/pthreads/pthread.h @@ -0,0 +1,1376 @@ +/* This is an implementation of the threads API of POSIX 1003.1-2001. + * + * -------------------------------------------------------------------------- + * + * Pthreads-win32 - POSIX Threads Library for Win32 + * Copyright(C) 1998 John E. 
Bossom + * Copyright(C) 1999,2005 Pthreads-win32 contributors + * + * Contact Email: rpj@callisto.canberra.edu.au + * + * The current list of contributors is contained + * in the file CONTRIBUTORS included with the source + * code distribution. The list can also be seen at the + * following World Wide Web location: + * http://sources.redhat.com/pthreads-win32/contributors.html + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library in the file COPYING.LIB; + * if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +// undef for DLLs +#define PTW32_STATIC_LIB + + +#if !defined( PTHREAD_H ) +#define PTHREAD_H + +#if !defined(HAVE_STRUCT_TIMESPEC) && _MSC_VER >= 1900 +#define HAVE_STRUCT_TIMESPEC +#endif + +/* + * See the README file for an explanation of the pthreads-win32 version + * numbering scheme and how the DLL is named etc. + */ +#define PTW32_VERSION 2,9,1,0 +#define PTW32_VERSION_STRING "2, 9, 1, 0\0" + +/* There are three implementations of cancel cleanup. + * Note that pthread.h is included in both application + * compilation units and also internally for the library. + * The code here and within the library aims to work + * for all reasonable combinations of environments. + * + * The three implementations are: + * + * WIN32 SEH + * C + * C++ + * + * Please note that exiting a push/pop block via + * "return", "exit", "break", or "continue" will + * lead to different behaviour amongst applications + * depending upon whether the library was built + * using SEH, C++, or C. For example, a library built + * with SEH will call the cleanup routine, while both + * C++ and C built versions will not. + */ + +/* + * Define defaults for cleanup code. + * Note: Unless the build explicitly defines one of the following, then + * we default to standard C style cleanup. This style uses setjmp/longjmp + * in the cancelation and thread exit implementations and therefore won't + * do stack unwinding if linked to applications that have it (e.g. + * C++ apps). This is currently consistent with most/all commercial Unix + * POSIX threads implementations. + */ +#if !defined( __CLEANUP_SEH ) && !defined( __CLEANUP_CXX ) && !defined( __CLEANUP_C ) +# define __CLEANUP_C +#endif + +#if defined( __CLEANUP_SEH ) && ( !defined( _MSC_VER ) && !defined(PTW32_RC_MSC)) +#error ERROR [__FILE__, line __LINE__]: SEH is not supported for this compiler. +#endif + +/* + * Stop here if we are being included by the resource compiler. 
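The note above about the three cleanup implementations (SEH, C++, plain C) matters most around pthread_cleanup_push()/pthread_cleanup_pop(): the handler is only guaranteed to run when the block is left through cancellation or through pthread_cleanup_pop(1), so portable code should never return, break, or continue out from between the two macros, because that is exactly where the SEH, C++ and C builds diverge. A short sketch of the safe pattern (illustration only, not part of the imported header):

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static void unlock_handler(void *arg)
    {
        pthread_mutex_unlock((pthread_mutex_t *) arg);
    }

    static void *worker(void *arg)
    {
        (void) arg;
        pthread_mutex_lock(&lock);
        pthread_cleanup_push(unlock_handler, &lock);

        /* ... cancellable work while holding the lock ... */
        pthread_testcancel();

        pthread_cleanup_pop(1);   /* 1 => run unlock_handler here as well */
        return NULL;              /* leave only after the pop, never in between */
    }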
+ */ +#if !defined(RC_INVOKED) + +#undef PTW32_LEVEL + +#if defined(_POSIX_SOURCE) +#define PTW32_LEVEL 0 +/* Early POSIX */ +#endif + +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309 +#undef PTW32_LEVEL +#define PTW32_LEVEL 1 +/* Include 1b, 1c and 1d */ +#endif + +#if defined(INCLUDE_NP) +#undef PTW32_LEVEL +#define PTW32_LEVEL 2 +/* Include Non-Portable extensions */ +#endif + +#define PTW32_LEVEL_MAX 3 + +#if ( defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112 ) || !defined(PTW32_LEVEL) +#define PTW32_LEVEL PTW32_LEVEL_MAX +/* Include everything */ +#endif + +#if defined(_UWIN) +# define HAVE_STRUCT_TIMESPEC 1 +# define HAVE_SIGNAL_H 1 +# undef HAVE_PTW32_CONFIG_H +# pragma comment(lib, "pthread") +#endif + +/* + * ------------------------------------------------------------- + * + * + * Module: pthread.h + * + * Purpose: + * Provides an implementation of PThreads based upon the + * standard: + * + * POSIX 1003.1-2001 + * and + * The Single Unix Specification version 3 + * + * (these two are equivalent) + * + * in order to enhance code portability between Windows, + * various commercial Unix implementations, and Linux. + * + * See the ANNOUNCE file for a full list of conforming + * routines and defined constants, and a list of missing + * routines and constants not defined in this implementation. + * + * Authors: + * There have been many contributors to this library. + * The initial implementation was contributed by + * John Bossom, and several others have provided major + * sections or revisions of parts of the implementation. + * Often significant effort has been contributed to + * find and fix important bugs and other problems to + * improve the reliability of the library, which sometimes + * is not reflected in the amount of code which changed as + * result. + * As much as possible, the contributors are acknowledged + * in the ChangeLog file in the source code distribution + * where their changes are noted in detail. + * + * Contributors are listed in the CONTRIBUTORS file. + * + * As usual, all bouquets go to the contributors, and all + * brickbats go to the project maintainer. + * + * Maintainer: + * The code base for this project is coordinated and + * eventually pre-tested, packaged, and made available by + * + * Ross Johnson + * + * QA Testers: + * Ultimately, the library is tested in the real world by + * a host of competent and demanding scientists and + * engineers who report bugs and/or provide solutions + * which are then fixed or incorporated into subsequent + * versions of the library. Each time a bug is fixed, a + * test case is written to prove the fix and ensure + * that later changes to the code don't reintroduce the + * same error. The number of test cases is slowly growing + * and therefore so is the code reliability. + * + * Compliance: + * See the file ANNOUNCE for the list of implemented + * and not-implemented routines and defined options. + * Of course, these are all defined is this file as well. 
+ * + * Web site: + * The source code and other information about this library + * are available from + * + * http://sources.redhat.com/pthreads-win32/ + * + * ------------------------------------------------------------- + */ + +/* Try to avoid including windows.h */ +#if (defined(__MINGW64__) || defined(__MINGW32__)) && defined(__cplusplus) +#define PTW32_INCLUDE_WINDOWS_H +#endif + +#if defined(PTW32_INCLUDE_WINDOWS_H) +#include +#endif + +#if defined(_MSC_VER) && _MSC_VER < 1300 || defined(__DMC__) +/* + * VC++6.0 or early compiler's header has no DWORD_PTR type. + */ +typedef unsigned long DWORD_PTR; +typedef unsigned long ULONG_PTR; +#endif +/* + * ----------------- + * autoconf switches + * ----------------- + */ + +#if defined(HAVE_PTW32_CONFIG_H) +#include "config.h" +#endif /* HAVE_PTW32_CONFIG_H */ + +#if !defined(NEED_FTIME) +#include +#else /* NEED_FTIME */ +/* use native WIN32 time API */ +#endif /* NEED_FTIME */ + +#if defined(HAVE_SIGNAL_H) +#include +#endif /* HAVE_SIGNAL_H */ + +#include + +/* + * Boolean values to make us independent of system includes. + */ +enum { + PTW32_FALSE = 0, + PTW32_TRUE = (! PTW32_FALSE) +}; + +/* + * This is a duplicate of what is in the autoconf config.h, + * which is only used when building the pthread-win32 libraries. + */ + +#if !defined(PTW32_CONFIG_H) +# if defined(WINCE) +# define NEED_ERRNO +# define NEED_SEM +# endif +# if defined(__MINGW64__) +# define HAVE_STRUCT_TIMESPEC +# define HAVE_MODE_T +# elif defined(_UWIN) || defined(__MINGW32__) +# define HAVE_MODE_T +# endif +#endif + +/* + * + */ + +#if PTW32_LEVEL >= PTW32_LEVEL_MAX +#if defined(NEED_ERRNO) +#include "need_errno.h" +#else +#include +#endif +#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */ + +/* + * Several systems don't define some error numbers. + */ +#if !defined(ENOTSUP) +# define ENOTSUP 48 /* This is the value in Solaris. */ +#endif + +#if !defined(ETIMEDOUT) +# define ETIMEDOUT 10060 /* Same as WSAETIMEDOUT */ +#endif + +#if !defined(ENOSYS) +# define ENOSYS 140 /* Semi-arbitrary value */ +#endif + +#if !defined(EDEADLK) +# if defined(EDEADLOCK) +# define EDEADLK EDEADLOCK +# else +# define EDEADLK 36 /* This is the value in MSVC. */ +# endif +#endif + +/* POSIX 2008 - related to robust mutexes */ +#if !defined(EOWNERDEAD) +# define EOWNERDEAD 43 +#endif +#if !defined(ENOTRECOVERABLE) +# define ENOTRECOVERABLE 44 +#endif + +#include + +/* + * To avoid including windows.h we define only those things that we + * actually need from it. + */ +#if !defined(PTW32_INCLUDE_WINDOWS_H) +#if !defined(HANDLE) +# define PTW32__HANDLE_DEF +# define HANDLE void * +#endif +#if !defined(DWORD) +# define PTW32__DWORD_DEF +# define DWORD unsigned long +#endif +#endif + +#if !defined(HAVE_STRUCT_TIMESPEC) +#define HAVE_STRUCT_TIMESPEC +#if !defined(_TIMESPEC_DEFINED) +#define _TIMESPEC_DEFINED +struct timespec { + time_t tv_sec; + long tv_nsec; +}; +#endif /* _TIMESPEC_DEFINED */ +#endif /* HAVE_STRUCT_TIMESPEC */ + +#if !defined(SIG_BLOCK) +#define SIG_BLOCK 0 +#endif /* SIG_BLOCK */ + +#if !defined(SIG_UNBLOCK) +#define SIG_UNBLOCK 1 +#endif /* SIG_UNBLOCK */ + +#if !defined(SIG_SETMASK) +#define SIG_SETMASK 2 +#endif /* SIG_SETMASK */ + +#if defined(__cplusplus) +extern "C" +{ +#endif /* __cplusplus */ + +/* + * ------------------------------------------------------------- + * + * POSIX 1003.1-2001 Options + * ========================= + * + * Options are normally set in , which is not provided + * with pthreads-win32. 
+ * + * For conformance with the Single Unix Specification (version 3), all of the + * options below are defined, and have a value of either -1 (not supported) + * or 200112L (supported). + * + * These options can neither be left undefined nor have a value of 0, because + * either indicates that sysconf(), which is not implemented, may be used at + * runtime to check the status of the option. + * + * _POSIX_THREADS (== 200112L) + * If == 200112L, you can use threads + * + * _POSIX_THREAD_ATTR_STACKSIZE (== 200112L) + * If == 200112L, you can control the size of a thread's + * stack + * pthread_attr_getstacksize + * pthread_attr_setstacksize + * + * _POSIX_THREAD_ATTR_STACKADDR (== -1) + * If == 200112L, you can allocate and control a thread's + * stack. If not supported, the following functions + * will return ENOSYS, indicating they are not + * supported: + * pthread_attr_getstackaddr + * pthread_attr_setstackaddr + * + * _POSIX_THREAD_PRIORITY_SCHEDULING (== -1) + * If == 200112L, you can use realtime scheduling. + * This option indicates that the behaviour of some + * implemented functions conforms to the additional TPS + * requirements in the standard. E.g. rwlocks favour + * writers over readers when threads have equal priority. + * + * _POSIX_THREAD_PRIO_INHERIT (== -1) + * If == 200112L, you can create priority inheritance + * mutexes. + * pthread_mutexattr_getprotocol + + * pthread_mutexattr_setprotocol + + * + * _POSIX_THREAD_PRIO_PROTECT (== -1) + * If == 200112L, you can create priority ceiling mutexes + * Indicates the availability of: + * pthread_mutex_getprioceiling + * pthread_mutex_setprioceiling + * pthread_mutexattr_getprioceiling + * pthread_mutexattr_getprotocol + + * pthread_mutexattr_setprioceiling + * pthread_mutexattr_setprotocol + + * + * _POSIX_THREAD_PROCESS_SHARED (== -1) + * If set, you can create mutexes and condition + * variables that can be shared with another + * process.If set, indicates the availability + * of: + * pthread_mutexattr_getpshared + * pthread_mutexattr_setpshared + * pthread_condattr_getpshared + * pthread_condattr_setpshared + * + * _POSIX_THREAD_SAFE_FUNCTIONS (== 200112L) + * If == 200112L you can use the special *_r library + * functions that provide thread-safe behaviour + * + * _POSIX_READER_WRITER_LOCKS (== 200112L) + * If == 200112L, you can use read/write locks + * + * _POSIX_SPIN_LOCKS (== 200112L) + * If == 200112L, you can use spin locks + * + * _POSIX_BARRIERS (== 200112L) + * If == 200112L, you can use barriers + * + * + These functions provide both 'inherit' and/or + * 'protect' protocol, based upon these macro + * settings. + * + * ------------------------------------------------------------- + */ + +/* + * POSIX Options + */ +#undef _POSIX_THREADS +#define _POSIX_THREADS 200809L + +#undef _POSIX_READER_WRITER_LOCKS +#define _POSIX_READER_WRITER_LOCKS 200809L + +#undef _POSIX_SPIN_LOCKS +#define _POSIX_SPIN_LOCKS 200809L + +#undef _POSIX_BARRIERS +#define _POSIX_BARRIERS 200809L + +#undef _POSIX_THREAD_SAFE_FUNCTIONS +#define _POSIX_THREAD_SAFE_FUNCTIONS 200809L + +#undef _POSIX_THREAD_ATTR_STACKSIZE +#define _POSIX_THREAD_ATTR_STACKSIZE 200809L + +/* + * The following options are not supported + */ +#undef _POSIX_THREAD_ATTR_STACKADDR +#define _POSIX_THREAD_ATTR_STACKADDR -1 + +#undef _POSIX_THREAD_PRIO_INHERIT +#define _POSIX_THREAD_PRIO_INHERIT -1 + +#undef _POSIX_THREAD_PRIO_PROTECT +#define _POSIX_THREAD_PRIO_PROTECT -1 + +/* TPS is not fully supported. 
*/ +#undef _POSIX_THREAD_PRIORITY_SCHEDULING +#define _POSIX_THREAD_PRIORITY_SCHEDULING -1 + +#undef _POSIX_THREAD_PROCESS_SHARED +#define _POSIX_THREAD_PROCESS_SHARED -1 + + +/* + * POSIX 1003.1-2001 Limits + * =========================== + * + * These limits are normally set in , which is not provided with + * pthreads-win32. + * + * PTHREAD_DESTRUCTOR_ITERATIONS + * Maximum number of attempts to destroy + * a thread's thread-specific data on + * termination (must be at least 4) + * + * PTHREAD_KEYS_MAX + * Maximum number of thread-specific data keys + * available per process (must be at least 128) + * + * PTHREAD_STACK_MIN + * Minimum supported stack size for a thread + * + * PTHREAD_THREADS_MAX + * Maximum number of threads supported per + * process (must be at least 64). + * + * SEM_NSEMS_MAX + * The maximum number of semaphores a process can have. + * (must be at least 256) + * + * SEM_VALUE_MAX + * The maximum value a semaphore can have. + * (must be at least 32767) + * + */ +#undef _POSIX_THREAD_DESTRUCTOR_ITERATIONS +#define _POSIX_THREAD_DESTRUCTOR_ITERATIONS 4 + +#undef PTHREAD_DESTRUCTOR_ITERATIONS +#define PTHREAD_DESTRUCTOR_ITERATIONS _POSIX_THREAD_DESTRUCTOR_ITERATIONS + +#undef _POSIX_THREAD_KEYS_MAX +#define _POSIX_THREAD_KEYS_MAX 128 + +#undef PTHREAD_KEYS_MAX +#define PTHREAD_KEYS_MAX _POSIX_THREAD_KEYS_MAX + +#undef PTHREAD_STACK_MIN +#define PTHREAD_STACK_MIN 0 + +#undef _POSIX_THREAD_THREADS_MAX +#define _POSIX_THREAD_THREADS_MAX 64 + + /* Arbitrary value */ +#undef PTHREAD_THREADS_MAX +#define PTHREAD_THREADS_MAX 2019 + +#undef _POSIX_SEM_NSEMS_MAX +#define _POSIX_SEM_NSEMS_MAX 256 + + /* Arbitrary value */ +#undef SEM_NSEMS_MAX +#define SEM_NSEMS_MAX 1024 + +#undef _POSIX_SEM_VALUE_MAX +#define _POSIX_SEM_VALUE_MAX 32767 + +#undef SEM_VALUE_MAX +#define SEM_VALUE_MAX INT_MAX + + +#if defined(__GNUC__) && !defined(__declspec) +# error Please upgrade your GNU compiler to one that supports __declspec. +#endif + +/* + * When building the library, you should define PTW32_BUILD so that + * the variables/functions are exported correctly. When using the library, + * do NOT define PTW32_BUILD, and then the variables/functions will + * be imported correctly. + */ +#if !defined(PTW32_STATIC_LIB) +# if defined(PTW32_BUILD) +# define PTW32_DLLPORT __declspec (dllexport) +# else +# define PTW32_DLLPORT __declspec (dllimport) +# endif +#else +# define PTW32_DLLPORT +#endif + +/* + * The Open Watcom C/C++ compiler uses a non-standard calling convention + * that passes function args in registers unless __cdecl is explicitly specified + * in exposed function prototypes. + * + * We force all calls to cdecl even though this could slow Watcom code down + * slightly. If you know that the Watcom compiler will be used to build both + * the DLL and application, then you can probably define this as a null string. + * Remember that pthread.h (this file) is used for both the DLL and application builds. + */ +#define PTW32_CDECL __cdecl + +#if defined(_UWIN) && PTW32_LEVEL >= PTW32_LEVEL_MAX +# include +#else +/* + * Generic handle type - intended to extend uniqueness beyond + * that available with a simple pointer. It should scale for either + * IA-32 or IA-64. 
+ */ +typedef struct { + void * p; /* Pointer to actual object */ + unsigned int x; /* Extra information - reuse count etc */ +} ptw32_handle_t; + +typedef ptw32_handle_t pthread_t; +typedef struct pthread_attr_t_ * pthread_attr_t; +typedef struct pthread_once_t_ pthread_once_t; +typedef struct pthread_key_t_ * pthread_key_t; +typedef struct pthread_mutex_t_ * pthread_mutex_t; +typedef struct pthread_mutexattr_t_ * pthread_mutexattr_t; +typedef struct pthread_cond_t_ * pthread_cond_t; +typedef struct pthread_condattr_t_ * pthread_condattr_t; +#endif +typedef struct pthread_rwlock_t_ * pthread_rwlock_t; +typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t; +typedef struct pthread_spinlock_t_ * pthread_spinlock_t; +typedef struct pthread_barrier_t_ * pthread_barrier_t; +typedef struct pthread_barrierattr_t_ * pthread_barrierattr_t; + +/* + * ==================== + * ==================== + * POSIX Threads + * ==================== + * ==================== + */ + +enum { +/* + * pthread_attr_{get,set}detachstate + */ + PTHREAD_CREATE_JOINABLE = 0, /* Default */ + PTHREAD_CREATE_DETACHED = 1, + +/* + * pthread_attr_{get,set}inheritsched + */ + PTHREAD_INHERIT_SCHED = 0, + PTHREAD_EXPLICIT_SCHED = 1, /* Default */ + +/* + * pthread_{get,set}scope + */ + PTHREAD_SCOPE_PROCESS = 0, + PTHREAD_SCOPE_SYSTEM = 1, /* Default */ + +/* + * pthread_setcancelstate paramters + */ + PTHREAD_CANCEL_ENABLE = 0, /* Default */ + PTHREAD_CANCEL_DISABLE = 1, + +/* + * pthread_setcanceltype parameters + */ + PTHREAD_CANCEL_ASYNCHRONOUS = 0, + PTHREAD_CANCEL_DEFERRED = 1, /* Default */ + +/* + * pthread_mutexattr_{get,set}pshared + * pthread_condattr_{get,set}pshared + */ + PTHREAD_PROCESS_PRIVATE = 0, + PTHREAD_PROCESS_SHARED = 1, + +/* + * pthread_mutexattr_{get,set}robust + */ + PTHREAD_MUTEX_STALLED = 0, /* Default */ + PTHREAD_MUTEX_ROBUST = 1, + +/* + * pthread_barrier_wait + */ + PTHREAD_BARRIER_SERIAL_THREAD = -1 +}; + +/* + * ==================== + * ==================== + * Cancelation + * ==================== + * ==================== + */ +#define PTHREAD_CANCELED ((void *)(size_t) -1) + + +/* + * ==================== + * ==================== + * Once Key + * ==================== + * ==================== + */ +#define PTHREAD_ONCE_INIT { PTW32_FALSE, 0, 0, 0} + +struct pthread_once_t_ +{ + int done; /* indicates if user function has been executed */ + void * lock; + int reserved1; + int reserved2; +}; + + +/* + * ==================== + * ==================== + * Object initialisers + * ==================== + * ==================== + */ +#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -1) +#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -2) +#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -3) + +/* + * Compatibility with LinuxThreads + */ +#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP PTHREAD_RECURSIVE_MUTEX_INITIALIZER +#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP PTHREAD_ERRORCHECK_MUTEX_INITIALIZER + +#define PTHREAD_COND_INITIALIZER ((pthread_cond_t)(size_t) -1) + +#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1) + +#define PTHREAD_SPINLOCK_INITIALIZER ((pthread_spinlock_t)(size_t) -1) + + +/* + * Mutex types. 
+ */ +enum +{ + /* Compatibility with LinuxThreads */ + PTHREAD_MUTEX_FAST_NP, + PTHREAD_MUTEX_RECURSIVE_NP, + PTHREAD_MUTEX_ERRORCHECK_NP, + PTHREAD_MUTEX_TIMED_NP = PTHREAD_MUTEX_FAST_NP, + PTHREAD_MUTEX_ADAPTIVE_NP = PTHREAD_MUTEX_FAST_NP, + /* For compatibility with POSIX */ + PTHREAD_MUTEX_NORMAL = PTHREAD_MUTEX_FAST_NP, + PTHREAD_MUTEX_RECURSIVE = PTHREAD_MUTEX_RECURSIVE_NP, + PTHREAD_MUTEX_ERRORCHECK = PTHREAD_MUTEX_ERRORCHECK_NP, + PTHREAD_MUTEX_DEFAULT = PTHREAD_MUTEX_NORMAL +}; + + +typedef struct ptw32_cleanup_t ptw32_cleanup_t; + +#if defined(_MSC_VER) +/* Disable MSVC 'anachronism used' warning */ +#pragma warning( disable : 4229 ) +#endif + +typedef void (* PTW32_CDECL ptw32_cleanup_callback_t)(void *); + +#if defined(_MSC_VER) +#pragma warning( default : 4229 ) +#endif + +struct ptw32_cleanup_t +{ + ptw32_cleanup_callback_t routine; + void *arg; + struct ptw32_cleanup_t *prev; +}; + +#if defined(__CLEANUP_SEH) + /* + * WIN32 SEH version of cancel cleanup. + */ + +#define pthread_cleanup_push( _rout, _arg ) \ + { \ + ptw32_cleanup_t _cleanup; \ + \ + _cleanup.routine = (ptw32_cleanup_callback_t)(_rout); \ + _cleanup.arg = (_arg); \ + __try \ + { \ + +#define pthread_cleanup_pop( _execute ) \ + } \ + __finally \ + { \ + if( _execute || AbnormalTermination()) \ + { \ + (*(_cleanup.routine))( _cleanup.arg ); \ + } \ + } \ + } + +#else /* __CLEANUP_SEH */ + +#if defined(__CLEANUP_C) + + /* + * C implementation of PThreads cancel cleanup + */ + +#define pthread_cleanup_push( _rout, _arg ) \ + { \ + ptw32_cleanup_t _cleanup; \ + \ + ptw32_push_cleanup( &_cleanup, (ptw32_cleanup_callback_t) (_rout), (_arg) ); \ + +#define pthread_cleanup_pop( _execute ) \ + (void) ptw32_pop_cleanup( _execute ); \ + } + +#else /* __CLEANUP_C */ + +#if defined(__CLEANUP_CXX) + + /* + * C++ version of cancel cleanup. + * - John E. Bossom. + */ + + class PThreadCleanup { + /* + * PThreadCleanup + * + * Purpose + * This class is a C++ helper class that is + * used to implement pthread_cleanup_push/ + * pthread_cleanup_pop. + * The destructor of this class automatically + * pops the pushed cleanup routine regardless + * of how the code exits the scope + * (i.e. such as by an exception) + */ + ptw32_cleanup_callback_t cleanUpRout; + void * obj; + int executeIt; + + public: + PThreadCleanup() : + cleanUpRout( 0 ), + obj( 0 ), + executeIt( 0 ) + /* + * No cleanup performed + */ + { + } + + PThreadCleanup( + ptw32_cleanup_callback_t routine, + void * arg ) : + cleanUpRout( routine ), + obj( arg ), + executeIt( 1 ) + /* + * Registers a cleanup routine for 'arg' + */ + { + } + + ~PThreadCleanup() + { + if ( executeIt && ((void *) cleanUpRout != (void *) 0) ) + { + (void) (*cleanUpRout)( obj ); + } + } + + void execute( int exec ) + { + executeIt = exec; + } + }; + + /* + * C++ implementation of PThreads cancel cleanup; + * This implementation takes advantage of a helper + * class who's destructor automatically calls the + * cleanup routine if we exit our scope weirdly + */ +#define pthread_cleanup_push( _rout, _arg ) \ + { \ + PThreadCleanup cleanup((ptw32_cleanup_callback_t)(_rout), \ + (void *) (_arg) ); + +#define pthread_cleanup_pop( _execute ) \ + cleanup.execute( _execute ); \ + } + +#else + +#error ERROR [__FILE__, line __LINE__]: Cleanup type undefined. 
+ +#endif /* __CLEANUP_CXX */ + +#endif /* __CLEANUP_C */ + +#endif /* __CLEANUP_SEH */ + +/* + * =============== + * =============== + * Methods + * =============== + * =============== + */ + +/* + * PThread Attribute Functions + */ +PTW32_DLLPORT int PTW32_CDECL pthread_attr_init (pthread_attr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_destroy (pthread_attr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_getdetachstate (const pthread_attr_t * attr, + int *detachstate); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_getstackaddr (const pthread_attr_t * attr, + void **stackaddr); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_getstacksize (const pthread_attr_t * attr, + size_t * stacksize); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_setdetachstate (pthread_attr_t * attr, + int detachstate); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_setstackaddr (pthread_attr_t * attr, + void *stackaddr); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_setstacksize (pthread_attr_t * attr, + size_t stacksize); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_getschedparam (const pthread_attr_t *attr, + struct sched_param *param); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_setschedparam (pthread_attr_t *attr, + const struct sched_param *param); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_setschedpolicy (pthread_attr_t *, + int); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_getschedpolicy (const pthread_attr_t *, + int *); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_setinheritsched(pthread_attr_t * attr, + int inheritsched); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_getinheritsched(const pthread_attr_t * attr, + int * inheritsched); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_setscope (pthread_attr_t *, + int); + +PTW32_DLLPORT int PTW32_CDECL pthread_attr_getscope (const pthread_attr_t *, + int *); + +/* + * PThread Functions + */ +PTW32_DLLPORT int PTW32_CDECL pthread_create (pthread_t * tid, + const pthread_attr_t * attr, + void *(PTW32_CDECL *start) (void *), + void *arg); + +PTW32_DLLPORT int PTW32_CDECL pthread_detach (pthread_t tid); + +PTW32_DLLPORT int PTW32_CDECL pthread_equal (pthread_t t1, + pthread_t t2); + +PTW32_DLLPORT void PTW32_CDECL pthread_exit (void *value_ptr); + +PTW32_DLLPORT int PTW32_CDECL pthread_join (pthread_t thread, + void **value_ptr); + +PTW32_DLLPORT pthread_t PTW32_CDECL pthread_self (void); + +PTW32_DLLPORT int PTW32_CDECL pthread_cancel (pthread_t thread); + +PTW32_DLLPORT int PTW32_CDECL pthread_setcancelstate (int state, + int *oldstate); + +PTW32_DLLPORT int PTW32_CDECL pthread_setcanceltype (int type, + int *oldtype); + +PTW32_DLLPORT void PTW32_CDECL pthread_testcancel (void); + +PTW32_DLLPORT int PTW32_CDECL pthread_once (pthread_once_t * once_control, + void (PTW32_CDECL *init_routine) (void)); + +#if PTW32_LEVEL >= PTW32_LEVEL_MAX +PTW32_DLLPORT ptw32_cleanup_t * PTW32_CDECL ptw32_pop_cleanup (int execute); + +PTW32_DLLPORT void PTW32_CDECL ptw32_push_cleanup (ptw32_cleanup_t * cleanup, + ptw32_cleanup_callback_t routine, + void *arg); +#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */ + +/* + * Thread Specific Data Functions + */ +PTW32_DLLPORT int PTW32_CDECL pthread_key_create (pthread_key_t * key, + void (PTW32_CDECL *destructor) (void *)); + +PTW32_DLLPORT int PTW32_CDECL pthread_key_delete (pthread_key_t key); + +PTW32_DLLPORT int PTW32_CDECL pthread_setspecific (pthread_key_t key, + const void *value); + +PTW32_DLLPORT void * PTW32_CDECL pthread_getspecific (pthread_key_t key); + + +/* + * Mutex Attribute Functions + */ 
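A minimal usage sketch of the mutex-type constants and attribute routines declared in this header may help readers unfamiliar with pthreads-win32 (illustrative only, not part of the patch; the names g_lock and init_recursive_lock are hypothetical, and the code assumes an application linking against this build):

    #include <pthread.h>

    /* Sketch: set up a mutex that the owning thread may re-lock. */
    static pthread_mutex_t g_lock;

    static void init_recursive_lock(void)
    {
        pthread_mutexattr_t attr;
        pthread_mutexattr_init(&attr);
        /* PTHREAD_MUTEX_RECURSIVE is an alias of PTHREAD_MUTEX_RECURSIVE_NP
           in the enum declared above. */
        pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
        pthread_mutex_init(&g_lock, &attr);
        pthread_mutexattr_destroy(&attr);
    }

The prototypes that follow declare these attribute routines.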
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_init (pthread_mutexattr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_destroy (pthread_mutexattr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getpshared (const pthread_mutexattr_t + * attr, + int *pshared); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setpshared (pthread_mutexattr_t * attr, + int pshared); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_settype (pthread_mutexattr_t * attr, int kind); +PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_gettype (const pthread_mutexattr_t * attr, int *kind); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setrobust( + pthread_mutexattr_t *attr, + int robust); +PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getrobust( + const pthread_mutexattr_t * attr, + int * robust); + +/* + * Barrier Attribute Functions + */ +PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_init (pthread_barrierattr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_destroy (pthread_barrierattr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_getpshared (const pthread_barrierattr_t + * attr, + int *pshared); + +PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_setpshared (pthread_barrierattr_t * attr, + int pshared); + +/* + * Mutex Functions + */ +PTW32_DLLPORT int PTW32_CDECL pthread_mutex_init (pthread_mutex_t * mutex, + const pthread_mutexattr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutex_destroy (pthread_mutex_t * mutex); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutex_lock (pthread_mutex_t * mutex); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutex_timedlock(pthread_mutex_t * mutex, + const struct timespec *abstime); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutex_trylock (pthread_mutex_t * mutex); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutex_unlock (pthread_mutex_t * mutex); + +PTW32_DLLPORT int PTW32_CDECL pthread_mutex_consistent (pthread_mutex_t * mutex); + +/* + * Spinlock Functions + */ +PTW32_DLLPORT int PTW32_CDECL pthread_spin_init (pthread_spinlock_t * lock, int pshared); + +PTW32_DLLPORT int PTW32_CDECL pthread_spin_destroy (pthread_spinlock_t * lock); + +PTW32_DLLPORT int PTW32_CDECL pthread_spin_lock (pthread_spinlock_t * lock); + +PTW32_DLLPORT int PTW32_CDECL pthread_spin_trylock (pthread_spinlock_t * lock); + +PTW32_DLLPORT int PTW32_CDECL pthread_spin_unlock (pthread_spinlock_t * lock); + +/* + * Barrier Functions + */ +PTW32_DLLPORT int PTW32_CDECL pthread_barrier_init (pthread_barrier_t * barrier, + const pthread_barrierattr_t * attr, + unsigned int count); + +PTW32_DLLPORT int PTW32_CDECL pthread_barrier_destroy (pthread_barrier_t * barrier); + +PTW32_DLLPORT int PTW32_CDECL pthread_barrier_wait (pthread_barrier_t * barrier); + +/* + * Condition Variable Attribute Functions + */ +PTW32_DLLPORT int PTW32_CDECL pthread_condattr_init (pthread_condattr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_condattr_destroy (pthread_condattr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_condattr_getpshared (const pthread_condattr_t * attr, + int *pshared); + +PTW32_DLLPORT int PTW32_CDECL pthread_condattr_setpshared (pthread_condattr_t * attr, + int pshared); + +/* + * Condition Variable Functions + */ +PTW32_DLLPORT int PTW32_CDECL pthread_cond_init (pthread_cond_t * cond, + const pthread_condattr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_cond_destroy (pthread_cond_t * cond); + +PTW32_DLLPORT int PTW32_CDECL pthread_cond_wait (pthread_cond_t * cond, + pthread_mutex_t * mutex); + +PTW32_DLLPORT int PTW32_CDECL 
pthread_cond_timedwait (pthread_cond_t * cond, + pthread_mutex_t * mutex, + const struct timespec *abstime); + +PTW32_DLLPORT int PTW32_CDECL pthread_cond_signal (pthread_cond_t * cond); + +PTW32_DLLPORT int PTW32_CDECL pthread_cond_broadcast (pthread_cond_t * cond); + +/* + * Scheduling + */ +PTW32_DLLPORT int PTW32_CDECL pthread_setschedparam (pthread_t thread, + int policy, + const struct sched_param *param); + +PTW32_DLLPORT int PTW32_CDECL pthread_getschedparam (pthread_t thread, + int *policy, + struct sched_param *param); + +PTW32_DLLPORT int PTW32_CDECL pthread_setconcurrency (int); + +PTW32_DLLPORT int PTW32_CDECL pthread_getconcurrency (void); + +/* + * Read-Write Lock Functions + */ +PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_init(pthread_rwlock_t *lock, + const pthread_rwlockattr_t *attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_destroy(pthread_rwlock_t *lock); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_tryrdlock(pthread_rwlock_t *); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_trywrlock(pthread_rwlock_t *); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_rdlock(pthread_rwlock_t *lock); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_timedrdlock(pthread_rwlock_t *lock, + const struct timespec *abstime); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_wrlock(pthread_rwlock_t *lock); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_timedwrlock(pthread_rwlock_t *lock, + const struct timespec *abstime); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_unlock(pthread_rwlock_t *lock); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_init (pthread_rwlockattr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_destroy (pthread_rwlockattr_t * attr); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_getpshared (const pthread_rwlockattr_t * attr, + int *pshared); + +PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_setpshared (pthread_rwlockattr_t * attr, + int pshared); + +#if PTW32_LEVEL >= PTW32_LEVEL_MAX - 1 + +/* + * Signal Functions. Should be defined in but MSVC and MinGW32 + * already have signal.h that don't define these. + */ +PTW32_DLLPORT int PTW32_CDECL pthread_kill(pthread_t thread, int sig); + +/* + * Non-portable functions + */ + +/* + * Compatibility with Linux. + */ +PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setkind_np(pthread_mutexattr_t * attr, + int kind); +PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getkind_np(pthread_mutexattr_t * attr, + int *kind); + +/* + * Possibly supported by other POSIX threads implementations + */ +PTW32_DLLPORT int PTW32_CDECL pthread_delay_np (struct timespec * interval); +PTW32_DLLPORT int PTW32_CDECL pthread_num_processors_np(void); +PTW32_DLLPORT unsigned __int64 PTW32_CDECL pthread_getunique_np(pthread_t thread); + +/* + * Useful if an application wants to statically link + * the lib rather than load the DLL at run-time. + */ +PTW32_DLLPORT int PTW32_CDECL pthread_win32_process_attach_np(void); +PTW32_DLLPORT int PTW32_CDECL pthread_win32_process_detach_np(void); +PTW32_DLLPORT int PTW32_CDECL pthread_win32_thread_attach_np(void); +PTW32_DLLPORT int PTW32_CDECL pthread_win32_thread_detach_np(void); + +/* + * Features that are auto-detected at load/run time. + */ +PTW32_DLLPORT int PTW32_CDECL pthread_win32_test_features_np(int); +enum ptw32_features { + PTW32_SYSTEM_INTERLOCKED_COMPARE_EXCHANGE = 0x0001, /* System provides it. */ + PTW32_ALERTABLE_ASYNC_CANCEL = 0x0002 /* Can cancel blocked threads. */ +}; + +/* + * Register a system time change with the library. 
+ * Causes the library to perform various functions + * in response to the change. Should be called whenever + * the application's top level window receives a + * WM_TIMECHANGE message. It can be passed directly to + * pthread_create() as a new thread if desired. + */ +PTW32_DLLPORT void * PTW32_CDECL pthread_timechange_handler_np(void *); + +#endif /*PTW32_LEVEL >= PTW32_LEVEL_MAX - 1 */ + +#if PTW32_LEVEL >= PTW32_LEVEL_MAX + +/* + * Returns the Win32 HANDLE for the POSIX thread. + */ +PTW32_DLLPORT HANDLE PTW32_CDECL pthread_getw32threadhandle_np(pthread_t thread); +/* + * Returns the win32 thread ID for POSIX thread. + */ +PTW32_DLLPORT DWORD PTW32_CDECL pthread_getw32threadid_np (pthread_t thread); + + +/* + * Protected Methods + * + * This function blocks until the given WIN32 handle + * is signaled or pthread_cancel had been called. + * This function allows the caller to hook into the + * PThreads cancel mechanism. It is implemented using + * + * WaitForMultipleObjects + * + * on 'waitHandle' and a manually reset WIN32 Event + * used to implement pthread_cancel. The 'timeout' + * argument to TimedWait is simply passed to + * WaitForMultipleObjects. + */ +PTW32_DLLPORT int PTW32_CDECL pthreadCancelableWait (HANDLE waitHandle); +PTW32_DLLPORT int PTW32_CDECL pthreadCancelableTimedWait (HANDLE waitHandle, + DWORD timeout); + +#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */ + +/* + * Thread-Safe C Runtime Library Mappings. + */ +#if !defined(_UWIN) +# if defined(NEED_ERRNO) + PTW32_DLLPORT int * PTW32_CDECL _errno( void ); +# else +# if !defined(errno) +# if (defined(_MT) || defined(_DLL)) + __declspec(dllimport) extern int * __cdecl _errno(void); +# define errno (*_errno()) +# endif +# endif +# endif +#endif + +/* + * Some compiler environments don't define some things. + */ +#if defined(__BORLANDC__) +# define _ftime ftime +# define _timeb timeb +#endif + +#if defined(__cplusplus) + +/* + * Internal exceptions + */ +class ptw32_exception {}; +class ptw32_exception_cancel : public ptw32_exception {}; +class ptw32_exception_exit : public ptw32_exception {}; + +#endif + +#if PTW32_LEVEL >= PTW32_LEVEL_MAX + +/* FIXME: This is only required if the library was built using SEH */ +/* + * Get internal SEH tag + */ +PTW32_DLLPORT DWORD PTW32_CDECL ptw32_get_exception_services_code(void); + +#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */ + +#if !defined(PTW32_BUILD) + +#if defined(__CLEANUP_SEH) + +/* + * Redefine the SEH __except keyword to ensure that applications + * propagate our internal exceptions up to the library's internal handlers. + */ +#define __except( E ) \ + __except( ( GetExceptionCode() == ptw32_get_exception_services_code() ) \ + ? EXCEPTION_CONTINUE_SEARCH : ( E ) ) + +#endif /* __CLEANUP_SEH */ + +#if defined(__CLEANUP_CXX) + +/* + * Redefine the C++ catch keyword to ensure that applications + * propagate our internal exceptions up to the library's internal handlers. + */ +#if defined(_MSC_VER) + /* + * WARNING: Replace any 'catch( ... )' with 'PtW32CatchAll' + * if you want Pthread-Win32 cancelation and pthread_exit to work. + */ + +#if !defined(PtW32NoCatchWarn) + +#pragma message("Specify \"/DPtW32NoCatchWarn\" compiler flag to skip this message.") +#pragma message("------------------------------------------------------------------") +#pragma message("When compiling applications with MSVC++ and C++ exception handling:") +#pragma message(" Replace any 'catch( ... 
)' in routines called from POSIX threads") +#pragma message(" with 'PtW32CatchAll' or 'CATCHALL' if you want POSIX thread") +#pragma message(" cancelation and pthread_exit to work. For example:") +#pragma message("") +#pragma message(" #if defined(PtW32CatchAll)") +#pragma message(" PtW32CatchAll") +#pragma message(" #else") +#pragma message(" catch(...)") +#pragma message(" #endif") +#pragma message(" {") +#pragma message(" /* Catchall block processing */") +#pragma message(" }") +#pragma message("------------------------------------------------------------------") + +#endif + +#define PtW32CatchAll \ + catch( ptw32_exception & ) { throw; } \ + catch( ... ) + +#else /* _MSC_VER */ + +#define catch( E ) \ + catch( ptw32_exception & ) { throw; } \ + catch( E ) + +#endif /* _MSC_VER */ + +#endif /* __CLEANUP_CXX */ + +#endif /* ! PTW32_BUILD */ + +#if defined(__cplusplus) +} /* End of extern "C" */ +#endif /* __cplusplus */ + +#if defined(PTW32__HANDLE_DEF) +# undef HANDLE +#endif +#if defined(PTW32__DWORD_DEF) +# undef DWORD +#endif + +#undef PTW32_LEVEL +#undef PTW32_LEVEL_MAX + +#endif /* ! RC_INVOKED */ + +#endif /* PTHREAD_H */ diff --git a/compat/pthreads/pthread_barrier.hpp b/compat/pthreads/pthread_barrier.hpp new file mode 100644 index 0000000000..b0c3172ef9 --- /dev/null +++ b/compat/pthreads/pthread_barrier.hpp @@ -0,0 +1,69 @@ +/** + * Meant to improve clang 4 / macos compatibility (untested) + */ + +#ifndef PTHREAD_BARRIER_H_ +#define PTHREAD_BARRIER_H_ + +#include +#include + +typedef int pthread_barrierattr_t; +typedef struct +{ + pthread_mutex_t mutex; + pthread_cond_t cond; + int count; + int tripCount; +} pthread_barrier_t; + + +int pthread_barrier_init(pthread_barrier_t *barrier, const pthread_barrierattr_t *attr, unsigned int count) +{ + if(count == 0) + { + errno = EINVAL; + return -1; + } + if(pthread_mutex_init(&barrier->mutex, 0) < 0) + { + return -1; + } + if(pthread_cond_init(&barrier->cond, 0) < 0) + { + pthread_mutex_destroy(&barrier->mutex); + return -1; + } + barrier->tripCount = count; + barrier->count = 0; + + return 0; +} + +int pthread_barrier_destroy(pthread_barrier_t *barrier) +{ + pthread_cond_destroy(&barrier->cond); + pthread_mutex_destroy(&barrier->mutex); + return 0; +} + +int pthread_barrier_wait(pthread_barrier_t *barrier) +{ + pthread_mutex_lock(&barrier->mutex); + ++(barrier->count); + if(barrier->count >= barrier->tripCount) + { + barrier->count = 0; + pthread_cond_broadcast(&barrier->cond); + pthread_mutex_unlock(&barrier->mutex); + return 1; + } + else + { + pthread_cond_wait(&barrier->cond, &(barrier->mutex)); + pthread_mutex_unlock(&barrier->mutex); + return 0; + } +} + +#endif // PTHREAD_BARRIER_H_ diff --git a/compat/pthreads/sched.h b/compat/pthreads/sched.h new file mode 100644 index 0000000000..f36a97a66b --- /dev/null +++ b/compat/pthreads/sched.h @@ -0,0 +1,183 @@ +/* + * Module: sched.h + * + * Purpose: + * Provides an implementation of POSIX realtime extensions + * as defined in + * + * POSIX 1003.1b-1993 (POSIX.1b) + * + * -------------------------------------------------------------------------- + * + * Pthreads-win32 - POSIX Threads Library for Win32 + * Copyright(C) 1998 John E. Bossom + * Copyright(C) 1999,2005 Pthreads-win32 contributors + * + * Contact Email: rpj@callisto.canberra.edu.au + * + * The current list of contributors is contained + * in the file CONTRIBUTORS included with the source + * code distribution. 
The list can also be seen at the + * following World Wide Web location: + * http://sources.redhat.com/pthreads-win32/contributors.html + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library in the file COPYING.LIB; + * if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ +#if !defined(_SCHED_H) +#define _SCHED_H + +#undef PTW32_SCHED_LEVEL + +#if defined(_POSIX_SOURCE) +#define PTW32_SCHED_LEVEL 0 +/* Early POSIX */ +#endif + +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309 +#undef PTW32_SCHED_LEVEL +#define PTW32_SCHED_LEVEL 1 +/* Include 1b, 1c and 1d */ +#endif + +#if defined(INCLUDE_NP) +#undef PTW32_SCHED_LEVEL +#define PTW32_SCHED_LEVEL 2 +/* Include Non-Portable extensions */ +#endif + +#define PTW32_SCHED_LEVEL_MAX 3 + +#if ( defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112 ) || !defined(PTW32_SCHED_LEVEL) +#define PTW32_SCHED_LEVEL PTW32_SCHED_LEVEL_MAX +/* Include everything */ +#endif + + +#if defined(__GNUC__) && !defined(__declspec) +# error Please upgrade your GNU compiler to one that supports __declspec. +#endif + +/* + * When building the library, you should define PTW32_BUILD so that + * the variables/functions are exported correctly. When using the library, + * do NOT define PTW32_BUILD, and then the variables/functions will + * be imported correctly. + */ +#if !defined(PTW32_STATIC_LIB) +# if defined(PTW32_BUILD) +# define PTW32_DLLPORT __declspec (dllexport) +# else +# define PTW32_DLLPORT __declspec (dllimport) +# endif +#else +# define PTW32_DLLPORT +#endif + +/* + * This is a duplicate of what is in the autoconf config.h, + * which is only used when building the pthread-win32 libraries. 
+ */ + +#if !defined(PTW32_CONFIG_H) +# if defined(WINCE) +# define NEED_ERRNO +# define NEED_SEM +# endif +# if defined(__MINGW64__) +# define HAVE_STRUCT_TIMESPEC +# define HAVE_MODE_T +# elif defined(_UWIN) || defined(__MINGW32__) +# define HAVE_MODE_T +# endif +#endif + +/* + * + */ + +#if PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX +#if defined(NEED_ERRNO) +#include "need_errno.h" +#else +#include +#endif +#endif /* PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX */ + +#if (defined(__MINGW64__) || defined(__MINGW32__)) || defined(_UWIN) +# if PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX +/* For pid_t */ +# include +/* Required by Unix 98 */ +# include +# else + typedef int pid_t; +# endif +#else + typedef int pid_t; +#endif + +/* Thread scheduling policies */ + +enum { + SCHED_OTHER = 0, + SCHED_FIFO, + SCHED_RR, + SCHED_MIN = SCHED_OTHER, + SCHED_MAX = SCHED_RR +}; + +struct sched_param { + int sched_priority; +}; + +#if defined(__cplusplus) +extern "C" +{ +#endif /* __cplusplus */ + +PTW32_DLLPORT int __cdecl sched_yield (void); + +PTW32_DLLPORT int __cdecl sched_get_priority_min (int policy); + +PTW32_DLLPORT int __cdecl sched_get_priority_max (int policy); + +PTW32_DLLPORT int __cdecl sched_setscheduler (pid_t pid, int policy); + +PTW32_DLLPORT int __cdecl sched_getscheduler (pid_t pid); + +/* + * Note that this macro returns ENOTSUP rather than + * ENOSYS as might be expected. However, returning ENOSYS + * should mean that sched_get_priority_{min,max} are + * not implemented as well as sched_rr_get_interval. + * This is not the case, since we just don't support + * round-robin scheduling. Therefore I have chosen to + * return the same value as sched_setscheduler when + * SCHED_RR is passed to it. + */ +#define sched_rr_get_interval(_pid, _interval) \ + ( errno = ENOTSUP, (int) -1 ) + + +#if defined(__cplusplus) +} /* End of extern "C" */ +#endif /* __cplusplus */ + +#undef PTW32_SCHED_LEVEL +#undef PTW32_SCHED_LEVEL_MAX + +#endif /* !_SCHED_H */ + diff --git a/compat/pthreads/x64/pthreadVC2.lib b/compat/pthreads/x64/pthreadVC2.lib new file mode 100644 index 0000000000..3e26e4a066 Binary files /dev/null and b/compat/pthreads/x64/pthreadVC2.lib differ diff --git a/compat/pthreads/x86/pthreadVC2.lib b/compat/pthreads/x86/pthreadVC2.lib new file mode 100644 index 0000000000..5880478f96 Binary files /dev/null and b/compat/pthreads/x86/pthreadVC2.lib differ diff --git a/compat/thrust/CHANGELOG b/compat/thrust/CHANGELOG deleted file mode 100644 index 110c6689c2..0000000000 --- a/compat/thrust/CHANGELOG +++ /dev/null @@ -1,662 +0,0 @@ -####################################### -# Thrust v1.7.0 # -####################################### - -Summary - Thrust 1.7.0 introduces a new interface for controlling algorithm execution as - well as several new algorithms and performance improvements. With this new - interface, users may directly control how algorithms execute as well as details - such as the allocation of temporary storage. Key/value versions of thrust::merge - and the set operation algorithms have been added, as well stencil versions of - partitioning algorithms. thrust::tabulate has been introduced to tabulate the - values of functions taking integers. For 32b types, new CUDA merge and set - operations provide 2-15x faster performance while a new CUDA comparison sort - provides 1.3-4x faster performance. Finally, a new TBB reduce_by_key implementation - provides 80% faster performance. 
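For context on the execution-policy interface described in the summary above, usage looks roughly like the following (a sketch only, assuming Thrust 1.7+ with a CUDA toolchain; it is not part of this changelog or of the patch):

    #include <cuda_runtime.h>
    #include <thrust/execution_policy.h>
    #include <thrust/sort.h>

    int main()
    {
        int *raw = 0;
        cudaMalloc((void**)&raw, 1024 * sizeof(int));
        /* ... fill the buffer on the device ... */
        /* Passing thrust::device dispatches straight to the CUDA backend,
           with no thrust::device_ptr wrapper around the raw pointer. */
        thrust::sort(thrust::device, raw, raw + 1024);
        cudaFree(raw);
        return 0;
    }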
- -Breaking API Changes - Dispatch - Custom user backend systems' tag types must now inherit from the corresponding system's execution_policy template (e.g. thrust::cuda::execution_policy) instead - of the tag struct (e.g. thrust::cuda::tag). Otherwise, algorithm specializations will silently go unfound during dispatch. - See examples/minimal_custom_backend.cu and examples/cuda/fallback_allocator.cu for usage examples. - - thrust::advance and thrust::distance are no longer dispatched based on iterator system type and thus may no longer be customized. - - Iterators - iterator_facade and iterator_adaptor's Pointer template parameters have been eliminated. - iterator_adaptor has been moved into the thrust namespace (previously thrust::experimental::iterator_adaptor). - iterator_facade has been moved into the thrust namespace (previously thrust::experimental::iterator_facade). - iterator_core_access has been moved into the thrust namespace (previously thrust::experimental::iterator_core_access). - All iterators' nested pointer typedef (the type of the result of operator->) is now void instead of a pointer type to indicate that such expressions are currently impossible. - Floating point counting_iterators' nested difference_type typedef is now a signed integral type instead of a floating point type. - - Other - normal_distribution has been moved into the thrust::random namespace (previously thrust::random::experimental::normal_distribution). - Placeholder expressions may no longer include the comma operator. - -New Features - Execution Policies - Users may directly control the dispatch of algorithm invocations with optional execution policy arguments. - For example, instead of wrapping raw pointers allocated by cudaMalloc with thrust::device_ptr, the thrust::device execution_policy may be passed as an argument to an algorithm invocation to enable CUDA execution. - The following execution policies are supported in this version: - - thrust::host - thrust::device - thrust::cpp::par - thrust::cuda::par - thrust::omp::par - thrust::tbb::par - - Algorithms - free - get_temporary_buffer - malloc - merge_by_key - partition with stencil - partition_copy with stencil - return_temporary_buffer - set_difference_by_key - set_intersection_by_key - set_symmetric_difference_by_key - set_union_by_key - stable_partition with stencil - stable_partition_copy with stencil - tabulate - -New Examples - uninitialized_vector demonstrates how to use a custom allocator to avoid the automatic initialization of elements in thrust::device_vector. - -Other Enhancements - Authors of custom backend systems may manipulate arbitrary state during algorithm dispatch by incorporating it into their execution_policy parameter. - Users may control the allocation of temporary storage during algorithm execution by passing standard allocators as parameters via execution policies such as thrust::device. - THRUST_DEVICE_SYSTEM_CPP has been added as a compile-time target for the device backend. - CUDA merge performance is 2-15x faster. - CUDA comparison sort performance is 1.3-4x faster. - CUDA set operation performance is 1.5-15x faster. - TBB reduce_by_key performance is 80% faster. - Several algorithms have been parallelized with TBB. - Support for user allocators in vectors has been improved. - The sparse_vector example is now implemented with merge_by_key instead of sort_by_key. - Warnings have been eliminated in various contexts. 
- Warnings about __host__ or __device__-only functions called from __host__ __device__ functions have been eliminated in various contexts. - Documentation about algorithm requirements have been improved. - Simplified the minimal_custom_backend example. - Simplified the cuda/custom_temporary_allocation example. - Simplified the cuda/fallback_allocator example. - -Bug Fixes - #248 fix broken counting_iterator behavior with OpenMP - #231, #209 fix set operation failures with CUDA - #187 fix incorrect occupancy calculation with CUDA - #153 fix broken multigpu behavior with CUDA - #142 eliminate warning produced by thrust::random::taus88 and MSVC 2010 - #208 correctly initialize elements in temporary storage when necessary - #16 fix compilation error when sorting bool with CUDA - #10 fix ambiguous overloads of reinterpret_tag - -Known Issues - g++ versions 4.3 and lower may fail to dispatch thrust::get_temporary_buffer correctly causing infinite recursion in examples such as cuda/custom_temporary_allocation. - -Acknowledgments - Thanks to Sean Baxter, Bryan Catanzaro, and Manjunath Kudlur for contributing a faster merge implementation for CUDA. - Thanks to Sean Baxter for contributing a faster set operation implementation for CUDA. - Thanks to Cliff Woolley for contributing a correct occupancy calculation algorithm. - -####################################### -# Thrust v1.6.0 # -####################################### - -Summary - Thrust v1.6.0 provides an interface for customization and extension and a new - backend system based on the Threading Building Blocks library. With this - new interface, programmers may customize the behavior of specific algorithms - as well as control the allocation of temporary storage or invent entirely new - backends. These enhancements also allow multiple different backend systems - such as CUDA and OpenMP to coexist within a single program. Support for TBB - allows Thrust programs to integrate more naturally into applications which - may already employ the TBB task scheduler. 
- -Breaking API Changes - The header has been moved to - thrust::experimental::cuda::pinned_allocator has been moved to thrust::cuda::experimental::pinned_allocator - The macro THRUST_DEVICE_BACKEND has been renamed THRUST_DEVICE_SYSTEM - The macro THRUST_DEVICE_BACKEND_CUDA has been renamed THRUST_DEVICE_SYSTEM_CUDA - The macro THRUST_DEVICE_BACKEND_OMP has been renamed THRUST_DEVICE_SYSTEM_OMP - thrust::host_space_tag has been renamed thrust::host_system_tag - thrust::device_space_tag has been renamed thrust::device_system_tag - thrust::any_space_tag has been renamed thrust::any_system_tag - thrust::iterator_space has been renamed thrust::iterator_system - - -New Features - Backend Systems - Threading Building Blocks (TBB) is now supported - Functions - for_each_n - raw_reference_cast - Types - pointer - reference - -New Examples - cuda/custom_temporary_allocation - cuda/fallback_allocator - device_ptr - expand - minimal_custom_backend - raw_reference_cast - set_operations - -Other Enhancements - thrust::for_each now returns the end of the input range similar to most other algorithms - thrust::pair and thrust::tuple have swap functionality - all CUDA algorithms now support large data types - iterators may be dereferenced in user __device__ or __global__ functions - the safe use of different backend systems is now possible within a single binary - -Bug Fixes - #469 min_element and max_element algorithms no longer require a const comparison operator - -Known Issues - cudafe++.exe may crash when parsing TBB headers on Windows. - -####################################### -# Thrust v1.5.3 # -####################################### - -Summary - Small bug fixes - -Bug Fixes - Avoid warnings about potential race due to __shared__ non-POD variable - -####################################### -# Thrust v1.5.2 # -####################################### - -Summary - Small bug fixes - -Bug Fixes - Fixed warning about C-style initialization of structures - -####################################### -# Thrust v1.5.1 # -####################################### - -Summary - Small bug fixes - -Bug Fixes - Sorting data referenced by permutation_iterators on CUDA produces invalid results - -####################################### -# Thrust v1.5.0 # -####################################### - -Summary - Thrust v1.5.0 provides introduces new programmer productivity and performance - enhancements. New functionality for creating anonymous "lambda" functions has - been added. A faster host sort provides 2-10x faster performance for sorting - arithmetic types on (single-threaded) CPUs. A new OpenMP sort provides - 2.5x-3.0x speedup over the host sort using a quad-core CPU. When sorting - arithmetic types with the OpenMP backend the combined performance improvement - is 5.9x for 32-bit integers and ranges from 3.0x (64-bit types) to 14.2x - (8-bit types). A new CUDA reduce_by_key implementation provides 2-3x faster - performance. - -Breaking API Changes - device_ptr no longer unsafely converts to device_ptr without an - explicit cast. Use the expression - device_pointer_cast(static_cast(void_ptr.get())) - to convert, for example, device_ptr to device_ptr. 
- -New Features - Functions - stencil-less transform_if - - Types - lambda placeholders - -New Examples - lambda - -Other Enhancements - host sort is 2-10x faster for arithmetic types - OMP sort provides speedup over host sort - reduce_by_key is 2-3x faster - reduce_by_key no longer requires O(N) temporary storage - CUDA scan algorithms are 10-40% faster - host_vector and device_vector are now documented - out-of-memory exceptions now provide detailed information from CUDART - improved histogram example - device_reference now has a specialized swap - reduce_by_key and scan algorithms are compatible with discard_iterator - -Removed Functionality - -Bug Fixes - #44 allow host_vector to compile when value_type uses __align__ - #198 allow adjacent_difference to permit safe in-situ operation - #303 make thrust thread-safe - #313 avoid race conditions in device_vector::insert - #314 avoid unintended adl invocation when dispatching copy - #365 fix merge and set operation failures - -Known Issues - None - -Acknowledgments - Thanks to Manjunath Kudlur for contributing his Carbon library, from which the lambda functionality is derived. - Thanks to Jean-Francois Bastien for suggesting a fix for issue 303. - -####################################### -# Thrust v1.4.0 # -####################################### - -Summary - Thrust v1.4.0 provides support for CUDA 4.0 in addition to many feature - and performance improvements. New set theoretic algorithms operating on - sorted sequences have been added. Additionally, a new fancy iterator - allows discarding redundant or otherwise unnecessary output from - algorithms, conserving memory storage and bandwidth. - -Breaking API Changes - Eliminations - thrust/is_sorted.h - thrust/utility.h - thrust/set_intersection.h - thrust/experimental/cuda/ogl_interop_allocator.h and the functionality therein - thrust::deprecated::copy_when - thrust::deprecated::absolute_value - -New Features - Functions - copy_n - merge - set_difference - set_symmetric_difference - set_union - - Types - discard_iterator - - Device support - Compute Capability 2.1 GPUs - -New Examples - run_length_decoding - -Other Enhancements - Compilation warnings are substantially reduced in various contexts. - The compilation time of thrust::sort, thrust::stable_sort, thrust::sort_by_key, - and thrust::stable_sort_by_key are substantially reduced. - A fast sort implementation is used when sorting primitive types with thrust::greater. - The performance of thrust::set_intersection is improved. - The performance of thrust::fill is improved on SM 1.x devices. - A code example is now provided in each algorithm's documentation. - thrust::reverse now operates in-place - -Removed Functionality - thrust::deprecated::copy_when - thrust::deprecated::absolute_value - thrust::experimental::cuda::ogl_interop_allocator - thrust::gather and thrust::scatter from host to device and vice versa are no longer supported. - Operations which modify the elements of a thrust::device_vector are no longer - available from source code compiled without nvcc when the device backend is CUDA. - Instead, use the idiom from the cpp_interop example. - -Bug Fixes - #212 set_intersection works correctly for large input sizes. 
- #275 counting_iterator and constant_iterator work correctly with OpenMP as the - backend when compiling with optimization - #256 min and max correctly return their first argument as a tie-breaker - #248 NDEBUG is interpreted correctly - -Known Issues - nvcc may generate code containing warnings when compiling some Thrust algorithms. - When compiling with -arch=sm_1x, some Thrust algorithms may cause nvcc to issue - benign pointer advisories. - When compiling with -arch=sm_1x and -G, some Thrust algorithms may fail to execute correctly. - thrust::inclusive_scan, thrust::exclusive_scan, thrust::inclusive_scan_by_key, - and thrust::exclusive_scan_by_key are currently incompatible with thrust::discard_iterator. - -Acknowledgments - Thanks to David Tarjan for improving the performance of set_intersection. - Thanks to Duane Merrill for continued help with sort. - Thanks to Nathan Whitehead for help with CUDA Toolkit integration. - -####################################### -# Thrust v1.3.0 # -####################################### - -Summary - Thrust v1.3.0 provides support for CUDA 3.2 in addition to many feature - and performance enhancements. - - Performance of the sort and sort_by_key algorithms is improved by as much - as 3x in certain situations. The performance of stream compaction algorithms, - such as copy_if, is improved by as much as 2x. Reduction performance is - also improved, particularly for small input sizes. - - CUDA errors are now converted to runtime exceptions using the system_error - interface. Combined with a debug mode, also new in v1.3, runtime errors - can be located with greater precision. - - Lastly, a few header files have been consolidated or renamed for clarity. - See the deprecations section below for additional details. - - -Breaking API Changes - Promotions - thrust::experimental::inclusive_segmented_scan has been renamed thrust::inclusive_scan_by_key and exposes a different interface - thrust::experimental::exclusive_segmented_scan has been renamed thrust::exclusive_scan_by_key and exposes a different interface - thrust::experimental::partition_copy has been renamed thrust::partition_copy and exposes a different interface - thrust::next::gather has been renamed thrust::gather - thrust::next::gather_if has been renamed thrust::gather_if - thrust::unique_copy_by_key has been renamed thrust::unique_by_key_copy - Deprecations - thrust::copy_when has been renamed thrust::deprecated::copy_when - thrust::absolute_value has been renamed thrust::deprecated::absolute_value - The header thrust/set_intersection.h is now deprecated; use thrust/set_operations.h instead - The header thrust/utility.h is now deprecated; use thrust/swap.h instead - The header thrust/swap_ranges.h is now deprecated; use thrust/swap.h instead - Eliminations - thrust::deprecated::gather - thrust::deprecated::gather_if - thrust/experimental/arch.h and the functions therein - thrust/sorting/merge_sort.h - thrust/sorting/radix_sort.h - -New Features - Functions - exclusive_scan_by_key - find - find_if - find_if_not - inclusive_scan_by_key - is_partitioned - is_sorted_until - mismatch - partition_point - reverse - reverse_copy - stable_partition_copy - - Types - system_error and related types - experimental::cuda::ogl_interop_allocator - bit_and, bit_or, and bit_xor - - Device support - gf104-based GPUs - -New Examples - opengl_interop.cu - repeated_range.cu - simple_moving_average.cu - sparse_vector.cu - strided_range.cu - -Other Enhancements - Performance of thrust::sort and thrust::sort_by_key is 
substantially improved for primitive key types - Performance of thrust::copy_if is substantially improved - Performance of thrust::reduce and related reductions is improved - THRUST_DEBUG mode added - Callers of Thrust functions may detect error conditions by catching thrust::system_error, which derives from std::runtime_error - The number of compiler warnings generated by Thrust has been substantially reduced - Comparison sort now works correctly for input sizes > 32M - min & max usage no longer collides with definitions - Compiling against the OpenMP backend no longer requires nvcc - Performance of device_vector initialized in .cpp files is substantially improved in common cases - Performance of thrust::sort_by_key on the host is substantially improved - -Removed Functionality - nvcc 2.3 is no longer supported - -Bug Fixes - Debug device code now compiles correctly - thrust::uninitialized_copy and thrust::unintialized_fill now dispatch constructors on the device rather than the host - -Known Issues - #212 set_intersection is known to fail for large input sizes - partition_point is known to fail for 64b types with nvcc 3.2 - -Acknowledgments - Thanks to Duane Merrill for contributing a fast CUDA radix sort implementation - Thanks to Erich Elsen for contributing an implementation of find_if - Thanks to Andrew Corrigan for contributing changes which allow the OpenMP backend to compile in the absence of nvcc - Thanks to Andrew Corrigan, Cliff Wooley, David Coeurjolly, Janick Martinez Esturo, John Bowers, Maxim Naumov, Michael Garland, and Ryuta Suzuki for bug reports - Thanks to Cliff Woolley for help with testing - -####################################### -# Thrust v1.2.1 # -####################################### - -Summary - Small fixes for compatibility with CUDA 3.1 - -Known Issues - inclusive_scan & exclusive_scan may fail with very large types - the Microsoft compiler may fail to compile code using both sort and binary search algorithms - uninitialized_fill & uninitialized_copy dispatch constructors on the host rather than the device - # 109 some algorithms may exhibit poor performance with the OpenMP backend with large numbers (>= 6) of CPU threads - default_random_engine::discard is not accelerated with nvcc 2.3 - nvcc 3.1 may fail to compile code using types derived from thrust::subtract_with_carry_engine, such as thrust::ranlux24 & thrust::ranlux48. - -####################################### -# Thrust v1.2.0 # -####################################### - -Summary - Thrust v1.2 introduces support for compilation to multicore CPUs - and the Ocelot virtual machine, and several new facilities for - pseudo-random number generation. New algorithms such as set - intersection and segmented reduction have also been added. Lastly, - improvements to the robustness of the CUDA backend ensure - correctness across a broad set of (uncommon) use cases. - -Breaking API Changes - thrust::gather's interface was incorrect and has been removed. - The old interface is deprecated but will be preserved for Thrust - version 1.2 at thrust::deprecated::gather & - thrust::deprecated::gather_if. The new interface is provided at - thrust::next::gather & thrust::next::gather_if. The new interface - will be promoted to thrust:: in Thrust version 1.3. 
For more details, - please refer to this thread: - http://groups.google.com/group/thrust-users/browse_thread/thread/f5f0583cb97b51fd - - The thrust::sorting namespace has been deprecated in favor of the - top-level sorting functions, such as thrust::sort() and - thrust::sort_by_key(). - -New Features - Functions - reduce_by_key - set_intersection - tie - unique_copy - unique_by_key - unique_copy_by_key - - Types - Random Number Generation - discard_block_engine - default_random_engine - linear_congruential_engine - linear_feedback_shift_engine - minstd_rand - minstd_rand0 - normal_distribution (experimental) - ranlux24 - ranlux48 - ranlux24_base - ranlux48_base - subtract_with_carry_engine - taus88 - uniform_int_distribution - uniform_real_distribution - xor_combine_engine - Functionals - project1st - project2nd - - Fancy Iterators - permutation_iterator - reverse_iterator - - Device support - Add support for multicore CPUs via OpenMP - Add support for Fermi-class GPUs - Add support for Ocelot virtual machine - -New Examples - cpp_integration - histogram - mode - monte_carlo - monte_carlo_disjoint_sequences - padded_grid_reduction - permutation_iterator - row_sum - run_length_encoding - segmented_scan - stream_compaction - summary_statistics - transform_iterator - word_count - -Other Enhancements - vector functions operator!=, rbegin, crbegin, rend, crend, data, & shrink_to_fit - integer sorting performance is improved when max is large but (max - min) is small and when min is negative - performance of inclusive_scan() and exclusive_scan() is improved by 20-25% for primitive types - support for nvcc 3.0 - -Removed Functionality - removed support for equal between host & device sequences - removed support for gather() and scatter() between host & device sequences - -Bug Fixes - # 8 cause a compiler error if the required compiler is not found rather than a mysterious error at link time - # 42 device_ptr & device_reference are classes rather than structs, eliminating warnings on certain platforms - # 46 gather & scatter handle any space iterators correctly - # 51 thrust::experimental::arch functions gracefully handle unrecognized GPUs - # 52 avoid collisions with common user macros such as BLOCK_SIZE - # 62 provide better documentation for device_reference - # 68 allow built-in CUDA vector types to work with device_vector in pure C++ mode - # 102 eliminated a race condition in device_vector::erase - various compilation warnings eliminated - -Known Issues - inclusive_scan & exclusive_scan may fail with very large types - the Microsoft compiler may fail to compile code using both sort and binary search algorithms - uninitialized_fill & uninitialized_copy dispatch constructors on the host rather than the device - # 109 some algorithms may exhibit poor performance with the OpenMP backend with large numbers (>= 6) of CPU threads - default_random_engine::discard is not accelerated with nvcc 2.3 - -Acknowledgments - Thanks to Gregory Diamos for contributing a CUDA implementation of set_intersection - Thanks to Ryuta Suzuki & Gregory Diamos for rigorously testing Thrust's unit tests and examples against Ocelot - Thanks to Tom Bradley for contributing an implementation of normal_distribution - Thanks to Joseph Rhoads for contributing the example summary_statistics - -####################################### -# Thrust v1.1.1 # -####################################### - -Summary - Small fixes for compatibility with CUDA 2.3a and Mac OSX Snow Leopard. 
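The v1.2.0 notes above list reduce_by_key among the new functions backing segmented reduction. A minimal sketch of a typical call follows; the key and value data are illustrative rather than taken from the release notes, and the default predicate (equal_to) and reduction (plus) are assumed.

    #include <thrust/device_vector.h>
    #include <thrust/reduce.h>

    int main()
    {
        // consecutive equal keys delimit a segment
        int h_keys[7] = {1, 1, 2, 2, 2, 3, 3};
        int h_vals[7] = {1, 2, 3, 4, 5, 6, 7};

        thrust::device_vector<int> keys(h_keys, h_keys + 7);
        thrust::device_vector<int> vals(h_vals, h_vals + 7);
        thrust::device_vector<int> keys_out(7);
        thrust::device_vector<int> vals_out(7);

        // sums each run of equal keys:
        // keys_out -> [1, 2, 3], vals_out -> [3, 12, 13]
        thrust::reduce_by_key(keys.begin(), keys.end(), vals.begin(),
                              keys_out.begin(), vals_out.begin());
        return 0;
    }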
- -####################################### -# Thrust v1.1.0 # -####################################### - -Summary - Thrust v1.1 introduces fancy iterators, binary search functions, and - several specialized reduction functions. Experimental support for - segmented scan has also been added. - -Breaking API Changes - counting_iterator has been moved into the thrust namespace (previously thrust::experimental) - -New Features - Functions - copy_if - lower_bound - upper_bound - vectorized lower_bound - vectorized upper_bound - equal_range - binary_search - vectorized binary_search - all_of - any_of - none_of - minmax_element - advance - inclusive_segmented_scan (experimental) - exclusive_segmented_scan (experimental) - - Types - pair - tuple - device_malloc_allocator - - Fancy Iterators - constant_iterator - counting_iterator - transform_iterator - zip_iterator - -New Examples - computing the maximum absolute difference between vectors - computing the bounding box of a two-dimensional point set - sorting multiple arrays together (lexicographical sorting) - constructing a summed area table - using zip_iterator to mimic an array of structs - using constant_iterator to increment array values - -Other Enhancements - added pinned memory allocator (experimental) - added more methods to host_vector & device_vector (issue #4) - added variant of remove_if with a stencil argument (issue #29) - scan and reduce use cudaFuncGetAttributes to determine grid size - exceptions are reported when temporary device arrays cannot be allocated - -Bug Fixes - #5 make vector work for larger data types - #9 stable_partition_copy doesn't respect OutputIterator concept semantics - #10 scans should return OutputIterator - #16 make algorithms work for larger data types - #27 dispatch radix_sort even when comp=less is explicitly provided - -Known Issues - Using functors with Thrust entry points may not compile on Mac OSX with gcc-4.0.1 - uninitialized_copy & uninitialized_fill dispatch constructors on the host rather than the device. - inclusive_scan, inclusive_scan_by_key, exclusive_scan, and exclusive_scan_by_key may fail when used with large types with the CUDA 3.1 driver - - -####################################### -# Thrust v1.0.0 # -####################################### - -Breaking API changes - Rename top level namespace komrade to thrust. - Move partition_copy() & stable_partition_copy() into thrust::experimental namespace until we can easily provide the standard interface. - Rename range() to sequence() to avoid collision with Boost.Range. - Rename copy_if() to copy_when() due to semantic differences with C++0x copy_if(). - -New Features - Add C++0x style cbegin() & cend() methods to host_vector & device_vector. - Add transform_if function. - Add stencil versions of replace_if() & replace_copy_if(). - Allow counting_iterator to work with for_each(). - Allow types with constructors in comparison sort & reduce. - -Other Enhancements - merge_sort and stable_merge_sort are now 2 to 5x faster when executed on the parallel device. - -Bug fixes - Workaround an issue where an incremented iterator causes nvcc to crash. (Komrade issue #6) - Fix an issue where const_iterators could not be passed to transform. 
(Komrade issue #7) - diff --git a/compat/thrust/adjacent_difference.h b/compat/thrust/adjacent_difference.h deleted file mode 100644 index 772b5f993f..0000000000 --- a/compat/thrust/adjacent_difference.h +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file adjacent_difference.h - * \brief Compute difference between consecutive elements of a range - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup transformations Transformations - * \{ - */ - - -/*! \p adjacent_difference calculates the differences of adjacent elements in the - * range [first, last). That is, \*first is assigned to - * \*result, and, for each iterator \p i in the range - * [first + 1, last), the difference of \*i and *(i - 1) - * is assigned to \*(result + (i - first)). - * - * This version of \p adjacent_difference uses operator- to calculate - * differences. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input range. - * \param last The end of the input range. - * \param result The beginning of the output range. - * \return The iterator result + (last - first) - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \c x and \c y are objects of \p InputIterator's \c value_type, then \c x - \c is defined, - * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types, - * and the return type of x - y is convertible to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \remark Note that \p result is permitted to be the same iterator as \p first. This is - * useful for computing differences "in place". - * - * The following code snippet demonstrates how to use \p adjacent_difference to compute - * the difference between adjacent elements of a range using the \p thrust::device execution policy: - * - * \code - * #include - * #include - * #include - * ... - * int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2}; - * thrust::device_vector d_data(h_data, h_data + 8); - * thrust::device_vector d_result(8); - * - * thrust::adjacent_difference(thrust::device, d_data.begin(), d_data.end(), d_result.begin()); - * - * // d_result is now [1, 1, -1, 1, -1, 1, -1, 1] - * \endcode - * - * \see http://www.sgi.com/tech/stl/adjacent_difference.html - * \see inclusive_scan - */ -template -OutputIterator adjacent_difference(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - OutputIterator result); - -/*! \p adjacent_difference calculates the differences of adjacent elements in the - * range [first, last). 
That is, *first is assigned to - * \*result, and, for each iterator \p i in the range - * [first + 1, last), binary_op(\*i, \*(i - 1)) is assigned to - * \*(result + (i - first)). - * - * This version of \p adjacent_difference uses the binary function \p binary_op to - * calculate differences. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input range. - * \param last The end of the input range. - * \param result The beginning of the output range. - * \param binary_op The binary function used to compute differences. - * \return The iterator result + (last - first) - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type, - * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam BinaryFunction's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types. - * - * \remark Note that \p result is permitted to be the same iterator as \p first. This is - * useful for computing differences "in place". - * - * The following code snippet demonstrates how to use \p adjacent_difference to compute - * the sum between adjacent elements of a range using the \p thrust::device execution policy: - * - * \code - * #include - * #include - * #include - * #include - * ... - * int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2}; - * thrust::device_vector d_data(h_data, h_data + 8); - * thrust::device_vector d_result(8); - * - * thrust::adjacent_difference(thrust::device, d_data.begin(), d_data.end(), d_result.begin(), thrust::plus()); - * - * // d_data is now [1, 3, 3, 3, 3, 3, 3, 3] - * \endcode - * - * \see http://www.sgi.com/tech/stl/adjacent_difference.html - * \see inclusive_scan - */ -template -OutputIterator adjacent_difference(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - OutputIterator result, - BinaryFunction binary_op); - -/*! \p adjacent_difference calculates the differences of adjacent elements in the - * range [first, last). That is, \*first is assigned to - * \*result, and, for each iterator \p i in the range - * [first + 1, last), the difference of \*i and *(i - 1) - * is assigned to \*(result + (i - first)). - * - * This version of \p adjacent_difference uses operator- to calculate - * differences. - * - * \param first The beginning of the input range. - * \param last The end of the input range. - * \param result The beginning of the output range. - * \return The iterator result + (last - first) - * - * \tparam InputIterator is a model of Input Iterator, - * and \c x and \c y are objects of \p InputIterator's \c value_type, then \c x - \c is defined, - * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types, - * and the return type of x - y is convertible to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \remark Note that \p result is permitted to be the same iterator as \p first. This is - * useful for computing differences "in place". 
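The snippets embedded in the deleted adjacent_difference.h above lost their angle-bracket contents (header names and template arguments) during extraction. The following standalone sketch restores them, assuming int elements as in the original comments; both the default operator- form and the thrust::plus<int>() form are shown.

    #include <thrust/adjacent_difference.h>
    #include <thrust/device_vector.h>
    #include <thrust/functional.h>

    int main()
    {
        int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2};
        thrust::device_vector<int> d_data(h_data, h_data + 8);
        thrust::device_vector<int> d_result(8);

        // default form uses operator-:
        // d_result becomes [1, 1, -1, 1, -1, 1, -1, 1]
        thrust::adjacent_difference(d_data.begin(), d_data.end(), d_result.begin());

        // binary-functor form computes adjacent sums:
        // d_result becomes [1, 3, 3, 3, 3, 3, 3, 3]
        thrust::adjacent_difference(d_data.begin(), d_data.end(), d_result.begin(),
                                    thrust::plus<int>());
        return 0;
    }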
- * - * The following code snippet demonstrates how to use \p adjacent_difference to compute - * the difference between adjacent elements of a range. - * - * \code - * #include - * #include - * ... - * int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2}; - * thrust::device_vector d_data(h_data, h_data + 8); - * thrust::device_vector d_result(8); - * - * thrust::adjacent_difference(d_data.begin(), d_data.end(), d_result.begin()); - * - * // d_result is now [1, 1, -1, 1, -1, 1, -1, 1] - * \endcode - * - * \see http://www.sgi.com/tech/stl/adjacent_difference.html - * \see inclusive_scan - */ -template -OutputIterator adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result); - -/*! \p adjacent_difference calculates the differences of adjacent elements in the - * range [first, last). That is, *first is assigned to - * \*result, and, for each iterator \p i in the range - * [first + 1, last), binary_op(\*i, \*(i - 1)) is assigned to - * \*(result + (i - first)). - * - * This version of \p adjacent_difference uses the binary function \p binary_op to - * calculate differences. - * - * \param first The beginning of the input range. - * \param last The end of the input range. - * \param result The beginning of the output range. - * \param binary_op The binary function used to compute differences. - * \return The iterator result + (last - first) - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type, - * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam BinaryFunction's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types. - * - * \remark Note that \p result is permitted to be the same iterator as \p first. This is - * useful for computing differences "in place". - * - * The following code snippet demonstrates how to use \p adjacent_difference to compute - * the sum between adjacent elements of a range. - * - * \code - * #include - * #include - * #include - * ... - * int h_data[8] = {1, 2, 1, 2, 1, 2, 1, 2}; - * thrust::device_vector d_data(h_data, h_data + 8); - * thrust::device_vector d_result(8); - * - * thrust::adjacent_difference(d_data.begin(), d_data.end(), d_result.begin(), thrust::plus()); - * - * // d_data is now [1, 3, 3, 3, 3, 3, 3, 3] - * \endcode - * - * \see http://www.sgi.com/tech/stl/adjacent_difference.html - * \see inclusive_scan - */ -template -OutputIterator adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result, - BinaryFunction binary_op); - -/*! \} - */ - -} // end namespace thrust - -#include - diff --git a/compat/thrust/advance.h b/compat/thrust/advance.h deleted file mode 100644 index e7f60b0d54..0000000000 --- a/compat/thrust/advance.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file advance.h - * \brief Advance an iterator by a given distance. - */ - -#pragma once - -#include - -namespace thrust -{ - - -/*! \addtogroup iterators - * \{ - */ - -/*! \p advance(i, n) increments the iterator \p i by the distance \p n. - * If n > 0 it is equivalent to executing ++i \p n - * times, and if n < 0 it is equivalent to executing --i - * \p n times. If n == 0, the call has no effect. - * - * \param i The iterator to be advanced. - * \param n The distance by which to advance the iterator. - * - * \tparam InputIterator is a model of Input Iterator. - * \tparam Distance is an integral type that is convertible to \p InputIterator's distance type. - * - * \pre \p n shall be negative only for bidirectional and random access iterators. - * - * The following code snippet demonstrates how to use \p advance to increment - * an iterator a given number of times. - * - * \code - * #include - * #include - * ... - * thrust::device_vector vec(13); - * thrust::device_vector::iterator iter = vec.begin(); - * - * thrust::advance(iter, 7); - * - * // iter - vec.begin() == 7 - * \endcode - * - * \see http://www.sgi.com/tech/stl/advance.html - */ -template -void advance(InputIterator& i, Distance n); - -/*! \} // end iterators - */ - -} // end thrust - -#include - diff --git a/compat/thrust/binary_search.h b/compat/thrust/binary_search.h deleted file mode 100644 index d2ac5a621e..0000000000 --- a/compat/thrust/binary_search.h +++ /dev/null @@ -1,1888 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file binary_search.h - * \brief Search for values in sorted ranges. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup algorithms - */ - - -/*! \addtogroup searching - * \ingroup algorithms - * \{ - */ - - -/*! \addtogroup binary_search Binary Search - * \ingroup searching - * \{ - */ - - -////////////////////// -// Scalar Functions // -////////////////////// - - -/*! \p lower_bound is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). - * Specifically, it returns the first position where value could be - * inserted without violating the ordering. This version of - * \p lower_bound uses operator< for comparison and returns - * the furthermost iterator \c i in [first, last) such that, - * for every iterator \c j in [first, i), *j < value. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \return The furthermost iterator \c i, such that *i < value. - * - * \tparam DerivedPolicy The name of the derived execution policy. 
- * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam LessThanComparable is a model of LessThanComparable. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::lower_bound(thrust::device, input.begin(), input.end(), 0); // returns input.begin() - * thrust::lower_bound(thrust::device, input.begin(), input.end(), 1); // returns input.begin() + 1 - * thrust::lower_bound(thrust::device, input.begin(), input.end(), 2); // returns input.begin() + 1 - * thrust::lower_bound(thrust::device, input.begin(), input.end(), 3); // returns input.begin() + 2 - * thrust::lower_bound(thrust::device, input.begin(), input.end(), 8); // returns input.begin() + 4 - * thrust::lower_bound(thrust::device, input.begin(), input.end(), 9); // returns input.end() - * \endcode - * - * \see http://www.sgi.com/tech/stl/lower_bound.html - * \see \p upper_bound - * \see \p equal_range - * \see \p binary_search - */ -template -ForwardIterator lower_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable &value); - - -/*! \p lower_bound is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). - * Specifically, it returns the first position where value could be - * inserted without violating the ordering. This version of - * \p lower_bound uses operator< for comparison and returns - * the furthermost iterator \c i in [first, last) such that, - * for every iterator \c j in [first, i), *j < value. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \return The furthermost iterator \c i, such that *i < value. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam LessThanComparable is a model of LessThanComparable. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for values in a ordered range. - * - * \code - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::lower_bound(input.begin(), input.end(), 0); // returns input.begin() - * thrust::lower_bound(input.begin(), input.end(), 1); // returns input.begin() + 1 - * thrust::lower_bound(input.begin(), input.end(), 2); // returns input.begin() + 1 - * thrust::lower_bound(input.begin(), input.end(), 3); // returns input.begin() + 2 - * thrust::lower_bound(input.begin(), input.end(), 8); // returns input.begin() + 4 - * thrust::lower_bound(input.begin(), input.end(), 9); // returns input.end() - * \endcode - * - * \see http://www.sgi.com/tech/stl/lower_bound.html - * \see \p upper_bound - * \see \p equal_range - * \see \p binary_search - */ -template -ForwardIterator lower_bound(ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value); - - -/*! \p lower_bound is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). - * Specifically, it returns the first position where value could be - * inserted without violating the ordering. 
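The scalar lower_bound overloads documented above likewise lost their template arguments in extraction. A brief sketch under the same assumptions as the original snippets (int elements, thrust::less<int> as the comparator, thrust::device as the execution policy):

    #include <thrust/binary_search.h>
    #include <thrust/device_vector.h>
    #include <thrust/functional.h>
    #include <thrust/execution_policy.h>

    int main()
    {
        thrust::device_vector<int> input(5);
        input[0] = 0; input[1] = 2; input[2] = 5; input[3] = 7; input[4] = 8;

        // default comparison: first position where 3 could be inserted
        thrust::device_vector<int>::iterator it =
            thrust::lower_bound(input.begin(), input.end(), 3);      // input.begin() + 2

        // explicit comparator and execution policy give the same position
        it = thrust::lower_bound(thrust::device, input.begin(), input.end(),
                                 3, thrust::less<int>());            // input.begin() + 2
        (void) it;
        return 0;
    }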
This version of - * \p lower_bound uses function object \c comp for comparison - * and returns the furthermost iterator \c i in [first, last) - * such that, for every iterator \c j in [first, i), - * comp(*j, value) is \c true. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \param comp The comparison operator. - * \return The furthermost iterator \c i, such that comp(*i, value) is \c true. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam T is comparable to \p ForwardIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::lower_bound(input.begin(), input.end(), 0, thrust::less()); // returns input.begin() - * thrust::lower_bound(input.begin(), input.end(), 1, thrust::less()); // returns input.begin() + 1 - * thrust::lower_bound(input.begin(), input.end(), 2, thrust::less()); // returns input.begin() + 1 - * thrust::lower_bound(input.begin(), input.end(), 3, thrust::less()); // returns input.begin() + 2 - * thrust::lower_bound(input.begin(), input.end(), 8, thrust::less()); // returns input.begin() + 4 - * thrust::lower_bound(input.begin(), input.end(), 9, thrust::less()); // returns input.end() - * \endcode - * - * \see http://www.sgi.com/tech/stl/lower_bound.html - * \see \p upper_bound - * \see \p equal_range - * \see \p binary_search - */ -template -ForwardIterator lower_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T &value, - StrictWeakOrdering comp); - - -/*! \p lower_bound is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). - * Specifically, it returns the first position where value could be - * inserted without violating the ordering. This version of - * \p lower_bound uses function object \c comp for comparison - * and returns the furthermost iterator \c i in [first, last) - * such that, for every iterator \c j in [first, i), - * comp(*j, value) is \c true. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \param comp The comparison operator. - * \return The furthermost iterator \c i, such that comp(*i, value) is \c true. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam T is comparable to \p ForwardIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for values in a ordered range. - * - * \code - * #include - * #include - * #include - * ... 
- * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::lower_bound(input.begin(), input.end(), 0, thrust::less()); // returns input.begin() - * thrust::lower_bound(input.begin(), input.end(), 1, thrust::less()); // returns input.begin() + 1 - * thrust::lower_bound(input.begin(), input.end(), 2, thrust::less()); // returns input.begin() + 1 - * thrust::lower_bound(input.begin(), input.end(), 3, thrust::less()); // returns input.begin() + 2 - * thrust::lower_bound(input.begin(), input.end(), 8, thrust::less()); // returns input.begin() + 4 - * thrust::lower_bound(input.begin(), input.end(), 9, thrust::less()); // returns input.end() - * \endcode - * - * \see http://www.sgi.com/tech/stl/lower_bound.html - * \see \p upper_bound - * \see \p equal_range - * \see \p binary_search - */ -template -ForwardIterator lower_bound(ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp); - - -/*! \p upper_bound is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). - * Specifically, it returns the last position where value could be - * inserted without violating the ordering. This version of - * \p upper_bound uses operator< for comparison and returns - * the furthermost iterator \c i in [first, last) such that, - * for every iterator \c j in [first, i), value < *j - * is \c false. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \return The furthermost iterator \c i, such that value < *i is \c false. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam LessThanComparable is a model of LessThanComparable. - * - * The following code snippet demonstrates how to use \p upper_bound - * to search for values in a ordered range using the \p thrust::device execution policy for parallelism: - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 0); // returns input.begin() + 1 - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 1); // returns input.begin() + 1 - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 2); // returns input.begin() + 2 - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 3); // returns input.begin() + 2 - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 8); // returns input.end() - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 9); // returns input.end() - * \endcode - * - * \see http://www.sgi.com/tech/stl/upper_bound.html - * \see \p lower_bound - * \see \p equal_range - * \see \p binary_search - */ -template -ForwardIterator upper_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable &value); - - -/*! \p upper_bound is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). 
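As a quick contrast with lower_bound, upper_bound (documented above) returns the last position where the value could be inserted. A small sketch on the same five-element input used throughout these comments, again assuming int elements:

    #include <thrust/binary_search.h>
    #include <thrust/device_vector.h>

    int main()
    {
        thrust::device_vector<int> input(5);
        input[0] = 0; input[1] = 2; input[2] = 5; input[3] = 7; input[4] = 8;

        // 2 is present: lower_bound points at it, upper_bound one past it
        thrust::device_vector<int>::iterator lo =
            thrust::lower_bound(input.begin(), input.end(), 2);  // input.begin() + 1
        thrust::device_vector<int>::iterator hi =
            thrust::upper_bound(input.begin(), input.end(), 2);  // input.begin() + 2
        (void) lo; (void) hi;
        return 0;
    }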
- * Specifically, it returns the last position where value could be - * inserted without violating the ordering. This version of - * \p upper_bound uses operator< for comparison and returns - * the furthermost iterator \c i in [first, last) such that, - * for every iterator \c j in [first, i), value < *j - * is \c false. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \return The furthermost iterator \c i, such that value < *i is \c false. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam LessThanComparable is a model of LessThanComparable. - * - * The following code snippet demonstrates how to use \p upper_bound - * to search for values in a ordered range. - * - * \code - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::upper_bound(input.begin(), input.end(), 0); // returns input.begin() + 1 - * thrust::upper_bound(input.begin(), input.end(), 1); // returns input.begin() + 1 - * thrust::upper_bound(input.begin(), input.end(), 2); // returns input.begin() + 2 - * thrust::upper_bound(input.begin(), input.end(), 3); // returns input.begin() + 2 - * thrust::upper_bound(input.begin(), input.end(), 8); // returns input.end() - * thrust::upper_bound(input.begin(), input.end(), 9); // returns input.end() - * \endcode - * - * \see http://www.sgi.com/tech/stl/upper_bound.html - * \see \p lower_bound - * \see \p equal_range - * \see \p binary_search - */ -template -ForwardIterator upper_bound(ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value); - - -/*! \p upper_bound is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). - * Specifically, it returns the last position where value could be - * inserted without violating the ordering. This version of - * \p upper_bound uses function object \c comp for comparison and returns - * the furthermost iterator \c i in [first, last) such that, - * for every iterator \c j in [first, i), comp(value, *j) - * is \c false. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \param comp The comparison operator. - * \return The furthermost iterator \c i, such that comp(value, *i) is \c false. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam T is comparable to \p ForwardIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p upper_bound - * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * #include - * ... 
- * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 0, thrust::less()); // returns input.begin() + 1 - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 1, thrust::less()); // returns input.begin() + 1 - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 2, thrust::less()); // returns input.begin() + 2 - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 3, thrust::less()); // returns input.begin() + 2 - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 8, thrust::less()); // returns input.end() - * thrust::upper_bound(thrust::device, input.begin(), input.end(), 9, thrust::less()); // returns input.end() - * \endcode - * - * \see http://www.sgi.com/tech/stl/upper_bound.html - * \see \p lower_bound - * \see \p equal_range - * \see \p binary_search - */ -template -ForwardIterator upper_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T &value, - StrictWeakOrdering comp); - -/*! \p upper_bound is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). - * Specifically, it returns the last position where value could be - * inserted without violating the ordering. This version of - * \p upper_bound uses function object \c comp for comparison and returns - * the furthermost iterator \c i in [first, last) such that, - * for every iterator \c j in [first, i), comp(value, *j) - * is \c false. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \param comp The comparison operator. - * \return The furthermost iterator \c i, such that comp(value, *i) is \c false. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam T is comparable to \p ForwardIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p upper_bound - * to search for values in a ordered range. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::upper_bound(input.begin(), input.end(), 0, thrust::less()); // returns input.begin() + 1 - * thrust::upper_bound(input.begin(), input.end(), 1, thrust::less()); // returns input.begin() + 1 - * thrust::upper_bound(input.begin(), input.end(), 2, thrust::less()); // returns input.begin() + 2 - * thrust::upper_bound(input.begin(), input.end(), 3, thrust::less()); // returns input.begin() + 2 - * thrust::upper_bound(input.begin(), input.end(), 8, thrust::less()); // returns input.end() - * thrust::upper_bound(input.begin(), input.end(), 9, thrust::less()); // returns input.end() - * \endcode - * - * \see http://www.sgi.com/tech/stl/upper_bound.html - * \see \p lower_bound - * \see \p equal_range - * \see \p binary_search - */ -template -ForwardIterator upper_bound(ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp); - - -/*! \p binary_search is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). 
- * It returns \c true if an element that is equivalent to \c value - * is present in [first, last) and \c false if no such element - * exists. Specifically, this version returns \c true if and only if - * there exists an iterator \c i in [first, last) such that - * *i < value and value < *i are both \c false. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \return \c true if an equivalent element exists in [first, last), otherwise \c false. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam LessThanComparable is a model of LessThanComparable. - * - * The following code snippet demonstrates how to use \p binary_search - * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::binary_search(thrust::device, input.begin(), input.end(), 0); // returns true - * thrust::binary_search(thrust::device, input.begin(), input.end(), 1); // returns false - * thrust::binary_search(thrust::device, input.begin(), input.end(), 2); // returns true - * thrust::binary_search(thrust::device, input.begin(), input.end(), 3); // returns false - * thrust::binary_search(thrust::device, input.begin(), input.end(), 8); // returns true - * thrust::binary_search(thrust::device, input.begin(), input.end(), 9); // returns false - * \endcode - * - * \see http://www.sgi.com/tech/stl/binary_search.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p equal_range - */ -template -bool binary_search(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value); - - -/*! \p binary_search is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). - * It returns \c true if an element that is equivalent to \c value - * is present in [first, last) and \c false if no such element - * exists. Specifically, this version returns \c true if and only if - * there exists an iterator \c i in [first, last) such that - * *i < value and value < *i are both \c false. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \return \c true if an equivalent element exists in [first, last), otherwise \c false. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam LessThanComparable is a model of LessThanComparable. - * - * The following code snippet demonstrates how to use \p binary_search - * to search for values in a ordered range. - * - * \code - * #include - * #include - * ... 
- * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::binary_search(input.begin(), input.end(), 0); // returns true - * thrust::binary_search(input.begin(), input.end(), 1); // returns false - * thrust::binary_search(input.begin(), input.end(), 2); // returns true - * thrust::binary_search(input.begin(), input.end(), 3); // returns false - * thrust::binary_search(input.begin(), input.end(), 8); // returns true - * thrust::binary_search(input.begin(), input.end(), 9); // returns false - * \endcode - * - * \see http://www.sgi.com/tech/stl/binary_search.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p equal_range - */ -template -bool binary_search(ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value); - - -/*! \p binary_search is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). - * It returns \c true if an element that is equivalent to \c value - * is present in [first, last) and \c false if no such element - * exists. Specifically, this version returns \c true if and only if - * there exists an iterator \c i in [first, last) such that - * comp(*i, value) and comp(value, *i) are both \c false. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \param comp The comparison operator. - * \return \c true if an equivalent element exists in [first, last), otherwise \c false. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam T is comparable to \p ForwardIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p binary_search - * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::binary_search(thrust::device, input.begin(), input.end(), 0, thrust::less()); // returns true - * thrust::binary_search(thrust::device, input.begin(), input.end(), 1, thrust::less()); // returns false - * thrust::binary_search(thrust::device, input.begin(), input.end(), 2, thrust::less()); // returns true - * thrust::binary_search(thrust::device, input.begin(), input.end(), 3, thrust::less()); // returns false - * thrust::binary_search(thrust::device, input.begin(), input.end(), 8, thrust::less()); // returns true - * thrust::binary_search(thrust::device, input.begin(), input.end(), 9, thrust::less()); // returns false - * \endcode - * - * \see http://www.sgi.com/tech/stl/binary_search.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p equal_range - */ -template -bool binary_search(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp); - - -/*! \p binary_search is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). 
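Unlike lower_bound and upper_bound, binary_search (documented above) only reports whether an equivalent element exists. A minimal sketch with the stripped template arguments restored, mirroring the original snippets (int elements, thrust::less<int>):

    #include <thrust/binary_search.h>
    #include <thrust/device_vector.h>
    #include <thrust/functional.h>

    int main()
    {
        thrust::device_vector<int> input(5);
        input[0] = 0; input[1] = 2; input[2] = 5; input[3] = 7; input[4] = 8;

        bool found  = thrust::binary_search(input.begin(), input.end(), 2);   // true
        bool absent = thrust::binary_search(input.begin(), input.end(), 3,
                                            thrust::less<int>());             // false
        (void) found; (void) absent;
        return 0;
    }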
- * It returns \c true if an element that is equivalent to \c value - * is present in [first, last) and \c false if no such element - * exists. Specifically, this version returns \c true if and only if - * there exists an iterator \c i in [first, last) such that - * comp(*i, value) and comp(value, *i) are both \c false. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \param comp The comparison operator. - * \return \c true if an equivalent element exists in [first, last), otherwise \c false. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam T is comparable to \p ForwardIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p binary_search - * to search for values in a ordered range. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::binary_search(input.begin(), input.end(), 0, thrust::less()); // returns true - * thrust::binary_search(input.begin(), input.end(), 1, thrust::less()); // returns false - * thrust::binary_search(input.begin(), input.end(), 2, thrust::less()); // returns true - * thrust::binary_search(input.begin(), input.end(), 3, thrust::less()); // returns false - * thrust::binary_search(input.begin(), input.end(), 8, thrust::less()); // returns true - * thrust::binary_search(input.begin(), input.end(), 9, thrust::less()); // returns false - * \endcode - * - * \see http://www.sgi.com/tech/stl/binary_search.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p equal_range - */ -template -bool binary_search(ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp); - - -/*! \p equal_range is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). The - * value returned by \p equal_range is essentially a combination of - * the values returned by \p lower_bound and \p upper_bound: it returns - * a \p pair of iterators \c i and \c j such that \c i is the first - * position where value could be inserted without violating the - * ordering and \c j is the last position where value could be inserted - * without violating the ordering. It follows that every element in the - * range [i, j) is equivalent to value, and that - * [i, j) is the largest subrange of [first, last) that - * has this property. - * - * This version of \p equal_range returns a \p pair of iterators - * [i, j), where \c i is the furthermost iterator in - * [first, last) such that, for every iterator \c k in - * [first, i), *k < value. \c j is the furthermost - * iterator in [first, last) such that, for every iterator - * \c k in [first, j), value < *k is \c false. - * For every iterator \c k in [i, j), neither - * value < *k nor *k < value is \c true. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \return A \p pair of iterators [i, j) that define the range of equivalent elements. - * - * \tparam DerivedPolicy The name of the derived execution policy. 
- * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam LessThanComparable is a model of LessThanComparable. - * - * The following code snippet demonstrates how to use \p equal_range - * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::equal_range(thrust::device, input.begin(), input.end(), 0); // returns [input.begin(), input.begin() + 1) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 1); // returns [input.begin() + 1, input.begin() + 1) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 2); // returns [input.begin() + 1, input.begin() + 2) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 3); // returns [input.begin() + 2, input.begin() + 2) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 9); // returns [input.end(), input.end) - * \endcode - * - * \see http://www.sgi.com/tech/stl/equal_range.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p binary_search - */ -template -thrust::pair -equal_range(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value); - - -/*! \p equal_range is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). The - * value returned by \p equal_range is essentially a combination of - * the values returned by \p lower_bound and \p upper_bound: it returns - * a \p pair of iterators \c i and \c j such that \c i is the first - * position where value could be inserted without violating the - * ordering and \c j is the last position where value could be inserted - * without violating the ordering. It follows that every element in the - * range [i, j) is equivalent to value, and that - * [i, j) is the largest subrange of [first, last) that - * has this property. - * - * This version of \p equal_range returns a \p pair of iterators - * [i, j), where \c i is the furthermost iterator in - * [first, last) such that, for every iterator \c k in - * [first, i), *k < value. \c j is the furthermost - * iterator in [first, last) such that, for every iterator - * \c k in [first, j), value < *k is \c false. - * For every iterator \c k in [i, j), neither - * value < *k nor *k < value is \c true. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \return A \p pair of iterators [i, j) that define the range of equivalent elements. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam LessThanComparable is a model of LessThanComparable. - * - * The following code snippet demonstrates how to use \p equal_range - * to search for values in a ordered range. - * - * \code - * #include - * #include - * ... 
- * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::equal_range(input.begin(), input.end(), 0); // returns [input.begin(), input.begin() + 1) - * thrust::equal_range(input.begin(), input.end(), 1); // returns [input.begin() + 1, input.begin() + 1) - * thrust::equal_range(input.begin(), input.end(), 2); // returns [input.begin() + 1, input.begin() + 2) - * thrust::equal_range(input.begin(), input.end(), 3); // returns [input.begin() + 2, input.begin() + 2) - * thrust::equal_range(input.begin(), input.end(), 8); // returns [input.begin() + 4, input.end) - * thrust::equal_range(input.begin(), input.end(), 9); // returns [input.end(), input.end) - * \endcode - * - * \see http://www.sgi.com/tech/stl/equal_range.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p binary_search - */ -template -thrust::pair -equal_range(ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value); - - -/*! \p equal_range is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). The - * value returned by \p equal_range is essentially a combination of - * the values returned by \p lower_bound and \p upper_bound: it returns - * a \p pair of iterators \c i and \c j such that \c i is the first - * position where value could be inserted without violating the - * ordering and \c j is the last position where value could be inserted - * without violating the ordering. It follows that every element in the - * range [i, j) is equivalent to value, and that - * [i, j) is the largest subrange of [first, last) that - * has this property. - * - * This version of \p equal_range returns a \p pair of iterators - * [i, j). \c i is the furthermost iterator in - * [first, last) such that, for every iterator \c k in - * [first, i), comp(*k, value) is \c true. - * \c j is the furthermost iterator in [first, last) such - * that, for every iterator \c k in [first, last), - * comp(value, *k) is \c false. For every iterator \c k - * in [i, j), neither comp(value, *k) nor - * comp(*k, value) is \c true. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \param comp The comparison operator. - * \return A \p pair of iterators [i, j) that define the range of equivalent elements. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam T is comparable to \p ForwardIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p equal_range - * to search for values in a ordered range using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * #include - * ... 
- * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::equal_range(thrust::device, input.begin(), input.end(), 0, thrust::less()); // returns [input.begin(), input.begin() + 1) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 1, thrust::less()); // returns [input.begin() + 1, input.begin() + 1) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 2, thrust::less()); // returns [input.begin() + 1, input.begin() + 2) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 3, thrust::less()); // returns [input.begin() + 2, input.begin() + 2) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 8, thrust::less()); // returns [input.begin() + 4, input.end) - * thrust::equal_range(thrust::device, input.begin(), input.end(), 9, thrust::less()); // returns [input.end(), input.end) - * \endcode - * - * \see http://www.sgi.com/tech/stl/equal_range.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p binary_search - */ -template -thrust::pair -equal_range(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp); - - -/*! \p equal_range is a version of binary search: it attempts to find - * the element value in an ordered range [first, last). The - * value returned by \p equal_range is essentially a combination of - * the values returned by \p lower_bound and \p upper_bound: it returns - * a \p pair of iterators \c i and \c j such that \c i is the first - * position where value could be inserted without violating the - * ordering and \c j is the last position where value could be inserted - * without violating the ordering. It follows that every element in the - * range [i, j) is equivalent to value, and that - * [i, j) is the largest subrange of [first, last) that - * has this property. - * - * This version of \p equal_range returns a \p pair of iterators - * [i, j). \c i is the furthermost iterator in - * [first, last) such that, for every iterator \c k in - * [first, i), comp(*k, value) is \c true. - * \c j is the furthermost iterator in [first, last) such - * that, for every iterator \c k in [first, last), - * comp(value, *k) is \c false. For every iterator \c k - * in [i, j), neither comp(value, *k) nor - * comp(*k, value) is \c true. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param value The value to be searched. - * \param comp The comparison operator. - * \return A \p pair of iterators [i, j) that define the range of equivalent elements. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam T is comparable to \p ForwardIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p equal_range - * to search for values in a ordered range. - * - * \code - * #include - * #include - * #include - * ... 
- * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::equal_range(input.begin(), input.end(), 0, thrust::less()); // returns [input.begin(), input.begin() + 1) - * thrust::equal_range(input.begin(), input.end(), 1, thrust::less()); // returns [input.begin() + 1, input.begin() + 1) - * thrust::equal_range(input.begin(), input.end(), 2, thrust::less()); // returns [input.begin() + 1, input.begin() + 2) - * thrust::equal_range(input.begin(), input.end(), 3, thrust::less()); // returns [input.begin() + 2, input.begin() + 2) - * thrust::equal_range(input.begin(), input.end(), 8, thrust::less()); // returns [input.begin() + 4, input.end) - * thrust::equal_range(input.begin(), input.end(), 9, thrust::less()); // returns [input.end(), input.end) - * \endcode - * - * \see http://www.sgi.com/tech/stl/equal_range.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p binary_search - */ -template -thrust::pair -equal_range(ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp); - - -/*! \addtogroup vectorized_binary_search Vectorized Searches - * \ingroup binary_search - * \{ - */ - - -////////////////////// -// Vector Functions // -////////////////////// - - -/*! \p lower_bound is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * Specifically, it returns the index of first position where value could - * be inserted without violating the ordering. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is LessThanComparable. - * \tparam OutputIterator is a model of Output Iterator. - * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for multiple values in a ordered range using the \p thrust::device execution policy for - * parallelization: - * - * \code - * #include - * #include - * #include - * ... 
- * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::lower_bound(thrust::device, - * input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin()); - * - * // output is now [0, 1, 1, 2, 4, 5] - * \endcode - * - * \see http://www.sgi.com/tech/stl/lower_bound.html - * \see \p upper_bound - * \see \p equal_range - * \see \p binary_search - */ -template -OutputIterator lower_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result); - - -/*! \p lower_bound is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * Specifically, it returns the index of first position where value could - * be inserted without violating the ordering. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is LessThanComparable. - * \tparam OutputIterator is a model of Output Iterator. - * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for multiple values in a ordered range. - * - * \code - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::lower_bound(input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin()); - * - * // output is now [0, 1, 1, 2, 4, 5] - * \endcode - * - * \see http://www.sgi.com/tech/stl/lower_bound.html - * \see \p upper_bound - * \see \p equal_range - * \see \p binary_search - */ -template -OutputIterator lower_bound(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result); - - -/*! \p lower_bound is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * Specifically, it returns the index of first position where value could - * be inserted without violating the ordering. This version of - * \p lower_bound uses function object \c comp for comparison. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. 
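A self-contained sketch of the vectorized lower_bound shown above: one sorted haystack, several search values, one insertion index per value. The element and index types used here (int, unsigned int) are illustrative assumptions.

#include <thrust/binary_search.h>
#include <thrust/device_vector.h>

int main()
{
    thrust::device_vector<int> input(5);
    input[0] = 0; input[1] = 2; input[2] = 5; input[3] = 7; input[4] = 8;

    thrust::device_vector<int> values(6);
    values[0] = 0; values[1] = 1; values[2] = 2;
    values[3] = 3; values[4] = 8; values[5] = 9;

    // For each value, the first position where it could be inserted.
    thrust::device_vector<unsigned int> output(6);
    thrust::lower_bound(input.begin(), input.end(),
                        values.begin(), values.end(),
                        output.begin());
    // output is now [0, 1, 1, 2, 4, 5]
    return 0;
}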
- * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * \param comp The comparison operator. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator. - * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for multiple values in a ordered range. - * - * \code - * #include - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::lower_bound(input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin(), - * thrust::less()); - * - * // output is now [0, 1, 1, 2, 4, 5] - * \endcode - * - * \see http://www.sgi.com/tech/stl/lower_bound.html - * \see \p upper_bound - * \see \p equal_range - * \see \p binary_search - */ -template -OutputIterator lower_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result, - StrictWeakOrdering comp); - - -/*! \p lower_bound is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * Specifically, it returns the index of first position where value could - * be inserted without violating the ordering. This version of - * \p lower_bound uses function object \c comp for comparison. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * \param comp The comparison operator. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator. - * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for multiple values in a ordered range. - * - * \code - * #include - * #include - * #include - * ... 
- * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::lower_bound(input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin(), - * thrust::less()); - * - * // output is now [0, 1, 1, 2, 4, 5] - * \endcode - * - * \see http://www.sgi.com/tech/stl/lower_bound.html - * \see \p upper_bound - * \see \p equal_range - * \see \p binary_search - */ -template -OutputIterator lower_bound(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result, - StrictWeakOrdering comp); - - -/*! \p upper_bound is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * Specifically, it returns the index of last position where value could - * be inserted without violating the ordering. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is LessThanComparable. - * \tparam OutputIterator is a model of Output Iterator. - * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for multiple values in a ordered range using the \p thrust::device execution policy for - * parallelization: - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::upper_bound(thrust::device, - * input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin()); - * - * // output is now [1, 1, 2, 2, 5, 5] - * \endcode - * - * \see http://www.sgi.com/tech/stl/upper_bound.html - * \see \p upper_bound - * \see \p equal_range - * \see \p binary_search - */ -template -OutputIterator upper_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result); - - -/*! \p upper_bound is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * Specifically, it returns the index of last position where value could - * be inserted without violating the ordering. 
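The matching sketch for the vectorized upper_bound documented above, this time with an explicit thrust::device policy; the types and data are the same illustrative assumptions as before.

#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

int main()
{
    thrust::device_vector<int> input(5);
    input[0] = 0; input[1] = 2; input[2] = 5; input[3] = 7; input[4] = 8;

    thrust::device_vector<int> values(6);
    values[0] = 0; values[1] = 1; values[2] = 2;
    values[3] = 3; values[4] = 8; values[5] = 9;

    // For each value, the last position where it could be inserted.
    thrust::device_vector<unsigned int> output(6);
    thrust::upper_bound(thrust::device,
                        input.begin(), input.end(),
                        values.begin(), values.end(),
                        output.begin());
    // output is now [1, 1, 2, 2, 5, 5]
    return 0;
}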
- * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is LessThanComparable. - * \tparam OutputIterator is a model of Output Iterator. - * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for multiple values in a ordered range. - * - * \code - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::upper_bound(input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin()); - * - * // output is now [1, 1, 2, 2, 5, 5] - * \endcode - * - * \see http://www.sgi.com/tech/stl/upper_bound.html - * \see \p upper_bound - * \see \p equal_range - * \see \p binary_search - */ -template -OutputIterator upper_bound(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result); - - -/*! \p upper_bound is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * Specifically, it returns the index of first position where value could - * be inserted without violating the ordering. This version of - * \p upper_bound uses function object \c comp for comparison. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * \param comp The comparison operator. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator. - * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for multiple values in a ordered range using the \p thrust::device execution policy for - * parallelization: - * - * \code - * #include - * #include - * #include - * #include - * ... 
- * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::upper_bound(thrust::device, - * input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin(), - * thrust::less()); - * - * // output is now [1, 1, 2, 2, 5, 5] - * \endcode - * - * \see http://www.sgi.com/tech/stl/upper_bound.html - * \see \p lower_bound - * \see \p equal_range - * \see \p binary_search - */ -template -OutputIterator upper_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result, - StrictWeakOrdering comp); - - -/*! \p upper_bound is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * Specifically, it returns the index of first position where value could - * be inserted without violating the ordering. This version of - * \p upper_bound uses function object \c comp for comparison. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * \param comp The comparison operator. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator. - * and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p lower_bound - * to search for multiple values in a ordered range. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::upper_bound(input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin(), - * thrust::less()); - * - * // output is now [1, 1, 2, 2, 5, 5] - * \endcode - * - * \see http://www.sgi.com/tech/stl/upper_bound.html - * \see \p lower_bound - * \see \p equal_range - * \see \p binary_search - */ -template -OutputIterator upper_bound(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result, - StrictWeakOrdering comp); - - -/*! \p binary_search is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). 
- * It returns \c true if an element that is equivalent to \c value - * is present in [first, last) and \c false if no such element - * exists. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is LessThanComparable. - * \tparam OutputIterator is a model of Output Iterator. - * and bool is convertible to \c OutputIterator's \c value_type. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p binary_search - * to search for multiple values in a ordered range using the \p thrust::device execution policy for - * parallelization: - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::binary_search(thrust::device, - * input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin()); - * - * // output is now [true, false, true, false, true, false] - * \endcode - * - * \see http://www.sgi.com/tech/stl/binary_search.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p equal_range - */ -template -OutputIterator binary_search(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result); - - -/*! \p binary_search is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * It returns \c true if an element that is equivalent to \c value - * is present in [first, last) and \c false if no such element - * exists. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is LessThanComparable. - * \tparam OutputIterator is a model of Output Iterator. - * and bool is convertible to \c OutputIterator's \c value_type. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p binary_search - * to search for multiple values in a ordered range. - * - * \code - * #include - * #include - * ... 
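A runnable sketch of the vectorized binary_search above, which writes one bool per search value; int elements and the sample data are illustrative assumptions.

#include <thrust/binary_search.h>
#include <thrust/device_vector.h>

int main()
{
    thrust::device_vector<int> input(5);
    input[0] = 0; input[1] = 2; input[2] = 5; input[3] = 7; input[4] = 8;

    thrust::device_vector<int> values(6);
    values[0] = 0; values[1] = 1; values[2] = 2;
    values[3] = 3; values[4] = 8; values[5] = 9;

    // true where the value occurs in the sorted input, false otherwise.
    thrust::device_vector<bool> output(6);
    thrust::binary_search(input.begin(), input.end(),
                          values.begin(), values.end(),
                          output.begin());
    // output is now [true, false, true, false, true, false]
    return 0;
}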
- * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::binary_search(input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin()); - * - * // output is now [true, false, true, false, true, false] - * \endcode - * - * \see http://www.sgi.com/tech/stl/binary_search.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p equal_range - */ -template -OutputIterator binary_search(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result); - - -/*! \p binary_search is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * It returns \c true if an element that is equivalent to \c value - * is present in [first, last) and \c false if no such element - * exists. This version of \p binary_search uses function object - * \c comp for comparison. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * \param comp The comparison operator. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is LessThanComparable. - * \tparam OutputIterator is a model of Output Iterator. - * and bool is convertible to \c OutputIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p binary_search - * to search for multiple values in a ordered range using the \p thrust::device execution policy for - * parallelization: - * - * \code - * #include - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::binary_search(thrust::device, - * input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin(), - * thrust::less()); - * - * // output is now [true, false, true, false, true, false] - * \endcode - * - * \see http://www.sgi.com/tech/stl/binary_search.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p equal_range - */ -template -OutputIterator binary_search(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result, - StrictWeakOrdering comp); - - -/*! 
\p binary_search is a vectorized version of binary search: for each - * iterator \c v in [values_first, values_last) it attempts to - * find the value *v in an ordered range [first, last). - * It returns \c true if an element that is equivalent to \c value - * is present in [first, last) and \c false if no such element - * exists. This version of \p binary_search uses function object - * \c comp for comparison. - * - * \param first The beginning of the ordered sequence. - * \param last The end of the ordered sequence. - * \param values_first The beginning of the search values sequence. - * \param values_last The end of the search values sequence. - * \param result The beginning of the output sequence. - * \param comp The comparison operator. - * - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam InputIterator is a model of Input Iterator. - * and \c InputIterator's \c value_type is LessThanComparable. - * \tparam OutputIterator is a model of Output Iterator. - * and bool is convertible to \c OutputIterator's \c value_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * \pre The ranges [first,last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p binary_search - * to search for multiple values in a ordered range. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(5); - * - * input[0] = 0; - * input[1] = 2; - * input[2] = 5; - * input[3] = 7; - * input[4] = 8; - * - * thrust::device_vector values(6); - * values[0] = 0; - * values[1] = 1; - * values[2] = 2; - * values[3] = 3; - * values[4] = 8; - * values[5] = 9; - * - * thrust::device_vector output(6); - * - * thrust::binary_search(input.begin(), input.end(), - * values.begin(), values.end(), - * output.begin(), - * thrust::less()); - * - * // output is now [true, false, true, false, true, false] - * \endcode - * - * \see http://www.sgi.com/tech/stl/binary_search.html - * \see \p lower_bound - * \see \p upper_bound - * \see \p equal_range - */ -template -OutputIterator binary_search(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator result, - StrictWeakOrdering comp); - - -/*! \} // end vectorized_binary_search - */ - - -/*! \} // end binary_search - */ - - -/*! \} // end searching - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/copy.h b/compat/thrust/copy.h deleted file mode 100644 index eaa9719459..0000000000 --- a/compat/thrust/copy.h +++ /dev/null @@ -1,505 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file copy.h - * \brief Copies elements from one range to another - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -/*! \addtogroup algorithms - */ - -/*! \addtogroup copying - * \ingroup algorithms - * \{ - */ - - -/*! 
\p copy copies elements from the range [\p first, \p last) to the range - * [\p result, \p result + (\p last - \p first)). That is, it performs - * the assignments *\p result = *\p first, *(\p result + \c 1) = *(\p first + \c 1), - * and so on. Generally, for every integer \c n from \c 0 to \p last - \p first, \p copy - * performs the assignment *(\p result + \c n) = *(\p first + \c n). Unlike - * \c std::copy, \p copy offers no guarantee on order of operation. As a result, - * calling \p copy with overlapping source and destination ranges has undefined - * behavior. - * - * The return value is \p result + (\p last - \p first). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence to copy. - * \param last The end of the sequence to copy. - * \param result The destination sequence. - * \return The end of the destination sequence. - * \see http://www.sgi.com/tech/stl/copy.html - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator must be a model of Output Iterator. - * - * \pre \p result may be equal to \p first, but \p result shall not be in the range [first, last) otherwise. - * - * The following code snippet demonstrates how to use \p copy - * to copy from one range to another using the \p thrust::device parallelization policy: - * - * \code - * #include - * #include - * #include - * ... - * - * thrust::device_vector vec0(100); - * thrust::device_vector vec1(100); - * ... - * - * thrust::copy(thrust::device, vec0.begin(), vec0.end(), vec1.begin()); - * - * // vec1 is now a copy of vec0 - * \endcode - */ -template - OutputIterator copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -/*! \p copy_n copies elements from the range [first, first + n) to the range - * [result, result + n). That is, it performs the assignments *result = *first, *(result + 1) = *(first + 1), - * and so on. Generally, for every integer \c i from \c 0 to \c n, \p copy - * performs the assignment *(\p result + \c i) = *(\p first + \c i). Unlike - * \c std::copy_n, \p copy_n offers no guarantee on order of operation. As a result, - * calling \p copy_n with overlapping source and destination ranges has undefined - * behavior. - * - * The return value is \p result + \p n. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range to copy. - * \param n The number of elements to copy. - * \param result The beginning destination range. - * \return The end of the destination range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. - * \tparam Size is an integral type. - * \tparam OutputIterator must be a model of Output Iterator. - * - * \pre \p result may be equal to \p first, but \p result shall not be in the range [first, first + n) otherwise. 
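A compilable sketch of the device-policy copy described above; the element type int and the use of thrust::sequence to give the source some contents are illustrative assumptions.

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>

int main()
{
    thrust::device_vector<int> vec0(100);
    thrust::device_vector<int> vec1(100);
    thrust::sequence(vec0.begin(), vec0.end());   // vec0 = 0, 1, ..., 99

    // Parallel copy on the device; source and destination must not overlap.
    thrust::copy(thrust::device, vec0.begin(), vec0.end(), vec1.begin());
    // vec1 is now a copy of vec0
    return 0;
}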
- * - * The following code snippet demonstrates how to use \p copy - * to copy from one range to another using the \p thrust::device parallelization policy: - * - * \code - * #include - * #include - * #include - * ... - * size_t n = 100; - * thrust::device_vector vec0(n); - * thrust::device_vector vec1(n); - * ... - * thrust::copy_n(thrust::device, vec0.begin(), n, vec1.begin()); - * - * // vec1 is now a copy of vec0 - * \endcode - * - * \see http://www.sgi.com/tech/stl/copy_n.html - * \see thrust::copy - */ -template - OutputIterator copy_n(const thrust::detail::execution_policy_base &exec, - InputIterator first, - Size n, - OutputIterator result); - - - -/*! \p copy copies elements from the range [\p first, \p last) to the range - * [\p result, \p result + (\p last - \p first)). That is, it performs - * the assignments *\p result = *\p first, *(\p result + \c 1) = *(\p first + \c 1), - * and so on. Generally, for every integer \c n from \c 0 to \p last - \p first, \p copy - * performs the assignment *(\p result + \c n) = *(\p first + \c n). Unlike - * \c std::copy, \p copy offers no guarantee on order of operation. As a result, - * calling \p copy with overlapping source and destination ranges has undefined - * behavior. - * - * The return value is \p result + (\p last - \p first). - * - * \param first The beginning of the sequence to copy. - * \param last The end of the sequence to copy. - * \param result The destination sequence. - * \return The end of the destination sequence. - * \see http://www.sgi.com/tech/stl/copy.html - * - * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator must be a model of Output Iterator. - * - * \pre \p result may be equal to \p first, but \p result shall not be in the range [first, last) otherwise. - * - * The following code snippet demonstrates how to use \p copy - * to copy from one range to another. - * - * \code - * #include - * #include - * ... - * - * thrust::device_vector vec0(100); - * thrust::device_vector vec1(100); - * ... - * - * thrust::copy(vec0.begin(), vec0.end(), - * vec1.begin()); - * - * // vec1 is now a copy of vec0 - * \endcode - */ -template - OutputIterator copy(InputIterator first, - InputIterator last, - OutputIterator result); - -/*! \p copy_n copies elements from the range [first, first + n) to the range - * [result, result + n). That is, it performs the assignments *result = *first, *(result + 1) = *(first + 1), - * and so on. Generally, for every integer \c i from \c 0 to \c n, \p copy - * performs the assignment *(\p result + \c i) = *(\p first + \c i). Unlike - * \c std::copy_n, \p copy_n offers no guarantee on order of operation. As a result, - * calling \p copy_n with overlapping source and destination ranges has undefined - * behavior. - * - * The return value is \p result + \p n. - * - * \param first The beginning of the range to copy. - * \param n The number of elements to copy. - * \param result The beginning destination range. - * \return The end of the destination range. - * - * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. - * \tparam Size is an integral type. - * \tparam OutputIterator must be a model of Output Iterator. - * - * \pre \p result may be equal to \p first, but \p result shall not be in the range [first, first + n) otherwise. 
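The copy_n variant documented above takes an element count instead of an end iterator; a short sketch, again assuming int elements and using thrust::fill only to give the source a value.

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>

int main()
{
    const size_t n = 100;
    thrust::device_vector<int> vec0(n);
    thrust::device_vector<int> vec1(n);
    thrust::fill(vec0.begin(), vec0.end(), 7);

    // Copy exactly n elements starting at vec0.begin().
    thrust::copy_n(vec0.begin(), n, vec1.begin());
    // vec1 is now a copy of vec0
    return 0;
}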
- * - * The following code snippet demonstrates how to use \p copy - * to copy from one range to another. - * - * \code - * #include - * #include - * ... - * size_t n = 100; - * thrust::device_vector vec0(n); - * thrust::device_vector vec1(n); - * ... - * thrust::copy_n(vec0.begin(), n, vec1.begin()); - * - * // vec1 is now a copy of vec0 - * \endcode - * - * \see http://www.sgi.com/tech/stl/copy_n.html - * \see thrust::copy - */ -template - OutputIterator copy_n(InputIterator first, - Size n, - OutputIterator result); - -/*! \} // end copying - */ - -/*! \addtogroup stream_compaction - * \{ - */ - - -/*! This version of \p copy_if copies elements from the range [first,last) - * to a range beginning at \ presult, except that any element which causes \p pred - * to be \p pred to be \c false is not copied. - * - * More precisely, for every integer \c n such that 0 <= n < last-first, - * \p copy_if performs the assignment *result = *(first+n) and \p result - * is advanced one position if pred(*(first+n)). Otherwise, no assignment - * occurs and \p result is not advanced. - * - * The algorithm's execution is parallelized as determined by \p system. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence from which to copy. - * \param last The end of the sequence from which to copy. - * \param result The beginning of the sequence into which to copy. - * \param pred The predicate to test on every value of the range [first, last). - * \return result + n, where \c n is equal to the number of times \p pred - * evaluated to \c true in the range [first, last). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The ranges [first, last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p copy_if to perform stream compaction - * to copy even numbers to an output range using the \p thrust::host parallelization policy: - * - * \code - * #include - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * const int N = 6; - * int V[N] = {-2, 0, -1, 0, 1, 2}; - * int result[4]; - * - * thrust::copy_if(thrust::host, V, V + N, result, is_even()); - * - * // V remains {-2, 0, -1, 0, 1, 2} - * // result is now {-2, 0, 0, 2} - * \endcode - * - * \see \c remove_copy_if - */ -template - OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred); - - - -/*! This version of \p copy_if copies elements from the range [first,last) - * to a range beginning at \ presult, except that any element which causes \p pred - * to be \p pred to be \c false is not copied. - * - * More precisely, for every integer \c n such that 0 <= n < last-first, - * \p copy_if performs the assignment *result = *(first+n) and \p result - * is advanced one position if pred(*(first+n)). Otherwise, no assignment - * occurs and \p result is not advanced. - * - * \param first The beginning of the sequence from which to copy. - * \param last The end of the sequence from which to copy. 
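A self-contained sketch of the predicate form of copy_if shown above, compacting even numbers out of a raw array on the host; the is_even functor and the data mirror the documentation's own example.

#include <thrust/copy.h>
#include <thrust/execution_policy.h>

struct is_even
{
    __host__ __device__
    bool operator()(const int x) const { return (x % 2) == 0; }
};

int main()
{
    const int N = 6;
    int V[N] = {-2, 0, -1, 0, 1, 2};
    int result[4];

    // Only elements for which is_even returns true are copied.
    thrust::copy_if(thrust::host, V, V + N, result, is_even());
    // V is unchanged; result is now {-2, 0, 0, 2}
    return 0;
}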
- * \param result The beginning of the sequence into which to copy. - * \param pred The predicate to test on every value of the range [first, last). - * \return result + n, where \c n is equal to the number of times \p pred - * evaluated to \c true in the range [first, last). - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The ranges [first, last) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p copy_if to perform stream compaction - * to copy even numbers to an output range. - * - * \code - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * const int N = 6; - * int V[N] = {-2, 0, -1, 0, 1, 2}; - * int result[4]; - * - * thrust::copy_if(V, V + N, result, is_even()); - * - * // V remains {-2, 0, -1, 0, 1, 2} - * // result is now {-2, 0, 0, 2} - * \endcode - * - * \see \c remove_copy_if - */ -template - OutputIterator copy_if(InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred); - - -/*! This version of \p copy_if copies elements from the range [first,last) - * to a range beginning at \p result, except that any element whose corresponding stencil - * element causes \p pred to be \c false is not copied. - * - * More precisely, for every integer \c n such that 0 <= n < last-first, - * \p copy_if performs the assignment *result = *(first+n) and \p result - * is advanced one position if pred(*(stencil+n)). Otherwise, no assignment - * occurs and \p result is not advanced. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence from which to copy. - * \param last The end of the sequence from which to copy. - * \param stencil The beginning of the stencil sequence. - * \param result The beginning of the sequence into which to copy. - * \param pred The predicate to test on every value of the range [stencil, stencil + (last-first)). - * \return result + n, where \c n is equal to the number of times \p pred - * evaluated to \c true in the range [stencil, stencil + (last-first)). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The ranges [first, last) and [result, result + (last - first)) shall not overlap. - * \pre The ranges [stencil, stencil + (last - first)) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p copy_if to perform stream compaction - * to copy numbers to an output range when corresponding stencil elements are even using the \p thrust::host execution policy: - * - * \code - * #include - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int x) - * { - * return (x % 2) == 0; - * } - * }; - * ... 
- * int N = 6; - * int data[N] = { 0, 1, 2, 3, 4, 5}; - * int stencil[N] = {-2, 0, -1, 0, 1, 2}; - * int result[4]; - * - * thrust::copy_if(thrust::host, data, data + N, stencil, result, is_even()); - * - * // data remains = { 0, 1, 2, 3, 4, 5}; - * // stencil remains = {-2, 0, -1, 0, 1, 2}; - * // result is now { 0, 1, 3, 5} - * \endcode - * - * \see \c remove_copy_if - */ -template - OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - - -/*! This version of \p copy_if copies elements from the range [first,last) - * to a range beginning at \p result, except that any element whose corresponding stencil - * element causes \p pred to be \c false is not copied. - * - * More precisely, for every integer \c n such that 0 <= n < last-first, - * \p copy_if performs the assignment *result = *(first+n) and \p result - * is advanced one position if pred(*(stencil+n)). Otherwise, no assignment - * occurs and \p result is not advanced. - * - * \param first The beginning of the sequence from which to copy. - * \param last The end of the sequence from which to copy. - * \param stencil The beginning of the stencil sequence. - * \param result The beginning of the sequence into which to copy. - * \param pred The predicate to test on every value of the range [stencil, stencil + (last-first)). - * \return result + n, where \c n is equal to the number of times \p pred - * evaluated to \c true in the range [stencil, stencil + (last-first)). - * - * \tparam InputIterator1 is a model of Input Iterator. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The ranges [first, last) and [result, result + (last - first)) shall not overlap. - * \pre The ranges [stencil, stencil + (last - first)) and [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p copy_if to perform stream compaction - * to copy numbers to an output range when corresponding stencil elements are even: - * - * \code - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * int N = 6; - * int data[N] = { 0, 1, 2, 3, 4, 5}; - * int stencil[N] = {-2, 0, -1, 0, 1, 2}; - * int result[4]; - * - * thrust::copy_if(data, data + N, stencil, result, is_even()); - * - * // data remains = { 0, 1, 2, 3, 4, 5}; - * // stencil remains = {-2, 0, -1, 0, 1, 2}; - * // result is now { 0, 1, 3, 5} - * \endcode - * - * \see \c remove_copy_if - */ -template - OutputIterator copy_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - -/*! \} // end stream_compaction - */ - -} // end namespace thrust - -#include -#include - diff --git a/compat/thrust/count.h b/compat/thrust/count.h deleted file mode 100644 index cddd1dd68b..0000000000 --- a/compat/thrust/count.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
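Likewise for the stencil overload of copy_if documented above: the predicate is applied to a second, parallel range, and elements of the data range are kept where the stencil passes. The data and predicate mirror the documentation's example.

#include <thrust/copy.h>
#include <thrust/execution_policy.h>

struct is_even
{
    __host__ __device__
    bool operator()(const int x) const { return (x % 2) == 0; }
};

int main()
{
    const int N = 6;
    int data[N]    = { 0, 1,  2, 3, 4, 5};
    int stencil[N] = {-2, 0, -1, 0, 1, 2};
    int result[4];

    // data[i] is copied iff is_even(stencil[i]) is true.
    thrust::copy_if(thrust::host, data, data + N, stencil, result, is_even());
    // result is now { 0, 1, 3, 5}
    return 0;
}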
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file count.h - * \brief Counting elements in a range - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup algorithms - */ - -/*! \addtogroup reductions - * \ingroup algorithms - * \{ - */ - -/*! \addtogroup counting - * \ingroup reductions - * \{ - */ - - -/*! \p count finds the number of elements in [first,last) that are equal - * to \p value. More precisely, \p count returns the number of iterators \c i in - * [first, last) such that *i == value. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param value The value to be counted. - * \return The number of elements equal to \p value. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be a model of must be a model of Equality Comparable. - * \tparam EqualityComparable must be a model of Equality Comparable and can be compared for equality with \c InputIterator's \c value_type - * - * The following code snippet demonstrates how to use \p count to - * count the number of instances in a range of a value of interest using the \p thrust::device execution policy: - * - * \code - * #include - * #include - * #include - * ... - * // put 3 1s in a device_vector - * thrust::device_vector vec(5,0); - * vec[1] = 1; - * vec[3] = 1; - * vec[4] = 1; - * - * // count the 1s - * int result = thrust::count(thrust::device, vec.begin(), vec.end(), 1); - * // result == 3 - * \endcode - * - * \see http://www.sgi.com/tech/stl/count.html - */ -template - typename thrust::iterator_traits::difference_type - count(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, const EqualityComparable& value); - - - -/*! \p count finds the number of elements in [first,last) that are equal - * to \p value. More precisely, \p count returns the number of iterators \c i in - * [first, last) such that *i == value. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param value The value to be counted. - * \return The number of elements equal to \p value. - * - * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be a model of must be a model of Equality Comparable. - * \tparam EqualityComparable must be a model of Equality Comparable and can be compared for equality with \c InputIterator's \c value_type - * - * The following code snippet demonstrates how to use \p count to - * count the number of instances in a range of a value of interest. - * \code - * #include - * #include - * ... 
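A short sketch of thrust::count as documented further below, again assuming int elements; it returns the iterator difference type, which an int comfortably holds for this small vector.

#include <thrust/count.h>
#include <thrust/device_vector.h>

int main()
{
    // put three 1s in a device_vector of zeros
    thrust::device_vector<int> vec(5, 0);
    vec[1] = 1; vec[3] = 1; vec[4] = 1;

    // count the 1s
    int result = thrust::count(vec.begin(), vec.end(), 1);
    // result == 3
    return 0;
}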
- * // put 3 1s in a device_vector - * thrust::device_vector vec(5,0); - * vec[1] = 1; - * vec[3] = 1; - * vec[4] = 1; - * - * // count the 1s - * int result = thrust::count(vec.begin(), vec.end(), 1); - * // result == 3 - * \endcode - * - * \see http://www.sgi.com/tech/stl/count.html - */ -template - typename thrust::iterator_traits::difference_type - count(InputIterator first, InputIterator last, const EqualityComparable& value); - - -/*! \p count_if finds the number of elements in [first,last) for which - * a predicate is \c true. More precisely, \p count_if returns the number of iterators - * \c i in [first, last) such that pred(*i) == true. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param pred The predicate. - * \return The number of elements where \p pred is \c true. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type. - * \tparam Predicate must be a model of Predicate. - * - * The following code snippet demonstrates how to use \p count to - * count the number of odd numbers in a range using the \p thrust::device execution policy: - * - * \code - * #include - * #include - * #include - * ... - * struct is_odd - * { - * __host__ __device__ - * bool operator()(int &x) - * { - * return x & 1; - * } - * }; - * ... - * // fill a device_vector with even & odd numbers - * thrust::device_vector vec(5); - * vec[0] = 0; - * vec[1] = 1; - * vec[2] = 2; - * vec[3] = 3; - * vec[4] = 4; - * - * // count the odd elements in vec - * int result = thrust::count_if(thrust::device, vec.begin(), vec.end(), is_odd()); - * // result == 2 - * \endcode - * - * \see http://www.sgi.com/tech/stl/count.html - */ -template - typename thrust::iterator_traits::difference_type - count_if(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred); - - -/*! \p count_if finds the number of elements in [first,last) for which - * a predicate is \c true. More precisely, \p count_if returns the number of iterators - * \c i in [first, last) such that pred(*i) == true. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param pred The predicate. - * \return The number of elements where \p pred is \c true. - * - * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type. - * \tparam Predicate must be a model of Predicate. - * - * The following code snippet demonstrates how to use \p count to - * count the number of odd numbers in a range. - * \code - * #include - * #include - * ... - * struct is_odd - * { - * __host__ __device__ - * bool operator()(int &x) - * { - * return x & 1; - * } - * }; - * ... 
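And the predicate form, count_if, with the is_odd functor from the documentation's example; the element type and sample data follow that illustration.

#include <thrust/count.h>
#include <thrust/device_vector.h>

struct is_odd
{
    __host__ __device__
    bool operator()(const int x) const { return (x & 1) != 0; }
};

int main()
{
    // fill a device_vector with even and odd numbers
    thrust::device_vector<int> vec(5);
    vec[0] = 0; vec[1] = 1; vec[2] = 2; vec[3] = 3; vec[4] = 4;

    // count the odd elements
    int result = thrust::count_if(vec.begin(), vec.end(), is_odd());
    // result == 2
    return 0;
}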
- * // fill a device_vector with even & odd numbers - * thrust::device_vector vec(5); - * vec[0] = 0; - * vec[1] = 1; - * vec[2] = 2; - * vec[3] = 3; - * vec[4] = 4; - * - * // count the odd elements in vec - * int result = thrust::count_if(vec.begin(), vec.end(), is_odd()); - * // result == 2 - * \endcode - * - * \see http://www.sgi.com/tech/stl/count.html - */ -template - typename thrust::iterator_traits::difference_type - count_if(InputIterator first, InputIterator last, Predicate pred); - -/*! \} // end counting - * \} // end reductions - */ - -} // end thrust - -#include - diff --git a/compat/thrust/detail/adjacent_difference.inl b/compat/thrust/detail/adjacent_difference.inl deleted file mode 100644 index 6590f9d15d..0000000000 --- a/compat/thrust/detail/adjacent_difference.inl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file adjacent_difference.inl - * \brief Inline file for adjacent_difference.h - */ - -#include -#include -#include -#include - -namespace thrust -{ - - -template -OutputIterator adjacent_difference(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - OutputIterator result) -{ - using thrust::system::detail::generic::adjacent_difference; - - return adjacent_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); -} // end adjacent_difference() - - -template -OutputIterator adjacent_difference(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::adjacent_difference; - - return adjacent_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op); -} // end adjacent_difference() - - -template -OutputIterator adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::adjacent_difference(select_system(system1, system2), first, last, result); -} // end adjacent_difference() - - -template -OutputIterator adjacent_difference(InputIterator first, InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::adjacent_difference(select_system(system1, system2), first, last, result, binary_op); -} // end adjacent_difference() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/advance.inl b/compat/thrust/detail/advance.inl deleted file mode 100644 index 2907be7534..0000000000 --- 
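The adjacent_difference.inl shown above only dispatches to the generic backend, so a usage sketch may help; the int elements and sample values here are assumptions. The first output element is copied through, and every later one is the difference of neighbouring inputs.

#include <thrust/adjacent_difference.h>
#include <thrust/device_vector.h>

int main()
{
    thrust::device_vector<int> d(4);
    d[0] = 1; d[1] = 4; d[2] = 9; d[3] = 16;

    thrust::device_vector<int> result(4);

    // result[0] = d[0]; result[i] = d[i] - d[i-1] for i > 0
    thrust::adjacent_difference(d.begin(), d.end(), result.begin());
    // result is now [1, 3, 5, 7]
    return 0;
}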
a/compat/thrust/detail/advance.inl +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file advance.inl - * \brief Inline file for advance.h - */ - -#include -#include -#include - -namespace thrust -{ - - -template -void advance(InputIterator& i, Distance n) -{ - thrust::system::detail::generic::advance(i, n); -} // end advance() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/allocator/allocator_traits.h b/compat/thrust/detail/allocator/allocator_traits.h deleted file mode 100644 index 6ee99b453f..0000000000 --- a/compat/thrust/detail/allocator/allocator_traits.h +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
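The advance.inl removed above only forwards to the generic implementation. A short usage sketch (element type and values are illustrative):

#include <thrust/advance.h>
#include <thrust/device_vector.h>

int main()
{
  thrust::device_vector<int> vec(10);
  for(int i = 0; i < 10; ++i) vec[i] = i;

  thrust::device_vector<int>::iterator it = vec.begin();
  thrust::advance(it, 7);   // it now refers to vec[7]

  int x = *it;              // x == 7
  return 0;
}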
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace allocator_traits_detail -{ - -__THRUST_DEFINE_HAS_NESTED_TYPE(has_pointer, pointer) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_const_pointer, const_pointer) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_reference, reference) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_const_reference, const_reference) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_void_pointer, void_pointer) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_const_void_pointer, const_void_pointer) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_difference_type, difference_type) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_size_type, size_type) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_copy_assignment, propagate_on_container_copy_assignment) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_move_assignment, propagate_on_container_move_assignment) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_swap, propagate_on_container_swap) -__THRUST_DEFINE_HAS_NESTED_TYPE(has_system_type, system_type) - -template - struct nested_pointer -{ - typedef typename T::pointer type; -}; - -template - struct nested_const_pointer -{ - typedef typename T::const_pointer type; -}; - -template - struct nested_reference -{ - typedef typename T::reference type; -}; - -template - struct nested_const_reference -{ - typedef typename T::const_reference type; -}; - -template - struct nested_void_pointer -{ - typedef typename T::void_pointer type; -}; - -template - struct nested_const_void_pointer -{ - typedef typename T::const_void_pointer type; -}; - -template - struct nested_difference_type -{ - typedef typename T::difference_type type; -}; - -template - struct nested_size_type -{ - typedef typename T::size_type type; -}; - -template - struct nested_propagate_on_container_copy_assignment -{ - typedef typename T::propagate_on_container_copy_assignment type; -}; - -template - struct nested_propagate_on_container_move_assignment -{ - typedef typename T::propagate_on_container_move_assignment type; -}; - -template - struct nested_propagate_on_container_swap -{ - typedef typename T::propagate_on_container_swap type; -}; - -template - struct nested_system_type -{ - typedef typename T::system_type type; -}; - -} // end allocator_traits_detail - - -template - struct allocator_traits -{ - typedef Alloc allocator_type; - - typedef typename allocator_type::value_type value_type; - - typedef typename eval_if< - allocator_traits_detail::has_pointer::value, - allocator_traits_detail::nested_pointer, - identity_ - >::type pointer; - - private: - template - struct rebind_pointer - { - typedef typename pointer_traits::template rebind::other type; - }; - - public: - - typedef typename eval_if< - allocator_traits_detail::has_const_pointer::value, - allocator_traits_detail::nested_const_pointer, - rebind_pointer - >::type const_pointer; - - typedef typename eval_if< - allocator_traits_detail::has_void_pointer::value, - allocator_traits_detail::nested_void_pointer, - rebind_pointer - >::type void_pointer; - - typedef typename eval_if< - allocator_traits_detail::has_const_void_pointer::value, - allocator_traits_detail::nested_const_void_pointer, - rebind_pointer - >::type const_void_pointer; - - typedef typename eval_if< - allocator_traits_detail::has_difference_type::value, - allocator_traits_detail::nested_difference_type, - pointer_difference - >::type difference_type; - - typedef typename eval_if< - allocator_traits_detail::has_size_type::value, - 
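The __THRUST_DEFINE_HAS_NESTED_TYPE invocations above expand to a C++03 member-type detection idiom. A stand-alone sketch of that idiom (names here are illustrative, not Thrust's):

#include <iostream>

typedef char yes_type;
typedef char (&no_type)[2];

template<typename T>
struct has_nested_pointer
{
  // viable only when U::pointer names a type (SFINAE)
  template<typename U> static yes_type test(typename U::pointer*);
  template<typename U> static no_type  test(...);

  static const bool value = sizeof(test<T>(0)) == sizeof(yes_type);
};

struct with_pointer    { typedef int* pointer; };
struct without_pointer {};

int main()
{
  std::cout << has_nested_pointer<with_pointer>::value;     // 1
  std::cout << has_nested_pointer<without_pointer>::value;  // 0
  std::cout << std::endl;
  return 0;
}

allocator_traits then combines traits like this with eval_if: use the allocator's nested type when it exists, otherwise fall back to a computed default.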
allocator_traits_detail::nested_size_type, - make_unsigned - >::type size_type; - - typedef typename eval_if< - allocator_traits_detail::has_propagate_on_container_copy_assignment::value, - allocator_traits_detail::nested_propagate_on_container_copy_assignment, - identity_ - >::type propagate_on_container_copy_assignment; - - typedef typename eval_if< - allocator_traits_detail::has_propagate_on_container_move_assignment::value, - allocator_traits_detail::nested_propagate_on_container_move_assignment, - identity_ - >::type propagate_on_container_move_assignment; - - typedef typename eval_if< - allocator_traits_detail::has_propagate_on_container_swap::value, - allocator_traits_detail::nested_propagate_on_container_swap, - identity_ - >::type propagate_on_container_swap; - - typedef typename eval_if< - allocator_traits_detail::has_system_type::value, - allocator_traits_detail::nested_system_type, - thrust::iterator_system - >::type system_type; - - // XXX rebind and rebind_traits are alias templates - // and so are omitted while c++11 is unavailable - - inline static pointer allocate(allocator_type &a, size_type n); - - inline static pointer allocate(allocator_type &a, size_type n, const_void_pointer hint); - - inline static void deallocate(allocator_type &a, pointer p, size_type n); - - // XXX should probably change T* to pointer below and then relax later - - template - inline __host__ __device__ static void construct(allocator_type &a, T *p); - - template - inline __host__ __device__ static void construct(allocator_type &a, T *p, const Arg1 &arg1); - - template - inline __host__ __device__ static void destroy(allocator_type &a, T *p); - - inline static size_type max_size(const allocator_type &a); -}; // end allocator_traits - - -// XXX consider moving this non-standard functionality inside allocator_traits -template - struct allocator_system -{ - // the type of the allocator's system - typedef typename eval_if< - allocator_traits_detail::has_system_type::value, - allocator_traits_detail::nested_system_type, - thrust::iterator_system< - typename allocator_traits::pointer - > - >::type type; - - inline static type &get(Alloc &a); -}; - - -} // end detail -} // end thrust - -#include - diff --git a/compat/thrust/detail/allocator/allocator_traits.inl b/compat/thrust/detail/allocator/allocator_traits.inl deleted file mode 100644 index 83193355d5..0000000000 --- a/compat/thrust/detail/allocator/allocator_traits.inl +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace allocator_traits_detail -{ - -__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_allocate_with_hint_impl, allocate) - -template - class has_member_allocate_with_hint -{ - typedef typename allocator_traits::pointer pointer; - typedef typename allocator_traits::size_type size_type; - typedef typename allocator_traits::const_void_pointer const_void_pointer; - - public: - typedef typename has_member_allocate_with_hint_impl::type type; - static const bool value = type::value; -}; - -template - typename enable_if< - has_member_allocate_with_hint::value, - typename allocator_traits::pointer - >::type - allocate(Alloc &a, typename allocator_traits::size_type n, typename allocator_traits::const_void_pointer hint) -{ - return a.allocate(n,hint); -} - -template - typename disable_if< - has_member_allocate_with_hint::value, - typename allocator_traits::pointer - >::type - allocate(Alloc &a, typename allocator_traits::size_type n, typename allocator_traits::const_void_pointer) -{ - return a.allocate(n); -} - - -__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_construct1_impl, construct) - -template - struct has_member_construct1 - : has_member_construct1_impl -{}; - -template - inline __host__ __device__ - typename enable_if< - has_member_construct1::value - >::type - construct(Alloc &a, T *p) -{ - a.construct(p); -} - -template - inline __host__ __device__ - typename disable_if< - has_member_construct1::value - >::type - construct(Alloc &a, T *p) -{ - ::new(static_cast(p)) T(); -} - - -__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_construct2_impl, construct) - -template - struct has_member_construct2 - : has_member_construct2_impl -{}; - -template - inline __host__ __device__ - typename enable_if< - has_member_construct2::value - >::type - construct(Alloc &a, T *p, const Arg1 &arg1) -{ - a.construct(p,arg1); -} - -template - inline __host__ __device__ - typename disable_if< - has_member_construct2::value - >::type - construct(Alloc &, T *p, const Arg1 &arg1) -{ - ::new(static_cast(p)) T(arg1); -} - - -__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_destroy_impl, destroy) - -template - struct has_member_destroy - : has_member_destroy_impl -{}; - -template - inline __host__ __device__ - typename enable_if< - has_member_destroy::value - >::type - destroy(Alloc &a, T *p) -{ - a.destroy(p); -} - -template - inline __host__ __device__ - typename disable_if< - has_member_destroy::value - >::type - destroy(Alloc &, T *p) -{ - p->~T(); -} - - -__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_max_size_impl, max_size) - -template - class has_member_max_size -{ - typedef typename allocator_traits::size_type size_type; - - public: - typedef typename has_member_max_size_impl::type type; - static const bool value = type::value; -}; - -template - typename enable_if< - has_member_max_size::value, - typename allocator_traits::size_type - >::type - max_size(const Alloc &a) -{ - return a.max_size(); -} - -template - typename disable_if< - has_member_max_size::value, - typename allocator_traits::size_type - >::type - max_size(const Alloc &a) -{ - typedef typename allocator_traits::size_type size_type; - return std::numeric_limits::max(); -} - -__THRUST_DEFINE_HAS_MEMBER_FUNCTION(has_member_system_impl, system) - -template - class has_member_system -{ - typedef typename allocator_system::type system_type; - - public: - typedef typename has_member_system_impl::type type; - static const bool value = type::value; -}; - -template 
- typename enable_if< - has_member_system::value, - typename allocator_system::type & - >::type - system(Alloc &a) -{ - return a.system(); -} - -template - typename disable_if< - has_member_system::value, - typename allocator_system::type & - >::type - system(Alloc &a) -{ - // assumes the system is default-constructible - static typename allocator_system::type state; - return state; -} - - -} // end allocator_traits_detail - - -template - typename allocator_traits::pointer - allocator_traits - ::allocate(Alloc &a, typename allocator_traits::size_type n) -{ - return a.allocate(n); -} - -template - typename allocator_traits::pointer - allocator_traits - ::allocate(Alloc &a, typename allocator_traits::size_type n, typename allocator_traits::const_void_pointer hint) -{ - return allocator_traits_detail::allocate(a, n, hint); -} - -template - void allocator_traits - ::deallocate(Alloc &a, typename allocator_traits::pointer p, typename allocator_traits::size_type n) -{ - return a.deallocate(p,n); -} - -template - template - void allocator_traits - ::construct(allocator_type &a, T *p) -{ - return allocator_traits_detail::construct(a,p); -} - -template - template - void allocator_traits - ::construct(allocator_type &a, T *p, const Arg1 &arg1) -{ - return allocator_traits_detail::construct(a,p,arg1); -} - -template - template - void allocator_traits - ::destroy(allocator_type &a, T *p) -{ - return allocator_traits_detail::destroy(a,p); -} - -template - typename allocator_traits::size_type - allocator_traits - ::max_size(const allocator_type &a) -{ - return allocator_traits_detail::max_size(a); -} - -template - typename allocator_system::type & - allocator_system - ::get(Alloc &a) -{ - return allocator_traits_detail::system(a); -} - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/allocator/copy_construct_range.h b/compat/thrust/detail/allocator/copy_construct_range.h deleted file mode 100644 index 5d99e1fa14..0000000000 --- a/compat/thrust/detail/allocator/copy_construct_range.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
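The enable_if/disable_if pairs above give Thrust's allocator_traits the same defaulting behaviour that C++11's std::allocator_traits provides: construct falls back to placement new and destroy to an explicit destructor call when the allocator defines neither. A C++11 sketch of that behaviour with a deliberately minimal allocator (the allocator itself is illustrative; the sketch branches via the standard traits, where the removed code emulates this for C++03):

#include <cstdlib>
#include <memory>
#include <string>

// no construct()/destroy() members: the traits class must supply them
template<typename T>
struct bare_allocator
{
  typedef T value_type;

  T* allocate(std::size_t n)         { return static_cast<T*>(std::malloc(n * sizeof(T))); }
  void deallocate(T* p, std::size_t) { std::free(p); }
};

int main()
{
  bare_allocator<std::string> a;
  typedef std::allocator_traits<bare_allocator<std::string> > traits;

  std::string* p = traits::allocate(a, 1);
  traits::construct(a, p, "hello");   // falls back to ::new((void*)p) std::string("hello")
  traits::destroy(a, p);              // falls back to p->~basic_string()
  traits::deallocate(a, p, 1);
  return 0;
}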
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace detail -{ - -template - Pointer copy_construct_range(thrust::execution_policy &from_system, - Allocator &a, - InputIterator first, - InputIterator last, - Pointer result); - -template - Pointer copy_construct_range_n(thrust::execution_policy &from_system, - Allocator &a, - InputIterator first, - Size n, - Pointer result); - -} // end detail -} // end thrust - -#include - diff --git a/compat/thrust/detail/allocator/copy_construct_range.inl b/compat/thrust/detail/allocator/copy_construct_range.inl deleted file mode 100644 index 7c5478b65b..0000000000 --- a/compat/thrust/detail/allocator/copy_construct_range.inl +++ /dev/null @@ -1,298 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace allocator_traits_detail -{ - - -template - struct copy_construct_with_allocator -{ - Allocator &a; - - copy_construct_with_allocator(Allocator &a) - : a(a) - {} - - template - inline __host__ __device__ - void operator()(Tuple t) - { - const InputType &in = thrust::get<0>(t); - OutputType &out = thrust::get<1>(t); - - allocator_traits::construct(a, &out, in); - } -}; - - -template - struct needs_copy_construct_via_allocator - : has_member_construct2< - Allocator, - T, - T - > -{}; - - -// we know that std::allocator::construct's only effect is to call T's -// copy constructor, so we needn't use it for copy construction -template - struct needs_copy_construct_via_allocator, T> - : thrust::detail::false_type -{}; - - -// XXX it's regrettable that this implementation is copied almost -// exactly from system::detail::generic::uninitialized_copy -// perhaps generic::uninitialized_copy could call this routine -// with a default allocator -template - typename enable_if_convertible< - FromSystem, - ToSystem, - Pointer - >::type - uninitialized_copy_with_allocator(Allocator &a, - thrust::execution_policy &from_system, - thrust::execution_policy &to_system, - InputIterator first, - InputIterator last, - Pointer result) -{ - // zip up the iterators - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(first,result)); - ZipIterator end = begin; - - // get a zip_iterator pointing to the end - const typename thrust::iterator_difference::type n = thrust::distance(first,last); - thrust::advance(end,n); - - // create a functor - typedef typename iterator_traits::value_type InputType; - typedef typename iterator_traits::value_type OutputType; - - // do the for_each - // note we use to_system to dispatch the for_each - thrust::for_each(to_system, begin, end, copy_construct_with_allocator(a)); - - // return the end of the output range - return thrust::get<1>(end.get_iterator_tuple()); -} - - -// XXX it's regrettable that this 
implementation is copied almost -// exactly from system::detail::generic::uninitialized_copy_n -// perhaps generic::uninitialized_copy_n could call this routine -// with a default allocator -template - typename enable_if_convertible< - FromSystem, - ToSystem, - Pointer - >::type - uninitialized_copy_with_allocator_n(Allocator &a, - thrust::execution_policy &from_system, - thrust::execution_policy &to_system, - InputIterator first, - Size n, - Pointer result) -{ - // zip up the iterators - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(first,result)); - - // create a functor - typedef typename iterator_traits::value_type InputType; - typedef typename iterator_traits::value_type OutputType; - - // do the for_each_n - // note we use to_system to dispatch the for_each_n - ZipIterator end = thrust::for_each_n(to_system, begin, n, copy_construct_with_allocator(a)); - - // return the end of the output range - return thrust::get<1>(end.get_iterator_tuple()); -} - - -template - typename disable_if_convertible< - FromSystem, - ToSystem, - Pointer - >::type - uninitialized_copy_with_allocator(Allocator &, - thrust::execution_policy &from_system, - thrust::execution_policy &to_system, - InputIterator first, - InputIterator last, - Pointer result) -{ - // the systems aren't trivially interoperable - // just call two_system_copy and hope for the best - return thrust::detail::two_system_copy(from_system, to_system, first, last, result); -} // end uninitialized_copy_with_allocator() - - -template - typename disable_if_convertible< - FromSystem, - ToSystem, - Pointer - >::type - uninitialized_copy_with_allocator_n(Allocator &, - thrust::execution_policy &from_system, - thrust::execution_policy &to_system, - InputIterator first, - Size n, - Pointer result) -{ - // the systems aren't trivially interoperable - // just call two_system_copy_n and hope for the best - return thrust::detail::two_system_copy_n(from_system, to_system, first, n, result); -} // end uninitialized_copy_with_allocator_n() - - -template - typename disable_if< - needs_copy_construct_via_allocator< - Allocator, - typename pointer_element::type - >::value, - Pointer - >::type - copy_construct_range(thrust::execution_policy &from_system, - Allocator &a, - InputIterator first, - InputIterator last, - Pointer result) -{ - typename allocator_system::type &to_system = allocator_system::get(a); - - // just call two_system_copy - return thrust::detail::two_system_copy(from_system, to_system, first, last, result); -} - - -template - typename disable_if< - needs_copy_construct_via_allocator< - Allocator, - typename pointer_element::type - >::value, - Pointer - >::type - copy_construct_range_n(thrust::execution_policy &from_system, - Allocator &a, - InputIterator first, - Size n, - Pointer result) -{ - typename allocator_system::type &to_system = allocator_system::get(a); - - // just call two_system_copy_n - return thrust::detail::two_system_copy_n(from_system, to_system, first, n, result); -} - - -template - typename enable_if< - needs_copy_construct_via_allocator< - Allocator, - typename pointer_element::type - >::value, - Pointer - >::type - copy_construct_range(thrust::execution_policy &from_system, - Allocator &a, - InputIterator first, - InputIterator last, - Pointer result) -{ - typename allocator_system::type &to_system = allocator_system::get(a); - return uninitialized_copy_with_allocator(a, from_system, to_system, first, last, result); -} - 
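uninitialized_copy_with_allocator above zips the input and output ranges together and runs a for_each that constructs each output element from the corresponding input element. A minimal sketch of the same zip_iterator/for_each pattern, doing plain assignment instead of allocator-mediated construction:

#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

struct copy_first_to_second
{
  template<typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);   // out = in
  }
};

int main()
{
  thrust::device_vector<int> in(4, 7);
  thrust::device_vector<int> out(4);

  thrust::for_each(
    thrust::make_zip_iterator(thrust::make_tuple(in.begin(), out.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(in.end(),   out.end())),
    copy_first_to_second());

  // out now holds four 7s
  return 0;
}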
- -template - typename enable_if< - needs_copy_construct_via_allocator< - Allocator, - typename pointer_element::type - >::value, - Pointer - >::type - copy_construct_range_n(thrust::execution_policy &from_system, - Allocator &a, - InputIterator first, - Size n, - Pointer result) -{ - typename allocator_system::type &to_system = allocator_system::get(a); - return uninitialized_copy_with_allocator_n(a, from_system, to_system, first, n, result); -} - - -} // end allocator_traits_detail - - -template - Pointer copy_construct_range(thrust::execution_policy &from_system, - Allocator &a, - InputIterator first, - InputIterator last, - Pointer result) -{ - return allocator_traits_detail::copy_construct_range(from_system, a, first, last, result); -} - - -template - Pointer copy_construct_range_n(thrust::execution_policy &from_system, - Allocator &a, - InputIterator first, - Size n, - Pointer result) -{ - return allocator_traits_detail::copy_construct_range_n(from_system, a, first, n, result); -} - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/allocator/default_construct_range.h b/compat/thrust/detail/allocator/default_construct_range.h deleted file mode 100644 index d83cb31f35..0000000000 --- a/compat/thrust/detail/allocator/default_construct_range.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace detail -{ - - -template -inline void default_construct_range(Allocator &a, Pointer p, Size n); - - -} // end detail -} // end thrust - -#include - - diff --git a/compat/thrust/detail/allocator/default_construct_range.inl b/compat/thrust/detail/allocator/default_construct_range.inl deleted file mode 100644 index 45fe9c69a1..0000000000 --- a/compat/thrust/detail/allocator/default_construct_range.inl +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace allocator_traits_detail -{ - - -template - struct construct1_via_allocator -{ - Allocator &a; - - construct1_via_allocator(Allocator &a) - : a(a) - {} - - template - inline __host__ __device__ - void operator()(T &x) - { - allocator_traits::construct(a, &x); - } -}; - - -template - struct needs_default_construct_via_allocator - : has_member_construct1< - Allocator, - T - > -{}; - - -// we know that std::allocator::construct's only effect is to call T's -// default constructor, so we needn't use it for default construction -template - struct needs_default_construct_via_allocator, T> - : thrust::detail::false_type -{}; - - -template - typename enable_if< - needs_default_construct_via_allocator< - Allocator, - typename pointer_element::type - >::value - >::type - default_construct_range(Allocator &a, Pointer p, Size n) -{ - thrust::for_each_n(allocator_system::get(a), p, n, construct1_via_allocator(a)); -} - - -template - typename disable_if< - needs_default_construct_via_allocator< - Allocator, - typename pointer_element::type - >::value - >::type - default_construct_range(Allocator &a, Pointer p, Size n) -{ - thrust::uninitialized_fill_n(allocator_system::get(a), p, n, typename pointer_element::type()); -} - - -} // end allocator_traits_detail - - -template - void default_construct_range(Allocator &a, Pointer p, Size n) -{ - return allocator_traits_detail::default_construct_range(a,p,n); -} - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/allocator/destroy_range.h b/compat/thrust/detail/allocator/destroy_range.h deleted file mode 100644 index d690a60a79..0000000000 --- a/compat/thrust/detail/allocator/destroy_range.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace detail -{ - -template - inline void destroy_range(Allocator &a, Pointer p, Size n); - -} // end detail -} // end thrust - -#include - diff --git a/compat/thrust/detail/allocator/destroy_range.inl b/compat/thrust/detail/allocator/destroy_range.inl deleted file mode 100644 index ace222356f..0000000000 --- a/compat/thrust/detail/allocator/destroy_range.inl +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
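When the allocator has no one-argument construct member, default_construct_range above falls back to uninitialized_fill_n with a value-initialized element. A sketch of that fallback on raw device storage:

#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/device_ptr.h>
#include <thrust/uninitialized_fill.h>

int main()
{
  const int n = 16;

  // raw, uninitialized device storage
  thrust::device_ptr<int> p = thrust::device_malloc<int>(n);

  // construct n value-initialized ints, mirroring the fallback branch
  thrust::uninitialized_fill_n(p, n, int());

  thrust::device_free(p);
  return 0;
}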
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace allocator_traits_detail -{ - - -// destroy_range has three cases: -// if Allocator has an effectful member function destroy: -// 1. destroy via the allocator -// else -// 2. if T has a non-trivial destructor, destroy the range without using the allocator -// 3. if T has a trivial destructor, do a no-op - -template - struct has_effectful_member_destroy - : has_member_destroy -{}; - -// std::allocator::destroy's only effect is to invoke its argument's destructor -template - struct has_effectful_member_destroy, T> - : thrust::detail::false_type -{}; - -// case 1: Allocator has an effectful 1-argument member function "destroy" -template - struct enable_if_destroy_range_case1 - : thrust::detail::enable_if< - has_effectful_member_destroy< - Allocator, - typename pointer_element::type - >::value - > -{}; - -// case 2: Allocator has no member function "destroy", but T has a non-trivial destructor -template - struct enable_if_destroy_range_case2 - : thrust::detail::enable_if< - !has_effectful_member_destroy< - Allocator, - typename pointer_element::type - >::value && - !has_trivial_destructor< - typename pointer_element::type - >::value - > -{}; - -// case 3: Allocator has no member function "destroy", and T has a trivial destructor -template - struct enable_if_destroy_range_case3 - : thrust::detail::enable_if< - !has_effectful_member_destroy< - Allocator, - typename pointer_element::type - >::value && - has_trivial_destructor< - typename pointer_element::type - >::value - > -{}; - - - -template - struct destroy_via_allocator -{ - Allocator &a; - - destroy_via_allocator(Allocator &a) - : a(a) - {} - - template - inline __host__ __device__ - void operator()(T &x) - { - allocator_traits::destroy(a, &x); - } -}; - - -// destroy_range case 1: destroy via allocator -template - typename enable_if_destroy_range_case1::type - destroy_range(Allocator &a, Pointer p, Size n) -{ - thrust::for_each_n(allocator_system::get(a), p, n, destroy_via_allocator(a)); -} - - -// we must prepare for His coming -struct gozer -{ - template - inline __host__ __device__ - void operator()(T &x) - { - x.~T(); - } -}; - -// destroy_range case 2: destroy without the allocator -template - typename enable_if_destroy_range_case2::type - destroy_range(Allocator &a, Pointer p, Size n) -{ - thrust::for_each_n(allocator_system::get(a), p, n, gozer()); -} - - -// destroy_range case 3: no-op -template - typename enable_if_destroy_range_case3::type - destroy_range(Allocator &, Pointer, Size) -{ - // no op -} - - -} // end allocator_traits_detail - - -template - void destroy_range(Allocator &a, Pointer p, Size n) -{ - return allocator_traits_detail::destroy_range(a,p,n); -} - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/allocator/fill_construct_range.h b/compat/thrust/detail/allocator/fill_construct_range.h deleted file mode 100644 index 66fec416c3..0000000000 --- a/compat/thrust/detail/allocator/fill_construct_range.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
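The three destroy_range cases above boil down to: let the allocator destroy, run the destructor by hand, or do nothing for trivially destructible types. A compact host-side sketch of the last two cases, using C++11 <type_traits> where the removed code uses Thrust's own has_trivial_destructor, and branching at run time where the removed code dispatches at compile time via enable_if:

#include <cstddef>
#include <new>
#include <string>
#include <type_traits>

template<typename T>
void destroy_range_sketch(T* p, std::size_t n)
{
  // trivial destructor: nothing to run (case 3 above); otherwise
  // invoke each destructor explicitly (case 2 / the "gozer" functor)
  if(!std::is_trivially_destructible<T>::value)
  {
    for(std::size_t i = 0; i < n; ++i)
      p[i].~T();
  }
}

int main()
{
  void* raw = ::operator new(2 * sizeof(std::string));
  std::string* s = static_cast<std::string*>(raw);
  ::new(static_cast<void*>(s + 0)) std::string("a");
  ::new(static_cast<void*>(s + 1)) std::string("b");

  destroy_range_sketch(s, 2);   // runs ~string() twice
  ::operator delete(raw);
  return 0;
}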
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace detail -{ - - -template -inline void fill_construct_range(Allocator &a, Pointer p, Size n, const T &value); - - -} // end detail -} // end thrust - -#include - diff --git a/compat/thrust/detail/allocator/fill_construct_range.inl b/compat/thrust/detail/allocator/fill_construct_range.inl deleted file mode 100644 index e2c9c09c80..0000000000 --- a/compat/thrust/detail/allocator/fill_construct_range.inl +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace allocator_traits_detail -{ - -// fill_construct_range has 2 cases: -// if Allocator has an effectful member function construct: -// 1. construct via the allocator -// else -// 2. 
construct via uninitialized_fill - -template - struct has_effectful_member_construct2 - : has_member_construct2 -{}; - -// std::allocator::construct's only effect is to invoke placement new -template - struct has_effectful_member_construct2,T,Arg1> - : thrust::detail::false_type -{}; - - -template - struct construct2_via_allocator -{ - Allocator &a; - Arg1 arg; - - construct2_via_allocator(Allocator &a, const Arg1 &arg) - : a(a), arg(arg) - {} - - template - inline __host__ __device__ - void operator()(T &x) - { - allocator_traits::construct(a, &x, arg); - } -}; - - -template - typename enable_if< - has_effectful_member_construct2< - Allocator, - typename pointer_element::type, - T - >::value - >::type - fill_construct_range(Allocator &a, Pointer p, Size n, const T &value) -{ - thrust::for_each_n(allocator_system::get(a), p, n, construct2_via_allocator(a, value)); -} - - -template - typename disable_if< - has_effectful_member_construct2< - Allocator, - typename pointer_element::type, - T - >::value - >::type - fill_construct_range(Allocator &a, Pointer p, Size n, const T &value) -{ - thrust::uninitialized_fill_n(allocator_system::get(a), p, n, value); -} - - -} // end allocator_traits_detail - - -template - void fill_construct_range(Alloc &a, Pointer p, Size n, const T &value) -{ - return allocator_traits_detail::fill_construct_range(a,p,n,value); -} - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/allocator/malloc_allocator.h b/compat/thrust/detail/allocator/malloc_allocator.h deleted file mode 100644 index cf4567e419..0000000000 --- a/compat/thrust/detail/allocator/malloc_allocator.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -template - class malloc_allocator - : public thrust::detail::tagged_allocator< - T, System, Pointer - > -{ - private: - typedef thrust::detail::tagged_allocator< - T, System, Pointer - > super_t; - - public: - typedef typename super_t::pointer pointer; - typedef typename super_t::size_type size_type; - - pointer allocate(size_type cnt); - - void deallocate(pointer p, size_type n); -}; - -} // end detail -} // end thrust - -#include - diff --git a/compat/thrust/detail/allocator/malloc_allocator.inl b/compat/thrust/detail/allocator/malloc_allocator.inl deleted file mode 100644 index dd70202a04..0000000000 --- a/compat/thrust/detail/allocator/malloc_allocator.inl +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
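fill_construct_range is, presumably, what runs underneath device_vector's fill constructor: either the allocator's two-argument construct per element, or a plain uninitialized_fill_n. The simplest way to exercise that path:

#include <thrust/device_vector.h>

int main()
{
  // fill-constructs 1000 elements with the value 42
  thrust::device_vector<int> v(1000, 42);

  int front = v[0];     // 42
  int back  = v[999];   // 42
  return 0;
}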
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -template - typename malloc_allocator::pointer - malloc_allocator - ::allocate(typename malloc_allocator::size_type cnt) -{ - using thrust::system::detail::generic::select_system; - - // XXX should use a hypothetical thrust::static_pointer_cast here - System system; - - pointer result = thrust::malloc(select_system(system), cnt); - - if(result.get() == 0) - { - throw thrust::system::detail::bad_alloc("malloc_allocator::allocate: malloc failed"); - } // end if - - return result; -} // end malloc_allocator::allocate() - - -template - void malloc_allocator - ::deallocate(typename malloc_allocator::pointer p, typename malloc_allocator::size_type n) -{ - using thrust::system::detail::generic::select_system; - - System system; - thrust::free(select_system(system), p); -} // end malloc_allocator - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/allocator/no_throw_allocator.h b/compat/thrust/detail/allocator/no_throw_allocator.h deleted file mode 100644 index ce397dbdb5..0000000000 --- a/compat/thrust/detail/allocator/no_throw_allocator.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace detail -{ - -template - struct no_throw_allocator : BaseAllocator -{ - private: - typedef BaseAllocator super_t; - - public: - inline no_throw_allocator(const BaseAllocator &other = BaseAllocator()) - : super_t(other) - {} - - template - struct rebind - { - typedef no_throw_allocator::other> other; - }; // end rebind - - void deallocate(typename super_t::pointer p, typename super_t::size_type n) - { - try - { - super_t::deallocate(p, n); - } // end try - catch(...) 
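malloc_allocator above is a thin wrapper over the untyped thrust::malloc / thrust::free entry points, turning a null result into thrust::system::detail::bad_alloc. Using those entry points directly looks roughly like this (header choice follows the Thrust documentation's own example):

#include <cstddef>
#include <thrust/memory.h>

int main()
{
  const std::size_t n = 64;

  // allocate n uninitialized ints from the device backend
  thrust::device_system_tag dev;
  thrust::pointer<int, thrust::device_system_tag> p = thrust::malloc<int>(dev, n);

  // ... use p as raw storage for n ints ...

  thrust::free(dev, p);
  return 0;
}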
- { - // catch anything - } // end catch - } // end deallocate() - - inline bool operator==(no_throw_allocator const &other) { return super_t::operator==(other); } - inline bool operator!=(no_throw_allocator const &other) { return super_t::operator!=(other); } -}; // end no_throw_allocator - -} // end detail -} // end thrust - - diff --git a/compat/thrust/detail/allocator/tagged_allocator.h b/compat/thrust/detail/allocator/tagged_allocator.h deleted file mode 100644 index 3cb87a32bf..0000000000 --- a/compat/thrust/detail/allocator/tagged_allocator.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -template class tagged_allocator; - -template - class tagged_allocator -{ - public: - typedef void value_type; - typedef typename thrust::detail::pointer_traits::template rebind::other pointer; - typedef typename thrust::detail::pointer_traits::template rebind::other const_pointer; - typedef std::size_t size_type; - typedef typename thrust::detail::pointer_traits::difference_type difference_type; - typedef Tag system_type; - - template - struct rebind - { - typedef tagged_allocator other; - }; // end rebind -}; - -template - class tagged_allocator -{ - public: - typedef T value_type; - typedef typename thrust::detail::pointer_traits::template rebind::other pointer; - typedef typename thrust::detail::pointer_traits::template rebind::other const_pointer; - typedef typename thrust::iterator_reference::type reference; - typedef typename thrust::iterator_reference::type const_reference; - typedef std::size_t size_type; - typedef typename thrust::detail::pointer_traits::difference_type difference_type; - typedef Tag system_type; - - template - struct rebind - { - typedef tagged_allocator other; - }; // end rebind - - __host__ __device__ - inline tagged_allocator(); - - __host__ __device__ - inline tagged_allocator(const tagged_allocator &); - - template - __host__ __device__ - inline tagged_allocator(const tagged_allocator &); - - __host__ __device__ - inline ~tagged_allocator(); - - __host__ __device__ - pointer address(reference x) const; - - __host__ __device__ - const_pointer address(const_reference x) const; - - size_type max_size() const; -}; - -template -__host__ __device__ -bool operator==(const tagged_allocator &, const tagged_allocator &); - -template -__host__ __device__ -bool operator!=(const tagged_allocator &, const tagged_allocator &); - -} // end detail -} // end thrust - -#include - diff --git a/compat/thrust/detail/allocator/tagged_allocator.inl b/compat/thrust/detail/allocator/tagged_allocator.inl deleted file mode 100644 index cb362a840d..0000000000 --- a/compat/thrust/detail/allocator/tagged_allocator.inl +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in 
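no_throw_allocator above simply swallows whatever deallocate throws so that container destructors stay exception-free. A stand-alone sketch of the same adaptor over std::allocator (std::allocator::deallocate never actually throws, so the catch here is purely illustrative):

#include <memory>

template<typename BaseAllocator>
struct no_throw_sketch : BaseAllocator
{
  no_throw_sketch(const BaseAllocator& base = BaseAllocator())
    : BaseAllocator(base) {}

  void deallocate(typename BaseAllocator::pointer p,
                  typename BaseAllocator::size_type n)
  {
    try
    {
      BaseAllocator::deallocate(p, n);
    }
    catch(...)
    {
      // swallow everything, as the removed adaptor does
    }
  }
};

int main()
{
  no_throw_sketch<std::allocator<int> > a;
  int* p = a.allocate(4);
  a.deallocate(p, 4);
  return 0;
}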
compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -template - tagged_allocator - ::tagged_allocator() -{} - - -template - tagged_allocator - ::tagged_allocator(const tagged_allocator &) -{} - - -template - template - tagged_allocator - ::tagged_allocator(const tagged_allocator &) -{} - - -template - tagged_allocator - ::~tagged_allocator() -{} - - -template - typename tagged_allocator::pointer - tagged_allocator - ::address(reference x) const -{ - return &x; -} - - -template - typename tagged_allocator::const_pointer - tagged_allocator - ::address(const_reference x) const -{ - return &x; -} - - -template - typename tagged_allocator::size_type - tagged_allocator - ::max_size() const -{ - return (std::numeric_limits::max)() / sizeof(T); -} - - -template -__host__ __device__ -bool operator==(const tagged_allocator &, const tagged_allocator &) -{ - return true; -} - - -template -__host__ __device__ -bool operator!=(const tagged_allocator &, const tagged_allocator &) -{ - return false; -} - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/allocator/temporary_allocator.h b/compat/thrust/detail/allocator/temporary_allocator.h deleted file mode 100644 index f0496f9fe1..0000000000 --- a/compat/thrust/detail/allocator/temporary_allocator.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -// XXX the pointer parameter given to tagged_allocator should be related to -// the type of the expression get_temporary_buffer(system, n).first -// without decltype, compromise on pointer -template - class temporary_allocator - : public thrust::detail::tagged_allocator< - T, System, thrust::pointer - > -{ - private: - typedef thrust::detail::tagged_allocator< - T, System, thrust::pointer - > super_t; - - System &m_system; - - public: - typedef typename super_t::pointer pointer; - typedef typename super_t::size_type size_type; - - inline explicit temporary_allocator(thrust::execution_policy &system) : - super_t(), - m_system(thrust::detail::derived_cast(system)) - {} - - pointer allocate(size_type cnt); - - void deallocate(pointer p, size_type n); - - inline System &system() - { - return m_system; - } // end system() - - private: - typedef thrust::pair pointer_and_size; -}; // end temporary_allocator - - -} // end detail -} // end thrust - -#include - diff --git a/compat/thrust/detail/allocator/temporary_allocator.inl b/compat/thrust/detail/allocator/temporary_allocator.inl deleted file mode 100644 index 63221d57e9..0000000000 --- a/compat/thrust/detail/allocator/temporary_allocator.inl +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -template - typename temporary_allocator::pointer - temporary_allocator - ::allocate(typename temporary_allocator::size_type cnt) -{ - pointer_and_size result = thrust::get_temporary_buffer(system(), cnt); - - // handle failure - if(result.second < cnt) - { - // deallocate and throw - // note that we pass cnt to deallocate, not a value derived from result.second - deallocate(result.first, cnt); - - throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed"); - } // end if - - return result.first; -} // end temporary_allocator::allocate() - - -template - void temporary_allocator - ::deallocate(typename temporary_allocator::pointer p, typename temporary_allocator::size_type n) -{ - return thrust::return_temporary_buffer(system(), p); -} // end temporary_allocator - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/binary_search.inl b/compat/thrust/detail/binary_search.inl deleted file mode 100644 index 0fd799a311..0000000000 --- a/compat/thrust/detail/binary_search.inl +++ /dev/null @@ -1,458 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
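temporary_allocator above sits on top of thrust::get_temporary_buffer / thrust::return_temporary_buffer and treats a short allocation (result.second < n) as failure. Calling that interface directly (the element type and count are illustrative):

#include <cstddef>
#include <thrust/memory.h>
#include <thrust/pair.h>

int main()
{
  const std::ptrdiff_t n = 1000;
  thrust::device_system_tag dev;

  thrust::pair<thrust::pointer<int, thrust::device_system_tag>, std::ptrdiff_t>
    buf = thrust::get_temporary_buffer<int>(dev, n);

  // buf.second is the number of elements actually provided
  if(buf.second >= n)
  {
    // ... use buf.first as scratch storage for n ints ...
  }

  thrust::return_temporary_buffer(dev, buf.first);
  return 0;
}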
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file binary_search.inl - * \brief Inline file for binary_search.h. - */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template -ForwardIterator lower_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable &value) -{ - using thrust::system::detail::generic::lower_bound; - return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); -} - - -template -ForwardIterator lower_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T &value, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::lower_bound; - return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp); -} - - -template -ForwardIterator upper_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable &value) -{ - using thrust::system::detail::generic::upper_bound; - return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); -} - - -template -ForwardIterator upper_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T &value, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::upper_bound; - return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp); -} - - -template -bool binary_search(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value) -{ - using thrust::system::detail::generic::binary_search; - return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); -} - - -template -bool binary_search(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::binary_search; - return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp); -} - - -template -thrust::pair -equal_range(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::equal_range; - return equal_range(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value, comp); -} - - -template -thrust::pair -equal_range(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value) -{ - using thrust::system::detail::generic::equal_range; - return equal_range(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); -} - - -template -OutputIterator lower_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - 
InputIterator values_first, - InputIterator values_last, - OutputIterator output) -{ - using thrust::system::detail::generic::lower_bound; - return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output); -} - - -template -OutputIterator lower_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::lower_bound; - return lower_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output, comp); -} - - -template -OutputIterator upper_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output) -{ - using thrust::system::detail::generic::upper_bound; - return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output); -} - - -template -OutputIterator upper_bound(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::upper_bound; - return upper_bound(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output, comp); -} - - -template -OutputIterator binary_search(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output) -{ - using thrust::system::detail::generic::binary_search; - return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output); -} - - -template -OutputIterator binary_search(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::binary_search; - return binary_search(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, values_first, values_last, output, comp); -} - - -////////////////////// -// Scalar Functions // -////////////////////// - -template -ForwardIterator lower_bound(ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::lower_bound(select_system(system), first, last, value); -} - -template -ForwardIterator lower_bound(ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::lower_bound(select_system(system), first, last, value, comp); -} - -template -ForwardIterator upper_bound(ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return 
thrust::upper_bound(select_system(system), first, last, value); -} - -template -ForwardIterator upper_bound(ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::upper_bound(select_system(system), first, last, value, comp); -} - -template -bool binary_search(ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::binary_search(select_system(system), first, last, value); -} - -template -bool binary_search(ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::binary_search(select_system(system), first, last, value, comp); -} - -template -thrust::pair -equal_range(ForwardIterator first, - ForwardIterator last, - const LessThanComparable& value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::equal_range(select_system(system), first, last, value); -} - -template -thrust::pair -equal_range(ForwardIterator first, - ForwardIterator last, - const T& value, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::equal_range(select_system(system), first, last, value, comp); -} - -////////////////////// -// Vector Functions // -////////////////////// - -template -OutputIterator lower_bound(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::lower_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output); -} - -template -OutputIterator lower_bound(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::lower_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp); -} - -template -OutputIterator upper_bound(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return 
thrust::upper_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output); -} - -template -OutputIterator upper_bound(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::upper_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp); -} - -template -OutputIterator binary_search(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::binary_search(select_system(system1,system2,system3), first, last, values_first, values_last, output); -} - -template -OutputIterator binary_search(ForwardIterator first, - ForwardIterator last, - InputIterator values_first, - InputIterator values_last, - OutputIterator output, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::binary_search(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp); -} - -} // end namespace thrust - diff --git a/compat/thrust/detail/config.h b/compat/thrust/detail/config.h deleted file mode 100644 index d6b6691089..0000000000 --- a/compat/thrust/detail/config.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/*! \file config.h - * \brief Defines platform configuration. - */ - -#pragma once - -#include - diff --git a/compat/thrust/detail/config/compiler.h b/compat/thrust/detail/config/compiler.h deleted file mode 100644 index 90ce911155..0000000000 --- a/compat/thrust/detail/config/compiler.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file compiler.h - * \brief Compiler-specific configuration - */ - -#pragma once - -#ifdef __CUDACC__ - -#include - -// Thrust supports CUDA >= 3.0 -#if CUDA_VERSION < 3000 -#error "CUDA v3.0 or newer is required" -#endif // CUDA_VERSION - -#endif // __CUDACC__ - -// enumerate host compilers we know about -#define THRUST_HOST_COMPILER_UNKNOWN 0 -#define THRUST_HOST_COMPILER_MSVC 1 -#define THRUST_HOST_COMPILER_GCC 2 - -// enumerate host compilers we know about -#define THRUST_DEVICE_COMPILER_UNKNOWN 0 -#define THRUST_DEVICE_COMPILER_MSVC 1 -#define THRUST_DEVICE_COMPILER_GCC 2 -#define THRUST_DEVICE_COMPILER_NVCC 3 - -// figure out which host compiler we're using -// XXX we should move the definition of THRUST_DEPRECATED out of this logic -#if defined(_MSC_VER) -#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_MSVC -#define THRUST_DEPRECATED __declspec(deprecated) -#elif defined(__GNUC__) -#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_GCC -#define THRUST_DEPRECATED __attribute__ ((deprecated)) -#define THRUST_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#else -#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_UNKNOWN -#define THRUST_DEPRECATED -#endif // THRUST_HOST_COMPILER - -// figure out which device compiler we're using -#if defined(__CUDACC__) -#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC -#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC -#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_MSVC -#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC -#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_GCC -#else -#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_UNKNOWN -#endif - -// is the device compiler capable of compiling omp? 
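compiler.h, deleted above, is where THRUST_HOST_COMPILER / THRUST_DEVICE_COMPILER and THRUST_DEPRECATED come from. Code that needs a compiler-specific branch would test those macros roughly as sketched here (the declarations are only placeholders):

// sketch: branching on the macros defined by thrust/detail/config/compiler.h
#include <thrust/detail/config.h>

#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
// MSVC-only workaround would go here
#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
// gcc-specific code; THRUST_GCC_VERSION (e.g. >= 40200) gives the version
#endif

// portable deprecation marker from the same header
THRUST_DEPRECATED void old_entry_point();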
-#ifdef _OPENMP -#define THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE THRUST_TRUE -#else -#define THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE THRUST_FALSE -#endif // _OPENMP - -// disable specific MSVC warnings -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && !defined(__CUDA_ARCH__) -#define __THRUST_DISABLE_MSVC_WARNING_BEGIN(x) \ -__pragma(warning(push)) \ -__pragma(warning(disable : x)) -#define __THRUST_DISABLE_MSVC_WARNING_END(x) \ -__pragma(warning(pop)) -#else -#define __THRUST_DISABLE_MSVC_WARNING_BEGIN(x) -#define __THRUST_DISABLE_MSVC_WARNING_END(x) -#endif -#define __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING(x) \ -__THRUST_DISABLE_MSVC_WARNING_BEGIN(4244 4267) \ -x;\ -__THRUST_DISABLE_MSVC_WARNING_END(4244 4267) -#define __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN \ -__THRUST_DISABLE_MSVC_WARNING_BEGIN(4244 4267) -#define __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END \ -__THRUST_DISABLE_MSVC_WARNING_END(4244 4267) -#define __THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL(x) \ -__THRUST_DISABLE_MSVC_WARNING_BEGIN(4800) \ -x;\ -__THRUST_DISABLE_MSVC_WARNING_END(4800) -#define __THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_BEGIN \ -__THRUST_DISABLE_MSVC_WARNING_BEGIN(4800) -#define __THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_END \ -__THRUST_DISABLE_MSVC_WARNING_END(4800) diff --git a/compat/thrust/detail/config/compiler_fence.h b/compat/thrust/detail/config/compiler_fence.h deleted file mode 100644 index f5cbf98204..0000000000 --- a/compat/thrust/detail/config/compiler_fence.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// msvc case -#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC - -#ifndef _DEBUG - -#include -#pragma intrinsic(_ReadWriteBarrier) -#define __thrust_compiler_fence() _ReadWriteBarrier() -#else - -#define __thrust_compiler_fence() do {} while (0) - -#endif // _DEBUG - -// gcc case -#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC - -#if THRUST_GCC_VERSION >= 40200 // atomic built-ins were introduced ~4.2 -#define __thrust_compiler_fence() __sync_synchronize() -#else -// allow the code to compile without any guarantees -#define __thrust_compiler_fence() do {} while (0) -#endif // THRUST_GCC_VERSION - -// unknown case -#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_UNKNOWN - -// allow the code to compile without any guarantees -#define __thrust_compiler_fence() do {} while (0) - -#endif - diff --git a/compat/thrust/detail/config/config.h b/compat/thrust/detail/config/config.h deleted file mode 100644 index f3498acd14..0000000000 --- a/compat/thrust/detail/config/config.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file config.h - * \brief Defines platform configuration. - */ - -#pragma once - -// XXX the order of these #includes matters - -#include -#include -// host_system.h & device_system.h must be #included as early as possible -// because other config headers depend on it -#include -#include -#include -#include -#include -#include -#include - diff --git a/compat/thrust/detail/config/debug.h b/compat/thrust/detail/config/debug.h deleted file mode 100644 index 56c1bad207..0000000000 --- a/compat/thrust/detail/config/debug.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#ifndef THRUST_DEBUG -# ifndef NDEBUG -# if (DEBUG || _DEBUG) -# define THRUST_DEBUG 1 -# endif // (DEBUG || _DEBUG) -# endif // NDEBUG -#endif // THRUST_DEBUG - -#if THRUST_DEBUG -# ifndef __THRUST_SYNCHRONOUS -# define __THRUST_SYNCHRONOUS 1 -# endif // __THRUST_SYNCHRONOUS -#endif // THRUST_DEBUG - diff --git a/compat/thrust/detail/config/device_system.h b/compat/thrust/detail/config/device_system.h deleted file mode 100644 index a104906753..0000000000 --- a/compat/thrust/detail/config/device_system.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -// reserve 0 for undefined -#define THRUST_DEVICE_SYSTEM_CUDA 1 -#define THRUST_DEVICE_SYSTEM_OMP 2 -#define THRUST_DEVICE_SYSTEM_TBB 3 -#define THRUST_DEVICE_SYSTEM_CPP 4 - -#ifndef THRUST_DEVICE_SYSTEM -#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_CUDA -#endif // THRUST_DEVICE_SYSTEM - -// XXX make the use of THRUST_DEVICE_BACKEND an error in Thrust 1.7 -// XXX eliminate the following in Thrust 1.7 - -#define THRUST_DEVICE_BACKEND_CUDA THRUST_DEVICE_SYSTEM_CUDA -#define THRUST_DEVICE_BACKEND_OMP THRUST_DEVICE_SYSTEM_OMP -#define THRUST_DEVICE_BACKEND_TBB THRUST_DEVICE_SYSTEM_TBB - -#ifdef THRUST_DEVICE_BACKEND -# if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC -# pragma message("----------------------------------------------------------------------------------") -# pragma message("| WARNING: THRUST_DEVICE_BACKEND is deprecated; use THRUST_DEVICE_SYSTEM instead |") -# pragma message("----------------------------------------------------------------------------------") -# else -# warning ---------------------------------------------------------------------------------- -# warning | WARNING: THRUST_DEVICE_BACKEND is deprecated; use THRUST_DEVICE_SYSTEM instead | -# warning ---------------------------------------------------------------------------------- -# endif // THRUST_HOST_COMPILER -# undef THRUST_DEVICE_SYSTEM -# define THRUST_DEVICE_SYSTEM THRUST_DEVICE_BACKEND -#endif // THRUST_DEVICE_BACKEND - -#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA -#define __THRUST_DEVICE_SYSTEM_NAMESPACE cuda -#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP -#define __THRUST_DEVICE_SYSTEM_NAMESPACE omp -#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB -#define __THRUST_DEVICE_SYSTEM_NAMESPACE tbb -#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP -#define __THRUST_DEVICE_SYSTEM_NAMESPACE cpp -#endif - -#define __THRUST_DEVICE_SYSTEM_ROOT thrust/system/__THRUST_DEVICE_SYSTEM_NAMESPACE - diff --git a/compat/thrust/detail/config/forceinline.h b/compat/thrust/detail/config/forceinline.h deleted file mode 100644 index 620769b999..0000000000 --- a/compat/thrust/detail/config/forceinline.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file forceinline.h - * \brief Defines __thrust_forceinline__ - */ - -#pragma once - -#include - -#if defined(__CUDACC__) - -#define __thrust_forceinline__ __forceinline__ - -#else - -// TODO add - -#define __thrust_forceinline__ - -#endif - diff --git a/compat/thrust/detail/config/hd_warning_disable.h b/compat/thrust/detail/config/hd_warning_disable.h deleted file mode 100644 index b993ef2828..0000000000 --- a/compat/thrust/detail/config/hd_warning_disable.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
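device_system.h above (and host_system.h a few hunks below) define the backend-selection macros; since the build now relies on the toolkit's Thrust, the same selection still works through compile-time definitions. A sketch, with the OpenMP value only as an example:

// sketch: selecting the Thrust backends at compile time
// (typically passed on the command line, e.g.
//   nvcc -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP -Xcompiler -fopenmp ...)
#include <thrust/detail/config.h>

#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
// device algorithms dispatch to the CUDA backend (the default)
#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
// device algorithms run on the host through OpenMP instead
#endif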
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file hd_warning_disable.h - * \brief Defines __thrust_hd_warning_disable__ - */ - -#pragma once - -#include - -#if defined(__CUDACC__) - -#define __thrust_hd_warning_disable__ \ -#pragma hd_warning_disable -#else - -#define __thrust_hd_warning_disable__ - -#endif - - diff --git a/compat/thrust/detail/config/host_device.h b/compat/thrust/detail/config/host_device.h deleted file mode 100644 index 5d0975d106..0000000000 --- a/compat/thrust/detail/config/host_device.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file host_device.h - * \brief Defines __host__ and __device__ and other CUDA-isms - */ - -#pragma once - -#include - -#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - -#include - -#else - -// since __host__ & __device__ might have already be defined, only -// #define them if not defined already -// XXX this will break if the client does #include later - -#ifndef __host__ -#define __host__ -#endif // __host__ - -#ifndef __device__ -#define __device__ -#endif // __device__ - -#endif - diff --git a/compat/thrust/detail/config/host_system.h b/compat/thrust/detail/config/host_system.h deleted file mode 100644 index fb8edabc55..0000000000 --- a/compat/thrust/detail/config/host_system.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
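host_device.h above only ensures __host__ and __device__ expand to nothing when CUDA is absent, which is what lets functors be written once for both backends. A minimal sketch of such a functor (name and operation are illustrative):

// sketch: a functor usable by both host and device algorithms
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct scale_by
{
    float factor;

    __host__ __device__
    float operator()(float x) const { return x * factor; }
};

void scale_demo(thrust::device_vector<float> &v)
{
    scale_by op;
    op.factor = 2.0f;
    // multiply every element in place
    thrust::transform(v.begin(), v.end(), v.begin(), op);
}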
- */ - -#pragma once - -// reserve 0 for undefined -#define THRUST_HOST_SYSTEM_CPP 1 -#define THRUST_HOST_SYSTEM_OMP 2 -#define THRUST_HOST_SYSTEM_TBB 3 - -#ifndef THRUST_HOST_SYSTEM -#define THRUST_HOST_SYSTEM THRUST_HOST_SYSTEM_CPP -#endif // THRUST_HOST_SYSTEM - -// XXX make the use of THRUST_HOST_BACKEND an error in Thrust 1.7 -// XXX eliminate the following in Thrust 1.7 - -#define THRUST_HOST_BACKEND_CPP THRUST_HOST_SYSTEM_CPP -#define THRUST_HOST_BACKEND_OMP THRUST_HOST_SYSTEM_OMP -#define THRUST_HOST_BACKEND_TBB THRUST_HOST_SYSTEM_TBB - -#ifdef THRUST_HOST_BACKEND -# if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC -# pragma message("------------------------------------------------------------------------------") -# pragma message("| WARNING: THRUST_HOST_BACKEND is deprecated; use THRUST_HOST_SYSTEM instead |") -# pragma message("------------------------------------------------------------------------------") -# else -# warning ------------------------------------------------------------------------------ -# warning | WARNING: THRUST_HOST_BACKEND is deprecated; use THRUST_HOST_SYSTEM instead | -# warning ------------------------------------------------------------------------------ -# endif // THRUST_HOST_COMPILER -# undef THRUST_HOST_SYSTEM -# define THRUST_HOST_SYSTEM THRUST_HOST_BACKEND -#endif // THRUST_HOST_BACKEND - -#if THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP -#define __THRUST_HOST_SYSTEM_NAMESPACE cpp -#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_OMP -#define __THRUST_HOST_SYSTEM_NAMESPACE omp -#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_TBB -#define __THRUST_HOST_SYSTEM_NAMESPACE tbb -#endif - -#define __THRUST_HOST_SYSTEM_ROOT thrust/system/__THRUST_HOST_SYSTEM_NAMESPACE - diff --git a/compat/thrust/detail/config/simple_defines.h b/compat/thrust/detail/config/simple_defines.h deleted file mode 100644 index f9510ee9ce..0000000000 --- a/compat/thrust/detail/config/simple_defines.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file simple_defines.h - * \brief Primitive macros without dependencies. - */ - -#pragma once - -#define THRUST_UNKNOWN 0 -#define THRUST_FALSE 0 -#define THRUST_TRUE 1 - -#define THRUST_PREVENT_MACRO_SUBSTITUTION - diff --git a/compat/thrust/detail/contiguous_storage.h b/compat/thrust/detail/contiguous_storage.h deleted file mode 100644 index fe72bce2d2..0000000000 --- a/compat/thrust/detail/contiguous_storage.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - -namespace detail -{ - -// XXX parameter T is redundant with parameter Alloc -template - class contiguous_storage -{ - private: - typedef thrust::detail::allocator_traits alloc_traits; - - public: - typedef Alloc allocator_type; - typedef T value_type; - typedef typename alloc_traits::pointer pointer; - typedef typename alloc_traits::const_pointer const_pointer; - typedef typename alloc_traits::size_type size_type; - typedef typename alloc_traits::difference_type difference_type; - - // XXX we should bring reference & const_reference into allocator_traits - // at the moment, it's unclear how -- we have nothing analogous to - // rebind_pointer for references - // we either need to add reference_traits or extend the existing - // pointer_traits to support wrapped references - typedef typename Alloc::reference reference; - typedef typename Alloc::const_reference const_reference; - - typedef thrust::detail::normal_iterator iterator; - typedef thrust::detail::normal_iterator const_iterator; - - explicit contiguous_storage(const allocator_type &alloc = allocator_type()); - - explicit contiguous_storage(size_type n, const allocator_type &alloc = allocator_type()); - - ~contiguous_storage(void); - - size_type size(void) const; - - size_type max_size(void) const; - - iterator begin(void); - - const_iterator begin(void) const; - - iterator end(void); - - const_iterator end(void) const; - - reference operator[](size_type n); - - const_reference operator[](size_type n) const; - - allocator_type get_allocator(void) const; - - // note that allocate does *not* automatically call deallocate - void allocate(size_type n); - - void deallocate(void); - - void swap(contiguous_storage &x); - - void default_construct_n(iterator first, size_type n); - - void uninitialized_fill_n(iterator first, size_type n, const value_type &value); - - template - iterator uninitialized_copy(InputIterator first, InputIterator last, iterator result); - - template - iterator uninitialized_copy(thrust::execution_policy &from_system, - InputIterator first, - InputIterator last, - iterator result); - - template - iterator uninitialized_copy_n(InputIterator first, Size n, iterator result); - - template - iterator uninitialized_copy_n(thrust::execution_policy &from_system, - InputIterator first, - Size n, - iterator result); - - void destroy(iterator first, iterator last); - - private: - // XXX we could inherit from this to take advantage of empty base class optimization - allocator_type m_allocator; - - iterator m_begin; - - size_type m_size; - - // disallow assignment - contiguous_storage &operator=(const contiguous_storage &x); -}; // end contiguous_storage - -} // end detail - -template void swap(detail::contiguous_storage &lhs, detail::contiguous_storage &rhs); - -} // end thrust - -#include - diff --git a/compat/thrust/detail/contiguous_storage.inl b/compat/thrust/detail/contiguous_storage.inl deleted file mode 100644 index 7e26c26b1a..0000000000 --- a/compat/thrust/detail/contiguous_storage.inl +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include // for use of std::swap in the WAR below - -namespace thrust -{ - -namespace detail -{ - -template - contiguous_storage - ::contiguous_storage(const Alloc &alloc) - :m_allocator(alloc), - m_begin(pointer(static_cast(0))), - m_size(0) -{ - ; -} // end contiguous_storage::contiguous_storage() - -template - contiguous_storage - ::contiguous_storage(size_type n, const Alloc &alloc) - :m_allocator(alloc), - m_begin(pointer(static_cast(0))), - m_size(0) -{ - allocate(n); -} // end contiguous_storage::contiguous_storage() - -template - contiguous_storage - ::~contiguous_storage(void) -{ - deallocate(); -} // end contiguous_storage::~contiguous_storage() - -template - typename contiguous_storage::size_type - contiguous_storage - ::size(void) const -{ - return m_size; -} // end contiguous_storage::size() - -template - typename contiguous_storage::size_type - contiguous_storage - ::max_size(void) const -{ - return alloc_traits::max_size(m_allocator); -} // end contiguous_storage::max_size() - -template - typename contiguous_storage::iterator - contiguous_storage - ::begin(void) -{ - return m_begin; -} // end contiguous_storage::begin() - -template - typename contiguous_storage::const_iterator - contiguous_storage - ::begin(void) const -{ - return m_begin; -} // end contiguous_storage::begin() - -template - typename contiguous_storage::iterator - contiguous_storage - ::end(void) -{ - return m_begin + size(); -} // end contiguous_storage::end() - -template - typename contiguous_storage::const_iterator - contiguous_storage - ::end(void) const -{ - return m_begin + size(); -} // end contiguous_storage::end() - -template - typename contiguous_storage::reference - contiguous_storage - ::operator[](size_type n) -{ - return m_begin[n]; -} // end contiguous_storage::operator[]() - -template - typename contiguous_storage::const_reference - contiguous_storage - ::operator[](size_type n) const -{ - return m_begin[n]; -} // end contiguous_storage::operator[]() - -template - typename contiguous_storage::allocator_type - contiguous_storage - ::get_allocator(void) const -{ - return m_allocator; -} // end contiguous_storage::get_allocator() - -template - void contiguous_storage - ::allocate(size_type n) -{ - if(n > 0) - { - m_begin = iterator(m_allocator.allocate(n)); - m_size = n; - } // end if - else - { - m_begin = iterator(pointer(static_cast(0))); - m_size = 0; - } // end else -} // end contiguous_storage::allocate() - -template - void contiguous_storage - ::deallocate(void) -{ - if(size() > 0) - { - m_allocator.deallocate(m_begin.base(), size()); - m_begin = iterator(pointer(static_cast(0))); - m_size = 0; - } // end if -} // end contiguous_storage::deallocate() - -template - void contiguous_storage - ::swap(contiguous_storage &x) -{ - thrust::swap(m_begin, x.m_begin); - thrust::swap(m_size, x.m_size); - - // XXX WAR nvcc 4.0's "calling a __host__ function from a __host__ __device__ function is not allowed" warning - //thrust::swap(m_allocator, x.m_allocator); - std::swap(m_allocator, 
x.m_allocator); -} // end contiguous_storage::swap() - -template - void contiguous_storage - ::default_construct_n(iterator first, size_type n) -{ - default_construct_range(m_allocator, first.base(), n); -} // end contiguous_storage::default_construct_n() - -template - void contiguous_storage - ::uninitialized_fill_n(iterator first, size_type n, const value_type &x) -{ - fill_construct_range(m_allocator, first.base(), n, x); -} // end contiguous_storage::uninitialized_fill() - -template - template - typename contiguous_storage::iterator - contiguous_storage - ::uninitialized_copy(thrust::execution_policy &from_system, InputIterator first, InputIterator last, iterator result) -{ - return iterator(copy_construct_range(from_system, m_allocator, first, last, result.base())); -} // end contiguous_storage::uninitialized_copy() - -template - template - typename contiguous_storage::iterator - contiguous_storage - ::uninitialized_copy(InputIterator first, InputIterator last, iterator result) -{ - // XXX assumes InputIterator's associated System is default-constructible - typename thrust::iterator_system::type from_system; - - return iterator(copy_construct_range(from_system, m_allocator, first, last, result.base())); -} // end contiguous_storage::uninitialized_copy() - -template - template - typename contiguous_storage::iterator - contiguous_storage - ::uninitialized_copy_n(thrust::execution_policy &from_system, InputIterator first, Size n, iterator result) -{ - return iterator(copy_construct_range_n(from_system, m_allocator, first, n, result.base())); -} // end contiguous_storage::uninitialized_copy_n() - -template - template - typename contiguous_storage::iterator - contiguous_storage - ::uninitialized_copy_n(InputIterator first, Size n, iterator result) -{ - // XXX assumes InputIterator's associated System is default-constructible - typename thrust::iterator_system::type from_system; - - return iterator(copy_construct_range_n(from_system, m_allocator, first, n, result.base())); -} // end contiguous_storage::uninitialized_copy_n() - -template - void contiguous_storage - ::destroy(iterator first, iterator last) -{ - destroy_range(m_allocator, first.base(), last - first); -} // end contiguous_storage::destroy() - -} // end detail - -template - void swap(detail::contiguous_storage &lhs, detail::contiguous_storage &rhs) -{ - lhs.swap(rhs); -} // end swap() - -} // end thrust - diff --git a/compat/thrust/detail/copy.h b/compat/thrust/detail/copy.h deleted file mode 100644 index 8ed3abd219..0000000000 --- a/compat/thrust/detail/copy.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ - -template - OutputIterator copy(const thrust::detail::execution_policy_base &system, - InputIterator first, - InputIterator last, - OutputIterator result); - -template - OutputIterator copy_n(const thrust::detail::execution_policy_base &system, - InputIterator first, - Size n, - OutputIterator result); - -template - OutputIterator copy(InputIterator first, - InputIterator last, - OutputIterator result); - -template - OutputIterator copy_n(InputIterator first, - Size n, - OutputIterator result); - - -namespace detail -{ - - -template - OutputIterator two_system_copy(thrust::execution_policy &from_system, - thrust::execution_policy &two_system, - InputIterator first, - InputIterator last, - OutputIterator result); - - -template - OutputIterator two_system_copy_n(thrust::execution_policy &from_system, - thrust::execution_policy &two_system, - InputIterator first, - Size n, - OutputIterator result); - - -} // end detail -} // end thrust - -#include - diff --git a/compat/thrust/detail/copy.inl b/compat/thrust/detail/copy.inl deleted file mode 100644 index 9ac48074a0..0000000000 --- a/compat/thrust/detail/copy.inl +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - OutputIterator copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - using thrust::system::detail::generic::copy; - return copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); -} // end copy() - - -template - OutputIterator copy_n(const thrust::detail::execution_policy_base &exec, - InputIterator first, - Size n, - OutputIterator result) -{ - using thrust::system::detail::generic::copy_n; - return copy_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, result); -} // end copy_n() - - -namespace detail -{ - - -template - OutputIterator two_system_copy(thrust::execution_policy &system1, - thrust::execution_policy &system2, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - return thrust::copy(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(system1)), thrust::detail::derived_cast(thrust::detail::strip_const(system2))), first, last, result); -} // end two_system_copy() - - -template - OutputIterator two_system_copy_n(thrust::execution_policy &system1, - thrust::execution_policy &system2, - InputIterator first, - Size n, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - return thrust::copy_n(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(system1)), thrust::detail::derived_cast(thrust::detail::strip_const(system2))), first, n, result); -} // end two_system_copy_n() - - -} // end detail - - -template - OutputIterator copy(InputIterator first, - InputIterator last, - OutputIterator result) -{ - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::detail::two_system_copy(system1, system2, first, last, result); -} // end copy() - - -template - OutputIterator copy_n(InputIterator first, - Size n, - OutputIterator result) -{ - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::detail::two_system_copy_n(system1, system2, first, n, result); -} // end copy_n() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/copy_if.h b/compat/thrust/detail/copy_if.h deleted file mode 100644 index 54e1ef4027..0000000000 --- a/compat/thrust/detail/copy_if.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
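copy.h / copy.inl above are only dispatch plumbing; the user-facing behaviour, including copies that cross the host/device boundary, is unchanged with the toolkit headers. A sketch with illustrative container contents:

// sketch: thrust::copy / thrust::copy_n, including a host-to-device copy
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

void copy_demo()
{
    thrust::host_vector<int>   h(8, 1);   // eight ones on the host
    thrust::device_vector<int> d(8, 0);

    // the two-system overload picks the right backend pair (here: host to device)
    thrust::copy(h.begin(), h.end(), d.begin());

    // copy_n takes an element count instead of an end iterator
    thrust::copy_n(d.begin(), 4, h.begin());
}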
- */ - -#pragma once - -#include -#include - -namespace thrust -{ - -template - OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred); - -template - OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - -template - OutputIterator copy_if(InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred); - -template - OutputIterator copy_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - -} // end thrust - -#include - diff --git a/compat/thrust/detail/copy_if.inl b/compat/thrust/detail/copy_if.inl deleted file mode 100644 index e443bb7d0b..0000000000 --- a/compat/thrust/detail/copy_if.inl +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred) -{ - using thrust::system::detail::generic::copy_if; - return copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, pred); -} // end copy_if() - - -template - OutputIterator copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - using thrust::system::detail::generic::copy_if; - return copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, pred); -} // end copy_if() - - -template - OutputIterator copy_if(InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::copy_if(select_system(system1,system2), first, last, result, pred); -} // end copy_if() - - -template - OutputIterator copy_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred); -} // end copy_if() - - -} // end thrust - diff --git a/compat/thrust/detail/count.inl b/compat/thrust/detail/count.inl deleted 
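copy_if.h / copy_if.inl removed here provide both the plain and the stencil overloads; the stencil form is the one typically used for stream compaction in CUDA code. A sketch with an illustrative predicate:

// sketch: stream compaction with the stencil overload of thrust::copy_if
#include <thrust/copy.h>
#include <thrust/device_vector.h>

struct is_nonzero
{
    __host__ __device__
    bool operator()(int flag) const { return flag != 0; }
};

void compaction_demo(const thrust::device_vector<int> &values,
                     const thrust::device_vector<int> &flags,
                     thrust::device_vector<int>       &out)
{
    // keep values[i] wherever flags[i] is non-zero
    thrust::device_vector<int>::iterator out_end =
        thrust::copy_if(values.begin(), values.end(),
                        flags.begin(),          // stencil range
                        out.begin(),
                        is_nonzero());
    (void)out_end; // out_end marks the end of the compacted range
}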
file mode 100644 index d2856ae1ce..0000000000 --- a/compat/thrust/detail/count.inl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file count.inl - * \brief Inline file for count.h. - */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - typename thrust::iterator_traits::difference_type - count(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, const EqualityComparable& value) -{ - using thrust::system::detail::generic::count; - return count(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); -} // end count() - - -template - typename thrust::iterator_traits::difference_type - count_if(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred) -{ - using thrust::system::detail::generic::count_if; - return count_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end count_if() - - -template -typename thrust::iterator_traits::difference_type -count(InputIterator first, InputIterator last, const EqualityComparable& value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::count(select_system(system), first, last, value); -} // end count() - - -template -typename thrust::iterator_traits::difference_type -count_if(InputIterator first, InputIterator last, Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::count_if(select_system(system), first, last, pred); -} // end count_if() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/cstdint.h b/compat/thrust/detail/cstdint.h deleted file mode 100644 index 25d30fd5b2..0000000000 --- a/compat/thrust/detail/cstdint.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
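count.inl above forwards to the generic backend; the public calls look like the sketch below (data and predicate are illustrative):

// sketch: thrust::count and thrust::count_if on device data
#include <cstddef>
#include <thrust/count.h>
#include <thrust/device_vector.h>

struct is_even
{
    __host__ __device__
    bool operator()(int x) const { return (x & 1) == 0; }
};

void count_demo(const thrust::device_vector<int> &v)
{
    // number of elements equal to zero
    std::ptrdiff_t zeros = thrust::count(v.begin(), v.end(), 0);

    // number of elements matching a predicate
    std::ptrdiff_t evens = thrust::count_if(v.begin(), v.end(), is_even());
    (void)zeros; (void)evens;
}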
- */ - -#pragma once - -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) -#include -#endif - -namespace thrust -{ -namespace detail -{ - -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) - -#if (_MSC_VER < 1300) - typedef signed char int8_t; - typedef signed short int16_t; - typedef signed int int32_t; - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; -#else - typedef signed __int8 int8_t; - typedef signed __int16 int16_t; - typedef signed __int32 int32_t; - typedef unsigned __int8 uint8_t; - typedef unsigned __int16 uint16_t; - typedef unsigned __int32 uint32_t; -#endif -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; - -#else - -typedef ::int8_t int8_t; -typedef ::int16_t int16_t; -typedef ::int32_t int32_t; -typedef ::int64_t int64_t; -typedef ::uint8_t uint8_t; -typedef ::uint16_t uint16_t; -typedef ::uint32_t uint32_t; -typedef ::uint64_t uint64_t; - -#endif - - -// an oracle to tell us how to define intptr_t -template struct divine_intptr_t; -template struct divine_uintptr_t; - -// 32b platforms -template<> struct divine_intptr_t<4> { typedef thrust::detail::int32_t type; }; -template<> struct divine_uintptr_t<4> { typedef thrust::detail::uint32_t type; }; - -// 64b platforms -template<> struct divine_intptr_t<8> { typedef thrust::detail::int64_t type; }; -template<> struct divine_uintptr_t<8> { typedef thrust::detail::uint64_t type; }; - -typedef divine_intptr_t<>::type intptr_t; -typedef divine_uintptr_t<>::type uintptr_t; - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/device_delete.inl b/compat/thrust/detail/device_delete.inl deleted file mode 100644 index dd70d76891..0000000000 --- a/compat/thrust/detail/device_delete.inl +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_delete.inl - * \brief Inline file for device_delete.h. - */ - -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -// define an empty allocator class to use below -struct device_delete_allocator {}; - -} - -template - void device_delete(device_ptr ptr, - const size_t n) -{ - // we can use device_allocator to destroy the range - thrust::detail::device_delete_allocator a; - thrust::detail::destroy_range(a, ptr, n); - thrust::device_free(ptr); -} // end device_delete() - -} // end thrust - diff --git a/compat/thrust/detail/device_free.inl b/compat/thrust/detail/device_free.inl deleted file mode 100644 index ab8db9f09c..0000000000 --- a/compat/thrust/detail/device_free.inl +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_free.inl - * \brief Inline file for device_free.h. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - -void device_free(thrust::device_ptr ptr) -{ - using thrust::system::detail::generic::select_system; - - typedef thrust::iterator_system< thrust::device_ptr >::type system; - - // XXX lower to select_system(system) here - system s; - - thrust::free(s, ptr); -} // end device_free() - -} // end thrust - diff --git a/compat/thrust/detail/device_malloc.inl b/compat/thrust/detail/device_malloc.inl deleted file mode 100644 index 76d0029993..0000000000 --- a/compat/thrust/detail/device_malloc.inl +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_malloc.inl - * \brief Inline file for device_malloc.h. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -thrust::device_ptr device_malloc(const std::size_t n) -{ - using thrust::system::detail::generic::select_system; - - typedef thrust::iterator_system< thrust::device_ptr >::type system; - - // XXX lower to select_system(system) here - system s; - - return thrust::device_ptr(thrust::malloc(s, n).get()); -} // end device_malloc() - - -template - thrust::device_ptr device_malloc(const std::size_t n) -{ - using thrust::system::detail::generic::select_system; - - typedef thrust::iterator_system< thrust::device_ptr >::type system; - - // XXX lower to select_system(system) here - system s; - - return thrust::device_ptr(thrust::malloc(s,n).get()); -} // end device_malloc() - - -} // end thrust - diff --git a/compat/thrust/detail/device_new.inl b/compat/thrust/detail/device_new.inl deleted file mode 100644 index 1f00a97a94..0000000000 --- a/compat/thrust/detail/device_new.inl +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_new.inl - * \brief Inline file for device_new.h. 
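device_free.inl and device_malloc.inl above implement the typed device-heap helpers; the public usage pattern stays the same with the toolkit's Thrust. A sketch (the element type and size are illustrative):

// sketch: raw device allocations via thrust::device_malloc / thrust::device_free
#include <cstddef>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/uninitialized_fill.h>

void device_heap_demo()
{
    const std::size_t n = 1024;

    // typed allocation: returns a device_ptr<unsigned int>
    thrust::device_ptr<unsigned int> buf = thrust::device_malloc<unsigned int>(n);

    // device_malloc does not construct elements, so initialise them explicitly
    thrust::uninitialized_fill(buf, buf + n, 0u);

    thrust::device_free(buf);
}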
- */ - -#include -#include -#include - -namespace thrust -{ - -template - device_ptr device_new(device_ptr p, - const size_t n) -{ - // XXX TODO dispatch n null device constructors at p here - // in the meantime, dispatch 1 null host constructor here - // and dispatch n copy constructors - return device_new(p, T(), n); -} // end device_new() - -template - device_ptr device_new(device_ptr p, - const T &exemplar, - const size_t n) -{ - device_ptr result(reinterpret_cast(p.get())); - - // run copy constructors at p here - thrust::uninitialized_fill(result, result + n, exemplar); - - return result; -} // end device_new() - -template - device_ptr device_new(const size_t n) -{ - // call placement new - return device_new(thrust::device_malloc(n)); -} // end device_new() - -} // thrust - diff --git a/compat/thrust/detail/device_ptr.inl b/compat/thrust/detail/device_ptr.inl deleted file mode 100644 index 0afe8a19c1..0000000000 --- a/compat/thrust/detail/device_ptr.inl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_ptr.inl - * \brief Inline file for device_ptr.h. - */ - -#include -#include -#include - -#include -#include - -namespace thrust -{ - -template - device_ptr device_pointer_cast(T *ptr) -{ - return device_ptr(ptr); -} // end device_pointer_cast() - -template - device_ptr device_pointer_cast(const device_ptr &ptr) -{ - return ptr; -} // end device_pointer_cast() - -// output to ostream -template - std::basic_ostream &operator<<(std::basic_ostream &os, const device_ptr &p) -{ - return os << p.get(); -} // end operator<<() - - -namespace detail -{ - -template - struct is_device_ptr< thrust::device_ptr > - : public true_type -{ -}; // end is_device_ptr - -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) -// XXX WAR MSVC 2005 problem with correctly implementing -// pointer_raw_pointer for device_ptr by specializing it here -template - struct pointer_raw_pointer< thrust::device_ptr > -{ - typedef typename device_ptr::raw_pointer type; -}; // end pointer_raw_pointer -#endif - - -} // end namespace detail -} // end namespace thrust - diff --git a/compat/thrust/detail/device_reference.inl b/compat/thrust/detail/device_reference.inl deleted file mode 100644 index ad5cb76688..0000000000 --- a/compat/thrust/detail/device_reference.inl +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
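device_ptr.inl above carries device_pointer_cast, which is how raw CUDA allocations can be handed to Thrust algorithms. A sketch, with error handling omitted and an illustrative buffer size:

// sketch: wrapping a raw cudaMalloc'd pointer so Thrust algorithms accept it
#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>

void wrap_raw_pointer_demo()
{
    unsigned int *raw = 0;
    cudaMalloc((void **)&raw, 256 * sizeof(unsigned int));

    // adapt the raw pointer to Thrust's device pointer type
    thrust::device_ptr<unsigned int> wrapped = thrust::device_pointer_cast(raw);

    // any Thrust algorithm now treats it as a device iterator
    thrust::fill(wrapped, wrapped + 256, 0xffffffffu);

    cudaFree(raw);
}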
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_reference.inl - * \brief Inline file for device_reference.h. - */ - -#include -#include - -namespace thrust -{ - -template - template - device_reference & - device_reference - ::operator=(const device_reference &other) -{ - return super_t::operator=(other); -} // end operator=() - -template - device_reference & - device_reference - ::operator=(const value_type &x) -{ - return super_t::operator=(x); -} // end operator=() - -template -__host__ __device__ -void swap(device_reference &a, device_reference &b) -{ - a.swap(b); -} // end swap() - -} // end thrust - diff --git a/compat/thrust/detail/device_vector.inl b/compat/thrust/detail/device_vector.inl deleted file mode 100644 index f6bafbaa5f..0000000000 --- a/compat/thrust/detail/device_vector.inl +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_vector.inl - * \brief Inline file for device_vector.h. - */ - -#include - -namespace thrust -{ - -template - template - device_vector - ::device_vector(const host_vector &v) - :Parent(v) -{ - ; -} // end device_vector::device_vector() - -} // end namespace thrust - diff --git a/compat/thrust/detail/dispatch/is_trivial_copy.h b/compat/thrust/detail/dispatch/is_trivial_copy.h deleted file mode 100644 index 2bedf1f716..0000000000 --- a/compat/thrust/detail/dispatch/is_trivial_copy.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file trivial_copy.h - * \brief Device implementations for copying memory between host and device. 
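device_vector.inl above is just the cross-space converting constructor; the resulting round-trip idiom is unchanged. A sketch with illustrative sizes and values:

// sketch: host_vector <-> device_vector round trip via the converting constructor
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

void round_trip_demo()
{
    thrust::host_vector<float> h(1000, 1.5f);

    // host-to-device copy happens in the constructor shown in the deleted hunk
    thrust::device_vector<float> d = h;

    // assignment the other way performs the device-to-host copy
    thrust::host_vector<float> h2 = d;
    (void)h2;
}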
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - -namespace detail -{ - -namespace dispatch -{ - - -// a trivial copy's iterator's value_types match, -// the iterators themselves are normal_iterators -// and the ToIterator's value_type has_trivial_assign -template - struct is_trivial_copy : - integral_constant< - bool, - is_same< - typename thrust::iterator_value::type, - typename thrust::iterator_value::type - >::value - && is_trivial_iterator::value - && is_trivial_iterator::value - && has_trivial_assign::type>::value - > {}; - -} // end namespace dispatch - -} // end namespace detail - -} // end namespace thrust - diff --git a/compat/thrust/detail/distance.inl b/compat/thrust/detail/distance.inl deleted file mode 100644 index f37595f324..0000000000 --- a/compat/thrust/detail/distance.inl +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file distance.inl - * \brief Inline file for distance.h - */ - -#include -#include -#include - -namespace thrust -{ - - -template - inline typename thrust::iterator_traits::difference_type - distance(InputIterator first, InputIterator last) -{ - return thrust::system::detail::generic::distance(first, last); -} // end distance() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/equal.inl b/compat/thrust/detail/equal.inl deleted file mode 100644 index ca6fecccf1..0000000000 --- a/compat/thrust/detail/equal.inl +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file equal.inl - * \brief Inline file for equal.h. 
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template -bool equal(const thrust::detail::execution_policy_base &system, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2) -{ - using thrust::system::detail::generic::equal; - return equal(thrust::detail::derived_cast(thrust::detail::strip_const(system)), first1, last1, first2); -} // end equal() - - -template -bool equal(const thrust::detail::execution_policy_base &system, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::equal; - return equal(thrust::detail::derived_cast(thrust::detail::strip_const(system)), first1, last1, first2, binary_pred); -} // end equal() - - -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::equal(select_system(system1,system2), first1, last1, first2); -} - - -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::equal(select_system(system1,system2), first1, last1, first2, binary_pred); -} - - -} // end namespace thrust - diff --git a/compat/thrust/detail/execute_with_allocator.h b/compat/thrust/detail/execute_with_allocator.h deleted file mode 100644 index 9d3c1ba29b..0000000000 --- a/compat/thrust/detail/execute_with_allocator.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -template -__host__ __device__ -ToPointer reinterpret_pointer_cast(FromPointer ptr) -{ - typedef typename thrust::detail::pointer_element::type to_element; - return ToPointer(reinterpret_cast(thrust::raw_pointer_cast(ptr))); -} - - -template class BaseSystem> - struct execute_with_allocator - : BaseSystem > -{ - Allocator &m_alloc; - - execute_with_allocator(Allocator &alloc) - : m_alloc(alloc) - {} - - template - friend thrust::pair - get_temporary_buffer(execute_with_allocator &system, std::ptrdiff_t n) - { - typedef typename thrust::detail::allocator_traits alloc_traits; - typedef typename alloc_traits::void_pointer void_pointer; - typedef typename alloc_traits::size_type size_type; - typedef typename alloc_traits::value_type value_type; - - // how many elements of type value_type do we need to accomodate n elements of type T? 
- size_type num_elements = thrust::detail::util::divide_ri(sizeof(T) * n, sizeof(value_type)); - - // allocate that many - void_pointer ptr = alloc_traits::allocate(system.m_alloc, num_elements); - - // return the pointer and the number of elements of type T allocated - return thrust::make_pair(thrust::detail::reinterpret_pointer_cast(ptr),n); - } - - template - friend void return_temporary_buffer(execute_with_allocator &system, Pointer p) - { - typedef typename thrust::detail::allocator_traits alloc_traits; - typedef typename alloc_traits::pointer pointer; - - // return the pointer to the allocator - pointer to_ptr = thrust::detail::reinterpret_pointer_cast(p); - alloc_traits::deallocate(system.m_alloc, to_ptr, 0); - } -}; - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/execution_policy.h b/compat/thrust/detail/execution_policy.h deleted file mode 100644 index 28e77f2e91..0000000000 --- a/compat/thrust/detail/execution_policy.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace detail -{ - - -// execution_policy_base serves as a guard against -// inifinite recursion in thrust entry points: -// -// template -// void foo(const thrust::detail::execution_policy_base &s) -// { -// using thrust::system::detail::generic::foo; -// -// foo(thrust::detail::derived_cast(thrust::detail::strip_const(s)); -// } -// -// foo is not recursive when -// 1. DerivedPolicy is derived from thrust::execution_policy below -// 2. generic::foo takes thrust::execution_policy as a parameter -template struct execution_policy_base {}; - - -template -__host__ __device__ -inline execution_policy_base &strip_const(const execution_policy_base &x) -{ - return const_cast&>(x); -} - - -template -__host__ __device__ -inline DerivedPolicy &derived_cast(execution_policy_base &x) -{ - return static_cast(x); -} - - -template -__host__ __device__ -inline const DerivedPolicy &derived_cast(const execution_policy_base &x) -{ - return static_cast(x); -} - - -} // end detail - - -template - struct execution_policy - : thrust::detail::execution_policy_base -{}; - - -} // end thrust - diff --git a/compat/thrust/detail/extrema.inl b/compat/thrust/detail/extrema.inl deleted file mode 100644 index 4bcd0bde56..0000000000 --- a/compat/thrust/detail/extrema.inl +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
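The execution_policy_base comment above describes why dispatch does not recurse: the public entry point strips const, casts down to the concrete policy, and re-invokes an unqualified call that is resolved by argument-dependent lookup. The practical consequence is that a user-defined policy can intercept algorithms. A sketch of that pattern, assuming the 1.7-era Thrust vendored here (which provides thrust::device_execution_policy and the thrust::device policy object); my_policy and add_one are illustrative names, and the file would be compiled with nvcc:

#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/device_vector.h>
#include <cstdio>

// A user-defined policy. Deriving from device_execution_policy makes it an
// execution_policy_base, so thrust::for_each(exec, ...) will derived_cast back
// to my_policy and the unqualified call inside Thrust finds this overload by ADL.
struct my_policy : thrust::device_execution_policy<my_policy> {};

template <typename Iterator, typename Function>
Iterator for_each(my_policy, Iterator first, Iterator last, Function f)
{
    std::printf("for_each intercepted by my_policy\n");
    return thrust::for_each(thrust::device, first, last, f);  // hand off to the CUDA backend
}

struct add_one
{
    __host__ __device__ void operator()(int &x) const { x += 1; }
};

int main()
{
    thrust::device_vector<int> d(4, 0);
    my_policy exec;
    thrust::for_each(exec, d.begin(), d.end(), add_one());  // prints once, then increments on the GPU
    return 0;
}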
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template -ForwardIterator min_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last) -{ - using thrust::system::detail::generic::min_element; - return min_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end min_element() - - -template -ForwardIterator min_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp) -{ - using thrust::system::detail::generic::min_element; - return min_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); -} // end min_element() - - -template -ForwardIterator max_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last) -{ - using thrust::system::detail::generic::max_element; - return max_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end max_element() - - -template -ForwardIterator max_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp) -{ - using thrust::system::detail::generic::max_element; - return max_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); -} // end max_element() - - -template -thrust::pair minmax_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last) -{ - using thrust::system::detail::generic::minmax_element; - return minmax_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end minmax_element() - - -template -thrust::pair minmax_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp) -{ - using thrust::system::detail::generic::minmax_element; - return minmax_element(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); -} // end minmax_element() - - -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::min_element(select_system(system), first, last); -} // end min_element() - - -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::min_element(select_system(system), first, last, comp); -} // end min_element() - - -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::max_element(select_system(system), first, last); -} // end max_element() - - -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::max_element(select_system(system), first, last, comp); -} // end max_element() - - 
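All of the extrema overloads above follow the same pattern: with an explicit policy they forward to the backend, without one they let select_system infer the backend from the iterator type. Typical use of the iterator-only forms (values are illustrative; built with nvcc):

#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/functional.h>
#include <cstdio>

int main()
{
    int raw[] = { 7, -2, 9, 4 };
    thrust::device_vector<int> d(raw, raw + 4);

    // Iterator-only overloads: the device backend is picked from the iterator type.
    thrust::device_vector<int>::iterator lo = thrust::min_element(d.begin(), d.end());
    thrust::device_vector<int>::iterator hi = thrust::max_element(d.begin(), d.end(), thrust::less<int>());

    std::printf("min=%d max=%d\n", (int)*lo, (int)*hi);  // min=-2 max=9
    return 0;
}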
-template -thrust::pair -minmax_element(ForwardIterator first, ForwardIterator last) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::minmax_element(select_system(system), first, last); -} // end minmax_element() - - -template -thrust::pair -minmax_element(ForwardIterator first, ForwardIterator last, BinaryPredicate comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::minmax_element(select_system(system), first, last, comp); -} // end minmax_element() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/fill.inl b/compat/thrust/detail/fill.inl deleted file mode 100644 index c60e4a059e..0000000000 --- a/compat/thrust/detail/fill.inl +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file fill.inl - * \brief Inline file for fill.h. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - void fill(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T &value) -{ - using thrust::system::detail::generic::fill; - return fill(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); -} // end fill() - - -template - OutputIterator fill_n(const thrust::detail::execution_policy_base &exec, - OutputIterator first, - Size n, - const T &value) -{ - using thrust::system::detail::generic::fill_n; - return fill_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, value); -} // end fill_n() - - -template - void fill(ForwardIterator first, - ForwardIterator last, - const T &value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - thrust::fill(select_system(system), first, last, value); -} // end fill() - - -template - OutputIterator fill_n(OutputIterator first, - Size n, - const T &value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::fill_n(select_system(system), first, n, value); -} // end fill() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/find.inl b/compat/thrust/detail/find.inl deleted file mode 100644 index 465c937395..0000000000 --- a/compat/thrust/detail/find.inl +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
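fill and fill_n use the same two-level dispatch; the only behavioural difference is that fill takes an end iterator while fill_n takes a count and returns the end of the written range. A small sketch (illustrative, nvcc-compiled):

#include <thrust/device_vector.h>
#include <thrust/fill.h>

int main()
{
    thrust::device_vector<float> d(8);

    // fill(): writes the whole [begin, end) range.
    thrust::fill(d.begin(), d.end(), 1.0f);

    // fill_n(): writes the first n elements and returns the iterator one past them.
    thrust::device_vector<float>::iterator past = thrust::fill_n(d.begin(), 4, 0.0f);
    // past == d.begin() + 4; d now holds {0,0,0,0,1,1,1,1}
    return 0;
}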
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file find.inl - * \brief Inline file for find.h - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template -InputIterator find(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - const T& value) -{ - using thrust::system::detail::generic::find; - return find(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); -} // end find() - - -template -InputIterator find_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::find_if; - return find_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end find_if() - - -template -InputIterator find_if_not(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::find_if_not; - return find_if_not(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end find_if_not() - - -template -InputIterator find(InputIterator first, - InputIterator last, - const T& value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::find(select_system(system), first, last, value); -} - -template -InputIterator find_if(InputIterator first, - InputIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::find_if(select_system(system), first, last, pred); -} - -template -InputIterator find_if_not(InputIterator first, - InputIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::find_if_not(select_system(system), first, last, pred); -} - - -} // end namespace thrust - diff --git a/compat/thrust/detail/for_each.inl b/compat/thrust/detail/for_each.inl deleted file mode 100644 index 7c9dc172e3..0000000000 --- a/compat/thrust/detail/for_each.inl +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file for_each.inl - * \brief Inline file for for_each.h. 
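find, find_if and find_if_not differ only in how they test elements: by equality with a value, by a predicate, or by the negation of a predicate. One usage sketch (is_negative is an illustrative functor; nvcc-compiled):

#include <thrust/device_vector.h>
#include <thrust/find.h>
#include <cstdio>

struct is_negative
{
    __host__ __device__ bool operator()(int x) const { return x < 0; }
};

int main()
{
    int raw[] = { 3, 5, -1, 8 };
    thrust::device_vector<int> d(raw, raw + 4);

    thrust::device_vector<int>::iterator a = thrust::find(d.begin(), d.end(), 8);                     // by value
    thrust::device_vector<int>::iterator b = thrust::find_if(d.begin(), d.end(), is_negative());      // by predicate
    thrust::device_vector<int>::iterator c = thrust::find_if_not(d.begin(), d.end(), is_negative());  // first non-match

    std::printf("%d %d %d\n", (int)(a - d.begin()), (int)(b - d.begin()), (int)(c - d.begin())); // 3 2 0
    return 0;
}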
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - InputIterator for_each(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - UnaryFunction f) -{ - using thrust::system::detail::generic::for_each; - - return for_each(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, f); -} - - -template -InputIterator for_each(InputIterator first, - InputIterator last, - UnaryFunction f) -{ - using thrust::system::detail::generic::select_system; - typedef typename thrust::iterator_system::type System; - - System system; - return thrust::for_each(select_system(system), first, last, f); -} // end for_each() - - -template - InputIterator for_each_n(const thrust::detail::execution_policy_base &exec, - InputIterator first, - Size n, - UnaryFunction f) -{ - using thrust::system::detail::generic::for_each_n; - - return for_each_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, f); -} // end for_each_n() - - -template -InputIterator for_each_n(InputIterator first, - Size n, - UnaryFunction f) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - return thrust::for_each_n(select_system(system), first, n, f); -} // end for_each_n() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/function.h b/compat/thrust/detail/function.h deleted file mode 100644 index 36b76c286f..0000000000 --- a/compat/thrust/detail/function.h +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
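for_each applies a functor to every element in place; for_each_n is the counted variant. The functor below is an illustrative example of the kind of __host__ __device__ callable these entry points expect (nvcc-compiled):

#include <thrust/device_vector.h>
#include <thrust/for_each.h>

struct scale_by_two
{
    __host__ __device__ void operator()(int &x) const { x *= 2; }
};

int main()
{
    thrust::device_vector<int> d(6, 3);

    thrust::for_each(d.begin(), d.end(), scale_by_two());  // every element becomes 6
    thrust::for_each_n(d.begin(), 3, scale_by_two());      // first three become 12
    return 0;
}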
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace detail -{ - - -template - struct host_function -{ - // mutable because Function::operator() might be const - mutable Function m_f; - - inline host_function() - : m_f() - {} - - inline host_function(const Function &f) - : m_f(f) - {} - - template - inline Result operator()(Argument &x) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x))); - } - - template - inline Result operator()(const Argument &x) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x))); - } - - template - inline Result operator()(Argument1 &x, Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } - - template - inline Result operator()(const Argument1 &x, Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } - - template - inline Result operator()(const Argument1 &x, const Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } - - template - inline Result operator()(Argument1 &x, const Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } -}; // end host_function - - -template - struct device_function -{ - // mutable because Function::operator() might be const - mutable Function m_f; - - inline __device__ device_function() - : m_f() - {} - - inline __device__ device_function(const Function &f) - : m_f(f) - {} - - template - inline __device__ Result operator()(Argument &x) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x))); - } - - template - inline __device__ Result operator()(const Argument &x) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x))); - } - - template - inline __device__ Result operator()(Argument1 &x, Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } - - template - inline __device__ Result operator()(const Argument1 &x, Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } - - template - inline __device__ Result operator()(const Argument1 &x, const Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), 
thrust::raw_reference_cast(y))); - } - - template - inline __device__ Result operator()(Argument1 &x, const Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } -}; // end device_function - - -template - struct host_device_function -{ - // mutable because Function::operator() might be const - mutable Function m_f; - - inline __host__ __device__ - host_device_function() - : m_f() - {} - - inline __host__ __device__ - host_device_function(const Function &f) - : m_f(f) - {} - - __thrust_hd_warning_disable__ - template - inline __host__ __device__ - Result operator()(Argument &x) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x))); - } - - template - inline __host__ __device__ Result operator()(const Argument &x) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x))); - } - - template - inline __host__ __device__ Result operator()(Argument1 &x, Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } - - template - inline __host__ __device__ Result operator()(const Argument1 &x, Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } - - template - inline __host__ __device__ Result operator()(const Argument1 &x, const Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } - - template - inline __host__ __device__ Result operator()(Argument1 &x, const Argument2 &y) const - { - // we static cast to Result to handle void Result without error - // in case Function's result is non-void - return static_cast(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y))); - } -}; // end host_device_function - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional.inl b/compat/thrust/detail/functional.inl deleted file mode 100644 index 4024585935..0000000000 --- a/compat/thrust/detail/functional.inl +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
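Every overload in host_function, device_function and host_device_function repeats the same idiom: the wrapped callable's result is pushed through static_cast<Result>(...). As the comments note, this works because static_cast<void>(expr) is well formed and simply discards the value, so one wrapper body serves both void and non-void Result types. A stripped-down host-only illustration (invoke_as is a hypothetical helper, not Thrust code):

#include <cstdio>

// Minimal version of the trick used by the wrappers above.
template <typename Result, typename Function, typename Argument>
Result invoke_as(Function f, Argument &x)
{
    // Valid even when Result is void: static_cast<void>(...) just drops the value.
    return static_cast<Result>(f(x));
}

static int add_one(int v) { return v + 1; }

int main()
{
    int x = 41;
    int y = invoke_as<int>(add_one, x);  // y == 42
    invoke_as<void>(add_one, x);         // same wrapper, result deliberately discarded
    std::printf("%d\n", y);
    return 0;
}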
- */ - -#include - -namespace thrust -{ - -namespace detail -{ - -template - struct unary_traits_imp; - -template - struct unary_traits_imp -{ - typedef Operation function_type; - typedef const function_type & param_type; - typedef typename Operation::result_type result_type; - typedef typename Operation::argument_type argument_type; -}; // end unary_traits_imp - -template - struct unary_traits_imp -{ - typedef Result (*function_type)(Argument); - typedef Result (*param_type)(Argument); - typedef Result result_type; - typedef Argument argument_type; -}; // end unary_traits_imp - -template - struct binary_traits_imp; - -template - struct binary_traits_imp -{ - typedef Operation function_type; - typedef const function_type & param_type; - typedef typename Operation::result_type result_type; - typedef typename Operation::first_argument_type first_argument_type; - typedef typename Operation::second_argument_type second_argument_type; -}; // end binary_traits_imp - -template - struct binary_traits_imp -{ - typedef Result (*function_type)(Argument1, Argument2); - typedef Result (*param_type)(Argument1, Argument2); - typedef Result result_type; - typedef Argument1 first_argument_type; - typedef Argument2 second_argument_type; -}; // end binary_traits_imp - -} // end detail - -template - struct unary_traits -{ - typedef typename detail::unary_traits_imp::function_type function_type; - typedef typename detail::unary_traits_imp::param_type param_type; - typedef typename detail::unary_traits_imp::result_type result_type; - typedef typename detail::unary_traits_imp::argument_type argument_type; -}; // end unary_traits - -template - struct unary_traits -{ - typedef Result (*function_type)(Argument); - typedef Result (*param_type)(Argument); - typedef Result result_type; - typedef Argument argument_type; -}; // end unary_traits - -template - struct binary_traits -{ - typedef typename detail::binary_traits_imp::function_type function_type; - typedef typename detail::binary_traits_imp::param_type param_type; - typedef typename detail::binary_traits_imp::result_type result_type; - typedef typename detail::binary_traits_imp::first_argument_type first_argument_type; - typedef typename detail::binary_traits_imp::second_argument_type second_argument_type; -}; // end binary_traits - -template - struct binary_traits -{ - typedef Result (*function_type)(Argument1, Argument2); - typedef Result (*param_type)(Argument1, Argument2); - typedef Result result_type; - typedef Argument1 first_argument_type; - typedef Argument2 second_argument_type; -}; // end binary_traits - -template - unary_negate not1(const Predicate &pred) -{ - return unary_negate(pred); -} // end not1() - -template - binary_negate not2(const BinaryPredicate &pred) -{ - return binary_negate(pred); -} // end not2() - -} // end thrust - diff --git a/compat/thrust/detail/functional/actor.h b/compat/thrust/detail/functional/actor.h deleted file mode 100644 index 0b95a6b894..0000000000 --- a/compat/thrust/detail/functional/actor.h +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
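unary_traits and binary_traits exist so that adaptors such as not1 and not2 can wrap both functor classes (which expose argument_type and result_type) and plain function pointers. The usual way to satisfy them is to derive from thrust::unary_function or thrust::binary_function, as in this sketch (is_even is an illustrative predicate; nvcc-compiled):

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/count.h>
#include <cstdio>

// Deriving from thrust::unary_function supplies the argument_type / result_type
// typedefs that unary_negate (returned by not1) relies on.
struct is_even : thrust::unary_function<int, bool>
{
    __host__ __device__ bool operator()(int x) const { return (x & 1) == 0; }
};

int main()
{
    int raw[] = { 1, 2, 3, 4, 5 };
    thrust::device_vector<int> d(raw, raw + 5);

    int evens = (int) thrust::count_if(d.begin(), d.end(), is_even());
    int odds  = (int) thrust::count_if(d.begin(), d.end(), thrust::not1(is_even()));

    std::printf("%d even, %d odd\n", evens, odds);  // 2 even, 3 odd
    return 0;
}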
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - -template - struct apply_actor -{ - typedef typename Action::template result::type type; -}; - -template - struct actor - : Eval -{ - typedef Eval eval_type; - - __host__ __device__ - actor(void); - - __host__ __device__ - actor(const Eval &base); - - __host__ __device__ - typename apply_actor::type - operator()(void) const; - - template - __host__ __device__ - typename apply_actor >::type - operator()(T0 &_0) const; - - template - __host__ __device__ - typename apply_actor >::type - operator()(T0 &_0, T1 &_1) const; - - template - __host__ __device__ - typename apply_actor >::type - operator()(T0 &_0, T1 &_1, T2 &_2) const; - - template - __host__ __device__ - typename apply_actor >::type - operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3) const; - - template - __host__ __device__ - typename apply_actor >::type - operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4) const; - - template - __host__ __device__ - typename apply_actor >::type - operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5) const; - - template - __host__ __device__ - typename apply_actor >::type - operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6) const; - - template - __host__ __device__ - typename apply_actor >::type - operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7) const; - - template - __host__ __device__ - typename apply_actor >::type - operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8) const; - - template - __host__ __device__ - typename apply_actor >::type - operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8, T9 &_9) const; - - template - __host__ __device__ - typename assign_result::type - operator=(const T &_1) const; -}; // end actor - -// in general, as_actor should turn things into values -template - struct as_actor -{ - typedef value type; - - static inline __host__ __device__ type convert(const T &x) - { - return val(x); - } // end convert() -}; // end as_actor - -// specialization for things which are already actors -template - struct as_actor > -{ - typedef actor type; - - static inline __host__ __device__ const type &convert(const actor &x) - { - return x; - } // end convert() -}; // end as_actor - -template - typename as_actor::type - __host__ __device__ - make_actor(const T &x) -{ - return as_actor::convert(x); -} // end make_actor() - -} // end functional - -// provide specializations for result_of for nullary, unary, and binary invocations of actor -template - struct result_of< - thrust::detail::functional::actor() - > -{ - typedef typename thrust::detail::functional::apply_actor< - thrust::detail::functional::actor, - thrust::null_type - >::type type; -}; // end result_of - -template - struct result_of< - thrust::detail::functional::actor(Arg1) - > -{ - 
typedef typename thrust::detail::functional::apply_actor< - thrust::detail::functional::actor, - thrust::tuple - >::type type; -}; // end result_of - -template - struct result_of< - thrust::detail::functional::actor(Arg1,Arg2) - > -{ - typedef typename thrust::detail::functional::apply_actor< - thrust::detail::functional::actor, - thrust::tuple - >::type type; -}; // end result_of - -} // end detail -} // end thrust - -#include - diff --git a/compat/thrust/detail/functional/actor.inl b/compat/thrust/detail/functional/actor.inl deleted file mode 100644 index 84347be7b8..0000000000 --- a/compat/thrust/detail/functional/actor.inl +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#include -#include -#include -#include - -namespace thrust -{ - -namespace detail -{ -namespace functional -{ - -template - actor - ::actor(void) - : eval_type() -{} - -template - actor - ::actor(const Eval &base) - : eval_type(base) -{} - -template - typename apply_actor< - typename actor::eval_type, - typename thrust::null_type - >::type - actor - ::operator()(void) const -{ - return eval_type::eval(thrust::null_type()); -} // end basic_environment::operator() - -template - template - typename apply_actor< - typename actor::eval_type, - typename thrust::tuple - >::type - actor - ::operator()(T0 &_0) const -{ - return eval_type::eval(thrust::tie(_0)); -} // end basic_environment::operator() - -template - template - typename apply_actor< - typename actor::eval_type, - typename thrust::tuple - >::type - actor - ::operator()(T0 &_0, T1 &_1) const -{ - return eval_type::eval(thrust::tie(_0,_1)); -} // end basic_environment::operator() - -template - template - typename apply_actor< - typename actor::eval_type, - typename thrust::tuple - >::type - actor - ::operator()(T0 &_0, T1 &_1, T2 &_2) const -{ - return eval_type::eval(thrust::tie(_0,_1,_2)); -} // end basic_environment::operator() - -template - template - typename apply_actor< - typename actor::eval_type, - typename thrust::tuple - >::type - actor - ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3) const -{ - return eval_type::eval(thrust::tie(_0,_1,_2,_3)); -} // end basic_environment::operator() - -template - template - typename apply_actor< - typename actor::eval_type, - typename thrust::tuple - >::type - actor - ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4) const -{ - return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4)); -} // end basic_environment::operator() - -template - template - typename apply_actor< - typename actor::eval_type, - typename thrust::tuple - >::type - actor - ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5) const -{ - return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5)); -} // end basic_environment::operator() - -template - template - typename apply_actor< - 
typename actor::eval_type, - typename thrust::tuple - >::type - actor - ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6) const -{ - return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6)); -} // end basic_environment::operator() - -template - template - typename apply_actor< - typename actor::eval_type, - typename thrust::tuple - >::type - actor - ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7) const -{ - return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7)); -} // end basic_environment::operator() - -template - template - typename apply_actor< - typename actor::eval_type, - typename thrust::tuple - >::type - actor - ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8) const -{ - return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7,_8)); -} // end basic_environment::operator() - -template - template - typename apply_actor< - typename actor::eval_type, - typename thrust::tuple - >::type - actor - ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8, T9 &_9) const -{ - return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9)); -} // end basic_environment::operator() - -template - template - typename assign_result::type - actor - ::operator=(const T& _1) const -{ - return do_assign(*this,_1); -} // end actor::operator=() - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional/argument.h b/compat/thrust/detail/functional/argument.h deleted file mode 100644 index 96a20bed1f..0000000000 --- a/compat/thrust/detail/functional/argument.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - -template - struct argument_helper -{ - typedef typename thrust::tuple_element::type type; -}; - -template - struct argument_helper -{ - typedef thrust::null_type type; -}; - - -template - class argument -{ - public: - template - struct result - : argument_helper - { - }; - - __host__ __device__ - argument(void){} - - template - __host__ __device__ - typename result::type eval(const Env &e) const - { - return thrust::get(e); - } // end eval() -}; // end argument - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional/composite.h b/compat/thrust/detail/functional/composite.h deleted file mode 100644 index 1d5fde3152..0000000000 --- a/compat/thrust/detail/functional/composite.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
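actor, argument and composite are the expression-template core behind thrust::placeholders: _1 is essentially an actor wrapping argument<0>, and argument<I>::eval(env) is nothing more than thrust::get<I> applied to a tuple of references that actor::operator() builds with thrust::tie. That tuple mechanism on its own looks like this (host-only, illustrative):

#include <thrust/tuple.h>
#include <cassert>

int main()
{
    int a = 7;
    float b = 2.5f;

    // The "environment" an actor passes down: a tuple of references built by tie().
    thrust::tuple<int &, float &> env = thrust::tie(a, b);

    assert(thrust::get<0>(env) == 7);   // what argument<0>::eval(env) returns
    thrust::get<0>(env) = 9;            // writes straight through the reference
    assert(a == 9);

    (void) b;
    return 0;
}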
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - -// XXX we should just take a single EvalTuple -template - class composite; - -template - class composite< - Eval0, - Eval1, - thrust::null_type, - thrust::null_type, - thrust::null_type, - thrust::null_type, - thrust::null_type, - thrust::null_type, - thrust::null_type, - thrust::null_type - > -{ - public: - template - struct result - { - typedef typename Eval0::template result< - thrust::tuple< - typename Eval1::template result::type - > - >::type type; - }; - - __host__ __device__ - composite(const Eval0 &e0, const Eval1 &e1) - : m_eval0(e0), - m_eval1(e1) - {} - - template - __host__ __device__ - typename result::type - eval(const Env &x) const - { - typename Eval1::template result::type result1 = m_eval1.eval(x); - return m_eval0.eval(thrust::tie(result1)); - } - - private: - Eval0 m_eval0; - Eval1 m_eval1; -}; // end composite - -template - class composite< - Eval0, - Eval1, - Eval2, - thrust::null_type, - thrust::null_type, - thrust::null_type, - thrust::null_type, - thrust::null_type, - thrust::null_type, - thrust::null_type - > -{ - public: - template - struct result - { - typedef typename Eval0::template result< - thrust::tuple< - typename Eval1::template result::type, - typename Eval2::template result::type - > - >::type type; - }; - - __host__ __device__ - composite(const Eval0 &e0, const Eval1 &e1, const Eval2 &e2) - : m_eval0(e0), - m_eval1(e1), - m_eval2(e2) - {} - - template - __host__ __device__ - typename result::type - eval(const Env &x) const - { - typename Eval1::template result::type result1 = m_eval1.eval(x); - typename Eval2::template result::type result2 = m_eval2.eval(x); - return m_eval0.eval(thrust::tie(result1,result2)); - } - - private: - Eval0 m_eval0; - Eval1 m_eval1; - Eval2 m_eval2; -}; // end composite - -template -__host__ __device__ - actor > compose(const Eval0 &e0, const Eval1 &e1) -{ - return actor >(composite(e0,e1)); -} - -template -__host__ __device__ - actor > compose(const Eval0 &e0, const Eval1 &e1, const Eval2 &e2) -{ - return actor >(composite(e0,e1,e2)); -} - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional/operators.h b/compat/thrust/detail/functional/operators.h deleted file mode 100644 index 0fc3539cb9..0000000000 --- a/compat/thrust/detail/functional/operators.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - diff --git a/compat/thrust/detail/functional/operators/arithmetic_operators.h b/compat/thrust/detail/functional/operators/arithmetic_operators.h deleted file mode 100644 index a11e7acdd1..0000000000 --- a/compat/thrust/detail/functional/operators/arithmetic_operators.h +++ /dev/null @@ -1,394 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - -template -__host__ __device__ -actor< - composite< - unary_operator, - actor - > -> -__host__ __device__ -operator-(const actor &_1) -{ - return compose(unary_operator(), _1); -} // end operator-() - -// there's no standard unary_plus functional, so roll an ad hoc one here -template - struct unary_plus - : public thrust::unary_function -{ - __host__ __device__ T operator()(const T &x) const {return +x;} -}; // end unary_plus - -template -__host__ __device__ -actor< - composite< - unary_operator, - actor - > -> -operator+(const actor &_1) -{ - return compose(unary_operator(), _1); -} // end operator+() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator+(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator+() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator+(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator+() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator+(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator+() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator-(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator-() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator-(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator-() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator-(const 
actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator-() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator*(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator*() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator*(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator*() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator*(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator*() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator/(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator/() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator/(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator/() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator/(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator/() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator%(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator%() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator%(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator%() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator%(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator%() - -// there's no standard prefix_increment functional, so roll an ad hoc one here -template - struct prefix_increment - : public thrust::unary_function -{ - __host__ __device__ T& operator()(T &x) const { return ++x; } -}; // end prefix_increment - -template -__host__ __device__ -actor< - composite< - unary_operator, - actor - > -> -operator++(const actor &_1) -{ - return compose(unary_operator(), _1); -} // end operator++() - -// there's no standard suffix_increment functional, so roll an ad hoc one here -template - struct suffix_increment - : public thrust::unary_function -{ - __host__ __device__ T operator()(T &x) const { return x++; } -}; // end suffix_increment - -template -__host__ __device__ -actor< - composite< - unary_operator, - actor - > -> -operator++(const actor &_1, int) -{ - return compose(unary_operator(), _1); -} // end operator++() - -// there's no standard prefix_decrement functional, so roll an ad hoc one here -template - struct prefix_decrement - : public thrust::unary_function -{ - __host__ __device__ T& operator()(T &x) const { return --x; } -}; // end prefix_decrement - -template -__host__ __device__ -actor< - composite< - 
unary_operator, - actor - > -> -operator--(const actor &_1) -{ - return compose(unary_operator(), _1); -} // end operator--() - -// there's no standard suffix_decrement functional, so roll an ad hoc one here -template - struct suffix_decrement - : public thrust::unary_function -{ - __host__ __device__ T operator()(T &x) const { return x--; } -}; // end suffix_decrement - -template -__host__ __device__ -actor< - composite< - unary_operator, - actor - > -> -operator--(const actor &_1, int) -{ - return compose(unary_operator(), _1); -} // end operator--() - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional/operators/assignment_operator.h b/compat/thrust/detail/functional/operators/assignment_operator.h deleted file mode 100644 index e5d66202bf..0000000000 --- a/compat/thrust/detail/functional/operators/assignment_operator.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ - -// XXX WAR circular inclusion with this forward declaration -template struct binary_function; - -namespace detail -{ -namespace functional -{ - -// XXX WAR circular inclusion with this forward declaration -template struct as_actor; - -// there's no standard assign functional, so roll an ad hoc one here -template - struct assign - : thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs = rhs; } -}; // end assign - -template - struct assign_result -{ - typedef actor< - composite< - binary_operator, - actor, - typename as_actor::type - > - > type; -}; // end assign_result - -template - __host__ __device__ - typename assign_result::type - do_assign(const actor &_1, const T &_2) -{ - return compose(binary_operator(), - _1, - as_actor::convert(_2)); -} // end do_assign() - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional/operators/bitwise_operators.h b/compat/thrust/detail/functional/operators/bitwise_operators.h deleted file mode 100644 index c89c5d4f83..0000000000 --- a/compat/thrust/detail/functional/operators/bitwise_operators.h +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
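The arithmetic operator overloads above (together with the assignment support just before them) are what let placeholder expressions be written inline and turned into device functors. The classic user-level result is a SAXPY-style transform, sketched here with illustrative data (nvcc-compiled):

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>   // thrust::placeholders

int main()
{
    using namespace thrust::placeholders;   // _1, _2, ...

    float a = 2.0f;
    thrust::device_vector<float> x(4, 1.0f);
    thrust::device_vector<float> y(4, 10.0f);

    // a * _1 + _2 builds a composite functor from the operator overloads above;
    // it is evaluated element-wise on the device.
    thrust::transform(x.begin(), x.end(), y.begin(), y.begin(), a * _1 + _2);
    // y == { 12, 12, 12, 12 }
    return 0;
}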
- */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator&(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator&(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator&(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator|(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator|() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator|(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator|() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator|(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator|() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator^(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator^() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator^(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator^() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator^(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator^() - -// there's no standard bit_not functional, so roll an ad hoc one here -template - struct bit_not - : public thrust::unary_function -{ - __host__ __device__ T operator()(const T &x) const {return ~x;} -}; // end bit_not - -template -__host__ __device__ -actor< - composite< - unary_operator, - actor - > -> -__host__ __device__ -operator~(const actor &_1) -{ - return compose(unary_operator(), _1); -} // end operator~() - -// there's no standard bit_lshift functional, so roll an ad hoc one here -template - struct bit_lshift - : public thrust::binary_function -{ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs << rhs;} -}; // end bit_lshift - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator<<(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator<<() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator<<(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - 
make_actor(_2)); -} // end operator<<() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator<<(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator<<() - -// there's no standard bit_rshift functional, so roll an ad hoc one here -template - struct bit_rshift - : public thrust::binary_function -{ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs >> rhs;} -}; // end bit_rshift - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator>>(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>>() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator>>(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>>() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator>>(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>>() - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional/operators/compound_assignment_operators.h b/compat/thrust/detail/functional/operators/compound_assignment_operators.h deleted file mode 100644 index ef7389b55c..0000000000 --- a/compat/thrust/detail/functional/operators/compound_assignment_operators.h +++ /dev/null @@ -1,424 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - -template - struct plus_equal - : public thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs += rhs; } -}; // end plus_equal - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator+=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator+=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator+=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator+=() - -template - struct minus_equal - : public thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs -= rhs; } -}; // end minus_equal - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator-=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator-=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator-=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator-=() - -template - struct multiplies_equal - : public thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs *= rhs; } -}; // end multiplies_equal - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator*=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator*=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator*=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator*=() - -template - struct divides_equal - : public thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs /= rhs; } -}; // end divides_equal - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator/=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator/=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator/=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator/=() - -template - struct modulus_equal - : public thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs %= rhs; } -}; // end modulus_equal - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator%=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator%=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator%=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator%=() - 
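The compound-assignment overloads in this deleted header let a placeholder expression update elements in place. A minimal sketch, assuming an integer device_vector v and the constant 10 purely for illustration:

    #include <thrust/device_vector.h>
    #include <thrust/for_each.h>
    #include <thrust/functional.h>

    void add_ten(thrust::device_vector<int> &v)
    {
      using namespace thrust::placeholders;
      // _1 += 10 resolves to the binary_operator<plus_equal> overload above
      thrust::for_each(v.begin(), v.end(), _1 += 10);
    }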
-template - struct bit_and_equal - : public thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs &= rhs; } -}; // end bit_and_equal - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator&=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator&=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&=() - -template - struct bit_or_equal - : public thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs |= rhs; } -}; // end bit_or_equal - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator|=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator|=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator|=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator|=() - -template - struct bit_xor_equal - : public thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs ^= rhs; } -}; // end bit_xor_equal - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator^=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator|=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator^=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator|=() - -template - struct bit_lshift_equal - : public thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs <<= rhs; } -}; // end bit_lshift_equal - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator<<=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator<<=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator<<=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator<<=() - -template - struct bit_rshift_equal - : public thrust::binary_function -{ - __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs >>= rhs; } -}; // end bit_rshift_equal - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator>>=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>>=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator>>=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>>=() - -} // end functional -} // end detail -} // end thrust - diff --git 
a/compat/thrust/detail/functional/operators/logical_operators.h b/compat/thrust/detail/functional/operators/logical_operators.h deleted file mode 100644 index 9c952620db..0000000000 --- a/compat/thrust/detail/functional/operators/logical_operators.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator&&(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&&() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator&&(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&&() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator&&(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&&() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator||(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&&() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator||(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&&() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator||(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator&&() - -template -__host__ __device__ -actor< - composite< - unary_operator, - actor - > -> -operator!(const actor &_1) -{ - return compose(unary_operator(), _1); -} // end operator!() - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional/operators/operator_adaptors.h b/compat/thrust/detail/functional/operators/operator_adaptors.h deleted file mode 100644 index d35fe9726b..0000000000 --- a/compat/thrust/detail/functional/operators/operator_adaptors.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
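The logical overloads just removed (operator&&, operator|| and operator!) compose relational placeholder expressions into compound predicates. A sketch, assuming a device_vector v and the bounds 0/10 as stand-ins for real limits:

    #include <thrust/device_vector.h>
    #include <thrust/count.h>
    #include <thrust/functional.h>

    int count_in_open_interval(const thrust::device_vector<int> &v)
    {
      using namespace thrust::placeholders;
      // operator&& joins two relational actors into a single predicate
      return thrust::count_if(v.begin(), v.end(), _1 > 0 && _1 < 10);
    }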
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - -// this thing (which models Eval) is an adaptor for the unary -// functors inside functional.h -template class UnaryOperator> - struct unary_operator -{ - template - struct argument - : thrust::detail::eval_if< - (thrust::tuple_size::value == 0), - thrust::detail::identity_, - thrust::tuple_element<0,Env> - > - { - }; - - template - struct operator_type - { - typedef UnaryOperator< - typename thrust::detail::remove_reference< - typename argument::type - >::type - > type; - }; - - template - struct result - { - typedef typename operator_type::type op_type; - typedef typename op_type::result_type type; - }; - - template - __host__ __device__ - typename result::type eval(const Env &e) const - { - typename operator_type::type op; - return op(thrust::get<0>(e)); - } // end eval() -}; // end unary_operator - -// this thing (which models Eval) is an adaptor for the binary -// functors inside functional.h -template class BinaryOperator> - struct binary_operator -{ - template - struct first_argument - : thrust::detail::eval_if< - (thrust::tuple_size::value == 0), - thrust::detail::identity_, - thrust::tuple_element<0,Env> - > - { - }; - - template - struct operator_type - { - typedef BinaryOperator< - typename thrust::detail::remove_reference< - typename first_argument::type - >::type - > type; - }; - - template - struct result - { - typedef typename operator_type::type op_type; - typedef typename op_type::result_type type; - }; - - template - __host__ __device__ - typename result::type eval(const Env &e) const - { - typename operator_type::type op; - return op(thrust::get<0>(e), thrust::get<1>(e)); - } // end eval() -}; // end binary_operator - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional/operators/relational_operators.h b/compat/thrust/detail/functional/operators/relational_operators.h deleted file mode 100644 index 6b26534430..0000000000 --- a/compat/thrust/detail/functional/operators/relational_operators.h +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator==(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator==() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator==(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator==() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator==(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator==() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator!=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator!=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator!=(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator!=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator!=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator!=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator>(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator>(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator>(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator<(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator<() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator<(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator<() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator<(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator<() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator>=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator>=(const T1 &_1, 
const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator>=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator>=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - typename as_actor::type - > -> -operator<=(const actor &_1, const T2 &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator<=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - typename as_actor::type, - actor - > -> -operator<=(const T1 &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator<=() - -template -__host__ __device__ -actor< - composite< - binary_operator, - actor, - actor - > -> -operator<=(const actor &_1, const actor &_2) -{ - return compose(binary_operator(), - make_actor(_1), - make_actor(_2)); -} // end operator<=() - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional/placeholder.h b/compat/thrust/detail/functional/placeholder.h deleted file mode 100644 index 9acf6da803..0000000000 --- a/compat/thrust/detail/functional/placeholder.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - -template - struct placeholder -{ - typedef actor > type; -}; - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/functional/value.h b/compat/thrust/detail/functional/value.h deleted file mode 100644 index 27e2802e3a..0000000000 --- a/compat/thrust/detail/functional/value.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Portions of this code are derived from -// -// Manjunath Kudlur's Carbon library -// -// and -// -// Based on Boost.Phoenix v1.2 -// Copyright (c) 2001-2002 Joel de Guzman - -#pragma once - -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace functional -{ - - -template struct actor; - - -template - class value -{ - public: - - template - struct result - { - typedef T type; - }; - - __host__ __device__ - value(const T &arg) - : m_val(arg) - {} - - template - __host__ __device__ - T eval(const Env &) const - { - return m_val; - } - - private: - T m_val; -}; // end value - -template -__host__ __device__ -actor > val(const T &x) -{ - return value(x); -} // end val() - - -} // end functional -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/gather.inl b/compat/thrust/detail/gather.inl deleted file mode 100644 index 4edecd038a..0000000000 --- a/compat/thrust/detail/gather.inl +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file gather.inl - * \brief Inline file for gather.h. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - OutputIterator gather(const thrust::detail::execution_policy_base &exec, - InputIterator map_first, - InputIterator map_last, - RandomAccessIterator input_first, - OutputIterator result) -{ - using thrust::system::detail::generic::gather; - return gather(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), map_first, map_last, input_first, result); -} // end gather() - - -template - OutputIterator gather_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result) -{ - using thrust::system::detail::generic::gather_if; - return gather_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), map_first, map_last, stencil, input_first, result); -} // end gather_if() - - -template - OutputIterator gather_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result, - Predicate pred) -{ - using thrust::system::detail::generic::gather_if; - return gather_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), map_first, map_last, stencil, input_first, result, pred); -} // end gather_if() - - -template - OutputIterator gather(InputIterator map_first, - InputIterator map_last, - RandomAccessIterator input_first, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return 
thrust::gather(select_system(system1,system2,system3), map_first, map_last, input_first, result); -} // end gather() - - -template - OutputIterator gather_if(InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::gather_if(select_system(system1,system2,system3,system4), map_first, map_last, stencil, input_first, result); -} // end gather_if() - - -template - OutputIterator gather_if(InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::gather_if(select_system(system1,system2,system3,system4), map_first, map_last, stencil, input_first, result, pred); -} // end gather_if() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/generate.inl b/compat/thrust/detail/generate.inl deleted file mode 100644 index c12580452e..0000000000 --- a/compat/thrust/detail/generate.inl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file generate.inl - * \author Jared Hoberock - * \brief Inline file for generate.h. 
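The gather dispatch removed above ultimately copies input[map[i]] into result[i]. A sketch, assuming three appropriately sized device_vectors (names are illustrative):

    #include <thrust/device_vector.h>
    #include <thrust/gather.h>

    void permute(const thrust::device_vector<int>   &map,
                 const thrust::device_vector<float> &src,
                 thrust::device_vector<float>       &dst)
    {
      // dst[i] = src[map[i]] for i in [0, map.size())
      thrust::gather(map.begin(), map.end(), src.begin(), dst.begin());
    }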
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - void generate(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Generator gen) -{ - using thrust::system::detail::generic::generate; - return generate(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, gen); -} // end generate() - - -template - OutputIterator generate_n(const thrust::detail::execution_policy_base &exec, - OutputIterator first, - Size n, - Generator gen) -{ - using thrust::system::detail::generic::generate_n; - return generate_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, gen); -} // end generate_n() - - -template - void generate(ForwardIterator first, - ForwardIterator last, - Generator gen) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::generate(select_system(system), first, last, gen); -} // end generate() - - -template - OutputIterator generate_n(OutputIterator first, - Size n, - Generator gen) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::generate_n(select_system(system), first, n, gen); -} // end generate_n() - - -} // end thrust - diff --git a/compat/thrust/detail/host_vector.inl b/compat/thrust/detail/host_vector.inl deleted file mode 100644 index e5c60ab973..0000000000 --- a/compat/thrust/detail/host_vector.inl +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file host_vector.inl - * \brief Inline file for host_vector.h. - */ - -#include - -namespace thrust -{ - -template - template - host_vector - ::host_vector(const device_vector &v) - :Parent(v) -{ - ; -} // end host_vector::host_vector() - -} // end namespace thrust - diff --git a/compat/thrust/detail/inner_product.inl b/compat/thrust/detail/inner_product.inl deleted file mode 100644 index f7773d8d2c..0000000000 --- a/compat/thrust/detail/inner_product.inl +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file inner_product.inl - * \brief Inline file for inner_product.h. 
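thrust::generate and generate_n, dispatched above, fill a range by calling a nullary generator once per element. A minimal host-side sketch using std::rand (the size 100 is illustrative):

    #include <thrust/host_vector.h>
    #include <thrust/generate.h>
    #include <cstdlib>

    int main()
    {
      thrust::host_vector<int> h(100);
      // invoke std::rand once for every element of h
      thrust::generate(h.begin(), h.end(), std::rand);
      return 0;
    }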
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template -OutputType inner_product(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputType init) -{ - using thrust::system::detail::generic::inner_product; - return inner_product(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, init); -} // end inner_product() - - -template -OutputType inner_product(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputType init, - BinaryFunction1 binary_op1, - BinaryFunction2 binary_op2) -{ - using thrust::system::detail::generic::inner_product; - return inner_product(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, init, binary_op1, binary_op2); -} // end inner_product() - - -template -OutputType -inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputType init) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::inner_product(select_system(system1,system2), first1, last1, first2, init); -} // end inner_product() - - -template -OutputType -inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputType init, - BinaryFunction1 binary_op1, BinaryFunction2 binary_op2) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::inner_product(select_system(system1,system2), first1, last1, first2, init, binary_op1, binary_op2); -} // end inner_product() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/integer_traits.h b/compat/thrust/detail/integer_traits.h deleted file mode 100644 index e4cf5d159f..0000000000 --- a/compat/thrust/detail/integer_traits.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
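thrust::inner_product, dispatched above, defaults to plus/multiplies, i.e. a dot product. A sketch, assuming two equally sized device_vectors a and b:

    #include <thrust/device_vector.h>
    #include <thrust/inner_product.h>

    float dot(const thrust::device_vector<float> &a,
              const thrust::device_vector<float> &b)
    {
      // sum of a[i] * b[i], seeded with 0.0f
      return thrust::inner_product(a.begin(), a.end(), b.begin(), 0.0f);
    }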
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - -namespace detail -{ - -template - class integer_traits -{ - public: - static const bool is_integral = false; -}; - -template - class integer_traits_base -{ - public: - static const bool is_integral = true; - static const T const_min = min_val; - static const T const_max = max_val; -}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - - -template<> - class integer_traits - : public std::numeric_limits, - public integer_traits_base -{}; - -} // end detail - -} // end thrust - diff --git a/compat/thrust/detail/internal_functional.h b/compat/thrust/detail/internal_functional.h deleted file mode 100644 index 6d5264ae35..0000000000 --- a/compat/thrust/detail/internal_functional.h +++ /dev/null @@ -1,678 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file internal_functional.inl - * \brief Non-public functionals used to implement algorithm internals. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include // for ::new - -namespace thrust -{ -namespace detail -{ - -// unary_negate does not need to know argument_type -template -struct unary_negate -{ - typedef bool result_type; - - Predicate pred; - - __host__ __device__ - explicit unary_negate(const Predicate& pred) : pred(pred) {} - - template - __host__ __device__ - bool operator()(const T& x) - { - return !bool(pred(x)); - } -}; - -// binary_negate does not need to know first_argument_type or second_argument_type -template -struct binary_negate -{ - typedef bool result_type; - - Predicate pred; - - __host__ __device__ - explicit binary_negate(const Predicate& pred) : pred(pred) {} - - template - __host__ __device__ - bool operator()(const T1& x, const T2& y) - { - return !bool(pred(x,y)); - } -}; - -template - __host__ __device__ - thrust::detail::unary_negate not1(const Predicate &pred) -{ - return thrust::detail::unary_negate(pred); -} - -template - __host__ __device__ - thrust::detail::binary_negate not2(const Predicate &pred) -{ - return thrust::detail::binary_negate(pred); -} - - -// convert a predicate to a 0 or 1 integral value -template -struct predicate_to_integral -{ - Predicate pred; - - __host__ __device__ - explicit predicate_to_integral(const Predicate& pred) : pred(pred) {} - - template - __host__ __device__ - bool operator()(const T& x) - { - return pred(x) ? IntegralType(1) : IntegralType(0); - } -}; - - -// note that detail::equal_to does not force conversion from T2 -> T1 as equal_to does -template -struct equal_to -{ - typedef bool result_type; - - template - __host__ __device__ - bool operator()(const T1& lhs, const T2& rhs) const - { - return lhs == rhs; - } -}; - -// note that equal_to_value does not force conversion from T2 -> T1 as equal_to does -template -struct equal_to_value -{ - T2 rhs; - - equal_to_value(const T2& rhs) : rhs(rhs) {} - - template - __host__ __device__ - bool operator()(const T1& lhs) const - { - return lhs == rhs; - } -}; - -template -struct tuple_binary_predicate -{ - typedef bool result_type; - - __host__ __device__ - tuple_binary_predicate(const Predicate& p) : pred(p) {} - - template - __host__ __device__ - bool operator()(const Tuple& t) const - { - return pred(thrust::get<0>(t), thrust::get<1>(t)); - } - - Predicate pred; -}; - -template -struct tuple_not_binary_predicate -{ - typedef bool result_type; - - __host__ __device__ - tuple_not_binary_predicate(const Predicate& p) : pred(p) {} - - template - __host__ __device__ - bool operator()(const Tuple& t) const - { - return !pred(thrust::get<0>(t), thrust::get<1>(t)); - } - - Predicate pred; -}; - -template - struct host_generate_functor -{ - typedef void result_type; - - __host__ __device__ - host_generate_functor(Generator g) - : gen(g) {} - - // operator() does not take an lvalue reference because some iterators - // produce temporary proxy references when dereferenced. for example, - // consider the temporary tuple of references produced by zip_iterator. - // such temporaries cannot bind to an lvalue reference. - // - // to WAR this, accept a const reference (which is bindable to a temporary), - // and const_cast in the implementation. 
- // - // XXX change to an rvalue reference upon c++0x (which either a named variable - // or temporary can bind to) - template - __host__ - void operator()(const T &x) - { - // we have to be naughty and const_cast this to get it to work - T &lvalue = const_cast(x); - - // this assigns correctly whether x is a true reference or proxy - lvalue = gen(); - } - - Generator gen; -}; - -template - struct device_generate_functor -{ - typedef void result_type; - - __host__ __device__ - device_generate_functor(Generator g) - : gen(g) {} - - // operator() does not take an lvalue reference because some iterators - // produce temporary proxy references when dereferenced. for example, - // consider the temporary tuple of references produced by zip_iterator. - // such temporaries cannot bind to an lvalue reference. - // - // to WAR this, accept a const reference (which is bindable to a temporary), - // and const_cast in the implementation. - // - // XXX change to an rvalue reference upon c++0x (which either a named variable - // or temporary can bind to) - template - __host__ __device__ - void operator()(const T &x) - { - // we have to be naughty and const_cast this to get it to work - T &lvalue = const_cast(x); - - // this assigns correctly whether x is a true reference or proxy - lvalue = gen(); - } - - Generator gen; -}; - -template - struct generate_functor - : thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - > -{}; - - -template - struct zipped_binary_op -{ - typedef ResultType result_type; - - __host__ __device__ - zipped_binary_op(BinaryFunction binary_op) - : m_binary_op(binary_op) {} - - template - __host__ __device__ - inline result_type operator()(Tuple t) - { - return m_binary_op(thrust::get<0>(t), thrust::get<1>(t)); - } - - BinaryFunction m_binary_op; -}; - - -template - struct is_non_const_reference - : thrust::detail::and_< - thrust::detail::not_ >, - thrust::detail::is_reference - > -{}; - -template struct is_tuple_of_iterator_references : thrust::detail::false_type {}; - -template - struct is_tuple_of_iterator_references< - thrust::detail::tuple_of_iterator_references< - T1,T2,T3,T4,T5,T6,T7,T8,T9,T10 - > - > - : thrust::detail::true_type -{}; - -// use this enable_if to avoid assigning to temporaries in the transform functors below -// XXX revisit this problem with c++11 perfect forwarding -template - struct enable_if_non_const_reference_or_tuple_of_iterator_references - : thrust::detail::enable_if< - is_non_const_reference::value || is_tuple_of_iterator_references::value - > -{}; - - -template - struct host_unary_transform_functor -{ - typedef void result_type; - - UnaryFunction f; - - host_unary_transform_functor(UnaryFunction f_) - :f(f_) {} - - template - inline __host__ - typename enable_if_non_const_reference_or_tuple_of_iterator_references< - typename thrust::tuple_element<1,Tuple>::type - >::type - operator()(Tuple t) - { - thrust::get<1>(t) = f(thrust::get<0>(t)); - } -}; - -template - struct device_unary_transform_functor -{ - typedef void result_type; - - UnaryFunction f; - - device_unary_transform_functor(UnaryFunction f_) - :f(f_) {} - - // add __host__ to allow the omp backend compile with nvcc - template - inline __host__ __device__ - typename enable_if_non_const_reference_or_tuple_of_iterator_references< - typename thrust::tuple_element<1,Tuple>::type - >::type - operator()(Tuple t) - { - thrust::get<1>(t) = f(thrust::get<0>(t)); - } -}; - - -template - struct unary_transform_functor - : 
thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - > -{}; - - -template - struct host_binary_transform_functor -{ - BinaryFunction f; - - host_binary_transform_functor(BinaryFunction f_) - :f(f_) - {} - - template - __host__ - void operator()(Tuple t) - { - thrust::get<2>(t) = f(thrust::get<0>(t), thrust::get<1>(t)); - } -}; // end binary_transform_functor - - -template - struct device_binary_transform_functor -{ - BinaryFunction f; - - device_binary_transform_functor(BinaryFunction f_) - :f(f_) - {} - - // add __host__ to allow the omp backend compile with nvcc - template - inline __host__ __device__ - typename enable_if_non_const_reference_or_tuple_of_iterator_references< - typename thrust::tuple_element<2,Tuple>::type - >::type - operator()(Tuple t) - { - thrust::get<2>(t) = f(thrust::get<0>(t), thrust::get<1>(t)); - } -}; // end binary_transform_functor - - -template - struct binary_transform_functor - : thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - > -{}; - - -template -struct host_unary_transform_if_functor -{ - UnaryFunction unary_op; - Predicate pred; - - host_unary_transform_if_functor(UnaryFunction unary_op_, Predicate pred_) - : unary_op(unary_op_), pred(pred_) {} - - template - inline __host__ - typename enable_if_non_const_reference_or_tuple_of_iterator_references< - typename thrust::tuple_element<1,Tuple>::type - >::type - operator()(Tuple t) - { - if(pred(thrust::get<0>(t))) - { - thrust::get<1>(t) = unary_op(thrust::get<0>(t)); - } - } -}; // end host_unary_transform_if_functor - - -template -struct device_unary_transform_if_functor -{ - UnaryFunction unary_op; - Predicate pred; - - device_unary_transform_if_functor(UnaryFunction unary_op_, Predicate pred_) - : unary_op(unary_op_), pred(pred_) {} - - template - inline __host__ __device__ - typename enable_if_non_const_reference_or_tuple_of_iterator_references< - typename thrust::tuple_element<1,Tuple>::type - >::type - operator()(Tuple t) - { - if(pred(thrust::get<0>(t))) - { - thrust::get<1>(t) = unary_op(thrust::get<0>(t)); - } - } -}; // end device_unary_transform_if_functor - - -template - struct unary_transform_if_functor - : thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - > -{}; - - -template -struct host_unary_transform_if_with_stencil_functor -{ - UnaryFunction unary_op; - Predicate pred; - - host_unary_transform_if_with_stencil_functor(UnaryFunction _unary_op, Predicate _pred) - : unary_op(_unary_op), pred(_pred) {} - - template - inline __host__ - typename enable_if_non_const_reference_or_tuple_of_iterator_references< - typename thrust::tuple_element<2,Tuple>::type - >::type - operator()(Tuple t) - { - if(pred(thrust::get<1>(t))) - thrust::get<2>(t) = unary_op(thrust::get<0>(t)); - } -}; // end host_unary_transform_if_with_stencil_functor - - -template -struct device_unary_transform_if_with_stencil_functor -{ - UnaryFunction unary_op; - Predicate pred; - - device_unary_transform_if_with_stencil_functor(UnaryFunction _unary_op, Predicate _pred) - : unary_op(_unary_op), pred(_pred) {} - - // add __host__ to allow the omp backend compile with nvcc - template - inline __host__ __device__ - typename enable_if_non_const_reference_or_tuple_of_iterator_references< - typename thrust::tuple_element<2,Tuple>::type - >::type - operator()(Tuple t) - { - if(pred(thrust::get<1>(t))) - 
thrust::get<2>(t) = unary_op(thrust::get<0>(t)); - } -}; // end device_unary_transform_if_with_stencil_functor - - -template - struct unary_transform_if_with_stencil_functor - : thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - > -{}; - - -template -struct host_binary_transform_if_functor -{ - BinaryFunction binary_op; - Predicate pred; - - host_binary_transform_if_functor(BinaryFunction _binary_op, Predicate _pred) - : binary_op(_binary_op), pred(_pred) {} - - template - inline __host__ - typename enable_if_non_const_reference_or_tuple_of_iterator_references< - typename thrust::tuple_element<3,Tuple>::type - >::type - operator()(Tuple t) - { - if(pred(thrust::get<2>(t))) - thrust::get<3>(t) = binary_op(thrust::get<0>(t), thrust::get<1>(t)); - } -}; // end host_binary_transform_if_functor - - -template -struct device_binary_transform_if_functor -{ - BinaryFunction binary_op; - Predicate pred; - - device_binary_transform_if_functor(BinaryFunction _binary_op, Predicate _pred) - : binary_op(_binary_op), pred(_pred) {} - - // add __host__ to allow the omp backend compile with nvcc - template - inline __host__ __device__ - typename enable_if_non_const_reference_or_tuple_of_iterator_references< - typename thrust::tuple_element<3,Tuple>::type - >::type - operator()(Tuple t) - { - if(pred(thrust::get<2>(t))) - thrust::get<3>(t) = binary_op(thrust::get<0>(t), thrust::get<1>(t)); - } -}; // end device_binary_transform_if_functor - - -template - struct binary_transform_if_functor - : thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - > -{}; - - -template - struct host_destroy_functor -{ - __host__ - void operator()(T &x) const - { - x.~T(); - } // end operator()() -}; // end host_destroy_functor - - -template - struct device_destroy_functor -{ - // add __host__ to allow the omp backend to compile with nvcc - __host__ __device__ - void operator()(T &x) const - { - x.~T(); - } // end operator()() -}; // end device_destroy_functor - - -template - struct destroy_functor - : thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - > -{}; - - -template -struct fill_functor -{ - const T exemplar; - - fill_functor(const T& _exemplar) - : exemplar(_exemplar) {} - - __host__ __device__ - T operator()(void) const - { - return exemplar; - } -}; - - -template - struct uninitialized_fill_functor -{ - T exemplar; - - uninitialized_fill_functor(T x):exemplar(x){} - - __host__ __device__ - void operator()(T &x) - { - ::new(static_cast(&x)) T(exemplar); - } // end operator()() -}; // end uninitialized_fill_functor - - -// this predicate tests two two-element tuples -// we first use a Compare for the first element -// if the first elements are equivalent, we use -// < for the second elements -template - struct compare_first_less_second -{ - compare_first_less_second(Compare c) - : comp(c) {} - - template - __host__ __device__ - bool operator()(T1 lhs, T2 rhs) - { - return comp(thrust::get<0>(lhs), thrust::get<0>(rhs)) || (!comp(thrust::get<0>(rhs), thrust::get<0>(lhs)) && thrust::get<1>(lhs) < thrust::get<1>(rhs)); - } - - Compare comp; -}; // end compare_first_less_second - - -template - struct compare_first -{ - Compare comp; - - compare_first(Compare comp) - : comp(comp) - {} - - template - __host__ __device__ - bool operator()(const Tuple1 &x, const Tuple2 &y) - { - return 
comp(thrust::raw_reference_cast(thrust::get<0>(x)), thrust::raw_reference_cast(thrust::get<0>(y))); - } -}; // end compare_first - - -} // end namespace detail -} // end namespace thrust - diff --git a/compat/thrust/detail/logical.inl b/compat/thrust/detail/logical.inl deleted file mode 100644 index 126a3e3fb1..0000000000 --- a/compat/thrust/detail/logical.inl +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file logical.inl - * \brief Inline file for logical.h. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template -bool all_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred) -{ - using thrust::system::detail::generic::all_of; - return all_of(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end all_of() - - -template -bool any_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred) -{ - using thrust::system::detail::generic::any_of; - return any_of(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end any_of() - - -template -bool none_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred) -{ - using thrust::system::detail::generic::none_of; - return none_of(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end none_of() - - -template -bool all_of(InputIterator first, InputIterator last, Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::all_of(select_system(system), first, last, pred); -} - - -template -bool any_of(InputIterator first, InputIterator last, Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::any_of(select_system(system), first, last, pred); -} - - -template -bool none_of(InputIterator first, InputIterator last, Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::none_of(select_system(system), first, last, pred); -} - - -} // end namespace thrust - diff --git a/compat/thrust/detail/malloc_and_free.h b/compat/thrust/detail/malloc_and_free.h deleted file mode 100644 index 57b1685476..0000000000 --- a/compat/thrust/detail/malloc_and_free.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
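The all_of/any_of/none_of wrappers above follow the usual two-step pattern: pick a backend via select_system, then forward to the generic implementation. A sketch combining them with a placeholder predicate (vector name is illustrative):

    #include <thrust/device_vector.h>
    #include <thrust/logical.h>
    #include <thrust/functional.h>

    bool all_non_negative(const thrust::device_vector<int> &v)
    {
      using namespace thrust::placeholders;
      // true when every element satisfies _1 >= 0 (and for an empty range)
      return thrust::all_of(v.begin(), v.end(), _1 >= 0);
    }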
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - -template -pointer malloc(const thrust::detail::execution_policy_base &exec, std::size_t n) -{ - using thrust::system::detail::generic::malloc; - - // XXX should use a hypothetical thrust::static_pointer_cast here - void *raw_ptr = static_cast(thrust::raw_pointer_cast(malloc(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n))); - - return pointer(raw_ptr); -} - -template -pointer malloc(const thrust::detail::execution_policy_base &exec, std::size_t n) -{ - using thrust::system::detail::generic::malloc; - - T *raw_ptr = static_cast(thrust::raw_pointer_cast(malloc(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n))); - - return pointer(raw_ptr); -} - - -// XXX WAR nvbug 992955 -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#if CUDA_VERSION < 5000 - -// cudafe generates unqualified calls to free(int *volatile) -// which get confused with thrust::free -// spoof a thrust::free which simply maps to ::free -inline __host__ __device__ -void free(int *volatile ptr) -{ - ::free(ptr); -} - -#endif // CUDA_VERSION -#endif // THRUST_DEVICE_COMPILER - -template -void free(const thrust::detail::execution_policy_base &exec, Pointer ptr) -{ - using thrust::system::detail::generic::free; - - free(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), ptr); -} - -// XXX consider another form of free which does not take a system argument and -// instead infers the system from the pointer - -} // end namespace thrust - diff --git a/compat/thrust/detail/merge.inl b/compat/thrust/detail/merge.inl deleted file mode 100644 index 77f09f5bed..0000000000 --- a/compat/thrust/detail/merge.inl +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file merge.inl - * \brief Inline file for merge.h. 
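thrust::malloc and thrust::free, removed above, allocate and release storage against a system tag or execution policy and hand back a tagged pointer. A sketch of the tag-based form along the lines of the Thrust documentation; the buffer size and the extra include used to make device_system_tag visible are assumptions:

    #include <thrust/memory.h>
    #include <thrust/device_vector.h>   // pulls in thrust::device_system_tag (assumption)

    void scratch_buffer()
    {
      const int N = 1024;
      thrust::device_system_tag device_sys;
      // typed allocation on the device backend; the pointer carries the system tag
      thrust::pointer<int, thrust::device_system_tag> p = thrust::malloc<int>(device_sys, N);
      thrust::free(device_sys, p);
    }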
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - OutputIterator merge(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - using thrust::system::detail::generic::merge; - return merge(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result); -} // end merge() - - -template - OutputIterator merge(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp) -{ - using thrust::system::detail::generic::merge; - return merge(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp); -} // end merge() - - -template - thrust::pair - merge_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - using thrust::system::detail::generic::merge_by_key; - return merge_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); -} // end merge_by_key() - - -template - thrust::pair - merge_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - Compare comp) -{ - using thrust::system::detail::generic::merge_by_key; - return merge_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); -} // end merge_by_key() - - -template - OutputIterator merge(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::merge(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp); -} // end merge() - - -template - OutputIterator merge(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::merge(select_system(system1,system2,system3), first1, last1, first2, last2, result); -} // end merge() - - -template - thrust::pair - merge_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 
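// --- [editor's illustrative sketch, not part of this patch] ---
// merge.inl, being removed in this hunk, backs thrust::merge. A short usage
// sketch of the public algorithm (merge_demo is an illustrative name):
#include <thrust/merge.h>
#include <thrust/device_vector.h>

void merge_demo()
{
    int a[] = {1, 3, 5, 7};
    int b[] = {2, 4, 6, 8};
    thrust::device_vector<int> A(a, a + 4), B(b, b + 4), C(8);

    // merges two sorted input ranges into one sorted output range
    thrust::merge(A.begin(), A.end(), B.begin(), B.end(), C.begin());
    // C = {1, 2, 3, 4, 5, 6, 7, 8}
}
// --- end editor's sketch ---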
values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - typedef typename thrust::iterator_system::type System5; - typedef typename thrust::iterator_system::type System6; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - System5 system5; - System6 system6; - - return thrust::merge_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); -} // end merge_by_key() - - -template - thrust::pair - merge_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - typedef typename thrust::iterator_system::type System5; - typedef typename thrust::iterator_system::type System6; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - System5 system5; - System6 system6; - - return thrust::merge_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); -} // end merge_by_key() - - -} // end thrust - diff --git a/compat/thrust/detail/minmax.h b/compat/thrust/detail/minmax.h deleted file mode 100644 index a560ea1fe5..0000000000 --- a/compat/thrust/detail/minmax.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ - - -template -__host__ __device__ - T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp) -{ - return comp(rhs, lhs) ? rhs : lhs; -} // end min() - -template -__host__ __device__ - T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs) -{ - return rhs < lhs ? rhs : lhs; -} // end min() - -template -__host__ __device__ - T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp) -{ - return comp(lhs,rhs) ? rhs : lhs; -} // end max() - -template -__host__ __device__ - T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs) -{ - return lhs < rhs ? 
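// --- [editor's illustrative sketch, not part of this patch] ---
// The merge_by_key overloads deleted above merge two sorted key ranges while
// carrying the associated values along. A usage sketch (names illustrative):
#include <thrust/merge.h>
#include <thrust/device_vector.h>

void merge_by_key_demo()
{
    int ka[] = {1, 3, 5};  int va[] = {10, 30, 50};
    int kb[] = {2, 3, 6};  int vb[] = {20, 31, 60};
    thrust::device_vector<int> KA(ka, ka + 3), VA(va, va + 3);
    thrust::device_vector<int> KB(kb, kb + 3), VB(vb, vb + 3);
    thrust::device_vector<int> K(6), V(6);

    // keys merge in sorted order; on ties, elements of the first range come first
    thrust::merge_by_key(KA.begin(), KA.end(), KB.begin(), KB.end(),
                         VA.begin(), VB.begin(), K.begin(), V.begin());
    // K = {1, 2, 3, 3, 5, 6},  V = {10, 20, 30, 31, 50, 60}
}
// --- end editor's sketch ---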
rhs : lhs; -} // end max() - - -} // end thrust - diff --git a/compat/thrust/detail/mismatch.inl b/compat/thrust/detail/mismatch.inl deleted file mode 100644 index 37ac663bbe..0000000000 --- a/compat/thrust/detail/mismatch.inl +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file mismatch.inl - * \brief Inline file for mismatch.h - */ - - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template -thrust::pair mismatch(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2) -{ - using thrust::system::detail::generic::mismatch; - return mismatch(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2); -} // end mismatch() - - -template -thrust::pair mismatch(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - BinaryPredicate pred) -{ - using thrust::system::detail::generic::mismatch; - return mismatch(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, pred); -} // end mismatch() - - -template -thrust::pair mismatch(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::mismatch(select_system(system1,system2), first1, last1, first2); -} // end mismatch() - - -template -thrust::pair mismatch(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - BinaryPredicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::mismatch(select_system(system1,system2), first1, last1, first2, pred); -} // end mismatch() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/mpl/math.h b/compat/thrust/detail/mpl/math.h deleted file mode 100644 index 80adfc1e88..0000000000 --- a/compat/thrust/detail/mpl/math.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! 
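// --- [editor's illustrative sketch, not part of this patch] ---
// mismatch.inl, deleted above, backs thrust::mismatch, which finds the first
// position where two ranges differ. A usage sketch (mismatch_demo illustrative):
#include <thrust/mismatch.h>
#include <thrust/device_vector.h>
#include <thrust/pair.h>

void mismatch_demo()
{
    int a[] = {0, 5, 3, 7};
    int b[] = {0, 5, 8, 7};
    thrust::device_vector<int> A(a, a + 4), B(b, b + 4);

    // returns a pair of iterators to the first mismatching elements
    thrust::pair<thrust::device_vector<int>::iterator,
                 thrust::device_vector<int>::iterator> p =
        thrust::mismatch(A.begin(), A.end(), B.begin());
    // p.first - A.begin() == 2
}
// --- end editor's sketch ---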
\file math.h - * \brief Math-related metaprogramming functionality. - */ - - -#pragma once - -namespace thrust -{ - -namespace detail -{ - -namespace mpl -{ - -namespace math -{ - -namespace detail -{ - -// compute the log base-2 of an integer at compile time -template -struct log2 -{ - static const unsigned int value = log2::value; -}; - -template -struct log2<1, Cur> -{ - static const unsigned int value = Cur; -}; - -template -struct log2<0, Cur> -{ - // undefined -}; - -} // end namespace detail - - -template -struct log2 -{ - static const unsigned int value = detail::log2::value; -}; - - -template -struct min -{ - static const T value = (lhs < rhs) ? lhs : rhs; -}; - - -template -struct max -{ - static const T value = (!(lhs < rhs)) ? lhs : rhs; -}; - - -template - struct mul -{ - static const result_type value = x * y; -}; - - -template - struct mod -{ - static const result_type value = x % y; -}; - - -template - struct div -{ - static const result_type value = x / y; -}; - - -template - struct geq -{ - static const bool value = x >= y; -}; - - -template - struct lt -{ - static const bool value = x < y; -}; - - -template - struct gt -{ - static const bool value = x > y; -}; - - -template - struct or_ -{ - static const bool value = (x || y); -}; - - -template - struct bit_and -{ - static const result_type value = x & y; -}; - - -template - struct plus -{ - static const result_type value = x + y; -}; - - -template - struct minus -{ - static const result_type value = x - y; -}; - - -template - struct equal -{ - static const bool value = x == y; -}; - - -template - struct is_odd -{ - static const bool value = x & 1; -}; - - -} // end namespace math - -} // end namespace mpl - -} // end namespace detail - -} // end namespace thrust - diff --git a/compat/thrust/detail/numeric_traits.h b/compat/thrust/detail/numeric_traits.h deleted file mode 100644 index a3bc56c211..0000000000 --- a/compat/thrust/detail/numeric_traits.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -//#include // for intmax_t (not provided on MSVS 2005) - -namespace thrust -{ - -namespace detail -{ - -// XXX good enough for the platforms we care about -typedef long long intmax_t; - -template - struct is_signed - : integral_constant::is_signed> -{}; // end is_signed - - -template - struct num_digits - : eval_if< - std::numeric_limits::is_specialized, - integral_constant< - int, - std::numeric_limits::digits - >, - integral_constant< - int, - sizeof(T) * std::numeric_limits::digits - (is_signed::value ? 
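// --- [editor's illustrative sketch, not part of this patch] ---
// mpl/math.h, removed above, provides compile-time arithmetic metafunctions
// such as the recursive log2. A standalone sketch of the same recursion
// (static_log2 is an illustrative name, not the library's internal one):
template <unsigned int N, unsigned int Cur = 0>
struct static_log2
{
    // divide by two and count, entirely at compile time
    static const unsigned int value = static_log2<N / 2, Cur + 1>::value;
};

template <unsigned int Cur>
struct static_log2<1, Cur>
{
    static const unsigned int value = Cur;
};

// static_log2<256>::value == 8
// --- end editor's sketch ---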
1 : 0) - > - >::type -{}; // end num_digits - - -template - struct integer_difference - //: eval_if< - // sizeof(Integer) >= sizeof(intmax_t), - // eval_if< - // is_signed::value, - // identity_, - // identity_ - // >, - // eval_if< - // sizeof(Integer) < sizeof(std::ptrdiff_t), - // identity_, - // identity_ - // > - // > -{ - private: - // XXX workaround a pedantic warning in old versions of g++ - // which complains about &&ing with a constant value - template - struct and_ - { - static const bool value = false; - }; - - template - struct and_ - { - static const bool value = y; - }; - - public: - typedef typename - eval_if< - and_< - std::numeric_limits::is_signed, - // digits is the number of no-sign bits - (!std::numeric_limits::is_bounded || (int(std::numeric_limits::digits) + 1 >= num_digits::value)) - >::value, - identity_, - eval_if< - int(std::numeric_limits::digits) + 1 < num_digits::value, - identity_, - eval_if< - int(std::numeric_limits::digits) + 1 < num_digits::value, - identity_, - identity_ - > - > - >::type type; -}; // end integer_difference - - -template - struct numeric_difference - : eval_if< - is_integral::value, - integer_difference, - identity_ - > -{}; // end numeric_difference - - -template -__host__ __device__ -typename numeric_difference::type -numeric_distance(Number x, Number y) -{ - typedef typename numeric_difference::type difference_type; - return difference_type(y) - difference_type(x); -} // end numeric_distance - -} // end detail - -} // end thrust - diff --git a/compat/thrust/detail/overlapped_copy.h b/compat/thrust/detail/overlapped_copy.h deleted file mode 100644 index a5540b8643..0000000000 --- a/compat/thrust/detail/overlapped_copy.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -template - OutputIterator sequential_copy(InputIterator first, - InputIterator last, - OutputIterator result) -{ - for(; first != last; ++first, ++result) - { - *result = *first; - } // end for - - return result; -} // end sequential_copy() - - -template - BidirectionalIterator2 sequential_copy_backward(BidirectionalIterator1 first, - BidirectionalIterator1 last, - BidirectionalIterator2 result) -{ - // yes, we preincrement - // the ranges are open on the right, i.e. 
[first, last) - while(first != last) - { - *--result = *--last; - } // end while - - return result; -} // end sequential_copy_backward() - - -namespace dispatch -{ - - -template - RandomAccessIterator2 overlapped_copy(thrust::system::cpp::detail::execution_policy &, - RandomAccessIterator1 first, - RandomAccessIterator1 last, - RandomAccessIterator2 result) -{ - if(first < last && first <= result && result < last) - { - // result lies in [first, last) - // it's safe to use std::copy_backward here - thrust::detail::sequential_copy_backward(first, last, result + (last - first)); - result += (last - first); - } // end if - else - { - // result + (last - first) lies in [first, last) - // it's safe to use sequential_copy here - result = thrust::detail::sequential_copy(first, last, result); - } // end else - - return result; -} // end overlapped_copy() - - -template - RandomAccessIterator2 overlapped_copy(thrust::execution_policy &exec, - RandomAccessIterator1 first, - RandomAccessIterator1 last, - RandomAccessIterator2 result) -{ - typedef typename thrust::iterator_value::type value_type; - - // make a temporary copy of [first,last), and copy into it first - thrust::detail::temporary_array temp(exec, first, last); - return thrust::copy(exec, temp.begin(), temp.end(), result); -} // end overlapped_copy() - -} // end dispatch - - -template - RandomAccessIterator2 overlapped_copy(RandomAccessIterator1 first, - RandomAccessIterator1 last, - RandomAccessIterator2 result) -{ - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - typedef typename thrust::detail::minimum_system::type System; - - // XXX presumes System is default constructible - System system; - - return thrust::detail::dispatch::overlapped_copy(system, first, last, result); -} // end overlapped_copy() - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/pair.inl b/compat/thrust/detail/pair.inl deleted file mode 100644 index 776bdc2315..0000000000 --- a/compat/thrust/detail/pair.inl +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
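// --- [editor's illustrative sketch, not part of this patch] ---
// The overlapped_copy helper deleted above picks a copy direction (or a
// temporary buffer) so that overlapping source/destination ranges do not
// clobber unread elements. A host-only sketch of the backward-copy case:
#include <algorithm>
#include <vector>

void overlapped_copy_demo()
{
    std::vector<int> v = {1, 2, 3, 4, 5, 0, 0};

    // shift v[0..5) two slots right; destination overlaps the source, so the
    // copy walks from the end toward the beginning
    std::copy_backward(v.begin(), v.begin() + 5, v.begin() + 7);
    // v = {1, 2, 1, 2, 3, 4, 5}
}
// --- end editor's sketch ---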
- */ - -#include -#include - -namespace thrust -{ - -template - pair - ::pair(void) - :first(),second() -{ - ; -} // end pair::pair() - - -template - pair - ::pair(const T1 &x, const T2 &y) - :first(x),second(y) -{ - ; -} // end pair::pair() - - -template - template - pair - ::pair(const pair &p) - :first(p.first),second(p.second) -{ - ; -} // end pair::pair() - - -template - template - pair - ::pair(const std::pair &p) - :first(p.first),second(p.second) -{ - ; -} // end pair::pair() - - -template - inline __host__ __device__ - void pair - ::swap(thrust::pair &p) -{ - using thrust::swap; - - swap(first, p.first); - swap(second, p.second); -} // end pair::swap() - - -template - inline __host__ __device__ - bool operator==(const pair &x, const pair &y) -{ - return x.first == y.first && x.second == y.second; -} // end operator==() - - -template - inline __host__ __device__ - bool operator<(const pair &x, const pair &y) -{ - return x.first < y.first || (!(y.first < x.first) && x.second < y.second); -} // end operator<() - - -template - inline __host__ __device__ - bool operator!=(const pair &x, const pair &y) -{ - return !(x == y); -} // end operator==() - - -template - inline __host__ __device__ - bool operator>(const pair &x, const pair &y) -{ - return y < x; -} // end operator<() - - -template - inline __host__ __device__ - bool operator<=(const pair &x, const pair &y) -{ - return !(y < x); -} // end operator<=() - - -template - inline __host__ __device__ - bool operator>=(const pair &x, const pair &y) -{ - return !(x < y); -} // end operator>=() - - -template - inline __host__ __device__ - void swap(pair &x, pair &y) -{ - return x.swap(y); -} // end swap() - - -template - inline __host__ __device__ - pair make_pair(T1 x, T2 y) -{ - return pair(x,y); -} // end make_pair() - - -// specializations of tuple_element for pair -template - struct tuple_element<0, pair > -{ - typedef T1 type; -}; // end tuple_element - -template - struct tuple_element<1, pair > -{ - typedef T2 type; -}; // end tuple_element - - -// specialization of tuple_size for pair -template - struct tuple_size< pair > -{ - static const unsigned int value = 2; -}; // end tuple_size - - - -namespace detail -{ - - -template struct pair_get {}; - -template - struct pair_get<0, Pair> -{ - inline __host__ __device__ - const typename tuple_element<0, Pair>::type & - operator()(const Pair &p) const - { - return p.first; - } // end operator()() - - inline __host__ __device__ - typename tuple_element<0, Pair>::type & - operator()(Pair &p) const - { - return p.first; - } // end operator()() -}; // end pair_get - - -template - struct pair_get<1, Pair> -{ - inline __host__ __device__ - const typename tuple_element<1, Pair>::type & - operator()(const Pair &p) const - { - return p.second; - } // end operator()() - - inline __host__ __device__ - typename tuple_element<1, Pair>::type & - operator()(Pair &p) const - { - return p.second; - } // end operator()() -}; // end pair_get - -} // end detail - - - -template - inline __host__ __device__ - typename tuple_element >::type & - get(pair &p) -{ - return detail::pair_get >()(p); -} // end get() - -template - inline __host__ __device__ - const typename tuple_element >::type & - get(const pair &p) -{ - return detail::pair_get >()(p); -} // end get() - - -} // end thrust - diff --git a/compat/thrust/detail/partition.inl b/compat/thrust/detail/partition.inl deleted file mode 100644 index 19ef08a73c..0000000000 --- a/compat/thrust/detail/partition.inl +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright 
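// --- [editor's illustrative sketch, not part of this patch] ---
// pair.inl, removed in this hunk, gives thrust::pair its comparison operators
// and its tuple-style get<>/tuple_element interface. A usage sketch:
#include <thrust/pair.h>
#include <thrust/tuple.h>

void pair_demo()
{
    thrust::pair<int, float> p = thrust::make_pair(2, 3.5f);

    // pair participates in the tuple interface, per the specializations above
    int   k = thrust::get<0>(p);   // 2
    float v = thrust::get<1>(p);   // 3.5f

    thrust::pair<int, float> q(2, 4.0f);
    bool less = p < q;             // true: first elements equal, 3.5f < 4.0f
    (void)k; (void)v; (void)less;
}
// --- end editor's sketch ---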
2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file partition.inl - * \brief Inline file for partition.h. - */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - ForwardIterator partition(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::partition; - return partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end partition() - - -template - ForwardIterator partition(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - using thrust::system::detail::generic::partition; - return partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred); -} // end partition() - - -template - thrust::pair - partition_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - using thrust::system::detail::generic::partition_copy; - return partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, out_true, out_false, pred); -} // end partition_copy() - - -template - thrust::pair - partition_copy(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - using thrust::system::detail::generic::partition_copy; - return partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, out_true, out_false, pred); -} // end partition_copy() - - -template - ForwardIterator stable_partition(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::stable_partition; - return stable_partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end stable_partition() - - -template - ForwardIterator stable_partition(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - using thrust::system::detail::generic::stable_partition; - return stable_partition(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred); -} // end stable_partition() - - -template - thrust::pair - stable_partition_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - using thrust::system::detail::generic::stable_partition_copy; - return stable_partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, 
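// --- [editor's illustrative sketch, not part of this patch] ---
// The execution-policy overloads above back thrust::partition. A usage sketch
// with an explicit device policy (is_even and partition_demo are illustrative):
#include <thrust/partition.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

struct is_even
{
    __host__ __device__
    bool operator()(int x) const { return (x & 1) == 0; }
};

void partition_demo()
{
    int data[] = {1, 2, 3, 4, 5, 6};
    thrust::device_vector<int> d(data, data + 6);

    // moves even elements to the front and returns the partition boundary
    thrust::device_vector<int>::iterator mid =
        thrust::partition(thrust::device, d.begin(), d.end(), is_even());
    // [d.begin(), mid) holds evens, [mid, d.end()) holds odds
}
// --- end editor's sketch ---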
out_true, out_false, pred); -} // end stable_partition_copy() - - -template - thrust::pair - stable_partition_copy(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - using thrust::system::detail::generic::stable_partition_copy; - return stable_partition_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, out_true, out_false, pred); -} // end stable_partition_copy() - - -template - ForwardIterator partition_point(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::partition_point; - return partition_point(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end partition_point() - - -template - bool is_partitioned(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::is_partitioned; - return is_partitioned(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end is_partitioned() - - -template - ForwardIterator partition(ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::partition(select_system(system), first, last, pred); -} // end partition() - - -template - ForwardIterator partition(ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::partition(select_system(system1,system2), first, last, stencil, pred); -} // end partition() - - -template - ForwardIterator stable_partition(ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::stable_partition(select_system(system), first, last, pred); -} // end stable_partition() - - -template - ForwardIterator stable_partition(ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::stable_partition(select_system(system1,system2), first, last, stencil, pred); -} // end stable_partition() - - -template - thrust::pair - partition_copy(InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::partition_copy(select_system(system1,system2,system3), first, last, out_true, out_false, pred); -} // end partition_copy() - - -template - thrust::pair - 
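// --- [editor's illustrative sketch, not part of this patch] ---
// partition_point and is_partitioned, forwarded above, query an already
// partitioned range. A usage sketch (the predicate mirrors the previous one):
#include <thrust/partition.h>
#include <thrust/device_vector.h>

struct is_even_pred
{
    __host__ __device__
    bool operator()(int x) const { return (x & 1) == 0; }
};

void partition_point_demo()
{
    int data[] = {2, 4, 6, 1, 3, 5};
    thrust::device_vector<int> d(data, data + 6);

    bool ok = thrust::is_partitioned(d.begin(), d.end(), is_even_pred()); // true

    // first element for which the predicate is false
    thrust::device_vector<int>::iterator mid =
        thrust::partition_point(d.begin(), d.end(), is_even_pred());
    // mid - d.begin() == 3
    (void)ok;
}
// --- end editor's sketch ---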
partition_copy(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::partition_copy(select_system(system1,system2,system3,system4), first, last, stencil, out_true, out_false, pred); -} // end partition_copy() - - -template - thrust::pair - stable_partition_copy(InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::stable_partition_copy(select_system(system1,system2,system3), first, last, out_true, out_false, pred); -} // end stable_partition_copy() - - -template - thrust::pair - stable_partition_copy(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::stable_partition_copy(select_system(system1,system2,system3,system4), first, last, stencil, out_true, out_false, pred); -} // end stable_partition_copy() - - -template - ForwardIterator partition_point(ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::partition_point(select_system(system), first, last, pred); -} // end partition_point() - - -template - bool is_partitioned(InputIterator first, - InputIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::is_partitioned(select_system(system), first, last, pred); -} // end is_partitioned() - - -} // end thrust - diff --git a/compat/thrust/detail/pointer.h b/compat/thrust/detail/pointer.h deleted file mode 100644 index bc97939c77..0000000000 --- a/compat/thrust/detail/pointer.h +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - -// declare pointer with default values of template parameters -template class pointer; - -} // end thrust - - -// specialize std::iterator_traits to avoid problems with the name of -// pointer's constructor shadowing its nested pointer type -// do this before pointer is defined so the specialization is correctly -// used inside the definition -namespace std -{ - -template - struct iterator_traits > -{ - private: - typedef thrust::pointer ptr; - - public: - typedef typename ptr::iterator_category iterator_category; - typedef typename ptr::value_type value_type; - typedef typename ptr::difference_type difference_type; - // XXX implement this type (the result of operator->) later - typedef void pointer; - typedef typename ptr::reference reference; -}; // end iterator_traits - -} // end std - - -namespace thrust -{ - -namespace detail -{ - -// this metafunction computes the type of iterator_adaptor thrust::pointer should inherit from -template - struct pointer_base -{ - // void pointers should have no element type - // note that we remove_cv from the Element type to get the value_type - typedef typename thrust::detail::eval_if< - thrust::detail::is_void::type>::value, - thrust::detail::identity_, - thrust::detail::remove_cv - >::type value_type; - - // if no Derived type is given, just use pointer - typedef typename thrust::detail::eval_if< - thrust::detail::is_same::value, - thrust::detail::identity_ >, - thrust::detail::identity_ - >::type derived_type; - - // void pointers should have no reference type - // if no Reference type is given, just use reference - typedef typename thrust::detail::eval_if< - thrust::detail::is_void::type>::value, - thrust::detail::identity_, - thrust::detail::eval_if< - thrust::detail::is_same::value, - thrust::detail::identity_ >, - thrust::detail::identity_ - > - >::type reference_arg; - - typedef thrust::iterator_adaptor< - derived_type, // pass along the type of our Derived class to iterator_adaptor - Element *, // we adapt a raw pointer - value_type, // the value type - Tag, // system tag - thrust::random_access_traversal_tag, // pointers have random access traversal - reference_arg, // pass along our Reference type - std::ptrdiff_t - > type; -}; // end pointer_base - - -} // end detail - - -// the base type for all of thrust's tagged pointers. -// for reasonable pointer-like semantics, derived types should reimplement the following: -// 1. no-argument constructor -// 2. constructor from OtherElement * -// 3. constructor from OtherPointer related by convertibility -// 4. assignment from OtherPointer related by convertibility -// These should just call the corresponding members of pointer. 
-template - class pointer - : public thrust::detail::pointer_base::type -{ - private: - typedef typename thrust::detail::pointer_base::type super_t; - - typedef typename thrust::detail::pointer_base::derived_type derived_type; - - // friend iterator_core_access to give it access to dereference - friend class thrust::iterator_core_access; - - __host__ __device__ - typename super_t::reference dereference() const; - - // don't provide access to this part of super_t's interface - using super_t::base; - using typename super_t::base_type; - - public: - typedef typename super_t::base_type raw_pointer; - - // constructors - - __host__ __device__ - pointer(); - - // OtherValue shall be convertible to Value - // XXX consider making the pointer implementation a template parameter which defaults to Element * - template - __host__ __device__ - explicit pointer(OtherElement *ptr); - - // OtherPointer's element_type shall be convertible to Element - // OtherPointer's system shall be convertible to Tag - template - __host__ __device__ - pointer(const OtherPointer &other, - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer - >::type * = 0); - - // assignment - - // OtherPointer's element_type shall be convertible to Element - // OtherPointer's system shall be convertible to Tag - template - __host__ __device__ - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer, - derived_type & - >::type - operator=(const OtherPointer &other); - - // observers - - __host__ __device__ - Element *get() const; -}; // end pointer - -} // end thrust - -#include - diff --git a/compat/thrust/detail/pointer.inl b/compat/thrust/detail/pointer.inl deleted file mode 100644 index 1d066b041c..0000000000 --- a/compat/thrust/detail/pointer.inl +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
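// --- [editor's illustrative sketch, not part of this patch] ---
// pointer.h, removed above, is the base of Thrust's tagged pointers such as
// thrust::device_ptr. The tag is what lets algorithms dispatch to the right
// backend. A usage sketch wrapping raw CUDA memory (names illustrative):
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <cuda_runtime.h>

void tagged_pointer_demo()
{
    int *raw = 0;
    cudaMalloc((void **)&raw, 100 * sizeof(int));

    // wrapping the raw pointer records which system owns the memory,
    // so thrust::fill dispatches to the CUDA backend
    thrust::device_ptr<int> d = thrust::device_pointer_cast(raw);
    thrust::fill(d, d + 100, 7);

    cudaFree(raw);
}
// --- end editor's sketch ---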
- */ - -#include -#include - - -namespace thrust -{ - - -template - pointer - ::pointer() - : super_t(static_cast(0)) -{} // end pointer::pointer - - -template - template - pointer - ::pointer(OtherElement *other) - : super_t(other) -{} // end pointer::pointer - - -template - template - pointer - ::pointer(const OtherPointer &other, - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer - >::type *) - : super_t(thrust::detail::pointer_traits::get(other)) -{} // end pointer::pointer - - -template - template - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer, - typename pointer::derived_type & - >::type - pointer - ::operator=(const OtherPointer &other) -{ - super_t::base_reference() = thrust::detail::pointer_traits::get(other); - return static_cast(*this); -} // end pointer::operator= - - -template - typename pointer::super_t::reference - pointer - ::dereference() const -{ - return typename super_t::reference(static_cast(*this)); -} // end pointer::dereference - - -template - Element *pointer - ::get() const -{ - return super_t::base(); -} // end pointer::get - - -namespace detail -{ - -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) -// XXX WAR MSVC 2005 problem with correctly implementing -// pointer_raw_pointer for pointer by specializing it here -template - struct pointer_raw_pointer< thrust::pointer > -{ - typedef typename pointer::raw_pointer type; -}; // end pointer_raw_pointer -#endif - - -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40200) -// XXX WAR g++-4.1 problem with correctly implementing -// pointer_element for pointer by specializing it here -template - struct pointer_element< thrust::pointer > -{ - typedef Element type; -}; // end pointer_element - -template - struct pointer_element< thrust::pointer > - : pointer_element< thrust::pointer > -{}; // end pointer_element - -template - struct pointer_element< thrust::pointer > - : pointer_element< thrust::pointer > -{}; // end pointer_element - - - -// XXX WAR g++-4.1 problem with correctly implementing -// rebind_pointer for pointer by specializing it here -template - struct rebind_pointer, NewElement> -{ - // XXX note we don't attempt to rebind the pointer's Reference type (or Derived) - typedef thrust::pointer type; -}; - -template - struct rebind_pointer, NewElement> - : rebind_pointer, NewElement> -{}; - -template - struct rebind_pointer, NewElement> - : rebind_pointer, NewElement> -{}; -#endif - -} // end namespace detail - - -} // end thrust - diff --git a/compat/thrust/detail/range/tail_flags.h b/compat/thrust/detail/range/tail_flags.h deleted file mode 100644 index 06fd9f8a00..0000000000 --- a/compat/thrust/detail/range/tail_flags.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -template::type>, - typename IndexType = typename thrust::iterator_difference::type> - class tail_flags -{ - private: - struct tail_flag_functor - { - BinaryPredicate binary_pred; // this must be the first member for performance reasons - IndexType n; - - typedef bool result_type; - - tail_flag_functor(IndexType n) - : binary_pred(), n(n) - {} - - tail_flag_functor(IndexType n, BinaryPredicate binary_pred) - : binary_pred(binary_pred), n(n) - {} - - template - __host__ __device__ __thrust_forceinline__ - result_type operator()(const Tuple &t) - { - const IndexType i = thrust::get<0>(t); - - // note that we do not dereference the tuple's 2nd element when i >= n - // and therefore do not dereference a bad location at the boundary - return (i == (n - 1) || !binary_pred(thrust::get<1>(t), thrust::get<2>(t))); - } - }; - - typedef thrust::counting_iterator counting_iterator; - - public: - typedef thrust::transform_iterator< - tail_flag_functor, - thrust::zip_iterator > - > iterator; - - tail_flags(RandomAccessIterator first, RandomAccessIterator last) - : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first + 1)), - tail_flag_functor(last - first))), - m_end(m_begin + (last - first)) - {} - - tail_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) - : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first + 1)), - tail_flag_functor(last - first, binary_pred))), - m_end(m_begin + (last - first)) - {} - - iterator begin() const - { - return m_begin; - } - - iterator end() const - { - return m_end; - } - - template - typename iterator::reference operator[](OtherIndex i) - { - return *(begin() + i); - } - - private: - iterator m_begin, m_end; -}; - - -template - tail_flags - make_tail_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) -{ - return tail_flags(first, last, binary_pred); -} - - -template - tail_flags - make_tail_flags(RandomAccessIterator first, RandomAccessIterator last) -{ - return tail_flags(first, last); -} - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/raw_pointer_cast.h b/compat/thrust/detail/raw_pointer_cast.h deleted file mode 100644 index 05e1e6bc45..0000000000 --- a/compat/thrust/detail/raw_pointer_cast.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
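// --- [editor's illustrative sketch, not part of this patch] ---
// The tail_flags range removed above flags position i when keys[i] ends a run
// of consecutive equal keys (used e.g. by reduce_by_key). A host-only sketch
// of the same logic (tail_flags_of is an illustrative name):
#include <vector>
#include <cstddef>

std::vector<bool> tail_flags_of(const std::vector<int> &keys)
{
    const std::size_t n = keys.size();
    std::vector<bool> flags(n, false);
    for (std::size_t i = 0; i < n; ++i)
    {
        // last element overall, or the next key differs -> end of a run
        flags[i] = (i == n - 1) || (keys[i] != keys[i + 1]);
    }
    return flags;
}

// tail_flags_of({0,0,1,1,1,2}) == {0,1,0,0,1,1}
// --- end editor's sketch ---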
- */ - -#pragma once - -#include -#include - -namespace thrust -{ - -template - inline __host__ __device__ typename thrust::detail::pointer_traits::raw_pointer - raw_pointer_cast(const Pointer &ptr) -{ - return thrust::detail::pointer_traits::get(ptr); -} // end raw_pointer_cast() - -} // end thrust - diff --git a/compat/thrust/detail/raw_reference_cast.h b/compat/thrust/detail/raw_reference_cast.h deleted file mode 100644 index 1ffd7e5701..0000000000 --- a/compat/thrust/detail/raw_reference_cast.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -__THRUST_DEFINE_HAS_NESTED_TYPE(is_wrapped_reference, wrapped_reference_hint) - -namespace raw_reference_detail -{ - -template - struct raw_reference - : add_reference -{}; - - -// XXX consider making raw_reference an error - - -template - struct raw_reference< - T, - typename thrust::detail::enable_if< - is_wrapped_reference< - typename remove_cv::type - >::value - >::type - > -{ - typedef typename add_reference< - typename pointer_element::type - >::type type; -}; - -} // end raw_reference_ns - -template - struct raw_reference : - raw_reference_detail::raw_reference -{}; - - -// wrapped reference-like things which aren't strictly wrapped references -// (e.g. tuples of wrapped references) are considered unwrappable -template - struct is_unwrappable - : is_wrapped_reference -{}; - - -template - struct enable_if_unwrappable - : enable_if< - is_unwrappable::value, - Result - > -{}; - - -} // end detail - - -template - inline __host__ __device__ typename detail::raw_reference::type raw_reference_cast(T &ref) -{ - return *thrust::raw_pointer_cast(&ref); -} // end raw_reference_cast - - -template - inline __host__ __device__ typename detail::raw_reference::type raw_reference_cast(const T &ref) -{ - return *thrust::raw_pointer_cast(&ref); -} // end raw_reference_cast - - -template< - typename T0, typename T1, typename T2, - typename T3, typename T4, typename T5, - typename T6, typename T7, typename T8, - typename T9 -> -inline __host__ __device__ -typename detail::enable_if_unwrappable< - thrust::detail::tuple_of_iterator_references, - typename detail::raw_reference< - thrust::detail::tuple_of_iterator_references - >::type ->::type -raw_reference_cast(detail::tuple_of_iterator_references t); - - -} // end thrust - -#include - diff --git a/compat/thrust/detail/raw_reference_cast.inl b/compat/thrust/detail/raw_reference_cast.inl deleted file mode 100644 index ea619ec028..0000000000 --- a/compat/thrust/detail/raw_reference_cast.inl +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
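// --- [editor's illustrative sketch, not part of this patch] ---
// raw_pointer_cast.h, deleted above, strips the system tag from a Thrust
// pointer so the allocation can be handed to plain CUDA APIs or kernels,
// a pattern ccminer's CUDA code uses heavily. A usage sketch:
#include <thrust/device_vector.h>
#include <thrust/memory.h>
#include <cuda_runtime.h>

void raw_pointer_demo()
{
    thrust::device_vector<float> d(256);

    // device_ptr<float> -> float* for use with the CUDA runtime
    float *raw = thrust::raw_pointer_cast(d.data());

    cudaMemset(raw, 0, d.size() * sizeof(float));
}
// --- end editor's sketch ---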
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -// specialize is_unwrappable -// a tuple is_unwrappable if any of its elements is_unwrappable -template< - typename T0, typename T1, typename T2, - typename T3, typename T4, typename T5, - typename T6, typename T7, typename T8, - typename T9 -> - struct is_unwrappable< - thrust::tuple - > - : or_< - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable - > -{}; - - -// specialize is_unwrappable -// a tuple_of_iterator_references is_unwrappable if any of its elements is_unwrappable -template< - typename T0, typename T1, typename T2, - typename T3, typename T4, typename T5, - typename T6, typename T7, typename T8, - typename T9 -> - struct is_unwrappable< - thrust::detail::tuple_of_iterator_references - > - : or_< - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable, - is_unwrappable - > -{}; - - -namespace raw_reference_detail -{ - -// unlike raw_reference, -// raw_reference_tuple_helper needs to return a value -// when it encounters one, rather than a reference -// upon encountering tuple, recurse -// -// we want the following behavior: -// 1. T -> T -// 2. T& -> T& -// 3. null_type -> null_type -// 4. reference -> T& -// 5. 
tuple_of_iterator_references -> tuple_of_iterator_references::type> - - -// wrapped references are unwrapped using raw_reference, otherwise, return T -template - struct raw_reference_tuple_helper - : eval_if< - is_unwrappable< - typename remove_cv::type - >::value, - raw_reference, - identity_ - > -{}; - - -// recurse on tuples -template < - typename T0, typename T1, typename T2, - typename T3, typename T4, typename T5, - typename T6, typename T7, typename T8, - typename T9 -> - struct raw_reference_tuple_helper< - thrust::tuple - > -{ - typedef thrust::tuple< - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type - > type; -}; - - -template < - typename T0, typename T1, typename T2, - typename T3, typename T4, typename T5, - typename T6, typename T7, typename T8, - typename T9 -> - struct raw_reference_tuple_helper< - thrust::detail::tuple_of_iterator_references - > -{ - typedef thrust::detail::tuple_of_iterator_references< - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type, - typename raw_reference_tuple_helper::type - > type; -}; - - -} // end raw_reference_detail - - -// if a tuple "tuple_type" is_unwrappable, -// then the raw_reference of tuple_type is a tuple of its members' raw_references -// else the raw_reference of tuple_type is tuple_type & -template < - typename T0, typename T1, typename T2, - typename T3, typename T4, typename T5, - typename T6, typename T7, typename T8, - typename T9 -> - struct raw_reference< - thrust::tuple - > -{ - private: - typedef thrust::tuple tuple_type; - - public: - typedef typename eval_if< - is_unwrappable::value, - raw_reference_detail::raw_reference_tuple_helper, - add_reference - >::type type; -}; - - -template < - typename T0, typename T1, typename T2, - typename T3, typename T4, typename T5, - typename T6, typename T7, typename T8, - typename T9 -> - struct raw_reference< - thrust::detail::tuple_of_iterator_references - > -{ - private: - typedef detail::tuple_of_iterator_references tuple_type; - - public: - typedef typename raw_reference_detail::raw_reference_tuple_helper::type type; - - // XXX figure out why is_unwrappable seems to be broken for tuple_of_iterator_references - //typedef typename eval_if< - // is_unwrappable::value, - // raw_reference_detail::raw_reference_tuple_helper, - // add_reference - //>::type type; -}; - - -struct raw_reference_caster -{ - template - __host__ __device__ - typename detail::raw_reference::type operator()(T &ref) - { - return thrust::raw_reference_cast(ref); - } - - template - __host__ __device__ - typename detail::raw_reference::type operator()(const T &ref) - { - return thrust::raw_reference_cast(ref); - } - - template< - typename T0, typename T1, typename T2, - typename T3, typename T4, typename T5, - typename T6, typename T7, typename T8, - typename T9 - > - __host__ 
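// --- [editor's illustrative sketch, not part of this patch] ---
// raw_reference_cast, whose tuple machinery is being removed here, unwraps
// system references (and tuples of them, as produced by zip_iterator) into
// plain references. The sketch below mirrors the compare_first functor seen
// earlier in this patch; first_less and the included detail header reflect
// the era of this code and are illustrative:
#include <thrust/tuple.h>
#include <thrust/detail/raw_reference_cast.h>

struct first_less
{
    template <typename Tuple1, typename Tuple2>
    __host__ __device__
    bool operator()(const Tuple1 &x, const Tuple2 &y) const
    {
        // unwrap wrapped references before comparing the first tuple elements
        return thrust::raw_reference_cast(thrust::get<0>(x)) <
               thrust::raw_reference_cast(thrust::get<0>(y));
    }
};
// --- end editor's sketch ---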
__device__ - typename detail::raw_reference< - thrust::detail::tuple_of_iterator_references - >::type - operator()(thrust::detail::tuple_of_iterator_references t, - typename enable_if< - is_unwrappable >::value - >::type * = 0) - { - return thrust::raw_reference_cast(t); - } -}; // end raw_reference_caster - - -} // end detail - - -template< - typename T0, typename T1, typename T2, - typename T3, typename T4, typename T5, - typename T6, typename T7, typename T8, - typename T9 -> -__host__ __device__ -typename detail::enable_if_unwrappable< - thrust::detail::tuple_of_iterator_references, - typename detail::raw_reference< - thrust::detail::tuple_of_iterator_references - >::type ->::type -raw_reference_cast(thrust::detail::tuple_of_iterator_references t) -{ - thrust::detail::raw_reference_caster f; - - // note that we pass raw_reference_tuple_helper, not raw_reference as the unary metafunction - // the subtle difference is important - return thrust::detail::tuple_host_device_transform(t, f); -} // end raw_reference_cast - - -} // end thrust - diff --git a/compat/thrust/detail/reduce.inl b/compat/thrust/detail/reduce.inl deleted file mode 100644 index ba84423475..0000000000 --- a/compat/thrust/detail/reduce.inl +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce.inl - * \brief Inline file for reduce.h. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - typename thrust::iterator_traits::value_type - reduce(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last) -{ - using thrust::system::detail::generic::reduce; - return reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end reduce() - - -template - T reduce(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - T init) -{ - using thrust::system::detail::generic::reduce; - return reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init); -} // end reduce() - - -template - T reduce(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - T init, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::reduce; - return reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init, binary_op); -} // end reduce() - - -template - thrust::pair - reduce_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output) -{ - using thrust::system::detail::generic::reduce_by_key; - return reduce_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output); -} // end reduce_by_key() - - -template - thrust::pair - reduce_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::reduce_by_key; - return reduce_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output, binary_pred); -} // end reduce_by_key() - - -template - thrust::pair - reduce_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::reduce_by_key; - return reduce_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); -} // end reduce_by_key() - - -template -typename thrust::iterator_traits::value_type - reduce(InputIterator first, - InputIterator last) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::reduce(select_system(system), first, last); -} - -template - T reduce(InputIterator first, - InputIterator last, - T init) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::reduce(select_system(system), first, last, init); -} - - -template - T reduce(InputIterator first, - InputIterator last, - T init, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::reduce(select_system(system), 
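// --- [editor's illustrative sketch, not part of this patch] ---
// reduce.inl, deleted in this hunk, backs thrust::reduce. A short usage
// sketch of the plain and the init/binary-op overloads (names illustrative):
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>

void reduce_demo()
{
    int data[] = {3, 1, 4, 1, 5};
    thrust::device_vector<int> d(data, data + 5);

    int sum = thrust::reduce(d.begin(), d.end());                            // 14
    int mx  = thrust::reduce(d.begin(), d.end(), 0, thrust::maximum<int>()); // 5
    (void)sum; (void)mx;
}
// --- end editor's sketch ---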
first, last, init, binary_op); -} - -template - thrust::pair - reduce_by_key(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::reduce_by_key(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output); -} - -template - thrust::pair - reduce_by_key(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::reduce_by_key(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred); -} - -template - thrust::pair - reduce_by_key(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::reduce_by_key(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); -} - -} // end namespace thrust - diff --git a/compat/thrust/detail/reference.h b/compat/thrust/detail/reference.h deleted file mode 100644 index 8c0b06186f..0000000000 --- a/compat/thrust/detail/reference.h +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - - -namespace thrust -{ -namespace detail -{ - -template struct is_wrapped_reference; - -} - -// the base type for all of thrust's system-annotated references. -// for reasonable reference-like semantics, derived types must reimplement the following: -// 1. constructor from pointer -// 2. copy constructor -// 3. 
templated copy constructor from other reference -// 4. templated assignment from other reference -// 5. assignment from value_type -template - class reference -{ - private: - typedef typename thrust::detail::eval_if< - thrust::detail::is_same::value, - thrust::detail::identity_, - thrust::detail::identity_ - >::type derived_type; - - // hint for is_wrapped_reference lets it know that this type (or a derived type) - // is a wrapped reference - struct wrapped_reference_hint {}; - template friend struct thrust::detail::is_wrapped_reference; - - public: - typedef Pointer pointer; - typedef typename thrust::detail::remove_const::type value_type; - - __host__ __device__ - explicit reference(const pointer &ptr); - - template - __host__ __device__ - reference(const reference &other, - typename thrust::detail::enable_if_convertible< - typename reference::pointer, - pointer - >::type * = 0); - - __host__ __device__ - derived_type &operator=(const reference &other); - - // XXX this may need an enable_if - template - __host__ __device__ - derived_type &operator=(const reference &other); - - __host__ __device__ - derived_type &operator=(const value_type &x); - - __host__ __device__ - pointer operator&() const; - - __host__ __device__ - operator value_type () const; - - __host__ __device__ - void swap(derived_type &other); - - derived_type &operator++(); - - value_type operator++(int); - - // XXX parameterize the type of rhs - derived_type &operator+=(const value_type &rhs); - - derived_type &operator--(); - - value_type operator--(int); - - // XXX parameterize the type of rhs - derived_type &operator-=(const value_type &rhs); - - // XXX parameterize the type of rhs - derived_type &operator*=(const value_type &rhs); - - // XXX parameterize the type of rhs - derived_type &operator/=(const value_type &rhs); - - // XXX parameterize the type of rhs - derived_type &operator%=(const value_type &rhs); - - // XXX parameterize the type of rhs - derived_type &operator<<=(const value_type &rhs); - - // XXX parameterize the type of rhs - derived_type &operator>>=(const value_type &rhs); - - // XXX parameterize the type of rhs - derived_type &operator&=(const value_type &rhs); - - // XXX parameterize the type of rhs - derived_type &operator|=(const value_type &rhs); - - // XXX parameterize the type of rhs - derived_type &operator^=(const value_type &rhs); - - private: - const pointer m_ptr; - - // allow access to m_ptr for other references - template friend class reference; - - template - __host__ __device__ - inline value_type strip_const_get_value(const System &system) const; - - template - __host__ __device__ - inline void assign_from(OtherPointer src); - - // XXX this helper exists only to avoid warnings about null references from the other assign_from - template - inline __host__ __device__ - void assign_from(System1 *system1, System2 *system2, OtherPointer src); - - template - __host__ __device__ - inline void strip_const_assign_value(const System &system, OtherPointer src); - - // XXX this helper exists only to avoid warnings about null references from the other swap - template - inline __host__ __device__ - void swap(System *system, derived_type &other); - - // XXX this helper exists only to avoid warnings about null references from operator value_type () - template - inline __host__ __device__ - value_type convert_to_value_type(System *system) const; -}; // end reference - - -} // end thrust - -#include - diff --git a/compat/thrust/detail/reference.inl b/compat/thrust/detail/reference.inl deleted file mode 
100644 index 8b55edb712..0000000000 --- a/compat/thrust/detail/reference.inl +++ /dev/null @@ -1,361 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace thrust -{ - - -template - template - reference - ::reference(const reference &other, - typename thrust::detail::enable_if_convertible< - typename reference::pointer, - pointer - >::type *) - : m_ptr(other.m_ptr) -{} - - -template - reference - ::reference(const pointer &ptr) - : m_ptr(ptr) -{} - - -template - typename reference::pointer - reference - ::operator&() const -{ - return m_ptr; -} // end reference::operator&() - - -template - typename reference::derived_type & - reference - ::operator=(const value_type &v) -{ - assign_from(&v); - return static_cast(*this); -} // end reference::operator=() - - -template - typename reference::derived_type & - reference - ::operator=(const reference &other) -{ - assign_from(&other); - return static_cast(*this); -} // end reference::operator=() - - -template - template - typename reference::derived_type & - reference - ::operator=(const reference &other) -{ - assign_from(&other); - return static_cast(*this); -} // end reference::operator=() - - -template - template - typename reference::value_type - reference - ::convert_to_value_type(System *system) const -{ - using thrust::system::detail::generic::select_system; - return strip_const_get_value(select_system(*system)); -} // end convert_to_value_type() - - -template - reference - ::operator typename reference::value_type () const -{ - typedef typename thrust::iterator_system::type System; - - // XXX avoid default-constructing a system - // XXX use null a reference for dispatching - // XXX this assumes that the eventual invocation of - // XXX get_value will not access system state - System *system = 0; - - return convert_to_value_type(system); -} // end reference::operator value_type () - - -template - template - typename reference::value_type - reference - ::strip_const_get_value(const System &system) const -{ - System &non_const_system = const_cast(system); - - using thrust::system::detail::generic::get_value; - - return get_value(thrust::detail::derived_cast(non_const_system), m_ptr); -} // end reference::strip_const_get_value() - - -template - template - void reference - ::assign_from(System1 *system1, System2 *system2, OtherPointer src) -{ - using thrust::system::detail::generic::select_system; - - strip_const_assign_value(select_system(*system1, *system2), src); -} // end assign_from() - - -template - template - void reference - ::assign_from(OtherPointer src) -{ - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - // XXX avoid default-constructing a system - // XXX use null references for dispatching - // XXX this assumes that the eventual invocation of - // XXX assign_value will not access system state - 
System1 *system1 = 0; - System2 *system2 = 0; - - assign_from(system1, system2, src); -} // end assign_from() - - -template - template - void reference - ::strip_const_assign_value(const System &system, OtherPointer src) -{ - System &non_const_system = const_cast(system); - - using thrust::system::detail::generic::assign_value; - - assign_value(thrust::detail::derived_cast(non_const_system), m_ptr, src); -} // end strip_const_assign_value() - - -template - template - void reference - ::swap(System *system, derived_type &other) -{ - using thrust::system::detail::generic::select_system; - using thrust::system::detail::generic::iter_swap; - - iter_swap(select_system(*system, *system), m_ptr, other.m_ptr); -} // end reference::swap() - - -template - void reference - ::swap(derived_type &other) -{ - typedef typename thrust::iterator_system::type System; - - // XXX avoid default-constructing a system - // XXX use null references for dispatching - // XXX this assumes that the eventual invocation - // XXX of iter_swap will not access system state - System *system = 0; - - swap(system, other); -} // end reference::swap() - - -template - typename reference::derived_type & - reference - ::operator++(void) -{ - value_type temp = *this; - ++temp; - *this = temp; - return static_cast(*this); -} // end reference::operator++() - - -template - typename reference::value_type - reference - ::operator++(int) -{ - value_type temp = *this; - value_type result = temp++; - *this = temp; - return result; -} // end reference::operator++() - - -template - typename reference::derived_type & - reference - ::operator+=(const value_type &rhs) -{ - value_type temp = *this; - temp += rhs; - *this = temp; - return static_cast(*this); -} // end reference::operator+=() - -template - typename reference::derived_type & - reference - ::operator--(void) -{ - value_type temp = *this; - --temp; - *this = temp; - return static_cast(*this); -} // end reference::operator--() - -template - typename reference::value_type - reference - ::operator--(int) -{ - value_type temp = *this; - value_type result = temp--; - *this = temp; - return result; -} // end reference::operator--() - -template - typename reference::derived_type & - reference - ::operator-=(const value_type &rhs) -{ - value_type temp = *this; - temp -= rhs; - *this = temp; - return static_cast(*this); -} // end reference::operator-=() - -template - typename reference::derived_type & - reference - ::operator*=(const value_type &rhs) -{ - value_type temp = *this; - temp *= rhs; - *this = temp; - return static_cast(*this); -} // end reference::operator*=() - -template - typename reference::derived_type & - reference - ::operator/=(const value_type &rhs) -{ - value_type temp = *this; - temp /= rhs; - *this = temp; - return static_cast(*this); -} // end reference::operator/=() - -template - typename reference::derived_type & - reference - ::operator%=(const value_type &rhs) -{ - value_type temp = *this; - temp %= rhs; - *this = temp; - return static_cast(*this); -} // end reference::operator%=() - -template - typename reference::derived_type & - reference - ::operator<<=(const value_type &rhs) -{ - value_type temp = *this; - temp <<= rhs; - *this = temp; - return static_cast(*this); -} // end reference::operator<<=() - -template - typename reference::derived_type & - reference - ::operator>>=(const value_type &rhs) -{ - value_type temp = *this; - temp >>= rhs; - *this = temp; - return static_cast(*this); -} // end reference::operator>>=() - -template - typename 
reference::derived_type & - reference - ::operator&=(const value_type &rhs) -{ - value_type temp = *this; - temp &= rhs; - *this = temp; - return static_cast(*this); -} // end reference::operator&=() - -template - typename reference::derived_type & - reference - ::operator|=(const value_type &rhs) -{ - value_type temp = *this; - temp |= rhs; - *this = temp; - return static_cast(*this); -} // end reference::operator|=() - -template - typename reference::derived_type & - reference - ::operator^=(const value_type &rhs) -{ - value_type temp = *this; - temp ^= rhs; - *this = temp; - return static_cast(*this); -} // end reference::operator^=() - - -} // end thrust - diff --git a/compat/thrust/detail/reference_forward_declaration.h b/compat/thrust/detail/reference_forward_declaration.h deleted file mode 100644 index 60524d3ec2..0000000000 --- a/compat/thrust/detail/reference_forward_declaration.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -template class reference; - -} // end thrust - diff --git a/compat/thrust/detail/remove.inl b/compat/thrust/detail/remove.inl deleted file mode 100644 index 5675243fbd..0000000000 --- a/compat/thrust/detail/remove.inl +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file remove.inl - * \brief Inline file for remove.h. 
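Here compat/thrust/detail/remove.inl starts to go away; it only forwards thrust::remove / remove_if / remove_copy_if to the chosen backend through select_system. A minimal illustrative sketch of remove_if follows (not part of the patch; the is_negative functor and sample values are invented for the example):

#include <thrust/device_vector.h>
#include <thrust/remove.h>

struct is_negative
{
    __host__ __device__ bool operator()(int x) const { return x < 0; }
};

int main()
{
    thrust::device_vector<int> v(5);
    v[0] = 3; v[1] = -1; v[2] = 4; v[3] = -1; v[4] = 5;

    // remove_if compacts the kept elements to the front and returns the new end;
    // like std::remove_if it does not shrink the container by itself.
    thrust::device_vector<int>::iterator new_end =
        thrust::remove_if(v.begin(), v.end(), is_negative());
    v.erase(new_end, v.end());                               // v is now {3, 4, 5}
    return 0;
}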
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - ForwardIterator remove(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T &value) -{ - using thrust::system::detail::generic::remove; - return remove(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, value); -} // end remove() - - -template - OutputIterator remove_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - const T &value) -{ - using thrust::system::detail::generic::remove_copy; - return remove_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, value); -} // end remove_copy() - - -template - ForwardIterator remove_if(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::remove_if; - return remove_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred); -} // end remove_if() - - -template - OutputIterator remove_copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred) -{ - using thrust::system::detail::generic::remove_copy_if; - return remove_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, pred); -} // end remove_copy_if() - - -template - ForwardIterator remove_if(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - using thrust::system::detail::generic::remove_if; - return remove_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred); -} // end remove_if() - - -template - OutputIterator remove_copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - using thrust::system::detail::generic::remove_copy_if; - return remove_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, pred); -} // end remove_copy_if() - - -template - ForwardIterator remove(ForwardIterator first, - ForwardIterator last, - const T &value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::remove(select_system(system), first, last, value); -} // end remove() - - -template - OutputIterator remove_copy(InputIterator first, - InputIterator last, - OutputIterator result, - const T &value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::remove_copy(select_system(system1,system2), first, last, result, value); -} // end remove_copy() - - -template - ForwardIterator remove_if(ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::remove_if(select_system(system), first, last, pred); -} // end remove_if() - - -template - ForwardIterator remove_if(ForwardIterator first, - ForwardIterator last, - 
InputIterator stencil, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::remove_if(select_system(system1,system2), first, last, stencil, pred); -} // end remove_if() - - -template - OutputIterator remove_copy_if(InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::remove_copy_if(select_system(system1,system2), first, last, result, pred); -} // end remove_copy_if() - - -template - OutputIterator remove_copy_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::remove_copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred); -} // end remove_copy_if() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/replace.inl b/compat/thrust/detail/replace.inl deleted file mode 100644 index 1eaf24d621..0000000000 --- a/compat/thrust/detail/replace.inl +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file replace.inl - * \brief Inline file for replace.h. 
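Next, compat/thrust/detail/replace.inl is dropped; it is the same kind of forwarding layer for the replace family (replace, replace_if, replace_copy, replace_copy_if). An illustrative sketch, not part of the patch, of the two simplest forms; it assumes <thrust/replace.h> and Thrust's placeholder expressions from <thrust/functional.h>:

#include <thrust/device_vector.h>
#include <thrust/replace.h>
#include <thrust/functional.h>

int main()
{
    thrust::device_vector<int> v(4);
    v[0] = 1; v[1] = 99; v[2] = 99; v[3] = 2;

    // replace(first, last, old_value, new_value)
    thrust::replace(v.begin(), v.end(), 99, 0);              // {1, 0, 0, 2}

    // replace_if(first, last, pred, new_value): zero out anything greater than 1
    thrust::replace_if(v.begin(), v.end(),
                       thrust::placeholders::_1 > 1, 0);     // {1, 0, 0, 0}
    return 0;
}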
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - void replace(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, ForwardIterator last, - const T &old_value, - const T &new_value) -{ - using thrust::system::detail::generic::replace; - return replace(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, old_value, new_value); -} // end replace() - - -template - void replace_if(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, ForwardIterator last, - Predicate pred, - const T &new_value) -{ - using thrust::system::detail::generic::replace_if; - return replace_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, pred, new_value); -} // end replace_if() - - -template - void replace_if(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, ForwardIterator last, - InputIterator stencil, - Predicate pred, - const T &new_value) -{ - using thrust::system::detail::generic::replace_if; - return replace_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, pred, new_value); -} // end replace_if() - - -template - OutputIterator replace_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - OutputIterator result, - const T &old_value, - const T &new_value) -{ - using thrust::system::detail::generic::replace_copy; - return replace_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, old_value, new_value); -} // end replace_copy() - - -template - OutputIterator replace_copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - OutputIterator result, - Predicate pred, - const T &new_value) -{ - using thrust::system::detail::generic::replace_copy_if; - return replace_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, pred, new_value); -} // end replace_copy_if() - - -template - OutputIterator replace_copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred, - const T &new_value) -{ - using thrust::system::detail::generic::replace_copy_if; - return replace_copy_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, pred, new_value); -} // end replace_copy_if() - - -template - OutputIterator replace_copy_if(InputIterator first, InputIterator last, - OutputIterator result, - Predicate pred, - const T &new_value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::replace_copy_if(select_system(system1,system2), first, last, result, pred, new_value); -} // end replace_copy_if() - - -template - OutputIterator replace_copy_if(InputIterator1 first, InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred, - const T &new_value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return 
thrust::replace_copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred, new_value); -} // end replace_copy_if() - - -template - OutputIterator replace_copy(InputIterator first, InputIterator last, - OutputIterator result, - const T &old_value, - const T &new_value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::replace_copy(select_system(system1,system2), first, last, result, old_value, new_value); -} // end replace_copy() - - -template - void replace_if(ForwardIterator first, ForwardIterator last, - Predicate pred, - const T &new_value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::replace_if(select_system(system), first, last, pred, new_value); -} // end replace_if() - - -template - void replace_if(ForwardIterator first, ForwardIterator last, - InputIterator stencil, - Predicate pred, - const T &new_value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::replace_if(select_system(system1,system2), first, last, stencil, pred, new_value); -} // end replace_if() - - -template - void replace(ForwardIterator first, ForwardIterator last, - const T &old_value, - const T &new_value) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::replace(select_system(system), first, last, old_value, new_value); -} // end replace() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/reverse.inl b/compat/thrust/detail/reverse.inl deleted file mode 100644 index 18c26c00e6..0000000000 --- a/compat/thrust/detail/reverse.inl +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reverse.inl - * \brief Inline file for reverse.h. 
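compat/thrust/detail/reverse.inl follows. A short illustrative sketch of reverse and reverse_copy (not part of the patch); thrust::sequence, whose own forwarding file is removed further down in this patch, is used here only to fill the input:

#include <thrust/device_vector.h>
#include <thrust/reverse.h>
#include <thrust/sequence.h>

int main()
{
    thrust::device_vector<int> v(5);
    thrust::sequence(v.begin(), v.end());                    // {0, 1, 2, 3, 4}

    thrust::device_vector<int> r(5);
    thrust::reverse_copy(v.begin(), v.end(), r.begin());     // r = {4, 3, 2, 1, 0}
    thrust::reverse(v.begin(), v.end());                     // v reversed in place
    return 0;
}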
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - void reverse(const thrust::detail::execution_policy_base &exec, - BidirectionalIterator first, - BidirectionalIterator last) -{ - using thrust::system::detail::generic::reverse; - return reverse(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end reverse() - - -template - OutputIterator reverse_copy(const thrust::detail::execution_policy_base &exec, - BidirectionalIterator first, - BidirectionalIterator last, - OutputIterator result) -{ - using thrust::system::detail::generic::reverse_copy; - return reverse_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); -} // end reverse_copy() - - -template - void reverse(BidirectionalIterator first, - BidirectionalIterator last) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::reverse(select_system(system), first, last); -} // end reverse() - - -template - OutputIterator reverse_copy(BidirectionalIterator first, - BidirectionalIterator last, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::reverse_copy(select_system(system1,system2), first, last, result); -} // end reverse_copy() - - -} // end thrust - diff --git a/compat/thrust/detail/scan.inl b/compat/thrust/detail/scan.inl deleted file mode 100644 index 3e5fd9b4f7..0000000000 --- a/compat/thrust/detail/scan.inl +++ /dev/null @@ -1,502 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file scan.inl - * \brief Inline file for scan.h. 
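compat/thrust/detail/scan.inl, removed next, forwards the prefix-sum entry points (inclusive_scan, exclusive_scan and their *_by_key variants). An illustrative sketch of the two basic forms, not part of the patch:

#include <thrust/device_vector.h>
#include <thrust/scan.h>

int main()
{
    thrust::device_vector<int> in(4, 1);                     // {1, 1, 1, 1}
    thrust::device_vector<int> out(4);

    // inclusive scan: out[i] = in[0] + ... + in[i]          -> {1, 2, 3, 4}
    thrust::inclusive_scan(in.begin(), in.end(), out.begin());

    // exclusive scan with init 10: out[i] = 10 + sum(in[0..i-1]) -> {10, 11, 12, 13}
    thrust::exclusive_scan(in.begin(), in.end(), out.begin(), 10);
    return 0;
}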
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - OutputIterator inclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - using thrust::system::detail::generic::inclusive_scan; - return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); -} // end inclusive_scan() - - -template - OutputIterator inclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op) -{ - using thrust::system::detail::generic::inclusive_scan; - return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, binary_op); -} // end inclusive_scan() - - -template - OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - using thrust::system::detail::generic::exclusive_scan; - return exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); -} // end exclusive_scan() - - -template - OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init) -{ - using thrust::system::detail::generic::exclusive_scan; - return exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, init); -} // end exclusive_scan() - - -template - OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init, - AssociativeOperator binary_op) -{ - using thrust::system::detail::generic::exclusive_scan; - return exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, init, binary_op); -} // end exclusive_scan() - - -template - OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result) -{ - using thrust::system::detail::generic::inclusive_scan_by_key; - return inclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result); -} // end inclusive_scan_by_key() - - -template - OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::inclusive_scan_by_key; - return inclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, binary_pred); -} // end inclusive_scan_by_key() - - -template - OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred, - AssociativeOperator binary_op) -{ - using thrust::system::detail::generic::inclusive_scan_by_key; - return inclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, binary_pred, binary_op); -} // end inclusive_scan_by_key() - - -template - OutputIterator exclusive_scan_by_key(const 
thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result) -{ - using thrust::system::detail::generic::exclusive_scan_by_key; - return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result); -} // end exclusive_scan_by_key() - - -template - OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init) -{ - using thrust::system::detail::generic::exclusive_scan_by_key; - return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, init); -} // end exclusive_scan_by_key() - - -template - OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::exclusive_scan_by_key; - return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, init, binary_pred); -} // end exclusive_scan_by_key() - - -template - OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred, - AssociativeOperator binary_op) -{ - using thrust::system::detail::generic::exclusive_scan_by_key; - return exclusive_scan_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, init, binary_pred, binary_op); -} // end exclusive_scan_by_key() - - -template - OutputIterator inclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::inclusive_scan(select_system(system1,system2), first, last, result); -} // end inclusive_scan() - - -template - OutputIterator inclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::inclusive_scan(select_system(system1,system2), first, last, result, binary_op); -} // end inclusive_scan() - - -template - OutputIterator exclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::exclusive_scan(select_system(system1,system2), first, last, result); -} // end exclusive_scan() - - -template - OutputIterator exclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - T init) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - 
System2 system2; - - return thrust::exclusive_scan(select_system(system1,system2), first, last, result, init); -} // end exclusive_scan() - - -template - OutputIterator exclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - T init, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::exclusive_scan(select_system(system1,system2), first, last, result, init, binary_op); -} // end exclusive_scan() - - -template - OutputIterator inclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::inclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result); -} - - -template - OutputIterator inclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::inclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, binary_pred); -} - - -template - OutputIterator inclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred, - AssociativeOperator binary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::inclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, binary_pred, binary_op); -} - - -template - OutputIterator exclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result); -} - - -template - OutputIterator exclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, 
last1, first2, result, init); -} - - -template - OutputIterator exclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, init, binary_pred); -} - - -template - OutputIterator exclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred, - AssociativeOperator binary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::exclusive_scan_by_key(select_system(system1,system2,system3), first1, last1, first2, result, init, binary_pred, binary_op); -} - - -} // end namespace thrust - diff --git a/compat/thrust/detail/scatter.inl b/compat/thrust/detail/scatter.inl deleted file mode 100644 index 934addb727..0000000000 --- a/compat/thrust/detail/scatter.inl +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file scatter.inl - * \brief Inline file for scatter.h. 
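compat/thrust/detail/scatter.inl is the forwarding layer for scatter and scatter_if. A minimal illustrative sketch of scatter (not part of the patch; the map and value contents are arbitrary):

#include <thrust/device_vector.h>
#include <thrust/scatter.h>

int main()
{
    thrust::device_vector<int> values(4), map(4), output(4, 0);
    values[0] = 10; values[1] = 20; values[2] = 30; values[3] = 40;
    map[0]    = 3;  map[1]    = 0;  map[2]    = 2;  map[3]    = 1;

    // output[map[i]] = values[i]  ->  output = {20, 40, 30, 10}
    thrust::scatter(values.begin(), values.end(), map.begin(), output.begin());
    return 0;
}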
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - void scatter(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - RandomAccessIterator output) -{ - using thrust::system::detail::generic::scatter; - return scatter(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, map, output); -} // end scatter() - - -template - void scatter_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output) -{ - using thrust::system::detail::generic::scatter_if; - return scatter_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, map, stencil, output); -} // end scatter_if() - - -template - void scatter_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output, - Predicate pred) -{ - using thrust::system::detail::generic::scatter_if; - return scatter_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, map, stencil, output, pred); -} // end scatter_if() - - -template - void scatter(InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - RandomAccessIterator output) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::scatter(select_system(system1,system2,system3), first, last, map, output); -} // end scatter() - - -template - void scatter_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::scatter_if(select_system(system1,system2,system3,system4), first, last, map, stencil, output); -} // end scatter_if() - - -template - void scatter_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::scatter_if(select_system(system1,system2,system3,system4), first, last, map, stencil, output, pred); -} // end scatter_if() - -} // end namespace thrust - diff --git a/compat/thrust/detail/sequence.inl b/compat/thrust/detail/sequence.inl deleted file mode 100644 index f1741877f8..0000000000 --- a/compat/thrust/detail/sequence.inl +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this 
file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file sequence.inl - * \brief Inline file for sequence.h. - */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - void sequence(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last) -{ - using thrust::system::detail::generic::sequence; - return sequence(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end sequence() - - -template - void sequence(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - T init) -{ - using thrust::system::detail::generic::sequence; - return sequence(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init); -} // end sequence() - - -template - void sequence(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - T init, - T step) -{ - using thrust::system::detail::generic::sequence; - return sequence(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, init, step); -} // end sequence() - - -template - void sequence(ForwardIterator first, - ForwardIterator last) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::sequence(select_system(system), first, last); -} // end sequence() - - -template - void sequence(ForwardIterator first, - ForwardIterator last, - T init) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::sequence(select_system(system), first, last, init); -} // end sequence() - - -template - void sequence(ForwardIterator first, - ForwardIterator last, - T init, - T step) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::sequence(select_system(system), first, last, init, step); -} // end sequence() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/set_operations.inl b/compat/thrust/detail/set_operations.inl deleted file mode 100644 index daec46156d..0000000000 --- a/compat/thrust/detail/set_operations.inl +++ /dev/null @@ -1,836 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file set_operations.inl - * \brief Inline file for set_operations.h. 
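Finally for this stretch of the patch, compat/thrust/detail/sequence.inl and compat/thrust/detail/set_operations.inl are removed. The set algorithms operate on already-sorted ranges; an illustrative sketch of set_intersection (not part of the patch; the input values are arbitrary):

#include <thrust/device_vector.h>
#include <thrust/set_operations.h>

int main()
{
    // Both inputs must already be sorted.
    int a[] = {1, 3, 5, 7};
    int b[] = {3, 4, 5, 6};
    thrust::device_vector<int> A(a, a + 4), B(b, b + 4), C(4);

    // C receives {3, 5}; the returned iterator marks the end of the output.
    thrust::device_vector<int>::iterator end =
        thrust::set_intersection(A.begin(), A.end(), B.begin(), B.end(), C.begin());
    C.resize(end - C.begin());
    return 0;
}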
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - OutputIterator set_difference(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - using thrust::system::detail::generic::set_difference; - return set_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result); -} // end set_difference() - - -template - OutputIterator set_difference(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp) -{ - using thrust::system::detail::generic::set_difference; - return set_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp); -} // end set_difference() - - -template - thrust::pair - set_difference_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - using thrust::system::detail::generic::set_difference_by_key; - return set_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); -} // end set_difference_by_key() - - -template - thrust::pair - set_difference_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp) -{ - using thrust::system::detail::generic::set_difference_by_key; - return set_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); -} // end set_difference_by_key() - - -template - OutputIterator set_intersection(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - using thrust::system::detail::generic::set_intersection; - return set_intersection(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result); -} // end set_intersection() - - -template - OutputIterator set_intersection(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp) -{ - using thrust::system::detail::generic::set_intersection; - return set_intersection(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp); -} // end set_intersection() - - -template - thrust::pair - set_intersection_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 
values_result) -{ - using thrust::system::detail::generic::set_intersection_by_key; - return set_intersection_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result); -} // end set_intersection_by_key() - - -template - thrust::pair - set_intersection_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp) -{ - using thrust::system::detail::generic::set_intersection_by_key; - return set_intersection_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result, comp); -} // end set_intersection_by_key() - - -template - OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - using thrust::system::detail::generic::set_symmetric_difference; - return set_symmetric_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result); -} // end set_symmetric_difference() - - -template - OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp) -{ - using thrust::system::detail::generic::set_symmetric_difference; - return set_symmetric_difference(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp); -} // end set_symmetric_difference() - - -template - thrust::pair - set_symmetric_difference_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - using thrust::system::detail::generic::set_symmetric_difference_by_key; - return set_symmetric_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); -} // end set_symmetric_difference_by_key() - - -template - thrust::pair - set_symmetric_difference_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp) -{ - using thrust::system::detail::generic::set_symmetric_difference_by_key; - return set_symmetric_difference_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); -} // end set_symmetric_difference_by_key() - - -template - OutputIterator set_union(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 
last2, - OutputIterator result) -{ - using thrust::system::detail::generic::set_union; - return set_union(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result); -} // end set_union() - - -template - OutputIterator set_union(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp) -{ - using thrust::system::detail::generic::set_union; - return set_union(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, last2, result, comp); -} // end set_union() - - -template - thrust::pair - set_union_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - using thrust::system::detail::generic::set_union_by_key; - return set_union_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); -} // end set_union_by_key() - - -template - thrust::pair - set_union_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp) -{ - using thrust::system::detail::generic::set_union_by_key; - return set_union_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); -} // end set_union_by_key() - - -template - OutputIterator set_difference(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::set_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp); -} // end set_difference() - - -template - OutputIterator set_difference(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::set_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result); -} // end set_difference() - - -template - thrust::pair - set_difference_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - using 
thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - typedef typename thrust::iterator_system::type System5; - typedef typename thrust::iterator_system::type System6; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - System5 system5; - System6 system6; - - return thrust::set_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); -} // end set_difference_by_key() - - -template - thrust::pair - set_difference_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - typedef typename thrust::iterator_system::type System5; - typedef typename thrust::iterator_system::type System6; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - System5 system5; - System6 system6; - - return thrust::set_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); -} // end set_difference_by_key() - - -template - OutputIterator set_intersection(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::set_intersection(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp); -} // end set_intersection() - - -template - OutputIterator set_intersection(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::set_intersection(select_system(system1,system2,system3), first1, last1, first2, last2, result); -} // end set_intersection() - - -template - thrust::pair - set_intersection_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type 
System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - typedef typename thrust::iterator_system::type System5; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - System5 system5; - - return thrust::set_intersection_by_key(select_system(system1,system2,system3,system4,system5), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result, comp); -} // end set_intersection_by_key() - - -template - thrust::pair - set_intersection_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - typedef typename thrust::iterator_system::type System5; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - System5 system5; - - return thrust::set_intersection_by_key(select_system(system1,system2,system3,system4,system5), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result); -} // end set_intersection_by_key() - - -template - OutputIterator set_symmetric_difference(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::set_symmetric_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp); -} // end set_symmetric_difference() - - -template - OutputIterator set_symmetric_difference(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::set_symmetric_difference(select_system(system1,system2,system3), first1, last1, first2, last2, result); -} // end set_symmetric_difference() - - -template - thrust::pair - set_symmetric_difference_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - typedef typename thrust::iterator_system::type System5; - typedef typename thrust::iterator_system::type System6; - - System1 system1; - System2 system2; - System3 
system3; - System4 system4; - System5 system5; - System6 system6; - - return thrust::set_symmetric_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); -} // end set_symmetric_difference_by_key() - - -template - thrust::pair - set_symmetric_difference_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - typedef typename thrust::iterator_system::type System5; - typedef typename thrust::iterator_system::type System6; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - System5 system5; - System6 system6; - - return thrust::set_symmetric_difference_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); -} // end set_symmetric_difference_by_key() - - -template - OutputIterator set_union(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::set_union(select_system(system1,system2,system3), first1, last1, first2, last2, result, comp); -} // end set_union() - - -template - OutputIterator set_union(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::set_union(select_system(system1,system2,system3), first1, last1, first2, last2, result); -} // end set_union() - - -template - thrust::pair - set_union_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - typedef typename thrust::iterator_system::type System5; - typedef typename thrust::iterator_system::type System6; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - System5 system5; - System6 system6; - - return 
thrust::set_union_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); -} // end set_union_by_key() - - -template - thrust::pair - set_union_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - typedef typename thrust::iterator_system::type System5; - typedef typename thrust::iterator_system::type System6; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - System5 system5; - System6 system6; - - return thrust::set_union_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result); -} // end set_union_by_key() - - -} // end thrust - diff --git a/compat/thrust/detail/sort.inl b/compat/thrust/detail/sort.inl deleted file mode 100644 index 08be55a8ce..0000000000 --- a/compat/thrust/detail/sort.inl +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file sort.inl - * \brief Inline file for sort.h. 
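For context: the set_operations.inl hunk above removes only the thin dispatch layer that forwards thrust::set_difference, set_intersection, set_symmetric_difference and set_union (plus their _by_key variants) to the backend chosen by select_system; the algorithms themselves live in the system backends. A minimal host-side sketch of how these entry points are typically called, assuming the stock Thrust headers from the CUDA toolkit rather than this vendored compat copy (data values are illustrative):

#include <thrust/set_operations.h>
#include <cstdio>

int main()
{
    // Both inputs must already be sorted by the same ordering.
    int a[] = {1, 3, 5, 7, 9};
    int b[] = {3, 4, 5, 6, 7};
    int result[5];

    // Keep elements present in both ranges; the return value marks the end of the output.
    int *end = thrust::set_intersection(a, a + 5, b, b + 5, result);

    for (int *it = result; it != end; ++it)
        printf("%d ", *it);   // prints: 3 5 7
    printf("\n");
    return 0;
}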
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - void sort(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator first, - RandomAccessIterator last) -{ - using thrust::system::detail::generic::sort; - return sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end sort() - - -template - void sort(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::sort; - return sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); -} // end sort() - - -template - void stable_sort(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator first, - RandomAccessIterator last) -{ - using thrust::system::detail::generic::stable_sort; - return stable_sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end stable_sort() - - -template - void stable_sort(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::stable_sort; - return stable_sort(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); -} // end stable_sort() - - -template - void sort_by_key(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - using thrust::system::detail::generic::sort_by_key; - return sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first); -} // end sort_by_key() - - -template - void sort_by_key(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::sort_by_key; - return sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, comp); -} // end sort_by_key() - - -template - void stable_sort_by_key(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - using thrust::system::detail::generic::stable_sort_by_key; - return stable_sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first); -} // end stable_sort_by_key() - - -template - void stable_sort_by_key(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::stable_sort_by_key; - return stable_sort_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, comp); -} // end stable_sort_by_key() - - -template - bool is_sorted(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last) -{ - using thrust::system::detail::generic::is_sorted; - return is_sorted(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end is_sorted() - - -template - bool is_sorted(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - 
Compare comp) -{ - using thrust::system::detail::generic::is_sorted; - return is_sorted(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); -} // end is_sorted() - - -template - ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last) -{ - using thrust::system::detail::generic::is_sorted_until; - return is_sorted_until(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end is_sorted_until() - - -template - ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Compare comp) -{ - using thrust::system::detail::generic::is_sorted_until; - return is_sorted_until(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, comp); -} // end is_sorted_until() - - -/////////////// -// Key Sorts // -/////////////// - -template - void sort(RandomAccessIterator first, - RandomAccessIterator last) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::sort(select_system(system), first, last); -} // end sort() - - -template - void sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::sort(select_system(system), first, last, comp); -} // end sort() - - -template - void stable_sort(RandomAccessIterator first, - RandomAccessIterator last) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::stable_sort(select_system(system), first, last); -} // end stable_sort() - - -template - void stable_sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::stable_sort(select_system(system), first, last, comp); -} // end stable_sort() - - - -///////////////////// -// Key-Value Sorts // -///////////////////// - -template - void sort_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first); -} // end sort_by_key() - - -template - void sort_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first, comp); -} // end sort_by_key() - - -template - void stable_sort_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - using thrust::system::detail::generic::select_system; - - typedef typename 
thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::stable_sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first); -} // end stable_sort_by_key() - - -template - void stable_sort_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::stable_sort_by_key(select_system(system1,system2), keys_first, keys_last, values_first, comp); -} // end stable_sort_by_key() - - -template - bool is_sorted(ForwardIterator first, - ForwardIterator last) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::is_sorted(select_system(system), first, last); -} // end is_sorted() - - -template - bool is_sorted(ForwardIterator first, - ForwardIterator last, - Compare comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::is_sorted(select_system(system), first, last, comp); -} // end is_sorted() - - -template - ForwardIterator is_sorted_until(ForwardIterator first, - ForwardIterator last) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::is_sorted_until(select_system(system), first, last); -} // end is_sorted_until() - - -template - ForwardIterator is_sorted_until(ForwardIterator first, - ForwardIterator last, - Compare comp) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::is_sorted_until(select_system(system), first, last, comp); -} // end is_sorted_until() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/static_assert.h b/compat/thrust/detail/static_assert.h deleted file mode 100644 index ccc084286c..0000000000 --- a/compat/thrust/detail/static_assert.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -/* - * (C) Copyright John Maddock 2000. - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - -// -// Helper macro THRUST_JOIN (based on BOOST_JOIN): -// The following piece of macro magic joins the two -// arguments together, even when one of the arguments is -// itself a macro (see 16.3.1 in C++ standard). 
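Stepping back to the sort.inl hunk a little further above: it likewise removes only forwarding wrappers for sort, stable_sort, sort_by_key, stable_sort_by_key, is_sorted and is_sorted_until. A minimal sketch of those public entry points, assuming stock Thrust compiled as a .cu file with nvcc (keys and values here are illustrative):

#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>

int main()
{
    int raw_keys[] = {4, 1, 3, 2};
    int raw_vals[] = {40, 10, 30, 20};
    thrust::device_vector<int> keys(raw_keys, raw_keys + 4);
    thrust::device_vector<int> vals(raw_vals, raw_vals + 4);

    // Sort the keys and move the associated values with them, using a custom comparator.
    thrust::sort_by_key(keys.begin(), keys.end(), vals.begin(), thrust::greater<int>());

    // keys is now {4, 3, 2, 1}, vals is {40, 30, 20, 10}.
    bool ok = thrust::is_sorted(keys.begin(), keys.end(), thrust::greater<int>());
    return ok ? 0 : 1;
}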
The key -// is that macro expansion of macro arguments does not -// occur in THRUST_DO_JOIN2 but does in THRUST_DO_JOIN. -// -#define THRUST_JOIN( X, Y ) THRUST_DO_JOIN( X, Y ) -#define THRUST_DO_JOIN( X, Y ) THRUST_DO_JOIN2(X,Y) -#define THRUST_DO_JOIN2( X, Y ) X##Y - -namespace thrust -{ - -namespace detail -{ - -// HP aCC cannot deal with missing names for template value parameters -template struct STATIC_ASSERTION_FAILURE; - -template <> struct STATIC_ASSERTION_FAILURE { enum { value = 1 }; }; - -// HP aCC cannot deal with missing names for template value parameters -template struct static_assert_test{}; - -template - struct depend_on_instantiation -{ - static const bool value = x; -}; - -} // end detail - -} // end thrust - -#define THRUST_STATIC_ASSERT( B ) \ - typedef ::thrust::detail::static_assert_test<\ - sizeof(::thrust::detail::STATIC_ASSERTION_FAILURE< (bool)( B ) >)>\ - THRUST_JOIN(thrust_static_assert_typedef_, __LINE__) - diff --git a/compat/thrust/detail/swap.h b/compat/thrust/detail/swap.h deleted file mode 100644 index 9f82ac247a..0000000000 --- a/compat/thrust/detail/swap.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include - -namespace thrust -{ - -template -__host__ __device__ -inline void swap(Assignable1 &a, Assignable2 &b) -{ - Assignable1 temp = a; - a = b; - b = temp; -} // end swap() - -} // end namespace thrust - diff --git a/compat/thrust/detail/swap.inl b/compat/thrust/detail/swap.inl deleted file mode 100644 index eafd70ae6a..0000000000 --- a/compat/thrust/detail/swap.inl +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#include -#include -#include - diff --git a/compat/thrust/detail/swap_ranges.inl b/compat/thrust/detail/swap_ranges.inl deleted file mode 100644 index e3b06deb01..0000000000 --- a/compat/thrust/detail/swap_ranges.inl +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file swap_ranges.inl - * \brief Inline file for swap_ranges.h. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - ForwardIterator2 swap_ranges(const thrust::detail::execution_policy_base &exec, - ForwardIterator1 first1, - ForwardIterator1 last1, - ForwardIterator2 first2) -{ - using thrust::system::detail::generic::swap_ranges; - return swap_ranges(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2); -} // end swap_ranges() - - -template - ForwardIterator2 swap_ranges(ForwardIterator1 first1, - ForwardIterator1 last1, - ForwardIterator2 first2) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::swap_ranges(select_system(system1,system2), first1, last1, first2); -} // end swap_ranges() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/tabulate.inl b/compat/thrust/detail/tabulate.inl deleted file mode 100644 index 961c76e9f6..0000000000 --- a/compat/thrust/detail/tabulate.inl +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - void tabulate(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - UnaryOperation unary_op) -{ - using thrust::system::detail::generic::tabulate; - return tabulate(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, unary_op); -} // end tabulate() - - -template - void tabulate(ForwardIterator first, - ForwardIterator last, - UnaryOperation unary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::tabulate(select_system(system), first, last, unary_op); -} // end tabulate() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/temporary_array.h b/compat/thrust/detail/temporary_array.h deleted file mode 100644 index 3a9e08481c..0000000000 --- a/compat/thrust/detail/temporary_array.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file temporary_array.h - * \brief Container-like class temporary storage inside algorithms. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -template - class temporary_array - : public contiguous_storage< - T, - no_throw_allocator< - temporary_allocator - > - > -{ - private: - typedef contiguous_storage< - T, - no_throw_allocator< - temporary_allocator - > - > super_t; - - // to help out the constructor - typedef no_throw_allocator > alloc_type; - - public: - typedef typename super_t::size_type size_type; - - temporary_array(thrust::execution_policy &system, size_type n); - - // provide a kill-switch to explicitly avoid initialization - temporary_array(int uninit, thrust::execution_policy &system, size_type n); - - template - temporary_array(thrust::execution_policy &system, - InputIterator first, - size_type n); - - template - temporary_array(thrust::execution_policy &system, - thrust::execution_policy &input_system, - InputIterator first, - size_type n); - - template - temporary_array(thrust::execution_policy &system, - InputIterator first, - InputIterator last); - - template - temporary_array(thrust::execution_policy &system, - thrust::execution_policy &input_system, - InputIterator first, - InputIterator last); - - ~temporary_array(); -}; // end temporary_array - - -// XXX eliminate this when we do ranges for real -template - class tagged_iterator_range -{ - public: - typedef thrust::detail::tagged_iterator iterator; - - template - tagged_iterator_range(const Ignored1 &, const Ignored2 &, Iterator first, Iterator last) - : m_begin(reinterpret_tag(first)), - m_end(reinterpret_tag(last)) - {} - - iterator begin(void) const { return m_begin; } - iterator end(void) const { return m_end; } - - private: - iterator m_begin, m_end; -}; - - -// if FromSystem is convertible to ToSystem, then just make a shallow -// copy of the range. 
else, use a temporary_array -// note that the resulting iterator is explicitly tagged with ToSystem either way -template - struct move_to_system_base - : public eval_if< - is_convertible< - FromSystem, - ToSystem - >::value, - identity_< - tagged_iterator_range - >, - identity_< - temporary_array< - typename thrust::iterator_value::type, - ToSystem - > - > - > -{}; - - -template - class move_to_system - : public move_to_system_base< - Iterator, - FromSystem, - ToSystem - >::type -{ - typedef typename move_to_system_base::type super_t; - - public: - move_to_system(thrust::execution_policy &from_system, - thrust::execution_policy &to_system, - Iterator first, - Iterator last) - : super_t(to_system, from_system, first, last) {} -}; - - -} // end detail -} // end thrust - -#include - diff --git a/compat/thrust/detail/temporary_array.inl b/compat/thrust/detail/temporary_array.inl deleted file mode 100644 index 36ed16736e..0000000000 --- a/compat/thrust/detail/temporary_array.inl +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - - -namespace thrust -{ - -namespace detail -{ -namespace temporary_array_detail -{ - - -template struct avoid_initialization : thrust::detail::has_trivial_copy_constructor {}; - - -template -typename thrust::detail::enable_if< - avoid_initialization::value ->::type - construct_values(TemporaryArray &, - Size) -{ - // avoid the overhead of initialization -} // end construct_values() - - -template -typename thrust::detail::disable_if< - avoid_initialization::value ->::type - construct_values(TemporaryArray &a, - Size n) -{ - a.default_construct_n(a.begin(), n); -} // end construct_values() - - -} // end temporary_array_detail - - -template - temporary_array - ::temporary_array(thrust::execution_policy &system, size_type n) - :super_t(n, alloc_type(temporary_allocator(system))) -{ - temporary_array_detail::construct_values(*this, n); -} // end temporary_array::temporary_array() - - -template - temporary_array - ::temporary_array(int, thrust::execution_policy &system, size_type n) - :super_t(n, alloc_type(temporary_allocator(system))) -{ - // avoid initialization - ; -} // end temporary_array::temporary_array() - - -template - template - temporary_array - ::temporary_array(thrust::execution_policy &system, - InputIterator first, - size_type n) - : super_t(alloc_type(temporary_allocator(system))) -{ - super_t::allocate(n); - - super_t::uninitialized_copy_n(system, first, n, super_t::begin()); -} // end temporary_array::temporary_array() - - -template - template - temporary_array - ::temporary_array(thrust::execution_policy &system, - thrust::execution_policy &input_system, - InputIterator first, - size_type n) - : super_t(alloc_type(temporary_allocator(system))) -{ - super_t::allocate(n); - - super_t::uninitialized_copy_n(input_system, first, n, super_t::begin()); -} // end temporary_array::temporary_array() - - -template - template - 
temporary_array - ::temporary_array(thrust::execution_policy &system, - InputIterator first, - InputIterator last) - : super_t(alloc_type(temporary_allocator(system))) -{ - super_t::allocate(thrust::distance(first,last)); - - super_t::uninitialized_copy(system, first, last, super_t::begin()); -} // end temporary_array::temporary_array() - - -template - template - temporary_array - ::temporary_array(thrust::execution_policy &system, - thrust::execution_policy &input_system, - InputIterator first, - InputIterator last) - : super_t(alloc_type(temporary_allocator(system))) -{ - super_t::allocate(thrust::distance(first,last)); - - super_t::uninitialized_copy(input_system, first, last, super_t::begin()); -} // end temporary_array::temporary_array() - - -template - temporary_array - ::~temporary_array() -{ - // note that super_t::destroy will ignore trivial destructors automatically - super_t::destroy(super_t::begin(), super_t::end()); -} // end temporary_array::~temporary_array() - -} // end detail - -} // end thrust - diff --git a/compat/thrust/detail/temporary_buffer.h b/compat/thrust/detail/temporary_buffer.h deleted file mode 100644 index 046a3b363a..0000000000 --- a/compat/thrust/detail/temporary_buffer.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ -namespace get_temporary_buffer_detail -{ - - -template - thrust::pair, typename thrust::pointer::difference_type> - down_cast_pair(Pair p) -{ - // XXX should use a hypothetical thrust::static_pointer_cast here - thrust::pointer ptr = thrust::pointer(static_cast(thrust::raw_pointer_cast(p.first))); - - typedef thrust::pair, typename thrust::pointer::difference_type> result_type; - return result_type(ptr, p.second); -} // end down_cast_pair() - - -} // end get_temporary_buffer_detail -} // end detail - - -template - thrust::pair, typename thrust::pointer::difference_type> - get_temporary_buffer(const thrust::detail::execution_policy_base &exec, typename thrust::pointer::difference_type n) -{ - using thrust::system::detail::generic::get_temporary_buffer; - - return thrust::detail::get_temporary_buffer_detail::down_cast_pair(get_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n)); -} // end get_temporary_buffer() - - -template - void return_temporary_buffer(const thrust::detail::execution_policy_base &exec, Pointer p) -{ - using thrust::system::detail::generic::return_temporary_buffer; - - return return_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), p); -} // end return_temporary_buffer() - - -} // end thrust - diff --git a/compat/thrust/detail/transform.inl b/compat/thrust/detail/transform.inl deleted file mode 100644 index ae303bcc0d..0000000000 --- a/compat/thrust/detail/transform.inl +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file transform.inl - * \brief Inline file for transform.h. 
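The temporary_buffer.h hunk above removes the get_temporary_buffer / return_temporary_buffer forwarding layer that algorithms use to obtain scratch storage from a backend's temporary allocator. A rough usage sketch, assuming stock Thrust, C++11 (auto is used only for brevity) and the device backend:

#include <thrust/memory.h>
#include <thrust/execution_policy.h>
#include <thrust/uninitialized_fill.h>
#include <cstddef>

int main()
{
    const std::ptrdiff_t n = 1024;

    // Ask the device backend's temporary allocator for space for n ints.
    auto buffer = thrust::get_temporary_buffer<int>(thrust::device, n);

    // buffer.first is a tagged pointer; buffer.second is how many elements were actually granted.
    thrust::uninitialized_fill(thrust::device, buffer.first, buffer.first + buffer.second, 0);

    // Hand the storage back to the backend when done.
    thrust::return_temporary_buffer(thrust::device, buffer.first);
    return 0;
}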
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - OutputIterator transform(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - OutputIterator result, - UnaryFunction op) -{ - using thrust::system::detail::generic::transform; - return transform(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, op); -} // end transform() - - -template - OutputIterator transform(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryFunction op) -{ - using thrust::system::detail::generic::transform; - return transform(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, result, op); -} // end transform() - - -template - ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - ForwardIterator result, - UnaryFunction op, - Predicate pred) -{ - using thrust::system::detail::generic::transform_if; - return transform_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, op, pred); -} // end transform_if() - - -template - ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, InputIterator1 last, - InputIterator2 stencil, - ForwardIterator result, - UnaryFunction op, - Predicate pred) -{ - using thrust::system::detail::generic::transform_if; - return transform_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, stencil, result, op, pred); -} // end transform_if() - - -template - ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, - InputIterator3 stencil, - ForwardIterator result, - BinaryFunction binary_op, - Predicate pred) -{ - using thrust::system::detail::generic::transform_if; - return transform_if(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first1, last1, first2, stencil, result, binary_op, pred); -} // end transform_if() - - -template - OutputIterator transform(InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::transform(select_system(system1,system2), first, last, result, op); -} // end transform() - - -template - OutputIterator transform(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryFunction op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::transform(select_system(system1,system2,system3), first1, last1, first2, result, op); -} // end transform() - - -template - ForwardIterator transform_if(InputIterator first, - InputIterator last, - ForwardIterator result, - UnaryFunction unary_op, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename 
thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::transform_if(select_system(system1,system2), first, last, result, unary_op, pred); -} // end transform_if() - - -template - ForwardIterator transform_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - ForwardIterator result, - UnaryFunction unary_op, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - - System1 system1; - System2 system2; - System3 system3; - - return thrust::transform_if(select_system(system1,system2,system3), first, last, stencil, result, unary_op, pred); -} // end transform_if() - - -template - ForwardIterator transform_if(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator3 stencil, - ForwardIterator result, - BinaryFunction binary_op, - Predicate pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::transform_if(select_system(system1,system2,system3,system4), first1, last1, first2, stencil, result, binary_op, pred); -} // end transform_if() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/transform_reduce.inl b/compat/thrust/detail/transform_reduce.inl deleted file mode 100644 index ede65030b1..0000000000 --- a/compat/thrust/detail/transform_reduce.inl +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file transform_reduce.inl - * \brief Inline file for transform_reduce.h. 
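The transform.inl hunk just above forwards thrust::transform and thrust::transform_if; the backend is selected from the iterators' system tags. A minimal sketch of the unary and binary forms, assuming stock Thrust in a .cu translation unit (sizes and values are illustrative):

#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>

int main()
{
    thrust::device_vector<float> x(4, 1.0f);
    thrust::device_vector<float> y(4, 2.0f);
    thrust::device_vector<float> z(4);

    // Binary form: element-wise z = x + y.
    thrust::transform(x.begin(), x.end(), y.begin(), z.begin(), thrust::plus<float>());

    // Unary form: negate z in place.
    thrust::transform(z.begin(), z.end(), z.begin(), thrust::negate<float>());
    return 0;
}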
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - OutputType transform_reduce(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - UnaryFunction unary_op, - OutputType init, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::transform_reduce; - return transform_reduce(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, unary_op, init, binary_op); -} // end transform_reduce() - - -template - OutputType transform_reduce(InputIterator first, - InputIterator last, - UnaryFunction unary_op, - OutputType init, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::transform_reduce(select_system(system), first, last, unary_op, init, binary_op); -} // end transform_reduce() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/transform_scan.inl b/compat/thrust/detail/transform_scan.inl deleted file mode 100644 index 0187c4b98d..0000000000 --- a/compat/thrust/detail/transform_scan.inl +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file transform_scan.inl - * \brief Inline file for transform_scan.h. 
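transform_reduce, whose forwarding layer is removed above, fuses a per-element transformation with a reduction so the intermediate sequence is never materialized. A small sketch, assuming stock Thrust and nvcc (the functor and data are illustrative):

#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <cmath>

// Unary functor applied to each element before the reduction.
struct absolute_value
{
    __host__ __device__ float operator()(float x) const { return fabsf(x); }
};

int main()
{
    float raw[] = {-1.0f, 2.0f, -3.0f, 4.0f};
    thrust::device_vector<float> d(raw, raw + 4);

    // Computes fabs(x0) + fabs(x1) + ..., starting from the initial value 0.0f.
    float sum = thrust::transform_reduce(d.begin(), d.end(),
                                         absolute_value(), 0.0f,
                                         thrust::plus<float>());
    return (sum == 10.0f) ? 0 : 1;
}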
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - OutputIterator transform_inclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - AssociativeOperator binary_op) -{ - using thrust::system::detail::generic::transform_inclusive_scan; - return transform_inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, unary_op, binary_op); -} // end transform_inclusive_scan() - - -template - OutputIterator transform_exclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - T init, - AssociativeOperator binary_op) -{ - using thrust::system::detail::generic::transform_exclusive_scan; - return transform_exclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result, unary_op, init, binary_op); -} // end transform_exclusive_scan() - - -template - OutputIterator transform_inclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - BinaryFunction binary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::transform_inclusive_scan(select_system(system1,system2), first, last, result, unary_op, binary_op); -} // end transform_inclusive_scan() - - -template - OutputIterator transform_exclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - T init, - AssociativeOperator binary_op) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::transform_exclusive_scan(select_system(system1,system2), first, last, result, unary_op, init, binary_op); -} // end transform_exclusive_scan() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/trivial_sequence.h b/compat/thrust/detail/trivial_sequence.h deleted file mode 100644 index cc7e32be27..0000000000 --- a/compat/thrust/detail/trivial_sequence.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file trivial_sequence.h - * \brief Container-like class for wrapping sequences. The wrapped - * sequence always has trivial iterators, even when the input - * sequence does not. 
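transform_scan.inl, removed just above, provides the fused transform-plus-prefix-sum entry points. A brief sketch of transform_inclusive_scan, assuming stock Thrust (data values are illustrative):

#include <thrust/transform_scan.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>

int main()
{
    int raw[] = {1, -2, 3, -4};
    thrust::device_vector<int> d(raw, raw + 4);

    // In-place inclusive plus-scan of the negated input: d becomes {-1, 1, -2, 2}.
    thrust::transform_inclusive_scan(d.begin(), d.end(), d.begin(),
                                     thrust::negate<int>(), thrust::plus<int>());
    return 0;
}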
- */ - - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -namespace detail -{ - -// never instantiated -template struct _trivial_sequence { }; - -// trivial case -template -struct _trivial_sequence -{ - typedef Iterator iterator_type; - Iterator first, last; - - _trivial_sequence(thrust::execution_policy &, Iterator _first, Iterator _last) : first(_first), last(_last) - { -// std::cout << "trivial case" << std::endl; - } - - iterator_type begin() { return first; } - iterator_type end() { return last; } -}; - -// non-trivial case -template -struct _trivial_sequence -{ - typedef typename thrust::iterator_value::type iterator_value; - typedef typename thrust::detail::temporary_array::iterator iterator_type; - - thrust::detail::temporary_array buffer; - - _trivial_sequence(thrust::execution_policy &exec, Iterator first, Iterator last) - : buffer(exec, first, last) - { -// std::cout << "non-trivial case" << std::endl; - } - - iterator_type begin() { return buffer.begin(); } - iterator_type end() { return buffer.end(); } -}; - -template -struct trivial_sequence - : detail::_trivial_sequence::type> -{ - typedef _trivial_sequence::type> super_t; - - trivial_sequence(thrust::execution_policy &exec, Iterator first, Iterator last) : super_t(exec, first, last) { } -}; - -} // end namespace detail - -} // end namespace thrust - diff --git a/compat/thrust/detail/tuple.inl b/compat/thrust/detail/tuple.inl deleted file mode 100644 index 067ad636c4..0000000000 --- a/compat/thrust/detail/tuple.inl +++ /dev/null @@ -1,948 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -namespace thrust -{ - -// define null_type -struct null_type {}; - -// null_type comparisons -__host__ __device__ inline -bool operator==(const null_type&, const null_type&) { return true; } - -__host__ __device__ inline -bool operator>=(const null_type&, const null_type&) { return true; } - -__host__ __device__ inline -bool operator<=(const null_type&, const null_type&) { return true; } - -__host__ __device__ inline -bool operator!=(const null_type&, const null_type&) { return false; } - -__host__ __device__ inline -bool operator<(const null_type&, const null_type&) { return false; } - -__host__ __device__ inline -bool operator>(const null_type&, const null_type&) { return false; } - -// forward declaration for tuple -template < - class T0 = null_type, class T1 = null_type, class T2 = null_type, - class T3 = null_type, class T4 = null_type, class T5 = null_type, - class T6 = null_type, class T7 = null_type, class T8 = null_type, - class T9 = null_type> -class tuple; - -// forward declaration of tuple_element -template struct tuple_element; - -// specializations for tuple_element -template - struct tuple_element<0,T> -{ - typedef typename T::head_type type; -}; // end tuple_element<0,T> - -template - struct tuple_element -{ - private: - typedef typename T::tail_type Next; - typedef typename tuple_element::type unqualified_type; - - public: - typedef typename thrust::detail::add_const::type type; -}; // end tuple_element - -template - struct tuple_element<0,const T> -{ - typedef typename thrust::detail::add_const::type type; -}; // end tuple_element<0,const T> - - - -// forward declaration of tuple_size -template struct tuple_size; - -// specializations for tuple_size -template<> - struct tuple_size< tuple<> > -{ - static const int value = 0; -}; // end tuple_size< tuple<> > - -template<> - struct tuple_size -{ - static const int value = 0; -}; // end tuple_size - - - -// forward declaration of detail::cons -namespace detail -{ - -template struct cons; - -} // end detail - - -// -- some traits classes for get functions -template struct access_traits -{ - typedef const T& const_type; - typedef T& non_const_type; - - typedef const typename thrust::detail::remove_cv::type& parameter_type; - -// used as the tuple constructors parameter types -// Rationale: non-reference tuple element types can be cv-qualified. -// It should be possible to initialize such types with temporaries, -// and when binding temporaries to references, the reference must -// be non-volatile and const. 8.5.3. 
(5) -}; // end access_traits - -template struct access_traits -{ - typedef T& const_type; - typedef T& non_const_type; - - typedef T& parameter_type; -}; // end access_traits - -// forward declarations of get() -template -__host__ __device__ -inline typename access_traits< - typename tuple_element >::type - >::non_const_type -// XXX we probably don't need to do this for any compiler we care about -jph -//get(cons& c BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE(int, N)); -get(detail::cons& c); - -template -__host__ __device__ -inline typename access_traits< - typename tuple_element >::type - >::const_type -// XXX we probably don't need to do this for any compiler we care about -jph -//get(const cons& c BOOST_APPEND_EXPLICIT_TEMPLATE_NON_TYPE(int, N)); -get(const detail::cons& c); - -namespace detail -{ - -// -- generate error template, referencing to non-existing members of this -// template is used to produce compilation errors intentionally -template -class generate_error; - -// - cons getters -------------------------------------------------------- -// called: get_class::get(aTuple) - -template< int N > -struct get_class -{ - template - __host__ __device__ - inline static RET get(const cons& t) - { - // XXX we may not need to deal with this for any compiler we care about -jph - //return get_class::BOOST_NESTED_TEMPLATE get(t.tail); - return get_class::template get(t.tail); - - // gcc 4.3 couldn't compile this: - //return get_class::get(t.tail); - } - - template - __host__ __device__ - inline static RET get(cons& t) - { - // XXX we may not need to deal with this for any compiler we care about -jph - //return get_class::BOOST_NESTED_TEMPLATE get(t.tail); - return get_class::template get(t.tail); - - // gcc 4.3 couldn't compile this: - //return get_class::get(t.tail); - } -}; // end get_class - -template<> -struct get_class<0> -{ - template - __host__ __device__ - inline static RET get(const cons& t) - { - return t.head; - } - - template - __host__ __device__ - inline static RET get(cons& t) - { - return t.head; - } -}; // get get_class<0> - - -template struct IF -{ - typedef Then RET; -}; - -template struct IF -{ - typedef Else RET; -}; - -// These helper templates wrap void types and plain function types. -// The rationale is to allow one to write tuple types with those types -// as elements, even though it is not possible to instantiate such object. 
-// E.g: typedef tuple some_type; // ok -// but: some_type x; // fails - -template class non_storeable_type -{ - __host__ __device__ - non_storeable_type(); -}; - -template struct wrap_non_storeable_type -{ - // XXX is_function looks complicated; punt for now -jph - //typedef typename IF< - // ::thrust::detail::is_function::value, non_storeable_type, T - //>::RET type; - - typedef T type; -}; - -template <> struct wrap_non_storeable_type -{ - typedef non_storeable_type type; -}; - - -template - struct cons -{ - typedef HT head_type; - typedef TT tail_type; - - typedef typename - wrap_non_storeable_type::type stored_head_type; - - stored_head_type head; - tail_type tail; - - inline __host__ __device__ - typename access_traits::non_const_type - get_head() { return head; } - - inline __host__ __device__ - typename access_traits::non_const_type - get_tail() { return tail; } - - inline __host__ __device__ - typename access_traits::const_type - get_head() const { return head; } - - inline __host__ __device__ - typename access_traits::const_type - get_tail() const { return tail; } - - inline __host__ __device__ - cons(void) : head(), tail() {} - // cons() : head(detail::default_arg::f()), tail() {} - - // the argument for head is not strictly needed, but it prevents - // array type elements. This is good, since array type elements - // cannot be supported properly in any case (no assignment, - // copy works only if the tails are exactly the same type, ...) - - inline __host__ __device__ - cons(typename access_traits::parameter_type h, - const tail_type& t) - : head (h), tail(t) {} - - template - inline __host__ __device__ - cons( T1& t1, T2& t2, T3& t3, T4& t4, T5& t5, - T6& t6, T7& t7, T8& t8, T9& t9, T10& t10 ) - : head (t1), - tail (t2, t3, t4, t5, t6, t7, t8, t9, t10, static_cast(null_type())) - {} - - template - inline __host__ __device__ - cons( const null_type& /*t1*/, T2& t2, T3& t3, T4& t4, T5& t5, - T6& t6, T7& t7, T8& t8, T9& t9, T10& t10 ) - : head (), - tail (t2, t3, t4, t5, t6, t7, t8, t9, t10, static_cast(null_type())) - {} - - - template - inline __host__ __device__ - cons( const cons& u ) : head(u.head), tail(u.tail) {} - - template - inline __host__ __device__ - cons& operator=( const cons& u ) { - head=u.head; tail=u.tail; return *this; - } - - // must define assignment operator explicitly, implicit version is - // illformed if HT is a reference (12.8. 
(12)) - inline __host__ __device__ - cons& operator=(const cons& u) { - head = u.head; tail = u.tail; return *this; - } - - // XXX enable when we support std::pair -jph - //template - //__host__ __device__ - //cons& operator=( const std::pair& u ) { - // //BOOST_STATIC_ASSERT(length::value == 2); // check length = 2 - // head = u.first; tail.head = u.second; return *this; - //} - - // get member functions (non-const and const) - template - __host__ __device__ - typename access_traits< - typename tuple_element >::type - >::non_const_type - get() { - return thrust::get(*this); // delegate to non-member get - } - - template - __host__ __device__ - typename access_traits< - typename tuple_element >::type - >::const_type - get() const { - return thrust::get(*this); // delegate to non-member get - } - - inline __host__ __device__ - void swap(cons &c) - { - using thrust::swap; - - swap(head, c.head); - tail.swap(c.tail); - } -}; - -template - struct cons -{ - typedef HT head_type; - typedef null_type tail_type; - typedef cons self_type; - - typedef typename - wrap_non_storeable_type::type stored_head_type; - stored_head_type head; - - typename access_traits::non_const_type - inline __host__ __device__ - get_head() { return head; } - - inline __host__ __device__ - null_type get_tail() { return null_type(); } - - inline __host__ __device__ - typename access_traits::const_type - get_head() const { return head; } - - inline __host__ __device__ - null_type get_tail() const { return null_type(); } - - inline __host__ __device__ - cons() : head() {} - - inline __host__ __device__ - cons(typename access_traits::parameter_type h, - const null_type& = null_type()) - : head (h) {} - - template - inline __host__ __device__ - cons(T1& t1, const null_type&, const null_type&, const null_type&, - const null_type&, const null_type&, const null_type&, - const null_type&, const null_type&, const null_type&) - : head (t1) {} - - inline __host__ __device__ - cons(const null_type&, - const null_type&, const null_type&, const null_type&, - const null_type&, const null_type&, const null_type&, - const null_type&, const null_type&, const null_type&) - : head () {} - - template - inline __host__ __device__ - cons( const cons& u ) : head(u.head) {} - - template - inline __host__ __device__ - cons& operator=(const cons& u ) - { - head = u.head; - return *this; - } - - // must define assignment operator explicitly, implicit version - // is illformed if HT is a reference - inline __host__ __device__ - cons& operator=(const cons& u) { head = u.head; return *this; } - - template - inline __host__ __device__ - typename access_traits< - typename tuple_element::type - >::non_const_type - // XXX we probably don't need this for the compilers we care about -jph - //get(BOOST_EXPLICIT_TEMPLATE_NON_TYPE(int, N)) - get(void) - { - return thrust::get(*this); - } - - template - inline __host__ __device__ - typename access_traits< - typename tuple_element::type - >::const_type - // XXX we probably don't need this for the compilers we care about -jph - //get(BOOST_EXPLICIT_TEMPLATE_NON_TYPE(int, N)) const - get(void) const - { - return thrust::get(*this); - } - - inline __host__ __device__ - void swap(cons &c) - { - using thrust::swap; - - swap(head, c.head); - } -}; // end cons - -template - struct map_tuple_to_cons -{ - typedef cons::type - > type; -}; // end map_tuple_to_cons - -// The empty tuple is a null_type -template <> - struct map_tuple_to_cons -{ - typedef null_type type; -}; // end map_tuple_to_cons<...> - - - -// 
--------------------------------------------------------------------------- -// The call_traits for make_tuple - -// Must be instantiated with plain or const plain types (not with references) - -// from template foo(const T& t) : make_tuple_traits::type -// from template foo(T& t) : make_tuple_traits::type - -// Conversions: -// T -> T, -// references -> compile_time_error -// array -> const ref array - - -template -struct make_tuple_traits { - typedef T type; - - // commented away, see below (JJ) - // typedef typename IF< - // boost::is_function::value, - // T&, - // T>::RET type; - -}; - -// The is_function test was there originally for plain function types, -// which can't be stored as such (we must either store them as references or -// pointers). Such a type could be formed if make_tuple was called with a -// reference to a function. -// But this would mean that a const qualified function type was formed in -// the make_tuple function and hence make_tuple can't take a function -// reference as a parameter, and thus T can't be a function type. -// So is_function test was removed. -// (14.8.3. says that type deduction fails if a cv-qualified function type -// is created. (It only applies for the case of explicitly specifying template -// args, though?)) (JJ) - -template -struct make_tuple_traits { - typedef typename - detail::generate_error:: - do_not_use_with_reference_type error; -}; - -// Arrays can't be stored as plain types; convert them to references. -// All arrays are converted to const. This is because make_tuple takes its -// parameters as const T& and thus the knowledge of the potential -// non-constness of actual argument is lost. -template struct make_tuple_traits { - typedef const T (&type)[n]; -}; - -template -struct make_tuple_traits { - typedef const T (&type)[n]; -}; - -template struct make_tuple_traits { - typedef const volatile T (&type)[n]; -}; - -template -struct make_tuple_traits { - typedef const volatile T (&type)[n]; -}; - -// XXX enable these if we ever care about reference_wrapper -jph -//template -//struct make_tuple_traits >{ -// typedef T& type; -//}; -// -//template -//struct make_tuple_traits >{ -// typedef T& type; -//}; - - -// a helper traits to make the make_tuple functions shorter (Vesa Karvonen's -// suggestion) -template < - class T0 = null_type, class T1 = null_type, class T2 = null_type, - class T3 = null_type, class T4 = null_type, class T5 = null_type, - class T6 = null_type, class T7 = null_type, class T8 = null_type, - class T9 = null_type -> -struct make_tuple_mapper { - typedef - tuple::type, - typename make_tuple_traits::type, - typename make_tuple_traits::type, - typename make_tuple_traits::type, - typename make_tuple_traits::type, - typename make_tuple_traits::type, - typename make_tuple_traits::type, - typename make_tuple_traits::type, - typename make_tuple_traits::type, - typename make_tuple_traits::type> type; -}; - -} // end detail - - -template -__host__ __device__ -inline typename access_traits< - typename tuple_element >::type - >::non_const_type -get(detail::cons& c) -{ - //return detail::get_class::BOOST_NESTED_TEMPLATE - - // gcc 4.3 couldn't compile this: - //return detail::get_class:: - - return detail::get_class::template - get< - typename access_traits< - typename tuple_element >::type - >::non_const_type, - HT,TT - >(c); -} - - -// get function for const cons-lists, returns a const reference to -// the element. 
If the element is a reference, returns the reference -// as such (that is, can return a non-const reference) -template -__host__ __device__ -inline typename access_traits< - typename tuple_element >::type - >::const_type -get(const detail::cons& c) -{ - //return detail::get_class::BOOST_NESTED_TEMPLATE - - // gcc 4.3 couldn't compile this: - //return detail::get_class:: - - return detail::get_class::template - get< - typename access_traits< - typename tuple_element >::type - >::const_type, - HT,TT - >(c); -} - - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0) -{ - typedef typename detail::make_tuple_mapper::type t; - return t(t0); -} // end make_tuple() - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1) -{ - typedef typename detail::make_tuple_mapper::type t; - return t(t0,t1); -} // end make_tuple() - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2) -{ - typedef typename detail::make_tuple_mapper::type t; - return t(t0,t1,t2); -} // end make_tuple() - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3) -{ - typedef typename detail::make_tuple_mapper::type t; - return t(t0,t1,t2,t3); -} // end make_tuple() - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4) -{ - typedef typename detail::make_tuple_mapper::type t; - return t(t0,t1,t2,t3,t4); -} // end make_tuple() - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5) -{ - typedef typename detail::make_tuple_mapper::type t; - return t(t0,t1,t2,t3,t4,t5); -} // end make_tuple() - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6) -{ - typedef typename detail::make_tuple_mapper::type t; - return t(t0,t1,t2,t3,t4,t5,t6); -} // end make_tuple() - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7) -{ - typedef typename detail::make_tuple_mapper::type t; - return t(t0,t1,t2,t3,t4,t5,t6,t7); -} // end make_tuple() - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8) -{ - typedef typename detail::make_tuple_mapper::type t; - return t(t0,t1,t2,t3,t4,t5,t6,t7,t8); -} // end make_tuple() - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8, const T9& t9) -{ - typedef typename detail::make_tuple_mapper::type t; - return t(t0,t1,t2,t3,t4,t5,t6,t7,t8,t9); -} // end make_tuple() - - -template -__host__ __device__ inline -tuple tie(T0 &t0) -{ - return tuple(t0); -} - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1) -{ - return tuple(t0,t1); -} - -template -__host__ 
__device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2) -{ - return tuple(t0,t1,t2); -} - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3) -{ - return tuple(t0,t1,t2,t3); -} - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4) -{ - return tuple(t0,t1,t2,t3,t4); -} - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5) -{ - return tuple(t0,t1,t2,t3,t4,t5); -} - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6) -{ - return tuple(t0,t1,t2,t3,t4,t5,t6); -} - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7) -{ - return tuple(t0,t1,t2,t3,t4,t5,t6,t7); -} - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8) -{ - return tuple(t0,t1,t2,t3,t4,t5,t6,t7,t8); -} - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8, T9 &t9) -{ - return tuple(t0,t1,t2,t3,t4,t5,t6,t7,t8,t9); -} - -template< - typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9, - typename U0, typename U1, typename U2, typename U3, typename U4, typename U5, typename U6, typename U7, typename U8, typename U9 -> -__host__ __device__ inline -void swap(thrust::tuple &x, - thrust::tuple &y) -{ - return x.swap(y); -} - - - -namespace detail -{ - -template -__host__ __device__ -inline bool eq(const T1& lhs, const T2& rhs) { - return lhs.get_head() == rhs.get_head() && - eq(lhs.get_tail(), rhs.get_tail()); -} -template<> -inline bool eq(const null_type&, const null_type&) { return true; } - -template -__host__ __device__ -inline bool neq(const T1& lhs, const T2& rhs) { - return lhs.get_head() != rhs.get_head() || - neq(lhs.get_tail(), rhs.get_tail()); -} -template<> -__host__ __device__ -inline bool neq(const null_type&, const null_type&) { return false; } - -template -__host__ __device__ -inline bool lt(const T1& lhs, const T2& rhs) { - return (lhs.get_head() < rhs.get_head()) || - (!(rhs.get_head() < lhs.get_head()) && - lt(lhs.get_tail(), rhs.get_tail())); -} -template<> -__host__ __device__ -inline bool lt(const null_type&, const null_type&) { return false; } - -template -__host__ __device__ -inline bool gt(const T1& lhs, const T2& rhs) { - return (lhs.get_head() > rhs.get_head()) || - (!(rhs.get_head() > lhs.get_head()) && - gt(lhs.get_tail(), rhs.get_tail())); -} -template<> -__host__ __device__ -inline bool gt(const null_type&, const null_type&) { return false; } - -template -__host__ __device__ -inline bool lte(const T1& lhs, const T2& rhs) { - return lhs.get_head() <= rhs.get_head() && - ( !(rhs.get_head() <= lhs.get_head()) || - lte(lhs.get_tail(), rhs.get_tail())); -} -template<> -__host__ __device__ -inline bool lte(const null_type&, const null_type&) { return true; } - -template -__host__ __device__ -inline bool gte(const T1& lhs, const T2& rhs) { - return lhs.get_head() >= rhs.get_head() && - ( !(rhs.get_head() >= lhs.get_head()) || - gte(lhs.get_tail(), rhs.get_tail())); -} -template<> -__host__ __device__ -inline bool gte(const null_type&, const null_type&) { return true; } - -} // end detail - - - -// equal ---- - -template -__host__ __device__ -inline bool operator==(const detail::cons& lhs, const detail::cons& rhs) -{ - // XXX support this eventually -jph - //// check that tuple lengths are 
equal - //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); - - return detail::eq(lhs, rhs); -} // end operator==() - -// not equal ----- - -template -__host__ __device__ -inline bool operator!=(const detail::cons& lhs, const detail::cons& rhs) -{ - // XXX support this eventually -jph - //// check that tuple lengths are equal - //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); - - return detail::neq(lhs, rhs); -} // end operator!=() - -// < -template -__host__ __device__ -inline bool operator<(const detail::cons& lhs, const detail::cons& rhs) -{ - // XXX support this eventually -jph - //// check that tuple lengths are equal - //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); - - return detail::lt(lhs, rhs); -} // end operator<() - -// > -template -__host__ __device__ -inline bool operator>(const detail::cons& lhs, const detail::cons& rhs) -{ - // XXX support this eventually -jph - //// check that tuple lengths are equal - //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); - - return detail::gt(lhs, rhs); -} // end operator>() - -// <= -template -__host__ __device__ -inline bool operator<=(const detail::cons& lhs, const detail::cons& rhs) -{ - // XXX support this eventually -jph - //// check that tuple lengths are equal - //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); - - return detail::lte(lhs, rhs); -} // end operator<=() - -// >= -template -__host__ __device__ -inline bool operator>=(const detail::cons& lhs, const detail::cons& rhs) -{ - // XXX support this eventually -jph - //// check that tuple lengths are equal - //BOOST_STATIC_ASSERT(tuple_size::value == tuple_size::value); - - return detail::gte(lhs, rhs); -} // end operator>=() - -} // end thrust - diff --git a/compat/thrust/detail/tuple_meta_transform.h b/compat/thrust/detail/tuple_meta_transform.h deleted file mode 100644 index ff99709b6d..0000000000 --- a/compat/thrust/detail/tuple_meta_transform.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
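The cons-list machinery removed above backs thrust::tuple's public surface: make_tuple, tie, get<N>, and the element-wise lexicographic comparisons. A small usage sketch, written against std::tuple so it compiles without the deleted headers; the thrust:: versions behave analogously (up to 10 elements):

#include <cassert>
#include <string>
#include <tuple>

int main()
{
    // make_tuple deduces element types (arrays and references get special handling)
    std::tuple<int, double, std::string> t = std::make_tuple(1, 2.5, std::string("hi"));

    // get<N> returns a (const) reference to the N-th element
    assert(std::get<0>(t) == 1);
    std::get<1>(t) = 3.5;

    // tie builds a tuple of references, handy for unpacking
    int i; double d; std::string s;
    std::tie(i, d, s) = t;
    assert(i == 1 && d == 3.5 && s == "hi");

    // comparisons are lexicographic, element by element
    assert(std::make_tuple(1, 2) < std::make_tuple(1, 3));
    return 0;
}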
- */ - -#pragma once - -#include - -namespace thrust -{ - -namespace detail -{ - -template class UnaryMetaFunction, - unsigned int sz = thrust::tuple_size::value> - struct tuple_meta_transform; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef null_type type; -}; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef thrust::tuple< - typename UnaryMetaFunction::type>::type - > type; -}; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef thrust::tuple< - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type - > type; -}; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef thrust::tuple< - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type - > type; -}; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef thrust::tuple< - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type - > type; -}; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef thrust::tuple< - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type - > type; -}; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef thrust::tuple< - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type - > type; -}; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef thrust::tuple< - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type - > type; -}; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef thrust::tuple< - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type - > type; -}; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef thrust::tuple< - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type - > type; -}; - -template class UnaryMetaFunction> - struct tuple_meta_transform -{ - typedef thrust::tuple< - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, 
- typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type, - typename UnaryMetaFunction::type>::type - > type; -}; - -} // end detail - -} // end thrust - diff --git a/compat/thrust/detail/tuple_transform.h b/compat/thrust/detail/tuple_transform.h deleted file mode 100644 index f18b8727e1..0000000000 --- a/compat/thrust/detail/tuple_transform.h +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -namespace detail -{ - -template class UnaryMetaFunction, - typename UnaryFunction, - unsigned int sz = thrust::tuple_size::value> - struct tuple_transform_functor; - - -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - return thrust::null_type(); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - return thrust::null_type(); - } -}; - - -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t))); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t))); - } -}; - - -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t))); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t))); - } -}; - - -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t))); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t))); - } -}; - 
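tuple_meta_transform, whose 0-to-10-element specializations appear above, maps a unary metafunction over every element type of a tuple; it is what gives tuple_transform_functor its result type. The enumeration exists because Thrust targets C++03; a variadic restatement of the same idea, assuming C++11 and std::tuple purely for illustration:

#include <tuple>
#include <type_traits>

// Apply MetaF to every element type of a std::tuple, yielding a new tuple type.
template <typename Tuple, template <typename> class MetaF>
struct tuple_meta_transform_sketch;

template <template <typename> class MetaF, typename... Ts>
struct tuple_meta_transform_sketch<std::tuple<Ts...>, MetaF>
{
    typedef std::tuple<typename MetaF<Ts>::type...> type;
};

// e.g. turn a tuple of value types into a tuple of pointers to them
typedef tuple_meta_transform_sketch<std::tuple<int, double>, std::add_pointer>::type ptr_tuple;
static_assert(std::is_same<ptr_tuple, std::tuple<int*, double*>>::value,
              "each element type was rewritten by the metafunction");

int main() { return 0; }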
- -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t))); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t))); - } -}; - - -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t))); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t))); - } -}; - - -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t)), - f(thrust::get<5>(t))); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t)), - f(thrust::get<5>(t))); - } -}; - - -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t)), - f(thrust::get<5>(t)), - f(thrust::get<6>(t))); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t)), - f(thrust::get<5>(t)), - f(thrust::get<6>(t))); - } -}; - - -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t)), - f(thrust::get<5>(t)), - 
f(thrust::get<6>(t)), - f(thrust::get<7>(t))); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t)), - f(thrust::get<5>(t)), - f(thrust::get<6>(t)), - f(thrust::get<7>(t))); - } -}; - - -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t)), - f(thrust::get<5>(t)), - f(thrust::get<6>(t)), - f(thrust::get<7>(t)), - f(thrust::get<8>(t))); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t)), - f(thrust::get<5>(t)), - f(thrust::get<6>(t)), - f(thrust::get<7>(t)), - f(thrust::get<8>(t))); - } -}; - - -template class UnaryMetaFunction, - typename UnaryFunction> - struct tuple_transform_functor -{ - static __host__ - typename tuple_meta_transform::type - do_it_on_the_host(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t)), - f(thrust::get<5>(t)), - f(thrust::get<6>(t)), - f(thrust::get<7>(t)), - f(thrust::get<8>(t)), - f(thrust::get<9>(t))); - } - - static __host__ __device__ - typename tuple_meta_transform::type - do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f) - { - typedef typename tuple_meta_transform::type XfrmTuple; - - return XfrmTuple(f(thrust::get<0>(t)), - f(thrust::get<1>(t)), - f(thrust::get<2>(t)), - f(thrust::get<3>(t)), - f(thrust::get<4>(t)), - f(thrust::get<5>(t)), - f(thrust::get<6>(t)), - f(thrust::get<7>(t)), - f(thrust::get<8>(t)), - f(thrust::get<9>(t))); - } -}; - - -template class UnaryMetaFunction, - typename Tuple, - typename UnaryFunction> -typename tuple_meta_transform::type -tuple_host_transform(const Tuple &t, UnaryFunction f) -{ - return tuple_transform_functor::do_it_on_the_host(t,f); -} - -template class UnaryMetaFunction, - typename Tuple, - typename UnaryFunction> -typename tuple_meta_transform::type -__host__ __device__ -tuple_host_device_transform(const Tuple &t, UnaryFunction f) -{ - return tuple_transform_functor::do_it_on_the_host_or_device(t,f); -} - -} // end detail - -} // end thrust - diff --git a/compat/thrust/detail/type_traits.h b/compat/thrust/detail/type_traits.h deleted file mode 100644 index 5dbeb906eb..0000000000 --- a/compat/thrust/detail/type_traits.h +++ /dev/null @@ -1,641 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
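tuple_host_transform / tuple_host_device_transform, at the end of the file removed above, apply an ordinary functor to each element of a tuple and collect the results into a tuple typed by tuple_meta_transform, again with one hand-written case per arity. A compressed sketch of the runtime half, assuming C++14 index sequences and std::tuple; names ending in _sketch are illustrative:

#include <cassert>
#include <cstddef>
#include <tuple>
#include <utility>

// Apply f to each element of t and pack the results into a new tuple.
template <typename Tuple, typename F, std::size_t... I>
auto tuple_transform_impl(const Tuple& t, F f, std::index_sequence<I...>)
{
    return std::make_tuple(f(std::get<I>(t))...);
}

template <typename... Ts, typename F>
auto tuple_transform_sketch(const std::tuple<Ts...>& t, F f)
{
    return tuple_transform_impl(t, f, std::index_sequence_for<Ts...>{});
}

int main()
{
    auto doubled = tuple_transform_sketch(std::make_tuple(1, 2.5),
                                          [](auto x) { return x + x; });
    assert(std::get<0>(doubled) == 2);
    assert(std::get<1>(doubled) == 5.0);
    return 0;
}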
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file type_traits.h - * \brief Temporarily define some type traits - * until nvcc can compile tr1::type_traits. - */ - -#pragma once - -#include - -// XXX nvcc 2.2 closed beta can't compile type_traits -//// find type_traits -// -//#ifdef __GNUC__ -// -//#if __GNUC__ == 4 && __GNUC_MINOR__ == 2 -//#include -//#elif __GNUC__ == 4 && __GNUC_MINOR__ > 2 -//#include -//#endif // GCC version -// -//#endif // GCC -// -//#ifdef _MSC_VER -//#include -//#endif // MSVC - - -namespace thrust -{ - -// forward declaration of device_reference -template class device_reference; - -namespace detail -{ - /// helper classes [4.3]. - template - struct integral_constant - { - static const _Tp value = __v; - typedef _Tp value_type; - typedef integral_constant<_Tp, __v> type; - }; - - /// typedef for true_type - typedef integral_constant true_type; - - /// typedef for true_type - typedef integral_constant false_type; - -//template struct is_integral : public std::tr1::is_integral {}; -template struct is_integral : public false_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; -template<> struct is_integral : public true_type {}; - -template struct is_floating_point : public false_type {}; -template<> struct is_floating_point : public true_type {}; -template<> struct is_floating_point : public true_type {}; -template<> struct is_floating_point : public true_type {}; - -template struct is_arithmetic : public is_integral {}; -template<> struct is_arithmetic : public true_type {}; -template<> struct is_arithmetic : public true_type {}; -template<> struct is_arithmetic : public true_type {}; -template<> struct is_arithmetic : public true_type {}; - -template struct is_pointer : public false_type {}; -template struct is_pointer : public true_type {}; - -template struct is_device_ptr : public false_type {}; - -template struct is_void : public false_type {}; -template<> struct is_void : public true_type {}; -template<> struct is_void : public true_type {}; - - 
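The traits above all follow the integral_constant pattern: a trait is a type whose ::value is a compile-time bool and whose ::type is a tag that can drive overload resolution. A self-contained sketch of how such traits are defined and consumed via tag dispatch; the _sketch names are stand-ins, not Thrust identifiers:

#include <cstdio>

// The integral_constant pattern used throughout the deleted header.
template <typename T, T v>
struct integral_constant_sketch
{
    static const T value = v;
    typedef integral_constant_sketch type;
};
typedef integral_constant_sketch<bool, true>  true_type_sketch;
typedef integral_constant_sketch<bool, false> false_type_sketch;

template <typename T> struct is_pointer_sketch     : false_type_sketch {};
template <typename T> struct is_pointer_sketch<T*> : true_type_sketch  {};

// Tag dispatch: pick an overload at compile time based on the trait's ::type.
template <typename T> void describe(T, true_type_sketch)  { std::puts("pointer"); }
template <typename T> void describe(T, false_type_sketch) { std::puts("not a pointer"); }

template <typename T>
void describe(T x) { describe(x, typename is_pointer_sketch<T>::type()); }

int main()
{
    int i = 0;
    describe(i);    // prints: not a pointer
    describe(&i);   // prints: pointer
    return 0;
}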
-namespace tt_detail -{ - - -} // end tt_detail - -template struct is_pod - : public integral_constant< - bool, - is_void::value || is_pointer::value || is_arithmetic::value -#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC -// use intrinsic type traits - || __is_pod(T) -#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC -// only use the intrinsic for >= 4.3 -#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3) - || __is_pod(T) -#endif // GCC VERSION -#endif // THRUST_HOST_COMPILER - > - {}; - - -template struct has_trivial_constructor - : public integral_constant< - bool, - is_pod::value -#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC - || __has_trivial_constructor(T) -#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC -// only use the intrinsic for >= 4.3 -#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3) - || __has_trivial_constructor(T) -#endif // GCC VERSION -#endif // THRUST_HOST_COMPILER - > -{}; - -template struct has_trivial_copy_constructor - : public integral_constant< - bool, - is_pod::value -#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC - || __has_trivial_copy(T) -#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC -// only use the intrinsic for >= 4.3 -#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3) - || __has_trivial_copy(T) -#endif // GCC VERSION -#endif // THRUST_HOST_COMPILER - > -{}; - -template struct has_trivial_destructor : public is_pod {}; - -template struct is_const : public false_type {}; -template struct is_const : public true_type {}; - -template struct is_volatile : public false_type {}; -template struct is_volatile : public true_type {}; - -template - struct add_const -{ - typedef T const type; -}; // end add_const - -template - struct remove_const -{ - typedef T type; -}; // end remove_const - -template - struct remove_const -{ - typedef T type; -}; // end remove_const - -template - struct add_volatile -{ - typedef volatile T type; -}; // end add_volatile - -template - struct remove_volatile -{ - typedef T type; -}; // end remove_volatile - -template - struct remove_volatile -{ - typedef T type; -}; // end remove_volatile - -template - struct add_cv -{ - typedef const volatile T type; -}; // end add_cv - -template - struct remove_cv -{ - typedef typename remove_const::type>::type type; -}; // end remove_cv - - -template struct is_reference : public false_type {}; -template struct is_reference : public true_type {}; - -template struct is_device_reference : public false_type {}; -template struct is_device_reference< thrust::device_reference > : public true_type {}; - - -// NB: Careful with reference to void. 
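Before the reference helpers that follow, note what the is_pod / has_trivial_* traits above are typically consumed for: choosing a raw bitwise copy over element-wise assignment. A hedged sketch of that decision, using the standard <type_traits> equivalents since the deleted definitions are no longer available; the _sketch names are illustrative:

#include <cstdio>
#include <cstring>
#include <string>
#include <type_traits>

// memcpy path: chosen when the element type is trivially copyable
template <typename T>
void copy_n_impl(const T* src, std::size_t n, T* dst, std::true_type)
{
    std::memcpy(dst, src, n * sizeof(T));
}

// generic path: element-wise assignment for everything else
template <typename T>
void copy_n_impl(const T* src, std::size_t n, T* dst, std::false_type)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i] = src[i];
}

template <typename T>
void copy_n_sketch(const T* src, std::size_t n, T* dst)
{
    copy_n_impl(src, n, dst, typename std::is_trivially_copyable<T>::type());
}

int main()
{
    int a[3] = {1, 2, 3}, b[3] = {0, 0, 0};
    copy_n_sketch(a, 3, b);                      // memcpy path

    std::string s[2] = {"x", "y"}, t[2];
    copy_n_sketch(s, 2, t);                      // element-wise path

    std::printf("%d %s\n", b[2], t[1].c_str());  // prints: 3 y
    return 0;
}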
-template::value || is_reference<_Tp>::value)> - struct __add_reference_helper - { typedef _Tp& type; }; - -template - struct __add_reference_helper<_Tp, true> - { typedef _Tp type; }; - -template - struct add_reference - : public __add_reference_helper<_Tp>{}; - -template - struct remove_reference -{ - typedef T type; -}; // end remove_reference - -template - struct remove_reference -{ - typedef T type; -}; // end remove_reference - -template - struct is_same - : public false_type -{ -}; // end is_same - -template - struct is_same - : public true_type -{ -}; // end is_same - -template - struct lazy_is_same - : is_same -{ -}; // end lazy_is_same - -template - struct is_different - : public true_type -{ -}; // end is_different - -template - struct is_different - : public false_type -{ -}; // end is_different - -template - struct lazy_is_different - : is_different -{ -}; // end lazy_is_different - -namespace tt_detail -{ - -template - struct is_int_or_cref -{ - typedef typename remove_reference::type type_sans_ref; - static const bool value = (is_integral::value - || (is_integral::value - && is_const::value - && !is_volatile::value)); -}; // end is_int_or_cref - - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN -__THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_BEGIN - - -template - struct is_convertible_sfinae -{ - private: - typedef char one_byte; - typedef struct { char two_chars[2]; } two_bytes; - - static one_byte test(To); - static two_bytes test(...); - static From m_from; - - public: - static const bool value = sizeof(test(m_from)) == sizeof(one_byte); -}; // end is_convertible_sfinae - - -__THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_END -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END - - -template - struct is_convertible_needs_simple_test -{ - static const bool from_is_void = is_void::value; - static const bool to_is_void = is_void::value; - static const bool from_is_float = is_floating_point::type>::value; - static const bool to_is_int_or_cref = is_int_or_cref::value; - - static const bool value = (from_is_void || to_is_void || (from_is_float && to_is_int_or_cref)); -}; // end is_convertible_needs_simple_test - - -template::value> - struct is_convertible -{ - static const bool value = (is_void::value - || (is_int_or_cref::value - && !is_void::value)); -}; // end is_convertible - - -template - struct is_convertible -{ - static const bool value = (is_convertible_sfinae::type, To>::value); -}; // end is_convertible - - -} // end tt_detail - -template - struct is_convertible - : public integral_constant::value> -{ -}; // end is_convertible - - -template - struct is_one_convertible_to_the_other - : public integral_constant< - bool, - is_convertible::value || is_convertible::value - > -{}; - - -// mpl stuff - -template - struct or_ - : public integral_constant< - bool, - Condition1::value || Condition2::value || Condition3::value || Condition4::value || Condition5::value || Condition6::value || Condition7::value || Condition8::value || Condition9::value || Condition10::value - > -{ -}; // end or_ - -template - struct and_ - : public integral_constant -{ -}; // end and_ - -template - struct not_ - : public integral_constant -{ -}; // end not_ - -template - struct eval_if -{ -}; // end eval_if - -template - struct eval_if -{ - typedef typename Then::type type; -}; // end eval_if - -template - struct eval_if -{ - typedef typename Else::type type; -}; // end eval_if - -template -// struct identity -// XXX WAR nvcc's confusion with thrust::identity - struct identity_ -{ - typedef 
T type; -}; // end identity - -template struct enable_if {}; -template struct enable_if {typedef T type;}; - -template struct lazy_enable_if {}; -template struct lazy_enable_if {typedef typename T::type type;}; - -template struct disable_if : enable_if {}; -template struct lazy_disable_if : lazy_enable_if {}; - - -template - struct enable_if_convertible - : enable_if< is_convertible::value, T > -{}; - - -template - struct disable_if_convertible - : disable_if< is_convertible::value, T > -{}; - - -template - struct enable_if_different - : enable_if::value, Result> -{}; - - -template - struct is_numeric - : and_< - is_convertible, - is_convertible - > -{ -}; // end is_numeric - - -template struct is_reference_to_const : false_type {}; -template struct is_reference_to_const : true_type {}; - - -// make_unsigned follows - -namespace tt_detail -{ - -template struct make_unsigned_simple; - -template<> struct make_unsigned_simple { typedef unsigned char type; }; -template<> struct make_unsigned_simple { typedef signed char type; }; -template<> struct make_unsigned_simple { typedef unsigned char type; }; -template<> struct make_unsigned_simple { typedef unsigned short type; }; -template<> struct make_unsigned_simple { typedef unsigned short type; }; -template<> struct make_unsigned_simple { typedef unsigned int type; }; -template<> struct make_unsigned_simple { typedef unsigned int type; }; -template<> struct make_unsigned_simple { typedef unsigned long int type; }; -template<> struct make_unsigned_simple { typedef unsigned long int type; }; -template<> struct make_unsigned_simple { typedef unsigned long long int type; }; -template<> struct make_unsigned_simple { typedef unsigned long long int type; }; - -template - struct make_unsigned_base -{ - // remove cv - typedef typename remove_cv::type remove_cv_t; - - // get the simple unsigned type - typedef typename make_unsigned_simple::type unsigned_remove_cv_t; - - // add back const, volatile, both, or neither to the simple result - typedef typename eval_if< - is_const::value && is_volatile::value, - // add cv back - add_cv, - // check const & volatile individually - eval_if< - is_const::value, - // add c back - add_const, - eval_if< - is_volatile::value, - // add v back - add_volatile, - // original type was neither cv, return the simple unsigned result - identity_ - > - > - >::type type; -}; - -} // end tt_detail - -template - struct make_unsigned - : tt_detail::make_unsigned_base -{}; - -struct largest_available_float -{ -#if defined(__CUDA_ARCH__) -# if (__CUDA_ARCH__ < 130) - typedef float type; -# else - typedef double type; -# endif -#else - typedef double type; -#endif -}; - -// T1 wins if they are both the same size -template - struct larger_type - : thrust::detail::eval_if< - (sizeof(T2) > sizeof(T1)), - thrust::detail::identity_, - thrust::detail::identity_ - > -{}; - - -namespace is_base_of_ns -{ - -typedef char yes; -typedef struct { char two_chars[2]; } no; - -template - struct host -{ - operator Base*() const; - operator Derived*(); -}; // end host - -template - struct impl -{ - template static yes check(Derived *, T); - static no check(Base*, int); - - static const bool value = sizeof(check(host(), int())) == sizeof(yes); -}; // end impl - -} // end is_base_of_ns - - -template - struct is_base_of - : integral_constant< - bool, - is_base_of_ns::impl::value - > -{}; - -template - struct enable_if_base_of - : enable_if< - is_base_of::value, - Result - > -{}; - - -namespace is_assignable_ns -{ - -template - class is_assignable -{ - 
typedef char yes_type; - typedef struct { char array[2]; } no_type; - - template static typename add_reference::type declval(); - - template struct helper { typedef void * type; }; - - template static yes_type test(typename helper() = declval())>::type); - - template static no_type test(...); - - public: - static const bool value = sizeof(test(0)) == 1; -}; // end is_assignable - -} // end is_assignable_ns - - -template - struct is_assignable - : integral_constant< - bool, - is_assignable_ns::is_assignable::value - > -{}; - - -template - struct is_copy_assignable - : is_assignable< - typename add_reference::type, - typename add_reference::type>::type - > -{}; - - -} // end detail - -} // end thrust - -#include - diff --git a/compat/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h b/compat/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h deleted file mode 100644 index 92767b5497..0000000000 --- a/compat/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -namespace detail -{ - -// this trait reports what type should be used as a temporary in certain algorithms -// which aggregate intermediate results from a function before writing to an output iterator - -// the pseudocode for deducing the type of the temporary used below: -// -// if Function is an AdaptableFunction -// result = Function::result_type -// else if OutputIterator2 is a "pure" output iterator -// result = InputIterator2::value_type -// else -// result = OutputIterator2::value_type -// -// XXX upon c++0x, TemporaryType needs to be: -// result_of::type -template - struct intermediate_type_from_function_and_iterators - : eval_if< - has_result_type::value, - result_type, - eval_if< - is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - > -{ -}; // end intermediate_type_from_function_and_iterators - -} // end detail - -} // end thrust - diff --git a/compat/thrust/detail/type_traits/function_traits.h b/compat/thrust/detail/type_traits/function_traits.h deleted file mode 100644 index 39015c608d..0000000000 --- a/compat/thrust/detail/type_traits/function_traits.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
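The pseudocode in intermediate_type_from_function_and_iterators.h (removed above) picks the temporary's type: the functor's result_type when it has one, otherwise a value_type taken from the iterators. A compressed sketch of that lazy selection, assuming C++11; the detection helper and the _sketch names are hypothetical stand-ins, and the middle "pure output iterator" branch is omitted for brevity:

#include <iterator>
#include <type_traits>

// void_t emulation so the detector below SFINAEs reliably under C++11.
template <typename...> struct make_void_sketch { typedef void type; };

// Hypothetical stand-in for the deleted has_result_type trait.
template <typename F, typename = void>
struct has_result_type_sketch : std::false_type {};
template <typename F>
struct has_result_type_sketch<F, typename make_void_sketch<typename F::result_type>::type>
    : std::true_type {};

// Lazy selection, mirroring eval_if: only the chosen branch's ::type is formed.
template <bool B, typename Then, typename Else> struct eval_if_sketch { typedef typename Then::type type; };
template <typename Then, typename Else> struct eval_if_sketch<false, Then, Else> { typedef typename Else::type type; };

template <typename F>  struct result_type_of { typedef typename F::result_type type; };
template <typename It> struct value_type_of  { typedef typename std::iterator_traits<It>::value_type type; };

// Simplified deduction: prefer the functor's result_type, otherwise fall back
// to the output iterator's value_type.
template <typename Function, typename InputIterator, typename OutputIterator>
struct intermediate_type_sketch
    : eval_if_sketch<has_result_type_sketch<Function>::value,
                     result_type_of<Function>,
                     value_type_of<OutputIterator> >
{};

struct negate_f { typedef int result_type; int operator()(int x) const { return -x; } };
struct plain_f  { double operator()(int x) const { return x * 0.5; } };

static_assert(std::is_same<intermediate_type_sketch<negate_f, const int*, double*>::type, int>::value,
              "adaptable functor: its result_type is used for the temporary");
static_assert(std::is_same<intermediate_type_sketch<plain_f, const int*, double*>::type, double>::value,
              "no result_type: fall back to the output iterator's value_type");

int main() { return 0; }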
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -// forward definitions for is_commutative -template struct plus; -template struct multiplies; -template struct minimum; -template struct maximum; -template struct logical_or; -template struct logical_and; -template struct bit_or; -template struct bit_and; -template struct bit_xor; - -namespace detail -{ - - -// some metafunctions which check for the nested types of the adaptable functions - -__THRUST_DEFINE_HAS_NESTED_TYPE(has_result_type, result_type) - -__THRUST_DEFINE_HAS_NESTED_TYPE(has_argument_type, argument_type) - -__THRUST_DEFINE_HAS_NESTED_TYPE(has_first_argument_type, first_argument_type) - -__THRUST_DEFINE_HAS_NESTED_TYPE(has_second_argument_type, second_argument_type) - - -template - struct result_type -{ - typedef typename AdaptableBinaryFunction::result_type type; -}; - - -template - struct is_adaptable_unary_function - : thrust::detail::and_< - has_result_type, - has_argument_type - > -{}; - - -template - struct is_adaptable_binary_function - : thrust::detail::and_< - has_result_type, - thrust::detail::and_< - has_first_argument_type, - has_second_argument_type - > - > -{}; - - -template - struct is_commutative - : public thrust::detail::false_type -{}; - -template struct is_commutative< typename thrust::plus > : public thrust::detail::is_arithmetic {}; -template struct is_commutative< typename thrust::multiplies > : public thrust::detail::is_arithmetic {}; -template struct is_commutative< typename thrust::minimum > : public thrust::detail::is_arithmetic {}; -template struct is_commutative< typename thrust::maximum > : public thrust::detail::is_arithmetic {}; -template struct is_commutative< typename thrust::logical_or > : public thrust::detail::is_arithmetic {}; -template struct is_commutative< typename thrust::logical_and > : public thrust::detail::is_arithmetic {}; -template struct is_commutative< typename thrust::bit_or > : public thrust::detail::is_arithmetic {}; -template struct is_commutative< typename thrust::bit_and > : public thrust::detail::is_arithmetic {}; -template struct is_commutative< typename thrust::bit_xor > : public thrust::detail::is_arithmetic {}; - -} // end namespace detail -} // end namespace thrust - diff --git a/compat/thrust/detail/type_traits/has_member_function.h b/compat/thrust/detail/type_traits/has_member_function.h deleted file mode 100644 index 117f4cb9bf..0000000000 --- a/compat/thrust/detail/type_traits/has_member_function.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
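function_traits.h, removed just above, records which binary functors are commutative so a backend may reorder reduction operands. A tiny sketch of such a whitelist trait with hand-rolled stand-ins for the thrust:: functors; the _sketch names are illustrative:

#include <cstdio>

// Stand-ins for thrust::plus / thrust::minus, following the adaptable-function
// convention (result_type, first_argument_type, second_argument_type).
template <typename T>
struct plus_sketch
{
    typedef T result_type;
    typedef T first_argument_type;
    typedef T second_argument_type;
    T operator()(T a, T b) const { return a + b; }
};

template <typename T>
struct minus_sketch
{
    typedef T result_type;
    typedef T first_argument_type;
    typedef T second_argument_type;
    T operator()(T a, T b) const { return a - b; }
};

// Whitelist-style commutativity trait, like the deleted is_commutative.
template <typename Op> struct is_commutative_sketch                   { static const bool value = false; };
template <typename T>  struct is_commutative_sketch< plus_sketch<T> > { static const bool value = true;  };

int main()
{
    // A backend could swap operands (e.g. to regularize memory access) only
    // when the trait says the result cannot change.
    std::printf("plus commutative:  %d\n", (int)is_commutative_sketch< plus_sketch<int>  >::value);
    std::printf("minus commutative: %d\n", (int)is_commutative_sketch< minus_sketch<int> >::value);
    return 0;
}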
- */ - -#pragma once - -#include - -#define __THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name, member_function_name) \ -template class trait_name; \ - \ -template \ -class trait_name \ -{ \ - class yes { char m; }; \ - class no { yes m[2]; }; \ - struct base_mixin \ - { \ - Result member_function_name(); \ - }; \ - struct base : public T, public base_mixin {}; \ - template class helper{}; \ - template \ - static no deduce(U*, helper* = 0); \ - static yes deduce(...); \ -public: \ - static const bool value = sizeof(yes) == sizeof(deduce(static_cast(0))); \ - typedef thrust::detail::integral_constant type; \ -}; \ - \ -template \ -class trait_name \ -{ \ - class yes { char m; }; \ - class no { yes m[2]; }; \ - struct base_mixin \ - { \ - Result member_function_name(Arg); \ - }; \ - struct base : public T, public base_mixin {}; \ - template class helper{}; \ - template \ - static no deduce(U*, helper* = 0); \ - static yes deduce(...); \ -public: \ - static const bool value = sizeof(yes) == sizeof(deduce(static_cast(0))); \ - typedef thrust::detail::integral_constant type; \ -}; \ - \ -template \ -class trait_name \ -{ \ - class yes { char m; }; \ - class no { yes m[2]; }; \ - struct base_mixin \ - { \ - Result member_function_name(Arg1,Arg2); \ - }; \ - struct base : public T, public base_mixin {}; \ - template class helper{}; \ - template \ - static no deduce(U*, helper* = 0); \ - static yes deduce(...); \ -public: \ - static const bool value = sizeof(yes) == sizeof(deduce(static_cast(0))); \ - typedef thrust::detail::integral_constant type; \ -}; \ - \ -template \ -class trait_name \ -{ \ - class yes { char m; }; \ - class no { yes m[2]; }; \ - struct base_mixin \ - { \ - Result member_function_name(Arg1,Arg2,Arg3); \ - }; \ - struct base : public T, public base_mixin {}; \ - template class helper{}; \ - template \ - static no deduce(U*, helper* = 0); \ - static yes deduce(...); \ -public: \ - static const bool value = sizeof(yes) == sizeof(deduce(static_cast(0))); \ - typedef thrust::detail::integral_constant type; \ -}; \ - \ -template \ -class trait_name \ -{ \ - class yes { char m; }; \ - class no { yes m[2]; }; \ - struct base_mixin \ - { \ - Result member_function_name(Arg1,Arg2,Arg3,Arg4); \ - }; \ - struct base : public T, public base_mixin {}; \ - template class helper{}; \ - template \ - static no deduce(U*, helper* = 0); \ - static yes deduce(...); \ -public: \ - static const bool value = sizeof(yes) == sizeof(deduce(static_cast(0))); \ - typedef thrust::detail::integral_constant type; \ -}; - diff --git a/compat/thrust/detail/type_traits/has_nested_type.h b/compat/thrust/detail/type_traits/has_nested_type.h deleted file mode 100644 index 98c9460500..0000000000 --- a/compat/thrust/detail/type_traits/has_nested_type.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#define __THRUST_DEFINE_HAS_NESTED_TYPE(trait_name, nested_type_name) \ -template \ - struct trait_name \ -{ \ - typedef char yes_type; \ - typedef int no_type; \ - template static yes_type test(typename S::nested_type_name *); \ - template static no_type test(...); \ - static bool const value = sizeof(test(0)) == sizeof(yes_type);\ - typedef thrust::detail::integral_constant type;\ -}; - diff --git a/compat/thrust/detail/type_traits/has_trivial_assign.h b/compat/thrust/detail/type_traits/has_trivial_assign.h deleted file mode 100644 index d248245e84..0000000000 --- a/compat/thrust/detail/type_traits/has_trivial_assign.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file type_traits.h - * \brief Temporarily define some type traits - * until nvcc can compile tr1::type_traits. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -namespace detail -{ - -template struct has_trivial_assign - : public integral_constant< - bool, - (is_pod::value && !is_const::value) -#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC - || __has_trivial_assign(T) -#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC -// only use the intrinsic for >= 4.3 -#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3) - || __has_trivial_assign(T) -#endif // GCC VERSION -#endif // THRUST_HOST_COMPILER - > -{}; - -} // end detail - -} // end thrust - diff --git a/compat/thrust/detail/type_traits/is_call_possible.h b/compat/thrust/detail/type_traits/is_call_possible.h deleted file mode 100644 index 41b9539e19..0000000000 --- a/compat/thrust/detail/type_traits/is_call_possible.h +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
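
The one-macro header deleted above (has_nested_type.h) is the workhorse behind has_result_type, has_raw_pointer and similar queries: the first test() overload is only viable when S::nested_type_name names a type, and the two overloads return types of different sizes so sizeof can tell them apart at compile time. A self-contained sketch of the same idiom; the macro and trait names here are illustrative, not Thrust's.

#include <iostream>
#include <vector>

#define DEFINE_HAS_NESTED_TYPE_SKETCH(trait_name, nested_type_name)           \
template<typename T>                                                           \
struct trait_name                                                              \
{                                                                              \
  typedef char yes_type;                                                       \
  typedef int  no_type;                                                        \
  template<typename S> static yes_type test(typename S::nested_type_name *);   \
  template<typename S> static no_type  test(...);                              \
  static const bool value = sizeof(test<T>(0)) == sizeof(yes_type);            \
};

DEFINE_HAS_NESTED_TYPE_SKETCH(has_result_type_sketch, result_type)

struct negate_functor
{
  typedef int result_type;                      // the nested type being probed
  int operator()(int x) const { return -x; }
};

int main()
{
  std::cout << has_result_type_sketch<negate_functor>::value    << '\n'; // 1
  std::cout << has_result_type_sketch<std::vector<int> >::value << '\n'; // 0
}
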
- */ - -#pragma once - -#include -#include - -// inspired by Roman Perepelitsa's presentation from comp.lang.c++.moderated -// based on the implementation here: http://www.rsdn.ru/forum/cpp/2759773.1.aspx - -namespace thrust -{ -namespace detail -{ -namespace is_call_possible_detail -{ - -template class void_exp_result {}; - -template -U const& operator,(U const&, void_exp_result); - -template -U& operator,(U&, void_exp_result); - -template -struct clone_constness -{ - typedef dest_type type; -}; - -template -struct clone_constness -{ - typedef const dest_type type; -}; - -} // end is_call_possible_detail -} // end detail -} // end thrust - -#define __THRUST_DEFINE_IS_CALL_POSSIBLE(trait_name, member_function_name) \ -__THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name##_has_member, member_function_name) \ - \ -template \ -struct trait_name \ -{ \ - private: \ - struct yes {}; \ - struct no { yes m[2]; }; \ - struct derived : public T \ - { \ - using T::member_function_name; \ - no member_function_name(...) const; \ - }; \ - \ - typedef typename thrust::detail::is_call_possible_detail::clone_constness::type derived_type; \ - \ - template \ - struct return_value_check \ - { \ - static yes deduce(Result); \ - static no deduce(...); \ - static no deduce(no); \ - static no deduce(thrust::detail::is_call_possible_detail::void_exp_result); \ - }; \ - \ - template \ - struct return_value_check \ - { \ - static yes deduce(...); \ - static no deduce(no); \ - }; \ - \ - template \ - struct impl \ - { \ - static const bool value = false; \ - }; \ - \ - template \ - struct impl \ - { \ - static typename add_reference::type test_me; \ - static typename add_reference::type arg; \ - \ - static const bool value = \ - sizeof( \ - return_value_check::deduce( \ - (test_me.member_function_name(arg), thrust::detail::is_call_possible_detail::void_exp_result()) \ - ) \ - ) == sizeof(yes); \ - }; \ - \ - template \ - struct impl \ - { \ - static typename add_reference::type test_me; \ - static typename add_reference::type arg1; \ - static typename add_reference::type arg2; \ - \ - static const bool value = \ - sizeof( \ - return_value_check::deduce( \ - (test_me.member_function_name(arg1,arg2), thrust::detail::is_call_possible_detail::void_exp_result()) \ - ) \ - ) == sizeof(yes); \ - }; \ - \ - template \ - struct impl \ - { \ - static typename add_reference::type test_me; \ - static typename add_reference::type arg1; \ - static typename add_reference::type arg2; \ - static typename add_reference::type arg3; \ - \ - static const bool value = \ - sizeof( \ - return_value_check::deduce( \ - (test_me.member_function_name(arg1,arg2,arg3), thrust::detail::is_call_possible_detail::void_exp_result()) \ - ) \ - ) == sizeof(yes); \ - }; \ - \ - template \ - struct impl \ - { \ - static typename add_reference::type test_me; \ - static typename add_reference::type arg1; \ - static typename add_reference::type arg2; \ - static typename add_reference::type arg3; \ - static typename add_reference::type arg4; \ - \ - static const bool value = \ - sizeof( \ - return_value_check::deduce( \ - (test_me.member_function_name(arg1,arg2,arg3,arg4), thrust::detail::is_call_possible_detail::void_exp_result()) \ - ) \ - ) == sizeof(yes); \ - }; \ - \ - public: \ - static const bool value = impl::value, Signature>::value; \ - typedef thrust::detail::integral_constant type; \ -}; - diff --git a/compat/thrust/detail/type_traits/is_metafunction_defined.h b/compat/thrust/detail/type_traits/is_metafunction_defined.h deleted file mode 100644 index 
fba0811fb3..0000000000 --- a/compat/thrust/detail/type_traits/is_metafunction_defined.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -namespace detail -{ - -__THRUST_DEFINE_HAS_NESTED_TYPE(is_metafunction_defined, type) - -template - struct enable_if_defined - : thrust::detail::lazy_enable_if< - is_metafunction_defined::value, - Metafunction - > -{}; - -} // end detail - -} // end thrust - diff --git a/compat/thrust/detail/type_traits/iterator/is_discard_iterator.h b/compat/thrust/detail/type_traits/iterator/is_discard_iterator.h deleted file mode 100644 index cca59da045..0000000000 --- a/compat/thrust/detail/type_traits/iterator/is_discard_iterator.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -template -struct is_discard_iterator - : public thrust::detail::false_type -{}; - -template -struct is_discard_iterator< thrust::discard_iterator > - : public thrust::detail::true_type -{}; - -} // end namespace detail -} // end namespace thrust - diff --git a/compat/thrust/detail/type_traits/iterator/is_output_iterator.h b/compat/thrust/detail/type_traits/iterator/is_output_iterator.h deleted file mode 100644 index 4cefe6353a..0000000000 --- a/compat/thrust/detail/type_traits/iterator/is_output_iterator.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
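
is_discard_iterator, deleted above, simply recognizes thrust::discard_iterator so algorithms can skip writes through it. For context, a typical use of the iterator it detects is throwing away one output of reduce_by_key; the snippet below uses the real Thrust API and needs the library (or this repo's bundled copy) to build.

#include <thrust/host_vector.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <iostream>

int main()
{
  thrust::host_vector<int> keys(6), vals(6, 1);
  keys[0] = keys[1] = 0;  keys[2] = keys[3] = keys[4] = 1;  keys[5] = 2;

  thrust::host_vector<int> sums(3);

  // only the per-key sums are wanted, so the compacted keys go to a discard_iterator
  thrust::reduce_by_key(keys.begin(), keys.end(), vals.begin(),
                        thrust::make_discard_iterator(), sums.begin());

  std::cout << sums[0] << ' ' << sums[1] << ' ' << sums[2] << '\n';  // 2 3 1
}
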
- */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ - -namespace detail -{ - - -template - struct is_void_like - : thrust::detail::or_< - thrust::detail::is_void, - thrust::detail::is_same - > -{}; // end is_void_like - - -template - struct lazy_is_void_like - : is_void_like -{}; // end lazy_is_void_like - - -// XXX this meta function should first check that T is actually an iterator -// -// if thrust::iterator_value is defined and thrust::iterator_value::type == void -// return false -// else -// return true -template - struct is_output_iterator - : eval_if< - is_metafunction_defined >::value, - lazy_is_void_like >, - thrust::detail::true_type - >::type -{ -}; // end is_output_iterator - -} // end detail - -} // end thrust - diff --git a/compat/thrust/detail/type_traits/minimum_type.h b/compat/thrust/detail/type_traits/minimum_type.h deleted file mode 100644 index aaa011ec8d..0000000000 --- a/compat/thrust/detail/type_traits/minimum_type.h +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ - -namespace detail -{ - -namespace minimum_type_detail -{ - -// -// Returns the minimum type or is empty -// if T1 and T2 are unrelated. 
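
The pseudocode comment in is_output_iterator.h above keys off one observable property: a "pure" output iterator (one you can only write through) reports void as its value_type. A minimal standard-library illustration of that property, assuming nothing from Thrust:

#include <iostream>
#include <iterator>
#include <type_traits>
#include <vector>

template<typename Iterator>
struct is_pure_output_iterator_sketch
  : std::is_void<typename std::iterator_traits<Iterator>::value_type> {};

int main()
{
  typedef std::back_insert_iterator< std::vector<int> > inserter;
  std::cout << is_pure_output_iterator_sketch<inserter>::value                   << '\n'; // 1
  std::cout << is_pure_output_iterator_sketch<std::vector<int>::iterator>::value << '\n'; // 0
}
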
-// -template struct minimum_type_impl {}; - -template -struct minimum_type_impl -{ - typedef T2 type; -}; // end minimum_type_impl - -template -struct minimum_type_impl -{ - typedef T1 type; -}; // end minimum_type_impl - -template -struct minimum_type_impl -{ - typedef T1 type; -}; // end minimum_type_impl - -template -struct primitive_minimum_type - : minimum_type_detail::minimum_type_impl< - T1, - T2, - ::thrust::detail::is_convertible::value, - ::thrust::detail::is_convertible::value - > -{ -}; // end primitive_minimum_type - -// because some types are not convertible (even to themselves) -// specialize primitive_minimum_type for when both types are identical -template -struct primitive_minimum_type -{ - typedef T type; -}; // end primitive_minimum_type - -// XXX this belongs somewhere more general -struct any_conversion -{ - template operator T (void); -}; - -} // end minimum_type_detail - -template - struct minimum_type; - -// base case -template - struct minimum_type - : minimum_type_detail::primitive_minimum_type -{}; - -template - struct lazy_minimum_type - : minimum_type< - typename T1::type, - typename T2::type - > -{}; - -// carefully avoid referring to a nested ::type which may not exist -template - struct minimum_type - : lazy_minimum_type< - lazy_minimum_type< - lazy_minimum_type< - minimum_type< - T1,T2 - >, - minimum_type< - T3,T4 - > - >, - lazy_minimum_type< - minimum_type< - T5,T6 - >, - minimum_type< - T7,T8 - > - > - >, - lazy_minimum_type< - lazy_minimum_type< - minimum_type< - T9,T10 - >, - minimum_type< - T11,T12 - > - >, - lazy_minimum_type< - minimum_type< - T13,T14 - >, - minimum_type< - T15,T16 - > - > - > - > -{}; - -} // end detail - -} // end thrust - diff --git a/compat/thrust/detail/type_traits/pointer_traits.h b/compat/thrust/detail/type_traits/pointer_traits.h deleted file mode 100644 index a0b5dc625c..0000000000 --- a/compat/thrust/detail/type_traits/pointer_traits.h +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
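
minimum_type.h, deleted above, reduces a list of types to the "least refined" one, deciding pairwise by convertibility: if T1 converts to T2 but not the other way around, T2 is the minimum. The sketch below shows that pairwise rule with illustrative names, using iterator category tags, its typical input, as the example.

#include <iostream>
#include <iterator>
#include <type_traits>
#include <typeinfo>

template<typename T1, typename T2,
         bool ToT2 = std::is_convertible<T1, T2>::value,
         bool ToT1 = std::is_convertible<T2, T1>::value>
struct minimum_type_sketch {};                                        // unrelated types: no ::type

template<typename T1, typename T2>
struct minimum_type_sketch<T1, T2, true, false> { typedef T2 type; }; // T1 is the more refined one

template<typename T1, typename T2>
struct minimum_type_sketch<T1, T2, false, true> { typedef T1 type; };

template<typename T1, typename T2>
struct minimum_type_sketch<T1, T2, true, true>  { typedef T1 type; };

int main()
{
  typedef minimum_type_sketch<std::random_access_iterator_tag,
                              std::forward_iterator_tag>::type t;
  std::cout << (typeid(t) == typeid(std::forward_iterator_tag)) << '\n'; // 1
}
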
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -template struct pointer_element; - -template class Ptr, typename Arg> - struct pointer_element > -{ - typedef Arg type; -}; - -template class Ptr, typename Arg1, typename Arg2> - struct pointer_element > -{ - typedef Arg1 type; -}; - -template class Ptr, typename Arg1, typename Arg2, typename Arg3> - struct pointer_element > -{ - typedef Arg1 type; -}; - -template class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4> - struct pointer_element > -{ - typedef Arg1 type; -}; - -template class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5> - struct pointer_element > -{ - typedef Arg1 type; -}; - -template - struct pointer_element -{ - typedef T type; -}; - -template - struct pointer_difference -{ - typedef typename Ptr::difference_type type; -}; - -template - struct pointer_difference -{ - typedef std::ptrdiff_t type; -}; - -template struct rebind_pointer; - -template - struct rebind_pointer -{ - typedef U* type; -}; - -template class Ptr, typename Arg, typename T> - struct rebind_pointer,T> -{ - typedef Ptr type; -}; - -template class Ptr, typename Arg1, typename Arg2, typename T> - struct rebind_pointer,T> -{ - typedef Ptr type; -}; - -template class Ptr, typename Arg1, typename Arg2, typename Arg3, typename T> - struct rebind_pointer,T> -{ - typedef Ptr type; -}; - -template class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename T> - struct rebind_pointer,T> -{ - typedef Ptr type; -}; - -// XXX this should probably be renamed native_type or similar -__THRUST_DEFINE_HAS_NESTED_TYPE(has_raw_pointer, raw_pointer) - -namespace pointer_traits_detail -{ - -template struct pointer_raw_pointer_impl {}; - -template - struct pointer_raw_pointer_impl -{ - typedef T* type; -}; - -template - struct pointer_raw_pointer_impl::value>::type> -{ - typedef typename Ptr::raw_pointer type; -}; - -} // end pointer_traits_detail - -template - struct pointer_raw_pointer - : pointer_traits_detail::pointer_raw_pointer_impl -{}; - -namespace pointer_traits_detail -{ - -template - struct capture_address -{ - template - __host__ __device__ - capture_address(T &r) - : m_addr(&r) - {} - - inline __host__ __device__ - Void *operator&() const - { - return m_addr; - } - - Void *m_addr; -}; - -// metafunction to compute the type of pointer_to's parameter below -template - struct pointer_to_param - : thrust::detail::eval_if< - thrust::detail::is_void::value, - thrust::detail::identity_ >, - thrust::detail::add_reference - > -{}; - -} - -template - struct pointer_traits -{ - typedef Ptr pointer; - typedef typename pointer_element::type element_type; - typedef typename pointer_difference::type difference_type; - - template - struct rebind - { - typedef typename rebind_pointer::type other; - }; - - __host__ __device__ - inline static pointer pointer_to(typename pointer_traits_detail::pointer_to_param::type r) - { - // XXX this is supposed to be pointer::pointer_to(&r); (i.e., call a static member function of pointer called pointer_to) - // assume that pointer has a constructor from raw pointer instead - - return pointer(&r); - } - - // thrust additions follow - typedef typename pointer_raw_pointer::type raw_pointer; - - __host__ __device__ - inline static raw_pointer get(pointer ptr) - { - return ptr.get(); - } -}; - -template - struct pointer_traits -{ - typedef T* pointer; - typedef T element_type; - typedef typename 
pointer_difference::type difference_type; - - template - struct rebind - { - typedef U* other; - }; - - __host__ __device__ - inline static pointer pointer_to(typename pointer_traits_detail::pointer_to_param::type r) - { - return &r; - } - - // thrust additions follow - typedef typename pointer_raw_pointer::type raw_pointer; - - __host__ __device__ - inline static raw_pointer get(pointer ptr) - { - return ptr; - } -}; - -template - struct is_pointer_convertible - : thrust::detail::and_< - thrust::detail::is_convertible< - typename pointer_element::type *, - typename pointer_element::type * - >, - thrust::detail::is_convertible< - typename iterator_system::type, - typename iterator_system::type - > - > -{}; - -// this could be a lot better, but for our purposes, it's probably -// sufficient just to check if pointer_raw_pointer has meaning -template - struct is_thrust_pointer - : is_metafunction_defined > -{}; - -// avoid inspecting traits of the arguments if they aren't known to be pointers -template - struct lazy_is_pointer_convertible - : thrust::detail::eval_if< - is_thrust_pointer::value && is_thrust_pointer::value, - is_pointer_convertible, - thrust::detail::identity_ - > -{}; - -template - struct enable_if_pointer_is_convertible - : thrust::detail::enable_if< - lazy_is_pointer_convertible::type::value, - T - > -{}; - - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/type_traits/result_of.h b/compat/thrust/detail/type_traits/result_of.h deleted file mode 100644 index e30b4fda3b..0000000000 --- a/compat/thrust/detail/type_traits/result_of.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -template struct result_of; - -// specialization for unary invocations of things which have result_type -template - struct result_of< - Functor(Arg1), - typename thrust::detail::enable_if::value>::type - > -{ - typedef typename Functor::result_type type; -}; // end result_of - -// specialization for binary invocations of things which have result_type -template - struct result_of< - Functor(Arg1,Arg2), - typename thrust::detail::enable_if::value>::type - > -{ - typedef typename Functor::result_type type; -}; - -} // end detail -} // end thrust - diff --git a/compat/thrust/detail/uninitialized_copy.inl b/compat/thrust/detail/uninitialized_copy.inl deleted file mode 100644 index a01dca53d6..0000000000 --- a/compat/thrust/detail/uninitialized_copy.inl +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
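
The deleted pointer_traits header anticipates what C++11 standardized as std::pointer_traits: given a plain or "fancy" pointer type, expose its element_type, rebind it to another element type, build one with pointer_to, and (Thrust's addition) recover the raw pointer. The standard facility covers the first three, shown below for plain pointers; raw_pointer/get() has no exact standard counterpart.

#include <iostream>
#include <memory>
#include <type_traits>

int main()
{
  int x = 42;

  // pointer_to(): construct a pointer designating an existing object
  int* p = std::pointer_traits<int*>::pointer_to(x);

  // element_type and rebind: the queries the deleted header implements by hand
  static_assert(std::is_same<std::pointer_traits<int*>::element_type, int>::value,
                "element_type of int* is int");
  static_assert(std::is_same<std::pointer_traits<int*>::rebind<double>, double*>::value,
                "rebinding int* to double gives double*");

  std::cout << *p << '\n';  // 42
}
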
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file uninitialized_copy.inl - * \brief Inline file for uninitialized_copy.h. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - ForwardIterator uninitialized_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - ForwardIterator result) -{ - using thrust::system::detail::generic::uninitialized_copy; - return uninitialized_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result); -} // end uninitialized_copy() - - -template - ForwardIterator uninitialized_copy_n(const thrust::detail::execution_policy_base &exec, - InputIterator first, - Size n, - ForwardIterator result) -{ - using thrust::system::detail::generic::uninitialized_copy_n; - return uninitialized_copy_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, result); -} // end uninitialized_copy_n() - - -template - ForwardIterator uninitialized_copy(InputIterator first, - InputIterator last, - ForwardIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::uninitialized_copy(select_system(system1,system2), first, last, result); -} // end uninitialized_copy() - - -template - ForwardIterator uninitialized_copy_n(InputIterator first, - Size n, - ForwardIterator result) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::uninitialized_copy_n(select_system(system1,system2), first, n, result); -} // end uninitialized_copy_n() - - -} // end thrust - - diff --git a/compat/thrust/detail/uninitialized_fill.inl b/compat/thrust/detail/uninitialized_fill.inl deleted file mode 100644 index 3545de56ee..0000000000 --- a/compat/thrust/detail/uninitialized_fill.inl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file uninitialized_fill.inl - * \brief Inline file for uninitialized_fill.h. 
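
Every wrapper in the .inl files deleted in this hunk (uninitialized_copy above, uninitialized_fill and unique below) follows the same shape: derive a "system" tag from the iterator types, then forward to a generic overload selected by that tag. A minimal self-contained sketch of that dispatch pattern, with made-up names (host_tag, iterator_system, my_copy), not Thrust's:

#include <algorithm>
#include <iostream>
#include <vector>

struct host_tag {};
struct device_tag {};

// pretend every iterator belongs to the host system in this sketch
template<typename Iterator> struct iterator_system { typedef host_tag type; };

// generic backend, selected by tag
template<typename InputIt, typename OutputIt>
OutputIt my_copy(host_tag, InputIt first, InputIt last, OutputIt result)
{
  std::cout << "dispatched to host backend\n";
  return std::copy(first, last, result);
}

// user-facing front end: derive the tag, then dispatch on it
template<typename InputIt, typename OutputIt>
OutputIt my_copy(InputIt first, InputIt last, OutputIt result)
{
  typename iterator_system<InputIt>::type system;
  return my_copy(system, first, last, result);
}

int main()
{
  std::vector<int> in(4, 7), out(4);
  my_copy(in.begin(), in.end(), out.begin());
  std::cout << out[0] << '\n';  // 7
}
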
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template - void uninitialized_fill(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T &x) -{ - using thrust::system::detail::generic::uninitialized_fill; - return uninitialized_fill(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, x); -} // end uninitialized_fill() - - -template - ForwardIterator uninitialized_fill_n(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - Size n, - const T &x) -{ - using thrust::system::detail::generic::uninitialized_fill_n; - return uninitialized_fill_n(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, n, x); -} // end uninitialized_fill_n() - - -template - void uninitialized_fill(ForwardIterator first, - ForwardIterator last, - const T &x) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - thrust::uninitialized_fill(select_system(system), first, last, x); -} // end uninitialized_fill() - - -template - ForwardIterator uninitialized_fill_n(ForwardIterator first, - Size n, - const T &x) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::uninitialized_fill_n(select_system(system), first, n, x); -} // end uninitialized_fill_n() - - -} // end thrust - diff --git a/compat/thrust/detail/unique.inl b/compat/thrust/detail/unique.inl deleted file mode 100644 index e90187d919..0000000000 --- a/compat/thrust/detail/unique.inl +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file unique.inl - * \brief Inline file for unique.h. 
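
uninitialized_fill differs from plain fill in that it constructs elements into raw storage instead of assigning over existing ones; the deleted .inl above only routes the call to the right backend. A host-only sketch of those semantics using the standard counterparts (std::allocator plus std::uninitialized_fill_n), not Thrust:

#include <iostream>
#include <memory>
#include <string>

struct entry { std::string name; };   // non-trivial, so construction actually matters

int main()
{
  const std::size_t n = 3;

  // raw, uninitialized storage for n entries
  std::allocator<entry> alloc;
  entry* raw = alloc.allocate(n);

  // construct n copies in place; assigning here instead would be undefined,
  // because no entry objects exist yet
  entry proto; proto.name = "ccminer";
  std::uninitialized_fill_n(raw, n, proto);

  std::cout << raw[2].name << '\n';  // ccminer

  // tear down in reverse order of construction, then release the storage
  for (std::size_t i = n; i-- > 0; ) raw[i].~entry();
  alloc.deallocate(raw, n);
}
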
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -template -ForwardIterator unique(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last) -{ - using thrust::system::detail::generic::unique; - return unique(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last); -} // end unique() - - -template -ForwardIterator unique(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::unique; - return unique(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, binary_pred); -} // end unique() - - -template -OutputIterator unique_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator output) -{ - using thrust::system::detail::generic::unique_copy; - return unique_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, output); -} // end unique_copy() - - -template -OutputIterator unique_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::unique_copy; - return unique_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, output, binary_pred); -} // end unique_copy() - - -template - thrust::pair - unique_by_key(const thrust::detail::execution_policy_base &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first) -{ - using thrust::system::detail::generic::unique_by_key; - return unique_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first); -} // end unique_by_key() - - -template - thrust::pair - unique_by_key(const thrust::detail::execution_policy_base &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::unique_by_key; - return unique_by_key(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, binary_pred); -} // end unique_by_key() - - -template - thrust::pair - unique_by_key_copy(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output) -{ - using thrust::system::detail::generic::unique_by_key_copy; - return unique_by_key_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output); -} // end unique_by_key_copy() - - -template - thrust::pair - unique_by_key_copy(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::unique_by_key_copy; - return unique_by_key_copy(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), keys_first, keys_last, values_first, keys_output, values_output, binary_pred); -} // end unique_by_key_copy() - - -template - ForwardIterator unique(ForwardIterator first, - ForwardIterator last) -{ - using 
thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::unique(select_system(system), first, last); -} // end unique() - - -template - ForwardIterator unique(ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System; - - System system; - - return thrust::unique(select_system(system), first, last, binary_pred); -} // end unique() - - -template - OutputIterator unique_copy(InputIterator first, - InputIterator last, - OutputIterator output) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::unique_copy(select_system(system1,system2), first, last, output); -} // end unique_copy() - - -template - OutputIterator unique_copy(InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::unique_copy(select_system(system1,system2), first, last, output, binary_pred); -} // end unique_copy() - - -template - thrust::pair - unique_by_key(ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::unique_by_key(select_system(system1,system2), keys_first, keys_last, values_first); -} // end unique_by_key() - - -template - thrust::pair - unique_by_key(ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - System1 system1; - System2 system2; - - return thrust::unique_by_key(select_system(system1,system2), keys_first, keys_last, values_first, binary_pred); -} // end unique_by_key() - - -template - thrust::pair - unique_by_key_copy(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output) -{ - using thrust::system::detail::generic::select_system; - - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::unique_by_key_copy(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output); -} // end unique_by_key_copy() - - -template - thrust::pair - unique_by_key_copy(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred) -{ - using thrust::system::detail::generic::select_system; - - typedef typename 
thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - typedef typename thrust::iterator_system::type System3; - typedef typename thrust::iterator_system::type System4; - - System1 system1; - System2 system2; - System3 system3; - System4 system4; - - return thrust::unique_by_key_copy(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred); -} // end unique_by_key_copy() - - -} // end namespace thrust - diff --git a/compat/thrust/detail/use_default.h b/compat/thrust/detail/use_default.h deleted file mode 100644 index c6eb66ef05..0000000000 --- a/compat/thrust/detail/use_default.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ - -struct use_default {}; - -} // end thrust - diff --git a/compat/thrust/detail/util/align.h b/compat/thrust/detail/util/align.h deleted file mode 100644 index 10f107a95b..0000000000 --- a/compat/thrust/detail/util/align.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include - -// functions to handle memory alignment - -namespace thrust -{ -namespace detail -{ -namespace util -{ - -template -T * align_up(T * ptr, detail::uintptr_t bytes) -{ - return (T *) ( bytes * (((detail::uintptr_t) ptr + (bytes - 1)) / bytes) ); -} - -template -T * align_down(T * ptr, detail::uintptr_t bytes) -{ - return (T *) ( bytes * (detail::uintptr_t(ptr) / bytes) ); -} - -template -bool is_aligned(T * ptr, detail::uintptr_t bytes = sizeof(T)) -{ - return detail::uintptr_t(ptr) % bytes == 0; -} - -} // end namespace util -} // end namespace detail -} // end namespace thrust - diff --git a/compat/thrust/detail/util/blocking.h b/compat/thrust/detail/util/blocking.h deleted file mode 100644 index 3bb78a637b..0000000000 --- a/compat/thrust/detail/util/blocking.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
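
Two tiny utility headers go away in this hunk as well: align.h (pointer alignment by integer arithmetic, deleted above) and blocking.h (round-up division used to size CUDA grids, deleted just below). The sketch reproduces those helpers with a host-only main; the throughput and TPB numbers are only example values.

#include <cstdint>
#include <iostream>

// round a pointer value up to the next multiple of 'bytes'
template<typename T>
T* align_up(T* ptr, std::uintptr_t bytes)
{
  return (T*)(bytes * (((std::uintptr_t)ptr + (bytes - 1)) / bytes));
}

template<typename T>
bool is_aligned(T* ptr, std::uintptr_t bytes = sizeof(T))
{
  return (std::uintptr_t)ptr % bytes == 0;
}

// x/y rounded towards +infinity; used to size a grid that covers every element
template<typename L, typename R>
L divide_ri(const L x, const R y) { return (x + (y - 1)) / y; }

// round x up to the next multiple of y
template<typename L, typename R>
L round_i(const L x, const R y) { return y * divide_ri(x, y); }

int main()
{
  char buffer[64];
  char* q = align_up(buffer + 3, 16);
  std::cout << is_aligned(q, 16) << '\n';             // 1

  const unsigned throughput = 1000000;                // e.g. nonces per scan call
  const unsigned TPB = 512;                           // threads per block (example)
  std::cout << divide_ri(throughput, TPB) << '\n';    // 1954 blocks
  std::cout << round_i(throughput, TPB) << '\n';      // 1000448 padded count
}
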
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -//functions to support blocking - -namespace thrust -{ - -namespace detail -{ - -namespace util -{ - -// x/y rounding towards +infinity for integers, used to determine # of blocks/warps etc. -template - inline __host__ __device__ L divide_ri(const L x, const R y) -{ - return (x + (y - 1)) / y; -} - -// x/y rounding towards zero for integers, used to determine # of blocks/warps etc. -template - inline __host__ __device__ L divide_rz(const L x, const R y) -{ - return x / y; -} - -// round x towards infinity to the next multiple of y -template - inline __host__ __device__ L round_i(const L x, const R y){ return y * divide_ri(x, y); } - -// round x towards zero to the next multiple of y -template - inline __host__ __device__ L round_z(const L x, const R y){ return y * divide_rz(x, y); } - -} // end namespace util - -} // end namespace detail - -} // end namespace thrust - diff --git a/compat/thrust/detail/vector_base.h b/compat/thrust/detail/vector_base.h deleted file mode 100644 index 6974eab554..0000000000 --- a/compat/thrust/detail/vector_base.h +++ /dev/null @@ -1,534 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file vector_base.h - * \brief Defines the interface to a base class for - * host_vector & device_vector. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - -namespace detail -{ - -template - class vector_base -{ - private: - typedef thrust::detail::contiguous_storage storage_type; - - public: - // typedefs - typedef typename storage_type::value_type value_type; - typedef typename storage_type::pointer pointer; - typedef typename storage_type::const_pointer const_pointer; - typedef typename storage_type::reference reference; - typedef typename storage_type::const_reference const_reference; - typedef typename storage_type::size_type size_type; - typedef typename storage_type::difference_type difference_type; - typedef typename storage_type::allocator_type allocator_type; - - typedef typename storage_type::iterator iterator; - typedef typename storage_type::const_iterator const_iterator; - - typedef thrust::reverse_iterator reverse_iterator; - typedef thrust::reverse_iterator const_reverse_iterator; - - /*! This constructor creates an empty vector_base. - */ - vector_base(void); - - /*! This constructor creates a vector_base with default-constructed - * elements. - * \param n The number of elements to create. - */ - explicit vector_base(size_type n); - - /*! 
This constructor creates a vector_base with copies - * of an exemplar element. - * \param n The number of elements to initially create. - * \param value An element to copy. - */ - explicit vector_base(size_type n, const value_type &value); - - /*! Copy constructor copies from an exemplar vector_base. - * \param v The vector_base to copy. - */ - vector_base(const vector_base &v); - - /*! assign operator makes a copy of an exemplar vector_base. - * \param v The vector_base to copy. - */ - vector_base &operator=(const vector_base &v); - - /*! Copy constructor copies from an exemplar vector_base with different - * type. - * \param v The vector_base to copy. - */ - template - vector_base(const vector_base &v); - - /*! assign operator makes a copy of an exemplar vector_base with different - * type. - * \param v The vector_base to copy. - */ - template - vector_base &operator=(const vector_base &v); - - /*! Copy constructor copies from an exemplar std::vector. - * \param v The std::vector to copy. - * XXX TODO: Make this method redundant with a properly templatized constructor. - * We would like to copy from a vector whose element type is anything - * assignable to value_type. - */ - template - vector_base(const std::vector &v); - - /*! assign operator makes a copy of an exemplar std::vector. - * \param v The vector to copy. - * XXX TODO: Templatize this assign on the type of the vector to copy from. - * We would like to copy from a vector whose element type is anything - * assignable to value_type. - */ - template - vector_base &operator=(const std::vector &v); - - /*! This constructor builds a vector_base from a range. - * \param first The beginning of the range. - * \param last The end of the range. - */ - template - vector_base(InputIterator first, InputIterator last); - - /*! The destructor erases the elements. - */ - ~vector_base(void); - - /*! \brief Resizes this vector_base to the specified number of elements. - * \param new_size Number of elements this vector_base should contain. - * \throw std::length_error If n exceeds max_size9). - * - * This method will resize this vector_base to the specified number of - * elements. If the number is smaller than this vector_base's current - * size this vector_base is truncated, otherwise this vector_base is - * extended and new elements are default constructed. - */ - void resize(size_type new_size); - - /*! \brief Resizes this vector_base to the specified number of elements. - * \param new_size Number of elements this vector_base should contain. - * \param x Data with which new elements should be populated. - * \throw std::length_error If n exceeds max_size(). - * - * This method will resize this vector_base to the specified number of - * elements. If the number is smaller than this vector_base's current - * size this vector_base is truncated, otherwise this vector_base is - * extended and new elements are populated with given data. - */ - void resize(size_type new_size, const value_type &x); - - /*! Returns the number of elements in this vector_base. - */ - size_type size(void) const; - - /*! Returns the size() of the largest possible vector_base. - * \return The largest possible return value of size(). - */ - size_type max_size(void) const; - - /*! \brief If n is less than or equal to capacity(), this call has no effect. - * Otherwise, this method is a request for allocation of additional memory. If - * the request is successful, then capacity() is greater than or equal to - * n; otherwise, capacity() is unchanged. 
In either case, size() is unchanged. - * \throw std::length_error If n exceeds max_size(). - */ - void reserve(size_type n); - - /*! Returns the number of elements which have been reserved in this - * vector_base. - */ - size_type capacity(void) const; - - /*! This method shrinks the capacity of this vector_base to exactly - * fit its elements. - */ - void shrink_to_fit(void); - - /*! \brief Subscript access to the data contained in this vector_dev. - * \param n The index of the element for which data should be accessed. - * \return Read/write reference to data. - * - * This operator allows for easy, array-style, data access. - * Note that data access with this operator is unchecked and - * out_of_range lookups are not defined. - */ - reference operator[](size_type n); - - /*! \brief Subscript read access to the data contained in this vector_dev. - * \param n The index of the element for which data should be accessed. - * \return Read reference to data. - * - * This operator allows for easy, array-style, data access. - * Note that data access with this operator is unchecked and - * out_of_range lookups are not defined. - */ - const_reference operator[](size_type n) const; - - /*! This method returns an iterator pointing to the beginning of - * this vector_base. - * \return mStart - */ - iterator begin(void); - - /*! This method returns a const_iterator pointing to the beginning - * of this vector_base. - * \return mStart - */ - const_iterator begin(void) const; - - /*! This method returns a const_iterator pointing to the beginning - * of this vector_base. - * \return mStart - */ - const_iterator cbegin(void) const; - - /*! This method returns a reverse_iterator pointing to the beginning of - * this vector_base's reversed sequence. - * \return A reverse_iterator pointing to the beginning of this - * vector_base's reversed sequence. - */ - reverse_iterator rbegin(void); - - /*! This method returns a const_reverse_iterator pointing to the beginning of - * this vector_base's reversed sequence. - * \return A const_reverse_iterator pointing to the beginning of this - * vector_base's reversed sequence. - */ - const_reverse_iterator rbegin(void) const; - - /*! This method returns a const_reverse_iterator pointing to the beginning of - * this vector_base's reversed sequence. - * \return A const_reverse_iterator pointing to the beginning of this - * vector_base's reversed sequence. - */ - const_reverse_iterator crbegin(void) const; - - /*! This method returns an iterator pointing to one element past the - * last of this vector_base. - * \return begin() + size(). - */ - iterator end(void); - - /*! This method returns a const_iterator pointing to one element past the - * last of this vector_base. - * \return begin() + size(). - */ - const_iterator end(void) const; - - /*! This method returns a const_iterator pointing to one element past the - * last of this vector_base. - * \return begin() + size(). - */ - const_iterator cend(void) const; - - /*! This method returns a reverse_iterator pointing to one element past the - * last of this vector_base's reversed sequence. - * \return rbegin() + size(). - */ - reverse_iterator rend(void); - - /*! This method returns a const_reverse_iterator pointing to one element past the - * last of this vector_base's reversed sequence. - * \return rbegin() + size(). - */ - const_reverse_iterator rend(void) const; - - /*! This method returns a const_reverse_iterator pointing to one element past the - * last of this vector_base's reversed sequence. 
- * \return rbegin() + size(). - */ - const_reverse_iterator crend(void) const; - - /*! This method returns a const_reference referring to the first element of this - * vector_base. - * \return The first element of this vector_base. - */ - const_reference front(void) const; - - /*! This method returns a reference pointing to the first element of this - * vector_base. - * \return The first element of this vector_base. - */ - reference front(void); - - /*! This method returns a const reference pointing to the last element of - * this vector_base. - * \return The last element of this vector_base. - */ - const_reference back(void) const; - - /*! This method returns a reference referring to the last element of - * this vector_dev. - * \return The last element of this vector_base. - */ - reference back(void); - - /*! This method returns a pointer to this vector_base's first element. - * \return A pointer to the first element of this vector_base. - */ - pointer data(void); - - /*! This method returns a const_pointer to this vector_base's first element. - * \return a const_pointer to the first element of this vector_base. - */ - const_pointer data(void) const; - - /*! This method resizes this vector_base to 0. - */ - void clear(void); - - /*! This method returns true iff size() == 0. - * \return true if size() == 0; false, otherwise. - */ - bool empty(void) const; - - /*! This method appends the given element to the end of this vector_base. - * \param x The element to append. - */ - void push_back(const value_type &x); - - /*! This method erases the last element of this vector_base, invalidating - * all iterators and references to it. - */ - void pop_back(void); - - /*! This method swaps the contents of this vector_base with another vector_base. - * \param v The vector_base with which to swap. - */ - void swap(vector_base &v); - - /*! This method removes the element at position pos. - * \param pos The position of the element of interest. - * \return An iterator pointing to the new location of the element that followed the element - * at position pos. - */ - iterator erase(iterator pos); - - /*! This method removes the range of elements [first,last) from this vector_base. - * \param first The beginning of the range of elements to remove. - * \param last The end of the range of elements to remove. - * \return An iterator pointing to the new location of the element that followed the last - * element in the sequence [first,last). - */ - iterator erase(iterator first, iterator last); - - /*! This method inserts a single copy of a given exemplar value at the - * specified position in this vector_base. - * \param position The insertion position. - * \param x The exemplar element to copy & insert. - * \return An iterator pointing to the newly inserted element. - */ - iterator insert(iterator position, const T &x); - - /*! This method inserts a copy of an exemplar value to a range at the - * specified position in this vector_base. - * \param position The insertion position - * \param n The number of insertions to perform. - * \param x The value to replicate and insert. - */ - void insert(iterator position, size_type n, const T &x); - - /*! This method inserts a copy of an input range at the specified position - * in this vector_base. - * \param position The insertion position. - * \param first The beginning of the range to copy. - * \param last The end of the range to copy. - * - * \tparam InputIterator is a model of Assignable. 
- */ - template - void insert(iterator position, InputIterator first, InputIterator last); - - /*! This version of \p assign replicates a given exemplar - * \p n times into this vector_base. - * \param n The number of times to copy \p x. - * \param x The exemplar element to replicate. - */ - void assign(size_type n, const T &x); - - /*! This version of \p assign makes this vector_base a copy of a given input range. - * \param first The beginning of the range to copy. - * \param last The end of the range to copy. - * - * \tparam InputIterator is a model of Input Iterator. - */ - template - void assign(InputIterator first, InputIterator last); - - /*! This method returns a copy of this vector's allocator. - * \return A copy of the alloctor used by this vector. - */ - allocator_type get_allocator(void) const; - - protected: - // Our storage - storage_type m_storage; - - // The size of this vector_base, in number of elements. - size_type m_size; - - private: - // these methods resolve the ambiguity of the constructor template of form (Iterator, Iterator) - template - void init_dispatch(IteratorOrIntegralType begin, IteratorOrIntegralType end, false_type); - - template - void init_dispatch(IteratorOrIntegralType n, IteratorOrIntegralType value, true_type); - - template - void range_init(InputIterator first, InputIterator last); - - template - void range_init(InputIterator first, InputIterator last, thrust::incrementable_traversal_tag); - - template - void range_init(ForwardIterator first, ForwardIterator last, thrust::random_access_traversal_tag); - - void default_init(size_type n); - - void fill_init(size_type n, const T &x); - - // these methods resolve the ambiguity of the insert() template of form (iterator, InputIterator, InputIterator) - template - void insert_dispatch(iterator position, InputIteratorOrIntegralType first, InputIteratorOrIntegralType last, false_type); - - // these methods resolve the ambiguity of the insert() template of form (iterator, InputIterator, InputIterator) - template - void insert_dispatch(iterator position, InputIteratorOrIntegralType n, InputIteratorOrIntegralType x, true_type); - - // this method appends n default-constructed elements at the end - void append(size_type n); - - // this method performs insertion from a fill value - void fill_insert(iterator position, size_type n, const T &x); - - // this method performs insertion from a range - template - void copy_insert(iterator position, InputIterator first, InputIterator last); - - // these methods resolve the ambiguity of the assign() template of form (InputIterator, InputIterator) - template - void assign_dispatch(InputIterator first, InputIterator last, false_type); - - // these methods resolve the ambiguity of the assign() template of form (InputIterator, InputIterator) - template - void assign_dispatch(Integral n, Integral x, true_type); - - // this method performs assignment from a range - template - void range_assign(InputIterator first, InputIterator last); - - // this method performs assignment from a range of RandomAccessIterators - template - void range_assign(RandomAccessIterator first, RandomAccessIterator last, thrust::random_access_traversal_tag); - - // this method performs assignment from a range of InputIterators - template - void range_assign(InputIterator first, InputIterator last, thrust::incrementable_traversal_tag); - - // this method performs assignment from a fill value - void fill_assign(size_type n, const T &x); - - // this method allocates new storage and construct copies the 
given range - template - void allocate_and_copy(size_type requested_size, - ForwardIterator first, ForwardIterator last, - storage_type &new_storage); -}; // end vector_base - -} // end detail - -/*! This function assigns the contents of vector a to vector b and the - * contents of vector b to vector a. - * - * \param a The first vector of interest. After completion, the contents - * of b will be returned here. - * \param b The second vector of interest. After completion, the contents - * of a will be returned here. - */ -template - void swap(detail::vector_base &a, - detail::vector_base &b); - - -/*! This operator allows comparison between two vectors. - * \param lhs The first \p vector to compare. - * \param rhs The second \p vector to compare. - * \return \c true if and only if each corresponding element in either - * \p vector equals the other; \c false, otherwise. - */ -template -bool operator==(const detail::vector_base& lhs, - const detail::vector_base& rhs); - -template -bool operator==(const detail::vector_base& lhs, - const std::vector& rhs); - -template -bool operator==(const std::vector& lhs, - const detail::vector_base& rhs); - -/*! This operator allows comparison between two vectors. - * \param lhs The first \p vector to compare. - * \param rhs The second \p vector to compare. - * \return \c false if and only if each corresponding element in either - * \p vector equals the other; \c true, otherwise. - */ -template -bool operator!=(const detail::vector_base& lhs, - const detail::vector_base& rhs); - -template -bool operator!=(const detail::vector_base& lhs, - const std::vector& rhs); - -template -bool operator!=(const std::vector& lhs, - const detail::vector_base& rhs); - -} // end thrust - -#include - diff --git a/compat/thrust/detail/vector_base.inl b/compat/thrust/detail/vector_base.inl deleted file mode 100644 index 24e6466c18..0000000000 --- a/compat/thrust/detail/vector_base.inl +++ /dev/null @@ -1,1203 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file vector_base.inl - * \brief Inline file for vector_base.h. 
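The tail of vector_base.h above declares swap() and mixed equality operators against std::vector, and vector_base.inl (below) adds a constructor and assignment from std::vector. A minimal sketch of how these surface through thrust::device_vector, which derives from detail::vector_base; the element count and values are arbitrary:

    #include <thrust/device_vector.h>
    #include <vector>
    #include <cassert>

    int main(void)
    {
        std::vector<int> h(4, 7);
        thrust::device_vector<int> d(h);   // std::vector constructor inherited from vector_base

        assert(d == h);                    // mixed operator== declared above
        d[0] = 13;
        assert(d != h);                    // element 0 now differs

        return 0;
    }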
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace thrust -{ - -namespace detail -{ - -template - vector_base - ::vector_base(void) - :m_storage(), - m_size(0) -{ - ; -} // end vector_base::vector_base() - -template - vector_base - ::vector_base(size_type n) - :m_storage(), - m_size(0) -{ - default_init(n); -} // end vector_base::vector_base() - -template - vector_base - ::vector_base(size_type n, const value_type &value) - :m_storage(), - m_size(0) -{ - fill_init(n,value); -} // end vector_base::vector_base() - -template - vector_base - ::vector_base(const vector_base &v) - :m_storage(), - m_size(0) -{ - range_init(v.begin(), v.end()); -} // end vector_base::vector_base() - -template - vector_base & - vector_base - ::operator=(const vector_base &v) -{ - if(this != &v) - { - assign(v.begin(), v.end()); - } // end if - - return *this; -} // end vector_base::operator=() - -template - template - vector_base - ::vector_base(const vector_base &v) - :m_storage(), - m_size(0) -{ - range_init(v.begin(), v.end()); -} // end vector_base::vector_base() - -template - template - vector_base & - vector_base - ::operator=(const vector_base &v) -{ - assign(v.begin(), v.end()); - - return *this; -} // end vector_base::operator=() - -template - template - vector_base - ::vector_base(const std::vector &v) - :m_storage(), - m_size(0) -{ - range_init(v.begin(), v.end()); -} // end vector_base::vector_base() - -template - template - vector_base & - vector_base - ::operator=(const std::vector &v) -{ - assign(v.begin(), v.end()); - - return *this; -} // end vector_base::operator=() - -template - template - void vector_base - ::init_dispatch(IteratorOrIntegralType n, - IteratorOrIntegralType value, - true_type) -{ - fill_init(n,value); -} // end vector_base::init_dispatch() - -template - void vector_base - ::default_init(size_type n) -{ - if(n > 0) - { - m_storage.allocate(n); - m_size = n; - - m_storage.default_construct_n(begin(), size()); - } // end if -} // end vector_base::default_init() - -template - void vector_base - ::fill_init(size_type n, const T &x) -{ - if(n > 0) - { - m_storage.allocate(n); - m_size = n; - - m_storage.uninitialized_fill_n(begin(), size(), x); - } // end if -} // end vector_base::fill_init() - -template - template - void vector_base - ::init_dispatch(InputIterator first, - InputIterator last, - false_type) -{ - range_init(first, last); -} // end vector_base::init_dispatch() - -template - template - void vector_base - ::range_init(InputIterator first, - InputIterator last) -{ - range_init(first, last, - typename thrust::iterator_traversal::type()); -} // end vector_base::range_init() - -template - template - void vector_base - ::range_init(InputIterator first, - InputIterator last, - thrust::incrementable_traversal_tag) -{ - for(; first != last; ++first) - push_back(*first); -} // end vector_base::range_init() - -template - template - void vector_base - ::range_init(ForwardIterator first, - ForwardIterator last, - thrust::random_access_traversal_tag) -{ - size_type new_size = thrust::distance(first, last); - - allocate_and_copy(new_size, first, last, m_storage); - m_size = new_size; -} // end vector_base::range_init() - -template - template - vector_base - ::vector_base(InputIterator first, - InputIterator last) - :m_storage(), - m_size(0) -{ - // check the type of InputIterator: if it's an integral type, - // we need to interpret this call as (size_type, value_type) - typedef thrust::detail::is_integral 
Integer; - - init_dispatch(first, last, Integer()); -} // end vector_basee::vector_base() - -template - void vector_base - ::resize(size_type new_size) -{ - if(new_size < size()) - { - iterator new_end = begin(); - thrust::advance(new_end, new_size); - erase(new_end, end()); - } // end if - else - { - append(new_size - size()); - } // end else -} // end vector_base::resize() - -template - void vector_base - ::resize(size_type new_size, const value_type &x) -{ - if(new_size < size()) - { - iterator new_end = begin(); - thrust::advance(new_end, new_size); - erase(new_end, end()); - } // end if - else - { - insert(end(), new_size - size(), x); - } // end else -} // end vector_base::resize() - -template - typename vector_base::size_type - vector_base - ::size(void) const -{ - return m_size; -} // end vector_base::size() - -template - typename vector_base::size_type - vector_base - ::max_size(void) const -{ - return m_storage.max_size(); -} // end vector_base::max_size() - -template - void vector_base - ::reserve(size_type n) -{ - if(n > capacity()) - { - allocate_and_copy(n, begin(), end(), m_storage); - } // end if -} // end vector_base::reserve() - -template - typename vector_base::size_type - vector_base - ::capacity(void) const -{ - return m_storage.size(); -} // end vector_base::capacity() - -template - void vector_base - ::shrink_to_fit(void) -{ - // use the swap trick - vector_base(*this).swap(*this); -} // end vector_base::shrink_to_fit() - -template - typename vector_base::reference - vector_base - ::operator[](const size_type n) -{ - return m_storage[n]; -} // end vector_base::operator[] - -template - typename vector_base::const_reference - vector_base - ::operator[](const size_type n) const -{ - return m_storage[n]; -} // end vector_base::operator[] - -template - typename vector_base::iterator - vector_base - ::begin(void) -{ - return m_storage.begin(); -} // end vector_base::begin() - -template - typename vector_base::const_iterator - vector_base - ::begin(void) const -{ - return m_storage.begin(); -} // end vector_base::begin() - -template - typename vector_base::const_iterator - vector_base - ::cbegin(void) const -{ - return begin(); -} // end vector_base::cbegin() - -template - typename vector_base::reverse_iterator - vector_base - ::rbegin(void) -{ - return reverse_iterator(end()); -} // end vector_base::rbegin() - -template - typename vector_base::const_reverse_iterator - vector_base - ::rbegin(void) const -{ - return const_reverse_iterator(end()); -} // end vector_base::rbegin() - -template - typename vector_base::const_reverse_iterator - vector_base - ::crbegin(void) const -{ - return rbegin(); -} // end vector_base::crbegin() - -template - typename vector_base::iterator - vector_base - ::end(void) -{ - iterator result = begin(); - thrust::advance(result, size()); - return result; -} // end vector_base::end() - -template - typename vector_base::const_iterator - vector_base - ::end(void) const -{ - const_iterator result = begin(); - thrust::advance(result, size()); - return result; -} // end vector_base::end() - -template - typename vector_base::const_iterator - vector_base - ::cend(void) const -{ - return end(); -} // end vector_base::cend() - -template - typename vector_base::reverse_iterator - vector_base - ::rend(void) -{ - return reverse_iterator(begin()); -} // end vector_base::rend() - -template - typename vector_base::const_reverse_iterator - vector_base - ::rend(void) const -{ - return const_reverse_iterator(begin()); -} // end vector_base::rend() - -template - 
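shrink_to_fit() above is implemented with the copy-and-swap trick; a minimal sketch of the same idiom at the thrust::device_vector level, with illustrative sizes:

    #include <thrust/device_vector.h>
    #include <cassert>

    int main(void)
    {
        thrust::device_vector<int> v(1000, 1);
        v.resize(10);                        // size shrinks, capacity stays at 1000

        // the swap trick used by vector_base::shrink_to_fit()
        thrust::device_vector<int>(v).swap(v);

        assert(v.size() == 10);              // capacity is now trimmed to fit
        return 0;
    }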
typename vector_base::const_reverse_iterator - vector_base - ::crend(void) const -{ - return rend(); -} // end vector_base::crend() - -template - typename vector_base::const_reference - vector_base - ::front(void) const -{ - return *begin(); -} // end vector_base::front() - -template - typename vector_base::reference - vector_base - ::front(void) -{ - return *begin(); -} // end vector_base::front() - -template - typename vector_base::const_reference - vector_base - ::back(void) const -{ - const_iterator ptr_to_back = end(); - --ptr_to_back; - return *ptr_to_back; -} // end vector_base::vector_base - -template - typename vector_base::reference - vector_base - ::back(void) -{ - iterator ptr_to_back = end(); - --ptr_to_back; - return *ptr_to_back; -} // end vector_base::vector_base - -template - typename vector_base::pointer - vector_base - ::data(void) -{ - return &front(); -} // end vector_base::data() - -template - typename vector_base::const_pointer - vector_base - ::data(void) const -{ - return &front(); -} // end vector_base::data() - -template - vector_base - ::~vector_base(void) -{ - // destroy every living thing - m_storage.destroy(begin(),end()); -} // end vector_base::~vector_base() - -template - void vector_base - ::clear(void) -{ - resize(0); -} // end vector_base::~vector_dev() - -template - bool vector_base - ::empty(void) const -{ - return size() == 0; -} // end vector_base::empty(); - -template - void vector_base - ::push_back(const value_type &x) -{ - insert(end(), x); -} // end vector_base::push_back() - -template - void vector_base - ::pop_back(void) -{ - iterator e = end(); - iterator ptr_to_back = e; - --ptr_to_back; - m_storage.destroy(ptr_to_back, e); - --m_size; -} // end vector_base::pop_back() - -template - typename vector_base::iterator vector_base - ::erase(iterator pos) -{ - iterator end = pos; - ++end; - return erase(pos,end); -} // end vector_base::erase() - -template - typename vector_base::iterator vector_base - ::erase(iterator first, iterator last) -{ - // overlap copy the range [last,end()) to first - // XXX this copy only potentially overlaps - iterator i = thrust::detail::overlapped_copy(last, end(), first); - - // destroy everything after i - m_storage.destroy(i, end()); - - // modify our size - m_size -= (last - first); - - // return an iterator pointing to the position of the first element - // following the erased range - return first; -} // end vector_base::erase() - -template - void vector_base - ::swap(vector_base &v) -{ - thrust::swap(m_storage, v.m_storage); - thrust::swap(m_size, v.m_size); -} // end vector_base::swap() - -template - void vector_base - ::assign(size_type n, const T &x) -{ - fill_assign(n, x); -} // end vector_base::assign() - -template - template - void vector_base - ::assign(InputIterator first, InputIterator last) -{ - // we could have received assign(n, x), so disambiguate on the - // type of InputIterator - typedef typename thrust::detail::is_integral integral; - - assign_dispatch(first, last, integral()); -} // end vector_base::assign() - -template - typename vector_base::allocator_type - vector_base - ::get_allocator(void) const -{ - return m_storage.get_allocator(); -} // end vector_base::get_allocator() - -template - typename vector_base::iterator - vector_base - ::insert(iterator position, const T &x) -{ - // find the index of the insertion - size_type index = thrust::distance(begin(), position); - - // make the insertion - insert(position, 1, x); - - // return an iterator pointing back to position - iterator result = 
begin(); - thrust::advance(result, index); - return result; -} // end vector_base::insert() - -template - void vector_base - ::insert(iterator position, size_type n, const T &x) -{ - fill_insert(position, n, x); -} // end vector_base::insert() - -template - template - void vector_base - ::insert(iterator position, InputIterator first, InputIterator last) -{ - // we could have received insert(position, n, x), so disambiguate on the - // type of InputIterator - typedef typename thrust::detail::is_integral integral; - - insert_dispatch(position, first, last, integral()); -} // end vector_base::insert() - -template - template - void vector_base - ::assign_dispatch(InputIterator first, InputIterator last, false_type) -{ - range_assign(first, last); -} // end vector_base::assign_dispatch() - -template - template - void vector_base - ::assign_dispatch(Integral n, Integral x, true_type) -{ - fill_assign(n, x); -} // end vector_base::assign_dispatch() - -template - template - void vector_base - ::insert_dispatch(iterator position, InputIterator first, InputIterator last, false_type) -{ - copy_insert(position, first, last); -} // end vector_base::insert_dispatch() - -template - template - void vector_base - ::insert_dispatch(iterator position, Integral n, Integral x, true_type) -{ - fill_insert(position, n, x); -} // end vector_base::insert_dispatch() - -template - template - void vector_base - ::copy_insert(iterator position, - ForwardIterator first, - ForwardIterator last) -{ - if(first != last) - { - // how many new elements will we create? - const size_type num_new_elements = thrust::distance(first, last); - if(capacity() - size() >= num_new_elements) - { - // we've got room for all of them - // how many existing elements will we displace? - const size_type num_displaced_elements = end() - position; - iterator old_end = end(); - - if(num_displaced_elements > num_new_elements) - { - // construct copy n displaced elements to new elements - // following the insertion - m_storage.uninitialized_copy(end() - num_new_elements, end(), end()); - - // extend the size - m_size += num_new_elements; - - // copy num_displaced_elements - num_new_elements elements to existing elements - // this copy overlaps - const size_type copy_length = (old_end - num_new_elements) - position; - thrust::detail::overlapped_copy(position, old_end - num_new_elements, old_end - copy_length); - - // finally, copy the range to the insertion point - thrust::copy(first, last, position); - } // end if - else - { - ForwardIterator mid = first; - thrust::advance(mid, num_displaced_elements); - - // construct copy new elements at the end of the vector - m_storage.uninitialized_copy(mid, last, end()); - - // extend the size - m_size += num_new_elements - num_displaced_elements; - - // construct copy the displaced elements - m_storage.uninitialized_copy(position, old_end, end()); - - // extend the size - m_size += num_displaced_elements; - - // copy to elements which already existed - thrust::copy(first, mid, position); - } // end else - } // end if - else - { - const size_type old_size = size(); - - // compute the new capacity after the allocation - size_type new_capacity = old_size + thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (old_size, num_new_elements); - - // allocate exponentially larger new storage - new_capacity = thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, 2 * capacity()); - - // do not exceed maximum storage - new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, max_size()); - - 
if(new_capacity > max_size()) - { - throw std::length_error("insert(): insertion exceeds max_size()."); - } // end if - - storage_type new_storage(new_capacity); - - // record how many constructors we invoke in the try block below - iterator new_end = new_storage.begin(); - - try - { - // construct copy elements before the insertion to the beginning of the newly - // allocated storage - new_end = m_storage.uninitialized_copy(begin(), position, new_storage.begin()); - - // construct copy elements to insert - new_end = m_storage.uninitialized_copy(first, last, new_end); - - // construct copy displaced elements from the old storage to the new storage - // remember [position, end()) refers to the old storage - new_end = m_storage.uninitialized_copy(position, end(), new_end); - } // end try - catch(...) - { - // something went wrong, so destroy & deallocate the new storage - m_storage.destroy(new_storage.begin(), new_end); - new_storage.deallocate(); - - // rethrow - throw; - } // end catch - - // call destructors on the elements in the old storage - m_storage.destroy(begin(), end()); - - // record the vector's new state - m_storage.swap(new_storage); - m_size = old_size + num_new_elements; - } // end else - } // end if -} // end vector_base::copy_insert() - -template - void vector_base - ::append(size_type n) -{ - if(n != 0) - { - if(capacity() - size() >= n) - { - // we've got room for all of them - - // default construct new elements at the end of the vector - m_storage.default_construct_n(end(), n); - - // extend the size - m_size += n; - } // end if - else - { - const size_type old_size = size(); - - // compute the new capacity after the allocation - size_type new_capacity = old_size + thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (old_size, n); - - // allocate exponentially larger new storage - new_capacity = thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, 2 * capacity()); - - // do not exceed maximum storage - new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, max_size()); - - // create new storage - storage_type new_storage(new_capacity); - - // record how many constructors we invoke in the try block below - iterator new_end = new_storage.begin(); - - try - { - // construct copy all elements into the newly allocated storage - new_end = m_storage.uninitialized_copy(begin(), end(), new_storage.begin()); - - // construct new elements to insert - m_storage.default_construct_n(new_end, n); - new_end += n; - } // end try - catch(...) - { - // something went wrong, so destroy & deallocate the new storage - m_storage.destroy(new_storage.begin(), new_end); - new_storage.deallocate(); - - // rethrow - throw; - } // end catch - - // call destructors on the elements in the old storage - m_storage.destroy(begin(), end()); - - // record the vector's new state - m_storage.swap(new_storage); - m_size = old_size + n; - } // end else - } // end if -} // end vector_base::append() - -template - void vector_base - ::fill_insert(iterator position, size_type n, const T &x) -{ - if(n != 0) - { - if(capacity() - size() >= n) - { - // we've got room for all of them - // how many existing elements will we displace? 
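The reallocating branches above (copy_insert() and append()) size new storage the same way: at least double the old capacity, and at least old_size plus the number of incoming elements. A small standalone sketch of that arithmetic with illustrative numbers, omitting the max_size() clamp:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    // mirrors: new_capacity = max(old_size + max(old_size, n), 2 * capacity)
    static std::size_t grown_capacity(std::size_t old_size,
                                      std::size_t old_capacity,
                                      std::size_t n)
    {
        std::size_t new_capacity = old_size + std::max(old_size, n);
        return std::max(new_capacity, 2 * old_capacity);
    }

    int main(void)
    {
        std::printf("%zu\n", grown_capacity(10, 10, 3));   // 20: plain doubling
        std::printf("%zu\n", grown_capacity(10, 10, 50));  // 60: driven by the large insertion
        return 0;
    }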
- const size_type num_displaced_elements = end() - position; - iterator old_end = end(); - - if(num_displaced_elements > n) - { - // construct copy n displaced elements to new elements - // following the insertion - m_storage.uninitialized_copy(end() - n, end(), end()); - - // extend the size - m_size += n; - - // copy num_displaced_elements - n elements to existing elements - // this copy overlaps - const size_type copy_length = (old_end - n) - position; - thrust::detail::overlapped_copy(position, old_end - n, old_end - copy_length); - - // finally, fill the range to the insertion point - thrust::fill_n(position, n, x); - } // end if - else - { - // construct new elements at the end of the vector - m_storage.uninitialized_fill_n(end(), n - num_displaced_elements, x); - - // extend the size - m_size += n - num_displaced_elements; - - // construct copy the displaced elements - m_storage.uninitialized_copy(position, old_end, end()); - - // extend the size - m_size += num_displaced_elements; - - // fill to elements which already existed - thrust::fill(position, old_end, x); - } // end else - } // end if - else - { - const size_type old_size = size(); - - // compute the new capacity after the allocation - size_type new_capacity = old_size + thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (old_size, n); - - // allocate exponentially larger new storage - new_capacity = thrust::max THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, 2 * capacity()); - - // do not exceed maximum storage - new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION (new_capacity, max_size()); - - if(new_capacity > max_size()) - { - throw std::length_error("insert(): insertion exceeds max_size()."); - } // end if - - storage_type new_storage(new_capacity); - - // record how many constructors we invoke in the try block below - iterator new_end = new_storage.begin(); - - try - { - // construct copy elements before the insertion to the beginning of the newly - // allocated storage - new_end = m_storage.uninitialized_copy(begin(), position, new_storage.begin()); - - // construct new elements to insert - m_storage.uninitialized_fill_n(new_end, n, x); - new_end += n; - - // construct copy displaced elements from the old storage to the new storage - // remember [position, end()) refers to the old storage - new_end = m_storage.uninitialized_copy(position, end(), new_end); - } // end try - catch(...) 
- { - // something went wrong, so destroy & deallocate the new storage - m_storage.destroy(new_storage.begin(), new_end); - new_storage.deallocate(); - - // rethrow - throw; - } // end catch - - // call destructors on the elements in the old storage - m_storage.destroy(begin(), end()); - - // record the vector's new state - m_storage.swap(new_storage); - m_size = old_size + n; - } // end else - } // end if -} // end vector_base::fill_insert() - -template - template - void vector_base - ::range_assign(InputIterator first, - InputIterator last) -{ - // dispatch on traversal - range_assign(first, last, - typename thrust::iterator_traversal::type()); -} // end range_assign() - -template - template - void vector_base - ::range_assign(InputIterator first, - InputIterator last, - thrust::incrementable_traversal_tag) -{ - iterator current(begin()); - - // assign to elements which already exist - for(; first != last && current != end(); ++current, ++first) - { - *current = *first; - } // end for - - // either just the input was exhausted or both - // the input and vector elements were exhausted - if(first == last) - { - // if we exhausted the input, erase leftover elements - erase(current, end()); - } // end if - else - { - // insert the rest of the input at the end of the vector - insert(end(), first, last); - } // end else -} // end vector_base::range_assign() - -template - template - void vector_base - ::range_assign(RandomAccessIterator first, - RandomAccessIterator last, - thrust::random_access_traversal_tag) -{ - const size_type n = thrust::distance(first, last); - - if(n > capacity()) - { - storage_type new_storage; - allocate_and_copy(n, first, last, new_storage); - - // call destructors on the elements in the old storage - m_storage.destroy(begin(), end()); - - // record the vector's new state - m_storage.swap(new_storage); - m_size = n; - } // end if - else if(size() >= n) - { - // we can already accomodate the new range - iterator new_end = thrust::copy(first, last, begin()); - - // destroy the elements we don't need - m_storage.destroy(new_end, end()); - - // update size - m_size = n; - } // end else if - else - { - // range fits inside allocated storage, but some elements - // have not been constructed yet - - // XXX TODO we could possibly implement this with one call - // to transform rather than copy + uninitialized_copy - - // copy to elements which already exist - RandomAccessIterator mid = first; - thrust::advance(mid, size()); - thrust::copy(first, mid, begin()); - - // uninitialize_copy to elements which must be constructed - m_storage.uninitialized_copy(mid, last, end()); - - // update size - m_size = n; - } // end else -} // end vector_base::assign() - -template - void vector_base - ::fill_assign(size_type n, const T &x) -{ - if(n > capacity()) - { - // XXX we should also include a copy of the allocator: - // vector_base temp(n, x, get_allocator()); - vector_base temp(n, x); - temp.swap(*this); - } // end if - else if(n > size()) - { - // fill to existing elements - thrust::fill(begin(), end(), x); - - // construct uninitialized elements - m_storage.uninitialized_fill_n(end(), n - size(), x); - - // adjust size - m_size += (n - size()); - } // end else if - else - { - // fill to existing elements - iterator new_end = thrust::fill_n(begin(), n, x); - - // erase the elements after the fill - erase(new_end, end()); - } // end else -} // end vector_base::fill_assign() - -template - template - void vector_base - ::allocate_and_copy(size_type requested_size, - ForwardIterator first, 
ForwardIterator last, - storage_type &new_storage) -{ - if(requested_size == 0) - { - new_storage.deallocate(); - return; - } // end if - - // allocate exponentially larger new storage - size_type allocated_size = thrust::max(requested_size, 2 * capacity()); - - // do not exceed maximum storage - allocated_size = thrust::min(allocated_size, max_size()); - - if(requested_size > allocated_size) - { - throw std::length_error("assignment exceeds max_size()."); - } // end if - - new_storage.allocate(allocated_size); - - try - { - // construct the range to the newly allocated storage - m_storage.uninitialized_copy(first, last, new_storage.begin()); - } // end try - catch(...) - { - // something went wrong, so destroy & deallocate the new storage - // XXX seems like this destroys too many elements -- should just be last - first instead of requested_size - iterator new_storage_end = new_storage.begin(); - thrust::advance(new_storage_end, requested_size); - m_storage.destroy(new_storage.begin(), new_storage_end); - new_storage.deallocate(); - - // rethrow - throw; - } // end catch -} // end vector_base::allocate_and_copy() - - -} // end detail - -template - void swap(detail::vector_base &a, - detail::vector_base &b) -{ - a.swap(b); -} // end swap() - - - -namespace detail -{ - -// iterator tags match -template -bool vector_equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, - thrust::detail::true_type) -{ - return thrust::equal(first1, last1, first2); -} - -// iterator tags differ -template -bool vector_equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, - thrust::detail::false_type) -{ - typename thrust::iterator_difference::type n = thrust::distance(first1,last1); - - typedef typename thrust::iterator_system::type FromSystem1; - typedef typename thrust::iterator_system::type FromSystem2; - - // bring both ranges to the host system - // note that these copies are no-ops if the range is already convertible to the host system - FromSystem1 from_system1; - FromSystem2 from_system2; - thrust::host_system_tag to_system; - thrust::detail::move_to_system rng1(from_system1, to_system, first1, last1); - thrust::detail::move_to_system rng2(from_system2, to_system, first2, first2 + n); - - return thrust::equal(rng1.begin(), rng1.end(), rng2.begin()); -} - -template -bool vector_equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2) -{ - typedef typename thrust::iterator_system::type system1; - typedef typename thrust::iterator_system::type system2; - - // dispatch on the sameness of the two systems - return vector_equal(first1, last1, first2, - thrust::detail::is_same()); -} - -} // end namespace detail - - - - -template -bool operator==(const detail::vector_base& lhs, - const detail::vector_base& rhs) -{ - return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin()); -} - -template -bool operator==(const detail::vector_base& lhs, - const std::vector& rhs) -{ - return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin()); -} - -template -bool operator==(const std::vector& lhs, - const detail::vector_base& rhs) -{ - return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin()); -} - -template -bool operator!=(const detail::vector_base& lhs, - const detail::vector_base& rhs) -{ - return !(lhs == rhs); -} - -template -bool operator!=(const detail::vector_base& lhs, - const std::vector& rhs) -{ - return !(lhs == rhs); -} - -template -bool 
operator!=(const std::vector& lhs, - const detail::vector_base& rhs) -{ - return !(lhs == rhs); -} - -} // end thrust - diff --git a/compat/thrust/device_allocator.h b/compat/thrust/device_allocator.h deleted file mode 100644 index a5462d1a28..0000000000 --- a/compat/thrust/device_allocator.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_allocator.h - * \brief An allocator which creates new elements in device memory - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup memory_management_classes Memory Management Classes - * \{ - */ - -template class device_allocator; - -/*! \p device_allocator is a device memory allocator. - * This class is a specialization for \c void. - * - * \see device_ptr - * \see http://www.sgi.com/tech/stl/Allocators.html - */ -template<> - class device_allocator -{ - public: - /*! Type of element allocated, \c void. */ - typedef void value_type; - - /*! Pointer to allocation, \c device_ptr. */ - typedef device_ptr pointer; - - /*! \c const pointer to allocation, \c device_ptr. */ - typedef device_ptr const_pointer; - - /*! Type of allocation size, \c std::size_t. */ - typedef std::size_t size_type; - - /*! Type of allocation difference, \c pointer::difference_type. */ - typedef pointer::difference_type difference_type; - - /*! The \p rebind metafunction provides the type of a \p device_allocator - * instantiated with another type. - * - * \tparam U The other type to use for instantiation. - */ - template - struct rebind - { - /*! The typedef \p other gives the type of the rebound \p device_allocator. - */ - typedef device_allocator other; - }; // end rebind -}; // end device_allocator - -/*! \p device_allocator is a device memory allocator. - * This implementation inherits from \p device_new_allocator. - * - * \see device_ptr - * \see device_new_allocator - * \see http://www.sgi.com/tech/stl/Allocators.html - */ -template - class device_allocator - : public device_new_allocator -{ - public: - /*! The \p rebind metafunction provides the type of a \p device_allocator - * instantiated with another type. - * - * \tparam U The other type to use for instantiation. - */ - template - struct rebind - { - /*! The typedef \p other gives the type of the rebound \p device_allocator. - */ - typedef device_allocator other; - }; // end rebind - - /*! No-argument constructor has no effect. - */ - __host__ __device__ - inline device_allocator() {} - - /*! Copy constructor has no effect. - */ - __host__ __device__ - inline device_allocator(device_allocator const&) {} - - /*! Constructor from other \p allocator has no effect. - */ - template - __host__ __device__ - inline device_allocator(device_allocator const&) {} -}; // end device_allocator - -/*! 
\} - */ - -} // end thrust - diff --git a/compat/thrust/device_delete.h b/compat/thrust/device_delete.h deleted file mode 100644 index 1df3bb6f46..0000000000 --- a/compat/thrust/device_delete.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_delete.h - * \brief Deletes variables in device memory - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -/*! \addtogroup deallocation_functions Deallocation Functions - * \ingroup memory_management_functions - * \{ - */ - -/*! \p device_delete deletes a \p device_ptr allocated with - * \p device_new. - * - * \param ptr The \p device_ptr to delete, assumed to have - * been allocated with \p device_new. - * \param n The number of objects to destroy at \p ptr. Defaults to \c 1 - * similar to \p device_new. - * - * \see device_ptr - * \see device_new - */ -template - inline void device_delete(thrust::device_ptr ptr, - const size_t n = 1); - -/*! \} - */ - -} // end thrust - -#include - diff --git a/compat/thrust/device_free.h b/compat/thrust/device_free.h deleted file mode 100644 index a734418e58..0000000000 --- a/compat/thrust/device_free.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_free.h - * \brief Deallocates storage allocated by \p device_malloc - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -/*! \addtogroup deallocation_functions Deallocation Functions - * \ingroup memory_management_functions - * \{ - */ - -/*! \p device_free deallocates memory allocated by the function \p device_malloc. - * - * \param ptr A \p device_ptr pointing to memory to be deallocated. - * - * The following code snippet demonstrates how to use \p device_free to - * deallocate memory allocated by \p device_malloc. - * - * \code - * #include - * #include - * ... - * // allocate some integers with device_malloc - * const int N = 100; - * thrust::device_ptr int_array = thrust::device_malloc(N); - * - * // manipulate integers - * ... - * - * // deallocate with device_free - * thrust::device_free(int_array); - * \endcode - * - * \see device_ptr - * \see device_malloc - */ -inline void device_free(thrust::device_ptr ptr); - -/*! 
\} - */ - -} // end thrust - -#include - diff --git a/compat/thrust/device_malloc.h b/compat/thrust/device_malloc.h deleted file mode 100644 index a3b07234f9..0000000000 --- a/compat/thrust/device_malloc.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_malloc.h - * \brief Allocates storage in device memory - */ - -#pragma once - -#include -#include -#include // for std::size_t - -namespace thrust -{ - -/*! \addtogroup allocation_functions Allocation Functions - * \ingroup memory_management_functions - * \{ - */ - -/*! This version of \p device_malloc allocates sequential device storage - * for bytes. - * - * \param n The number of bytes to allocate sequentially - * in device memory. - * \return A \p device_ptr to the newly allocated memory. - * - * The following code snippet demonstrates how to use \p device_malloc to - * allocate a range of device memory. - * - * \code - * #include - * #include - * ... - * // allocate some memory with device_malloc - * const int N = 100; - * thrust::device_ptr void_ptr = thrust::device_malloc(N); - * - * // manipulate memory - * ... - * - * // deallocate with device_free - * thrust::device_free(void_ptr); - * \endcode - * - * \see device_ptr - * \see device_free - */ -inline thrust::device_ptr device_malloc(const std::size_t n); - -/*! This version of \p device_malloc allocates sequential device storage for - * new objects of the given type. - * - * \param n The number of objects of type T to allocate - * sequentially in device memory. - * \return A \p device_ptr to the newly allocated memory. - * - * The following code snippet demonstrates how to use \p device_malloc to - * allocate a range of device memory. - * - * \code - * #include - * #include - * ... - * // allocate some integers with device_malloc - * const int N = 100; - * thrust::device_ptr int_array = thrust::device_malloc(N); - * - * // manipulate integers - * ... - * - * // deallocate with device_free - * thrust::device_free(int_array); - * \endcode - * - * \see device_ptr - * \see device_free - */ -template - inline thrust::device_ptr device_malloc(const std::size_t n); - -/*! \} - */ - -} // end thrust - -#include - diff --git a/compat/thrust/device_malloc_allocator.h b/compat/thrust/device_malloc_allocator.h deleted file mode 100644 index 404a6d297a..0000000000 --- a/compat/thrust/device_malloc_allocator.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
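device_malloc and device_free, both removed above, are the matching allocate/deallocate pair; the sketch below follows the typed usage shown in their documentation comments, with an arbitrary N:

    #include <thrust/device_malloc.h>
    #include <thrust/device_free.h>
    #include <thrust/device_ptr.h>
    #include <thrust/fill.h>

    int main(void)
    {
        const int N = 100;

        // allocate N integers in device memory
        thrust::device_ptr<int> int_array = thrust::device_malloc<int>(N);

        // manipulate them through the device_ptr
        thrust::fill(int_array, int_array + N, 0);

        // deallocate with device_free
        thrust::device_free(int_array);
        return 0;
    }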
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_malloc_allocator.h - * \brief An allocator which allocates storage with \p device_malloc - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - -// forward declarations to WAR circular #includes -template class device_ptr; -template device_ptr device_malloc(const std::size_t n); - -/*! \addtogroup memory_management Memory Management - * \addtogroup memory_management_classes Memory Management Classes - * \ingroup memory_management - * \{ - */ - -/*! \p device_malloc_allocator is a device memory allocator that employs the - * \p device_malloc function for allocation. - * - * \see device_malloc - * \see device_ptr - * \see http://www.sgi.com/tech/stl/Allocators.html - */ -template - class device_malloc_allocator -{ - public: - /*! Type of element allocated, \c T. */ - typedef T value_type; - - /*! Pointer to allocation, \c device_ptr. */ - typedef device_ptr pointer; - - /*! \c const pointer to allocation, \c device_ptr. */ - typedef device_ptr const_pointer; - - /*! Reference to allocated element, \c device_reference. */ - typedef device_reference reference; - - /*! \c const reference to allocated element, \c device_reference. */ - typedef device_reference const_reference; - - /*! Type of allocation size, \c std::size_t. */ - typedef std::size_t size_type; - - /*! Type of allocation difference, \c pointer::difference_type. */ - typedef typename pointer::difference_type difference_type; - - /*! The \p rebind metafunction provides the type of a \p device_malloc_allocator - * instantiated with another type. - * - * \tparam U The other type to use for instantiation. - */ - template - struct rebind - { - /*! The typedef \p other gives the type of the rebound \p device_malloc_allocator. - */ - typedef device_malloc_allocator other; - }; // end rebind - - /*! No-argument constructor has no effect. */ - __host__ __device__ - inline device_malloc_allocator() {} - - /*! No-argument destructor has no effect. */ - __host__ __device__ - inline ~device_malloc_allocator() {} - - /*! Copy constructor has no effect. */ - __host__ __device__ - inline device_malloc_allocator(device_malloc_allocator const&) {} - - /*! Constructor from other \p device_malloc_allocator has no effect. */ - template - __host__ __device__ - inline device_malloc_allocator(device_malloc_allocator const&) {} - - /*! Returns the address of an allocated object. - * \return &r. - */ - __host__ __device__ - inline pointer address(reference r) { return &r; } - - /*! Returns the address an allocated object. - * \return &r. - */ - __host__ __device__ - inline const_pointer address(const_reference r) { return &r; } - - /*! Allocates storage for \p cnt objects. - * \param cnt The number of objects to allocate. - * \return A \p pointer to uninitialized storage for \p cnt objects. - * \note Memory allocated by this function must be deallocated with \p deallocate. - */ - __host__ - inline pointer allocate(size_type cnt, - const_pointer = const_pointer(static_cast(0))) - { - if(cnt > this->max_size()) - { - throw std::bad_alloc(); - } // end if - - return pointer(device_malloc(cnt)); - } // end allocate() - - /*! Deallocates storage for objects allocated with \p allocate. - * \param p A \p pointer to the storage to deallocate. - * \param cnt The size of the previous allocation. 
- * \note Memory deallocated by this function must previously have been - * allocated with \p allocate. - */ - __host__ - inline void deallocate(pointer p, size_type cnt) - { - device_free(p); - } // end deallocate() - - /*! Returns the largest value \c n for which allocate(n) might succeed. - * \return The largest value \c n for which allocate(n) might succeed. - */ - inline size_type max_size() const - { - return (std::numeric_limits::max)() / sizeof(T); - } // end max_size() - - /*! Compares against another \p device_malloc_allocator for equality. - * \return \c true - */ - __host__ __device__ - inline bool operator==(device_malloc_allocator const&) { return true; } - - /*! Compares against another \p device_malloc_allocator for inequality. - * \return \c false - */ - __host__ __device__ - inline bool operator!=(device_malloc_allocator const &a) {return !operator==(a); } -}; // end device_malloc_allocator - -/*! \} - */ - -} // end thrust - - diff --git a/compat/thrust/device_new.h b/compat/thrust/device_new.h deleted file mode 100644 index 001d476896..0000000000 --- a/compat/thrust/device_new.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_new.h - * \brief Constructs new elements in device memory - */ - -#pragma once - -#include - -// #include this for size_t -#include -#include - -namespace thrust -{ - -/*! - * \addtogroup allocation_functions Allocation Functions - * \{ - */ - -/*! \p device_new implements the placement \c new operator for types - * resident in device memory. \p device_new calls T's null - * constructor on a array of objects in device memory. - * No memory is allocated by this function. - * - * \param p A \p device_ptr to a region of device memory into which - * to construct one or many Ts. - * \param n The number of objects to construct at \p p. - * \return p, casted to T's type. - * - * \see device_ptr - */ -template - device_ptr device_new(device_ptr p, - const size_t n = 1); - -/*! \p device_new implements the placement new operator for types - * resident in device memory. \p device_new calls T's copy - * constructor on a array of objects in device memory. No memory is - * allocated by this function. - * - * \param p A \p device_ptr to a region of device memory into which to - * construct one or many Ts. - * \param exemplar The value from which to copy. - * \param n The number of objects to construct at \p p. - * \return p, casted to T's type. - * - * \see device_ptr - * \see fill - */ -template - device_ptr device_new(device_ptr p, - const T &exemplar, - const size_t n = 1); - -/*! \p device_new implements the new operator for types resident in device memory. - * It allocates device memory large enough to hold \p n new objects of type \c T. - * - * \param n The number of objects to allocate. Defaults to \c 1. - * \return A \p device_ptr to the newly allocated region of device memory. - */ -template - device_ptr device_new(const size_t n = 1); - -/*! 
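device_new (above) allocates and constructs objects in device memory, and device_delete (removed earlier in this diff) destroys and releases them; a minimal sketch of the allocating overload, with an arbitrary count N:

    #include <thrust/device_new.h>
    #include <thrust/device_delete.h>
    #include <cstddef>

    int main(void)
    {
        const size_t N = 10;

        // allocate and default-construct N ints in device memory
        thrust::device_ptr<int> p = thrust::device_new<int>(N);

        *p = 42;                        // write through a device_reference

        // destroy the N objects and release the storage
        thrust::device_delete(p, N);
        return 0;
    }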
\} - */ - -} // end thrust - -#include - diff --git a/compat/thrust/device_new_allocator.h b/compat/thrust/device_new_allocator.h deleted file mode 100644 index 527d1fd7b5..0000000000 --- a/compat/thrust/device_new_allocator.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_new_allocator.h - * \brief An allocator which allocates storage with \p device_new - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup memory_management Memory Management - * \addtogroup memory_management_classes Memory Management Classes - * \ingroup memory_management - * \{ - */ - -/*! \p device_new_allocator is a device memory allocator that employs the - * \p device_new function for allocation. - * - * \see device_new - * \see device_ptr - * \see http://www.sgi.com/tech/stl/Allocators.html - */ -template - class device_new_allocator -{ - public: - /*! Type of element allocated, \c T. */ - typedef T value_type; - - /*! Pointer to allocation, \c device_ptr. */ - typedef device_ptr pointer; - - /*! \c const pointer to allocation, \c device_ptr. */ - typedef device_ptr const_pointer; - - /*! Reference to allocated element, \c device_reference. */ - typedef device_reference reference; - - /*! \c const reference to allocated element, \c device_reference. */ - typedef device_reference const_reference; - - /*! Type of allocation size, \c std::size_t. */ - typedef std::size_t size_type; - - /*! Type of allocation difference, \c pointer::difference_type. */ - typedef typename pointer::difference_type difference_type; - - /*! The \p rebind metafunction provides the type of a \p device_new_allocator - * instantiated with another type. - * - * \tparam U The other type to use for instantiation. - */ - template - struct rebind - { - /*! The typedef \p other gives the type of the rebound \p device_new_allocator. - */ - typedef device_new_allocator other; - }; // end rebind - - /*! No-argument constructor has no effect. */ - __host__ __device__ - inline device_new_allocator() {} - - /*! No-argument destructor has no effect. */ - __host__ __device__ - inline ~device_new_allocator() {} - - /*! Copy constructor has no effect. */ - __host__ __device__ - inline device_new_allocator(device_new_allocator const&) {} - - /*! Constructor from other \p device_malloc_allocator has no effect. */ - template - __host__ __device__ - inline device_new_allocator(device_new_allocator const&) {} - - /*! Returns the address of an allocated object. - * \return &r. - */ - __host__ __device__ - inline pointer address(reference r) { return &r; } - - /*! Returns the address an allocated object. - * \return &r. - */ - __host__ __device__ - inline const_pointer address(const_reference r) { return &r; } - - /*! Allocates storage for \p cnt objects. - * \param cnt The number of objects to allocate. - * \return A \p pointer to uninitialized storage for \p cnt objects. 
- * \note Memory allocated by this function must be deallocated with \p deallocate. - */ - __host__ - inline pointer allocate(size_type cnt, - const_pointer = const_pointer(static_cast(0))) - { - if(cnt > this->max_size()) - { - throw std::bad_alloc(); - } // end if - - // use "::operator new" rather than keyword new - return pointer(device_new(cnt)); - } // end allocate() - - /*! Deallocates storage for objects allocated with \p allocate. - * \param p A \p pointer to the storage to deallocate. - * \param cnt The size of the previous allocation. - * \note Memory deallocated by this function must previously have been - * allocated with \p allocate. - */ - __host__ - inline void deallocate(pointer p, size_type cnt) - { - // use "::operator delete" rather than keyword delete - device_delete(p); - } // end deallocate() - - /*! Returns the largest value \c n for which allocate(n) might succeed. - * \return The largest value \c n for which allocate(n) might succeed. - */ - __host__ __device__ - inline size_type max_size() const - { - return std::numeric_limits::max THRUST_PREVENT_MACRO_SUBSTITUTION () / sizeof(T); - } // end max_size() - - /*! Compares against another \p device_malloc_allocator for equality. - * \return \c true - */ - __host__ __device__ - inline bool operator==(device_new_allocator const&) { return true; } - - /*! Compares against another \p device_malloc_allocator for inequality. - * \return \c false - */ - __host__ __device__ - inline bool operator!=(device_new_allocator const &a) {return !operator==(a); } -}; // end device_new_allocator - -/*! \} - */ - -} // end thrust - diff --git a/compat/thrust/device_ptr.h b/compat/thrust/device_ptr.h deleted file mode 100644 index dfc7e90dc3..0000000000 --- a/compat/thrust/device_ptr.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_ptr.h - * \brief A pointer to a variable which resides in the "device" system's memory space - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup memory_management Memory Management - * \addtogroup memory_management_classes Memory Management Classes - * \ingroup memory_management - * \{ - */ - -// forward declarations -template class device_reference; - -/*! \p device_ptr stores a pointer to an object allocated in device memory. This type - * provides type safety when dispatching standard algorithms on ranges resident in - * device memory. - * - * \p device_ptr has pointer semantics: it may be dereferenced safely from the host and - * may be manipulated with pointer arithmetic. - * - * \p device_ptr can be created with the functions device_malloc, device_new, or - * device_pointer_cast, or by explicitly calling its constructor with a raw pointer. - * - * The raw pointer encapsulated by a \p device_ptr may be obtained by either its get - * method or the \p raw_pointer_cast free function. 
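device_malloc_allocator and device_new_allocator (both removed above) expose the usual allocate()/deallocate() pair and are normally consumed through containers such as device_vector, but they can also be driven directly; a sketch using device_malloc_allocator, with an arbitrary element count and fill value:

    #include <thrust/device_malloc_allocator.h>
    #include <thrust/device_ptr.h>
    #include <thrust/uninitialized_fill.h>

    int main(void)
    {
        thrust::device_malloc_allocator<int> alloc;

        // raw, uninitialized device storage for 64 ints
        thrust::device_ptr<int> p = alloc.allocate(64);

        // construct the elements before first use
        thrust::uninitialized_fill(p, p + 64, 7);

        alloc.deallocate(p, 64);
        return 0;
    }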
- * - * \note \p device_ptr is not a smart pointer; it is the programmer's responsibility to - * deallocate memory pointed to by \p device_ptr. - * - * \see device_malloc - * \see device_new - * \see device_pointer_cast - * \see raw_pointer_cast - */ -template - class device_ptr - : public thrust::pointer< - T, - thrust::device_system_tag, - thrust::device_reference, - thrust::device_ptr - > -{ - private: - typedef thrust::pointer< - T, - thrust::device_system_tag, - thrust::device_reference, - thrust::device_ptr - > super_t; - - public: - /*! \p device_ptr's null constructor initializes its raw pointer to \c 0. - */ - __host__ __device__ - device_ptr() : super_t() {} - - /*! \p device_ptr's copy constructor is templated to allow copying to a - * device_ptr from a T *. - * - * \param ptr A raw pointer to copy from, presumed to point to a location in - * device memory. - */ - template - __host__ __device__ - explicit device_ptr(OtherT *ptr) : super_t(ptr) {} - - /*! \p device_ptr's copy constructor allows copying from another device_ptr with related type. - * \param other The \p device_ptr to copy from. - */ - template - __host__ __device__ - device_ptr(const device_ptr &other) : super_t(other) {} - - /*! \p device_ptr's assignment operator allows assigning from another \p device_ptr with related type. - * \param other The other \p device_ptr to copy from. - * \return *this - */ - template - __host__ __device__ - device_ptr &operator=(const device_ptr &other) - { - super_t::operator=(other); - return *this; - } - -// declare these members for the purpose of Doxygenating them -// they actually exist in a derived-from class -#if 0 - /*! This method returns this \p device_ptr's raw pointer. - * \return This \p device_ptr's raw pointer. - */ - __host__ __device__ - T *get(void) const; -#endif // end doxygen-only members -}; // end device_ptr - -/*! This operator outputs the value of a \p device_ptr's raw pointer to a \p std::basic_ostream. - * - * \param os The std::basic_ostream of interest. - * \param p The device_ptr of interest. - * \return os. - */ -template -inline std::basic_ostream &operator<<(std::basic_ostream &os, const device_ptr &p); - -/*! \} - */ - - -/*! - * \addtogroup memory_management_functions Memory Management Functions - * \ingroup memory_management - * \{ - */ - -/*! \p device_pointer_cast creates a device_ptr from a raw pointer which is presumed to point - * to a location in device memory. - * - * \param ptr A raw pointer, presumed to point to a location in device memory. - * \return A device_ptr wrapping ptr. - */ -template -__host__ __device__ -inline device_ptr device_pointer_cast(T *ptr); - -/*! This version of \p device_pointer_cast creates a copy of a device_ptr from another device_ptr. - * This version is included for symmetry with \p raw_pointer_cast. - * - * \param ptr A device_ptr. - * \return A copy of \p ptr. - */ -template -__host__ __device__ -inline device_ptr device_pointer_cast(const device_ptr &ptr); - -/*! \} - */ - -} // end thrust - -#include -#include - diff --git a/compat/thrust/device_reference.h b/compat/thrust/device_reference.h deleted file mode 100644 index edae2b59af..0000000000 --- a/compat/thrust/device_reference.h +++ /dev/null @@ -1,969 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
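device_ptr (above) is the typed wrapper that lets Thrust algorithms dispatch to device memory; device_pointer_cast wraps a raw pointer and get() (or the raw_pointer_cast free function) recovers it, for example around a cudaMalloc'd buffer:

    #include <thrust/device_ptr.h>
    #include <thrust/fill.h>
    #include <cuda_runtime.h>

    int main(void)
    {
        int *raw = 0;
        cudaMalloc(&raw, 100 * sizeof(int));

        // wrap the raw pointer so Thrust algorithms run on the device
        thrust::device_ptr<int> d = thrust::device_pointer_cast(raw);
        thrust::fill(d, d + 100, 0);

        // recover the raw pointer, e.g. to pass to a kernel
        // (thrust::raw_pointer_cast(d) is the equivalent free function)
        int *back = d.get();

        cudaFree(back);
        return 0;
    }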
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_reference.h - * \brief A reference to a variable which resides in the "device" system's memory space - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup memory_management_classes Memory Management Classes - * \ingroup memory_management - * \{ - */ - -/*! \p device_reference acts as a reference-like object to an object stored in device memory. - * \p device_reference is not intended to be used directly; rather, this type - * is the result of deferencing a \p device_ptr. Similarly, taking the address of - * a \p device_reference yields a \p device_ptr. - * - * \p device_reference may often be used from host code in place of operations defined on - * its associated \c value_type. For example, when \p device_reference refers to an - * arithmetic type, arithmetic operations on it are legal: - * - * \code - * #include - * - * int main(void) - * { - * thrust::device_vector vec(1, 13); - * - * thrust::device_reference ref_to_thirteen = vec[0]; - * - * int x = ref_to_thirteen + 1; - * - * // x is 14 - * - * return 0; - * } - * \endcode - * - * Similarly, we can print the value of \c ref_to_thirteen in the above code by using an - * \c iostream: - * - * \code - * #include - * #include - * - * int main(void) - * { - * thrust::device_vector vec(1, 13); - * - * thrust::device_reference ref_to_thirteen = vec[0]; - * - * std::cout << ref_to_thirteen << std::endl; - * - * // 13 is printed - * - * return 0; - * } - * \endcode - * - * Of course, we needn't explicitly create a \p device_reference in the previous - * example, because one is returned by \p device_vector's bracket operator. A more natural - * way to print the value of a \p device_vector element might be: - * - * \code - * #include - * #include - * - * int main(void) - * { - * thrust::device_vector vec(1, 13); - * - * std::cout << vec[0] << std::endl; - * - * // 13 is printed - * - * return 0; - * } - * \endcode - * - * These kinds of operations should be used sparingly in performance-critical code, because - * they imply a potentially expensive copy between host and device space. - * - * Some operations which are possible with regular objects are impossible with their - * corresponding \p device_reference objects due to the requirements of the C++ language. For - * example, because the member access operator cannot be overloaded, member variables and functions - * of a referent object cannot be directly accessed through its \p device_reference. 
- * - * The following code, which generates a compiler error, illustrates: - * - * \code - * #include - * - * struct foo - * { - * int x; - * }; - * - * int main(void) - * { - * thrust::device_vector foo_vec(1); - * - * thrust::device_reference foo_ref = foo_vec[0]; - * - * foo_ref.x = 13; // ERROR: x cannot be accessed through foo_ref - * - * return 0; - * } - * \endcode - * - * Instead, a host space copy must be created to access \c foo's \c x member: - * - * \code - * #include - * - * struct foo - * { - * int x; - * }; - * - * int main(void) - * { - * thrust::device_vector foo_vec(1); - * - * // create a local host-side foo object - * foo host_foo; - * host_foo.x = 13; - * - * thrust::device_reference foo_ref = foo_vec[0]; - * - * foo_ref = host_foo; - * - * // foo_ref's x member is 13 - * - * return 0; - * } - * \endcode - * - * Another common case where a \p device_reference cannot directly be used in place of - * its referent object occurs when passing them as parameters to functions like \c printf - * which have varargs parameters. Because varargs parameters must be Plain Old Data, a - * \p device_reference to a POD type requires a cast when passed to \c printf: - * - * \code - * #include - * #include - * - * int main(void) - * { - * thrust::device_vector vec(1,13); - * - * // vec[0] must be cast to int when passing to printf - * printf("%d\n", (int) vec[0]); - * - * return 0; - * } - * \endcode - * - * \see device_ptr - * \see device_vector - */ -template - class device_reference - : public thrust::reference< - T, - thrust::device_ptr, - thrust::device_reference - > -{ - private: - typedef thrust::reference< - T, - thrust::device_ptr, - thrust::device_reference - > super_t; - - public: - /*! The type of the value referenced by this type of \p device_reference. - */ - typedef typename super_t::value_type value_type; - - /*! The type of the expression &ref, where ref is a \p device_reference. - */ - typedef typename super_t::pointer pointer; - - /*! This copy constructor accepts a const reference to another - * \p device_reference. After this \p device_reference is constructed, - * it shall refer to the same object as \p other. - * - * \param other A \p device_reference to copy from. - * - * The following code snippet demonstrates the semantics of this - * copy constructor. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(1,0); - * thrust::device_reference ref = v[0]; - * - * // ref equals the object at v[0] - * assert(ref == v[0]); - * - * // the address of ref equals the address of v[0] - * assert(&ref == &v[0]); - * - * // modifying v[0] modifies ref - * v[0] = 13; - * assert(ref == 13); - * \endcode - * - * \note This constructor is templated primarily to allow initialization of - * device_reference from device_reference. - */ - template - __host__ __device__ - device_reference(const device_reference &other, - typename thrust::detail::enable_if_convertible< - typename device_reference::pointer, - pointer - >::type * = 0) - : super_t(other) - {} - - /*! This copy constructor initializes this \p device_reference - * to refer to an object pointed to by the given \p device_ptr. After - * this \p device_reference is constructed, it shall refer to the - * object pointed to by \p ptr. - * - * \param ptr A \p device_ptr to copy from. - * - * The following code snippet demonstrates the semantic of this - * copy constructor. - * - * \code - * #include - * #include - * ... 
- * thrust::device_vector v(1,0); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals the object pointed to by ptr - * assert(ref == *ptr); - * - * // the address of ref equals ptr - * assert(&ref == ptr); - * - * // modifying *ptr modifies ref - * *ptr = 13; - * assert(ref == 13); - * \endcode - */ - __host__ __device__ - explicit device_reference(const pointer &ptr) - : super_t(ptr) - {} - - /*! This assignment operator assigns the value of the object referenced by - * the given \p device_reference to the object referenced by this - * \p device_reference. - * - * \param other The \p device_reference to assign from. - * \return *this - */ - template - __host__ __device__ - device_reference &operator=(const device_reference &other); - - /*! Assignment operator assigns the value of the given value to the - * value referenced by this \p device_reference. - * - * \param x The value to assign from. - * \return *this - */ - __host__ __device__ - device_reference &operator=(const value_type &x); - -// declare these members for the purpose of Doxygenating them -// they actually exist in a derived-from class -#if 0 - /*! Address-of operator returns a \p device_ptr pointing to the object - * referenced by this \p device_reference. It does not return the - * address of this \p device_reference. - * - * \return A \p device_ptr pointing to the object this - * \p device_reference references. - */ - __host__ __device__ - pointer operator&(void) const; - - /*! Conversion operator converts this \p device_reference to T - * by returning a copy of the object referenced by this - * \p device_reference. - * - * \return A copy of the object referenced by this \p device_reference. - */ - __host__ __device__ - operator value_type (void) const; - - /*! swaps the value this \p device_reference references with another. - * \p other The other \p device_reference with which to swap. - */ - __host__ __device__ - void swap(device_reference &other); - - /*! Prefix increment operator increments the object referenced by this - * \p device_reference. - * - * \return *this - * - * The following code snippet demonstrates the semantics of - * \p device_reference's prefix increment operator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(1,0); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 0 - * assert(ref == 0); - * - * // the object pointed to by ptr equals 1 - * assert(*ptr == 1); - * - * // v[0] equals 1 - * assert(v[0] == 1); - * - * // increment ref - * ++ref; - * - * // ref equals 1 - * assert(ref == 1); - * - * // the object pointed to by ptr equals 1 - * assert(*ptr == 1); - * - * // v[0] equals 1 - * assert(v[0] == 1); - * \endcode - * - * \note The increment executes as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator++(void); - - /*! Postfix increment operator copies the object referenced by this - * \p device_reference, increments the object referenced by this - * \p device_reference, and returns the copy. - * - * \return A copy of the object referenced by this \p device_reference - * before being incremented. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's postfix increment operator. - * - * \code - * #include - * #include - * ... 
- * thrust::device_vector v(1,0); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 0 - * assert(ref == 0); - * - * // the object pointed to by ptr equals 0 - * assert(*ptr == 0); - * - * // v[0] equals 0 - * assert(v[0] == 0); - * - * // increment ref - * int x = ref++; - * - * // x equals 0 - * assert(x == 0) - * - * // ref equals 1 - * assert(ref == 1); - * - * // the object pointed to by ptr equals 1 - * assert(*ptr == 1); - * - * // v[0] equals 1 - * assert(v[0] == 1); - * \endcode - * - * \note The increment executes as if it were executed on the host. - * This may change in a later version. - */ - value_type operator++(int); - - /*! Addition assignment operator add-assigns the object referenced by this - * \p device_reference and returns this \p device_reference. - * - * \param rhs The right hand side of the add-assignment. - * \return *this. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's addition assignment operator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(1,0); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 0 - * assert(ref == 0); - * - * // the object pointed to by ptr equals 0 - * assert(*ptr == 0); - * - * // v[0] equals 0 - * assert(v[0] == 0); - * - * // add-assign ref - * ref += 5; - * - * // ref equals 5 - * assert(ref == 5); - * - * // the object pointed to by ptr equals 5 - * assert(*ptr == 5); - * - * // v[0] equals 5 - * assert(v[0] == 5); - * \endcode - * - * \note The add-assignment executes as as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator+=(const T &rhs); - - /*! Prefix decrement operator decrements the object referenced by this - * \p device_reference. - * - * \return *this - * - * The following code snippet demonstrates the semantics of - * \p device_reference's prefix decrement operator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(1,0); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 0 - * assert(ref == 0); - * - * // the object pointed to by ptr equals 0 - * assert(*ptr == 0); - * - * // v[0] equals 0 - * assert(v[0] == 0); - * - * // decrement ref - * --ref; - * - * // ref equals -1 - * assert(ref == -1); - * - * // the object pointed to by ptr equals -1 - * assert(*ptr == -1); - * - * // v[0] equals -1 - * assert(v[0] == -1); - * \endcode - * - * \note The decrement executes as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator--(void); - - /*! Postfix decrement operator copies the object referenced by this - * \p device_reference, decrements the object referenced by this - * \p device_reference, and returns the copy. - * - * \return A copy of the object referenced by this \p device_reference - * before being decremented. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's postfix decrement operator. - * - * \code - * #include - * #include - * ... 
- * thrust::device_vector v(1,0); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 0 - * assert(ref == 0); - * - * // the object pointed to by ptr equals 0 - * assert(*ptr == 0); - * - * // v[0] equals 0 - * assert(v[0] == 0); - * - * // decrement ref - * int x = ref--; - * - * // x equals 0 - * assert(x == 0) - * - * // ref equals -1 - * assert(ref == -1); - * - * // the object pointed to by ptr equals -1 - * assert(*ptr == -1); - * - * // v[0] equals -1 - * assert(v[0] == -1); - * \endcode - * - * \note The decrement executes as if it were executed on the host. - * This may change in a later version. - */ - value_type operator--(int); - - /*! Subtraction assignment operator subtract-assigns the object referenced by this - * \p device_reference and returns this \p device_reference. - * - * \param rhs The right hand side of the subtraction-assignment. - * \return *this. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's addition assignment operator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(1,0); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 0 - * assert(ref == 0); - * - * // the object pointed to by ptr equals 0 - * assert(*ptr == 0); - * - * // v[0] equals 0 - * assert(v[0] == 0); - * - * // subtract-assign ref - * ref -= 5; - * - * // ref equals -5 - * assert(ref == -5); - * - * // the object pointed to by ptr equals -5 - * assert(*ptr == -5); - * - * // v[0] equals -5 - * assert(v[0] == -5); - * \endcode - * - * \note The subtract-assignment executes as as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator-=(const T &rhs); - - /*! Multiplication assignment operator multiply-assigns the object referenced by this - * \p device_reference and returns this \p device_reference. - * - * \param rhs The right hand side of the multiply-assignment. - * \return *this. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's multiply assignment operator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(1,1); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 1 - * assert(ref == 1); - * - * // the object pointed to by ptr equals 1 - * assert(*ptr == 1); - * - * // v[0] equals 1 - * assert(v[0] == 1); - * - * // multiply-assign ref - * ref *= 5; - * - * // ref equals 5 - * assert(ref == 5); - * - * // the object pointed to by ptr equals 5 - * assert(*ptr == 5); - * - * // v[0] equals 5 - * assert(v[0] == 5); - * \endcode - * - * \note The multiply-assignment executes as as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator*=(const T &rhs); - - /*! Division assignment operator divide-assigns the object referenced by this - * \p device_reference and returns this \p device_reference. - * - * \param rhs The right hand side of the divide-assignment. - * \return *this. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's divide assignment operator. - * - * \code - * #include - * #include - * ... 
- * thrust::device_vector v(1,5); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 5 - * assert(ref == 5); - * - * // the object pointed to by ptr equals 5 - * assert(*ptr == 5); - * - * // v[0] equals 5 - * assert(v[0] == 5); - * - * // divide-assign ref - * ref /= 5; - * - * // ref equals 1 - * assert(ref == 1); - * - * // the object pointed to by ptr equals 1 - * assert(*ptr == 1); - * - * // v[0] equals 1 - * assert(v[0] == 1); - * \endcode - * - * \note The divide-assignment executes as as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator/=(const T &rhs); - - /*! Modulation assignment operator modulus-assigns the object referenced by this - * \p device_reference and returns this \p device_reference. - * - * \param rhs The right hand side of the divide-assignment. - * \return *this. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's divide assignment operator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(1,5); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 5 - * assert(ref == 5); - * - * // the object pointed to by ptr equals 5 - * assert(*ptr == 5); - * - * // v[0] equals 5 - * assert(v[0] == 5); - * - * // modulus-assign ref - * ref %= 5; - * - * // ref equals 0 - * assert(ref == 0); - * - * // the object pointed to by ptr equals 0 - * assert(*ptr == 0); - * - * // v[0] equals 0 - * assert(v[0] == 0); - * \endcode - * - * \note The modulus-assignment executes as as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator%=(const T &rhs); - - /*! Bitwise left shift assignment operator left shift-assigns the object referenced by this - * \p device_reference and returns this \p device_reference. - * - * \param rhs The right hand side of the left shift-assignment. - * \return *this. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's left shift assignment operator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(1,1); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 1 - * assert(ref == 1); - * - * // the object pointed to by ptr equals 1 - * assert(*ptr == 1); - * - * // v[0] equals 1 - * assert(v[0] == 1); - * - * // left shift-assign ref - * ref <<= 1; - * - * // ref equals 2 - * assert(ref == 2); - * - * // the object pointed to by ptr equals 2 - * assert(*ptr == 2); - * - * // v[0] equals 2 - * assert(v[0] == 2); - * \endcode - * - * \note The left shift-assignment executes as as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator<<=(const T &rhs); - - /*! Bitwise right shift assignment operator right shift-assigns the object referenced by this - * \p device_reference and returns this \p device_reference. - * - * \param rhs The right hand side of the right shift-assignment. - * \return *this. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's right shift assignment operator. - * - * \code - * #include - * #include - * ... 
- * thrust::device_vector v(1,2); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 2 - * assert(ref == 2); - * - * // the object pointed to by ptr equals 2 - * assert(*ptr == 2); - * - * // v[0] equals 2 - * assert(v[0] == 2); - * - * // right shift-assign ref - * ref >>= 1; - * - * // ref equals 1 - * assert(ref == 1); - * - * // the object pointed to by ptr equals 1 - * assert(*ptr == 1); - * - * // v[0] equals 1 - * assert(v[0] == 1); - * \endcode - * - * \note The right shift-assignment executes as as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator>>=(const T &rhs); - - /*! Bitwise AND assignment operator AND-assigns the object referenced by this - * \p device_reference and returns this \p device_reference. - * - * \param rhs The right hand side of the AND-assignment. - * \return *this. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's AND assignment operator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(1,1); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 1 - * assert(ref == 1); - * - * // the object pointed to by ptr equals 1 - * assert(*ptr == 1); - * - * // v[0] equals 1 - * assert(v[0] == 1); - * - * // right AND-assign ref - * ref &= 0; - * - * // ref equals 0 - * assert(ref == 0); - * - * // the object pointed to by ptr equals 0 - * assert(*ptr == 0); - * - * // v[0] equals 0 - * assert(v[0] == 0); - * \endcode - * - * \note The AND-assignment executes as as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator&=(const T &rhs); - - /*! Bitwise OR assignment operator OR-assigns the object referenced by this - * \p device_reference and returns this \p device_reference. - * - * \param rhs The right hand side of the OR-assignment. - * \return *this. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's OR assignment operator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(1,0); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 0 - * assert(ref == 0); - * - * // the object pointed to by ptr equals 0 - * assert(*ptr == 0); - * - * // v[0] equals 0 - * assert(v[0] == 0); - * - * // right OR-assign ref - * ref |= 1; - * - * // ref equals 1 - * assert(ref == 1); - * - * // the object pointed to by ptr equals 1 - * assert(*ptr == 1); - * - * // v[0] equals 1 - * assert(v[0] == 1); - * \endcode - * - * \note The OR-assignment executes as as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator|=(const T &rhs); - - /*! Bitwise XOR assignment operator XOR-assigns the object referenced by this - * \p device_reference and returns this \p device_reference. - * - * \param rhs The right hand side of the XOR-assignment. - * \return *this. - * - * The following code snippet demonstrates the semantics of - * \p device_reference's XOR assignment operator. - * - * \code - * #include - * #include - * ... 
- * thrust::device_vector v(1,1); - * thrust::device_ptr ptr = &v[0]; - * thrust::device_reference ref(ptr); - * - * // ref equals 1 - * assert(ref == 1); - * - * // the object pointed to by ptr equals 1 - * assert(*ptr == 1); - * - * // v[0] equals 1 - * assert(v[0] == 1); - * - * // right XOR-assign ref - * ref ^= 1; - * - * // ref equals 0 - * assert(ref == 0); - * - * // the object pointed to by ptr equals 0 - * assert(*ptr == 0); - * - * // v[0] equals 0 - * assert(v[0] == 0); - * \endcode - * - * \note The XOR-assignment executes as as if it were executed on the host. - * This may change in a later version. - */ - device_reference &operator^=(const T &rhs); -#endif // end doxygen-only members -}; // end device_reference - -/*! swaps the value of one \p device_reference with another. - * \p x The first \p device_reference of interest. - * \p y The second \p device_reference of interest. - */ -template -__host__ __device__ -void swap(device_reference &x, device_reference &y); - -/*! \} - */ - -} // end thrust - -#include - diff --git a/compat/thrust/device_vector.h b/compat/thrust/device_vector.h deleted file mode 100644 index 8c9d0051a2..0000000000 --- a/compat/thrust/device_vector.h +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file device_vector.h - * \brief A dynamically-sizable array of elements which reside in the "device" memory space - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -// forward declaration of host_vector -template class host_vector; - -/*! \addtogroup container_classes Container Classes - * \addtogroup device_containers Device Containers - * \ingroup container_classes - * \{ - */ - -/*! A \p device_vector is a container that supports random access to elements, - * constant time removal of elements at the end, and linear time insertion - * and removal of elements at the beginning or in the middle. The number of - * elements in a \p device_vector may vary dynamically; memory management is - * automatic. The memory associated with a \p device_vector resides in the memory - * space of a parallel device. - * - * \see http://www.sgi.com/tech/stl/Vector.html - * \see host_vector - */ -template > - class device_vector - : public detail::vector_base -{ - private: - typedef detail::vector_base Parent; - - public: - /*! \cond */ - typedef typename Parent::size_type size_type; - typedef typename Parent::value_type value_type; - /*! \endcond */ - - /*! This constructor creates an empty \p device_vector. - */ - __host__ - device_vector(void) - :Parent() {} - - /*! This constructor creates a \p device_vector with the given - * size. - * \param n The number of elements to initially craete. - */ - __host__ - explicit device_vector(size_type n) - :Parent(n) {} - - /*! This constructor creates a \p device_vector with copies - * of an exemplar element. - * \param n The number of elements to initially create. 
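device_reference, whose deletion ends above, behaves like the referenced value in host code: it converts to the value type, streams, assigns through to device memory, and its address is a device_ptr. A short illustrative sketch of that documented behaviour (not part of the diff, and relying on the toolkit's Thrust headers):

// Illustrative only.
#include <thrust/device_vector.h>
#include <iostream>

int main()
{
    thrust::device_vector<int> v(1, 13);
    thrust::device_reference<int> ref = v[0];  // operator[] yields a device_reference<int>

    int x = ref + 1;                           // host-side arithmetic copies the value back (x == 14)
    std::cout << ref << " " << x << std::endl; // prints "13 14"

    ref = 42;                                  // writes through to device memory
    thrust::device_ptr<int> p = &ref;          // address-of yields a device_ptr, not a raw pointer
    std::cout << *p << std::endl;              // prints "42"
    return 0;
}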
- * \param value An element to copy. - */ - __host__ - explicit device_vector(size_type n, const value_type &value) - :Parent(n,value) {} - - /*! Copy constructor copies from an exemplar \p device_vector. - * \param v The \p device_vector to copy. - */ - __host__ - device_vector(const device_vector &v) - :Parent(v) {} - - /*! Copy constructor copies from an exemplar \p device_vector with different type. - * \param v The \p device_vector to copy. - */ - template - __device__ - device_vector(const device_vector &v) - :Parent(v) {} - - /*! Assign operator copies from an exemplar \p device_vector with different type. - * \param v The \p device_vector to copy. - */ - template - __device__ - device_vector &operator=(const device_vector &v) - { Parent::operator=(v); return *this; } - - /*! Copy constructor copies from an exemplar \c std::vector. - * \param v The std::vector to copy. - */ - template - __host__ - device_vector(const std::vector &v) - :Parent(v) {} - - /*! Assign operator copies from an exemplar std::vector. - * \param v The std::vector to copy. - */ - template - __host__ - device_vector &operator=(const std::vector &v) - { Parent::operator=(v); return *this;} - - /*! Copy constructor copies from an exemplar \p host_vector with possibly different type. - * \param v The \p host_vector to copy. - */ - template - __host__ - device_vector(const host_vector &v); - - /*! Assign operator copies from an examplar \p host_vector. - * \param v The \p host_vector to copy. - */ - template - __host__ - device_vector &operator=(const host_vector &v) - { Parent::operator=(v); return *this; } - - /*! This constructor builds a \p device_vector from a range. - * \param first The beginning of the range. - * \param last The end of the range. - */ - template - __host__ - device_vector(InputIterator first, InputIterator last) - :Parent(first,last) {} - -// declare these members for the purpose of Doxygenating them -// they actually exist in a derived-from class -#if 0 - /*! \brief Resizes this vector to the specified number of elements. - * \param new_size Number of elements this vector should contain. - * \param x Data with which new elements should be populated. - * \throw std::length_error If n exceeds max_size(). - * - * This method will resize this vector to the specified number of - * elements. If the number is smaller than this vector's current - * size this vector is truncated, otherwise this vector is - * extended and new elements are populated with given data. - */ - void resize(size_type new_size, const value_type &x = value_type()); - - /*! Returns the number of elements in this vector. - */ - size_type size(void) const; - - /*! Returns the size() of the largest possible vector. - * \return The largest possible return value of size(). - */ - size_type max_size(void) const; - - /*! \brief If n is less than or equal to capacity(), this call has no effect. - * Otherwise, this method is a request for allocation of additional memory. If - * the request is successful, then capacity() is greater than or equal to - * n; otherwise, capacity() is unchanged. In either case, size() is unchanged. - * \throw std::length_error If n exceeds max_size(). - */ - void reserve(size_type n); - - /*! Returns the number of elements which have been reserved in this - * vector. - */ - size_type capacity(void) const; - - /*! This method shrinks the capacity of this vector to exactly - * fit its elements. - */ - void shrink_to_fit(void); - - /*! \brief Subscript access to the data contained in this vector_dev. 
- * \param n The index of the element for which data should be accessed. - * \return Read/write reference to data. - * - * This operator allows for easy, array-style, data access. - * Note that data access with this operator is unchecked and - * out_of_range lookups are not defined. - */ - reference operator[](size_type n); - - /*! \brief Subscript read access to the data contained in this vector_dev. - * \param n The index of the element for which data should be accessed. - * \return Read reference to data. - * - * This operator allows for easy, array-style, data access. - * Note that data access with this operator is unchecked and - * out_of_range lookups are not defined. - */ - const_reference operator[](size_type n) const; - - /*! This method returns an iterator pointing to the beginning of - * this vector. - * \return mStart - */ - iterator begin(void); - - /*! This method returns a const_iterator pointing to the beginning - * of this vector. - * \return mStart - */ - const_iterator begin(void) const; - - /*! This method returns a const_iterator pointing to the beginning - * of this vector. - * \return mStart - */ - const_iterator cbegin(void) const; - - /*! This method returns a reverse_iterator pointing to the beginning of - * this vector's reversed sequence. - * \return A reverse_iterator pointing to the beginning of this - * vector's reversed sequence. - */ - reverse_iterator rbegin(void); - - /*! This method returns a const_reverse_iterator pointing to the beginning of - * this vector's reversed sequence. - * \return A const_reverse_iterator pointing to the beginning of this - * vector's reversed sequence. - */ - const_reverse_iterator rbegin(void) const; - - /*! This method returns a const_reverse_iterator pointing to the beginning of - * this vector's reversed sequence. - * \return A const_reverse_iterator pointing to the beginning of this - * vector's reversed sequence. - */ - const_reverse_iterator crbegin(void) const; - - /*! This method returns an iterator pointing to one element past the - * last of this vector. - * \return begin() + size(). - */ - iterator end(void); - - /*! This method returns a const_iterator pointing to one element past the - * last of this vector. - * \return begin() + size(). - */ - const_iterator end(void) const; - - /*! This method returns a const_iterator pointing to one element past the - * last of this vector. - * \return begin() + size(). - */ - const_iterator cend(void) const; - - /*! This method returns a reverse_iterator pointing to one element past the - * last of this vector's reversed sequence. - * \return rbegin() + size(). - */ - reverse_iterator rend(void); - - /*! This method returns a const_reverse_iterator pointing to one element past the - * last of this vector's reversed sequence. - * \return rbegin() + size(). - */ - const_reverse_iterator rend(void) const; - - /*! This method returns a const_reverse_iterator pointing to one element past the - * last of this vector's reversed sequence. - * \return rbegin() + size(). - */ - const_reverse_iterator crend(void) const; - - /*! This method returns a const_reference referring to the first element of this - * vector. - * \return The first element of this vector. - */ - const_reference front(void) const; - - /*! This method returns a reference pointing to the first element of this - * vector. - * \return The first element of this vector. - */ - reference front(void); - - /*! This method returns a const reference pointing to the last element of - * this vector. 
- * \return The last element of this vector. - */ - const_reference back(void) const; - - /*! This method returns a reference referring to the last element of - * this vector_dev. - * \return The last element of this vector. - */ - reference back(void); - - /*! This method returns a pointer to this vector's first element. - * \return A pointer to the first element of this vector. - */ - pointer data(void); - - /*! This method returns a const_pointer to this vector's first element. - * \return a const_pointer to the first element of this vector. - */ - const_pointer data(void) const; - - /*! This method resizes this vector to 0. - */ - void clear(void); - - /*! This method returns true iff size() == 0. - * \return true if size() == 0; false, otherwise. - */ - bool empty(void) const; - - /*! This method appends the given element to the end of this vector. - * \param x The element to append. - */ - void push_back(const value_type &x); - - /*! This method erases the last element of this vector, invalidating - * all iterators and references to it. - */ - void pop_back(void); - - /*! This method swaps the contents of this vector_base with another vector. - * \param v The vector with which to swap. - */ - void swap(device_vector &v); - - /*! This method removes the element at position pos. - * \param pos The position of the element of interest. - * \return An iterator pointing to the new location of the element that followed the element - * at position pos. - */ - iterator erase(iterator pos); - - /*! This method removes the range of elements [first,last) from this vector. - * \param first The beginning of the range of elements to remove. - * \param last The end of the range of elements to remove. - * \return An iterator pointing to the new location of the element that followed the last - * element in the sequence [first,last). - */ - iterator erase(iterator first, iterator last); - - /*! This method inserts a single copy of a given exemplar value at the - * specified position in this vector. - * \param position The insertion position. - * \param x The exemplar element to copy & insert. - * \return An iterator pointing to the newly inserted element. - */ - iterator insert(iterator position, const T &x); - - /*! This method inserts a copy of an exemplar value to a range at the - * specified position in this vector. - * \param position The insertion position - * \param n The number of insertions to perform. - * \param x The value to replicate and insert. - */ - void insert(iterator position, size_type n, const T &x); - - /*! This method inserts a copy of an input range at the specified position - * in this vector. - * \param position The insertion position. - * \param first The beginning of the range to copy. - * \param last The end of the range to copy. - * - * \tparam InputIterator is a model of Assignable. - */ - template - void insert(iterator position, InputIterator first, InputIterator last); - - /*! This version of \p assign replicates a given exemplar - * \p n times into this vector. - * \param n The number of times to copy \p x. - * \param x The exemplar element to replicate. - */ - void assign(size_type n, const T &x); - - /*! This version of \p assign makes this vector a copy of a given input range. - * \param first The beginning of the range to copy. - * \param last The end of the range to copy. - * - * \tparam InputIterator is a model of Input Iterator. - */ - template - void assign(InputIterator first, InputIterator last); - - /*! This method returns a copy of this vector's allocator. 
- * \return A copy of the alloctor used by this vector. - */ - allocator_type get_allocator(void) const; -#endif // end doxygen-only members -}; // end device_vector - -/*! \} - */ - -} // end thrust - -#include - - diff --git a/compat/thrust/distance.h b/compat/thrust/distance.h deleted file mode 100644 index 67b41946bf..0000000000 --- a/compat/thrust/distance.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file distance.h - * \brief Computes the size of a range - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup iterators - * \{ - */ - -/*! \p distance finds the distance between \p first and \p last, i.e. the - * number of times that \p first must be incremented until it is equal to - * \p last. - * - * \param first The beginning of an input range of interest. - * \param last The end of an input range of interest. - * \return The distance between the beginning and end of the input range. - * - * \tparam InputIterator is a model of Input Iterator. - * - * \pre If \c InputIterator meets the requirements of random access iterator, \p last shall be reachable from \p first or - * \p first shall be reachable from \p last; otherwise, \p last shall be reachable from \p first. - * - * The following code snippet demonstrates how to use \p distance to compute - * the distance to one iterator from another. - * - * \code - * #include - * #include - * ... - * thrust::device_vector vec(13); - * thrust::device_vector::iterator iter1 = vec.begin(); - * thrust::device_vector::iterator iter2 = iter1 + 7; - * - * int d = thrust::distance(iter1, iter2); - * - * // d is 7 - * \endcode - * - * \see http://www.sgi.com/tech/stl/distance.html - */ -template - inline typename thrust::iterator_traits::difference_type - distance(InputIterator first, InputIterator last); - -/*! \} // end iterators - */ - -} // end thrust - -#include - diff --git a/compat/thrust/equal.h b/compat/thrust/equal.h deleted file mode 100644 index e96946fcf7..0000000000 --- a/compat/thrust/equal.h +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file equal.h - * \brief Equality between ranges - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup reductions - * \{ - * \addtogroup comparisons - * \ingroup reductions - * \{ - */ - - -/*! 
\p equal returns \c true if the two ranges [first1, last1) - * and [first2, first2 + (last1 - first1)) are identical when - * compared element-by-element, and otherwise returns \c false. - * - * This version of \p equal returns \c true if and only if for every - * iterator \c i in [first1, last1), *i == *(first2 + (i - first1)). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \return \c true, if the sequences are equal; \c false, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * and \p InputIterator1's \c value_type is a model of Equality Comparable, - * and \p InputIterator1's \c value_type can be compared for equality with \c InputIterator2's \c value_type. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is a model of Equality Comparable, - * and \p InputIterator2's \c value_type can be compared for equality with \c InputIterator1's \c value_type. - * - * The following code snippet demonstrates how to use \p equal to test - * two ranges for equality using the \p thrust::host execution policy: - * - * \code - * #include - * #include - * ... - * int A1[7] = {3, 1, 4, 1, 5, 9, 3}; - * int A2[7] = {3, 1, 4, 2, 8, 5, 7}; - * ... - * bool result = thrust::equal(thrust::host, A1, A1 + 7, A2); - * - * // result == false - * \endcode - * - * \see http://www.sgi.com/tech/stl/equal.html - */ -template -bool equal(const thrust::detail::execution_policy_base &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2); - - -/*! \p equal returns \c true if the two ranges [first1, last1) - * and [first2, first2 + (last1 - first1)) are identical when - * compared element-by-element, and otherwise returns \c false. - * - * This version of \p equal returns \c true if and only if for every - * iterator \c i in [first1, last1), *i == *(first2 + (i - first1)). - * - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \return \c true, if the sequences are equal; \c false, otherwise. - * - * \tparam InputIterator1 is a model of Input Iterator, - * and \p InputIterator1's \c value_type is a model of Equality Comparable, - * and \p InputIterator1's \c value_type can be compared for equality with \c InputIterator2's \c value_type. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is a model of Equality Comparable, - * and \p InputIterator2's \c value_type can be compared for equality with \c InputIterator1's \c value_type. - * - * The following code snippet demonstrates how to use \p equal to test - * two ranges for equality. - * - * \code - * #include - * ... - * int A1[7] = {3, 1, 4, 1, 5, 9, 3}; - * int A2[7] = {3, 1, 4, 2, 8, 5, 7}; - * ... - * bool result = thrust::equal(A1, A1 + 7, A2); - * - * // result == false - * \endcode - * - * \see http://www.sgi.com/tech/stl/equal.html - */ -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2); - - -/*! 
\p equal returns \c true if the two ranges [first1, last1) - * and [first2, first2 + (last1 - first1)) are identical when - * compared element-by-element, and otherwise returns \c false. - * - * This version of \p equal returns \c true if and only if for every - * iterator \c i in [first1, last1), - * binary_pred(*i, *(first2 + (i - first1))) is \c true. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \param binary_pred Binary predicate used to test element equality. - * \return \c true, if the sequences are equal; \c false, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * and \p InputIterator1's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is convertible to \p BinaryPredicate's \c second_argument_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * The following code snippet demonstrates how to use \p equal to compare the - * elements in two ranges modulo 2 using the \p thrust::host execution policy. - * - * \code - * #include - * #include - * ... - * - * __host__ __device__ - * struct compare_modulo_two - * { - * bool operator()(int x, int y) - * { - * return (x % 2) == (y % 2); - * } - * }; - * ... - * int x[5] = {0, 2, 4, 6, 8, 10}; - * int y[5] = {1, 3, 5, 7, 9, 11}; - * - * bool result = thrust::equal(x, x + 5, y, compare_modulo_two()); - * - * // result is true - * \endcode - * - * \see http://www.sgi.com/tech/stl/equal.html - */ -template -bool equal(const thrust::detail::execution_policy_base &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred); - - -/*! \p equal returns \c true if the two ranges [first1, last1) - * and [first2, first2 + (last1 - first1)) are identical when - * compared element-by-element, and otherwise returns \c false. - * - * This version of \p equal returns \c true if and only if for every - * iterator \c i in [first1, last1), - * binary_pred(*i, *(first2 + (i - first1))) is \c true. - * - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \param binary_pred Binary predicate used to test element equality. - * \return \c true, if the sequences are equal; \c false, otherwise. - * - * \tparam InputIterator1 is a model of Input Iterator, - * and \p InputIterator1's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is convertible to \p BinaryPredicate's \c second_argument_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * The following code snippet demonstrates how to use \p equal to compare the - * elements in two ranges modulo 2. - * - * \code - * #include - * - * __host__ __device__ - * struct compare_modulo_two - * { - * bool operator()(int x, int y) - * { - * return (x % 2) == (y % 2); - * } - * }; - * ... 
- * int x[5] = {0, 2, 4, 6, 8, 10}; - * int y[5] = {1, 3, 5, 7, 9, 11}; - * - * bool result = thrust::equal(x, x + 5, y, compare_modulo_two()); - * - * // result is true - * \endcode - * - * \see http://www.sgi.com/tech/stl/equal.html - */ -template -bool equal(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, BinaryPredicate binary_pred); - - -/*! \} // end comparisons - * \} // end reductions - */ - -} // end namespace thrust - -#include - diff --git a/compat/thrust/execution_policy.h b/compat/thrust/execution_policy.h deleted file mode 100644 index a5b61e95b4..0000000000 --- a/compat/thrust/execution_policy.h +++ /dev/null @@ -1,351 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/execution_policy.h - * \brief Thrust execution policies. - */ - -#pragma once - -#include - -// get the definition of thrust::execution_policy -#include - -// #include the host system's execution_policy header -#define __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER <__THRUST_HOST_SYSTEM_ROOT/execution_policy.h> -#include __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER -#undef __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER - -// #include the device system's execution_policy.h header -#define __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/execution_policy.h> -#include __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER -#undef __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER - -namespace thrust -{ - - -/*! \cond - */ - - -namespace detail -{ - - -typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::detail::par_t host_t; - - -typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::detail::par_t device_t; - - -} // end detail - - -/*! \endcond - */ - - -/*! \addtogroup execution_policies Parallel Execution Policies - * \{ - */ - - -// define execution_policy for the purpose of Doxygenating it -// it is actually defined elsewhere -#if 0 -/*! \p execution_policy is the base class for all Thrust parallel execution policies - * like \p thrust::host, \p thrust::device, and each backend system's tag type. - * - * Custom user-defined backends should derive a policy from this type in order to - * interoperate with Thrust algorithm dispatch. - * - * The following code snippet demonstrates how to derive a standalone custom execution policy - * from \p thrust::execution_policy to implement a backend which only implements \p for_each: - * - * \code - * #include - * #include - * - * // define a type derived from thrust::execution_policy to distinguish our custom execution policy: - * struct my_policy : thrust::execution_policy {}; - * - * // overload for_each on my_policy - * template - * Iterator for_each(my_policy, Iterator first, Iterator last, Function f) - * { - * std::cout << "Hello, world from for_each(my_policy)!" 
<< std::endl; - * - * for(; first < last; ++first) - * { - * f(*first); - * } - * - * return first; - * } - * - * struct ignore_argument - * { - * void operator()(int) {} - * }; - * - * int main() - * { - * int data[4]; - * - * // dispatch thrust::for_each using our custom policy: - * my_policy exec; - * thrust::for_each(exec, data, data + 4, ignore_argument()); - * - * // can't dispatch thrust::transform because no overload exists for my_policy: - * //thrust::transform(exec, data, data, + 4, data, thrust::identity()); // error! - * - * return 0; - * } - * \endcode - * - * \see host_execution_policy - * \see device_execution_policy - */ -template -struct execution_policy : thrust::detail::execution_policy_base -{}; -#endif - - -/*! \p host_execution_policy is the base class for all Thrust parallel execution policies - * which are derived from Thrust's default host backend system configured with the \p THRUST_HOST_SYSTEM - * macro. - * - * Custom user-defined backends which wish to inherit the functionality of Thrust's host backend system - * should derive a policy from this type in order to interoperate with Thrust algorithm dispatch. - * - * The following code snippet demonstrates how to derive a standalone custom execution policy from - * \p thrust::host_execution_policy to implement a backend which specializes \p for_each while inheriting - * the behavior of every other algorithm from the host system: - * - * \code - * #include - * #include - * - * // define a type derived from thrust::host_execution_policy to distinguish our custom execution policy: - * struct my_policy : thrust::host_execution_policy {}; - * - * // overload for_each on my_policy - * template - * Iterator for_each(my_policy, Iterator first, Iterator last, Function f) - * { - * std::cout << "Hello, world from for_each(my_policy)!" << std::endl; - * - * for(; first < last; ++first) - * { - * f(*first); - * } - * - * return first; - * } - * - * struct ignore_argument - * { - * void operator()(int) {} - * }; - * - * int main() - * { - * int data[4]; - * - * // dispatch thrust::for_each using our custom policy: - * my_policy exec; - * thrust::for_each(exec, data, data + 4, ignore_argument()); - * - * // dispatch thrust::transform whose behavior our policy inherits - * thrust::transform(exec, data, data, + 4, data, thrust::identity()); - * - * return 0; - * } - * \endcode - * - * \see execution_policy - * \see device_execution_policy - */ -template - struct host_execution_policy - : thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::execution_policy -{}; - - -/*! \p device_execution_policy is the base class for all Thrust parallel execution policies - * which are derived from Thrust's default device backend system configured with the \p THRUST_DEVICE_SYSTEM - * macro. - * - * Custom user-defined backends which wish to inherit the functionality of Thrust's device backend system - * should derive a policy from this type in order to interoperate with Thrust algorithm dispatch. 
- * - * The following code snippet demonstrates how to derive a standalone custom execution policy from - * \p thrust::device_execution_policy to implement a backend which specializes \p for_each while inheriting - * the behavior of every other algorithm from the device system: - * - * \code - * #include - * #include - * - * // define a type derived from thrust::device_execution_policy to distinguish our custom execution policy: - * struct my_policy : thrust::device_execution_policy {}; - * - * // overload for_each on my_policy - * template - * Iterator for_each(my_policy, Iterator first, Iterator last, Function f) - * { - * std::cout << "Hello, world from for_each(my_policy)!" << std::endl; - * - * for(; first < last; ++first) - * { - * f(*first); - * } - * - * return first; - * } - * - * struct ignore_argument - * { - * void operator()(int) {} - * }; - * - * int main() - * { - * int data[4]; - * - * // dispatch thrust::for_each using our custom policy: - * my_policy exec; - * thrust::for_each(exec, data, data + 4, ignore_argument()); - * - * // dispatch thrust::transform whose behavior our policy inherits - * thrust::transform(exec, data, data, + 4, data, thrust::identity()); - * - * return 0; - * } - * \endcode - * - * \see execution_policy - * \see host_execution_policy - */ -template - struct device_execution_policy - : thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::execution_policy -{}; - - -/*! \p thrust::host is the default parallel execution policy associated with Thrust's host backend system - * configured by the \p THRUST_HOST_SYSTEM macro. - * - * Instead of relying on implicit algorithm dispatch through iterator system tags, users may directly target - * algorithm dispatch at Thrust's host system by providing \p thrust::host as an algorithm parameter. - * - * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such as - * \p thrust::host_vector. - * - * Note that even though \p thrust::host targets the host CPU, it is a parallel execution policy. That is, - * the order that an algorithm invokes functors or dereferences iterators is not defined. - * - * The type of \p thrust::host is implementation-defined. - * - * The following code snippet demonstrates how to use \p thrust::host to explicitly dispatch an invocation - * of \p thrust::for_each to the host backend system: - * - * \code - * #include - * #include - * #include - * - * struct printf_functor - * { - * __host__ __device__ - * void operator()(int x) - * { - * printf("%d\n"); - * } - * }; - * ... - * int vec[3]; - * vec[0] = 0; vec[1] = 1; vec[2] = 2; - * - * thrust::for_each(thrust::host, vec.begin(), vec.end(), printf_functor()); - * - * // 0 1 2 is printed to standard output in some unspecified order - * \endcode - * - * \see host_execution_policy - * \see thrust::device - */ -static const detail::host_t host; - - -/*! \p thrust::device is the default parallel execution policy associated with Thrust's device backend system - * configured by the \p THRUST_DEVICE_SYSTEM macro. - * - * Instead of relying on implicit algorithm dispatch through iterator system tags, users may directly target - * algorithm dispatch at Thrust's device system by providing \p thrust::device as an algorithm parameter. - * - * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such as - * \p thrust::device_vector or to avoid wrapping e.g. raw pointers allocated by the CUDA API with types - * such as \p thrust::device_ptr. 
- * - * The user must take care to guarantee that the iterators provided to an algorithm are compatible with - * the device backend system. For example, raw pointers allocated by std::malloc typically - * cannot be dereferenced by a GPU. For this reason, raw pointers allocated by host APIs should not be mixed - * with a \p thrust::device algorithm invocation when the device backend is CUDA. - * - * The type of \p thrust::device is implementation-defined. - * - * The following code snippet demonstrates how to use \p thrust::device to explicitly dispatch an invocation - * of \p thrust::for_each to the device backend system: - * - * \code - * #include - * #include - * #include - * #include - * - * struct printf_functor - * { - * __host__ __device__ - * void operator()(int x) - * { - * printf("%d\n"); - * } - * }; - * ... - * thrust::device_vector d_vec[3]; - * d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2; - * - * thrust::for_each(thrust::device, vec.begin(), vec.end(), printf_functor()); - * - * // 0 1 2 is printed to standard output in some unspecified order - * \endcode - * - * \see host_execution_policy - * \see thrust::device - */ -static const detail::device_t device; - - -/*! \} - */ - - -} // end thrust - diff --git a/compat/thrust/extrema.h b/compat/thrust/extrema.h deleted file mode 100644 index 335bcd1e6b..0000000000 --- a/compat/thrust/extrema.h +++ /dev/null @@ -1,798 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file extrema.h - * \brief Functions for computing computing extremal values - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - - -/*! This version of \p min returns the smaller of two values, given a comparison operation. - * \param lhs The first value to compare. - * \param rhs The second value to compare. - * \param comp A comparison operation. - * \return The smaller element. - * - * \tparam T is convertible to \p BinaryPredicate's first argument type and to its second argument type. - * \tparam BinaryPredicate is a model of BinaryPredicate. - * - * The following code snippet demonstrates how to use \p min to compute the smaller of two - * key-value objects. - * - * \code - * #include - * ... - * struct key_value - * { - * int key; - * int value; - * }; - * - * struct compare_key_value - * { - * __host__ __device__ - * bool operator()(key_value lhs, key_value rhs) - * { - * return lhs.key < rhs.key; - * } - * }; - * - * ... - * key_value a = {13, 0}; - * key_value b = { 7, 1); - * - * key_value smaller = thrust::min(a, b, compare_key_value()); - * - * // smaller is {7, 1} - * \endcode - * - * \note Returns the first argument when the arguments are equivalent. - * \see max - */ -template -__host__ __device__ - T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp); - - -/*! This version of \p min returns the smaller of two values. - * \param lhs The first value to compare. - * \param rhs The second value to compare. 
- * \return The smaller element. - * - * \tparam T is a model of LessThan Comparable. - * - * The following code snippet demonstrates how to use \p min to compute the smaller of two - * integers. - * - * \code - * #include - * ... - * int a = 13; - * int b = 7; - * - * int smaller = thrust::min(a, b); - * - * // smaller is 7 - * \endcode - * - * \note Returns the first argument when the arguments are equivalent. - * \see max - */ -template -__host__ __device__ - T min THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs); - - -/*! This version of \p max returns the larger of two values, given a comparison operation. - * \param lhs The first value to compare. - * \param rhs The second value to compare. - * \param comp A comparison operation. - * \return The larger element. - * - * \tparam T is convertible to \p BinaryPredicate's first argument type and to its second argument type. - * \tparam BinaryPredicate is a model of BinaryPredicate. - * - * The following code snippet demonstrates how to use \p max to compute the larger of two - * key-value objects. - * - * \code - * #include - * ... - * struct key_value - * { - * int key; - * int value; - * }; - * - * struct compare_key_value - * { - * __host__ __device__ - * bool operator()(key_value lhs, key_value rhs) - * { - * return lhs.key < rhs.key; - * } - * }; - * - * ... - * key_value a = {13, 0}; - * key_value b = { 7, 1); - * - * key_value larger = thrust::max(a, b, compare_key_value()); - * - * // larger is {13, 0} - * \endcode - * - * \note Returns the first argument when the arguments are equivalent. - * \see min - */ -template -__host__ __device__ - T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs, BinaryPredicate comp); - - -/*! This version of \p max returns the larger of two values. - * \param lhs The first value to compare. - * \param rhs The second value to compare. - * \return The larger element. - * - * \tparam T is a model of LessThan Comparable. - * - * The following code snippet demonstrates how to use \p max to compute the larger of two - * integers. - * - * \code - * #include - * ... - * int a = 13; - * int b = 7; - * - * int larger = thrust::min(a, b); - * - * // larger is 13 - * \endcode - * - * \note Returns the first argument when the arguments are equivalent. - * \see min - */ -template -__host__ __device__ - T max THRUST_PREVENT_MACRO_SUBSTITUTION (const T &lhs, const T &rhs); - - -/*! \addtogroup reductions - * \{ - * \addtogroup extrema - * \ingroup reductions - * \{ - */ - -/*! \p min_element finds the smallest element in the range [first, last). - * It returns the first iterator \c i in [first, last) - * such that no other iterator in [first, last) points to a value smaller - * than \c *i. The return value is \p last if and only if [first, last) is an - * empty range. - * - * The two versions of \p min_element differ in how they define whether one element is - * less than another. This version compares objects using \c operator<. Specifically, - * this version of \p min_element returns the first iterator \c i in [first, last) - * such that, for every iterator \c j in [first, last), *j < *i is - * \c false. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \return An iterator pointing to the smallest element of the range [first, last), - * if it is not an empty range; \p last, otherwise. 
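A short sketch of the comparator overloads of thrust::min and thrust::max described above, reusing the key_value / compare_key_value shapes from the documentation; host-only code, no device allocation assumed.

    #include <thrust/extrema.h>

    struct key_value
    {
      int key;
      int value;
    };

    struct compare_key_value
    {
      __host__ __device__
      bool operator()(key_value lhs, key_value rhs) const
      {
        return lhs.key < rhs.key;
      }
    };

    int main()
    {
      key_value a = {13, 0};
      key_value b = { 7, 1};

      key_value smaller = thrust::min(a, b, compare_key_value());  // {7, 1}
      key_value larger  = thrust::max(a, b, compare_key_value());  // {13, 0}
      (void)smaller; (void)larger;
      return 0;
    }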
- * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \c ForwardIterator's \c value_type is a model of - * LessThan Comparable. - * - * \code - * #include - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * int *result = thrust::min_element(thrust::host, data, data + 6); - * - * // result is data + 1 - * // *result is 0 - * \endcode - * - * \see http://www.sgi.com/tech/stl/min_element.html - */ -template -ForwardIterator min_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last); - - -/*! \p min_element finds the smallest element in the range [first, last). - * It returns the first iterator \c i in [first, last) - * such that no other iterator in [first, last) points to a value smaller - * than \c *i. The return value is \p last if and only if [first, last) is an - * empty range. - * - * The two versions of \p min_element differ in how they define whether one element is - * less than another. This version compares objects using \c operator<. Specifically, - * this version of \p min_element returns the first iterator \c i in [first, last) - * such that, for every iterator \c j in [first, last), *j < *i is - * \c false. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \return An iterator pointing to the smallest element of the range [first, last), - * if it is not an empty range; \p last, otherwise. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \c ForwardIterator's \c value_type is a model of - * LessThan Comparable. - * - * \code - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * int *result = thrust::min_element(data, data + 6); - * - * // result is data + 1 - * // *result is 0 - * \endcode - * - * \see http://www.sgi.com/tech/stl/min_element.html - */ -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last); - - -/*! \p min_element finds the smallest element in the range [first, last). - * It returns the first iterator \c i in [first, last) - * such that no other iterator in [first, last) points to a value smaller - * than \c *i. The return value is \p last if and only if [first, last) is an - * empty range. - * - * The two versions of \p min_element differ in how they define whether one element is - * less than another. This version compares objects using a function object \p comp. - * Specifically, this version of \p min_element returns the first iterator \c i in [first, last) - * such that, for every iterator \c j in [first, last), comp(*j, *i) is - * \c false. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp A binary predicate used for comparison. - * \return An iterator pointing to the smallest element of the range [first, last), - * if it is not an empty range; \p last, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to both \p comp's - * \c first_argument_type and \c second_argument_type. - * \tparam BinaryPredicate is a model of Binary Predicate. 
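A compact host-side sketch of thrust::min_element with operator<, using the same data as the documented example and explicit dispatch to the host backend.

    #include <thrust/extrema.h>
    #include <thrust/execution_policy.h>
    #include <cassert>

    int main()
    {
      int data[6] = {1, 0, 2, 2, 1, 3};

      // operator< version, explicitly dispatched to the host system
      int *smallest = thrust::min_element(thrust::host, data, data + 6);

      assert(smallest == data + 1);
      assert(*smallest == 0);
      return 0;
    }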
- * - * The following code snippet demonstrates how to use \p min_element to find the smallest element - * of a collection of key-value pairs using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * - * struct key_value - * { - * int key; - * int value; - * }; - * - * struct compare_key_value - * { - * __host__ __device__ - * bool operator()(key_value lhs, key_value rhs) - * { - * return lhs.key < rhs.key; - * } - * }; - * - * ... - * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; - * - * key_value *smallest = thrust::min_element(thrust::host, data, data + 4, compare_key_value()); - * - * // smallest == data + 1 - * // *smallest == {0,7} - * \endcode - * - * \see http://www.sgi.com/tech/stl/min_element.html - */ -template -ForwardIterator min_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp); - - -/*! \p min_element finds the smallest element in the range [first, last). - * It returns the first iterator \c i in [first, last) - * such that no other iterator in [first, last) points to a value smaller - * than \c *i. The return value is \p last if and only if [first, last) is an - * empty range. - * - * The two versions of \p min_element differ in how they define whether one element is - * less than another. This version compares objects using a function object \p comp. - * Specifically, this version of \p min_element returns the first iterator \c i in [first, last) - * such that, for every iterator \c j in [first, last), comp(*j, *i) is - * \c false. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp A binary predicate used for comparison. - * \return An iterator pointing to the smallest element of the range [first, last), - * if it is not an empty range; \p last, otherwise. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to both \p comp's - * \c first_argument_type and \c second_argument_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * The following code snippet demonstrates how to use \p min_element to find the smallest element - * of a collection of key-value pairs. - * - * \code - * #include - * - * struct key_value - * { - * int key; - * int value; - * }; - * - * struct compare_key_value - * { - * __host__ __device__ - * bool operator()(key_value lhs, key_value rhs) - * { - * return lhs.key < rhs.key; - * } - * }; - * - * ... - * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; - * - * key_value *smallest = thrust::min_element(data, data + 4, compare_key_value()); - * - * // smallest == data + 1 - * // *smallest == {0,7} - * \endcode - * - * \see http://www.sgi.com/tech/stl/min_element.html - */ -template -ForwardIterator min_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp); - - -/*! \p max_element finds the largest element in the range [first, last). - * It returns the first iterator \c i in [first, last) - * such that no other iterator in [first, last) points to a value larger - * than \c *i. The return value is \p last if and only if [first, last) is an - * empty range. - * - * The two versions of \p max_element differ in how they define whether one element is - * greater than another. This version compares objects using \c operator<. 
Specifically, - * this version of \p max_element returns the first iterator \c i in [first, last) - * such that, for every iterator \c j in [first, last), *i < *j is - * \c false. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \return An iterator pointing to the largest element of the range [first, last), - * if it is not an empty range; \p last, otherwise. - * - * \tparam A Thrust backend system. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \c ForwardIterator's \c value_type is a model of - * LessThan Comparable. - * - * \code - * #include - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * int *result = thrust::max_element(thrust::host, data, data + 6); - * - * // *result == 3 - * \endcode - * - * \see http://www.sgi.com/tech/stl/max_element.html - */ -template -ForwardIterator max_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last); - - -/*! \p max_element finds the largest element in the range [first, last). - * It returns the first iterator \c i in [first, last) - * such that no other iterator in [first, last) points to a value larger - * than \c *i. The return value is \p last if and only if [first, last) is an - * empty range. - * - * The two versions of \p max_element differ in how they define whether one element is - * greater than another. This version compares objects using \c operator<. Specifically, - * this version of \p max_element returns the first iterator \c i in [first, last) - * such that, for every iterator \c j in [first, last), *i < *j is - * \c false. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \return An iterator pointing to the largest element of the range [first, last), - * if it is not an empty range; \p last, otherwise. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \c ForwardIterator's \c value_type is a model of - * LessThan Comparable. - * - * \code - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * int *result = thrust::max_element(data, data + 6); - * - * // *result == 3 - * \endcode - * - * \see http://www.sgi.com/tech/stl/max_element.html - */ -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last); - - -/*! \p max_element finds the largest element in the range [first, last). - * It returns the first iterator \c i in [first, last) - * such that no other iterator in [first, last) points to a value larger - * than \c *i. The return value is \p last if and only if [first, last) is an - * empty range. - * - * The two versions of \p max_element differ in how they define whether one element is - * less than another. This version compares objects using a function object \p comp. - * Specifically, this version of \p max_element returns the first iterator \c i in [first, last) - * such that, for every iterator \c j in [first, last), comp(*i, *j) is - * \c false. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp A binary predicate used for comparison. 
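A minimal sketch of max_element with a user-supplied comparator, reusing the key_value / compare_key_value shapes from the documentation; host-only, no device allocation assumed.

    #include <thrust/extrema.h>
    #include <thrust/execution_policy.h>

    struct key_value { int key; int value; };

    struct compare_key_value
    {
      __host__ __device__
      bool operator()(key_value lhs, key_value rhs) const { return lhs.key < rhs.key; }
    };

    int main()
    {
      key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} };

      // largest points at {6,1}: the element whose key compares greatest
      key_value *largest = thrust::max_element(thrust::host, data, data + 4,
                                               compare_key_value());
      return (largest == data + 3) ? 0 : 1;
    }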
- * \return An iterator pointing to the largest element of the range [first, last), - * if it is not an empty range; \p last, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to both \p comp's - * \c first_argument_type and \c second_argument_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * The following code snippet demonstrates how to use \p max_element to find the largest element - * of a collection of key-value pairs using the \p thrust::host execution policy for parallelization. - * - * \code - * #include - * #include - * ... - * - * struct key_value - * { - * int key; - * int value; - * }; - * - * struct compare_key_value - * { - * __host__ __device__ - * bool operator()(key_value lhs, key_value rhs) - * { - * return lhs.key < rhs.key; - * } - * }; - * - * ... - * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; - * - * key_value *largest = thrust::max_element(thrust::host, data, data + 4, compare_key_value()); - * - * // largest == data + 3 - * // *largest == {6,1} - * \endcode - * - * \see http://www.sgi.com/tech/stl/max_element.html - */ -template -ForwardIterator max_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp); - - -/*! \p max_element finds the largest element in the range [first, last). - * It returns the first iterator \c i in [first, last) - * such that no other iterator in [first, last) points to a value larger - * than \c *i. The return value is \p last if and only if [first, last) is an - * empty range. - * - * The two versions of \p max_element differ in how they define whether one element is - * less than another. This version compares objects using a function object \p comp. - * Specifically, this version of \p max_element returns the first iterator \c i in [first, last) - * such that, for every iterator \c j in [first, last), comp(*i, *j) is - * \c false. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp A binary predicate used for comparison. - * \return An iterator pointing to the largest element of the range [first, last), - * if it is not an empty range; \p last, otherwise. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to both \p comp's - * \c first_argument_type and \c second_argument_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * The following code snippet demonstrates how to use \p max_element to find the largest element - * of a collection of key-value pairs. - * - * \code - * #include - * - * struct key_value - * { - * int key; - * int value; - * }; - * - * struct compare_key_value - * { - * __host__ __device__ - * bool operator()(key_value lhs, key_value rhs) - * { - * return lhs.key < rhs.key; - * } - * }; - * - * ... - * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; - * - * key_value *largest = thrust::max_element(data, data + 4, compare_key_value()); - * - * // largest == data + 3 - * // *largest == {6,1} - * \endcode - * - * \see http://www.sgi.com/tech/stl/max_element.html - */ -template -ForwardIterator max_element(ForwardIterator first, ForwardIterator last, - BinaryPredicate comp); - - -/*! \p minmax_element finds the smallest and largest elements in the range [first, last). 
- * It returns a pair of iterators (imin, imax) where \c imin is the same iterator - * returned by \p min_element and \c imax is the same iterator returned by \p max_element. - * This function is potentially more efficient than separate calls to \p min_element and \p max_element. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \return A pair of iterator pointing to the smallest and largest elements of the range [first, last), - * if it is not an empty range; \p last, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \c ForwardIterator's \c value_type is a model of - * LessThan Comparable. - * - * \code - * #include - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * thrust::pair result = thrust::minmax_element(thrust::host, data, data + 6); - * - * // result.first is data + 1 - * // result.second is data + 5 - * // *result.first is 0 - * // *result.second is 3 - * \endcode - * - * \see min_element - * \see max_element - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf - */ -template -thrust::pair minmax_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last); - - -/*! \p minmax_element finds the smallest and largest elements in the range [first, last). - * It returns a pair of iterators (imin, imax) where \c imin is the same iterator - * returned by \p min_element and \c imax is the same iterator returned by \p max_element. - * This function is potentially more efficient than separate calls to \p min_element and \p max_element. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \return A pair of iterator pointing to the smallest and largest elements of the range [first, last), - * if it is not an empty range; \p last, otherwise. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \c ForwardIterator's \c value_type is a model of - * LessThan Comparable. - * - * \code - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * thrust::pair result = thrust::minmax_element(data, data + 6); - * - * // result.first is data + 1 - * // result.second is data + 5 - * // *result.first is 0 - * // *result.second is 3 - * \endcode - * - * \see min_element - * \see max_element - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf - */ -template -thrust::pair minmax_element(ForwardIterator first, - ForwardIterator last); - - -/*! \p minmax_element finds the smallest and largest elements in the range [first, last). - * It returns a pair of iterators (imin, imax) where \c imin is the same iterator - * returned by \p min_element and \c imax is the same iterator returned by \p max_element. - * This function is potentially more efficient than separate calls to \p min_element and \p max_element. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp A binary predicate used for comparison. - * \return A pair of iterator pointing to the smallest and largest elements of the range [first, last), - * if it is not an empty range; \p last, otherwise. 
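A short sketch of minmax_element returning a thrust::pair from a single pass over the range, with explicit host dispatch.

    #include <thrust/extrema.h>
    #include <thrust/execution_policy.h>
    #include <thrust/pair.h>

    int main()
    {
      int data[6] = {1, 0, 2, 2, 1, 3};

      // One pass instead of separate min_element / max_element calls.
      thrust::pair<int*, int*> result =
          thrust::minmax_element(thrust::host, data, data + 6);

      // result.first  == data + 1  (*result.first  == 0)
      // result.second == data + 5  (*result.second == 3)
      (void)result;
      return 0;
    }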
- * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to both \p comp's - * \c first_argument_type and \c second_argument_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * The following code snippet demonstrates how to use \p minmax_element to find the smallest and largest elements - * of a collection of key-value pairs using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * struct key_value - * { - * int key; - * int value; - * }; - * - * struct compare_key_value - * { - * __host__ __device__ - * bool operator()(key_value lhs, key_value rhs) - * { - * return lhs.key < rhs.key; - * } - * }; - * - * ... - * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; - * - * thrust::pair extrema = thrust::minmax_element(thrust::host, data, data + 4, compare_key_value()); - * - * // extrema.first == data + 1 - * // *extrema.first == {0,7} - * // extrema.second == data + 3 - * // *extrema.second == {6,1} - * \endcode - * - * \see min_element - * \see max_element - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf - */ -template -thrust::pair minmax_element(const thrust::detail::execution_policy_base &exec, ForwardIterator first, ForwardIterator last, BinaryPredicate comp); - - -/*! \p minmax_element finds the smallest and largest elements in the range [first, last). - * It returns a pair of iterators (imin, imax) where \c imin is the same iterator - * returned by \p min_element and \c imax is the same iterator returned by \p max_element. - * This function is potentially more efficient than separate calls to \p min_element and \p max_element. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp A binary predicate used for comparison. - * \return A pair of iterator pointing to the smallest and largest elements of the range [first, last), - * if it is not an empty range; \p last, otherwise. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to both \p comp's - * \c first_argument_type and \c second_argument_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * The following code snippet demonstrates how to use \p minmax_element to find the smallest and largest elements - * of a collection of key-value pairs. - * - * \code - * #include - * #include - * - * struct key_value - * { - * int key; - * int value; - * }; - * - * struct compare_key_value - * { - * __host__ __device__ - * bool operator()(key_value lhs, key_value rhs) - * { - * return lhs.key < rhs.key; - * } - * }; - * - * ... - * key_value data[4] = { {4,5}, {0,7}, {2,3}, {6,1} }; - * - * thrust::pair extrema = thrust::minmax_element(data, data + 4, compare_key_value()); - * - * // extrema.first == data + 1 - * // *extrema.first == {0,7} - * // extrema.second == data + 3 - * // *extrema.second == {6,1} - * \endcode - * - * \see min_element - * \see max_element - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1840.pdf - */ -template -thrust::pair minmax_element(ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp); - -/*! 
\} // end extrema - * \} // end reductions - */ - -} // end thrust - -#include -#include - diff --git a/compat/thrust/fill.h b/compat/thrust/fill.h deleted file mode 100644 index b492cec9dd..0000000000 --- a/compat/thrust/fill.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file fill.h - * \brief Fills a range with a constant value - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup transformations - * \addtogroup filling - * \ingroup transformations - * \{ - */ - - -/*! \p fill assigns the value \p value to every element in - * the range [first, last). That is, for every - * iterator \c i in [first, last), it performs - * the assignment *i = value. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param value The value to be copied. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam T is a model of Assignable, - * and \p T's \c value_type is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's - * elements to a given value using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector v(4); - * thrust::fill(thrust::device, v.begin(), v.end(), 137); - * - * // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137 - * \endcode - * - * \see http://www.sgi.com/tech/stl/fill.html - * \see \c fill_n - * \see \c uninitialized_fill - */ -template - void fill(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T &value); - - -/*! \p fill assigns the value \p value to every element in - * the range [first, last). That is, for every - * iterator \c i in [first, last), it performs - * the assignment *i = value. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param value The value to be copied. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam T is a model of Assignable, - * and \p T's \c value_type is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's - * elements to a given value. - * - * \code - * #include - * #include - * ... 
- * thrust::device_vector v(4); - * thrust::fill(v.begin(), v.end(), 137); - * - * // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137 - * \endcode - * - * \see http://www.sgi.com/tech/stl/fill.html - * \see \c fill_n - * \see \c uninitialized_fill - */ -template - void fill(ForwardIterator first, - ForwardIterator last, - const T &value); - - -/*! \p fill_n assigns the value \p value to every element in - * the range [first, first+n). That is, for every - * iterator \c i in [first, first+n), it performs - * the assignment *i = value. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param n The size of the sequence. - * \param value The value to be copied. - * \return first + n - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam T is a model of Assignable, - * and \p T's \c value_type is convertible to a type in \p OutputIterator's set of \c value_type. - * - * The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's - * elements to a given value using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector v(4); - * thrust::fill_n(thrust::device, v.begin(), v.size(), 137); - * - * // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137 - * \endcode - * - * \see http://www.sgi.com/tech/stl/fill_n.html - * \see \c fill - * \see \c uninitialized_fill_n - */ -template - OutputIterator fill_n(const thrust::detail::execution_policy_base &exec, - OutputIterator first, - Size n, - const T &value); - - -/*! \p fill_n assigns the value \p value to every element in - * the range [first, first+n). That is, for every - * iterator \c i in [first, first+n), it performs - * the assignment *i = value. - * - * \param first The beginning of the sequence. - * \param n The size of the sequence. - * \param value The value to be copied. - * \return first + n - * - * \tparam OutputIterator is a model of Output Iterator. - * \tparam T is a model of Assignable, - * and \p T's \c value_type is convertible to a type in \p OutputIterator's set of \c value_type. - * - * The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's - * elements to a given value. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(4); - * thrust::fill_n(v.begin(), v.size(), 137); - * - * // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137 - * \endcode - * - * \see http://www.sgi.com/tech/stl/fill_n.html - * \see \c fill - * \see \c uninitialized_fill_n - */ -template - OutputIterator fill_n(OutputIterator first, - Size n, - const T &value); - - -/*! \} // end filling - * \} // transformations - */ - -} // end namespace thrust - -#include - diff --git a/compat/thrust/find.h b/compat/thrust/find.h deleted file mode 100644 index fa01ded500..0000000000 --- a/compat/thrust/find.h +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
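A minimal sketch of the fill / fill_n interface documented above on a thrust::device_vector; assumes compilation with nvcc against the CUDA backend.

    #include <thrust/fill.h>
    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>

    int main()
    {
      thrust::device_vector<int> v(4);

      // fill the whole range, dispatching explicitly to the device backend
      thrust::fill(thrust::device, v.begin(), v.end(), 137);

      // fill_n covers the first n elements and returns v.begin() + n
      thrust::fill_n(v.begin(), v.size(), 137);

      // v[0] == v[1] == v[2] == v[3] == 137
      return 0;
    }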
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file find.h - * \brief Locating values in (unsorted) ranges - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup algorithms - */ - -/*! \addtogroup searching - * \ingroup algorithms - * \{ - */ - - -/*! \p find returns the first iterator \c i in the range - * [first, last) such that *i == value - * or \c last if no such iterator exists. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first Beginning of the sequence to search. - * \param last End of the sequence to search. - * \param value The value to find. - * \return The first iterator \c i such that *i == value or \c last. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and \p InputIterator's \c value_type is equality comparable to type \c T. - * \tparam T is a model of EqualityComparable. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector input(4); - * - * input[0] = 0; - * input[1] = 5; - * input[2] = 3; - * input[3] = 7; - * - * thrust::device_vector::iterator iter; - * - * iter = thrust::find(thrust::device, input.begin(), input.end(), 3); // returns input.first() + 2 - * iter = thrust::find(thrust::device, input.begin(), input.end(), 5); // returns input.first() + 1 - * iter = thrust::find(thrust::device, input.begin(), input.end(), 9); // returns input.end() - * \endcode - * - * \see find_if - * \see mismatch - */ -template -InputIterator find(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - const T& value); - - -/*! \p find returns the first iterator \c i in the range - * [first, last) such that *i == value - * or \c last if no such iterator exists. - * - * \param first Beginning of the sequence to search. - * \param last End of the sequence to search. - * \param value The value to find. - * \return The first iterator \c i such that *i == value or \c last. - * - * \tparam InputIterator is a model of Input Iterator - * and \p InputIterator's \c value_type is equality comparable to type \c T. - * \tparam T is a model of EqualityComparable. - * - * \code - * #include - * #include - * ... - * thrust::device_vector input(4); - * - * input[0] = 0; - * input[1] = 5; - * input[2] = 3; - * input[3] = 7; - * - * thrust::device_vector::iterator iter; - * - * iter = thrust::find(input.begin(), input.end(), 3); // returns input.first() + 2 - * iter = thrust::find(input.begin(), input.end(), 5); // returns input.first() + 1 - * iter = thrust::find(input.begin(), input.end(), 9); // returns input.end() - * \endcode - * - * \see find_if - * \see mismatch - */ -template -InputIterator find(InputIterator first, - InputIterator last, - const T& value); - - -/*! \p find_if returns the first iterator \c i in the range - * [first, last) such that pred(*i) is \c true - * or \c last if no such iterator exists. - * - * The algorithm's execution is parallelized as determined by \p exec. 
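A minimal sketch of thrust::find on a device_vector, mirroring the values used in the documentation above; assumes the CUDA backend and nvcc.

    #include <thrust/find.h>
    #include <thrust/device_vector.h>

    int main()
    {
      thrust::device_vector<int> input(4);
      input[0] = 0; input[1] = 5; input[2] = 3; input[3] = 7;

      thrust::device_vector<int>::iterator iter;

      iter = thrust::find(input.begin(), input.end(), 3); // input.begin() + 2
      iter = thrust::find(input.begin(), input.end(), 5); // input.begin() + 1
      iter = thrust::find(input.begin(), input.end(), 9); // input.end()
      (void)iter;
      return 0;
    }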
- * - * \param exec The execution policy to use for parallelization. - * \param first Beginning of the sequence to search. - * \param last End of the sequence to search. - * \param pred A predicate used to test range elements. - * \return The first iterator \c i such that pred(*i) is \c true, or \c last. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator. - * \tparam Predicate is a model of Predicate. - * - * \code - * #include - * #include - * #include - * ... - * - * struct greater_than_four - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x > 4; - * } - * }; - * - * struct greater_than_ten - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x > 10; - * } - * }; - * - * ... - * thrust::device_vector input(4); - * - * input[0] = 0; - * input[1] = 5; - * input[2] = 3; - * input[3] = 7; - * - * thrust::device_vector::iterator iter; - * - * iter = thrust::find_if(thrust::device, input.begin(), input.end(), greater_than_four()); // returns input.first() + 1 - * - * iter = thrust::find_if(thrust::device, input.begin(), input.end(), greater_than_ten()); // returns input.end() - * \endcode - * - * \see find - * \see find_if_not - * \see mismatch - */ -template -InputIterator find_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - Predicate pred); - - -/*! \p find_if returns the first iterator \c i in the range - * [first, last) such that pred(*i) is \c true - * or \c last if no such iterator exists. - * - * \param first Beginning of the sequence to search. - * \param last End of the sequence to search. - * \param pred A predicate used to test range elements. - * \return The first iterator \c i such that pred(*i) is \c true, or \c last. - * - * \tparam InputIterator is a model of Input Iterator. - * \tparam Predicate is a model of Predicate. - * - * \code - * #include - * #include - * - * struct greater_than_four - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x > 4; - * } - * }; - * - * struct greater_than_ten - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x > 10; - * } - * }; - * - * ... - * thrust::device_vector input(4); - * - * input[0] = 0; - * input[1] = 5; - * input[2] = 3; - * input[3] = 7; - * - * thrust::device_vector::iterator iter; - * - * iter = thrust::find_if(input.begin(), input.end(), greater_than_four()); // returns input.first() + 1 - * - * iter = thrust::find_if(input.begin(), input.end(), greater_than_ten()); // returns input.end() - * \endcode - * - * \see find - * \see find_if_not - * \see mismatch - */ -template -InputIterator find_if(InputIterator first, - InputIterator last, - Predicate pred); - - -/*! \p find_if_not returns the first iterator \c i in the range - * [first, last) such that pred(*i) is \c false - * or \c last if no such iterator exists. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first Beginning of the sequence to search. - * \param last End of the sequence to search. - * \param pred A predicate used to test range elements. - * \return The first iterator \c i such that pred(*i) is \c false, or \c last. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator. - * \tparam Predicate is a model of Predicate. 
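A compact sketch of find_if and find_if_not with a device-callable predicate; greater_than_four follows the documented example, and the CUDA backend is assumed.

    #include <thrust/find.h>
    #include <thrust/device_vector.h>

    struct greater_than_four
    {
      __host__ __device__
      bool operator()(int x) const { return x > 4; }
    };

    int main()
    {
      thrust::device_vector<int> input(4);
      input[0] = 0; input[1] = 5; input[2] = 3; input[3] = 7;

      // first element greater than four -> input.begin() + 1
      thrust::device_vector<int>::iterator it =
          thrust::find_if(input.begin(), input.end(), greater_than_four());

      // first element NOT greater than four -> input.begin()
      it = thrust::find_if_not(input.begin(), input.end(), greater_than_four());
      (void)it;
      return 0;
    }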
- * - * \code - * #include - * #include - * #include - * ... - * - * struct greater_than_four - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x > 4; - * } - * }; - * - * struct greater_than_ten - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x > 10; - * } - * }; - * - * ... - * thrust::device_vector input(4); - * - * input[0] = 0; - * input[1] = 5; - * input[2] = 3; - * input[3] = 7; - * - * thrust::device_vector::iterator iter; - * - * iter = thrust::find_if_not(thrust::device, input.begin(), input.end(), greater_than_four()); // returns input.first() - * - * iter = thrust::find_if_not(thrust::device, input.begin(), input.end(), greater_than_ten()); // returns input.first() - * \endcode - * - * \see find - * \see find_if - * \see mismatch - */ -template -InputIterator find_if_not(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - Predicate pred); - - -/*! \p find_if_not returns the first iterator \c i in the range - * [first, last) such that pred(*i) is \c false - * or \c last if no such iterator exists. - * - * \param first Beginning of the sequence to search. - * \param last End of the sequence to search. - * \param pred A predicate used to test range elements. - * \return The first iterator \c i such that pred(*i) is \c false, or \c last. - * - * \tparam InputIterator is a model of Input Iterator. - * \tparam Predicate is a model of Predicate. - * - * \code - * #include - * #include - * - * struct greater_than_four - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x > 4; - * } - * }; - * - * struct greater_than_ten - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x > 10; - * } - * }; - * - * ... - * thrust::device_vector input(4); - * - * input[0] = 0; - * input[1] = 5; - * input[2] = 3; - * input[3] = 7; - * - * thrust::device_vector::iterator iter; - * - * iter = thrust::find_if_not(input.begin(), input.end(), greater_than_four()); // returns input.first() - * - * iter = thrust::find_if_not(input.begin(), input.end(), greater_than_ten()); // returns input.first() - * \endcode - * - * \see find - * \see find_if - * \see mismatch - */ -template -InputIterator find_if_not(InputIterator first, - InputIterator last, - Predicate pred); - -/*! \} // end searching - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/for_each.h b/compat/thrust/for_each.h deleted file mode 100644 index efab9d8fab..0000000000 --- a/compat/thrust/for_each.h +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file for_each.h - * \brief Applies a function to each element in a range - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup modifying - * \ingroup transformations - * \{ - */ - - -/*! 
\p for_each applies the function object \p f to each element - * in the range [first, last); \p f's return value, if any, - * is ignored. Unlike the C++ Standard Template Library function - * std::for_each, this version offers no guarantee on - * order of execution. For this reason, this version of \p for_each - * does not return a copy of the function object. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param f The function object to apply to the range [first, last). - * \return last - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type. - * \tparam UnaryFunction is a model of Unary Function, - * and \p UnaryFunction does not apply any non-constant operation through its argument. - * - * The following code snippet demonstrates how to use \p for_each to print the elements - * of a \p std::device_vector using the \p thrust::device parallelization policy: - * - * \code - * #include - * #include - * #include - * #include - * ... - * - * struct printf_functor - * { - * __host__ __device__ - * void operator()(int x) - * { - * // note that using printf in a __device__ function requires - * // code compiled for a GPU with compute capability 2.0 or - * // higher (nvcc --arch=sm_20) - * printf("%d\n"); - * } - * }; - * ... - * thrust::device_vector d_vec(3); - * d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2; - * - * thrust::for_each(thrust::device, d_vec.begin(), d_vec.end(), printf_functor()); - * - * // 0 1 2 is printed to standard output in some unspecified order - * \endcode - * - * \see for_each_n - * \see http://www.sgi.com/tech/stl/for_each.html - */ -template -InputIterator for_each(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - UnaryFunction f); - - -/*! \p for_each_n applies the function object \p f to each element - * in the range [first, first + n); \p f's return value, if any, - * is ignored. Unlike the C++ Standard Template Library function - * std::for_each, this version offers no guarantee on - * order of execution. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param n The size of the input sequence. - * \param f The function object to apply to the range [first, first + n). - * \return first + n - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type. - * \tparam Size is an integral type. - * \tparam UnaryFunction is a model of Unary Function, - * and \p UnaryFunction does not apply any non-constant operation through its argument. - * - * The following code snippet demonstrates how to use \p for_each_n to print the elements - * of a \p device_vector using the \p thrust::device parallelization policy. 
- * - * \code - * #include - * #include - * #include - * #include - * - * struct printf_functor - * { - * __host__ __device__ - * void operator()(int x) - * { - * // note that using printf in a __device__ function requires - * // code compiled for a GPU with compute capability 2.0 or - * // higher (nvcc --arch=sm_20) - * printf("%d\n"); - * } - * }; - * ... - * thrust::device_vector d_vec(3); - * d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2; - * - * thrust::for_each_n(thrust::device, d_vec.begin(), d_vec.size(), printf_functor()); - * - * // 0 1 2 is printed to standard output in some unspecified order - * \endcode - * - * \see for_each - * \see http://www.sgi.com/tech/stl/for_each.html - */ -template -InputIterator for_each_n(const thrust::detail::execution_policy_base &exec, - InputIterator first, - Size n, - UnaryFunction f); - -/*! \p for_each applies the function object \p f to each element - * in the range [first, last); \p f's return value, if any, - * is ignored. Unlike the C++ Standard Template Library function - * std::for_each, this version offers no guarantee on - * order of execution. For this reason, this version of \p for_each - * does not return a copy of the function object. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param f The function object to apply to the range [first, last). - * \return last - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type. - * \tparam UnaryFunction is a model of Unary Function, - * and \p UnaryFunction does not apply any non-constant operation through its argument. - * - * The following code snippet demonstrates how to use \p for_each to print the elements - * of a \p device_vector. - * - * \code - * #include - * #include - * #include - * - * struct printf_functor - * { - * __host__ __device__ - * void operator()(int x) - * { - * // note that using printf in a __device__ function requires - * // code compiled for a GPU with compute capability 2.0 or - * // higher (nvcc --arch=sm_20) - * printf("%d\n"); - * } - * }; - * ... - * thrust::device_vector d_vec(3); - * d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2; - * - * thrust::for_each(d_vec.begin(), d_vec.end(), printf_functor()); - * - * // 0 1 2 is printed to standard output in some unspecified order - * \endcode - * - * \see for_each_n - * \see http://www.sgi.com/tech/stl/for_each.html - */ -template -InputIterator for_each(InputIterator first, - InputIterator last, - UnaryFunction f); - - -/*! \p for_each_n applies the function object \p f to each element - * in the range [first, first + n); \p f's return value, if any, - * is ignored. Unlike the C++ Standard Template Library function - * std::for_each, this version offers no guarantee on - * order of execution. - * - * \param first The beginning of the sequence. - * \param n The size of the input sequence. - * \param f The function object to apply to the range [first, first + n). - * \return first + n - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type. - * \tparam Size is an integral type. - * \tparam UnaryFunction is a model of Unary Function, - * and \p UnaryFunction does not apply any non-constant operation through its argument. - * - * The following code snippet demonstrates how to use \p for_each_n to print the elements - * of a \p device_vector. 
- * - * \code - * #include - * #include - * #include - * - * struct printf_functor - * { - * __host__ __device__ - * void operator()(int x) - * { - * // note that using printf in a __device__ function requires - * // code compiled for a GPU with compute capability 2.0 or - * // higher (nvcc --arch=sm_20) - * printf("%d\n"); - * } - * }; - * ... - * thrust::device_vector d_vec(3); - * d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2; - * - * thrust::for_each_n(d_vec.begin(), d_vec.size(), printf_functor()); - * - * // 0 1 2 is printed to standard output in some unspecified order - * \endcode - * - * \see for_each - * \see http://www.sgi.com/tech/stl/for_each.html - */ -template -InputIterator for_each_n(InputIterator first, - Size n, - UnaryFunction f); - -/*! \} // end modifying - */ - -} // end namespace thrust - -#include - diff --git a/compat/thrust/functional.h b/compat/thrust/functional.h deleted file mode 100644 index b3d47f9179..0000000000 --- a/compat/thrust/functional.h +++ /dev/null @@ -1,1079 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file functional.h - * \brief Function objects and tools for manipulating them - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup function_objects Function Objects - */ - -template struct unary_traits; - -template struct binary_traits; - -/*! \addtogroup function_object_adaptors Function Object Adaptors - * \ingroup function_objects - * \{ - */ - -/*! \p unary_function is an empty base class: it contains no member functions - * or member variables, but only type information. The only reason it exists - * is to make it more convenient to define types that are models of the - * concept Adaptable Unary Function. Specifically, any model of Adaptable - * Unary Function must define nested \c typedefs. Those \c typedefs are - * provided by the base class \p unary_function. - * - * The following code snippet demonstrates how to construct an - * Adaptable Unary Function using \p unary_function. - * - * \code - * struct sine : public thrust::unary_function - * { - * __host__ __device__ - * float operator()(float x) { return sinf(x); } - * }; - * \endcode - * - * \note unary_function is currently redundant with the C++ STL type - * \c std::unary_function. We reserve it here for potential additional - * functionality at a later date. - * - * \see http://www.sgi.com/tech/stl/unary_function.html - * \see binary_function - */ -template - struct unary_function - : public std::unary_function -{ -}; // end unary_function - -/*! \p binary_function is an empty base class: it contains no member functions - * or member variables, but only type information. The only reason it exists - * is to make it more convenient to define types that are models of the - * concept Adaptable Binary Function. Specifically, any model of Adaptable - * Binary Function must define nested \c typedefs. Those \c typedefs are - * provided by the base class \p binary_function. 
- * - * The following code snippet demonstrates how to construct an - * Adaptable Binary Function using \p binary_function. - * - * \code - * struct exponentiate : public thrust::binary_function - * { - * __host__ __device__ - * float operator()(float x, float y) { return powf(x,y); } - * }; - * \endcode - * - * \note binary_function is currently redundant with the C++ STL type - * \c std::binary_function. We reserve it here for potential additional - * functionality at a later date. - * - * \see http://www.sgi.com/tech/stl/binary_function.html - * \see unary_function - */ -template - struct binary_function - : public std::binary_function -{ -}; // end binary_function - -/*! \} - */ - - -/*! \addtogroup predefined_function_objects Predefined Function Objects - * \ingroup function_objects - */ - -/*! \addtogroup arithmetic_operations Arithmetic Operations - * \ingroup predefined_function_objects - * \{ - */ - -/*! \p plus is a function object. Specifically, it is an Adaptable Binary Function. - * If \c f is an object of class plus, and \c x and \c y are objects - * of class \c T, then f(x,y) returns x+y. - * - * \tparam T is a model of Assignable, - * and if \c x and \c y are objects of type \p T, then x+y must be defined and must have a return type that is convertible to \c T. - * - * The following code snippet demonstrates how to use plus to sum two - * device_vectors of \c floats. - * - * \code - * #include - * #include - * #include - * #include - * #include - * ... - * const int N = 1000; - * thrust::device_vector V1(N); - * thrust::device_vector V2(N); - * thrust::device_vector V3(N); - * - * thrust::sequence(V1.begin(), V1.end(), 1); - * thrust::fill(V2.begin(), V2.end(), 75); - * - * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), - * thrust::plus()); - * // V3 is now {76, 77, 78, ..., 1075} - * \endcode - * - * \see http://www.sgi.com/tech/stl/plus.html - * \see binary_function - */ -template - struct plus : public binary_function -{ - /*! Function call operator. The return value is lhs + rhs. - */ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs + rhs;} -}; // end plus - -/*! \p minus is a function object. Specifically, it is an Adaptable Binary Function. - * If \c f is an object of class minus, and \c x and \c y are objects - * of class \c T, then f(x,y) returns x-y. - * - * \tparam T is a model of Assignable, - * and if \c x and \c y are objects of type \p T, then x-y must be defined and must have a return type that is convertible to \c T. - * - * The following code snippet demonstrates how to use minus to subtract - * a device_vector of \c floats from another. - * - * \code - * #include - * #include - * #include - * #include - * #include - * ... - * const int N = 1000; - * thrust::device_vector V1(N); - * thrust::device_vector V2(N); - * thrust::device_vector V3(N); - * - * thrust::sequence(V1.begin(), V1.end(), 1); - * thrust::fill(V2.begin(), V2.end(), 75); - * - * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), - * thrust::minus()); - * // V3 is now {-74, -75, -76, ..., -925} - * \endcode - * - * \see http://www.sgi.com/tech/stl/minus.html - * \see binary_function - */ -template - struct minus : public binary_function -{ - /*! Function call operator. The return value is lhs - rhs. - */ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs - rhs;} -}; // end minus - -/*! \p multiplies is a function object. Specifically, it is an Adaptable Binary Function. 
- * If \c f is an object of class multiplies, and \c x and \c y are objects - * of class \c T, then f(x,y) returns x*y. - * - * \tparam T is a model of Assignable, - * and if \c x and \c y are objects of type \p T, then x*y must be defined and must have a return type that is convertible to \c T. - * - * The following code snippet demonstrates how to use multiplies to multiply - * two device_vectors of \c floats. - * - * \code - * #include - * #include - * #include - * #include - * #include - * ... - * const int N = 1000; - * thrust::device_vector V1(N); - * thrust::device_vector V2(N); - * thrust::device_vector V3(N); - * - * thrust::sequence(V1.begin(), V1.end(), 1); - * thrust::fill(V2.begin(), V2.end(), 75); - * - * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), - * thrust::multiplies()); - * // V3 is now {75, 150, 225, ..., 75000} - * \endcode - * - * \see http://www.sgi.com/tech/stl/multiplies.html - * \see binary_function - */ -template - struct multiplies : public binary_function -{ - /*! Function call operator. The return value is lhs * rhs. - */ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs * rhs;} -}; // end multiplies - -/*! \p divides is a function object. Specifically, it is an Adaptable Binary Function. - * If \c f is an object of class divides, and \c x and \c y are objects - * of class \c T, then f(x,y) returns x/y. - * - * \tparam T is a model of Assignable, - * and if \c x and \c y are objects of type \p T, then x/y must be defined and must have a return type that is convertible to \c T. - * - * The following code snippet demonstrates how to use divides to divide - * one device_vector of \c floats by another. - * - * \code - * #include - * #include - * #include - * #include - * #include - * ... - * const int N = 1000; - * thrust::device_vector V1(N); - * thrust::device_vector V2(N); - * thrust::device_vector V3(N); - * - * thrust::sequence(V1.begin(), V1.end(), 1); - * thrust::fill(V2.begin(), V2.end(), 75); - * - * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), - * thrust::divides()); - * // V3 is now {1/75, 2/75, 3/75, ..., 1000/75} - * \endcode - * - * \see http://www.sgi.com/tech/stl/divides.html - * \see binary_function - */ -template - struct divides : public binary_function -{ - /*! Function call operator. The return value is lhs / rhs. - */ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs / rhs;} -}; // end divides - -/*! \p modulus is a function object. Specifically, it is an Adaptable Binary Function. - * If \c f is an object of class modulus, and \c x and \c y are objects - * of class \c T, then f(x,y) returns x%y. - * - * \tparam T is a model of Assignable, - * and if \c x and \c y are objects of type \p T, then x%y must be defined and must have a return type that is convertible to \c T. - * - * The following code snippet demonstrates how to use modulus to take - * the modulus of one device_vector of \c ints by another. - * - * \code - * #include - * #include - * #include - * #include - * #include - * ...
- * const int N = 1000; - * thrust::device_vector V1(N); - * thrust::device_vector V2(N); - * thrust::device_vector V3(N); - * - * thrust::sequence(V1.begin(), V1.end(), 1); - * thrust::fill(V2.begin(), V2.end(), 75); - * - * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), - * thrust::modulus()); - * // V3 is now {1%75, 2%75, 3%75, ..., 1000%75} - * \endcode - * - * \see http://www.sgi.com/tech/stl/modulus.html - * \see binary_function - */ -template - struct modulus : public binary_function -{ - /*! Function call operator. The return value is lhs % rhs. - */ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs % rhs;} -}; // end modulus - -/*! \p negate is a function object. Specifically, it is an Adaptable Unary Function. - * If \c f is an object of class negate, and \c x is an object - * of class \c T, then f(x) returns -x. - * - * \tparam T is a model of Assignable, - * and if \c x is an object of type \p T, then -x must be defined and must have a return type that is convertible to \c T. - * - * The following code snippet demonstrates how to use negate to negate - * the element of a device_vector of \c floats. - * - * \code - * #include - * #include - * #include - * #include - * ... - * const int N = 1000; - * thrust::device_vector V1(N); - * thrust::device_vector V2(N); - * - * thrust::sequence(V1.begin(), V1.end(), 1); - * - * thrust::transform(V1.begin(), V1.end(), V2.begin(), - * thrust::negate()); - * // V2 is now {-1, -2, -3, ..., -1000} - * \endcode - * - * \see http://www.sgi.com/tech/stl/negate.html - * \see unary_function - */ -template - struct negate : public unary_function -{ - /*! Function call operator. The return value is -x. - */ - __host__ __device__ T operator()(const T &x) const {return -x;} -}; // end negate - -/*! \} - */ - -/*! \addtogroup comparison_operations Comparison Operations - * \ingroup predefined_function_objects - * \{ - */ - -/*! \p equal_to is a function object. Specifically, it is an Adaptable Binary - * Predicate, which means it is a function object that tests the truth or falsehood - * of some condition. If \c f is an object of class equal_to and \c x - * and \c y are objects of class \c T, then f(x,y) returns \c true if - * x == y and \c false otherwise. - * - * \tparam T is a model of Equality Comparable. - * - * \see http://www.sgi.com/tech/stl/equal_to.html - * \see binary_function - */ -template - struct equal_to : public binary_function -{ - /*! Function call operator. The return value is lhs == rhs. - */ - __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs == rhs;} -}; // end equal_to - -/*! \p not_equal_to is a function object. Specifically, it is an Adaptable Binary - * Predicate, which means it is a function object that tests the truth or falsehood - * of some condition. If \c f is an object of class not_equal_to and \c x - * and \c y are objects of class \c T, then f(x,y) returns \c true if - * x != y and \c false otherwise. - * - * \tparam T is a model of Equality Comparable. - * - * \see http://www.sgi.com/tech/stl/not_equal_to.html - * \see binary_function - */ -template - struct not_equal_to : public binary_function -{ - /*! Function call operator. The return value is lhs != rhs. - */ - __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs != rhs;} -}; // end not_equal_to - -/*! \p greater is a function object. 
Specifically, it is an Adaptable Binary - * Predicate, which means it is a function object that tests the truth or falsehood - * of some condition. If \c f is an object of class greater and \c x - * and \c y are objects of class \c T, then f(x,y) returns \c true if - * x > y and \c false otherwise. - * - * \tparam T is a model of LessThan Comparable. - * - * \see http://www.sgi.com/tech/stl/greater.html - * \see binary_function - */ -template - struct greater : public binary_function -{ - /*! Function call operator. The return value is lhs > rhs. - */ - __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs > rhs;} -}; // end greater - -/*! \p less is a function object. Specifically, it is an Adaptable Binary - * Predicate, which means it is a function object that tests the truth or falsehood - * of some condition. If \c f is an object of class less and \c x - * and \c y are objects of class \c T, then f(x,y) returns \c true if - * x < y and \c false otherwise. - * - * \tparam T is a model of LessThan Comparable. - * - * \see http://www.sgi.com/tech/stl/less.html - * \see binary_function - */ -template - struct less : public binary_function -{ - /*! Function call operator. The return value is lhs < rhs. - */ - __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs < rhs;} -}; // end less - -/*! \p greater_equal is a function object. Specifically, it is an Adaptable Binary - * Predicate, which means it is a function object that tests the truth or falsehood - * of some condition. If \c f is an object of class greater_equal and \c x - * and \c y are objects of class \c T, then f(x,y) returns \c true if - * x >= y and \c false otherwise. - * - * \tparam T is a model of LessThan Comparable. - * - * \see http://www.sgi.com/tech/stl/greater_equal.html - * \see binary_function - */ -template - struct greater_equal : public binary_function -{ - /*! Function call operator. The return value is lhs >= rhs. - */ - __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs >= rhs;} -}; // end greater_equal - -/*! \p less_equal is a function object. Specifically, it is an Adaptable Binary - * Predicate, which means it is a function object that tests the truth or falsehood - * of some condition. If \c f is an object of class less_equal and \c x - * and \c y are objects of class \c T, then f(x,y) returns \c true if - * x <= y and \c false otherwise. - * - * \tparam T is a model of LessThan Comparable. - * - * \see http://www.sgi.com/tech/stl/less_equal.html - * \see binary_function - */ -template - struct less_equal : public binary_function -{ - /*! Function call operator. The return value is lhs <= rhs. - */ - __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs <= rhs;} -}; // end less_equal - -/*! \} - */ - - -/*! \addtogroup logical_operations Logical Operations - * \ingroup predefined_function_objects - * \{ - */ - -/*! \p logical_and is a function object. Specifically, it is an Adaptable Binary Predicate, - * which means it is a function object that tests the truth or falsehood of some condition. - * If \c f is an object of class logical_and and \c x and \c y are objects of - * class \c T (where \c T is convertible to \c bool) then f(x,y) returns \c true - * if and only if both \c x and \c y are \c true. - * - * \tparam T must be convertible to \c bool. 
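For illustration, a minimal sketch (the vector contents are made-up) of the usual role of the comparison functors above as ordering relations, here passing thrust::greater to thrust::sort to obtain a descending sort:

#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>

int main(void)
{
  thrust::device_vector<int> v(5);
  v[0] = 3; v[1] = 1; v[2] = 4; v[3] = 1; v[4] = 5;

  // thrust::greater<int> as the comparator reverses the default ascending order
  thrust::sort(v.begin(), v.end(), thrust::greater<int>());
  // v is now {5, 4, 3, 1, 1}
  return 0;
}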
- * - * \see http://www.sgi.com/tech/stl/logical_and.html - * \see binary_function - */ -template - struct logical_and : public binary_function -{ - /*! Function call operator. The return value is lhs && rhs. - */ - __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs && rhs;} -}; // end logical_and - -/*! \p logical_or is a function object. Specifically, it is an Adaptable Binary Predicate, - * which means it is a function object that tests the truth or falsehood of some condition. - * If \c f is an object of class logical_or and \c x and \c y are objects of - * class \c T (where \c T is convertible to \c bool) then f(x,y) returns \c true - * if and only if either \c x or \c y are \c true. - * - * \tparam T must be convertible to \c bool. - * - * \see http://www.sgi.com/tech/stl/logical_or.html - * \see binary_function - */ -template - struct logical_or : public binary_function -{ - /*! Function call operator. The return value is lhs || rhs. - */ - __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs || rhs;} -}; // end logical_or - -/*! \p logical_not is a function object. Specifically, it is an Adaptable Predicate, - * which means it is a function object that tests the truth or falsehood of some condition. - * If \c f is an object of class logical_not and \c x is an object of - * class \c T (where \c T is convertible to \c bool) then f(x) returns \c true - * if and only if \c x is \c false. - * - * \tparam T must be convertible to \c bool. - * - * The following code snippet demonstrates how to use \p logical_not to transform - * a device_vector of \c bools into its logical complement. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector V; - * ... - * thrust::transform(V.begin(), V.end(), V.begin(), thrust::logical_not()); - * // The elements of V are now the logical complement of what they were prior - * \endcode - * - * \see http://www.sgi.com/tech/stl/logical_not.html - * \see unary_function - */ -template - struct logical_not : public unary_function -{ - /*! Function call operator. The return value is !x. - */ - __host__ __device__ bool operator()(const T &x) const {return !x;} -}; // end logical_not - -/*! \} - */ - -/*! \addtogroup bitwise_operations Bitwise Operations - * \ingroup predefined_function_objects - * \{ - */ - -/*! \p bit_and is a function object. Specifically, it is an Adaptable Binary Function. - * If \c f is an object of class bit_and, and \c x and \c y are objects - * of class \c T, then f(x,y) returns x&y. - * - * \tparam T is a model of Assignable, - * and if \c x and \c y are objects of type \p T, then x&y must be defined and must have a return type that is convertible to \c T. - * - * The following code snippet demonstrates how to use bit_and to take - * the bitwise AND of one device_vector of \c ints by another. - * - * \code - * #include - * #include - * #include - * #include - * #include - * ... - * const int N = 1000; - * thrust::device_vector V1(N); - * thrust::device_vector V2(N); - * thrust::device_vector V3(N); - * - * thrust::sequence(V1.begin(), V1.end(), 1); - * thrust::fill(V2.begin(), V2.end(), 13); - * - * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), - * thrust::bit_and()); - * // V3 is now {1&13, 2&13, 3&13, ..., 1000%13} - * \endcode - * - * \see binary_function - */ -template - struct bit_and : public binary_function -{ - /*! Function call operator. The return value is lhs & rhs. 
- */ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs & rhs;} -}; // end bit_and - -/*! \p bit_or is a function object. Specifically, it is an Adaptable Binary Function. - * If \c f is an object of class bit_or, and \c x and \c y are objects - * of class \c T, then f(x,y) returns x|y. - * - * \tparam T is a model of Assignable, - * and if \c x and \c y are objects of type \p T, then x|y must be defined and must have a return type that is convertible to \c T. - * - * The following code snippet demonstrates how to use bit_or to take - * the bitwise OR of one device_vector of \c ints by another. - * - * \code - * #include - * #include - * #include - * #include - * #include - * ... - * const int N = 1000; - * thrust::device_vector V1(N); - * thrust::device_vector V2(N); - * thrust::device_vector V3(N); - * - * thrust::sequence(V1.begin(), V1.end(), 1); - * thrust::fill(V2.begin(), V2.end(), 13); - * - * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), - * thrust::bit_or()); - * // V3 is now {1|13, 2|13, 3|13, ..., 1000|13} - * \endcode - * - * \see binary_function - */ -template - struct bit_or : public binary_function -{ - /*! Function call operator. The return value is lhs | rhs. - */ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs | rhs;} -}; // end bit_or - -/*! \p bit_xor is a function object. Specifically, it is an Adaptable Binary Function. - * If \c f is an object of class bit_xor, and \c x and \c y are objects - * of class \c T, then f(x,y) returns x^y. - * - * \tparam T is a model of Assignable, - * and if \c x and \c y are objects of type \p T, then x^y must be defined and must have a return type that is convertible to \c T. - * - * The following code snippet demonstrates how to use bit_xor to take - * the bitwise XOR of one device_vector of \c ints by another. - * - * \code - * #include - * #include - * #include - * #include - * #include - * ... - * const int N = 1000; - * thrust::device_vector V1(N); - * thrust::device_vector V2(N); - * thrust::device_vector V3(N); - * - * thrust::sequence(V1.begin(), V1.end(), 1); - * thrust::fill(V2.begin(), V2.end(), 13); - * - * thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(), - * thrust::bit_xor()); - * // V3 is now {1^13, 2^13, 3^13, ..., 1000^13} - * \endcode - * - * \see binary_function - */ -template - struct bit_xor : public binary_function -{ - /*! Function call operator. The return value is lhs ^ rhs. - */ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs ^ rhs;} -}; // end bit_xor - -/*! \} - */ - -/*! \addtogroup generalized_identity_operations Generalized Identity Operations - * \ingroup predefined_function_objects - * \{ - */ - -/*! \p identity is a Unary Function that represents the identity function: it takes - * a single argument \c x, and returns \c x. - * - * \tparam T No requirements on \p T. - * - * The following code snippet demonstrates that \p identity returns its - * argument. - * - * \code - * #include - * #include - * ... - * int x = 137; - * thrust::identity id; - * assert(x == id(x)); - * \endcode - * - * \see http://www.sgi.com/tech/stl/identity.html - * \see unary_function - */ -template - struct identity : public unary_function -{ - /*! Function call operator. The return value is x. - */ - __host__ __device__ const T &operator()(const T &x) const {return x;} -}; // end identity - -/*! \p maximum is a function object that takes two arguments and returns the greater - * of the two.
Specifically, it is an Adaptable Binary Function. If \c f is an - * object of class maximum and \c x and \c y are objects of class \c T - * f(x,y) returns \c x if x > y and \c y, otherwise. - * - * \tparam T is a model of LessThan Comparable. - * - * The following code snippet demonstrates that \p maximum returns its - * greater argument. - * - * \code - * #include - * #include - * ... - * int x = 137; - * int y = -137; - * thrust::maximum mx; - * assert(x == mx(x,y)); - * \endcode - * - * \see minimum - * \see min - * \see binary_function - */ -template - struct maximum : public binary_function -{ - /*! Function call operator. The return value is rhs < lhs ? lhs : rhs. - */ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? rhs : lhs;} -}; // end maximum - -/*! \p minimum is a function object that takes two arguments and returns the lesser - * of the two. Specifically, it is an Adaptable Binary Function. If \c f is an - * object of class minimum and \c x and \c y are objects of class \c T - * f(x,y) returns \c x if x < y and \c y, otherwise. - * - * \tparam T is a model of LessThan Comparable. - * - * The following code snippet demonstrates that \p minimum returns its - * lesser argument. - * - * \code - * #include - * #include - * ... - * int x = 137; - * int y = -137; - * thrust::minimum mn; - * assert(y == mn(x,y)); - * \endcode - * - * \see maximum - * \see max - * \see binary_function - */ -template - struct minimum : public binary_function -{ - /*! Function call operator. The return value is lhs < rhs ? lhs : rhs. - */ - __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? lhs : rhs;} -}; // end minimum - -/*! \p project1st is a function object that takes two arguments and returns - * its first argument; the second argument is unused. It is essentially a - * generalization of identity to the case of a Binary Function. - * - * \code - * #include - * #include - * ... - * int x = 137; - * int y = -137; - * thrust::project1st pj1; - * assert(x == pj1(x,y)); - * \endcode - * - * \see identity - * \see project2nd - * \see binary_function - */ -template - struct project1st : public binary_function -{ - /*! Function call operator. The return value is lhs. - */ - __host__ __device__ const T1 &operator()(const T1 &lhs, const T2 &rhs) const {return lhs;} -}; // end project1st - -/*! \p project2nd is a function object that takes two arguments and returns - * its second argument; the first argument is unused. It is essentially a - * generalization of identity to the case of a Binary Function. - * - * \code - * #include - * #include - * ... - * int x = 137; - * int y = -137; - * thrust::project2nd pj2; - * assert(y == pj2(x,y)); - * \endcode - * - * \see identity - * \see project1st - * \see binary_function - */ -template - struct project2nd : public binary_function -{ - /*! Function call operator. The return value is rhs. - */ - __host__ __device__ const T2 &operator()(const T1 &lhs, const T2 &rhs) const {return rhs;} -}; // end project2nd - -/*! \} - */ - - -// odds and ends - -/*! \addtogroup function_object_adaptors - * \{ - */ - -/*! \p unary_negate is a function object adaptor: it is an Adaptable Predicate - * that represents the logical negation of some other Adaptable Predicate. - * That is: if \c f is an object of class unary_negate, - * then there exists an object \c pred of class \c AdaptablePredicate such - * that f(x) always returns the same value as !pred(x). 
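For illustration, a minimal sketch (the vector contents are made-up) of maximum in its most common role as a reduction operator: thrust::reduce combines the elements with thrust::maximum, using INT_MIN as the identity value of the max operation.

#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <climits>
#include <cstdio>

int main(void)
{
  thrust::device_vector<int> v(4);
  v[0] = 7; v[1] = -2; v[2] = 11; v[3] = 3;

  // reduce with maximum<int>; INT_MIN is the identity element for max
  int biggest = thrust::reduce(v.begin(), v.end(), INT_MIN, thrust::maximum<int>());
  printf("%d\n", biggest); // prints 11
  return 0;
}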
- * There is rarely any reason to construct a unary_negate directly; - * it is almost always easier to use the helper function not1. - * - * \see http://www.sgi.com/tech/stl/unary_negate.html - * \see not1 - */ -template -struct unary_negate - : public thrust::unary_function -{ - /*! Constructor takes a \p Predicate object to negate. - * \param p The \p Predicate object to negate. - */ - __host__ __device__ - explicit unary_negate(Predicate p) : pred(p){} - - /*! Function call operator. The return value is !pred(x). - */ - __host__ __device__ - bool operator()(const typename Predicate::argument_type& x) { return !pred(x); } - - /*! \cond */ - Predicate pred; - /*! \endcond */ -}; // end unary_negate - -/*! \p not1 is a helper function to simplify the creation of Adaptable Predicates: - * it takes an Adaptable Predicate \p pred as an argument and returns a new Adaptable - * Predicate that represents the negation of \p pred. That is: if \c pred is an object - * of a type which models Adaptable Predicate, then the the type of the result - * \c npred of not1(pred) is also a model of Adaptable Predicate and - * npred(x) always returns the same value as !pred(x). - * - * \param pred The Adaptable Predicate to negate. - * \return A new object, npred such that npred(x) always returns - * the same value as !pred(x). - * - * \tparam Predicate is a model of Adaptable Predicate. - * - * \see unary_negate - * \see not2 - */ -template - __host__ __device__ - unary_negate not1(const Predicate &pred); - -/*! \p binary_negate is a function object adaptor: it is an Adaptable Binary - * Predicate that represents the logical negation of some other Adaptable - * Binary Predicate. That is: if \c f is an object of class binary_negate, - * then there exists an object \c pred of class \c AdaptableBinaryPredicate - * such that f(x,y) always returns the same value as !pred(x,y). - * There is rarely any reason to construct a binary_negate directly; - * it is almost always easier to use the helper function not2. - * - * \see http://www.sgi.com/tech/stl/binary_negate.html - */ -template -struct binary_negate - : public thrust::binary_function -{ - /*! Constructor takes a \p Predicate object to negate. - * \param p The \p Predicate object to negate. - */ - __host__ __device__ - explicit binary_negate(Predicate p) : pred(p){} - - /*! Function call operator. The return value is !pred(x,y). - */ - __host__ __device__ - bool operator()(const typename Predicate::first_argument_type& x, const typename Predicate::second_argument_type& y) - { - return !pred(x,y); - } - - /*! \cond */ - Predicate pred; - /*! \endcond */ -}; // end binary_negate - -/*! \p not2 is a helper function to simplify the creation of Adaptable Binary Predicates: - * it takes an Adaptable Binary Predicate \p pred as an argument and returns a new Adaptable - * Binary Predicate that represents the negation of \p pred. That is: if \c pred is an object - * of a type which models Adaptable Binary Predicate, then the the type of the result - * \c npred of not2(pred) is also a model of Adaptable Binary Predicate and - * npred(x,y) always returns the same value as !pred(x,y). - * - * \param pred The Adaptable Binary Predicate to negate. - * \return A new object, npred such that npred(x,y) always returns - * the same value as !pred(x,y). - * - * \tparam Binary Predicate is a model of Adaptable Binary Predicate. - * - * \see binary_negate - * \see not1 - */ -template - __host__ __device__ - binary_negate not2(const BinaryPredicate &pred); - -/*! \} - */ - - -/*! 
 \addtogroup placeholder_objects Placeholder Objects - * \ingroup function_objects - * \{ - */ - - -/*! \namespace placeholders - * \brief Facilities for constructing simple functions inline. - * - * Objects in the \p thrust::placeholders namespace may be used to create simple arithmetic functions inline - * in an algorithm invocation. Combining placeholders such as \p _1 and \p _2 with arithmetic operations such as \c + - * creates an unnamed function object which applies the operation to their arguments. - * - * The type of placeholder objects is implementation-defined. - * - * The following code snippet demonstrates how to use the placeholders \p _1 and \p _2 with \p thrust::transform - * to implement the SAXPY computation: - * - * \code - * #include - * #include - * #include - * - * int main() - * { - * thrust::device_vector x(4), y(4); - * x[0] = 1; - * x[1] = 2; - * x[2] = 3; - * x[3] = 4; - * - * y[0] = 1; - * y[1] = 1; - * y[2] = 1; - * y[3] = 1; - * - * float a = 2.0f; - * - * using namespace thrust::placeholders; - * - * thrust::transform(x.begin(), x.end(), y.begin(), y.begin(), - * a * _1 + _2 - * ); - * - * // y is now {3, 5, 7, 9} - * } - * \endcode - */ -namespace placeholders -{ - - -/*! \p thrust::placeholders::_1 is the placeholder for the first function parameter. - */ -static const thrust::detail::functional::placeholder<0>::type _1; - - -/*! \p thrust::placeholders::_2 is the placeholder for the second function parameter. - */ -static const thrust::detail::functional::placeholder<1>::type _2; - - -/*! \p thrust::placeholders::_3 is the placeholder for the third function parameter. - */ -static const thrust::detail::functional::placeholder<2>::type _3; - - -/*! \p thrust::placeholders::_4 is the placeholder for the fourth function parameter. - */ -static const thrust::detail::functional::placeholder<3>::type _4; - - -/*! \p thrust::placeholders::_5 is the placeholder for the fifth function parameter. - */ -static const thrust::detail::functional::placeholder<4>::type _5; - - -/*! \p thrust::placeholders::_6 is the placeholder for the sixth function parameter. - */ -static const thrust::detail::functional::placeholder<5>::type _6; - - -/*! \p thrust::placeholders::_7 is the placeholder for the seventh function parameter. - */ -static const thrust::detail::functional::placeholder<6>::type _7; - - -/*! \p thrust::placeholders::_8 is the placeholder for the eighth function parameter. - */ -static const thrust::detail::functional::placeholder<7>::type _8; - - -/*! \p thrust::placeholders::_9 is the placeholder for the ninth function parameter. - */ -static const thrust::detail::functional::placeholder<8>::type _9; - - -/*! \p thrust::placeholders::_10 is the placeholder for the tenth function parameter. - */ -static const thrust::detail::functional::placeholder<9>::type _10; - - -} // end placeholders - - -/*! \} // placeholder_objects - */ - - -} // end thrust - -#include -#include - diff --git a/compat/thrust/gather.h b/compat/thrust/gather.h deleted file mode 100644 index f2b8233657..0000000000 --- a/compat/thrust/gather.h +++ /dev/null @@ -1,438 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file gather.h - * \brief Irregular copying from a source range - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup gathering - * \ingroup copying - * \{ - */ - - -/*! \p gather copies elements from a source array into a destination range according - * to a map. For each input iterator \c i in the range [map_first, map_last), the - * value input_first[\*i] is assigned to *(result + (i - map_first)). - * \p RandomAccessIterator must permit random access. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param map_first Beginning of the range of gather locations. - * \param map_last End of the range of gather locations. - * \param input_first Beginning of the source range. - * \param result Beginning of the destination range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam RandomAccessIterator must be a model of Random Access Iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator must be a model of Output Iterator. - * - * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). - * - * \remark \p gather is the inverse of thrust::scatter. - * - * The following code snippet demonstrates how to use \p gather to reorder - * a range using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * // mark even indices with a 1; odd indices with a 0 - * int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - * thrust::device_vector d_values(values, values + 10); - * - * // gather all even indices into the first half of the range - * // and odd indices to the last half of the range - * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; - * thrust::device_vector d_map(map, map + 10); - * - * thrust::device_vector d_output(10); - * thrust::gather(thrust::device, - * d_map.begin(), d_map.end(), - * d_values.begin(), - * d_output.begin()); - * // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} - * \endcode - */ -template - OutputIterator gather(const thrust::detail::execution_policy_base &exec, - InputIterator map_first, - InputIterator map_last, - RandomAccessIterator input_first, - OutputIterator result); - - -/*! \p gather copies elements from a source array into a destination range according - * to a map. For each input iterator \c i in the range [map_first, map_last), the - * value input_first[\*i] is assigned to *(result + (i - map_first)). - * \p RandomAccessIterator must permit random access. - * - * \param map_first Beginning of the range of gather locations. - * \param map_last End of the range of gather locations. - * \param input_first Beginning of the source range. - * \param result Beginning of the destination range. 
- * - * \tparam InputIterator must be a model of Input Iterator and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam RandomAccessIterator must be a model of Random Access Iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator must be a model of Output Iterator. - * - * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). - * - * \remark \p gather is the inverse of thrust::scatter. - * - * The following code snippet demonstrates how to use \p gather to reorder - * a range. - * - * \code - * #include - * #include - * ... - * // mark even indices with a 1; odd indices with a 0 - * int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - * thrust::device_vector d_values(values, values + 10); - * - * // gather all even indices into the first half of the range - * // and odd indices to the last half of the range - * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; - * thrust::device_vector d_map(map, map + 10); - * - * thrust::device_vector d_output(10); - * thrust::gather(d_map.begin(), d_map.end(), - * d_values.begin(), - * d_output.begin()); - * // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} - * \endcode - */ -template - OutputIterator gather(InputIterator map_first, - InputIterator map_last, - RandomAccessIterator input_first, - OutputIterator result); - - -/*! \p gather_if conditionally copies elements from a source array into a destination - * range according to a map. For each input iterator \c i in the range [map_first, map_last), - * such that the value of \*(stencil + (i - map_first)) is \c true, the value - * input_first[\*i] is assigned to *(result + (i - map_first)). - * \p RandomAccessIterator must permit random access. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param map_first Beginning of the range of gather locations. - * \param map_last End of the range of gather locations. - * \param stencil Beginning of the range of predicate values. - * \param input_first Beginning of the source range. - * \param result Beginning of the destination range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c bool. - * \tparam RandomAccessIterator must be a model of Random Access iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator must be a model of Output Iterator. - * - * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). - * \pre The range [stencil, stencil + (map_last - map_first)) shall not overlap the range [result, result + (map_last - map_first)). - * - * \remark \p gather_if is the inverse of \p scatter_if. - * - * The following code snippet demonstrates how to use \p gather_if to gather selected values from - * an input range using the \p thrust::device execution policy: - * - * \code - * #include - * #include - * #include - * ... 
- * - * int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - * thrust::device_vector d_values(values, values + 10); - * - * // select elements at even-indexed locations - * int stencil[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - * thrust::device_vector d_stencil(stencil, stencil + 10); - * - * // map all even indices into the first half of the range - * // and odd indices to the last half of the range - * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; - * thrust::device_vector d_map(map, map + 10); - * - * thrust::device_vector d_output(10, 7); - * thrust::gather_if(thrust::device, - * d_map.begin(), d_map.end(), - * d_stencil.begin(), - * d_values.begin(), - * d_output.begin()); - * // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7} - * \endcode - */ -template - OutputIterator gather_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result); - - -/*! \p gather_if conditionally copies elements from a source array into a destination - * range according to a map. For each input iterator \c i in the range [map_first, map_last), - * such that the value of \*(stencil + (i - map_first)) is \c true, the value - * input_first[\*i] is assigned to *(result + (i - map_first)). - * \p RandomAccessIterator must permit random access. - * - * \param map_first Beginning of the range of gather locations. - * \param map_last End of the range of gather locations. - * \param stencil Beginning of the range of predicate values. - * \param input_first Beginning of the source range. - * \param result Beginning of the destination range. - * - * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c bool. - * \tparam RandomAccessIterator must be a model of Random Access iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator must be a model of Output Iterator. - * - * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). - * \pre The range [stencil, stencil + (map_last - map_first)) shall not overlap the range [result, result + (map_last - map_first)). - * - * \remark \p gather_if is the inverse of \p scatter_if. - * - * The following code snippet demonstrates how to use \p gather_if to gather selected values from - * an input range. - * - * \code - * #include - * #include - * ... 
- * - * int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - * thrust::device_vector d_values(values, values + 10); - * - * // select elements at even-indexed locations - * int stencil[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - * thrust::device_vector d_stencil(stencil, stencil + 10); - * - * // map all even indices into the first half of the range - * // and odd indices to the last half of the range - * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; - * thrust::device_vector d_map(map, map + 10); - * - * thrust::device_vector d_output(10, 7); - * thrust::gather_if(d_map.begin(), d_map.end(), - * d_stencil.begin(), - * d_values.begin(), - * d_output.begin()); - * // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7} - * \endcode - */ -template - OutputIterator gather_if(InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result); - - -/*! \p gather_if conditionally copies elements from a source array into a destination - * range according to a map. For each input iterator \c i in the range [map_first, map_last) - * such that the value of pred(\*(stencil + (i - map_first))) is \c true, - * the value input_first[\*i] is assigned to *(result + (i - map_first)). - * \p RandomAccessIterator must permit random access. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param map_first Beginning of the range of gather locations. - * \param map_last End of the range of gather locations. - * \param stencil Beginning of the range of predicate values. - * \param input_first Beginning of the source range. - * \param result Beginning of the destination range. - * \param pred Predicate to apply to the stencil values. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type. - * \tparam RandomAccessIterator must be a model of Random Access iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator must be a model of Output Iterator. - * \tparam Predicate must be a model of Predicate. - * - * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). - * \pre The range [stencil, stencil + (map_last - map_first)) shall not overlap the range [result, result + (map_last - map_first)). - * - * \remark \p gather_if is the inverse of \p scatter_if. - * - * The following code snippet demonstrates how to use \p gather_if to gather selected values from - * an input range based on an arbitrary selection function using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int x) - * { - * return (x % 2) == 0; - * } - * }; - * ... 
- * - * int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - * thrust::device_vector d_values(values, values + 10); - * - * // we will select an element when our stencil is even - * int stencil[10] = {0, 3, 4, 1, 4, 1, 2, 7, 8, 9}; - * thrust::device_vector d_stencil(stencil, stencil + 10); - * - * // map all even indices into the first half of the range - * // and odd indices to the last half of the range - * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; - * thrust::device_vector d_map(map, map + 10); - * - * thrust::device_vector d_output(10, 7); - * thrust::gather_if(thrust::device, - * d_map.begin(), d_map.end(), - * d_stencil.begin(), - * d_values.begin(), - * d_output.begin(), - * is_even()); - * // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7} - * \endcode - */ -template - OutputIterator gather_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result, - Predicate pred); - - -/*! \p gather_if conditionally copies elements from a source array into a destination - * range according to a map. For each input iterator \c i in the range [map_first, map_last) - * such that the value of pred(\*(stencil + (i - map_first))) is \c true, - * the value input_first[\*i] is assigned to *(result + (i - map_first)). - * \p RandomAccessIterator must permit random access. - * - * \param map_first Beginning of the range of gather locations. - * \param map_last End of the range of gather locations. - * \param stencil Beginning of the range of predicate values. - * \param input_first Beginning of the source range. - * \param result Beginning of the destination range. - * \param pred Predicate to apply to the stencil values. - * - * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type. - * \tparam RandomAccessIterator must be a model of Random Access iterator and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator must be a model of Output Iterator. - * \tparam Predicate must be a model of Predicate. - * - * \pre The range [map_first, map_last) shall not overlap the range [result, result + (map_last - map_first)). - * \pre The range [stencil, stencil + (map_last - map_first)) shall not overlap the range [result, result + (map_last - map_first)). - * - * \remark \p gather_if is the inverse of \p scatter_if. - * - * The following code snippet demonstrates how to use \p gather_if to gather selected values from - * an input range based on an arbitrary selection function. - * - * \code - * #include - * #include - * - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int x) - * { - * return (x % 2) == 0; - * } - * }; - * ... 
- * - * int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - * thrust::device_vector d_values(values, values + 10); - * - * // we will select an element when our stencil is even - * int stencil[10] = {0, 3, 4, 1, 4, 1, 2, 7, 8, 9}; - * thrust::device_vector d_stencil(stencil, stencil + 10); - * - * // map all even indices into the first half of the range - * // and odd indices to the last half of the range - * int map[10] = {0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; - * thrust::device_vector d_map(map, map + 10); - * - * thrust::device_vector d_output(10, 7); - * thrust::gather_if(d_map.begin(), d_map.end(), - * d_stencil.begin(), - * d_values.begin(), - * d_output.begin(), - * is_even()); - * // d_output is now {0, 7, 4, 7, 8, 7, 3, 7, 7, 7} - * \endcode - */ -template - OutputIterator gather_if(InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result, - Predicate pred); - -/*! \} // gathering - */ - -} // end namespace thrust - -#include - diff --git a/compat/thrust/generate.h b/compat/thrust/generate.h deleted file mode 100644 index 1d52721a78..0000000000 --- a/compat/thrust/generate.h +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file generate.h - * \brief Fills a range with values "generated" from a function of no arguments - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup transformations - * \{ - */ - - -/*! \p generate assigns the result of invoking \p gen, a function object that takes no arguments, - * to each element in the range [first,last). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The first element in the range of interest. - * \param last The last element in the range of interest. - * \param gen A function argument, taking no parameters, used to generate values to assign to - * elements in the range [first,last). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam Generator is a model of Generator, - * and \p Generator's \c result_type is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to fill a \c host_vector with random numbers, - * using the standard C library function \c rand using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * #include - * ... 
- * thrust::host_vector v(10); - * srand(13); - * thrust::generate(thrust::host, v.begin(), v.end(), rand); - * - * // the elements of v are now pseudo-random numbers - * \endcode - * - * \see generate_n - * \see http://www.sgi.com/tech/stl/generate.html - */ -template - void generate(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Generator gen); - - -/*! \p generate assigns the result of invoking \p gen, a function object that takes no arguments, - * to each element in the range [first,last). - * - * \param first The first element in the range of interest. - * \param last The last element in the range of interest. - * \param gen A function argument, taking no parameters, used to generate values to assign to - * elements in the range [first,last). - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam Generator is a model of Generator, - * and \p Generator's \c result_type is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to fill a \c host_vector with random numbers, - * using the standard C library function \c rand. - * - * \code - * #include - * #include - * #include - * #include - * ... - * thrust::host_vector v(10); - * srand(13); - * thrust::generate(v.begin(), v.end(), rand); - * - * // the elements of v are now pseudo-random numbers - * \endcode - * - * \see generate_n - * \see http://www.sgi.com/tech/stl/generate.html - */ -template - void generate(ForwardIterator first, - ForwardIterator last, - Generator gen); - - -/*! \p generate_n assigns the result of invoking \p gen, a function object that takes no arguments, - * to each element in the range [first,first + n). The return value is first + n. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The first element in the range of interest. - * \param n The size of the range of interest. - * \param gen A function argument, taking no parameters, used to generate values to assign to - * elements in the range [first,first + n). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Size is an integral type (either signed or unsigned). - * \tparam Generator is a model of Generator, - * and \p Generator's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types. - * - * The following code snippet demonstrates how to fill a \c host_vector with random numbers, - * using the standard C library function \c rand using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * #include - * ... - * thrust::host_vector v(10); - * srand(13); - * thrust::generate_n(thrust::host, v.begin(), 10, rand); - * - * // the elements of v are now pseudo-random numbers - * \endcode - * - * \see generate - * \see http://www.sgi.com/tech/stl/generate.html - */ -template - OutputIterator generate_n(const thrust::detail::execution_policy_base &exec, - OutputIterator first, - Size n, - Generator gen); - - -/*! \p generate_n assigns the result of invoking \p gen, a function object that takes no arguments, - * to each element in the range [first,first + n). The return value is first + n. - * - * \param first The first element in the range of interest. - * \param n The size of the range of interest. 
- * \param gen A function argument, taking no parameters, used to generate values to assign to - * elements in the range [first,first + n). - * - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Size is an integral type (either signed or unsigned). - * \tparam Generator is a model of Generator, - * and \p Generator's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types. - * - * The following code snippet demonstrates how to fill a \c host_vector with random numbers, - * using the standard C library function \c rand. - * - * \code - * #include - * #include - * #include - * ... - * thrust::host_vector v(10); - * srand(13); - * thrust::generate_n(v.begin(), 10, rand); - * - * // the elements of v are now pseudo-random numbers - * \endcode - * - * \see generate - * \see http://www.sgi.com/tech/stl/generate.html - */ -template - OutputIterator generate_n(OutputIterator first, - Size n, - Generator gen); - - -/*! \} // end transformations - */ - -} // end namespace thrust - -#include - diff --git a/compat/thrust/host_vector.h b/compat/thrust/host_vector.h deleted file mode 100644 index 11b1ae0685..0000000000 --- a/compat/thrust/host_vector.h +++ /dev/null @@ -1,424 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file host_vector.h - * \brief A dynamically-sizable array of elements which reside in the "host" memory space - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -// forward declaration of device_vector -template class device_vector; - -/*! \addtogroup container_classes Container Classes - * \addtogroup host_containers Host Containers - * \ingroup container_classes - * \{ - */ - -/*! A \p host_vector is a container that supports random access to elements, - * constant time removal of elements at the end, and linear time insertion - * and removal of elements at the beginning or in the middle. The number of - * elements in a \p host_vector may vary dynamically; memory management is - * automatic. The memory associated with a \p host_vector resides in the memory - * space of the host associated with a parallel device. - * - * \see http://www.sgi.com/tech/stl/Vector.html - * \see device_vector - */ -template > - class host_vector - : public detail::vector_base -{ - private: - typedef detail::vector_base Parent; - - public: - /*! \cond */ - typedef typename Parent::size_type size_type; - typedef typename Parent::value_type value_type; - /*! \endcond */ - - /*! This constructor creates an empty \p host_vector. - */ - __host__ - host_vector(void) - :Parent() {} - - /*! This constructor creates a \p host_vector with the given - * size. - * \param n The number of elements to initially craete. - */ - __host__ - explicit host_vector(size_type n) - :Parent(n) {} - - /*! This constructor creates a \p host_vector with copies - * of an exemplar element. - * \param n The number of elements to initially create. 
- * \param value An element to copy. - */ - __host__ - explicit host_vector(size_type n, const value_type &value) - :Parent(n,value) {} - - /*! Copy constructor copies from an exemplar \p host_vector. - * \param v The \p host_vector to copy. - */ - __host__ - host_vector(const host_vector &v) - :Parent(v) {} - - /*! Assign operator copies from an exemplar \p host_vector. - * \param v The \p host_vector to copy. - */ - __host__ - host_vector &operator=(const host_vector &v) - { Parent::operator=(v); return *this; } - - /*! Copy constructor copies from an exemplar \p host_vector with different type. - * \param v The \p host_vector to copy. - */ - template - __host__ - host_vector(const host_vector &v) - :Parent(v) {} - - /*! Assign operator copies from an exemplar \p host_vector with different type. - * \param v The \p host_vector to copy. - */ - template - __host__ - host_vector &operator=(const host_vector &v) - { Parent::operator=(v); return *this; } - - /*! Copy constructor copies from an exemplar std::vector. - * \param v The std::vector to copy. - */ - template - __host__ - host_vector(const std::vector &v) - :Parent(v) {} - - /*! Assign operator copies from an exemplar std::vector. - * \param v The std::vector to copy. - */ - template - __host__ - host_vector &operator=(const std::vector &v) - { Parent::operator=(v); return *this;} - - /*! Copy constructor copies from an exemplar \p device_vector with possibly different type. - * \param v The \p device_vector to copy. - */ - template - __host__ - host_vector(const device_vector &v); - - /*! Assign operator copies from an exemplar \p device_vector. - * \param v The \p device_vector to copy. - */ - template - __host__ - host_vector &operator=(const device_vector &v) - { Parent::operator=(v); return *this; } - - /*! This constructor builds a \p host_vector from a range. - * \param first The beginning of the range. - * \param last The end of the range. - */ - template - __host__ - host_vector(InputIterator first, InputIterator last) - :Parent(first, last) {} - -// declare these members for the purpose of Doxygenating them -// they actually exist in a derived-from class -#if 0 - /*! \brief Resizes this vector to the specified number of elements. - * \param new_size Number of elements this vector should contain. - * \param x Data with which new elements should be populated. - * \throw std::length_error If n exceeds max_size(). - * - * This method will resize this vector to the specified number of - * elements. If the number is smaller than this vector's current - * size this vector is truncated, otherwise this vector is - * extended and new elements are populated with given data. - */ - void resize(size_type new_size, const value_type &x = value_type()); - - /*! Returns the number of elements in this vector. - */ - size_type size(void) const; - - /*! Returns the size() of the largest possible vector. - * \return The largest possible return value of size(). - */ - size_type max_size(void) const; - - /*! \brief If n is less than or equal to capacity(), this call has no effect. - * Otherwise, this method is a request for allocation of additional memory. If - * the request is successful, then capacity() is greater than or equal to - * n; otherwise, capacity() is unchanged. In either case, size() is unchanged. - * \throw std::length_error If n exceeds max_size(). - */ - void reserve(size_type n); - - /*! Returns the number of elements which have been reserved in this - * vector. - */ - size_type capacity(void) const; - - /*! 
This method shrinks the capacity of this vector to exactly - * fit its elements. - */ - void shrink_to_fit(void); - - /*! \brief Subscript access to the data contained in this vector_dev. - * \param n The index of the element for which data should be accessed. - * \return Read/write reference to data. - * - * This operator allows for easy, array-style, data access. - * Note that data access with this operator is unchecked and - * out_of_range lookups are not defined. - */ - reference operator[](size_type n); - - /*! \brief Subscript read access to the data contained in this vector_dev. - * \param n The index of the element for which data should be accessed. - * \return Read reference to data. - * - * This operator allows for easy, array-style, data access. - * Note that data access with this operator is unchecked and - * out_of_range lookups are not defined. - */ - const_reference operator[](size_type n) const; - - /*! This method returns an iterator pointing to the beginning of - * this vector. - * \return mStart - */ - iterator begin(void); - - /*! This method returns a const_iterator pointing to the beginning - * of this vector. - * \return mStart - */ - const_iterator begin(void) const; - - /*! This method returns a const_iterator pointing to the beginning - * of this vector. - * \return mStart - */ - const_iterator cbegin(void) const; - - /*! This method returns a reverse_iterator pointing to the beginning of - * this vector's reversed sequence. - * \return A reverse_iterator pointing to the beginning of this - * vector's reversed sequence. - */ - reverse_iterator rbegin(void); - - /*! This method returns a const_reverse_iterator pointing to the beginning of - * this vector's reversed sequence. - * \return A const_reverse_iterator pointing to the beginning of this - * vector's reversed sequence. - */ - const_reverse_iterator rbegin(void) const; - - /*! This method returns a const_reverse_iterator pointing to the beginning of - * this vector's reversed sequence. - * \return A const_reverse_iterator pointing to the beginning of this - * vector's reversed sequence. - */ - const_reverse_iterator crbegin(void) const; - - /*! This method returns an iterator pointing to one element past the - * last of this vector. - * \return begin() + size(). - */ - iterator end(void); - - /*! This method returns a const_iterator pointing to one element past the - * last of this vector. - * \return begin() + size(). - */ - const_iterator end(void) const; - - /*! This method returns a const_iterator pointing to one element past the - * last of this vector. - * \return begin() + size(). - */ - const_iterator cend(void) const; - - /*! This method returns a reverse_iterator pointing to one element past the - * last of this vector's reversed sequence. - * \return rbegin() + size(). - */ - reverse_iterator rend(void); - - /*! This method returns a const_reverse_iterator pointing to one element past the - * last of this vector's reversed sequence. - * \return rbegin() + size(). - */ - const_reverse_iterator rend(void) const; - - /*! This method returns a const_reverse_iterator pointing to one element past the - * last of this vector's reversed sequence. - * \return rbegin() + size(). - */ - const_reverse_iterator crend(void) const; - - /*! This method returns a const_reference referring to the first element of this - * vector. - * \return The first element of this vector. - */ - const_reference front(void) const; - - /*! This method returns a reference pointing to the first element of this - * vector. 
- * \return The first element of this vector. - */ - reference front(void); - - /*! This method returns a const reference pointing to the last element of - * this vector. - * \return The last element of this vector. - */ - const_reference back(void) const; - - /*! This method returns a reference referring to the last element of - * this vector_dev. - * \return The last element of this vector. - */ - reference back(void); - - /*! This method returns a pointer to this vector's first element. - * \return A pointer to the first element of this vector. - */ - pointer data(void); - - /*! This method returns a const_pointer to this vector's first element. - * \return a const_pointer to the first element of this vector. - */ - const_pointer data(void) const; - - /*! This method resizes this vector to 0. - */ - void clear(void); - - /*! This method returns true iff size() == 0. - * \return true if size() == 0; false, otherwise. - */ - bool empty(void) const; - - /*! This method appends the given element to the end of this vector. - * \param x The element to append. - */ - void push_back(const value_type &x); - - /*! This method erases the last element of this vector, invalidating - * all iterators and references to it. - */ - void pop_back(void); - - /*! This method swaps the contents of this vector_base with another vector. - * \param v The vector with which to swap. - */ - void swap(host_vector &v); - - /*! This method removes the element at position pos. - * \param pos The position of the element of interest. - * \return An iterator pointing to the new location of the element that followed the element - * at position pos. - */ - iterator erase(iterator pos); - - /*! This method removes the range of elements [first,last) from this vector. - * \param first The beginning of the range of elements to remove. - * \param last The end of the range of elements to remove. - * \return An iterator pointing to the new location of the element that followed the last - * element in the sequence [first,last). - */ - iterator erase(iterator first, iterator last); - - /*! This method inserts a single copy of a given exemplar value at the - * specified position in this vector. - * \param position The insertion position. - * \param x The exemplar element to copy & insert. - * \return An iterator pointing to the newly inserted element. - */ - iterator insert(iterator position, const T &x); - - /*! This method inserts a copy of an exemplar value to a range at the - * specified position in this vector. - * \param position The insertion position - * \param n The number of insertions to perform. - * \param x The value to replicate and insert. - */ - void insert(iterator position, size_type n, const T &x); - - /*! This method inserts a copy of an input range at the specified position - * in this vector. - * \param position The insertion position. - * \param first The beginning of the range to copy. - * \param last The end of the range to copy. - * - * \tparam InputIterator is a model of Assignable. - */ - template - void insert(iterator position, InputIterator first, InputIterator last); - - /*! This version of \p assign replicates a given exemplar - * \p n times into this vector. - * \param n The number of times to copy \p x. - * \param x The exemplar element to replicate. - */ - void assign(size_type n, const T &x); - - /*! This version of \p assign makes this vector a copy of a given input range. - * \param first The beginning of the range to copy. - * \param last The end of the range to copy. 
- * - * \tparam InputIterator is a model of Input Iterator. - */ - template - void assign(InputIterator first, InputIterator last); - - /*! This method returns a copy of this vector's allocator. - * \return A copy of the alloctor used by this vector. - */ - allocator_type get_allocator(void) const; -#endif // end doxygen-only members -}; // end host_vector - -/*! \} - */ - -} // end thrust - -#include - diff --git a/compat/thrust/inner_product.h b/compat/thrust/inner_product.h deleted file mode 100644 index 01f55414bd..0000000000 --- a/compat/thrust/inner_product.h +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file inner_product.h - * \brief Mathematical inner product between ranges - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup reductions - * \{ - * \addtogroup transformed_reductions Transformed Reductions - * \ingroup reductions - * \{ - */ - - -/*! \p inner_product calculates an inner product of the ranges - * [first1, last1) and [first2, first2 + (last1 - first1)). - * - * Specifically, this version of \p inner_product computes the sum - * init + (*first1 * *first2) + (*(first1+1) * *(first2+1)) + ... - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \param init Initial value of the result. - * \return The inner product of sequences [first1, last1) - * and [first2, last2) plus \p init. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputType is a model of Assignable, - * and if \c x is an object of type \p OutputType, and \c y is an object of \p InputIterator1's \c value_type, - * and \c z is an object of \p InputIterator2's \c value_type, then x + y * z is defined - * and is convertible to \p OutputType. - * - * The following code demonstrates how to use \p inner_product to - * compute the dot product of two vectors using the \p thrust::host execution policy for parallelization. - * - * \code - * #include - * #include - * ... - * float vec1[3] = {1.0f, 2.0f, 5.0f}; - * float vec2[3] = {4.0f, 1.0f, 5.0f}; - * - * float result = thrust::inner_product(thrust::host, vec1, vec1 + 3, vec2, 0.0f); - * - * // result == 31.0f - * \endcode - * - * \see http://www.sgi.com/tech/stl/inner_product.html - */ -template -OutputType inner_product(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputType init); - - -/*! \p inner_product calculates an inner product of the ranges - * [first1, last1) and [first2, first2 + (last1 - first1)). 
- * - * Specifically, this version of \p inner_product computes the sum - * init + (*first1 * *first2) + (*(first1+1) * *(first2+1)) + ... - * - * Unlike the C++ Standard Template Library function std::inner_product, - * this version offers no guarantee on order of execution. - * - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \param init Initial value of the result. - * \return The inner product of sequences [first1, last1) - * and [first2, last2) plus \p init. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputType is a model of Assignable, - * and if \c x is an object of type \p OutputType, and \c y is an object of \p InputIterator1's \c value_type, - * and \c z is an object of \p InputIterator2's \c value_type, then x + y * z is defined - * and is convertible to \p OutputType. - * - * The following code demonstrates how to use \p inner_product to - * compute the dot product of two vectors. - * - * \code - * #include - * ... - * float vec1[3] = {1.0f, 2.0f, 5.0f}; - * float vec2[3] = {4.0f, 1.0f, 5.0f}; - * - * float result = thrust::inner_product(vec1, vec1 + 3, vec2, 0.0f); - * - * // result == 31.0f - * \endcode - * - * \see http://www.sgi.com/tech/stl/inner_product.html - */ -template -OutputType inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputType init); - - -/*! \p inner_product calculates an inner product of the ranges - * [first1, last1) and [first2, first2 + (last1 - first1)). - * - * This version of \p inner_product is identical to the first, except that is uses - * two user-supplied function objects instead of \c operator+ and \c operator*. - * - * Specifically, this version of \p inner_product computes the sum - * binary_op1( init, binary_op2(*first1, *first2) ), ... - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \param init Initial value of the result. - * \param binary_op1 Generalized addition operation. - * \param binary_op2 Generalized multiplication operation. - * \return The inner product of sequences [first1, last1) and [first2, last2). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * and \p InputIterator1's \c value_type is convertible to \p BinaryFunction2's \c first_argument_type. - * \tparam InputIterator2 is a model of Input Iterator. - * and \p InputIterator2's \c value_type is convertible to \p BinaryFunction2's \c second_argument_type. - * \tparam OutputType is a model of Assignable, - * and \p OutputType is convertible to \p BinaryFunction1's \c first_argument_type. - * \tparam BinaryFunction1 is a model of Binary Function, - * and \p BinaryFunction1's \c return_type is convertible to \p OutputType. - * \tparam BinaryFunction2 is a model of Binary Function, - * and \p BinaryFunction2's \c return_type is convertible to \p BinaryFunction1's \c second_argument_type. - * - * \code - * #include - * #include - * ... 
- * float vec1[3] = {1.0f, 2.0f, 5.0f}; - * float vec2[3] = {4.0f, 1.0f, 5.0f}; - * - * float init = 0.0f; - * thrust::plus binary_op1; - * thrust::multiplies binary_op2; - * - * float result = thrust::inner_product(thrust::host, vec1, vec1 + 3, vec2, init, binary_op1, binary_op2); - * - * // result == 31.0f - * \endcode - * - * \see http://www.sgi.com/tech/stl/inner_product.html - */ -template -OutputType inner_product(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputType init, - BinaryFunction1 binary_op1, - BinaryFunction2 binary_op2); - - -/*! \p inner_product calculates an inner product of the ranges - * [first1, last1) and [first2, first2 + (last1 - first1)). - * - * This version of \p inner_product is identical to the first, except that is uses - * two user-supplied function objects instead of \c operator+ and \c operator*. - * - * Specifically, this version of \p inner_product computes the sum - * binary_op1( init, binary_op2(*first1, *first2) ), ... - * - * Unlike the C++ Standard Template Library function std::inner_product, - * this version offers no guarantee on order of execution. - * - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \param init Initial value of the result. - * \param binary_op1 Generalized addition operation. - * \param binary_op2 Generalized multiplication operation. - * \return The inner product of sequences [first1, last1) and [first2, last2). - * - * \tparam InputIterator1 is a model of Input Iterator, - * and \p InputIterator1's \c value_type is convertible to \p BinaryFunction2's \c first_argument_type. - * \tparam InputIterator2 is a model of Input Iterator. - * and \p InputIterator2's \c value_type is convertible to \p BinaryFunction2's \c second_argument_type. - * \tparam OutputType is a model of Assignable, - * and \p OutputType is convertible to \p BinaryFunction1's \c first_argument_type. - * \tparam BinaryFunction1 is a model of Binary Function, - * and \p BinaryFunction1's \c return_type is convertible to \p OutputType. - * \tparam BinaryFunction2 is a model of Binary Function, - * and \p BinaryFunction2's \c return_type is convertible to \p BinaryFunction1's \c second_argument_type. - * - * \code - * #include - * ... - * float vec1[3] = {1.0f, 2.0f, 5.0f}; - * float vec2[3] = {4.0f, 1.0f, 5.0f}; - * - * float init = 0.0f; - * thrust::plus binary_op1; - * thrust::multiplies binary_op2; - * - * float result = thrust::inner_product(vec1, vec1 + 3, vec2, init, binary_op1, binary_op2); - * - * // result == 31.0f - * \endcode - * - * \see http://www.sgi.com/tech/stl/inner_product.html - */ -template -OutputType inner_product(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, OutputType init, - BinaryFunction1 binary_op1, BinaryFunction2 binary_op2); - - -/*! \} // end transformed_reductions - * \} // end reductions - */ - -} // end namespace thrust - -#include - diff --git a/compat/thrust/iterator/constant_iterator.h b/compat/thrust/iterator/constant_iterator.h deleted file mode 100644 index e9e03c18c2..0000000000 --- a/compat/thrust/iterator/constant_iterator.h +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file thrust/iterator/constant_iterator.h - * \brief An iterator which returns a constant value when - * dereferenced - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup iterators - * \{ - */ - -/*! \addtogroup fancyiterator Fancy Iterators - * \ingroup iterators - * \{ - */ - -/*! \p constant_iterator is an iterator which represents a pointer into a range - * of constant values. This iterator is useful for creating a range filled with the same - * value without explicitly storing it in memory. Using \p constant_iterator saves both - * memory capacity and bandwidth. - * - * The following code snippet demonstrates how to create a \p constant_iterator whose - * \c value_type is \c int and whose value is \c 10. - * - * \code - * #include - * - * thrust::constant_iterator iter(10); - * - * *iter; // returns 10 - * iter[0]; // returns 10 - * iter[1]; // returns 10 - * iter[13]; // returns 10 - * - * // and so on... - * \endcode - * - * This next example demonstrates how to use a \p constant_iterator with the - * \p thrust::transform function to increment all elements of a sequence by the - * same value. We will create a temporary \p constant_iterator with the function - * \p make_constant_iterator function in order to avoid explicitly specifying - * its type: - * - * \code - * #include - * #include - * #include - * #include - * - * int main(void) - * { - * thrust::device_vector data(4); - * data[0] = 3; - * data[1] = 7; - * data[2] = 2; - * data[3] = 5; - * - * // add 10 to all values in data - * thrust::transform(data.begin(), data.end(), - * thrust::make_constant_iterator(10), - * data.begin(), - * thrust::plus()); - * - * // data is now [13, 17, 12, 15] - * - * return 0; - * } - * \endcode - * - * \see make_constant_iterator - */ -template - class constant_iterator - : public detail::constant_iterator_base::type -{ - /*! \cond - */ - friend class thrust::iterator_core_access; - typedef typename detail::constant_iterator_base::type super_t; - typedef typename detail::constant_iterator_base::incrementable incrementable; - typedef typename detail::constant_iterator_base::base_iterator base_iterator; - - public: - typedef typename super_t::reference reference; - typedef typename super_t::value_type value_type; - - /*! \endcond - */ - - /*! Null constructor initializes this \p constant_iterator's constant using its - * null constructor. - */ - __host__ __device__ - constant_iterator(void) - : super_t(), m_value(){}; - - /*! Copy constructor copies the value of another \p constant_iterator into this - * \p constant_iterator. - * - * \p rhs The constant_iterator to copy. - */ - __host__ __device__ - constant_iterator(constant_iterator const &rhs) - : super_t(rhs.base()), m_value(rhs.m_value) {} - - /*! Copy constructor copies the value of another \p constant_iterator with related - * System type. - * - * \param rhs The \p constant_iterator to copy. 
- */ - template - __host__ __device__ - constant_iterator(constant_iterator const &rhs, - typename thrust::detail::enable_if_convertible< - typename thrust::iterator_system >::type, - typename thrust::iterator_system::type - >::type * = 0) - : super_t(rhs.base()), m_value(rhs.value()) {} - - /*! This constructor receives a value to use as the constant value of this - * \p constant_iterator and an index specifying the location of this - * \p constant_iterator in a sequence. - * - * \p v The value of this \p constant_iterator's constant value. - * \p i The index of this \p constant_iterator in a sequence. Defaults to the - * value returned by \c Incrementable's null constructor. For example, - * when Incrementable == int, \c 0. - */ - __host__ __device__ - constant_iterator(value_type const& v, incrementable const &i = incrementable()) - : super_t(base_iterator(i)), m_value(v) {} - - /*! This constructor is templated to allow construction from a value type and - * incrementable type related this this \p constant_iterator's respective types. - * - * \p v The value of this \p constant_iterator's constant value. - * \p i The index of this \p constant_iterator in a sequence. Defaults to the - * value returned by \c Incrementable's null constructor. For example, - * when Incrementable == int, \c 0. - */ - template - __host__ __device__ - constant_iterator(OtherValue const& v, OtherIncrementable const& i = incrementable()) - : super_t(base_iterator(i)), m_value(v) {} - - /*! This method returns the value of this \p constant_iterator's constant value. - * \return A \c const reference to this \p constant_iterator's constant value. - */ - __host__ __device__ - Value const& value(void) const - { return m_value; } - - /*! \cond - */ - - protected: - __host__ __device__ - Value const& value_reference(void) const - { return m_value; } - - __host__ __device__ - Value & value_reference(void) - { return m_value; } - - private: // Core iterator interface - __host__ __device__ - reference dereference(void) const - { - return m_value; - } - - private: - Value m_value; - - /*! \endcond - */ -}; // end constant_iterator - - -/*! This version of \p make_constant_iterator creates a \p constant_iterator - * from values given for both value and index. The type of \p constant_iterator - * may be inferred by the compiler from the types of its parameters. - * - * \param x The value of the returned \p constant_iterator's constant value. - * \param i The index of the returned \p constant_iterator within a sequence. - * The type of this parameter defaults to \c int. In the default case, - * the value of this parameter is \c 0. - * - * \return A new \p constant_iterator with constant value & index as given - * by \p x & \p i. - * - * \see constant_iterator - */ -template -inline __host__ __device__ -constant_iterator make_constant_iterator(V x, I i = int()) -{ - return constant_iterator(x, i); -} // end make_constant_iterator() - - -/*! This version of \p make_constant_iterator creates a \p constant_iterator - * using only a parameter for the desired constant value. The value of the - * returned \p constant_iterator's index is set to \c 0. - * - * \param x The value of the returned \p constant_iterator's constant value. - * \return A new \p constant_iterator with constant value equal to \p x and - * index equal to \c 0. - * \see constant_iterator - */ -template -inline __host__ __device__ -constant_iterator make_constant_iterator(V x) -{ - return constant_iterator(x, 0); -} // end make_constant_iterator() - -/*! 
\} // end fancyiterators - */ - -/*! \} // end iterators - */ - -} // end namespace thrust - diff --git a/compat/thrust/iterator/counting_iterator.h b/compat/thrust/iterator/counting_iterator.h deleted file mode 100644 index 99812cae17..0000000000 --- a/compat/thrust/iterator/counting_iterator.h +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file thrust/iterator/counting_iterator.h - * \brief An iterator which returns an increasing incrementable value - * when dereferenced - */ - -/* - * Copyright David Abrahams 2003. - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - -#pragma once - -#include -#include -#include -#include - -// #include the details first -#include - -namespace thrust -{ - -/*! \addtogroup iterators - * \{ - */ - -/*! \addtogroup fancyiterator Fancy Iterators - * \ingroup iterators - * \{ - */ - -/*! \p counting_iterator is an iterator which represents a pointer into a range - * of sequentially changing values. This iterator is useful for creating a range - * filled with a sequence without explicitly storing it in memory. Using - * \p counting_iterator saves memory capacity and bandwidth. - * - * The following code snippet demonstrates how to create a \p counting_iterator whose - * \c value_type is \c int and which sequentially increments by \c 1. - * - * \code - * #include - * ... - * // create iterators - * thrust::counting_iterator first(10); - * thrust::counting_iterator last = first + 3; - * - * first[0] // returns 10 - * first[1] // returns 11 - * first[100] // returns 110 - * - * // sum of [first, last) - * thrust::reduce(first, last); // returns 33 (i.e. 10 + 11 + 12) - * - * // initialize vector to [0,1,2,..] - * thrust::counting_iterator iter(0); - * thrust::device_vector vec(500); - * thrust::copy(iter, iter + vec.size(), vec.begin()); - * \endcode - * - * This next example demonstrates how to use a \p counting_iterator with the - * \p thrust::copy_if function to compute the indices of the non-zero elements - * of a \p device_vector. In this example, we use the \p make_counting_iterator - * function to avoid specifying the type of the \p counting_iterator. 
- * - * \code - * #include - * #include - * #include - * #include - * - * int main(void) - * { - * // this example computes indices for all the nonzero values in a sequence - * - * // sequence of zero and nonzero values - * thrust::device_vector stencil(8); - * stencil[0] = 0; - * stencil[1] = 1; - * stencil[2] = 1; - * stencil[3] = 0; - * stencil[4] = 0; - * stencil[5] = 1; - * stencil[6] = 0; - * stencil[7] = 1; - * - * // storage for the nonzero indices - * thrust::device_vector indices(8); - * - * // compute indices of nonzero elements - * typedef thrust::device_vector::iterator IndexIterator; - * - * // use make_counting_iterator to define the sequence [0, 8) - * IndexIterator indices_end = thrust::copy_if(thrust::make_counting_iterator(0), - * thrust::make_counting_iterator(8), - * stencil.begin(), - * indices.begin(), - * thrust::identity()); - * // indices now contains [1,2,5,7] - * - * return 0; - * } - * \endcode - * - * \see make_counting_iterator - */ -template - class counting_iterator - : public detail::counting_iterator_base::type -{ - /*! \cond - */ - typedef typename detail::counting_iterator_base::type super_t; - - friend class thrust::iterator_core_access; - - public: - typedef typename super_t::reference reference; - typedef typename super_t::difference_type difference_type; - - /*! \endcond - */ - - /*! Null constructor initializes this \p counting_iterator's \c Incrementable - * counter using its null constructor. - */ - __host__ __device__ - counting_iterator(void){}; - - /*! Copy constructor copies the value of another \p counting_iterator into a - * new \p counting_iterator. - * - * \p rhs The \p counting_iterator to copy. - */ - __host__ __device__ - counting_iterator(counting_iterator const &rhs):super_t(rhs.base()){} - - /*! Copy constructor copies the value of another counting_iterator - * with related System type. - * - * \param rhs The \p counting_iterator to copy. - */ - template - __host__ __device__ - counting_iterator(counting_iterator const &rhs, - typename thrust::detail::enable_if_convertible< - typename thrust::iterator_system >::type, - typename thrust::iterator_system::type - >::type * = 0) - : super_t(rhs.base()){} - - /*! This \c explicit constructor copies the value of an \c Incrementable - * into a new \p counting_iterator's \c Incrementable counter. - * - * \param x The initial value of the new \p counting_iterator's \c Incrementable - * counter. - */ - __host__ __device__ - explicit counting_iterator(Incrementable x):super_t(x){} - - /*! \cond - */ - private: - __host__ __device__ - reference dereference(void) const - { - return this->base_reference(); - } - - // note that we implement equal specially for floating point counting_iterator - template - __host__ __device__ - bool equal(counting_iterator const& y) const - { - typedef thrust::detail::counting_iterator_equal e; - return e::equal(this->base(), y.base()); - } - - template - __host__ __device__ - difference_type - distance_to(counting_iterator const& y) const - { - typedef typename - thrust::detail::eval_if< - thrust::detail::is_numeric::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - >::type d; - - return d::distance(this->base(), y.base()); - } - - /*! \endcond - */ -}; // end counting_iterator - - -/*! \p make_counting_iterator creates a \p counting_iterator - * using an initial value for its \c Incrementable counter. - * - * \param x The initial value of the new \p counting_iterator's counter. 
- * \return A new \p counting_iterator whose counter has been initialized to \p x. - */ -template -inline __host__ __device__ -counting_iterator make_counting_iterator(Incrementable x) -{ - return counting_iterator(x); -} - -/*! \} // end fancyiterators - */ - -/*! \} // end iterators - */ - -} // end thrust - diff --git a/compat/thrust/iterator/detail/any_assign.h b/compat/thrust/iterator/detail/any_assign.h deleted file mode 100644 index e08a829ec0..0000000000 --- a/compat/thrust/iterator/detail/any_assign.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace detail -{ - - -// a type which may be assigned any other type -struct any_assign -{ - inline __host__ __device__ any_assign(void) - {} - - template - inline __host__ __device__ any_assign(T) - {} - - template - inline __host__ __device__ - any_assign &operator=(T) - { - if(0) - { - // trick the compiler into silencing "warning: this expression has no effect" - int *x = 0; - *x = 13; - } // end if - - return *this; - } -}; - - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/any_system_tag.h b/compat/thrust/iterator/detail/any_system_tag.h deleted file mode 100644 index fc6417ad8a..0000000000 --- a/compat/thrust/iterator/detail/any_system_tag.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -struct any_system_tag - : thrust::execution_policy -{ - // allow any_system_tag to convert to any type at all - // XXX make this safer using enable_if> upon c++11 - template operator T () const {return T();} -}; - -// TODO remove this in 1.7.0 -typedef THRUST_DEPRECATED any_system_tag any_space_tag; - -} // end thrust - diff --git a/compat/thrust/iterator/detail/constant_iterator_base.h b/compat/thrust/iterator/detail/constant_iterator_base.h deleted file mode 100644 index 276e5ff0ef..0000000000 --- a/compat/thrust/iterator/detail/constant_iterator_base.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -// forward declaration of constant_iterator -template class constant_iterator; - -namespace detail -{ - -template - struct constant_iterator_base -{ - typedef Value value_type; - - // the reference type is the same as the value_type. - // we wish to avoid returning a reference to the internal state - // of the constant_iterator, which is prone to subtle bugs. - // consider the temporary iterator created in the expression - // *(iter + i) - typedef value_type reference; - - // the incrementable type is int unless otherwise specified - typedef typename thrust::detail::ia_dflt_help< - Incrementable, - thrust::detail::identity_ - >::type incrementable; - - typedef typename thrust::counting_iterator< - incrementable, - System, - thrust::random_access_traversal_tag - > base_iterator; - - typedef typename thrust::iterator_adaptor< - constant_iterator, - base_iterator, - value_type, // XXX we may need to pass const value_type here as boost counting_iterator does - typename thrust::iterator_system::type, - typename thrust::iterator_traversal::type, - reference - > type; -}; // end constant_iterator_base - -} // end detail - -} // end thrust - diff --git a/compat/thrust/iterator/detail/counting_iterator.inl b/compat/thrust/iterator/detail/counting_iterator.inl deleted file mode 100644 index ad4fcffaa6..0000000000 --- a/compat/thrust/iterator/detail/counting_iterator.inl +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ - -// forward declaration of counting_iterator -template - class counting_iterator; - -namespace detail -{ - -template - struct counting_iterator_base -{ - typedef typename thrust::detail::eval_if< - // use any_system_tag if we are given use_default - thrust::detail::is_same::value, - thrust::detail::identity_, - thrust::detail::identity_ - >::type system; - - typedef typename thrust::detail::ia_dflt_help< - Traversal, - thrust::detail::eval_if< - thrust::detail::is_numeric::value, - thrust::detail::identity_, - thrust::iterator_traversal - > - >::type traversal; - - // unlike Boost, we explicitly use std::ptrdiff_t as the difference type - // for floating point counting_iterators - typedef typename thrust::detail::ia_dflt_help< - Difference, - thrust::detail::eval_if< - thrust::detail::is_numeric::value, - thrust::detail::eval_if< - thrust::detail::is_integral::value, - thrust::detail::numeric_difference, - thrust::detail::identity_ - >, - thrust::iterator_difference - > - >::type difference; - - // our implementation departs from Boost's in that counting_iterator::dereference - // returns a copy of its counter, rather than a reference to it. returning a reference - // to the internal state of an iterator causes subtle bugs (consider the temporary - // iterator created in the expression *(iter + i) ) and has no compelling use case - typedef thrust::iterator_adaptor< - counting_iterator, // self - Incrementable, // Base - Incrementable, // XXX we may need to pass const here as Boost does - system, - traversal, - Incrementable, - difference - > type; -}; // end counting_iterator_base - - -template - struct iterator_distance -{ - __host__ __device__ - static Difference distance(Incrementable1 x, Incrementable2 y) - { - return y - x; - } -}; - - -template - struct number_distance -{ - __host__ __device__ - static Difference distance(Incrementable1 x, Incrementable2 y) - { - return static_cast(numeric_distance(x,y)); - } -}; - - -template - struct counting_iterator_equal -{ - __host__ __device__ - static bool equal(Incrementable1 x, Incrementable2 y) - { - return x == y; - } -}; - - -// specialization for floating point equality -template - struct counting_iterator_equal< - Difference, - Incrementable1, - Incrementable2, - typename thrust::detail::enable_if< - thrust::detail::is_floating_point::value || - thrust::detail::is_floating_point::value - >::type - > -{ - __host__ __device__ - static bool equal(Incrementable1 x, Incrementable2 y) - { - typedef number_distance d; - return d::distance(x,y) == 0; - } -}; - - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/device_system_tag.h b/compat/thrust/iterator/detail/device_system_tag.h deleted file mode 100644 index ab66fb48bf..0000000000 --- a/compat/thrust/iterator/detail/device_system_tag.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// #include the device system's execution_policy header -#define __THRUST_DEVICE_SYSTEM_TAG_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/execution_policy.h> -#include __THRUST_DEVICE_SYSTEM_TAG_HEADER -#undef __THRUST_DEVICE_SYSTEM_TAG_HEADER - -namespace thrust -{ - -typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::tag device_system_tag; - -} // end thrust - -// TODO remove this in 1.8.0 -namespace thrust -{ - -typedef THRUST_DEPRECATED device_system_tag device_space_tag; - -} // end thrust - diff --git a/compat/thrust/iterator/detail/discard_iterator_base.h b/compat/thrust/iterator/detail/discard_iterator_base.h deleted file mode 100644 index 1909ca8239..0000000000 --- a/compat/thrust/iterator/detail/discard_iterator_base.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include // for std::ptrdiff_t - -namespace thrust -{ - -// forward declaration of discard_iterator -template class discard_iterator; - -namespace detail -{ - - -template - struct discard_iterator_base -{ - // XXX value_type should actually be void - // but this interferes with zip_iterator - typedef any_assign value_type; - typedef any_assign& reference; - typedef std::ptrdiff_t incrementable; - - typedef typename thrust::counting_iterator< - incrementable, - System, - thrust::random_access_traversal_tag - > base_iterator; - - typedef typename thrust::iterator_adaptor< - discard_iterator, - base_iterator, - value_type, - typename thrust::iterator_system::type, - typename thrust::iterator_traversal::type, - reference - > type; -}; // end discard_iterator_base - - -} // end detail - -} // end thrust - - diff --git a/compat/thrust/iterator/detail/distance_from_result.h b/compat/thrust/iterator/detail/distance_from_result.h deleted file mode 100644 index bf83e6ca44..0000000000 --- a/compat/thrust/iterator/detail/distance_from_result.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ - -namespace detail -{ - -// since both arguments are known to be specializations of iterator_facade, -// it's legal to access IteratorFacade2::difference_type -template - struct distance_from_result - : eval_if< - is_convertible::value, - identity_, - identity_ - > -{}; - -} // end detail - -} // end thrust - diff --git a/compat/thrust/iterator/detail/host_system_tag.h b/compat/thrust/iterator/detail/host_system_tag.h deleted file mode 100644 index 26d3f7d73f..0000000000 --- a/compat/thrust/iterator/detail/host_system_tag.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// #include the host system's execution_policy header -#define __THRUST_HOST_SYSTEM_TAG_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/execution_policy.h> -#include __THRUST_HOST_SYSTEM_TAG_HEADER -#undef __THRUST_HOST_SYSTEM_TAG_HEADER - -namespace thrust -{ - -typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::tag host_system_tag; - -} // end thrust - -// TODO remove this in 1.8.0 -namespace thrust -{ - -typedef THRUST_DEPRECATED host_system_tag host_space_tag; - -} // end thrust - diff --git a/compat/thrust/iterator/detail/is_iterator_category.h b/compat/thrust/iterator/detail/is_iterator_category.h deleted file mode 100644 index 95f14d558c..0000000000 --- a/compat/thrust/iterator/detail/is_iterator_category.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - -namespace detail -{ - -template - struct is_host_iterator_category - : thrust::detail::or_< - thrust::detail::is_convertible, - thrust::detail::is_convertible - > -{ -}; // end is_host_iterator_category - -template - struct is_device_iterator_category - : thrust::detail::or_< - thrust::detail::is_convertible, - thrust::detail::is_convertible - > -{ -}; // end is_device_iterator_category - - -template - struct is_iterator_category - : thrust::detail::or_< - is_host_iterator_category, - is_device_iterator_category - > -{ -}; // end is_iterator_category - -} // end detail - -} // end thrust - diff --git a/compat/thrust/iterator/detail/is_trivial_iterator.h b/compat/thrust/iterator/detail/is_trivial_iterator.h deleted file mode 100644 index ca37e74e64..0000000000 --- a/compat/thrust/iterator/detail/is_trivial_iterator.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#if __GNUC__ -// forward declaration of gnu's __normal_iterator -namespace __gnu_cxx -{ - -template class __normal_iterator; - -} // end __gnu_cxx -#endif // __GNUC__ - -#if _MSC_VER -// forward declaration of MSVC's "normal iterators" -namespace std -{ - -template struct _Ranit; - -} // end std -#endif // _MSC_VER - -namespace thrust -{ -namespace detail -{ - -#ifdef __GNUC__ -template - struct is_gnu_normal_iterator - : false_type -{}; - - -// catch gnu __normal_iterators -template - struct is_gnu_normal_iterator< __gnu_cxx::__normal_iterator > - : true_type -{}; -#endif // __GNUC__ - - -#ifdef _MSC_VER -// catch msvc _Ranit -template - struct is_convertible_to_msvc_Ranit : - is_convertible< - Iterator, - std::_Ranit< - typename iterator_value::type, - typename iterator_difference::type, - typename iterator_pointer::type, - typename iterator_reference::type - > - > -{}; -#endif // _MSC_VER - - -template - struct is_trivial_iterator : - integral_constant< - bool, - is_pointer::value - | thrust::detail::is_thrust_pointer::value -#if __GNUC__ - | is_gnu_normal_iterator::value -#endif // __GNUC__ -#ifdef _MSC_VER - | is_convertible_to_msvc_Ranit::value -#endif // _MSC_VER - > -{}; - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/iterator_adaptor_base.h b/compat/thrust/iterator/detail/iterator_adaptor_base.h deleted file mode 100644 index 8b77f05d81..0000000000 --- a/compat/thrust/iterator/detail/iterator_adaptor_base.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - - -// forward declaration of iterator_adaptor for iterator_adaptor_base below -template -class iterator_adaptor; - - -namespace detail -{ - -// If T is use_default, return the result of invoking -// DefaultNullaryFn, otherwise return T. -// XXX rename to dflt_help -template -struct ia_dflt_help - : thrust::detail::eval_if< - thrust::detail::is_same::value - , DefaultNullaryFn - , thrust::detail::identity_ - > -{ -}; // end ia_dflt_help - - -// A metafunction which computes an iterator_adaptor's base class, -// a specialization of iterator_facade. -template - struct iterator_adaptor_base -{ - typedef typename ia_dflt_help< - Value, - iterator_value - >::type value; - - typedef typename ia_dflt_help< - System, - thrust::iterator_system - >::type system; - - typedef typename ia_dflt_help< - Traversal, - thrust::iterator_traversal - >::type traversal; - - typedef typename ia_dflt_help< - Reference, - thrust::detail::eval_if< - thrust::detail::is_same::value, - thrust::iterator_reference, - thrust::detail::add_reference - > - >::type reference; - - typedef typename ia_dflt_help< - Difference, - iterator_difference - >::type difference; - - typedef thrust::iterator_facade< - Derived, - value, - system, - traversal, - reference, - difference - > type; -}; // end iterator_adaptor_base - - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/iterator_category_to_system.h b/compat/thrust/iterator/detail/iterator_category_to_system.h deleted file mode 100644 index 17e7d78c8f..0000000000 --- a/compat/thrust/iterator/detail/iterator_category_to_system.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - -// XXX WAR circular #inclusion with forward declarations -struct random_access_universal_iterator_tag; -struct input_universal_iterator_tag; -struct output_universal_iterator_tag; - -namespace detail -{ - -// forward declaration -template struct is_iterator_system; - -template struct device_iterator_category_to_backend_system; - -// XXX this should work entirely differently -// we should just specialize this metafunction for iterator_category_with_system_and_traversal -template - struct iterator_category_to_system - // convertible to any iterator? - : eval_if< - or_< - is_convertible, - is_convertible - >::value, - - detail::identity_, - - // convertible to host iterator? 
- eval_if< - or_< - is_convertible, - is_convertible - >::value, - - detail::identity_, - - // convertible to device iterator? - eval_if< - or_< - is_convertible, - is_convertible - >::value, - - detail::identity_, - - // unknown system - detail::identity_ - > // if device - > // if host - > // if any -{ -}; // end iterator_category_to_system - - -template - struct iterator_category_or_traversal_to_system - : eval_if< - is_iterator_system::value, - detail::identity_, - iterator_category_to_system - > -{ -}; // end iterator_category_or_traversal_to_system - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/iterator_category_to_traversal.h b/compat/thrust/iterator/detail/iterator_category_to_traversal.h deleted file mode 100644 index 04ef60c0c2..0000000000 --- a/compat/thrust/iterator/detail/iterator_category_to_traversal.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ - -// XXX WAR circular #inclusion with these forward declarations -struct bidirectional_universal_iterator_tag; -struct forward_universal_iterator_tag; - -namespace detail -{ - -// forward declarations -template struct is_iterator_system; -template struct is_iterator_traversal; - -// make type_traits easy to access -using namespace thrust::detail; - -template - struct host_system_category_to_traversal - : eval_if< - is_convertible::value, - detail::identity_, - eval_if< - is_convertible::value, - detail::identity_, - eval_if< - is_convertible::value, - detail::identity_, - eval_if< - is_convertible::value, - detail::identity_, - eval_if< - is_convertible::value, - detail::identity_, - void - > - > - > - > - > -{ -}; // end host_system_category_to_traversal - - - -template - struct device_system_category_to_traversal - : eval_if< - is_convertible::value, - detail::identity_, - eval_if< - is_convertible::value, - detail::identity_, - eval_if< - is_convertible::value, - detail::identity_, - eval_if< - is_convertible::value, - detail::identity_, - eval_if< - is_convertible::value, - detail::identity_, - void - > - > - > - > - > -{ -}; // end device_system_category_to_traversal - - - -template - struct any_system_category_to_traversal - : eval_if< - is_convertible::value, - identity_, - eval_if< - is_convertible::value, - identity_, - eval_if< - is_convertible::value, - identity_, - eval_if< - is_convertible::value, - identity_, - eval_if< - is_convertible::value, - identity_, - - // unknown traversal - void - > - > - > - > - > -{ -}; // end any_system_category_to_traversal - - -template - struct category_to_traversal - // check for any system - : eval_if< - or_< - is_convertible, - is_convertible - >::value, - - any_system_category_to_traversal, - - // check for host system - eval_if< - or_< - is_convertible, - is_convertible - >::value, - - host_system_category_to_traversal, - - // check for device system - eval_if< - 
or_< - is_convertible, - is_convertible - >::value, - - device_system_category_to_traversal, - - // unknown category - void - > - > - > -{}; - - -template - struct iterator_category_to_traversal - : eval_if< - is_iterator_traversal::value, - detail::identity_, - category_to_traversal - > -{ -}; // end iterator_category_to_traversal - - -} // end detail - -} // end thrust - diff --git a/compat/thrust/iterator/detail/iterator_facade_category.h b/compat/thrust/iterator/detail/iterator_facade_category.h deleted file mode 100644 index fbb8bd6451..0000000000 --- a/compat/thrust/iterator/detail/iterator_facade_category.h +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - -namespace detail -{ - -template - struct iterator_category_with_system_and_traversal - : Category -{ -}; // end iterator_category_with_system_and_traversal - -// specialize iterator_category_to_system for iterator_category_with_system_and_traversal -template struct iterator_category_to_system; - -template - struct iterator_category_to_system > -{ - typedef System type; -}; // end iterator_category_with_system_and_traversal - - -// adapted from http://www.boost.org/doc/libs/1_37_0/libs/iterator/doc/iterator_facade.html#iterator-category -// -// in our implementation, R need not be a reference type to result in a category -// derived from forward_XXX_iterator_tag -// -// iterator-category(T,V,R) := -// if(T is convertible to input_host_iterator_tag -// || T is convertible to output_host_iterator_tag -// || T is convertible to input_device_iterator_tag -// || T is convertible to output_device_iterator_tag -// ) -// return T -// -// else if (T is not convertible to incrementable_traversal_tag) -// the program is ill-formed -// -// else return a type X satisfying the following two constraints: -// -// 1. X is convertible to X1, and not to any more-derived -// type, where X1 is defined by: -// -// if (T is convertible to forward_traversal_tag) -// { -// if (T is convertible to random_access_traversal_tag) -// X1 = random_access_host_iterator_tag -// else if (T is convertible to bidirectional_traversal_tag) -// X1 = bidirectional_host_iterator_tag -// else -// X1 = forward_host_iterator_tag -// } -// else -// { -// if (T is convertible to single_pass_traversal_tag -// && R is convertible to V) -// X1 = input_host_iterator_tag -// else -// X1 = T -// } -// -// 2. category-to-traversal(X) is convertible to the most -// derived traversal tag type to which X is also convertible, -// and not to any more-derived traversal tag type. - - -template - struct iterator_facade_default_category; - - -// Thrust's implementation of iterator_facade_default_category is slightly -// different from Boost's equivalent. 
-// Thrust does not check is_convertible because Reference -// may not be a complete type at this point, and implementations of is_convertible -// typically require that both types be complete. -// Instead, it simply assumes that if is_convertible, -// then the category is input_iterator_tag - - -// this is the function for standard system iterators -template - struct iterator_facade_default_category_std : - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_, - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_, - thrust::detail::identity_ - > - >, - thrust::detail::eval_if< // XXX note we differ from Boost here - thrust::detail::is_convertible::value, - thrust::detail::identity_, - thrust::detail::identity_ - > - > -{ -}; // end iterator_facade_default_category_std - - -// this is the function for host system iterators -template - struct iterator_facade_default_category_host : - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_, - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_, - thrust::detail::identity_ - > - >, - thrust::detail::eval_if< // XXX note we differ from Boost here - thrust::detail::is_convertible::value, - thrust::detail::identity_, - thrust::detail::identity_ - > - > -{ -}; // end iterator_facade_default_category_host - - -// this is the function for device system iterators -template - struct iterator_facade_default_category_device : - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_, - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_, - thrust::detail::identity_ - > - >, - thrust::detail::eval_if< - thrust::detail::is_convertible::value, // XXX note we differ from Boost here - thrust::detail::identity_, - thrust::detail::identity_ - > - > -{ -}; // end iterator_facade_default_category_device - - -// this is the function for any system iterators -template - struct iterator_facade_default_category_any : - thrust::detail::eval_if< - - thrust::detail::is_convertible::value, - - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_, - - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - thrust::detail::identity_, - thrust::detail::identity_ - > - >, - - thrust::detail::eval_if< - thrust::detail::is_convertible::value, // XXX note we differ from Boost here - thrust::detail::identity_, - thrust::detail::identity_ - > - > -{ -}; // end iterator_facade_default_category_any - - -template - struct iterator_facade_default_category - // check for any system - : thrust::detail::eval_if< - thrust::detail::is_convertible::value, - iterator_facade_default_category_any, - - // check for host system - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - iterator_facade_default_category_host, - - // check for device system - thrust::detail::eval_if< - thrust::detail::is_convertible::value, - iterator_facade_default_category_device, - - // if we don't recognize the system, get a standard iterator category - // and combine it with System & Traversal - thrust::detail::identity_< - thrust::detail::iterator_category_with_system_and_traversal< - typename 
iterator_facade_default_category_std::type, - System, - Traversal - > - > - > - > - > -{}; - - -template - struct iterator_facade_category_impl -{ - typedef typename iterator_facade_default_category< - System,Traversal,ValueParam,Reference - >::type category; - - // we must be able to deduce both Traversal & System from category - // otherwise, munge them all together - typedef typename thrust::detail::eval_if< - thrust::detail::and_< - thrust::detail::is_same< - Traversal, - typename thrust::detail::iterator_category_to_traversal::type - >, - thrust::detail::is_same< - System, - typename thrust::detail::iterator_category_to_system::type - > - >::value, - thrust::detail::identity_, - thrust::detail::identity_ > - >::type type; -}; // end iterator_facade_category_impl - - -template - struct iterator_facade_category -{ - typedef typename - thrust::detail::eval_if< - thrust::detail::is_iterator_category::value, - thrust::detail::identity_, // categories are fine as-is - iterator_facade_category_impl - >::type type; -}; // end iterator_facade_category - - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/iterator_traits.inl b/compat/thrust/iterator/detail/iterator_traits.inl deleted file mode 100644 index 924eabb187..0000000000 --- a/compat/thrust/iterator/detail/iterator_traits.inl +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file iterator_traits.inl - * \brief Inline file for iterator_traits.h. 
- */ - -#include -#include -#include - -namespace thrust -{ - -template - struct iterator_value -{ - typedef typename thrust::iterator_traits::value_type type; -}; // end iterator_value - - -template - struct iterator_pointer -{ - typedef typename thrust::iterator_traits::pointer type; -}; // end iterator_pointer - - -template - struct iterator_reference -{ - typedef typename iterator_traits::reference type; -}; // end iterator_reference - - -template - struct iterator_difference -{ - typedef typename thrust::iterator_traits::difference_type type; -}; // end iterator_difference - - -template - struct iterator_system - : detail::iterator_category_to_system< - typename thrust::iterator_traits::iterator_category - > -{ -}; // end iterator_system - -// specialize iterator_system for void *, which has no category -template<> - struct iterator_system -{ - typedef thrust::iterator_system::type type; -}; // end iterator_system - -template<> - struct iterator_system -{ - typedef thrust::iterator_system::type type; -}; // end iterator_system - - -template - struct iterator_traversal - : detail::iterator_category_to_traversal< - typename thrust::iterator_traits::iterator_category - > -{ -}; // end iterator_traversal - -namespace detail -{ - -template - struct is_iterator_traversal - : thrust::detail::is_convertible -{ -}; // end is_iterator_traversal - - -template - struct is_iterator_system - : detail::or_< - detail::is_convertible, - detail::or_< - detail::is_convertible, - detail::is_convertible - > - > -{ -}; // end is_iterator_system - - -} // end namespace detail -} // end namespace thrust - diff --git a/compat/thrust/iterator/detail/iterator_traversal_tags.h b/compat/thrust/iterator/detail/iterator_traversal_tags.h deleted file mode 100644 index dcbebf3fda..0000000000 --- a/compat/thrust/iterator/detail/iterator_traversal_tags.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace thrust -{ - -// define Boost's traversal tags -struct no_traversal_tag {}; - -struct incrementable_traversal_tag - : no_traversal_tag {}; - -struct single_pass_traversal_tag - : incrementable_traversal_tag {}; - -struct forward_traversal_tag - : single_pass_traversal_tag {}; - -struct bidirectional_traversal_tag - : forward_traversal_tag {}; - -struct random_access_traversal_tag - : bidirectional_traversal_tag {}; - -} // end thrust - diff --git a/compat/thrust/iterator/detail/minimum_category.h b/compat/thrust/iterator/detail/minimum_category.h deleted file mode 100644 index e07e09636e..0000000000 --- a/compat/thrust/iterator/detail/minimum_category.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
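For context, the trait metafunctions removed above (iterator_value, iterator_difference, iterator_system, iterator_traversal) are the usual way generic Thrust code queries an iterator. A minimal usage sketch, assuming only the public Thrust headers:

#include <thrust/device_vector.h>
#include <thrust/iterator/iterator_traits.h>

// Generic distance helper: iterator_difference resolves to the iterator's
// difference_type (e.g. std::ptrdiff_t for device_vector<int>::iterator).
template <typename Iterator>
typename thrust::iterator_difference<Iterator>::type
range_length(Iterator first, Iterator last)
{
    return last - first;
}

// Generic "first element" helper: iterator_value resolves to the value_type.
template <typename Iterator>
typename thrust::iterator_value<Iterator>::type
front_value(Iterator first)
{
    return *first;
}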
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ - -namespace detail -{ - -template - struct minimum_category - : minimum_type -{ -}; // end minimum_category - -} // end detail - -} // end thrust - - diff --git a/compat/thrust/iterator/detail/minimum_system.h b/compat/thrust/iterator/detail/minimum_system.h deleted file mode 100644 index 5448a0d1f0..0000000000 --- a/compat/thrust/iterator/detail/minimum_system.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace detail -{ - -template - struct minimum_system - : minimum_type -{ -}; // end minimum_system - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/normal_iterator.h b/compat/thrust/iterator/detail/normal_iterator.h deleted file mode 100644 index 7fe61bfed3..0000000000 --- a/compat/thrust/iterator/detail/normal_iterator.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file normal_iterator.h - * \brief Defines the interface to an iterator class - * which adapts a pointer type. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -template - class normal_iterator - : public iterator_adaptor< - normal_iterator, - Pointer - > -{ - typedef iterator_adaptor, Pointer> super_t; - - public: - __host__ __device__ - normal_iterator() {} - - __host__ __device__ - normal_iterator(Pointer p) - : super_t(p) {} - - template - __host__ __device__ - normal_iterator(const normal_iterator &other, - typename thrust::detail::enable_if_convertible< - OtherPointer, - Pointer - >::type * = 0) - : super_t(other.base()) {} - -}; // end normal_iterator - - -template - inline __host__ __device__ normal_iterator make_normal_iterator(Pointer ptr) -{ - return normal_iterator(ptr); -} - - -template struct is_trivial_iterator< normal_iterator > : public true_type {}; - - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/permutation_iterator_base.h b/compat/thrust/iterator/detail/permutation_iterator_base.h deleted file mode 100644 index a145b88aec..0000000000 --- a/compat/thrust/iterator/detail/permutation_iterator_base.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -template class permutation_iterator; - - -namespace detail -{ - -template - struct permutation_iterator_base -{ - typedef typename thrust::iterator_system::type System1; - typedef typename thrust::iterator_system::type System2; - - typedef thrust::iterator_adaptor< - permutation_iterator, - IndexIterator, - typename thrust::iterator_value::type, - typename detail::minimum_system::type, - thrust::use_default, - typename thrust::iterator_reference::type - > type; -}; // end permutation_iterator_base - -} // end detail - -} // end thrust - diff --git a/compat/thrust/iterator/detail/retag.h b/compat/thrust/iterator/detail/retag.h deleted file mode 100644 index 4417fa5604..0000000000 --- a/compat/thrust/iterator/detail/retag.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
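permutation_iterator_base above only computes the base class of thrust::permutation_iterator. As an illustrative sketch (not taken from the deleted sources, assuming standard Thrust headers only), the resulting iterator is typically used to gather values through an index sequence:

#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/reduce.h>

// Sum values[indices[0]], values[indices[1]], ... without materializing the gather.
float sum_selected(const thrust::device_vector<float>& values,
                   const thrust::device_vector<int>&   indices)
{
    return thrust::reduce(
        thrust::make_permutation_iterator(values.begin(), indices.begin()),
        thrust::make_permutation_iterator(values.begin(), indices.end()),
        0.0f);
}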
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -// we can retag an iterator if FromTag converts to ToTag -// or vice versa -template - struct is_retaggable - : integral_constant< - bool, - (is_convertible::value || is_convertible::value) - > -{}; - - -template - struct enable_if_retaggable - : enable_if< - is_retaggable::value, - Result - > -{}; // end enable_if_retaggable - - -} // end detail - - -template - thrust::detail::tagged_iterator - reinterpret_tag(Iterator iter) -{ - return thrust::detail::tagged_iterator(iter); -} // end reinterpret_tag() - - -// specialization for raw pointer -template - thrust::pointer - reinterpret_tag(T *ptr) -{ - return thrust::pointer(ptr); -} // end reinterpret_tag() - - -// specialization for thrust::pointer -template - thrust::pointer - reinterpret_tag(thrust::pointer ptr) -{ - return reinterpret_tag(ptr.get()); -} // end reinterpret_tag() - - -// avoid deeply-nested tagged_iterator -template - thrust::detail::tagged_iterator - reinterpret_tag(thrust::detail::tagged_iterator iter) -{ - return reinterpret_tag(iter.base()); -} // end reinterpret_tag() - - -template - typename thrust::detail::enable_if_retaggable< - typename thrust::iterator_system::type, - Tag, - thrust::detail::tagged_iterator - >::type - retag(Iterator iter) -{ - return reinterpret_tag(iter); -} // end retag() - - -// specialization for raw pointer -template - typename thrust::detail::enable_if_retaggable< - typename thrust::iterator_system::type, - Tag, - thrust::pointer - >::type - retag(T *ptr) -{ - return reinterpret_tag(ptr); -} // end retag() - - -// specialization for thrust::pointer -template - typename thrust::detail::enable_if_retaggable< - OtherTag, - Tag, - thrust::pointer - >::type - retag(thrust::pointer ptr) -{ - return reinterpret_tag(ptr); -} // end retag() - - -// avoid deeply-nested tagged_iterator -template - typename thrust::detail::enable_if_retaggable< - OtherTag, - Tag, - thrust::detail::tagged_iterator - >::type - retag(thrust::detail::tagged_iterator iter) -{ - return reinterpret_tag(iter); -} // end retag() - - -} // end thrust - diff --git a/compat/thrust/iterator/detail/reverse_iterator.inl b/compat/thrust/iterator/detail/reverse_iterator.inl deleted file mode 100644 index 03e9032130..0000000000 --- a/compat/thrust/iterator/detail/reverse_iterator.inl +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
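A rough sketch of how the raw-pointer overload of reinterpret_tag defined above is meant to be used; the public header name and the choice of thrust::host_system_tag are assumptions on my part, not taken from this diff:

#include <thrust/iterator/retag.h>   // assumed public header for retag/reinterpret_tag
#include <thrust/reduce.h>

int sum_host_array(int* data, int n)
{
    // Wrapping the raw pointer as thrust::pointer<int, host_system_tag> tells the
    // algorithm dispatch machinery to treat the range as host memory.
    return thrust::reduce(
        thrust::reinterpret_tag<thrust::host_system_tag>(data),
        thrust::reinterpret_tag<thrust::host_system_tag>(data + n));
}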
- */ - -#include -#include - -namespace thrust -{ - -namespace detail -{ - -__thrust_hd_warning_disable__ -template -__host__ __device__ - Iterator prior(Iterator x) -{ - return --x; -} // end prior() - -} // end detail - -template - reverse_iterator - ::reverse_iterator(BidirectionalIterator x) - :super_t(x) -{ -} // end reverse_iterator::reverse_iterator() - -template - template - reverse_iterator - ::reverse_iterator(reverse_iterator const &r -// XXX msvc screws this up -#ifndef _MSC_VER - , typename thrust::detail::enable_if< - thrust::detail::is_convertible< - OtherBidirectionalIterator, - BidirectionalIterator - >::value - >::type * -#endif // _MSC_VER - ) - :super_t(r.base()) -{ -} // end reverse_iterator::reverse_iterator() - -template - typename reverse_iterator::super_t::reference - reverse_iterator - ::dereference(void) const -{ - return *thrust::detail::prior(this->base()); -} // end reverse_iterator::increment() - -template - void reverse_iterator - ::increment(void) -{ - --this->base_reference(); -} // end reverse_iterator::increment() - -template - void reverse_iterator - ::decrement(void) -{ - ++this->base_reference(); -} // end reverse_iterator::decrement() - -template - void reverse_iterator - ::advance(typename super_t::difference_type n) -{ - this->base_reference() += -n; -} // end reverse_iterator::advance() - -template - template - typename reverse_iterator::super_t::difference_type - reverse_iterator - ::distance_to(reverse_iterator const &y) const -{ - return this->base_reference() - y.base(); -} // end reverse_iterator::distance_to() - -template -__host__ __device__ -reverse_iterator make_reverse_iterator(BidirectionalIterator x) -{ - return reverse_iterator(x); -} // end make_reverse_iterator() - - -} // end thrust - diff --git a/compat/thrust/iterator/detail/reverse_iterator_base.h b/compat/thrust/iterator/detail/reverse_iterator_base.h deleted file mode 100644 index c10c5b73ff..0000000000 --- a/compat/thrust/iterator/detail/reverse_iterator_base.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -template class reverse_iterator; - -namespace detail -{ - -template - struct reverse_iterator_base -{ - typedef thrust::iterator_adaptor< - thrust::reverse_iterator, - BidirectionalIterator - > type; -}; // end reverse_iterator_base - -} // end detail - -} // end thrust - diff --git a/compat/thrust/iterator/detail/tagged_iterator.h b/compat/thrust/iterator/detail/tagged_iterator.h deleted file mode 100644 index 69e6445183..0000000000 --- a/compat/thrust/iterator/detail/tagged_iterator.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
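As a usage sketch (assuming standard Thrust headers only), the make_reverse_iterator helper implemented above lets any algorithm walk a range back-to-front:

#include <thrust/device_vector.h>
#include <thrust/iterator/reverse_iterator.h>
#include <thrust/copy.h>

// Copy v into r in reverse order; r becomes { v[n-1], ..., v[0] }.
void reversed_copy(const thrust::device_vector<int>& v, thrust::device_vector<int>& r)
{
    thrust::copy(thrust::make_reverse_iterator(v.end()),
                 thrust::make_reverse_iterator(v.begin()),
                 r.begin());
}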
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -template class tagged_iterator; - -template - struct tagged_iterator_base -{ - typedef thrust::iterator_adaptor< - tagged_iterator, - Iterator, - typename thrust::iterator_value::type, - Tag, - typename thrust::iterator_traversal::type, - typename thrust::iterator_reference::type, - typename thrust::iterator_difference::type - > type; -}; // end tagged_iterator_base - -template - class tagged_iterator - : public tagged_iterator_base::type -{ - private: - typedef typename tagged_iterator_base::type super_t; - - public: - __host__ __device__ - tagged_iterator(void) {} - - __host__ __device__ - explicit tagged_iterator(Iterator x) - : super_t(x) {} -}; // end tagged_iterator - - -// specialize is_trivial_iterator for tagged_iterator -template struct is_trivial_iterator; - -// tagged_iterator is trivial if its base iterator is -template - struct is_trivial_iterator > - : is_trivial_iterator -{}; - - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/transform_iterator.inl b/compat/thrust/iterator/detail/transform_iterator.inl deleted file mode 100644 index a5a36a78be..0000000000 --- a/compat/thrust/iterator/detail/transform_iterator.inl +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ - -template - class transform_iterator; - -namespace detail -{ - -// Compute the iterator_adaptor instantiation to be used for transform_iterator -template -struct transform_iterator_base -{ - private: - // By default, dereferencing the iterator yields the same as the function. - typedef typename thrust::detail::ia_dflt_help< - Reference, - thrust::detail::result_of::type)> - >::type reference; - - // To get the default for Value: remove any reference on the - // result type, but retain any constness to signal - // non-writability. Note that if we adopt Thomas' suggestion - // to key non-writability *only* on the Reference argument, - // we'd need to strip constness here as well. 
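The adaptor whose base class is computed by transform_iterator_base is normally consumed through make_transform_iterator. A minimal sketch, assuming standard Thrust headers:

#include <thrust/device_vector.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>

// Sum of -v[i]: the negation is applied lazily on each dereference,
// so no temporary vector is allocated.
float sum_of_negations(const thrust::device_vector<float>& v)
{
    return thrust::reduce(
        thrust::make_transform_iterator(v.begin(), thrust::negate<float>()),
        thrust::make_transform_iterator(v.end(),   thrust::negate<float>()),
        0.0f);
}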
- typedef typename thrust::detail::ia_dflt_help< - Value, - thrust::detail::remove_reference - >::type cv_value_type; - - public: - typedef thrust::iterator_adaptor - < - transform_iterator - , Iterator - , cv_value_type - , thrust::use_default // Leave the system alone - //, thrust::use_default // Leave the traversal alone - // use the Iterator's category to let any system iterators remain random access even though - // transform_iterator's reference type may not be a reference - // XXX figure out why only iterators whose reference types are true references are random access - , typename thrust::iterator_traits::iterator_category - , reference - > type; -}; - - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/tuple_of_iterator_references.h b/compat/thrust/iterator/detail/tuple_of_iterator_references.h deleted file mode 100644 index fdbf6b8f66..0000000000 --- a/compat/thrust/iterator/detail/tuple_of_iterator_references.h +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - - -template< - typename T0, typename T1, typename T2, - typename T3, typename T4, typename T5, - typename T6, typename T7, typename T8, - typename T9 -> - class tuple_of_iterator_references - : public thrust::tuple -{ - private: - typedef thrust::tuple super_t; - - public: - // allow implicit construction from tuple - inline __host__ __device__ - tuple_of_iterator_references(const super_t &other) - : super_t(other) - {} - - // allow assignment from tuples - // XXX might be worthwhile to guard this with an enable_if is_assignable - template - inline __host__ __device__ - tuple_of_iterator_references &operator=(const detail::cons &other) - { - super_t::operator=(other); - return *this; - } - - // allow assignment from pairs - // XXX might be worthwhile to guard this with an enable_if is_assignable - template - inline __host__ __device__ - tuple_of_iterator_references &operator=(const thrust::pair &other) - { - super_t::operator=(other); - return *this; - } - - // allow assignment from reference - // XXX perhaps we should generalize to reference - // we could captures reference this way - template - inline __host__ __device__ -// XXX gcc-4.2 crashes on is_assignable -// typename thrust::detail::enable_if< -// thrust::detail::is_assignable< -// super_t, -// const thrust::tuple -// >::value, -// tuple_of_iterator_references & -// >::type - tuple_of_iterator_references & - operator=(const thrust::reference, Pointer, Derived> &other) - { - typedef thrust::tuple tuple_type; - - // XXX perhaps this could be accelerated - tuple_type other_tuple = other; - super_t::operator=(other_tuple); - return *this; - } - - - // duplicate thrust::tuple's constructors - inline __host__ __device__ - tuple_of_iterator_references() {} - - inline __host__ __device__ - tuple_of_iterator_references(typename 
access_traits::parameter_type t0) - : super_t(t0, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) - {} - - inline __host__ __device__ - tuple_of_iterator_references(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1) - : super_t(t0, t1, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) - {} - - inline __host__ __device__ - tuple_of_iterator_references(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2) - : super_t(t0, t1, t2, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) - {} - - inline __host__ __device__ - tuple_of_iterator_references(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3) - : super_t(t0, t1, t2, t3, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) - {} - - inline __host__ __device__ - tuple_of_iterator_references(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4) - : super_t(t0, t1, t2, t3, t4, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) - {} - - inline __host__ __device__ - tuple_of_iterator_references(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4, - typename access_traits::parameter_type t5) - : super_t(t0, t1, t2, t3, t4, t5, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) - {} - - inline __host__ __device__ - tuple_of_iterator_references(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4, - typename access_traits::parameter_type t5, - typename access_traits::parameter_type t6) - : super_t(t0, t1, t2, t3, t4, t5, t6, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) - {} - - inline __host__ __device__ - tuple_of_iterator_references(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4, - typename access_traits::parameter_type t5, - typename access_traits::parameter_type t6, - typename access_traits::parameter_type t7) - : super_t(t0, t1, t2, t3, t4, t5, t6, t7, - static_cast(null_type()), - static_cast(null_type())) - {} - - inline 
__host__ __device__ - tuple_of_iterator_references(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4, - typename access_traits::parameter_type t5, - typename access_traits::parameter_type t6, - typename access_traits::parameter_type t7, - typename access_traits::parameter_type t8) - : super_t(t0, t1, t2, t3, t4, t5, t6, t7, t8, - static_cast(null_type())) - {} - - inline __host__ __device__ - tuple_of_iterator_references(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4, - typename access_traits::parameter_type t5, - typename access_traits::parameter_type t6, - typename access_traits::parameter_type t7, - typename access_traits::parameter_type t8, - typename access_traits::parameter_type t9) - : super_t(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) - {} -}; - - -} // end detail -} // end thrust - diff --git a/compat/thrust/iterator/detail/universal_categories.h b/compat/thrust/iterator/detail/universal_categories.h deleted file mode 100644 index 7c3922210c..0000000000 --- a/compat/thrust/iterator/detail/universal_categories.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
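tuple_of_iterator_references exists so that dereferencing a zip_iterator yields something assignable. A small sketch of that behaviour, assuming standard Thrust headers only:

#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

void write_through_zip()
{
    thrust::device_vector<int>   a(3);
    thrust::device_vector<float> b(3);

    // The dereference is a tuple of references into a and b, so assigning a
    // tuple writes both sequences in one statement: a[0] == 7, b[0] == 2.5f.
    *thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin()))
        = thrust::make_tuple(7, 2.5f);
}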
- */ - -#pragma once - -#include -#include - -namespace thrust -{ - -// define these types without inheritance to avoid ambiguous conversion to base classes - -struct input_universal_iterator_tag -{ - operator input_host_iterator_tag () {return input_host_iterator_tag();} - - operator input_device_iterator_tag () {return input_device_iterator_tag();} -}; - -struct output_universal_iterator_tag -{ - operator output_host_iterator_tag () {return output_host_iterator_tag();} - - operator output_device_iterator_tag () {return output_device_iterator_tag();} -}; - -struct forward_universal_iterator_tag - : input_universal_iterator_tag -{ - operator forward_host_iterator_tag () {return forward_host_iterator_tag();}; - - operator forward_device_iterator_tag () {return forward_device_iterator_tag();}; -}; - -struct bidirectional_universal_iterator_tag - : forward_universal_iterator_tag -{ - operator bidirectional_host_iterator_tag () {return bidirectional_host_iterator_tag();}; - - operator bidirectional_device_iterator_tag () {return bidirectional_device_iterator_tag();}; -}; - - -namespace detail -{ - -// create this struct to control conversion precedence in random_access_universal_iterator_tag -template -struct one_degree_of_separation - : T -{ -}; - -} // end detail - - -struct random_access_universal_iterator_tag -{ - // these conversions are all P0 - operator random_access_host_iterator_tag () {return random_access_host_iterator_tag();}; - - operator random_access_device_iterator_tag () {return random_access_device_iterator_tag();}; - - // bidirectional_universal_iterator_tag is P1 - operator detail::one_degree_of_separation () {return detail::one_degree_of_separation();} - -}; - - -} // end thrust - diff --git a/compat/thrust/iterator/detail/zip_iterator.inl b/compat/thrust/iterator/detail/zip_iterator.inl deleted file mode 100644 index fddd0ada11..0000000000 --- a/compat/thrust/iterator/detail/zip_iterator.inl +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -template - zip_iterator - ::zip_iterator(void) -{ -} // end zip_iterator::zip_iterator() - - -template - zip_iterator - ::zip_iterator(IteratorTuple iterator_tuple) - :m_iterator_tuple(iterator_tuple) -{ -} // end zip_iterator::zip_iterator() - - -template - template - zip_iterator - ::zip_iterator(const zip_iterator &other, - typename thrust::detail::enable_if_convertible< - OtherIteratorTuple, - IteratorTuple - >::type *) - :m_iterator_tuple(other.get_iterator_tuple()) -{ -} // end zip_iterator::zip_iterator() - - -template -const IteratorTuple &zip_iterator - ::get_iterator_tuple(void) const -{ - return m_iterator_tuple; -} // end zip_iterator::get_iterator_tuple() - - -template - typename zip_iterator::super_t::reference - zip_iterator - ::dereference(void) const -{ - using namespace detail::tuple_impl_specific; - - return thrust::detail::tuple_host_device_transform(get_iterator_tuple(), detail::dereference_iterator()); -} // end zip_iterator::dereference() - - -__thrust_hd_warning_disable__ -template - template - bool zip_iterator - ::equal(const zip_iterator &other) const -{ - return get<0>(get_iterator_tuple()) == get<0>(other.get_iterator_tuple()); -} // end zip_iterator::equal() - - -template - void zip_iterator - ::advance(typename super_t::difference_type n) -{ - using namespace detail::tuple_impl_specific; - - // XXX note that we use a pointer to System to dispatch to avoid - // default construction of a System - typename thrust::iterator_system::type *use_me_to_dispatch = 0; - - // dispatch on system - tuple_for_each(m_iterator_tuple, - detail::advance_iterator(n), - use_me_to_dispatch); -} // end zip_iterator::advance() - - -template - void zip_iterator - ::increment(void) -{ - using namespace detail::tuple_impl_specific; - - // XXX note that we use a pointer to System to dispatch to avoid - // default construction of a System - typename thrust::iterator_system::type *use_me_to_dispatch = 0; - - // dispatch on system - tuple_for_each(m_iterator_tuple, detail::increment_iterator(), - use_me_to_dispatch); -} // end zip_iterator::increment() - - -template - void zip_iterator - ::decrement(void) -{ - using namespace detail::tuple_impl_specific; - - // XXX note that we use a pointer to System to dispatch to avoid - // default construction of a System - typename thrust::iterator_system::type *use_me_to_dispatch = 0; - - // dispatch on system - tuple_for_each(m_iterator_tuple, detail::decrement_iterator(), - use_me_to_dispatch); -} // end zip_iterator::decrement() - - -__thrust_hd_warning_disable__ -template - template - typename zip_iterator::super_t::difference_type - zip_iterator - ::distance_to(const zip_iterator &other) const -{ - return get<0>(other.get_iterator_tuple()) - get<0>(get_iterator_tuple()); -} // end zip_iterator::distance_to() - - -template - zip_iterator make_zip_iterator(IteratorTuple t) -{ - return zip_iterator(t); -} // end make_zip_iterator() - - -} // end thrust - diff --git a/compat/thrust/iterator/detail/zip_iterator_base.h b/compat/thrust/iterator/detail/zip_iterator_base.h deleted file mode 100644 index 9dd7789e52..0000000000 --- a/compat/thrust/iterator/detail/zip_iterator_base.h +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
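The zip_iterator members defined above are what make the usual "kernel fusion" idiom work. A sketch of that idiom in the spirit of Thrust's arbitrary-transformation example, assuming standard Thrust headers only:

#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <thrust/for_each.h>

struct saxpy_functor
{
    float a;
    saxpy_functor(float a_) : a(a_) {}

    template <typename Tuple>
    __host__ __device__
    void operator()(Tuple t)
    {
        // t is (x, y); store a*x + y back into y through the reference tuple.
        thrust::get<1>(t) = a * thrust::get<0>(t) + thrust::get<1>(t);
    }
};

// y <- a*x + y over both vectors in a single fused pass.
void saxpy(float a, const thrust::device_vector<float>& x, thrust::device_vector<float>& y)
{
    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(x.end(),   y.end())),
        saxpy_functor(a));
}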
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - -// forward declare zip_iterator for zip_iterator_base -template class zip_iterator; - -namespace detail -{ - - -// Functors to be used with tuple algorithms -// -template -class advance_iterator -{ -public: - inline __host__ __device__ - advance_iterator(DiffType step) : m_step(step) {} - - template - inline __host__ __device__ - void operator()(Iterator& it) const - { it += m_step; } - -private: - DiffType m_step; -}; // end advance_iterator - - -struct increment_iterator -{ - template - inline __host__ __device__ - void operator()(Iterator& it) - { ++it; } -}; // end increment_iterator - - -struct decrement_iterator -{ - template - inline __host__ __device__ - void operator()(Iterator& it) - { --it; } -}; // end decrement_iterator - - -struct dereference_iterator -{ - template - struct apply - { - typedef typename - iterator_traits::reference - type; - }; // end apply - - // XXX silence warnings of the form "calling a __host__ function from a __host__ __device__ function is not allowed - __thrust_hd_warning_disable__ - template - __host__ __device__ - typename apply::type operator()(Iterator const& it) - { - return *it; - } -}; // end dereference_iterator - - -// The namespace tuple_impl_specific provides two meta- -// algorithms and two algorithms for tuples. -namespace tuple_impl_specific -{ - -// define apply1 for tuple_meta_transform_impl -template - struct apply1 - : UnaryMetaFunctionClass::template apply -{ -}; // end apply1 - - -// define apply2 for tuple_meta_accumulate_impl -template - struct apply2 - : UnaryMetaFunctionClass::template apply -{ -}; // end apply2 - - -// Meta-accumulate algorithm for tuples. Note: The template -// parameter StartType corresponds to the initial value in -// ordinary accumulation. -// -template - struct tuple_meta_accumulate; - -template< - typename Tuple - , class BinaryMetaFun - , typename StartType -> - struct tuple_meta_accumulate_impl -{ - typedef typename apply2< - BinaryMetaFun - , typename Tuple::head_type - , typename tuple_meta_accumulate< - typename Tuple::tail_type - , BinaryMetaFun - , StartType - >::type - >::type type; -}; - - -template< - typename Tuple - , class BinaryMetaFun - , typename StartType -> -struct tuple_meta_accumulate - : thrust::detail::eval_if< - thrust::detail::is_same::value - , thrust::detail::identity_ - , tuple_meta_accumulate_impl< - Tuple - , BinaryMetaFun - , StartType - > - > // end eval_if -{ -}; // end tuple_meta_accumulate - - -// transform algorithm for tuples. The template parameter Fun -// must be a unary functor which is also a unary metafunction -// class that computes its return type based on its argument -// type. For example: -// -// struct to_ptr -// { -// template -// struct apply -// { -// typedef Arg* type; -// } -// -// template -// Arg* operator()(Arg x); -// }; - - - -// for_each algorithm for tuples. 
-// -template -inline __host__ __device__ -Fun tuple_for_each(thrust::null_type, Fun f, System *) -{ - return f; -} // end tuple_for_each() - - -template -inline __host__ __device__ -Fun tuple_for_each(Tuple& t, Fun f, System *dispatch_tag) -{ - f( t.get_head() ); - return tuple_for_each(t.get_tail(), f, dispatch_tag); -} // end tuple_for_each() - - -template -inline __host__ __device__ -Fun tuple_for_each(Tuple& t, Fun f, thrust::host_system_tag *dispatch_tag) -{ -// XXX this path is required in order to accomodate pure host iterators -// (such as std::vector::iterator) in a zip_iterator -#ifndef __CUDA_ARCH__ - f( t.get_head() ); - return tuple_for_each(t.get_tail(), f, dispatch_tag); -#else - // this code will never be called - return f; -#endif -} // end tuple_for_each() - - -// Equality of tuples. NOTE: "==" for tuples currently (7/2003) -// has problems under some compilers, so I just do my own. -// No point in bringing in a bunch of #ifdefs here. This is -// going to go away with the next tuple implementation anyway. -// -__host__ __device__ -inline bool tuple_equal(thrust::null_type, thrust::null_type) -{ return true; } - - -template -__host__ __device__ -bool tuple_equal(Tuple1 const& t1, Tuple2 const& t2) -{ - return t1.get_head() == t2.get_head() && - tuple_equal(t1.get_tail(), t2.get_tail()); -} // end tuple_equal() - -} // end end tuple_impl_specific - - -// Metafunction to obtain the type of the tuple whose element types -// are the value_types of an iterator tupel. -// -template - struct tuple_of_value_types - : tuple_meta_transform< - IteratorTuple, - iterator_value - > -{ -}; // end tuple_of_value_types - - -struct minimum_category_lambda -{ - template - struct apply : minimum_category - {}; -}; - - - -// Metafunction to obtain the minimal traversal tag in a tuple -// of iterators. -// -template -struct minimum_traversal_category_in_iterator_tuple -{ - typedef typename tuple_meta_transform< - IteratorTuple - , thrust::iterator_traversal - >::type tuple_of_traversal_tags; - - typedef typename tuple_impl_specific::tuple_meta_accumulate< - tuple_of_traversal_tags - , minimum_category_lambda - , thrust::random_access_traversal_tag - >::type type; -}; - - -struct minimum_system_lambda -{ - template - struct apply : minimum_system - {}; -}; - - - -// Metafunction to obtain the minimal system tag in a tuple -// of iterators. 
-template -struct minimum_system_in_iterator_tuple -{ - typedef typename thrust::detail::tuple_meta_transform< - IteratorTuple, - thrust::iterator_system - >::type tuple_of_system_tags; - - typedef typename tuple_impl_specific::tuple_meta_accumulate< - tuple_of_system_tags, - minimum_system_lambda, - thrust::any_system_tag - >::type type; -}; - -namespace zip_iterator_base_ns -{ - - -template - struct tuple_elements_helper - : eval_if< - (i < tuple_size::value), - tuple_element, - identity_ - > -{}; - - -template - struct tuple_elements -{ - typedef typename tuple_elements_helper<0,Tuple>::type T0; - typedef typename tuple_elements_helper<1,Tuple>::type T1; - typedef typename tuple_elements_helper<2,Tuple>::type T2; - typedef typename tuple_elements_helper<3,Tuple>::type T3; - typedef typename tuple_elements_helper<4,Tuple>::type T4; - typedef typename tuple_elements_helper<5,Tuple>::type T5; - typedef typename tuple_elements_helper<6,Tuple>::type T6; - typedef typename tuple_elements_helper<7,Tuple>::type T7; - typedef typename tuple_elements_helper<8,Tuple>::type T8; - typedef typename tuple_elements_helper<9,Tuple>::type T9; -}; - - -template - struct tuple_of_iterator_references -{ - // get a thrust::tuple of the iterators' references - typedef typename tuple_meta_transform< - IteratorTuple, - iterator_reference - >::type tuple_of_references; - - // get at the individual tuple element types by name - typedef tuple_elements elements; - - // map thrust::tuple to tuple_of_iterator_references - typedef thrust::detail::tuple_of_iterator_references< - typename elements::T0, - typename elements::T1, - typename elements::T2, - typename elements::T3, - typename elements::T4, - typename elements::T5, - typename elements::T6, - typename elements::T7, - typename elements::T8, - typename elements::T9 - > type; -}; - - -} // end zip_iterator_base_ns - -/////////////////////////////////////////////////////////////////// -// -// Class zip_iterator_base -// -// Builds and exposes the iterator facade type from which the zip -// iterator will be derived. -// -template - struct zip_iterator_base -{ - //private: - // reference type is the type of the tuple obtained from the - // iterators' reference types. - typedef typename zip_iterator_base_ns::tuple_of_iterator_references::type reference; - - // Boost's Value type is the same as reference type. - //typedef reference value_type; - typedef typename tuple_of_value_types::type value_type; - - // Difference type is the first iterator's difference type - typedef typename thrust::iterator_traits< - typename thrust::tuple_element<0, IteratorTuple>::type - >::difference_type difference_type; - - // Iterator system is the minimum system tag in the - // iterator tuple - typedef typename - minimum_system_in_iterator_tuple::type system; - - // Traversal category is the minimum traversal category in the - // iterator tuple - typedef typename - minimum_traversal_category_in_iterator_tuple::type traversal_category; - - public: - - // The iterator facade type from which the zip iterator will - // be derived. 
- typedef thrust::iterator_facade< - zip_iterator, - value_type, - system, - traversal_category, - reference, - difference_type - > type; -}; // end zip_iterator_base - -} // end detail - -} // end thrust - - diff --git a/compat/thrust/iterator/discard_iterator.h b/compat/thrust/iterator/discard_iterator.h deleted file mode 100644 index 6e089b567e..0000000000 --- a/compat/thrust/iterator/discard_iterator.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file thrust/iterator/discard_iterator.h - * \brief An iterator which "discards" (ignores) values assigned to it upon dereference - */ - -#pragma once - -#include -#include -#include - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN - -namespace thrust -{ - -/*! \addtogroup iterators - * \{ - */ - -/*! \addtogroup fancyiterator Fancy Iterators - * \ingroup iterators - * \{ - */ - -/*! \p discard_iterator is an iterator which represents a special kind of pointer that - * ignores values written to it upon dereference. This iterator is useful for ignoring - * the output of certain algorithms without wasting memory capacity or bandwidth. - * \p discard_iterator may also be used to count the size of an algorithm's output which - * may not be known a priori. - * - * The following code snippet demonstrates how to use \p discard_iterator to ignore - * ignore one of the output ranges of reduce_by_key - * - * \code - * #include - * #include - * #include - * - * int main(void) - * { - * thrust::device_vector keys(7), values(7); - * - * keys[0] = 1; - * keys[1] = 3; - * keys[2] = 3; - * keys[3] = 3; - * keys[4] = 2; - * keys[5] = 2; - * keys[6] = 1; - * - * values[0] = 9; - * values[1] = 8; - * values[2] = 7; - * values[3] = 6; - * values[4] = 5; - * values[5] = 4; - * values[6] = 3; - * - * thrust::device_vector result(4); - * - * // we are only interested in the reduced values - * // use discard_iterator to ignore the output keys - * thrust::reduce_by_key(keys.begin(), keys.end(), - * values.begin(), values.end(), - * thrust::make_discard_iterator(), - * result.begin()); - * - * // result is now [9, 21, 9, 3] - * - * return 0; - * } - * \endcode - * - * \see make_discard_iterator - */ -template - class discard_iterator - : public detail::discard_iterator_base::type -{ - /*! \cond - */ - friend class thrust::iterator_core_access; - typedef typename detail::discard_iterator_base::type super_t; - typedef typename detail::discard_iterator_base::incrementable incrementable; - typedef typename detail::discard_iterator_base::base_iterator base_iterator; - - public: - typedef typename super_t::reference reference; - typedef typename super_t::value_type value_type; - - /*! \endcond - */ - - /*! Copy constructor copies from a source discard_iterator. - * - * \p rhs The discard_iterator to copy. - */ - __host__ __device__ - discard_iterator(discard_iterator const &rhs) - : super_t(rhs.base()) {} - - /*! 
This constructor receives an optional index specifying the position of this - * \p discard_iterator in a range. - * - * \p i The index of this \p discard_iterator in a range. Defaults to the - * value returned by \c Incrementable's null constructor. For example, - * when Incrementable == int, \c 0. - */ - __host__ __device__ - discard_iterator(incrementable const &i = incrementable()) - : super_t(base_iterator(i)) {} - - /*! \cond - */ - - private: // Core iterator interface - __host__ __device__ - reference dereference(void) const - { - return m_element; - } - - mutable value_type m_element; - - /*! \endcond - */ -}; // end constant_iterator - - -/*! \p make_discard_iterator creates a \p discard_iterator from an optional index parameter. - * - * \param i The index of the returned \p discard_iterator within a range. - * In the default case, the value of this parameter is \c 0. - * - * \return A new \p discard_iterator with index as given by \p i. - * - * \see constant_iterator - */ -inline __host__ __device__ -discard_iterator<> make_discard_iterator(discard_iterator<>::difference_type i = discard_iterator<>::difference_type(0)) -{ - return discard_iterator<>(i); -} // end make_discard_iterator() - -/*! \} // end fancyiterators - */ - -/*! \} // end iterators - */ - -} // end namespace thrust - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END - diff --git a/compat/thrust/iterator/iterator_adaptor.h b/compat/thrust/iterator/iterator_adaptor.h deleted file mode 100644 index 7b9cca308a..0000000000 --- a/compat/thrust/iterator/iterator_adaptor.h +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file thrust/iterator/iterator_adaptor.h - * \brief An iterator which adapts a base iterator - */ - -/* - * (C) Copyright David Abrahams 2002. - * (C) Copyright Jeremy Siek 2002. - * (C) Copyright Thomas Witt 2002. - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup iterators - * \{ - */ - -/*! \addtogroup fancyiterator Fancy Iterators - * \ingroup iterators - * \{ - */ - -/*! \p iterator_adaptor is an iterator which adapts an existing type of iterator to create a new type of - * iterator. Most of Thrust's fancy iterators are defined via inheritance from \p iterator_adaptor. - * While composition of these existing Thrust iterators is often sufficient for expressing the desired - * functionality, it is occasionally more straightforward to derive from \p iterator_adaptor directly. 
- * - * To see how to use \p iterator_adaptor to create a novel iterator type, let's examine how to use it to - * define \p repeat_iterator, a fancy iterator which repeats elements from another range a given number of time: - * - * \code - * #include - * - * // derive repeat_iterator from iterator_adaptor - * template - * class repeat_iterator - * : public thrust::iterator_adaptor< - * repeat_iterator, // the first template parameter is the name of the iterator we're creating - * Iterator // the second template parameter is the name of the iterator we're adapting - * // we can use the default for the additional template parameters - * > - * { - * public: - * // shorthand for the name of the iterator_adaptor we're deriving from - * typedef thrust::iterator_adaptor< - * repeat_iterator, - * Iterator - * > super_t; - * - * __host__ __device__ - * repeat_iterator(const Iterator &x, int n) : super_t(x), begin(x), n(n) {} - * - * // befriend thrust::iterator_core_access to allow it access to the private interface below - * friend class thrust::iterator_core_access; - * - * private: - * // repeat each element of the adapted range n times - * unsigned int n; - * - * // used to keep track of where we began - * const Iterator begin; - * - * // it is private because only thrust::iterator_core_access needs access to it - * __host__ __device__ - * typename super_t::reference dereference() const - * { - * return *(begin + (this->base() - begin) / n); - * } - * }; - * \endcode - * - * Except for the first two, \p iterator_adaptor's template parameters are optional. When omitted, or when the - * user specifies \p thrust::use_default in its place, \p iterator_adaptor will use a default type inferred from \p Base. - * - * \p iterator_adaptor's functionality is derived from and generally equivalent to \p boost::iterator_adaptor. - * The exception is Thrust's addition of the template parameter \p System, which is necessary to allow Thrust - * to dispatch an algorithm to one of several parallel backend systems. - * - * \p iterator_adaptor is a powerful tool for creating custom iterators directly. However, the large set of iterator semantics which must be satisfied - * for algorithm compatibility can make \p iterator_adaptor difficult to use correctly. Unless you require the full expressivity of \p iterator_adaptor, - * consider building a custom iterator through composition of existing higher-level fancy iterators instead. - * - * Interested users may refer to boost::iterator_adaptor's documentation for further usage examples. - */ -template - class iterator_adaptor: - public detail::iterator_adaptor_base< - Derived, Base, Value, System, Traversal, Reference, Difference - >::type -{ - /*! \cond - */ - - friend class thrust::iterator_core_access; - - protected: - typedef typename detail::iterator_adaptor_base< - Derived, Base, Value, System, Traversal, Reference, Difference - >::type super_t; - - /*! \endcond - */ - - public: - /*! \p iterator_adaptor's default constructor does nothing. - */ - __host__ __device__ - iterator_adaptor(){} - - /*! This constructor copies from a given instance of the \p Base iterator. - */ - __host__ __device__ - explicit iterator_adaptor(Base const& iter) - : m_iterator(iter) - {} - - /*! The type of iterator this \p iterator_adaptor's \p adapts. - */ - typedef Base base_type; - - /*! \cond - */ - typedef typename super_t::reference reference; - - typedef typename super_t::difference_type difference_type; - /*! \endcond - */ - - /*! 
\return A \p const reference to the \p Base iterator this \p iterator_adaptor adapts. - */ - __host__ __device__ - Base const& base() const - { return m_iterator; } - - protected: - /*! \return A \p const reference to the \p Base iterator this \p iterator_adaptor adapts. - */ - __host__ __device__ - Base const& base_reference() const - { return m_iterator; } - - /*! \return A mutable reference to the \p Base iterator this \p iterator_adaptor adapts. - */ - __host__ __device__ - Base& base_reference() - { return m_iterator; } - - /*! \cond - */ - private: // Core iterator interface for iterator_facade - - __thrust_hd_warning_disable__ - __host__ __device__ - typename iterator_adaptor::reference dereference() const - { return *m_iterator; } - - __thrust_hd_warning_disable__ - template - __host__ __device__ - bool equal(iterator_adaptor const& x) const - { return m_iterator == x.base(); } - - __thrust_hd_warning_disable__ - __host__ __device__ - void advance(typename iterator_adaptor::difference_type n) - { - // XXX statically assert on random_access_traversal_tag - m_iterator += n; - } - - __thrust_hd_warning_disable__ - __host__ __device__ - void increment() - { ++m_iterator; } - - __thrust_hd_warning_disable__ - __host__ __device__ - void decrement() - { - // XXX statically assert on bidirectional_traversal_tag - --m_iterator; - } - - __thrust_hd_warning_disable__ - template - __host__ __device__ - typename iterator_adaptor::difference_type distance_to(iterator_adaptor const& y) const - { return y.base() - m_iterator; } - - private: - Base m_iterator; - - /*! \endcond - */ -}; // end iterator_adaptor - -/*! \} // end fancyiterators - */ - -/*! \} // end iterators - */ - -} // end thrust - diff --git a/compat/thrust/iterator/iterator_categories.h b/compat/thrust/iterator/iterator_categories.h deleted file mode 100644 index 81601b4a40..0000000000 --- a/compat/thrust/iterator/iterator_categories.h +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file thrust/iterator/iterator_categories.h - * \brief Types for reasoning about the categories of iterators - */ - -/* - * (C) Copyright Jeremy Siek 2002. - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - - -#pragma once - -#include - -// #include this for stl's iterator tags -#include - -namespace thrust -{ - -/*! \addtogroup iterators - * \addtogroup iterator_tags Iterator Tags - * \ingroup iterators - * \addtogroup iterator_tag_classes Iterator Tag Classes - * \ingroup iterator_tags - * \{ - */ - -/*! \p input_device_iterator_tag is an empty class: it has no member functions, - * member variables, or nested types. It is used solely as a "tag": a - * representation of the Input Device Iterator concept within the C++ type - * system. 
- * - * \see http://www.sgi.com/tech/sgi/input_iterator_tag.html, iterator_traits, - * output_device_iterator_tag, forward_device_iterator_tag, - * bidirectional_device_iterator_tag, random_access_device_iterator_tag, - * input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag, - * bidirectional_host_iterator_tag, random_access_host_iterator_tag - */ -struct input_device_iterator_tag {}; - -/*! \p output_device_iterator_tag is an empty class: it has no member functions, - * member variables, or nested types. It is used solely as a "tag": a - * representation of the Output Device Iterator concept within the C++ type - * system. - * - * \see http://www.sgi.com/tech/sgi/output_iterator_tag.html, iterator_traits, - * input_device_iterator_tag, forward_device_iterator_tag, - * bidirectional_device_iterator_tag, random_access_device_iterator_tag, - * input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag, - * bidirectional_host_iterator_tag, random_access_host_iterator_tag - */ -struct output_device_iterator_tag {}; - -/*! \p forward_device_iterator_tag is an empty class: it has no member functions, - * member variables, or nested types. It is used solely as a "tag": a - * representation of the Forward Device Iterator concept within the C++ type - * system. - * - * \see http://www.sgi.com/tech/sgi/forward_iterator_tag.html, iterator_traits, - * input_device_iterator_tag, output_device_iterator_tag, - * bidirectional_device_iterator_tag, random_access_device_iterator_tag, - * input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag, - * bidirectional_host_iterator_tag, random_access_host_iterator_tag - */ -struct forward_device_iterator_tag : public input_device_iterator_tag {}; - -/*! \p bidirectional_device_iterator_tag is an empty class: it has no member - * functions, member variables, or nested types. It is used solely as a "tag": a - * representation of the Bidirectional Device Iterator concept within the C++ - * type system. - * - * \see http://www.sgi.com/tech/sgi/bidirectional_iterator_tag.html, - * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, - * forward_device_iterator_tag, random_access_device_iterator_tag, - * input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag, - * bidirectional_host_iterator_tag, random_access_host_iterator_tag - */ -struct bidirectional_device_iterator_tag : public forward_device_iterator_tag {}; - -/*! \p random_access_device_iterator_tag is an empty class: it has no member - * functions, member variables, or nested types. It is used solely as a "tag": a - * representation of the Random Access Device Iterator concept within the C++ - * type system. - * - * \see http://www.sgi.com/tech/sgi/random_access_iterator_tag.html, - * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, - * forward_device_iterator_tag, bidirectional_device_iterator_tag, - * input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag, - * bidirectional_host_iterator_tag, random_access_host_iterator_tag - */ -struct random_access_device_iterator_tag : public bidirectional_device_iterator_tag {}; - -/*! \p input_host_iterator_tag is an empty class: it has no member - * functions, member variables, or nested types. It is used solely as a "tag": a - * representation of the Input Host Iterator concept within the C++ - * type system. 
- * - * \see http://www.sgi.com/tech/sgi/input_iterator_tag.html, - * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, - * forward_device_iterator_tag, bidirectional_device_iterator_tag, - * random_access_device_iterator_tag, - * output_host_iterator_tag, forward_host_iterator_tag, - * bidirectional_host_iterator_tag, random_access_host_iterator_tag - */ -typedef std::input_iterator_tag input_host_iterator_tag; - -/*! \p output_host_iterator_tag is an empty class: it has no member - * functions, member variables, or nested types. It is used solely as a "tag": a - * representation of the Output Host Iterator concept within the C++ - * type system. - * - * \see http://www.sgi.com/tech/sgi/output_iterator_tag.html, - * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, - * forward_device_iterator_tag, bidirectional_device_iterator_tag, - * random_access_device_iterator_tag, - * input_host_iterator_tag, forward_host_iterator_tag, - * bidirectional_host_iterator_tag, random_access_host_iterator_tag - */ -typedef std::output_iterator_tag output_host_iterator_tag; - -/*! \p forward_host_iterator_tag is an empty class: it has no member - * functions, member variables, or nested types. It is used solely as a "tag": a - * representation of the Forward Host Iterator concept within the C++ - * type system. - * - * \see http://www.sgi.com/tech/sgi/forward_iterator_tag.html, - * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, - * forward_device_iterator_tag, bidirectional_device_iterator_tag, - * random_access_device_iterator_tag, - * input_host_iterator_tag, output_host_iterator_tag, - * bidirectional_host_iterator_tag, random_access_host_iterator_tag - */ -typedef std::forward_iterator_tag forward_host_iterator_tag; - -/*! \p bidirectional_host_iterator_tag is an empty class: it has no member - * functions, member variables, or nested types. It is used solely as a "tag": a - * representation of the Forward Host Iterator concept within the C++ - * type system. - * - * \see http://www.sgi.com/tech/sgi/bidirectional_iterator_tag.html, - * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, - * forward_device_iterator_tag, bidirectional_device_iterator_tag, - * random_access_device_iterator_tag, - * input_host_iterator_tag, output_host_iterator_tag, - * forward_host_iterator_tag, random_access_host_iterator_tag - */ -typedef std::bidirectional_iterator_tag bidirectional_host_iterator_tag; - -/*! \p random_access_host_iterator_tag is an empty class: it has no member - * functions, member variables, or nested types. It is used solely as a "tag": a - * representation of the Forward Host Iterator concept within the C++ - * type system. - * - * \see http://www.sgi.com/tech/sgi/random_access_iterator_tag.html, - * iterator_traits, input_device_iterator_tag, output_device_iterator_tag, - * forward_device_iterator_tag, bidirectional_device_iterator_tag, - * random_access_device_iterator_tag, - * input_host_iterator_tag, output_host_iterator_tag, - * forward_host_iterator_tag, bidirectional_host_iterator_tag - */ -typedef std::random_access_iterator_tag random_access_host_iterator_tag; - -/*! 
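A compile-time sketch of the relationships documented above, assuming a C++11 host compiler for the type traits: the device tags form a refinement hierarchy, while the host tags are plain aliases of the corresponding std tags.

#include <thrust/iterator/iterator_categories.h>
#include <type_traits>
#include <iterator>

// random access device iterators refine bidirectional, forward and input device iterators
static_assert(std::is_base_of<thrust::input_device_iterator_tag,
                              thrust::random_access_device_iterator_tag>::value,
              "device tag refinement hierarchy");

// the host tags are typedefs of the std:: tags
static_assert(std::is_same<thrust::random_access_host_iterator_tag,
                           std::random_access_iterator_tag>::value,
              "host tags alias the std tags");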
\} // end iterator_tag_classes - */ - -} // end namespace thrust - -#include - diff --git a/compat/thrust/iterator/iterator_facade.h b/compat/thrust/iterator/iterator_facade.h deleted file mode 100644 index 232c150a6f..0000000000 --- a/compat/thrust/iterator/iterator_facade.h +++ /dev/null @@ -1,538 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/iterator/iterator_facade.h - * \brief A class which exposes a public interface for iterators - */ - -/* - * (C) Copyright David Abrahams 2002. - * (C) Copyright Jeremy Siek 2002. - * (C) Copyright Thomas Witt 2002. - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup iterators - * \{ - */ - -/*! \addtogroup fancyiterator Fancy Iterators - * \ingroup iterators - * \{ - */ - - -// This forward declaration is required for the friend declaration -// in iterator_core_access -template class iterator_facade; - - -/*! \p iterator_core_access is the class which user iterator types derived from \p thrust::iterator_adaptor - * or \p thrust::iterator_facade must befriend to allow it to access their private interface. - */ -class iterator_core_access -{ - /*! 
\cond - */ - - // declare our friends - template friend class iterator_facade; - - // iterator comparisons are our friends - template - inline __host__ __device__ - friend bool - operator ==(iterator_facade const& lhs, - iterator_facade const& rhs); - - template - inline __host__ __device__ - friend bool - operator !=(iterator_facade const& lhs, - iterator_facade const& rhs); - - template - inline __host__ __device__ - friend bool - operator <(iterator_facade const& lhs, - iterator_facade const& rhs); - - template - inline __host__ __device__ - friend bool - operator >(iterator_facade const& lhs, - iterator_facade const& rhs); - - template - inline __host__ __device__ - friend bool - operator <=(iterator_facade const& lhs, - iterator_facade const& rhs); - - template - inline __host__ __device__ - friend bool - operator >=(iterator_facade const& lhs, - iterator_facade const& rhs); - - // iterator difference is our friend - template - inline __host__ __device__ - friend - typename thrust::detail::distance_from_result< - iterator_facade, - iterator_facade - >::type - operator-(iterator_facade const& lhs, - iterator_facade const& rhs); - - template - __host__ __device__ - static typename Facade::reference dereference(Facade const& f) - { - return f.dereference(); - } - - template - __host__ __device__ - static void increment(Facade& f) - { - f.increment(); - } - - template - __host__ __device__ - static void decrement(Facade& f) - { - f.decrement(); - } - - template - __host__ __device__ - static bool equal(Facade1 const& f1, Facade2 const& f2) - { - return f1.equal(f2); - } - - // XXX TODO: Investigate whether we need both of these cases - //template - //__host__ __device__ - //static bool equal(Facade1 const& f1, Facade2 const& f2, mpl::true_) - //{ - // return f1.equal(f2); - //} - - //template - //__host__ __device__ - //static bool equal(Facade1 const& f1, Facade2 const& f2, mpl::false_) - //{ - // return f2.equal(f1); - //} - - template - __host__ __device__ - static void advance(Facade& f, typename Facade::difference_type n) - { - f.advance(n); - } - - // Facade2 is convertible to Facade1, - // so return Facade1's difference_type - template - __host__ __device__ - static typename Facade1::difference_type - distance_from(Facade1 const& f1, Facade2 const& f2, thrust::detail::true_type) - { - return -f1.distance_to(f2); - } - - // Facade2 is not convertible to Facade1, - // so return Facade2's difference_type - template - __host__ __device__ - static typename Facade2::difference_type - distance_from(Facade1 const& f1, Facade2 const& f2, thrust::detail::false_type) - { - return f2.distance_to(f1); - } - - template - __host__ __device__ - static typename thrust::detail::distance_from_result::type - distance_from(Facade1 const& f1, Facade2 const& f2) - { - // dispatch the implementation of this method upon whether or not - // Facade2 is convertible to Facade1 - return distance_from(f1, f2, - typename thrust::detail::is_convertible::type()); - } - - // - // Curiously Recurring Template interface. - // - template - __host__ __device__ - static Derived& derived(iterator_facade& facade) - { - return *static_cast(&facade); - } - - template - __host__ __device__ - static Derived const& derived(iterator_facade const& facade) - { - return *static_cast(&facade); - } - - /*! \endcond - */ -}; // end iterator_core_access - - -/*! 
\p iterator_facade is a template which allows the programmer to define a novel iterator with a standards-conforming interface - * which Thrust can use to reason about algorithm acceleration opportunities. - * - * Because most of a standard iterator's interface is defined in terms of a small set of core primitives, \p iterator_facade - * defines the non-primitive portion mechanically. In principle a novel iterator could explicitly provide the entire interface in - * an ad hoc fashion but doing so might be tedious and prone to subtle errors. - * - * Often \p iterator_facade is too primitive a tool to use for defining novel iterators. In these cases, \p iterator_adaptor - * or a specific fancy iterator should be used instead. - * - * \p iterator_facade's functionality is derived from and generally equivalent to \p boost::iterator_facade. - * The exception is Thrust's addition of the template parameter \p System, which is necessary to allow Thrust - * to dispatch an algorithm to one of several parallel backend systems. An additional exception is Thrust's omission - * of the \c operator-> member function. - * - * Interested users may refer to boost::iterator_facade's documentation for usage examples. - * - * \note \p iterator_facade's arithmetic operator free functions exist with the usual meanings but are omitted here for brevity. - */ -template - class iterator_facade -{ - private: - /*! \cond - */ - - // - // Curiously Recurring Template interface. - // - __host__ __device__ - Derived& derived() - { - return *static_cast(this); - } - - __host__ __device__ - Derived const& derived() const - { - return *static_cast(this); - } - /*! \endcond - */ - - public: - /*! The type of element pointed to by \p iterator_facade. - */ - typedef typename thrust::detail::remove_const::type value_type; - - /*! The return type of \p iterator_facade::operator*(). - */ - typedef Reference reference; - - /*! The return type of \p iterator_facade's non-existent \c operator->() - * member function. Unlike \c boost::iterator_facade, \p iterator_facade - * disallows access to the \p value_type's members through expressions of the - * form iter->member. \p pointer is defined to \c void to indicate - * that these expressions are not allowed. This limitation may be relaxed in a - * future version of Thrust. - */ - typedef void pointer; - - /*! The type of expressions of the form x - y where x and y - * are of type \p iterator_facade. - */ - typedef Difference difference_type; - - /*! The type of iterator category of \p iterator_facade. - */ - typedef typename thrust::detail::iterator_facade_category< - System, Traversal, Value, Reference - >::type iterator_category; - - /*! \p operator*() dereferences this \p iterator_facade. - * \return A reference to the element pointed to by this \p iterator_facade. - */ - __host__ __device__ - reference operator*() const - { - return iterator_core_access::dereference(this->derived()); - } - - // XXX unimplemented for now, consider implementing it later - //pointer operator->() const - //{ - // return; - //} - - // XXX investigate whether or not we need to go to the lengths - // boost does to determine the return type - - /*! \p operator[] performs indexed dereference. - * \return A reference to the element \p n distance away from this \p iterator_facade. - */ - __host__ __device__ - reference operator[](difference_type n) const - { - return *(this->derived() + n); - } - - /*! \p operator++ pre-increments this \p iterator_facade to refer to the element in the next position. 
- * \return *this - */ - __host__ __device__ - Derived& operator++() - { - iterator_core_access::increment(this->derived()); - return this->derived(); - } - - /*! \p operator++ post-increments this \p iterator_facade and returns a new \p iterator_facade referring to the element in the next position. - * \return A copy of *this before increment. - */ - __host__ __device__ - Derived operator++(int) - { - Derived tmp(this->derived()); - ++*this; - return tmp; - } - - /*! \p operator-- pre-decrements this \p iterator_facade to refer to the element in the previous position. - * \return *this - */ - __host__ __device__ - Derived& operator--() - { - iterator_core_access::decrement(this->derived()); - return this->derived(); - } - - /*! \p operator-- post-decrements this \p iterator_facade and returns a new \p iterator_facade referring to the element in the previous position. - * \return A copy of *this before decrement. - */ - __host__ __device__ - Derived operator--(int) - { - Derived tmp(this->derived()); - --*this; - return tmp; - } - - /*! \p operator+= increments this \p iterator_facade to refer to an element a given distance after its current position. - * \param n The quantity to increment. - * \return *this - */ - __host__ __device__ - Derived& operator+=(difference_type n) - { - iterator_core_access::advance(this->derived(), n); - return this->derived(); - } - - /*! \p operator-= decrements this \p iterator_facade to refer to an element a given distance before its current postition. - * \param n The quantity to decrement. - * \return *this - */ - __host__ __device__ - Derived& operator-=(difference_type n) - { - iterator_core_access::advance(this->derived(), -n); - return this->derived(); - } - - /*! \p operator- subtracts a given quantity from this \p iterator_facade and returns a new \p iterator_facade referring to the element at the given position before this \p iterator_facade. - * \param n The quantity to decrement - * \return An \p iterator_facade pointing \p n elements before this \p iterator_facade. - */ - __host__ __device__ - Derived operator-(difference_type n) const - { - Derived result(this->derived()); - return result -= n; - } -}; // end iterator_facade - -/*! 
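Because the comment above defers to boost::iterator_facade's documentation for examples, here is a minimal sketch of an iterator built directly on thrust::iterator_facade. The template-parameter order <Derived, Value, System, Traversal, Reference, Difference>, the any_system_tag system and the random_access_traversal_tag are assumptions inferred from the parameters named above.

#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_traits.h>
#include <cstddef>

// sketch: an iterator over an implicit range that always yields the same value
class constant42_iterator
  : public thrust::iterator_facade<constant42_iterator,                  // Derived
                                   int,                                  // Value
                                   thrust::any_system_tag,               // System (assumed)
                                   thrust::random_access_traversal_tag,  // Traversal (assumed)
                                   int,                                  // Reference, by value
                                   std::ptrdiff_t>                       // Difference
{
  public:
    __host__ __device__
    constant42_iterator(std::ptrdiff_t pos = 0) : m_pos(pos) {}

  private:
    // iterator_core_access forwards the public interface to these primitives
    friend class thrust::iterator_core_access;

    __host__ __device__ int  dereference() const { return 42; }
    __host__ __device__ bool equal(const constant42_iterator &o) const { return m_pos == o.m_pos; }
    __host__ __device__ void increment() { ++m_pos; }
    __host__ __device__ void decrement() { --m_pos; }
    __host__ __device__ void advance(std::ptrdiff_t n) { m_pos += n; }
    __host__ __device__ std::ptrdiff_t distance_to(const constant42_iterator &o) const { return o.m_pos - m_pos; }

    std::ptrdiff_t m_pos;
};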
\cond - */ - -// Comparison operators -template -inline __host__ __device__ -// XXX it might be nice to implement this at some point -//typename enable_if_interoperable::type // exposition -bool -operator ==(iterator_facade const& lhs, - iterator_facade const& rhs) -{ - return iterator_core_access - ::equal(*static_cast(&lhs), - *static_cast(&rhs)); -} - -template -inline __host__ __device__ -// XXX it might be nice to implement this at some point -//typename enable_if_interoperable::type // exposition -bool -operator !=(iterator_facade const& lhs, - iterator_facade const& rhs) -{ - return !iterator_core_access - ::equal(*static_cast(&lhs), - *static_cast(&rhs)); -} - -template -inline __host__ __device__ -// XXX it might be nice to implement this at some point -//typename enable_if_interoperable::type // exposition -bool -operator <(iterator_facade const& lhs, - iterator_facade const& rhs) -{ - return 0 > iterator_core_access - ::distance_from(*static_cast(&lhs), - *static_cast(&rhs)); -} - -template -inline __host__ __device__ -// XXX it might be nice to implement this at some point -//typename enable_if_interoperable::type // exposition -bool -operator >(iterator_facade const& lhs, - iterator_facade const& rhs) -{ - return 0 < iterator_core_access - ::distance_from(*static_cast(&lhs), - *static_cast(&rhs)); -} - -template -inline __host__ __device__ -// XXX it might be nice to implement this at some point -//typename enable_if_interoperable::type // exposition -bool -operator <=(iterator_facade const& lhs, - iterator_facade const& rhs) -{ - return 0 >= iterator_core_access - ::distance_from(*static_cast(&lhs), - *static_cast(&rhs)); -} - -template -inline __host__ __device__ -// XXX it might be nice to implement this at some point -//typename enable_if_interoperable::type // exposition -bool -operator >=(iterator_facade const& lhs, - iterator_facade const& rhs) -{ - return 0 <= iterator_core_access - ::distance_from(*static_cast(&lhs), - *static_cast(&rhs)); -} - -// Iterator difference -template -inline __host__ __device__ - -// divine the type this operator returns -typename thrust::detail::distance_from_result< - iterator_facade, - iterator_facade ->::type - -operator-(iterator_facade const& lhs, - iterator_facade const& rhs) -{ - return iterator_core_access - ::distance_from(*static_cast(&lhs), - *static_cast(&rhs)); -} - -// Iterator addition -template -inline __host__ __device__ -Derived operator+ (iterator_facade const& i, - typename Derived::difference_type n) -{ - Derived tmp(static_cast(i)); - return tmp += n; -} - -template -inline __host__ __device__ -Derived operator+ (typename Derived::difference_type n, - iterator_facade const& i) -{ - Derived tmp(static_cast(i)); - return tmp += n; -} - -/*! \endcond - */ - -/*! \} // end fancyiterators - */ - -/*! \} // end iterators - */ - -} // end thrust - diff --git a/compat/thrust/iterator/iterator_traits.h b/compat/thrust/iterator/iterator_traits.h deleted file mode 100644 index a16f219b07..0000000000 --- a/compat/thrust/iterator/iterator_traits.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file thrust/iterator/iterator_traits.h - * \brief Traits and metafunctions for reasoning about the traits of iterators - */ - -/* - * (C) Copyright David Abrahams 2003. - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -/*! \p iterator_traits is a type trait class that provides a uniform - * interface for querying the properties of iterators at compile-time. - */ -template - struct iterator_traits - : public std::iterator_traits -{ -}; // end iterator_traits - - -template struct iterator_value; - -template struct iterator_pointer; - -template struct iterator_reference; - -template struct iterator_difference; - -template struct iterator_traversal; - -template struct iterator_system; - -// TODO remove this in Thrust v1.7.0 -template - struct THRUST_DEPRECATED iterator_space -{ - typedef THRUST_DEPRECATED typename iterator_system::type type; -}; - - -} // end thrust - -#include -#include -#include -#include -#include - diff --git a/compat/thrust/iterator/permutation_iterator.h b/compat/thrust/iterator/permutation_iterator.h deleted file mode 100644 index 509097b347..0000000000 --- a/compat/thrust/iterator/permutation_iterator.h +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/iterator/permutation_iterator.h - * \brief An iterator which performs a gather or scatter operation when dereferenced - */ - -/* - * (C) Copyright Toon Knapen 2001. - * (C) Copyright David Abrahams 2003. - * (C) Copyright Roland Richter 2003. - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup iterators - * \{ - */ - -/*! \addtogroup fancyiterator Fancy Iterators - * \ingroup iterators - * \{ - */ - -/*! \p permutation_iterator is an iterator which represents a pointer into a - * reordered view of a given range. \p permutation_iterator is an imprecise name; - * the reordered view need not be a strict permutation. This iterator is useful - * for fusing a scatter or gather operation with other algorithms. 
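A small compile-time sketch of the metafunctions declared in iterator_traits.h above (iterator_value and iterator_system), assuming a C++11 host compiler for static_assert:

#include <thrust/iterator/iterator_traits.h>
#include <thrust/device_vector.h>
#include <type_traits>

typedef thrust::device_vector<float>::iterator DevIt;

// iterator_value recovers the element type of the iterator
static_assert(std::is_same<thrust::iterator_value<DevIt>::type, float>::value,
              "device_vector<float>::iterator has value_type float");

// iterator_system recovers the tag used to dispatch algorithms to a backend
typedef thrust::iterator_system<DevIt>::type DevSystem;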
- * - * This iterator takes two arguments: - * - * - an iterator to the range \c V on which the "permutation" will be applied - * - the reindexing scheme that defines how the elements of \c V will be permuted. - * - * Note that \p permutation_iterator is not limited to strict permutations of the - * given range \c V. The distance between begin and end of the reindexing iterators - * is allowed to be smaller compared to the size of the range \c V, in which case - * the \p permutation_iterator only provides a "permutation" of a subrange of \c V. - * The indices neither need to be unique. In this same context, it must be noted - * that the past-the-end \p permutation_iterator is completely defined by means of - * the past-the-end iterator to the indices. - * - * The following code snippet demonstrates how to create a \p permutation_iterator - * which represents a reordering of the contents of a \p device_vector. - * - * \code - * #include - * #include - * ... - * thrust::device_vector values(4); - * values[0] = 10.0f; - * values[1] = 20.0f; - * values[2] = 30.0f; - * values[3] = 40.0f; - * values[4] = 50.0f; - * values[5] = 60.0f; - * values[6] = 70.0f; - * values[7] = 80.0f; - * - * thrust::device_vector indices(4); - * indices[0] = 2; - * indices[1] = 6; - * indices[2] = 1; - * indices[3] = 3; - * - * typedef thrust::device_vector::iterator ElementIterator; - * typedef thrust::device_vector::iterator IndexIterator; - * - * thrust::permutation_iterator iter(values.begin(), indices.begin()); - * - * *iter; // returns 30.0f; - * iter[0]; // returns 30.0f; - * iter[1]; // returns 70.0f; - * iter[2]; // returns 20.0f; - * iter[3]; // returns 40.0f; - * - * // iter[4] is an out-of-bounds error - * - * *iter = -1.0f; // sets values[2] to -1.0f; - * iter[0] = -1.0f; // sets values[2] to -1.0f; - * iter[1] = -1.0f; // sets values[6] to -1.0f; - * iter[2] = -1.0f; // sets values[1] to -1.0f; - * iter[3] = -1.0f; // sets values[3] to -1.0f; - * - * // values is now {10, -1, -1, -1, 50, 60, -1, 80} - * \endcode - * - * \see make_permutation_iterator - */ -template - class permutation_iterator - : public thrust::detail::permutation_iterator_base< - ElementIterator, - IndexIterator - >::type -{ - /*! \cond - */ - private: - typedef typename detail::permutation_iterator_base::type super_t; - - friend class thrust::iterator_core_access; - /*! \endcond - */ - - public: - /*! Null constructor calls the null constructor of this \p permutation_iterator's - * element iterator. - */ - __host__ __device__ - permutation_iterator() - : m_element_iterator() {} - - /*! Constructor accepts an \c ElementIterator into a range of values and an - * \c IndexIterator into a range of indices defining the indexing scheme on the - * values. - * - * \param x An \c ElementIterator pointing this \p permutation_iterator's range of values. - * \param y An \c IndexIterator pointing to an indexing scheme to use on \p x. - */ - __host__ __device__ - explicit permutation_iterator(ElementIterator x, IndexIterator y) - : super_t(y), m_element_iterator(x) {} - - /*! Copy constructor accepts a related \p permutation_iterator. - * \param r A compatible \p permutation_iterator to copy from. - */ - template - __host__ __device__ - permutation_iterator(permutation_iterator const &r - // XXX remove these guards when we have static_assert - , typename detail::enable_if_convertible::type* = 0 - , typename detail::enable_if_convertible::type* = 0 - ) - : super_t(r.base()), m_element_iterator(r.m_element_iterator) - {} - - /*! 
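A short sketch of the fused gather mentioned above: copying through a pair of make_permutation_iterators gathers values[indices[i]] directly into the output, without materialising an intermediate buffer (the output range is assumed to hold at least indices.size() elements).

#include <thrust/iterator/permutation_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>

// hypothetical helper fusing a gather with a copy
void gather_via_permutation(const thrust::device_vector<float> &values,
                            const thrust::device_vector<int>   &indices,
                            thrust::device_vector<float>       &output)
{
    thrust::copy(thrust::make_permutation_iterator(values.begin(), indices.begin()),
                 thrust::make_permutation_iterator(values.begin(), indices.end()),
                 output.begin());
}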
\cond - */ - private: - __thrust_hd_warning_disable__ - __host__ __device__ - typename super_t::reference dereference() const - { - return *(m_element_iterator + *this->base()); - } - - // make friends for the copy constructor - template friend class permutation_iterator; - - ElementIterator m_element_iterator; - /*! \endcond - */ -}; // end permutation_iterator - - -/*! \p make_permutation_iterator creates a \p permutation_iterator - * from an \c ElementIterator pointing to a range of elements to "permute" - * and an \c IndexIterator pointing to a range of indices defining an indexing - * scheme on the values. - * - * \param e An \c ElementIterator pointing to a range of values. - * \param i An \c IndexIterator pointing to an indexing scheme to use on \p e. - * \return A new \p permutation_iterator which permutes the range \p e by \p i. - * \see permutation_iterator - */ -template -__host__ __device__ -permutation_iterator make_permutation_iterator(ElementIterator e, IndexIterator i) -{ - return permutation_iterator(e,i); -} - -/*! \} // end fancyiterators - */ - -/*! \} // end iterators - */ - -} // end thrust - diff --git a/compat/thrust/iterator/retag.h b/compat/thrust/iterator/retag.h deleted file mode 100644 index 660da8f2fd..0000000000 --- a/compat/thrust/iterator/retag.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/iterator/retag.h - * \brief Functionality for altering an iterator's associated system. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \ingroup iterator_tags - * \{ - */ - -#if 0 -/*! \p reinterpret_tag returns a copy of an iterator and changes the type of the result's system tag. - * \tparam Tag Any system tag. - * \tparam Iterator Any iterator type. - * \param iter The iterator of interest. - * \return An iterator of unspecified type whose system tag is \p Tag and whose behavior is otherwise - * equivalent to \p iter. - * \note Unlike \p retag, \p reinterpret_tag does not enforce that the converted-to system tag be - * related to the converted-from system tag. - * \see retag - */ -template -unspecified_iterator_type reinterpret_tag(Iterator iter); - -/*! \p retag returns a copy of an iterator and changes the type of the result's system tag. - * \tparam Tag \p Tag shall be convertible to thrust::iterator_system::type, - * or thrust::iterator_system::type is a base type of \p Tag. - * \tparam Iterator Any iterator type. - * \param iter The iterator of interest. - * \return An iterator of unspecified type whose system tag is \p Tag and whose behavior is - * otherwise equivalent to \p iter. - * \note Unlike \p reinterpret_tag, \p retag enforces that the converted-to system tag be - * related to the converted-from system tag. - * \see reinterpret_tag - */ -template -unspecified_iterator_type retag(Iterator iter); -#endif - -/*! 
\} // iterator_tags - */ - - -} // end thrust - diff --git a/compat/thrust/iterator/reverse_iterator.h b/compat/thrust/iterator/reverse_iterator.h deleted file mode 100644 index 03f03396d3..0000000000 --- a/compat/thrust/iterator/reverse_iterator.h +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file thrust/iterator/reverse_iterator.h - * \brief An iterator adaptor which adapts another iterator to traverse backwards - */ - -/* - * (C) Copyright David Abrahams 2002. - * (C) Copyright Jeremy Siek 2002. - * (C) Copyright Thomas Witt 2002. - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup iterators - * \{ - */ - -/*! \addtogroup fancyiterator Fancy Iterators - * \ingroup iterators - * \{ - */ - -/*! \p reverse_iterator is an iterator which represents a pointer into a - * reversed view of a given range. In this way, \p reverse_iterator allows - * backwards iteration through a bidirectional input range. - * - * It is important to note that although \p reverse_iterator is constructed - * from a given iterator, it points to the element preceding it. In this way, - * the past-the-end \p reverse_iterator of a given range points to the element - * preceding the first element of the input range. By the same token, the first - * \p reverse_iterator of a given range is constructed from a past-the-end iterator - * of the original range yet points to the last element of the input. - * - * The following code snippet demonstrates how to create a \p reverse_iterator - * which represents a reversed view of the contents of a \p device_vector. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(4); - * v[0] = 0.0f; - * v[1] = 1.0f; - * v[2] = 2.0f; - * v[3] = 3.0f; - * - * typedef thrust::device_vector::iterator Iterator; - * - * // note that we point the iterator to the *end* of the device_vector - * thrust::reverse_iterator iter(values.end()); - * - * *iter; // returns 3.0f; - * iter[0]; // returns 3.0f; - * iter[1]; // returns 2.0f; - * iter[2]; // returns 1.0f; - * iter[3]; // returns 0.0f; - * - * // iter[4] is an out-of-bounds error - * \endcode - * - * Since reversing a range is a common operation, containers like \p device_vector - * have nested typedefs for declaration shorthand and methods for constructing - * reverse_iterators. The following code snippet is equivalent to the previous: - * - * \code - * #include - * ... 
- * thrust::device_vector v(4); - * v[0] = 0.0f; - * v[1] = 1.0f; - * v[2] = 2.0f; - * v[3] = 3.0f; - * - * // we use the nested type reverse_iterator to refer to a reversed view of - * // a device_vector and the method rbegin() to create a reverse_iterator pointing - * // to the beginning of the reversed device_vector - * thrust::device_iterator::reverse_iterator iter = values.rbegin(); - * - * *iter; // returns 3.0f; - * iter[0]; // returns 3.0f; - * iter[1]; // returns 2.0f; - * iter[2]; // returns 1.0f; - * iter[3]; // returns 0.0f; - * - * // iter[4] is an out-of-bounds error - * - * // similarly, rend() points to the end of the reversed sequence: - * assert(values.rend() == (iter + 4)); - * \endcode - * - * Finally, the following code snippet demonstrates how to use reverse_iterator to - * perform a reversed prefix sum operation on the contents of a device_vector: - * - * \code - * #include - * #include - * ... - * thrust::device_vector v(5); - * v[0] = 0; - * v[1] = 1; - * v[2] = 2; - * v[3] = 3; - * v[4] = 4; - * - * thrust::device_vector result(5); - * - * // exclusive scan v into result in reverse - * thrust::exclusive_scan(v.rbegin(), v.rend(), result.begin()); - * - * // result is now {0, 4, 7, 9, 10} - * \endcode - * - * \see make_reverse_iterator - */ -template - class reverse_iterator - : public detail::reverse_iterator_base::type -{ - /*! \cond - */ - private: - typedef typename thrust::detail::reverse_iterator_base< - BidirectionalIterator - >::type super_t; - - friend class thrust::iterator_core_access; - /*! \endcond - */ - - public: - /*! Default constructor does nothing. - */ - __host__ __device__ - reverse_iterator(void) {} - - /*! \p Constructor accepts a \c BidirectionalIterator pointing to a range - * for this \p reverse_iterator to reverse. - * - * \param x A \c BidirectionalIterator pointing to a range to reverse. - */ - __host__ __device__ - explicit reverse_iterator(BidirectionalIterator x); - - /*! \p Copy constructor allows construction from a related compatible - * \p reverse_iterator. - * - * \param r A \p reverse_iterator to copy from. - */ - template - __host__ __device__ - reverse_iterator(reverse_iterator const &r -// XXX msvc screws this up -// XXX remove these guards when we have static_assert -#ifndef _MSC_VER - , typename thrust::detail::enable_if< - thrust::detail::is_convertible< - OtherBidirectionalIterator, - BidirectionalIterator - >::value - >::type * = 0 -#endif // _MSC_VER - ); - - /*! \cond - */ - private: - __thrust_hd_warning_disable__ - __host__ __device__ - typename super_t::reference dereference(void) const; - - __host__ __device__ - void increment(void); - - __host__ __device__ - void decrement(void); - - __host__ __device__ - void advance(typename super_t::difference_type n); - - template - __host__ __device__ - typename super_t::difference_type - distance_to(reverse_iterator const &y) const; - /*! \endcond - */ -}; // end reverse_iterator - - -/*! \p make_reverse_iterator creates a \p reverse_iterator - * from a \c BidirectionalIterator pointing to a range of elements to reverse. - * - * \param x A \c BidirectionalIterator pointing to a range to reverse. - * \return A new \p reverse_iterator which reverses the range \p x. - */ -template -__host__ __device__ -reverse_iterator make_reverse_iterator(BidirectionalIterator x); - - -/*! \} // end fancyiterators - */ - -/*! 
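A usage sketch for the make_reverse_iterator helper declared above, assuming the usual thrust::reduce interface; reversing the range does not change the sum, the point is only how the reversed range is built from end() and begin().

#include <thrust/iterator/reverse_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

// hypothetical helper: sum a device_vector while traversing it back to front
float sum_reversed(const thrust::device_vector<float> &v)
{
    return thrust::reduce(thrust::make_reverse_iterator(v.end()),    // the last element
                          thrust::make_reverse_iterator(v.begin())); // one past the first
}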
\} // end iterators - */ - -} // end thrust - -#include - diff --git a/compat/thrust/iterator/transform_iterator.h b/compat/thrust/iterator/transform_iterator.h deleted file mode 100644 index 985b61b775..0000000000 --- a/compat/thrust/iterator/transform_iterator.h +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file thrust/iterator/transform_iterator.h - * \brief An iterator which adapts another iterator by applying a function to the result of its dereference - */ - -/* - * (C) Copyright David Abrahams 2002. - * (C) Copyright Jeremy Siek 2002. - * (C) Copyright Thomas Witt 2002. - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - -#pragma once - -#include - -// #include the details first -#include -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup iterators - * \{ - */ - -/*! \addtogroup fancyiterator Fancy Iterators - * \ingroup iterators - * \{ - */ - -/*! \p transform_iterator is an iterator which represents a pointer into a range - * of values after transformation by a function. This iterator is useful for - * creating a range filled with the result of applying an operation to another range - * without either explicitly storing it in memory, or explicitly executing the transformation. - * Using \p transform_iterator facilitates kernel fusion by deferring the execution - * of a transformation until the value is needed while saving both memory capacity - * and bandwidth. - * - * The following code snippet demonstrates how to create a \p transform_iterator - * which represents the result of \c sqrtf applied to the contents of a \p device_vector. - * - * \code - * #include - * #include - * - * // note: functor inherits from unary_function - * struct square_root : public thrust::unary_function - * { - * __host__ __device__ - * float operator()(float x) const - * { - * return sqrtf(x); - * } - * }; - * - * int main(void) - * { - * thrust::device_vector v(4); - * v[0] = 1.0f; - * v[1] = 4.0f; - * v[2] = 9.0f; - * v[3] = 16.0f; - * - * typedef thrust::device_vector::iterator FloatIterator; - * - * thrust::transform_iterator iter(v.begin(), square_root()); - * - * *iter; // returns 1.0f - * iter[0]; // returns 1.0f; - * iter[1]; // returns 2.0f; - * iter[2]; // returns 3.0f; - * iter[3]; // returns 4.0f; - * - * // iter[4] is an out-of-bounds error - * } - * \endcode - * - * This next example demonstrates how to use a \p transform_iterator with the - * \p thrust::reduce function to compute the sum of squares of a sequence. 
- * We will create temporary \p transform_iterators with the - * \p make_transform_iterator function in order to avoid explicitly specifying their type: - * - * \code - * #include - * #include - * #include - * #include - * - * // note: functor inherits from unary_function - * struct square : public thrust::unary_function - * { - * __host__ __device__ - * float operator()(float x) const - * { - * return x * x; - * } - * }; - * - * int main(void) - * { - * // initialize a device array - * thrust::device_vector v(4); - * v[0] = 1.0f; - * v[1] = 2.0f; - * v[2] = 3.0f; - * v[3] = 4.0f; - * - * float sum_of_squares = - * thrust::reduce(thrust::make_transform_iterator(v.begin(), square()), - * thrust::make_transform_iterator(v.end(), square())); - * - * std::cout << "sum of squares: " << sum_of_squares << std::endl; - * return 0; - * } - * \endcode - * - * Note that in the previous two examples the transform functor (namely \c square_root - * and \c square) inherits from \c thrust::unary_function. Inheriting from - * \c thrust::unary_function ensures that a functor is a valid \c AdaptableUnaryFunction - * and provides all the necessary \c typedef declarations. The \p transform_iterator - * can also be applied to a \c UnaryFunction that does not inherit from - * \c thrust::unary_function using an optional template argument. The following example - * illustrates how to use the third template argument to specify the \c result_type of - * the function. - * - * \code - * #include - * #include - * - * // note: functor *does not* inherit from unary_function - * struct square_root - * { - * __host__ __device__ - * float operator()(float x) const - * { - * return sqrtf(x); - * } - * }; - * - * int main(void) - * { - * thrust::device_vector v(4); - * v[0] = 1.0f; - * v[1] = 4.0f; - * v[2] = 9.0f; - * v[3] = 16.0f; - * - * typedef thrust::device_vector::iterator FloatIterator; - * - * // note: float result_type is specified explicitly - * thrust::transform_iterator iter(v.begin(), square_root()); - * - * *iter; // returns 1.0f - * iter[0]; // returns 1.0f; - * iter[1]; // returns 2.0f; - * iter[2]; // returns 3.0f; - * iter[3]; // returns 4.0f; - * - * // iter[4] is an out-of-bounds error - * } - * \endcode - * - * \see make_transform_iterator - */ -template - class transform_iterator - : public detail::transform_iterator_base::type -{ - /*! \cond - */ - public: - typedef typename - detail::transform_iterator_base::type - super_t; - - friend class thrust::iterator_core_access; - /*! \endcond - */ - - public: - /*! Null constructor does nothing. - */ - __host__ __device__ - transform_iterator() {} - - /*! This constructor takes as arguments an \c Iterator and an \c AdaptableUnaryFunction - * and copies them to a new \p transform_iterator. - * - * \param x An \c Iterator pointing to the input to this \p transform_iterator's \c AdaptableUnaryFunction. - * \param f An \c AdaptableUnaryFunction used to transform the objects pointed to by \p x. - */ - __host__ __device__ - transform_iterator(Iterator const& x, AdaptableUnaryFunction f) - : super_t(x), m_f(f) { - } - - /*! This explicit constructor copies the value of a given \c Iterator and creates - * this \p transform_iterator's \c AdaptableUnaryFunction using its null constructor. - * - * \param x An \c Iterator to copy. - */ - __host__ __device__ - explicit transform_iterator(Iterator const& x) - : super_t(x) { } - - /*! This copy constructor creates a new \p transform_iterator from another - * \p transform_iterator. 
- * - * \param other The \p transform_iterator to copy. - */ - template - __host__ __device__ - transform_iterator(const transform_iterator &other, - typename thrust::detail::enable_if_convertible::type* = 0, - typename thrust::detail::enable_if_convertible::type* = 0) - : super_t(other.base()), m_f(other.functor()) {} - - /*! Copy assignment operator copies from another \p transform_iterator. - * \p other The other \p transform_iterator to copy - * \return *this - * - * \note If the type of this \p transform_iterator's functor is not copy assignable - * (for example, if it is a lambda) it is not an error to call this function. - * In this case, however, the functor will not be modified. - * - * In any case, this \p transform_iterator's underlying iterator will be copy assigned. - */ - __host__ __device__ - transform_iterator &operator=(const transform_iterator &other) - { - return do_assign(other, - // XXX gcc 4.2.1 crashes on is_copy_assignable; just assume the functor is assignable as a WAR -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION <= 40201) - thrust::detail::true_type() -#else - typename thrust::detail::is_copy_assignable::type() -#endif // THRUST_HOST_COMPILER - ); - } - - /*! This method returns a copy of this \p transform_iterator's \c AdaptableUnaryFunction. - * \return A copy of this \p transform_iterator's \c AdaptableUnaryFunction. - */ - __host__ __device__ - AdaptableUnaryFunction functor() const - { return m_f; } - - /*! \cond - */ - private: - __host__ __device__ - transform_iterator &do_assign(const transform_iterator &other, thrust::detail::true_type) - { - super_t::operator=(other); - - // do assign to m_f - m_f = other.functor(); - - return *this; - } - - __host__ __device__ - transform_iterator &do_assign(const transform_iterator &other, thrust::detail::false_type) - { - super_t::operator=(other); - - // don't assign to m_f - - return *this; - } - - __thrust_hd_warning_disable__ - __host__ __device__ - typename super_t::reference dereference() const - { - // XXX consider making this a member instead of a temporary created inside dereference - thrust::detail::host_device_function wrapped_f(m_f); - - return wrapped_f(*this->base()); - } - - // tag this as mutable per Dave Abrahams in this thread: - // http://lists.boost.org/Archives/boost/2004/05/65332.php - mutable AdaptableUnaryFunction m_f; - - /*! \endcond - */ -}; // end transform_iterator - - -/*! \p make_transform_iterator creates a \p transform_iterator - * from an \c Iterator and \c AdaptableUnaryFunction. - * - * \param it The \c Iterator pointing to the input range of the - * newly created \p transform_iterator. - * \param fun The \c AdaptableUnaryFunction used to transform the range pointed - * to by \p it in the newly created \p transform_iterator. - * \return A new \p transform_iterator which transforms the range at - * \p it by \p fun. - * \see transform_iterator - */ -template -inline __host__ __device__ -transform_iterator -make_transform_iterator(Iterator it, AdaptableUnaryFunction fun) -{ - return transform_iterator(it, fun); -} // end make_transform_iterator - -/*! \} // end fancyiterators - */ - -/*! 
\} // end iterators - */ - -} // end thrust - diff --git a/compat/thrust/iterator/zip_iterator.h b/compat/thrust/iterator/zip_iterator.h deleted file mode 100644 index 8e7299c407..0000000000 --- a/compat/thrust/iterator/zip_iterator.h +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file thrust/iterator/zip_iterator.h - * \brief An iterator which returns a tuple of the result of dereferencing - * a tuple of iterators when dereferenced - */ - -/* - * Copyright David Abrahams and Thomas Becker 2000-2006. - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup iterators - * \{ - */ - -/*! \addtogroup fancyiterator Fancy Iterators - * \ingroup iterators - * \{ - */ - -/*! \p zip_iterator is an iterator which represents a pointer into a range - * of \p tuples whose elements are themselves taken from a \p tuple of input - * iterators. This iterator is useful for creating a virtual array of structures - * while achieving the same performance and bandwidth as the structure of arrays - * idiom. \p zip_iterator also facilitates kernel fusion by providing a convenient - * means of amortizing the execution of the same operation over multiple ranges. - * - * The following code snippet demonstrates how to create a \p zip_iterator - * which represents the result of "zipping" multiple ranges together. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector int_v(3); - * int_v[0] = 0; int_v[1] = 1; int_v[2] = 2; - * - * thrust::device_vector float_v(3); - * float_v[0] = 0.0f; float_v[1] = 1.0;f float_v[2] = 2.0f; - * - * thrust::device_vector char_v(3); - * char_v[0] = 'a'; char_v[1] = 'b'; char_v[2] = 'c'; - * - * // typedef these iterators for shorthand - * typedef thrust::device_vector::iterator IntIterator; - * typedef thrust::device_vector::iterator FloatIterator; - * typedef thrust::device_vector::iterator CharIterator; - * - * // typedef a tuple of these iterators - * typedef thrust::tuple IteratorTuple; - * - * // typedef the zip_iterator of this tuple - * typedef thrust::zip_iterator ZipIterator; - * - * // finally, create the zip_iterator - * ZipIterator iter(thrust::make_tuple(int_v.begin(), float_v.begin(), char_v.begin())); - * - * *iter; // returns (0, 0.0f, 'a') - * iter[0]; // returns (0, 0.0f, 'a') - * iter[1]; // returns (1, 1.0f, 'b') - * iter[2]; // returns (2, 2.0f, 'c') - * - * thrust::get<0>(iter[2]); // returns 2 - * thrust::get<1>(iter[0]); // returns 0.0f - * thrust::get<2>(iter[1]); // returns 'b' - * - * // iter[3] is an out-of-bounds error - * \endcode - * - * Defining the type of a \p zip_iterator can be complex. 
The next code example demonstrates - * how to use the \p make_zip_iterator function with the \p make_tuple function to avoid - * explicitly specifying the type of the \p zip_iterator. This example shows how to use - * \p zip_iterator to copy multiple ranges with a single call to \p thrust::copy. - * - * \code - * #include - * #include - * #include - * - * int main(void) - * { - * thrust::device_vector int_in(3), int_out(3); - * int_in[0] = 0; - * int_in[1] = 1; - * int_in[2] = 2; - * - * thrust::device_vector float_in(3), float_out(3); - * float_in[0] = 0.0f; - * float_in[1] = 10.0f; - * float_in[2] = 20.0f; - * - * thrust::copy(thrust::make_zip_iterator(thrust::make_tuple(int_in.begin(), float_in.begin())), - * thrust::make_zip_iterator(thrust::make_tuple(int_in.end(), float_in.end())), - * thrust::make_zip_iterator(thrust::make_tuple(int_out.begin(),float_out.begin()))); - * - * // int_out is now [0, 1, 2] - * // float_out is now [0.0f, 10.0f, 20.0f] - * - * return 0; - * } - * \endcode - * - * \see make_zip_iterator - * \see make_tuple - * \see tuple - * \see get - */ -template - class zip_iterator - : public detail::zip_iterator_base::type -{ - public: - /*! Null constructor does nothing. - */ - inline __host__ __device__ - zip_iterator(void); - - /*! This constructor creates a new \p zip_iterator from a - * \p tuple of iterators. - * - * \param iterator_tuple The \p tuple of iterators to copy from. - */ - inline __host__ __device__ - zip_iterator(IteratorTuple iterator_tuple); - - /*! This copy constructor creates a new \p zip_iterator from another - * \p zip_iterator. - * - * \param other The \p zip_iterator to copy. - */ - template - inline __host__ __device__ - zip_iterator(const zip_iterator &other, - typename thrust::detail::enable_if_convertible< - OtherIteratorTuple, - IteratorTuple - >::type * = 0); - - /*! This method returns a \c const reference to this \p zip_iterator's - * \p tuple of iterators. - * - * \return A \c const reference to this \p zip_iterator's \p tuple - * of iterators. - */ - inline __host__ __device__ - const IteratorTuple &get_iterator_tuple() const; - - /*! \cond - */ - private: - typedef typename - detail::zip_iterator_base::type super_t; - - friend class thrust::iterator_core_access; - - // Dereferencing returns a tuple built from the dereferenced - // iterators in the iterator tuple. - __host__ __device__ - typename super_t::reference dereference() const; - - // Two zip_iterators are equal if the two first iterators of the - // tuple are equal. Note this differs from Boost's implementation, which - // considers the entire tuple. - template - inline __host__ __device__ - bool equal(const zip_iterator &other) const; - - // Advancing a zip_iterator means to advance all iterators in the tuple - inline __host__ __device__ - void advance(typename super_t::difference_type n); - - // Incrementing a zip iterator means to increment all iterators in the tuple - inline __host__ __device__ - void increment(); - - // Decrementing a zip iterator means to decrement all iterators in the tuple - inline __host__ __device__ - void decrement(); - - // Distance is calculated using the first iterator in the tuple. - template - inline __host__ __device__ - typename super_t::difference_type - distance_to(const zip_iterator &other) const; - - // The iterator tuple. - IteratorTuple m_iterator_tuple; - - /*! \endcond - */ -}; // end zip_iterator - -/*! \p make_zip_iterator creates a \p zip_iterator from a \p tuple - * of iterators. 
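A sketch of the kernel fusion described above: a dot product computed by zipping two ranges and reducing over the per-element products in a single pass, assuming the usual thrust::transform_reduce interface.

#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/tuple.h>

// multiply the two components of a zipped element
struct mul_components : public thrust::unary_function<thrust::tuple<float,float>, float>
{
    __host__ __device__
    float operator()(const thrust::tuple<float,float> &t) const
    {
        return thrust::get<0>(t) * thrust::get<1>(t);
    }
};

// hypothetical helper: dot product without a temporary vector of products
float dot(const thrust::device_vector<float> &a, const thrust::device_vector<float> &b)
{
    return thrust::transform_reduce(
        thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(a.end(),   b.end())),
        mul_components(), 0.0f, thrust::plus<float>());
}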
- * - * \param t The \p tuple of iterators to copy. - * \return A newly created \p zip_iterator which zips the iterators encapsulated in \p t. - * - * \see zip_iterator - */ -template -inline __host__ __device__ -zip_iterator make_zip_iterator(IteratorTuple t); - -/*! \} // end fancyiterators - */ - -/*! \} // end iterators - */ - -} // end thrust - -#include - diff --git a/compat/thrust/logical.h b/compat/thrust/logical.h deleted file mode 100644 index 21510f3f21..0000000000 --- a/compat/thrust/logical.h +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file logical.h - * \brief Logical operations on ranges - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup reductions - * \{ - * \addtogroup logical - * \ingroup reductions - * \{ - */ - - -/*! \p all_of determines whether all elements in a range satify a predicate. - * Specifically, \p all_of returns \c true if pred(*i) is \c true - * for every iterator \c i in the range [first, last) and - * \c false otherwise. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param pred A predicate used to test range elements. - * \return \c true, if all elements satisfy the predicate; \c false, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * \tparam Predicate must be a model of Predicate. - * - * \code - * #include - * #include - * #include - * ... - * bool A[3] = {true, true, false}; - * - * thrust::all_of(thrust::host, A, A + 2, thrust::identity()); // returns true - * thrust::all_of(thrust::host, A, A + 3, thrust::identity()); // returns false - * - * // empty range - * thrust::all_of(thrust::host, A, A, thrust::identity()); // returns false - * - * \endcode - * - * \see any_of - * \see none_of - * \see transform_reduce - */ -template -bool all_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred); - - -/*! \p all_of determines whether all elements in a range satify a predicate. - * Specifically, \p all_of returns \c true if pred(*i) is \c true - * for every iterator \c i in the range [first, last) and - * \c false otherwise. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param pred A predicate used to test range elements. - * \return \c true, if all elements satisfy the predicate; \c false, otherwise. - * - * \tparam InputIterator is a model of Input Iterator, - * \tparam Predicate must be a model of Predicate. - * - * \code - * #include - * #include - * ... 
- * bool A[3] = {true, true, false}; - * - * thrust::all_of(A, A + 2, thrust::identity()); // returns true - * thrust::all_of(A, A + 3, thrust::identity()); // returns false - * - * // empty range - * thrust::all_of(A, A, thrust::identity()); // returns false - * - * \endcode - * - * \see any_of - * \see none_of - * \see transform_reduce - */ -template -bool all_of(InputIterator first, InputIterator last, Predicate pred); - - -/*! \p any_of determines whether any element in a range satifies a predicate. - * Specifically, \p any_of returns \c true if pred(*i) is \c true - * for any iterator \c i in the range [first, last) and - * \c false otherwise. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param pred A predicate used to test range elements. - * \return \c true, if any element satisfies the predicate; \c false, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * \tparam Predicate must be a model of Predicate. - * - * \code - * #include - * #include - * #include - * ... - * bool A[3] = {true, true, false}; - * - * thrust::any_of(thrust::host, A, A + 2, thrust::identity()); // returns true - * thrust::any_of(thrust::host, A, A + 3, thrust::identity()); // returns true - * - * thrust::any_of(thrust::host, A + 2, A + 3, thrust::identity()); // returns false - * - * // empty range - * thrust::any_of(thrust::host, A, A, thrust::identity()); // returns false - * \endcode - * - * \see all_of - * \see none_of - * \see transform_reduce - */ -template -bool any_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred); - - -/*! \p any_of determines whether any element in a range satifies a predicate. - * Specifically, \p any_of returns \c true if pred(*i) is \c true - * for any iterator \c i in the range [first, last) and - * \c false otherwise. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param pred A predicate used to test range elements. - * \return \c true, if any element satisfies the predicate; \c false, otherwise. - * - * \tparam InputIterator is a model of Input Iterator, - * \tparam Predicate must be a model of Predicate. - * - * \code - * #include - * #include - * ... - * bool A[3] = {true, true, false}; - * - * thrust::any_of(A, A + 2, thrust::identity()); // returns true - * thrust::any_of(A, A + 3, thrust::identity()); // returns true - * - * thrust::any_of(A + 2, A + 3, thrust::identity()); // returns false - * - * // empty range - * thrust::any_of(A, A, thrust::identity()); // returns false - * \endcode - * - * \see all_of - * \see none_of - * \see transform_reduce - */ -template -bool any_of(InputIterator first, InputIterator last, Predicate pred); - - -/*! \p none_of determines whether no element in a range satifies a predicate. - * Specifically, \p none_of returns \c true if there is no iterator \c i in - * the range [first, last) such that pred(*i) is \c true, - * and \c false otherwise. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param pred A predicate used to test range elements. 
- * \return \c true, if no element satisfies the predicate; \c false, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * \tparam Predicate must be a model of Predicate. - * - * \code - * #include - * #include - * #include - * ... - * bool A[3] = {true, true, false}; - * - * thrust::none_of(thrust::host, A, A + 2, thrust::identity()); // returns false - * thrust::none_of(thrust::host, A, A + 3, thrust::identity()); // returns false - * - * thrust::none_of(thrust::host, A + 2, A + 3, thrust::identity()); // returns true - * - * // empty range - * thrust::none_of(thrust::host, A, A, thrust::identity()); // returns true - * \endcode - * - * \see all_of - * \see any_of - * \see transform_reduce - */ -template -bool none_of(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, Predicate pred); - - -/*! \p none_of determines whether no element in a range satifies a predicate. - * Specifically, \p none_of returns \c true if there is no iterator \c i in - * the range [first, last) such that pred(*i) is \c true, - * and \c false otherwise. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param pred A predicate used to test range elements. - * \return \c true, if no element satisfies the predicate; \c false, otherwise. - * - * \tparam InputIterator is a model of Input Iterator, - * \tparam Predicate must be a model of Predicate. - * - * \code - * #include - * #include - * ... - * bool A[3] = {true, true, false}; - * - * thrust::none_of(A, A + 2, thrust::identity()); // returns false - * thrust::none_of(A, A + 3, thrust::identity()); // returns false - * - * thrust::none_of(A + 2, A + 3, thrust::identity()); // returns true - * - * // empty range - * thrust::none_of(A, A, thrust::identity()); // returns true - * \endcode - * - * \see all_of - * \see any_of - * \see transform_reduce - */ -template -bool none_of(InputIterator first, InputIterator last, Predicate pred); - - -/*! \} // end logical - * \} // end reductions - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/memory.h b/compat/thrust/memory.h deleted file mode 100644 index 6362de4064..0000000000 --- a/compat/thrust/memory.h +++ /dev/null @@ -1,538 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/memory.h - * \brief Abstractions for Thrust's memory model. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup memory_management Memory Management - * \addtogroup memory_management_classes Memory Management Classes - * \ingroup memory_management - * \{ - */ - -/*! \p pointer stores a pointer to an object allocated in memory. Like \p device_ptr, this - * type ensures type safety when dispatching standard algorithms on ranges resident in memory. 
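Returning briefly to the all_of / any_of / none_of reductions from the deleted logical.h above, the calls below are a minimal sketch of their documented behaviour, assuming the stock Thrust API; the array A mirrors the header's own examples.

#include <thrust/logical.h>
#include <thrust/functional.h>
#include <cassert>

int main(void)
{
    bool A[3] = {true, true, false};
    thrust::identity<bool> is_true;   // predicate: returns its argument unchanged

    assert( thrust::all_of (A, A + 2, is_true));        // first two elements are true
    assert(!thrust::all_of (A, A + 3, is_true));        // third element is false
    assert( thrust::any_of (A, A + 3, is_true));        // at least one element is true
    assert(!thrust::none_of(A, A + 3, is_true));        // ... so none_of is false
    assert( thrust::none_of(A + 2, A + 3, is_true));    // the tail holds no true element
    return 0;
}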
- * - * \p pointer generalizes \p device_ptr by relaxing the backend system associated with the \p pointer. - * Instead of the backend system specified by \p THRUST_DEFAULT_DEVICE_BACKEND, \p pointer's - * system is given by its second template parameter, \p Tag. For the purpose of Thrust dispatch, - * device_ptr and pointer are considered equivalent. - * - * The raw pointer encapsulated by a \p pointer may be obtained through its get member function - * or the \p raw_pointer_cast free function. - * - * \tparam Element specifies the type of the pointed-to object. - * - * \tparam Tag specifies the system with which this \p pointer is associated. This may be any Thrust - * backend system, or a user-defined tag. - * - * \tparam Reference allows the client to specify the reference type returned upon derereference. - * By default, this type is reference. - * - * \tparam Derived allows the client to specify the name of the derived type when \p pointer is used as - * a base class. This is useful to ensure that arithmetic on values of the derived type return - * values of the derived type as a result. By default, this type is pointer. - * - * \note \p pointer is not a smart pointer; it is the client's responsibility to deallocate memory - * pointer to by \p pointer. - * - * \see device_ptr - * \see reference - * \see raw_pointer_cast - */ -// define pointer for the purpose of Doxygenating it -// it is actually defined elsewhere -#if 0 -template - class pointer -{ - public: - /*! The type of the raw pointer - */ - typedef typename super_t::base_type raw_pointer; - - /*! \p pointer's default constructor initializes its encapsulated pointer to \c 0 - */ - __host__ __device__ - pointer(); - - /*! This constructor allows construction of a pointer from a T*. - * - * \param ptr A raw pointer to copy from, presumed to point to a location in \p Tag's memory. - * \tparam OtherElement \p OtherElement shall be convertible to \p Element. - */ - template - __host__ __device__ - explicit pointer(OtherElement *ptr); - - /*! This contructor allows initialization from another pointer-like object. - * - * \param other The \p OtherPointer to copy. - * - * \tparam OtherPointer The tag associated with \p OtherPointer shall be convertible to \p Tag, - * and its element type shall be convertible to \p Element. - */ - template - __host__ __device__ - pointer(const OtherPointer &other, - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer - >::type * = 0); - - /*! Assignment operator allows assigning from another pointer-like object with related type. - * - * \param other The other pointer-like object to assign from. - * \return *this - * - * \tparam OtherPointer The tag associated with \p OtherPointer shall be convertible to \p Tag, - * and its element type shall be convertible to \p Element. - */ - template - __host__ __device__ - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer, - derived_type & - >::type - operator=(const OtherPointer &other); - - /*! \p get returns this \p pointer's encapsulated raw pointer. - * \return This \p pointer's raw pointer. - */ - __host__ __device__ - Element *get() const; -}; -#endif - -/*! \p reference is a wrapped reference to an object stored in memory. \p reference generalizes - * \p device_reference by relaxing the type of pointer associated with the object. 
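A tagged pointer of this kind is normally obtained from an allocation routine rather than constructed by hand. The sketch below assumes the thrust::malloc, thrust::free and raw_pointer_cast functions documented further down in this header; the size N is illustrative.

#include <thrust/memory.h>
#include <cstddef>

int main(void)
{
    const std::size_t N = 16;
    thrust::device_system_tag device_sys;

    // typed, uninitialized storage associated with the device system
    thrust::pointer<int, thrust::device_system_tag> p =
        thrust::malloc<int>(device_sys, N);

    // unwrap the raw pointer, e.g. to pass it to a plain CUDA kernel
    int *raw = thrust::raw_pointer_cast(p);
    (void) raw;

    thrust::free(device_sys, p);
    return 0;
}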
\p reference - * is the type of the result of dereferencing a tagged pointer-like object such as \p pointer, and - * intermediates operations on objects existing in a remote memory. - * - * \tparam Element specifies the type of the referent object. - * \tparam Pointer specifies the type of the result of taking the address of \p reference. - * \tparam Derived allows the client to specify the name of the derived type when \p reference is used as - * a base class. This is useful to ensure that assignment to objects of the derived type return - * values of the derived type as a result. By default, this type is reference. - */ -// define pointer for the purpose of Doxygenating it -// it is actually defined elsewhere -#if 0 -template - class reference -{ - public: - /*! The type of this \p reference's wrapped pointers. - */ - typedef Pointer pointer; - - /*! The \p value_type of this \p reference. - */ - typedef typename thrust::detail::remove_const::type value_type; - - /*! This copy constructor initializes this \p reference - * to refer to an object pointed to by the given \p pointer. After - * this \p reference is constructed, it shall refer to the - * object pointed to by \p ptr. - * - * \param ptr A \p pointer to copy from. - */ - __host__ __device__ - explicit reference(const pointer &ptr); - - /*! This copy constructor accepts a const reference to another - * \p reference of related type. After this \p reference is constructed, - * it shall refer to the same object as \p other. - * - * \param other A \p reference to copy from. - * \tparam OtherElement the element type of the other \p reference. - * \tparam OtherPointer the pointer type of the other \p reference. - * \tparam OtherDerived the derived type of the other \p reference. - * - * \note This constructor is templated primarily to allow initialization of - * reference from reference. - */ - template - __host__ __device__ - reference(const reference &other, - typename thrust::detail::enable_if_convertible< - typename reference::pointer, - pointer - >::type * = 0); - - /*! Copy assignment operator copy assigns from another \p reference. - * - * \param other The other \p reference to assign from. - * \return static_cast(*this) - */ - __host__ __device__ - derived_type &operator=(const reference &other); - - /*! Assignment operator copy assigns from another \p reference of related type. - * - * \param other The other \p reference to assign from. - * \return static_cast(*this) - * - * \tparam OtherElement the element type of the other \p reference. - * \tparam OtherPointer the pointer type of the other \p reference. - * \tparam OtherDerived the derived type of the other \p reference. - */ - template - __host__ __device__ - derived_type &operator=(const reference &other); - - /*! Assignment operator assigns from a \p value_type. - * - * \param x The \p value_type to assign from. - * \return static_cast(*this). - */ - __host__ __device__ - derived_type &operator=(const value_type &x); - - /*! Address-of operator returns a \p pointer pointing to the object - * referenced by this \p reference. It does not return the address of this - * \p reference. - * - * \return A \p pointer pointing to the referenct object. - */ - __host__ __device__ - pointer operator&() const; - - /*! Conversion operator converts this \p reference to \p value_type by - * returning a copy of the referent object. - * - * \return A copy of the referent object. - */ - __host__ __device__ - operator value_type () const; - - /*! 
Swaps the value of the referent object with another. - * - * \param other The other \p reference with which to swap. - * \note The argument is of type \p derived_type rather than \p reference. - */ - __host__ __device__ - void swap(derived_type &other); - - /*! Prefix increment operator increments the referent object. - * - * \return static_Cast(*this). - * - * \note Documentation for other arithmetic operators omitted for brevity. - */ - derived_type &operator++(); -}; -#endif - -/*! \} - */ - -/*! - * \addtogroup memory_management_functions Memory Management Functions - * \ingroup memory_management - * \{ - */ - - -/*! \addtogroup allocation_functions - * \{ - */ - - -/*! This version of \p malloc allocates untyped uninitialized storage associated with a given system. - * - * \param system The Thrust system with which to associate the storage. - * \param n The number of bytes of storage to allocate. - * \return If allocation succeeds, a pointer to the allocated storage; a null pointer otherwise. - * The pointer must be deallocated with \p thrust::free. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * - * \pre \p DerivedPolicy must be publically derived from thrust::execution_policy. - * - * The following code snippet demonstrates how to use \p malloc to allocate a range of memory - * associated with Thrust's device system. - * - * \code - * #include - * ... - * // allocate some memory with thrust::malloc - * const int N = 100; - * thrust::device_system_tag device_sys; - * thrust::pointer void_ptr = thrust::malloc(device_sys, N); - * - * // manipulate memory - * ... - * - * // deallocate void_ptr with thrust::free - * thrust::free(device_sys, void_ptr); - * \endcode - * - * \see free - * \see device_malloc - */ -template -pointer malloc(const thrust::detail::execution_policy_base &system, std::size_t n); - - -/*! This version of \p malloc allocates typed uninitialized storage associated with a given system. - * - * \param system The Thrust system with which to associate the storage. - * \param n The number of elements of type \c T which the storage should accomodate. - * \return If allocation succeeds, a pointer to an allocation large enough to accomodate \c n - * elements of type \c T; a null pointer otherwise. - * The pointer must be deallocated with \p thrust::free. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * - * \pre \p DerivedPolicy must be publically derived from thrust::execution_policy. - * - * The following code snippet demonstrates how to use \p malloc to allocate a range of memory - * to accomodate integers associated with Thrust's device system. - * - * \code - * #include - * ... - * // allocate storage for 100 ints with thrust::malloc - * const int N = 100; - * thrust::device_system_tag device_sys; - * thrust::pointer ptr = thrust::malloc(device_sys, N); - * - * // manipulate memory - * ... - * - * // deallocate ptr with thrust::free - * thrust::free(device_sys, ptr); - * \endcode - * - * \see free - * \see device_malloc - */ -template -pointer malloc(const thrust::detail::execution_policy_base &system, std::size_t n); - - -/*! \p get_temporary_buffer returns a pointer to storage associated with a given Thrust system sufficient to store up to - * \p n objects of type \c T. If not enough storage is available to accomodate \p n objects, an implementation may return - * a smaller buffer. The number of objects the returned buffer can accomodate is also returned. 
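A compilable sketch of the get_temporary_buffer / return_temporary_buffer round trip described here, assuming the two-argument return_temporary_buffer documented in this header; the buffer size of 100 ints is illustrative.

#include <thrust/memory.h>
#include <thrust/pair.h>
#include <cstddef>

int main(void)
{
    const std::ptrdiff_t N = 100;
    thrust::device_system_tag device_sys;

    // request temporary storage for up to N ints from the device system;
    // the second member of the pair reports how many ints were actually obtained
    thrust::pair<thrust::pointer<int, thrust::device_system_tag>, std::ptrdiff_t>
        buf = thrust::get_temporary_buffer<int>(device_sys, N);

    // ... use buf.first[0] .. buf.first[buf.second - 1] ...

    thrust::return_temporary_buffer(device_sys, buf.first);
    return 0;
}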
- * - * Thrust uses \p get_temporary_buffer internally when allocating temporary storage required by algorithm implementations. - * - * The storage allocated with \p get_temporary_buffer must be returned to the system with \p return_temporary_buffer. - * - * \param system The Thrust system with which to associate the storage. - * \param n The requested number of objects of type \c T the storage should accomodate. - * \return A pair \c p such that p.first is a pointer to the allocated storage and p.second is the number of - * contiguous objects of type \c T that the storage can accomodate. If no storage can be allocated, p.first if - * no storage can be obtained. The storage must be returned to the system using \p return_temporary_buffer. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * - * \pre \p DerivedPolicy must be publically derived from thrust::execution_policy. - * - * The following code snippet demonstrates how to use \p get_temporary_buffer to allocate a range of memory - * to accomodate integers associated with Thrust's device system. - * - * \code - * #include - * ... - * // allocate storage for 100 ints with thrust::get_temporary_buffer - * const int N = 100; - * - * typedef thrust::pair< - * thrust::pointer, - * std::ptrdiff_t - * > ptr_and_size_t; - * - * thrust::device_system_tag device_sys; - * ptr_and_size_t ptr_and_size = thrust::get_temporary_buffer(device_sys, N); - * - * // manipulate up to 100 ints - * for(int i = 0; i < ptr_and_size.second; ++i) - * { - * *ptr_and_size.first = i; - * } - * - * // deallocate storage with thrust::return_temporary_buffer - * thrust::return_temporary_buffer(device_sys, ptr_and_size.first); - * \endcode - * - * \see malloc - * \see return_temporary_buffer - */ -template -thrust::pair, typename thrust::pointer::difference_type> -get_temporary_buffer(const thrust::detail::execution_policy_base &system, typename thrust::pointer::difference_type n); - - -/*! \} allocation_functions - */ - - -/*! \addtogroup deallocation_functions - * \{ - */ - - -/*! \p free deallocates the storage previously allocated by \p thrust::malloc. - * - * \param system The Thrust system with which the storage is associated. - * \param ptr A pointer previously returned by \p thrust::malloc. If \p ptr is null, \p free - * does nothing. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * - * \pre \p ptr shall have been returned by a previous call to thrust::malloc(system, n) or thrust::malloc(system, n) for some type \c T. - * - * The following code snippet demonstrates how to use \p free to deallocate a range of memory - * previously allocated with \p thrust::malloc. - * - * \code - * #include - * ... - * // allocate storage for 100 ints with thrust::malloc - * const int N = 100; - * thrust::device_system_tag device_sys; - * thrust::pointer ptr = thrust::malloc(device_sys, N); - * - * // mainpulate memory - * ... - * - * // deallocate ptr with thrust::free - * thrust::free(device_sys, ptr); - * \endcode - */ -template -void free(const thrust::detail::execution_policy_base &system, Pointer ptr); - - -/*! \p return_temporary_buffer deallocates storage associated with a given Thrust system previously allocated by \p get_temporary_buffer. - * - * Thrust uses \p return_temporary_buffer internally when deallocating temporary storage required by algorithm implementations. - * - * \param system The Thrust system with which the storage is associated. 
- * \param p A pointer previously returned by \p thrust::get_temporary_buffer. If \p ptr is null, \p return_temporary_buffer does nothing. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * - * \pre \p p shall have been previously allocated by \p thrust::get_temporary_buffer. - * - * The following code snippet demonstrates how to use \p return_temporary_buffer to deallocate a range of memory - * previously allocated by \p get_temporary_buffer. - * - * \code - * #include - * ... - * // allocate storage for 100 ints with thrust::get_temporary_buffer - * const int N = 100; - * - * typedef thrust::pair< - * thrust::pointer, - * std::ptrdiff_t - * > ptr_and_size_t; - * - * thrust::device_system_tag device_sys; - * ptr_and_size_t ptr_and_size = thrust::get_temporary_buffer(device_sys, N); - * - * // manipulate up to 100 ints - * for(int i = 0; i < ptr_and_size.second; ++i) - * { - * *ptr_and_size.first = i; - * } - * - * // deallocate storage with thrust::return_temporary_buffer - * thrust::return_temporary_buffer(device_sys, ptr_and_size.first); - * \endcode - * - * \see free - * \see get_temporary_buffer - */ -template -void return_temporary_buffer(const thrust::detail::execution_policy_base &system, Pointer p); - - -/*! \} deallocation_functions - */ - - -/*! \p raw_pointer_cast creates a "raw" pointer from a pointer-like type, - * simply returning the wrapped pointer, should it exist. - * - * \param ptr The pointer of interest. - * \return ptr.get(), if the expression is well formed; ptr, otherwise. - * \see raw_reference_cast - */ -template -__host__ __device__ -inline typename thrust::detail::pointer_traits::raw_pointer - raw_pointer_cast(const Pointer &ptr); - - -/*! \p raw_reference_cast creates a "raw" reference from a wrapped reference type, - * simply returning the underlying reference, should it exist. - * - * If the argument is not a reference wrapper, the result is a reference to the argument. - * - * \param ref The reference of interest. - * \return *thrust::raw_pointer_cast(&ref). - * \note There are two versions of \p raw_reference_cast. One for const references, - * and one for non-const. - * \see raw_pointer_cast - */ -template -__host__ __device__ -inline typename detail::raw_reference::type - raw_reference_cast(T &ref); - - -/*! \p raw_reference_cast creates a "raw" reference from a wrapped reference type, - * simply returning the underlying reference, should it exist. - * - * If the argument is not a reference wrapper, the result is a reference to the argument. - * - * \param ref The reference of interest. - * \return *thrust::raw_pointer_cast(&ref). - * \note There are two versions of \p raw_reference_cast. One for const references, - * and one for non-const. - * \see raw_pointer_cast - */ -template -__host__ __device__ -inline typename detail::raw_reference::type - raw_reference_cast(const T &ref); - - -/*! \} - */ - -} // end thrust - diff --git a/compat/thrust/merge.h b/compat/thrust/merge.h deleted file mode 100644 index e5fa7b47ac..0000000000 --- a/compat/thrust/merge.h +++ /dev/null @@ -1,676 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file merge.h - * \brief Merging sorted ranges - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup merging Merging - * \ingroup algorithms - * \{ - */ - - -/*! \p merge combines two sorted ranges [first1, last1) and [first2, last2) - * into a single sorted range. That is, it copies from [first1, last1) and - * [first2, last2) into [result, result + (last1 - first1) + (last2 - first2)) - * such that the resulting range is in ascending order. \p merge is stable, meaning both that the - * relative order of elements within each input range is preserved, and that for equivalent elements - * in both input ranges the element from the first range precedes the element from the second. The - * return value is result + (last1 - first1) + (last2 - first2). - * - * This version of \p merge compares elements using \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the merged output. - * \return The end of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use - * \p merge to compute the merger of two sorted sets of integers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * int A1[6] = {1, 3, 5, 7, 9, 11}; - * int A2[7] = {1, 1, 2, 3, 5, 8, 13}; - * - * int result[13]; - * - * int *result_end = - * thrust::merge(thrust::host, - * A1, A1 + 6, - * A2, A2 + 7, - * result); - * // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13} - * \endcode - * - * \see http://www.sgi.com/tech/stl/merge.html - * \see \p set_union - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator merge(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -/*! \p merge combines two sorted ranges [first1, last1) and [first2, last2) - * into a single sorted range. That is, it copies from [first1, last1) and - * [first2, last2) into [result, result + (last1 - first1) + (last2 - first2)) - * such that the resulting range is in ascending order. \p merge is stable, meaning both that the - * relative order of elements within each input range is preserved, and that for equivalent elements - * in both input ranges the element from the first range precedes the element from the second. The - * return value is result + (last1 - first1) + (last2 - first2). - * - * This version of \p merge compares elements using \c operator<. - * - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the merged output. - * \return The end of the output range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use - * \p merge to compute the merger of two sorted sets of integers. - * - * \code - * #include - * ... - * int A1[6] = {1, 3, 5, 7, 9, 11}; - * int A2[7] = {1, 1, 2, 3, 5, 8, 13}; - * - * int result[13]; - * - * int *result_end = thrust::merge(A1, A1 + 6, A2, A2 + 7, result); - * // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13} - * \endcode - * - * \see http://www.sgi.com/tech/stl/merge.html - * \see \p set_union - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator merge(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -/*! \p merge combines two sorted ranges [first1, last1) and [first2, last2) - * into a single sorted range. 
That is, it copies from [first1, last1) and - * [first2, last2) into [result, result + (last1 - first1) + (last2 - first2)) - * such that the resulting range is in ascending order. \p merge is stable, meaning both that the - * relative order of elements within each input range is preserved, and that for equivalent elements - * in both input ranges the element from the first range precedes the element from the second. The - * return value is result + (last1 - first1) + (last2 - first2). - * - * This version of \p merge compares elements using a function object \p comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the merged output. - * \param comp Comparison operator. - * \return The end of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use - * \p merge to compute the merger of two sets of integers sorted in - * descending order using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * int A1[6] = {11, 9, 7, 5, 3, 1}; - * int A2[7] = {13, 8, 5, 3, 2, 1, 1}; - * - * int result[13]; - * - * int *result_end = thrust::merge(thrust::host, - * A1, A1 + 6, - * A2, A2 + 7, - * result, - * thrust::greater()); - * // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1} - * \endcode - * - * \see http://www.sgi.com/tech/stl/merge.html - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator merge(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp); - - -/*! \p merge combines two sorted ranges [first1, last1) and [first2, last2) - * into a single sorted range. That is, it copies from [first1, last1) and - * [first2, last2) into [result, result + (last1 - first1) + (last2 - first2)) - * such that the resulting range is in ascending order. \p merge is stable, meaning both that the - * relative order of elements within each input range is preserved, and that for equivalent elements - * in both input ranges the element from the first range precedes the element from the second. The - * return value is result + (last1 - first1) + (last2 - first2). 
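For reference, the descending-order merge shown above also works through the comparator overload that takes no execution policy; a self-contained sketch, with the input values taken from the header's own example:

#include <thrust/merge.h>
#include <thrust/functional.h>

int main(void)
{
    int A1[6] = {11, 9, 7, 5, 3, 1};
    int A2[7] = {13, 8, 5, 3, 2, 1, 1};
    int result[13];

    // both inputs are sorted in descending order, so merge with thrust::greater<int>
    int *result_end = thrust::merge(A1, A1 + 6, A2, A2 + 7,
                                    result, thrust::greater<int>());
    // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
    (void) result_end;
    return 0;
}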
- * - * This version of \p merge compares elements using a function object \p comp. - * - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the merged output. - * \param comp Comparison operator. - * \return The end of the output range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use - * \p merge to compute the merger of two sets of integers sorted in - * descending order. - * - * \code - * #include - * #include - * ... - * int A1[6] = {11, 9, 7, 5, 3, 1}; - * int A2[7] = {13, 8, 5, 3, 2, 1, 1}; - * - * int result[13]; - * - * int *result_end = thrust::merge(A1, A1 + 6, A2, A2 + 7, result, thrust::greater()); - * // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1} - * \endcode - * - * \see http://www.sgi.com/tech/stl/merge.html - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator merge(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp); - - -/*! \p merge_by_key performs a key-value merge. That is, \p merge_by_key copies elements from - * [keys_first1, keys_last1) and [keys_first2, keys_last2) into a single range, - * [keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that - * the resulting range is in ascending key order. - * - * At the same time, \p merge_by_key copies elements from the two associated ranges [values_first1 + (keys_last1 - keys_first1)) - * and [values_first2 + (keys_last2 - keys_first2)) into a single range, - * [values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that - * the resulting range is in ascending order implied by each input element's associated key. - * - * \p merge_by_key is stable, meaning both that the relative order of elements within each input range is - * preserved, and that for equivalent elements in all input key ranges the element from the first range - * precedes the element from the second. - * - * The return value is is (keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) - * and (values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. 
- * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the merged output range of keys. - * \param values_result The beginning of the merged output range of values. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use - * \p merge_by_key to compute the merger of two sets of integers sorted in - * ascending order using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * int A_keys[6] = {1, 3, 5, 7, 9, 11}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0}; - * - * int B_keys[7] = {1, 1, 2, 3, 5, 8, 13}; - * int B_vals[7] = {1, 1, 1, 1, 1, 1, 1}; - * - * int keys_result[13]; - * int vals_result[13]; - * - * thrust::pair end = - * thrust::merge_by_key(thrust::host, - * A_keys, A_keys + 6, - * B_keys, B_keys + 7, - * A_vals, B_vals, - * keys_result, vals_result); - * - * // keys_result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13} - * // vals_result = {0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1} - * \endcode - * - * \see merge - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - merge_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p merge_by_key performs a key-value merge. 
That is, \p merge_by_key copies elements from - * [keys_first1, keys_last1) and [keys_first2, keys_last2) into a single range, - * [keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that - * the resulting range is in ascending key order. - * - * At the same time, \p merge_by_key copies elements from the two associated ranges [values_first1 + (keys_last1 - keys_first1)) - * and [values_first2 + (keys_last2 - keys_first2)) into a single range, - * [values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that - * the resulting range is in ascending order implied by each input element's associated key. - * - * \p merge_by_key is stable, meaning both that the relative order of elements within each input range is - * preserved, and that for equivalent elements in all input key ranges the element from the first range - * precedes the element from the second. - * - * The return value is is (keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) - * and (values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)). - * - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the merged output range of keys. - * \param values_result The beginning of the merged output range of values. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. - * \pre The resulting ranges shall not overlap with any input range. 
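A complete, compilable sketch of the key-value merge described above, assuming the standard Thrust API; the arrays mirror the snippet that follows.

#include <thrust/merge.h>
#include <thrust/pair.h>

int main(void)
{
    int A_keys[6] = {1, 3, 5, 7, 9, 11};
    int A_vals[6] = {0, 0, 0, 0, 0, 0};

    int B_keys[7] = {1, 1, 2, 3, 5, 8, 13};
    int B_vals[7] = {1, 1, 1, 1, 1, 1, 1};

    int keys_result[13];
    int vals_result[13];

    thrust::pair<int *, int *> ends =
        thrust::merge_by_key(A_keys, A_keys + 6,
                             B_keys, B_keys + 7,
                             A_vals, B_vals,
                             keys_result, vals_result);

    // keys_result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13}
    // vals_result = {0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1}
    (void) ends;
    return 0;
}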
- * - * The following code snippet demonstrates how to use - * \p merge_by_key to compute the merger of two sets of integers sorted in - * ascending order. - * - * \code - * #include - * #include - * ... - * int A_keys[6] = {1, 3, 5, 7, 9, 11}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0}; - * - * int B_keys[7] = {1, 1, 2, 3, 5, 8, 13}; - * int B_vals[7] = {1, 1, 1, 1, 1, 1, 1}; - * - * int keys_result[13]; - * int vals_result[13]; - * - * thrust::pair end = thrust::merge_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, B_vals, keys_result, vals_result); - * - * // keys_result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13} - * // vals_result = {0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1} - * \endcode - * - * \see merge - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - merge_by_key(InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p merge_by_key performs a key-value merge. That is, \p merge_by_key copies elements from - * [keys_first1, keys_last1) and [keys_first2, keys_last2) into a single range, - * [keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that - * the resulting range is in ascending key order. - * - * At the same time, \p merge_by_key copies elements from the two associated ranges [values_first1 + (keys_last1 - keys_first1)) - * and [values_first2 + (keys_last2 - keys_first2)) into a single range, - * [values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that - * the resulting range is in ascending order implied by each input element's associated key. - * - * \p merge_by_key is stable, meaning both that the relative order of elements within each input range is - * preserved, and that for equivalent elements in all input key ranges the element from the first range - * precedes the element from the second. - * - * The return value is is (keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) - * and (values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)). - * - * This version of \p merge_by_key compares key elements using a function object \p comp. - * - * The algorithm's execution is parallelized using \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the merged output range of keys. - * \param values_result The beginning of the merged output range of values. - * \param comp Comparison operator. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. 
- * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use - * \p merge_by_key to compute the merger of two sets of integers sorted in - * descending order using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * int A_keys[6] = {11, 9, 7, 5, 3, 1}; - * int A_vals[6] = { 0, 0, 0, 0, 0, 0}; - * - * int B_keys[7] = {13, 8, 5, 3, 2, 1, 1}; - * int B_vals[7] = { 1, 1, 1, 1, 1, 1, 1}; - * - * int keys_result[13]; - * int vals_result[13]; - * - * thrust::pair end = - * thrust::merge_by_key(thrust::host, - * A_keys, A_keys + 6, - * B_keys, B_keys + 7, - * A_vals, B_vals, - * keys_result, vals_result, - * thrust::greater()); - * - * // keys_result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1} - * // vals_result = { 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1} - * \endcode - * - * \see merge - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - merge_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - Compare comp); - - -/*! \p merge_by_key performs a key-value merge. That is, \p merge_by_key copies elements from - * [keys_first1, keys_last1) and [keys_first2, keys_last2) into a single range, - * [keys_result, keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that - * the resulting range is in ascending key order. - * - * At the same time, \p merge_by_key copies elements from the two associated ranges [values_first1 + (keys_last1 - keys_first1)) - * and [values_first2 + (keys_last2 - keys_first2)) into a single range, - * [values_result, values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) such that - * the resulting range is in ascending order implied by each input element's associated key. - * - * \p merge_by_key is stable, meaning both that the relative order of elements within each input range is - * preserved, and that for equivalent elements in all input key ranges the element from the first range - * precedes the element from the second. - * - * The return value is is (keys_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)) - * and (values_result + (keys_last1 - keys_first1) + (keys_last2 - keys_first2)). 
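The comparison object passed to the comparator overloads does not have to come from thrust/functional.h; any strict weak ordering works. Below is a sketch with a hypothetical hand-written descending functor, equivalent to thrust::greater<int>.

#include <thrust/merge.h>
#include <thrust/pair.h>

// hypothetical comparator, equivalent to thrust::greater<int>
struct descending
{
    __host__ __device__
    bool operator()(int a, int b) const { return a > b; }
};

int main(void)
{
    int A_keys[6] = {11, 9, 7, 5, 3, 1};
    int A_vals[6] = { 0, 0, 0, 0, 0, 0};

    int B_keys[7] = {13, 8, 5, 3, 2, 1, 1};
    int B_vals[7] = { 1, 1, 1, 1, 1, 1, 1};

    int keys_result[13];
    int vals_result[13];

    thrust::pair<int *, int *> ends =
        thrust::merge_by_key(A_keys, A_keys + 6,
                             B_keys, B_keys + 7,
                             A_vals, B_vals,
                             keys_result, vals_result,
                             descending());

    // keys_result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
    // vals_result = { 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1}
    (void) ends;
    return 0;
}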
- * - * This version of \p merge_by_key compares key elements using a function object \p comp. - * - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the merged output range of keys. - * \param values_result The beginning of the merged output range of values. - * \param comp Comparison operator. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use - * \p merge_by_key to compute the merger of two sets of integers sorted in - * descending order. - * - * \code - * #include - * #include - * ... - * int A_keys[6] = {11, 9, 7, 5, 3, 1}; - * int A_vals[6] = { 0, 0, 0, 0, 0, 0}; - * - * int B_keys[7] = {13, 8, 5, 3, 2, 1, 1}; - * int B_vals[7] = { 1, 1, 1, 1, 1, 1, 1}; - * - * int keys_result[13]; - * int vals_result[13]; - * - * thrust::pair end = thrust::merge_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, B_vals, keys_result, vals_result, thrust::greater()); - * - * // keys_result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1} - * // vals_result = { 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1} - * \endcode - * - * \see merge - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - merge_by_key(InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp); - - -/*! 
\} // merging - */ - -} // end thrust - -#include - diff --git a/compat/thrust/mismatch.h b/compat/thrust/mismatch.h deleted file mode 100644 index 898157ac90..0000000000 --- a/compat/thrust/mismatch.h +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file mismatch.h - * \brief Search for differences between ranges - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup algorithms - */ - -/*! \addtogroup searching - * \ingroup algorithms - * \{ - */ - - -/*! \p mismatch finds the first position where the two ranges [first1, last1) - * and [first2, first2 + (last1 - first1)) differ. The two versions of - * \p mismatch use different tests for whether elements differ. - * - * This version of \p mismatch finds the first iterator \c i in [first1, last1) - * such that *i == *(first2 + (i - first1)) is \c false. The return value is a - * \c pair whose first element is \c i and whose second element is *(first2 + (i - first1)). - * If no such iterator \c i exists, the return value is a \c pair whose first element - * is \c last1 and whose second element is *(first2 + (last1 - first1)). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \return The first position where the sequences differ. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator - * and \p InputIterator1's \c value_type is equality comparable to \p InputIterator2's \c value_type. - * \tparam InputIterator2 is a model of Input Iterator. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector vec1(4); - * thrust::device_vector vec2(4); - * - * vec1[0] = 0; vec2[0] = 0; - * vec1[1] = 5; vec2[1] = 5; - * vec1[2] = 3; vec2[2] = 8; - * vec1[3] = 7; vec2[3] = 7; - * - * typedef thrust::device_vector::iterator Iterator; - * thrust::pair result; - * - * result = thrust::mismatch(thrust::device, vec1.begin(), vec1.end(), vec2.begin()); - * - * // result.first is vec1.begin() + 2 - * // result.second is vec2.begin() + 2 - * \endcode - * - * \see find - * \see find_if - */ -template -thrust::pair mismatch(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2); - - -/*! \p mismatch finds the first position where the two ranges [first1, last1) - * and [first2, first2 + (last1 - first1)) differ. The two versions of - * \p mismatch use different tests for whether elements differ. - * - * This version of \p mismatch finds the first iterator \c i in [first1, last1) - * such that *i == *(first2 + (i - first1)) is \c false. 
The return value is a - * \c pair whose first element is \c i and whose second element is *(first2 + (i - first1)). - * If no such iterator \c i exists, the return value is a \c pair whose first element - * is \c last1 and whose second element is *(first2 + (last1 - first1)). - * - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \return The first position where the sequences differ. - * - * \tparam InputIterator1 is a model of Input Iterator - * and \p InputIterator1's \c value_type is equality comparable to \p InputIterator2's \c value_type. - * \tparam InputIterator2 is a model of Input Iterator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector vec1(4); - * thrust::device_vector vec2(4); - * - * vec1[0] = 0; vec2[0] = 0; - * vec1[1] = 5; vec2[1] = 5; - * vec1[2] = 3; vec2[2] = 8; - * vec1[3] = 7; vec2[3] = 7; - * - * typedef thrust::device_vector::iterator Iterator; - * thrust::pair result; - * - * result = thrust::mismatch(vec1.begin(), vec1.end(), vec2.begin()); - * - * // result.first is vec1.begin() + 2 - * // result.second is vec2.begin() + 2 - * \endcode - * - * \see find - * \see find_if - */ -template -thrust::pair mismatch(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2); - - -/*! \p mismatch finds the first position where the two ranges [first1, last1) - * and [first2, first2 + (last1 - first1)) differ. The two versions of - * \p mismatch use different tests for whether elements differ. - * - * This version of \p mismatch finds the first iterator \c i in [first1, last1) - * such that pred(\*i, \*(first2 + (i - first1)) is \c false. The return value is a - * \c pair whose first element is \c i and whose second element is *(first2 + (i - first1)). - * If no such iterator \c i exists, the return value is a \c pair whose first element is - * \c last1 and whose second element is *(first2 + (last1 - first1)). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \param pred The binary predicate to compare elements. - * \return The first position where the sequences differ. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator. - * \tparam InputIterator2 is a model of Input Iterator. - * \tparam Predicate is a model of Input Iterator. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector vec1(4); - * thrust::device_vector vec2(4); - * - * vec1[0] = 0; vec2[0] = 0; - * vec1[1] = 5; vec2[1] = 5; - * vec1[2] = 3; vec2[2] = 8; - * vec1[3] = 7; vec2[3] = 7; - * - * typedef thrust::device_vector::iterator Iterator; - * thrust::pair result; - * - * result = thrust::mismatch(thrust::device, vec1.begin(), vec1.end(), vec2.begin(), thrust::equal_to()); - * - * // result.first is vec1.begin() + 2 - * // result.second is vec2.begin() + 2 - * \endcode - * - * \see find - * \see find_if - */ -template -thrust::pair mismatch(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - BinaryPredicate pred); - - -/*! 
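For reference, a small self-contained sketch of the two mismatch forms declared above, using thrust::device_vector<int> and thrust::equal_to<int> as the explicit predicate (which reproduces the default element-wise test); the data matches the surrounding snippets.

\code
#include <thrust/mismatch.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>

int main(void)
{
  thrust::device_vector<int> vec1(4), vec2(4);
  vec1[0] = 0; vec2[0] = 0;
  vec1[1] = 5; vec2[1] = 5;
  vec1[2] = 3; vec2[2] = 8;
  vec1[3] = 7; vec2[3] = 7;

  typedef thrust::device_vector<int>::iterator Iterator;

  // default form: first position where *i == *j does not hold
  thrust::pair<Iterator, Iterator> r1 =
    thrust::mismatch(vec1.begin(), vec1.end(), vec2.begin());

  // predicate form: first position where pred(*i, *j) does not hold
  thrust::pair<Iterator, Iterator> r2 =
    thrust::mismatch(vec1.begin(), vec1.end(), vec2.begin(),
                     thrust::equal_to<int>());

  // r1.first == r2.first == vec1.begin() + 2   (3 differs from 8)
  // r1.second == r2.second == vec2.begin() + 2
  return 0;
}
\endcode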
\p mismatch finds the first position where the two ranges [first1, last1) - * and [first2, first2 + (last1 - first1)) differ. The two versions of - * \p mismatch use different tests for whether elements differ. - * - * This version of \p mismatch finds the first iterator \c i in [first1, last1) - * such that pred(\*i, \*(first2 + (i - first1)) is \c false. The return value is a - * \c pair whose first element is \c i and whose second element is *(first2 + (i - first1)). - * If no such iterator \c i exists, the return value is a \c pair whose first element is - * \c last1 and whose second element is *(first2 + (last1 - first1)). - * - * \param first1 The beginning of the first sequence. - * \param last1 The end of the first sequence. - * \param first2 The beginning of the second sequence. - * \param pred The binary predicate to compare elements. - * \return The first position where the sequences differ. - * - * \tparam InputIterator1 is a model of Input Iterator. - * \tparam InputIterator2 is a model of Input Iterator. - * \tparam Predicate is a model of Input Iterator. - * - * \code - * #include - * #include - * ... - * thrust::device_vector vec1(4); - * thrust::device_vector vec2(4); - * - * vec1[0] = 0; vec2[0] = 0; - * vec1[1] = 5; vec2[1] = 5; - * vec1[2] = 3; vec2[2] = 8; - * vec1[3] = 7; vec2[3] = 7; - * - * typedef thrust::device_vector::iterator Iterator; - * thrust::pair result; - * - * result = thrust::mismatch(vec1.begin(), vec1.end(), vec2.begin(), thrust::equal_to()); - * - * // result.first is vec1.begin() + 2 - * // result.second is vec2.begin() + 2 - * \endcode - * - * \see find - * \see find_if - */ -template -thrust::pair mismatch(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - BinaryPredicate pred); - -/*! \} // end searching - */ - -} // end namespace thrust - -#include - diff --git a/compat/thrust/pair.h b/compat/thrust/pair.h deleted file mode 100644 index 897cc078e0..0000000000 --- a/compat/thrust/pair.h +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file pair.h - * \brief A type encapsulating a heterogeneous pair of elements - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -/*! \addtogroup utility - * \{ - */ - -/*! \addtogroup pair - * \{ - */ - -/*! \p pair is a generic data structure encapsulating a heterogeneous - * pair of values. - * - * \tparam T1 The type of \p pair's first object type. There are no - * requirements on the type of \p T1. T1's type is - * provided by pair::first_type. - * - * \tparam T2 The type of \p pair's second object type. There are no - * requirements on the type of \p T2. T2's type is - * provided by pair::second_type. - */ -template - struct pair -{ - /*! \p first_type is the type of \p pair's first object type. - */ - typedef T1 first_type; - - /*! \p second_type is the type of \p pair's second object type. - */ - typedef T2 second_type; - - /*! The \p pair's first object. 
- */ - first_type first; - - /*! The \p pair's second object. - */ - second_type second; - - /*! \p pair's default constructor constructs \p first - * and \p second using \c first_type & \c second_type's - * default constructors, respectively. - */ - __host__ __device__ pair(void); - - /*! This constructor accepts two objects to copy into this \p pair. - * - * \param x The object to copy into \p first. - * \param y The object to copy into \p second. - */ - inline __host__ __device__ - pair(const T1 &x, const T2 &y); - - /*! This copy constructor copies from a \p pair whose types are - * convertible to this \p pair's \c first_type and \c second_type, - * respectively. - * - * \param p The \p pair to copy from. - * - * \tparam U1 is convertible to \c first_type. - * \tparam U2 is convertible to \c second_type. - */ - template - inline __host__ __device__ - pair(const pair &p); - - /*! This copy constructor copies from a std::pair whose types are - * convertible to this \p pair's \c first_type and \c second_type, - * respectively. - * - * \param p The std::pair to copy from. - * - * \tparam U1 is convertible to \c first_type. - * \tparam U2 is convertible to \c second_type. - */ - template - inline __host__ __device__ - pair(const std::pair &p); - - /*! \p swap swaps the elements of two pairs. - * - * \param p The other pair with which to swap. - */ - inline __host__ __device__ - void swap(pair &p); -}; // end pair - - -/*! This operator tests two \p pairs for equality. - * - * \param x The first \p pair to compare. - * \param y The second \p pair to compare. - * \return \c true if and only if x.first == y.first && x.second == y.second. - * - * \tparam T1 is a model of Equality Comparable. - * \tparam T2 is a model of Equality Comparable. - */ -template - inline __host__ __device__ - bool operator==(const pair &x, const pair &y); - - -/*! This operator tests two pairs for ascending ordering. - * - * \param x The first \p pair to compare. - * \param y The second \p pair to compare. - * \return \c true if and only if x.first < y.first || (!(y.first < x.first) && x.second < y.second). - * - * \tparam T1 is a model of LessThan Comparable. - * \tparam T2 is a model of LessThan Comparable. - */ -template - inline __host__ __device__ - bool operator<(const pair &x, const pair &y); - - -/*! This operator tests two pairs for inequality. - * - * \param x The first \p pair to compare. - * \param y The second \p pair to compare. - * \return \c true if and only if !(x == y). - * - * \tparam T1 is a model of Equality Comparable. - * \tparam T2 is a model of Equality Comparable. - */ -template - inline __host__ __device__ - bool operator!=(const pair &x, const pair &y); - - -/*! This operator tests two pairs for descending ordering. - * - * \param x The first \p pair to compare. - * \param y The second \p pair to compare. - * \return \c true if and only if y < x. - * - * \tparam T1 is a model of LessThan Comparable. - * \tparam T2 is a model of LessThan Comparable. - */ -template - inline __host__ __device__ - bool operator>(const pair &x, const pair &y); - - -/*! This operator tests two pairs for ascending ordering or equivalence. - * - * \param x The first \p pair to compare. - * \param y The second \p pair to compare. - * \return \c true if and only if !(y < x). - * - * \tparam T1 is a model of LessThan Comparable. - * \tparam T2 is a model of LessThan Comparable. - */ -template - inline __host__ __device__ - bool operator<=(const pair &x, const pair &y); - - -/*! 
This operator tests two pairs for descending ordering or equivalence. - * - * \param x The first \p pair to compare. - * \param y The second \p pair to compare. - * \return \c true if and only if !(x < y). - * - * \tparam T1 is a model of LessThan Comparable. - * \tparam T2 is a model of LessThan Comparable. - */ -template - inline __host__ __device__ - bool operator>=(const pair &x, const pair &y); - - -/*! \p swap swaps the contents of two pairs. - * - * \param x The first \p pair to swap. - * \param y The second \p pair to swap. - */ -template - inline __host__ __device__ - void swap(pair &x, pair &y); - - -/*! This convenience function creates a \p pair from two objects. - * - * \param x The first object to copy from. - * \param y The second object to copy from. - * \return A newly-constructed \p pair copied from \p a and \p b. - * - * \tparam T1 There are no requirements on the type of \p T1. - * \tparam T2 There are no requirements on the type of \p T2. - */ -template - inline __host__ __device__ - pair make_pair(T1 x, T2 y); - - -/*! This convenience metafunction is included for compatibility with - * \p tuple. It returns either the type of a \p pair's - * \c first_type or \c second_type in its nested type, \c type. - * - * \tparam N This parameter selects the member of interest. - * \tparam T A \c pair type of interest. - */ -template struct tuple_element; - - -/*! This convenience metafunction is included for compatibility with - * \p tuple. It returns \c 2, the number of elements of a \p pair, - * in its nested data member, \c value. - * - * \tparam Pair A \c pair type of interest. - */ -template struct tuple_size; - - -/*! This convenience function returns a reference to either the first or - * second member of a \p pair. - * - * \param p The \p pair of interest. - * \return \c p.first or \c p.second, depending on the template - * parameter. - * - * \tparam N This parameter selects the member of interest. - */ -// XXX comment out these prototypes as a WAR to a problem on MSVC 2005 -//template -// inline __host__ __device__ -// typename tuple_element >::type & -// get(pair &p); - - -/*! This convenience function returns a const reference to either the - * first or second member of a \p pair. - * - * \param p The \p pair of interest. - * \return \c p.first or \c p.second, depending on the template - * parameter. - * - * \tparam i This parameter selects the member of interest. - */ -// XXX comment out these prototypes as a WAR to a problem on MSVC 2005 -//template -// inline __host__ __device__ -// const typename tuple_element >::type & -// get(const pair &p); - -/*! \} // pair - */ - -/*! \} // utility - */ - -} // end thrust - -#include - diff --git a/compat/thrust/partition.h b/compat/thrust/partition.h deleted file mode 100644 index 61a6278a8d..0000000000 --- a/compat/thrust/partition.h +++ /dev/null @@ -1,1429 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! 
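A brief host-side sketch of the pair interface summarized above (construction via make_pair, lexicographic comparison, and member swap); the values are arbitrary and chosen only for illustration.

\code
#include <thrust/pair.h>
#include <cstdio>

int main(void)
{
  thrust::pair<int, float> p = thrust::make_pair(1, 2.5f);
  thrust::pair<int, float> q(3, 0.5f);

  // operator< compares .first, then .second (lexicographic ordering)
  bool p_less_q = (p < q);   // true, because 1 < 3

  p.swap(q);                 // p is now (3, 0.5f), q is (1, 2.5f)

  std::printf("%d %g %d\n", p.first, p.second, (int) p_less_q);
  return 0;
}
\endcode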
\file partition.h - * \brief Reorganizes a range based on a predicate - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup reordering - * \ingroup algorithms - * - * \addtogroup partitioning - * \ingroup reordering - * \{ - */ - - -/*! \p partition reorders the elements [first, last) based on the function - * object \p pred, such that all of the elements that satisfy \p pred precede the - * elements that fail to satisfy it. The postcondition is that, for some iterator - * \c middle in the range [first, last), pred(*i) is \c true for every - * iterator \c i in the range [first,middle) and \c false for every iterator - * \c i in the range [middle, last). The return value of \p partition is - * \c middle. - * - * Note that the relative order of elements in the two reordered sequences is not - * necessarily the same as it was in the original sequence. A different algorithm, - * \ref stable_partition, does guarantee to preserve the relative order. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence to reorder. - * \param last The end of the sequence to reorder. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return An iterator referring to the first element of the second partition, that is, - * the sequence of the elements which do not satisfy \p pred. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type, - * and \p ForwardIterator is mutable. - * \tparam Predicate is a model of Predicate. - * - * The following code snippet demonstrates how to use \p partition to reorder a - * sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * const int N = sizeof(A)/sizeof(int); - * thrust::partition(thrust::host, - * A, A + N, - * is_even()); - * // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * \endcode - * - * \see http://www.sgi.com/tech/stl/partition.html - * \see \p stable_partition - * \see \p partition_copy - */ -template - ForwardIterator partition(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - - -/*! \p partition reorders the elements [first, last) based on the function - * object \p pred, such that all of the elements that satisfy \p pred precede the - * elements that fail to satisfy it. The postcondition is that, for some iterator - * \c middle in the range [first, last), pred(*i) is \c true for every - * iterator \c i in the range [first,middle) and \c false for every iterator - * \c i in the range [middle, last). The return value of \p partition is - * \c middle. - * - * Note that the relative order of elements in the two reordered sequences is not - * necessarily the same as it was in the original sequence. A different algorithm, - * \ref stable_partition, does guarantee to preserve the relative order. - * - * \param first The beginning of the sequence to reorder. 
- * \param last The end of the sequence to reorder. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return An iterator referring to the first element of the second partition, that is, - * the sequence of the elements which do not satisfy \p pred. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type, - * and \p ForwardIterator is mutable. - * \tparam Predicate is a model of Predicate. - * - * The following code snippet demonstrates how to use \p partition to reorder a - * sequence so that even numbers precede odd numbers. - * - * \code - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * const int N = sizeof(A)/sizeof(int); - * thrust::partition(A, A + N, - * is_even()); - * // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * \endcode - * - * \see http://www.sgi.com/tech/stl/partition.html - * \see \p stable_partition - * \see \p partition_copy - */ -template - ForwardIterator partition(ForwardIterator first, - ForwardIterator last, - Predicate pred); - - -/*! \p partition reorders the elements [first, last) based on the function - * object \p pred applied to a stencil range [stencil, stencil + (last - first)), - * such that all of the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose - * corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator - * \c middle in the range [first, last), pred(*stencil_i) is \c true for every iterator - * \c stencil_i in the range [stencil,stencil + (middle - first)) and \c false for every iterator \c stencil_i - * in the range [stencil + (middle - first), stencil + (last - first)). - * The return value of \p stable_partition is \c middle. - * - * Note that the relative order of elements in the two reordered sequences is not - * necessarily the same as it was in the original sequence. A different algorithm, - * \ref stable_partition, does guarantee to preserve the relative order. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence to reorder. - * \param last The end of the sequence to reorder. - * \param stencil The beginning of the stencil sequence. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return An iterator referring to the first element of the second partition, that is, - * the sequence of the elements whose stencil elements do not satisfy \p pred. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * \pre The ranges [first,last) and [stencil, stencil + (last - first)) shall not overlap. 
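Before the stencil example below, a compilable sketch of the plain in-place partition overloads declared above, assuming device data and the same is_even functor used throughout these snippets:

\code
#include <thrust/partition.h>
#include <thrust/device_vector.h>

struct is_even
{
  __host__ __device__
  bool operator()(int x) const { return (x % 2) == 0; }
};

int main(void)
{
  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  thrust::device_vector<int> v(A, A + 10);

  // middle points at the first element that fails is_even
  thrust::device_vector<int>::iterator middle =
    thrust::partition(v.begin(), v.end(), is_even());

  // [v.begin(), middle) now holds the even values, [middle, v.end()) the odd ones;
  // the relative order inside each group is not guaranteed (use stable_partition for that)
  return 0;
}
\endcode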
- * - * The following code snippet demonstrates how to use \p partition to reorder a - * sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - * int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * const int N = sizeof(A)/sizeof(int); - * thrust::partition(thrust::host, A, A + N, S, is_even()); - * // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} - * // S is unmodified - * \endcode - * - * \see http://www.sgi.com/tech/stl/partition.html - * \see \p stable_partition - * \see \p partition_copy - */ -template - ForwardIterator partition(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - - -/*! \p partition reorders the elements [first, last) based on the function - * object \p pred applied to a stencil range [stencil, stencil + (last - first)), - * such that all of the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose - * corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator - * \c middle in the range [first, last), pred(*stencil_i) is \c true for every iterator - * \c stencil_i in the range [stencil,stencil + (middle - first)) and \c false for every iterator \c stencil_i - * in the range [stencil + (middle - first), stencil + (last - first)). - * The return value of \p stable_partition is \c middle. - * - * Note that the relative order of elements in the two reordered sequences is not - * necessarily the same as it was in the original sequence. A different algorithm, - * \ref stable_partition, does guarantee to preserve the relative order. - * - * \param first The beginning of the sequence to reorder. - * \param last The end of the sequence to reorder. - * \param stencil The beginning of the stencil sequence. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return An iterator referring to the first element of the second partition, that is, - * the sequence of the elements whose stencil elements do not satisfy \p pred. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * \pre The ranges [first,last) and [stencil, stencil + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p partition to reorder a - * sequence so that even numbers precede odd numbers. - * - * \code - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... 
- * int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - * int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * const int N = sizeof(A)/sizeof(int); - * thrust::partition(A, A + N, S, is_even()); - * // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} - * // S is unmodified - * \endcode - * - * \see http://www.sgi.com/tech/stl/partition.html - * \see \p stable_partition - * \see \p partition_copy - */ -template - ForwardIterator partition(ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - - -/*! \p partition_copy differs from \ref partition only in that the reordered - * sequence is written to difference output sequences, rather than in place. - * - * \p partition_copy copies the elements [first, last) based on the - * function object \p pred. All of the elements that satisfy \p pred are copied - * to the range beginning at \p out_true and all the elements that fail to satisfy it - * are copied to the range beginning at \p out_false. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence to reorder. - * \param last The end of the sequence to reorder. - * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. - * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return A \p pair p such that p.first is the end of the output range beginning - * at \p out_true and p.second is the end of the output range beginning at - * \p out_false. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type - * is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The input range shall not overlap with either output range. - * - * The following code snippet demonstrates how to use \p partition_copy to separate a - * sequence into two output sequences of even and odd numbers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * int result[10]; - * const int N = sizeof(A)/sizeof(int); - * int *evens = result; - * int *odds = result + 5; - * thrust::partition_copy(thrust::host, A, A + N, evens, odds, is_even()); - * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * // evens points to {2, 4, 6, 8, 10} - * // odds points to {1, 3, 5, 7, 9} - * \endcode - * - * \note The relative order of elements in the two reordered sequences is not - * necessarily the same as it was in the original sequence. A different algorithm, - * \ref stable_partition_copy, does guarantee to preserve the relative order. 
- * - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf - * \see \p stable_partition_copy - * \see \p partition - */ -template - thrust::pair - partition_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -/*! \p partition_copy differs from \ref partition only in that the reordered - * sequence is written to difference output sequences, rather than in place. - * - * \p partition_copy copies the elements [first, last) based on the - * function object \p pred. All of the elements that satisfy \p pred are copied - * to the range beginning at \p out_true and all the elements that fail to satisfy it - * are copied to the range beginning at \p out_false. - * - * \param first The beginning of the sequence to reorder. - * \param last The end of the sequence to reorder. - * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. - * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return A \p pair p such that p.first is the end of the output range beginning - * at \p out_true and p.second is the end of the output range beginning at - * \p out_false. - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type - * is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The input range shall not overlap with either output range. - * - * The following code snippet demonstrates how to use \p partition_copy to separate a - * sequence into two output sequences of even and odd numbers. - * - * \code - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * int result[10]; - * const int N = sizeof(A)/sizeof(int); - * int *evens = result; - * int *odds = result + 5; - * thrust::partition_copy(A, A + N, evens, odds, is_even()); - * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * // evens points to {2, 4, 6, 8, 10} - * // odds points to {1, 3, 5, 7, 9} - * \endcode - * - * \note The relative order of elements in the two reordered sequences is not - * necessarily the same as it was in the original sequence. A different algorithm, - * \ref stable_partition_copy, does guarantee to preserve the relative order. - * - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf - * \see \p stable_partition_copy - * \see \p partition - */ -template - thrust::pair - partition_copy(InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -/*! \p partition_copy differs from \ref partition only in that the reordered - * sequence is written to difference output sequences, rather than in place. - * - * \p partition_copy copies the elements [first, last) based on the - * function object \p pred which is applied to a range of stencil elements. 
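A minimal sketch of the non-stencil partition_copy overloads declared above, writing evens and odds into two separate device_vectors (is_even as in the surrounding snippets):

\code
#include <thrust/partition.h>
#include <thrust/device_vector.h>
#include <thrust/pair.h>

struct is_even
{
  __host__ __device__
  bool operator()(int x) const { return (x % 2) == 0; }
};

int main(void)
{
  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  thrust::device_vector<int> in(A, A + 10);
  thrust::device_vector<int> evens(5), odds(5);

  typedef thrust::device_vector<int>::iterator Iterator;

  // elements satisfying is_even are copied to evens, the rest to odds
  thrust::pair<Iterator, Iterator> ends =
    thrust::partition_copy(in.begin(), in.end(),
                           evens.begin(), odds.begin(),
                           is_even());

  // evens receives {2, 4, 6, 8, 10} and odds {1, 3, 5, 7, 9}
  // (order within each output is not guaranteed by partition_copy)
  // ends.first == evens.end(), ends.second == odds.end()
  return 0;
}
\endcode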
All of the elements - * whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true - * and all the elements whose stencil element fails to satisfy it are copied to the range beginning - * at \p out_false. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence to reorder. - * \param last The end of the sequence to reorder. - * \param stencil The beginning of the stencil sequence. - * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. - * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return A \p pair p such that p.first is the end of the output range beginning - * at \p out_true and p.second is the end of the output range beginning at - * \p out_false. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The input ranges shall not overlap with either output range. - * - * The following code snippet demonstrates how to use \p partition_copy to separate a - * sequence into two output sequences of even and odd numbers using the \p thrust::host execution - * policy for parallelization. - * - * \code - * #include - * #include - * #include - * ... - * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - * int result[10]; - * const int N = sizeof(A)/sizeof(int); - * int *evens = result; - * int *odds = result + 5; - * thrust::stable_partition_copy(thrust::host, A, A + N, S, evens, odds, thrust::identity()); - * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - * // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0, 1} - * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * // evens points to {2, 4, 6, 8, 10} - * // odds points to {1, 3, 5, 7, 9} - * \endcode - * - * \note The relative order of elements in the two reordered sequences is not - * necessarily the same as it was in the original sequence. A different algorithm, - * \ref stable_partition_copy, does guarantee to preserve the relative order. - * - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf - * \see \p stable_partition_copy - * \see \p partition - */ -template - thrust::pair - partition_copy(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -/*! \p partition_copy differs from \ref partition only in that the reordered - * sequence is written to difference output sequences, rather than in place. - * - * \p partition_copy copies the elements [first, last) based on the - * function object \p pred which is applied to a range of stencil elements. 
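A rough sketch of the stencil form declared above, using a 0/1 flag sequence and thrust::identity<int> as the predicate, mirroring the data used in these snippets:

\code
#include <thrust/partition.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/pair.h>

int main(void)
{
  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
  thrust::device_vector<int> in(A, A + 10), stencil(S, S + 10);
  thrust::device_vector<int> out_true(5), out_false(5);

  typedef thrust::device_vector<int>::iterator Iterator;

  // the predicate is applied to the stencil, not to the values themselves
  thrust::pair<Iterator, Iterator> ends =
    thrust::partition_copy(in.begin(), in.end(), stencil.begin(),
                           out_true.begin(), out_false.begin(),
                           thrust::identity<int>());

  // out_true receives the elements whose stencil flag is nonzero: {2, 4, 6, 8, 10}
  // out_false receives the others: {1, 3, 5, 7, 9}
  // (partition_copy does not promise to keep relative order; stable_partition_copy does)
  return 0;
}
\endcode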
All of the elements - * whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true - * and all the elements whose stencil element fails to satisfy it are copied to the range beginning - * at \p out_false. - * - * \param first The beginning of the sequence to reorder. - * \param last The end of the sequence to reorder. - * \param stencil The beginning of the stencil sequence. - * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. - * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return A \p pair p such that p.first is the end of the output range beginning - * at \p out_true and p.second is the end of the output range beginning at - * \p out_false. - * - * \tparam InputIterator1 is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The input ranges shall not overlap with either output range. - * - * The following code snippet demonstrates how to use \p partition_copy to separate a - * sequence into two output sequences of even and odd numbers. - * - * \code - * #include - * #include - * ... - * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - * int result[10]; - * const int N = sizeof(A)/sizeof(int); - * int *evens = result; - * int *odds = result + 5; - * thrust::stable_partition_copy(A, A + N, S, evens, odds, thrust::identity()); - * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - * // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0, 1} - * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * // evens points to {2, 4, 6, 8, 10} - * // odds points to {1, 3, 5, 7, 9} - * \endcode - * - * \note The relative order of elements in the two reordered sequences is not - * necessarily the same as it was in the original sequence. A different algorithm, - * \ref stable_partition_copy, does guarantee to preserve the relative order. - * - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf - * \see \p stable_partition_copy - * \see \p partition - */ -template - thrust::pair - partition_copy(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -/*! \p stable_partition is much like \ref partition : it reorders the elements in the - * range [first, last) based on the function object \p pred, such that all of - * the elements that satisfy \p pred precede all of the elements that fail to satisfy - * it. The postcondition is that, for some iterator \p middle in the range - * [first, last), pred(*i) is \c true for every iterator \c i in the - * range [first,middle) and \c false for every iterator \c i in the range - * [middle, last). The return value of \p stable_partition is \c middle. - * - * \p stable_partition differs from \ref partition in that \p stable_partition is - * guaranteed to preserve relative order. 
That is, if \c x and \c y are elements in - * [first, last), and \c stencil_x and \c stencil_y are the stencil elements - * in corresponding positions within [stencil, stencil + (last - first)), - * and pred(stencil_x) == pred(stencil_y), and if \c x precedes - * \c y, then it will still be true after \p stable_partition that \c x precedes \c y. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The first element of the sequence to reorder. - * \param last One position past the last element of the sequence to reorder. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return An iterator referring to the first element of the second partition, that is, - * the sequence of the elements which do not satisfy pred. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type, - * and \p ForwardIterator is mutable. - * \tparam Predicate is a model of Predicate. - * - * The following code snippet demonstrates how to use \p stable_partition to reorder a - * sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * const int N = sizeof(A)/sizeof(int); - * thrust::stable_partition(thrust::host, - * A, A + N, - * is_even()); - * // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * \endcode - * - * \see http://www.sgi.com/tech/stl/stable_partition.html - * \see \p partition - * \see \p stable_partition_copy - */ -template - ForwardIterator stable_partition(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - - -/*! \p stable_partition is much like \ref partition : it reorders the elements in the - * range [first, last) based on the function object \p pred, such that all of - * the elements that satisfy \p pred precede all of the elements that fail to satisfy - * it. The postcondition is that, for some iterator \p middle in the range - * [first, last), pred(*i) is \c true for every iterator \c i in the - * range [first,middle) and \c false for every iterator \c i in the range - * [middle, last). The return value of \p stable_partition is \c middle. - * - * \p stable_partition differs from \ref partition in that \p stable_partition is - * guaranteed to preserve relative order. That is, if \c x and \c y are elements in - * [first, last), and \c stencil_x and \c stencil_y are the stencil elements - * in corresponding positions within [stencil, stencil + (last - first)), - * and pred(stencil_x) == pred(stencil_y), and if \c x precedes - * \c y, then it will still be true after \p stable_partition that \c x precedes \c y. - * - * \param first The first element of the sequence to reorder. - * \param last One position past the last element of the sequence to reorder. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. 
- * \return An iterator referring to the first element of the second partition, that is, - * the sequence of the elements which do not satisfy pred. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type, - * and \p ForwardIterator is mutable. - * \tparam Predicate is a model of Predicate. - * - * The following code snippet demonstrates how to use \p stable_partition to reorder a - * sequence so that even numbers precede odd numbers. - * - * \code - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * const int N = sizeof(A)/sizeof(int); - * thrust::stable_partition(A, A + N, - * is_even()); - * // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * \endcode - * - * \see http://www.sgi.com/tech/stl/stable_partition.html - * \see \p partition - * \see \p stable_partition_copy - */ -template - ForwardIterator stable_partition(ForwardIterator first, - ForwardIterator last, - Predicate pred); - - -/*! \p stable_partition is much like \p partition: it reorders the elements in the - * range [first, last) based on the function object \p pred applied to a stencil - * range [stencil, stencil + (last - first)), such that all of - * the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose - * corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator - * \c middle in the range [first, last), pred(*stencil_i) is \c true for every iterator - * \c stencil_i in the range [stencil,stencil + (middle - first)) and \c false for every iterator \c stencil_i - * in the range [stencil + (middle - first), stencil + (last - first)). - * The return value of \p stable_partition is \c middle. - * - * \p stable_partition differs from \ref partition in that \p stable_partition is - * guaranteed to preserve relative order. That is, if \c x and \c y are elements in - * [first, last), such that pred(x) == pred(y), and if \c x precedes - * \c y, then it will still be true after \p stable_partition that \c x precedes \c y. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The first element of the sequence to reorder. - * \param last One position past the last element of the sequence to reorder. - * \param stencil The beginning of the stencil sequence. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return An iterator referring to the first element of the second partition, that is, - * the sequence of the elements whose stencil elements do not satisfy \p pred. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * \pre The range [first, last) shall not overlap with the range [stencil, stencil + (last - first)). 
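Alongside the stencil example below, a short sketch of the plain stable_partition overload using the thrust::host policy on a raw array, as in the examples above; unlike partition, the relative order inside each group is preserved:

\code
#include <thrust/partition.h>
#include <thrust/execution_policy.h>

struct is_even
{
  __host__ __device__
  bool operator()(int x) const { return (x % 2) == 0; }
};

int main(void)
{
  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  const int N = sizeof(A) / sizeof(int);

  // evens first, odds second, each group keeping its original order
  int *middle = thrust::stable_partition(thrust::host, A, A + N, is_even());

  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}; middle == A + 5
  return 0;
}
\endcode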
- * - * The following code snippet demonstrates how to use \p stable_partition to reorder a - * sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - * int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * const int N = sizeof(A)/sizeof(int); - * thrust::stable_partition(thrust::host, A, A + N, S, is_even()); - * // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} - * // S is unmodified - * \endcode - * - * \see http://www.sgi.com/tech/stl/stable_partition.html - * \see \p partition - * \see \p stable_partition_copy - */ -template - ForwardIterator stable_partition(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - - -/*! \p stable_partition is much like \p partition: it reorders the elements in the - * range [first, last) based on the function object \p pred applied to a stencil - * range [stencil, stencil + (last - first)), such that all of - * the elements whose corresponding stencil element satisfies \p pred precede all of the elements whose - * corresponding stencil element fails to satisfy it. The postcondition is that, for some iterator - * \c middle in the range [first, last), pred(*stencil_i) is \c true for every iterator - * \c stencil_i in the range [stencil,stencil + (middle - first)) and \c false for every iterator \c stencil_i - * in the range [stencil + (middle - first), stencil + (last - first)). - * The return value of \p stable_partition is \c middle. - * - * \p stable_partition differs from \ref partition in that \p stable_partition is - * guaranteed to preserve relative order. That is, if \c x and \c y are elements in - * [first, last), such that pred(x) == pred(y), and if \c x precedes - * \c y, then it will still be true after \p stable_partition that \c x precedes \c y. - * - * \param first The first element of the sequence to reorder. - * \param last One position past the last element of the sequence to reorder. - * \param stencil The beginning of the stencil sequence. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return An iterator referring to the first element of the second partition, that is, - * the sequence of the elements whose stencil elements do not satisfy \p pred. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * \pre The range [first, last) shall not overlap with the range [stencil, stencil + (last - first)). - * - * The following code snippet demonstrates how to use \p stable_partition to reorder a - * sequence so that even numbers precede odd numbers. - * - * \code - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... 
- * int A[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - * int S[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * const int N = sizeof(A)/sizeof(int); - * thrust::stable_partition(A, A + N, S, is_even()); - * // A is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} - * // S is unmodified - * \endcode - * - * \see http://www.sgi.com/tech/stl/stable_partition.html - * \see \p partition - * \see \p stable_partition_copy - */ -template - ForwardIterator stable_partition(ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - - -/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered - * sequence is written to different output sequences, rather than in place. - * - * \p stable_partition_copy copies the elements [first, last) based on the - * function object \p pred. All of the elements that satisfy \p pred are copied - * to the range beginning at \p out_true and all the elements that fail to satisfy it - * are copied to the range beginning at \p out_false. - * - * \p stable_partition_copy differs from \ref partition_copy in that - * \p stable_partition_copy is guaranteed to preserve relative order. That is, if - * \c x and \c y are elements in [first, last), such that - * pred(x) == pred(y), and if \c x precedes \c y, then it will still be true - * after \p stable_partition_copy that \c x precedes \c y in the output. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The first element of the sequence to reorder. - * \param last One position past the last element of the sequence to reorder. - * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. - * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return A \p pair p such that p.first is the end of the output range beginning - * at \p out_true and p.second is the end of the output range beginning at - * \p out_false. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type - * is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The input ranges shall not overlap with either output range. - * - * The following code snippet demonstrates how to use \p stable_partition_copy to - * reorder a sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... 
- * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * int result[10]; - * const int N = sizeof(A)/sizeof(int); - * int *evens = result; - * int *odds = result + 5; - * thrust::stable_partition_copy(thrust::host, A, A + N, evens, odds, is_even()); - * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * // evens points to {2, 4, 6, 8, 10} - * // odds points to {1, 3, 5, 7, 9} - * \endcode - * - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf - * \see \p partition_copy - * \see \p stable_partition - */ -template - thrust::pair - stable_partition_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered - * sequence is written to different output sequences, rather than in place. - * - * \p stable_partition_copy copies the elements [first, last) based on the - * function object \p pred. All of the elements that satisfy \p pred are copied - * to the range beginning at \p out_true and all the elements that fail to satisfy it - * are copied to the range beginning at \p out_false. - * - * \p stable_partition_copy differs from \ref partition_copy in that - * \p stable_partition_copy is guaranteed to preserve relative order. That is, if - * \c x and \c y are elements in [first, last), such that - * pred(x) == pred(y), and if \c x precedes \c y, then it will still be true - * after \p stable_partition_copy that \c x precedes \c y in the output. - * - * \param first The first element of the sequence to reorder. - * \param last One position past the last element of the sequence to reorder. - * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. - * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return A \p pair p such that p.first is the end of the output range beginning - * at \p out_true and p.second is the end of the output range beginning at - * \p out_false. - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type - * is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The input ranges shall not overlap with either output range. - * - * The following code snippet demonstrates how to use \p stable_partition_copy to - * reorder a sequence so that even numbers precede odd numbers. - * - * \code - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * ... 
- * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * int result[10]; - * const int N = sizeof(A)/sizeof(int); - * int *evens = result; - * int *odds = result + 5; - * thrust::stable_partition_copy(A, A + N, evens, odds, is_even()); - * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * // evens points to {2, 4, 6, 8, 10} - * // odds points to {1, 3, 5, 7, 9} - * \endcode - * - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf - * \see \p partition_copy - * \see \p stable_partition - */ -template - thrust::pair - stable_partition_copy(InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered - * sequence is written to different output sequences, rather than in place. - * - * \p stable_partition_copy copies the elements [first, last) based on the - * function object \p pred which is applied to a range of stencil elements. All of the elements - * whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true - * and all the elements whose stencil element fails to satisfy it are copied to the range beginning - * at \p out_false. - * - * \p stable_partition_copy differs from \ref partition_copy in that - * \p stable_partition_copy is guaranteed to preserve relative order. That is, if - * \c x and \c y are elements in [first, last), such that - * pred(x) == pred(y), and if \c x precedes \c y, then it will still be true - * after \p stable_partition_copy that \c x precedes \c y in the output. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The first element of the sequence to reorder. - * \param last One position past the last element of the sequence to reorder. - * \param stencil The beginning of the stencil sequence. - * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. - * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return A \p pair p such that p.first is the end of the output range beginning - * at \p out_true and p.second is the end of the output range beginning at - * \p out_false. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The input ranges shall not overlap with either output range. - * - * The following code snippet demonstrates how to use \p stable_partition_copy to - * reorder a sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... 
- * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - * int result[10]; - * const int N = sizeof(A)/sizeof(int); - * int *evens = result; - * int *odds = result + 5; - * thrust::stable_partition_copy(thrust::host, A, A + N, S, evens, odds, thrust::identity()); - * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - * // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0, 1} - * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * // evens points to {2, 4, 6, 8, 10} - * // odds points to {1, 3, 5, 7, 9} - * \endcode - * - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf - * \see \p partition_copy - * \see \p stable_partition - */ -template - thrust::pair - stable_partition_copy(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered - * sequence is written to different output sequences, rather than in place. - * - * \p stable_partition_copy copies the elements [first, last) based on the - * function object \p pred which is applied to a range of stencil elements. All of the elements - * whose corresponding stencil element satisfies \p pred are copied to the range beginning at \p out_true - * and all the elements whose stencil element fails to satisfy it are copied to the range beginning - * at \p out_false. - * - * \p stable_partition_copy differs from \ref partition_copy in that - * \p stable_partition_copy is guaranteed to preserve relative order. That is, if - * \c x and \c y are elements in [first, last), such that - * pred(x) == pred(y), and if \c x precedes \c y, then it will still be true - * after \p stable_partition_copy that \c x precedes \c y in the output. - * - * \param first The first element of the sequence to reorder. - * \param last One position past the last element of the sequence to reorder. - * \param stencil The beginning of the stencil sequence. - * \param out_true The destination of the resulting sequence of elements which satisfy \p pred. - * \param out_false The destination of the resulting sequence of elements which fail to satisfy \p pred. - * \param pred A function object which decides to which partition each element of the - * sequence [first, last) belongs. - * \return A \p pair p such that p.first is the end of the output range beginning - * at \p out_true and p.second is the end of the output range beginning at - * \p out_false. - * - * \tparam InputIterator1 is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The input ranges shall not overlap with either output range. - * - * The following code snippet demonstrates how to use \p stable_partition_copy to - * reorder a sequence so that even numbers precede odd numbers. - * - * \code - * #include - * #include - * ... 
- * int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * int S[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - * int result[10]; - * const int N = sizeof(A)/sizeof(int); - * int *evens = result; - * int *odds = result + 5; - * thrust::stable_partition_copy(A, A + N, S, evens, odds, thrust::identity()); - * // A remains {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - * // S remains {0, 1, 0, 1, 0, 1, 0, 1, 0, 1} - * // result is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9} - * // evens points to {2, 4, 6, 8, 10} - * // odds points to {1, 3, 5, 7, 9} - * \endcode - * - * \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf - * \see \p partition_copy - * \see \p stable_partition - */ -template - thrust::pair - stable_partition_copy(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -/*! \} // end stream_compaction - */ - -/*! \} // end reordering - */ - -/*! \addtogroup searching - * \{ - */ - - -/*! \p partition_point returns an iterator pointing to the end of the true - * partition of a partitioned range. \p partition_point requires the input range - * [first,last) to be a partition; that is, all elements which satisfy - * pred shall appear before those that do not. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range to consider. - * \param last The end of the range to consider. - * \param pred A function object which decides to which partition each element of the - * range [first, last) belongs. - * \return An iterator \c mid such that all_of(first, mid, pred) - * and none_of(mid, last, pred) are both true. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * \pre The range [first, last) shall be partitioned by \p pred. - * - * \note Though similar, \p partition_point is not redundant with \p find_if_not. - * \p partition_point's precondition provides an opportunity for a - * faster implemention. - * - * \code - * #include - * #include - * - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * - * ... - * - * int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}; - * int * B = thrust::partition_point(thrust::host, A, A + 10, is_even()); - * // B - A is 5 - * // [A, B) contains only even values - * \endcode - * - * \see \p partition - * \see \p find_if_not - */ -template - ForwardIterator partition_point(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - - -/*! \p partition_point returns an iterator pointing to the end of the true - * partition of a partitioned range. \p partition_point requires the input range - * [first,last) to be a partition; that is, all elements which satisfy - * pred shall appear before those that do not. - * \param first The beginning of the range to consider. - * \param last The end of the range to consider. - * \param pred A function object which decides to which partition each element of the - * range [first, last) belongs. - * \return An iterator \c mid such that all_of(first, mid, pred) - * and none_of(mid, last, pred) are both true. 
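The stencil overloads of stable_partition_copy declared above apply pred to the stencil range rather than to the values being copied. A minimal host-callable sketch of how they are typically driven, assuming device_vector storage and using count_if to find where the false range must begin (the function name and container choices are illustrative, not part of the library):

#include <thrust/partition.h>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>

// Illustrative helper (not part of the library): stably split `vals` by a 0/1 stencil.
void split_by_stencil(const thrust::device_vector<int> &vals,
                      const thrust::device_vector<int> &stencil,
                      thrust::device_vector<int> &out)
{
    out.resize(vals.size());

    // the number of "true" stencil entries tells us where the false range begins
    const int n_true = thrust::count_if(stencil.begin(), stencil.end(), thrust::identity<int>());

    thrust::stable_partition_copy(vals.begin(), vals.end(), stencil.begin(),
                                  out.begin(),           // out_true
                                  out.begin() + n_true,  // out_false
                                  thrust::identity<int>());
}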
- * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * \pre The range [first, last) shall be partitioned by \p pred. - * - * \note Though similar, \p partition_point is not redundant with \p find_if_not. - * \p partition_point's precondition provides an opportunity for a - * faster implemention. - * - * \code - * #include - * - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * - * ... - * - * int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}; - * int * B = thrust::partition_point(A, A + 10, is_even()); - * // B - A is 5 - * // [A, B) contains only even values - * \endcode - * - * \see \p partition - * \see \p find_if_not - */ -template - ForwardIterator partition_point(ForwardIterator first, - ForwardIterator last, - Predicate pred); - -/*! \} // searching - */ - -/*! \addtogroup reductions - * \{ - * \addtogroup predicates - * \{ - */ - - -/*! \p is_partitioned returns \c true if the given range - * is partitioned with respect to a predicate, and \c false otherwise. - * - * Specifically, \p is_partitioned returns \c true if [first, last) - * is empty of if [first, last) is partitioned by \p pred, i.e. if - * all elements that satisfy \p pred appear before those that do not. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range to consider. - * \param last The end of the range to consider. - * \param pred A function object which decides to which partition each element of the - * range [first, last) belongs. - * \return \c true if the range [first, last) is partitioned with respect - * to \p pred, or if [first, last) is empty. \c false, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * \code - * #include - * #include - * - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * - * ... - * - * int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}; - * int B[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * - * thrust::is_partitioned(thrust::host, A, A + 10); // returns true - * thrust::is_partitioned(thrust::host, B, B + 10); // returns false - * \endcode - * - * \see \p partition - */ -template - bool is_partitioned(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - Predicate pred); - - -/*! \p is_partitioned returns \c true if the given range - * is partitioned with respect to a predicate, and \c false otherwise. - * - * Specifically, \p is_partitioned returns \c true if [first, last) - * is empty of if [first, last) is partitioned by \p pred, i.e. if - * all elements that satisfy \p pred appear before those that do not. - * - * \param first The beginning of the range to consider. - * \param last The end of the range to consider. - * \param pred A function object which decides to which partition each element of the - * range [first, last) belongs. - * \return \c true if the range [first, last) is partitioned with respect - * to \p pred, or if [first, last) is empty. \c false, otherwise. 
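The note that partition_point is not redundant with find_if_not comes down to the precondition: because the range is already partitioned, the boundary can be located with a binary search instead of a linear scan. A hedged sketch of that idea in plain C++, not the library's actual implementation:

#include <iterator>

// Sketch only: find the end of the true partition in O(log n) applications of pred,
// assuming [first, last) is already partitioned by pred.
template <typename ForwardIterator, typename Predicate>
ForwardIterator partition_point_sketch(ForwardIterator first, ForwardIterator last, Predicate pred)
{
    typedef typename std::iterator_traits<ForwardIterator>::difference_type diff_t;

    diff_t len = std::distance(first, last);
    while (len > 0)
    {
        const diff_t half = len / 2;
        ForwardIterator mid = first;
        std::advance(mid, half);

        if (pred(*mid))
        {
            first = ++mid;      // the true partition extends past mid
            len -= half + 1;
        }
        else
        {
            len = half;         // the boundary is at or before mid
        }
    }
    return first;
}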
- * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * \code - * #include - * - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int &x) - * { - * return (x % 2) == 0; - * } - * }; - * - * ... - * - * int A[] = {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}; - * int B[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - * - * thrust::is_partitioned(A, A + 10); // returns true - * thrust::is_partitioned(B, B + 10); // returns false - * \endcode - * - * \see \p partition - */ -template - bool is_partitioned(InputIterator first, - InputIterator last, - Predicate pred); - - -/*! \} // end predicates - * \} // end reductions - */ - - -} // end thrust - -#include - diff --git a/compat/thrust/random.h b/compat/thrust/random.h deleted file mode 100644 index 5a2c00da86..0000000000 --- a/compat/thrust/random.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file random.h - * \brief Pseudo-random number generators. - */ - -#pragma once - -#include -#include - -// RNGs -#include -#include -#include -#include -#include - -// distributions -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup random Random Number Generation - * \{ - */ - - -/*! \namespace thrust::random - * \brief \p thrust::random is the namespace which contains random number engine class templates, - * random number engine adaptor class templates, engines with predefined parameters, - * and random number distribution class templates. They are provided in a separate namespace - * for import convenience but are also aliased in the top-level \p thrust namespace for - * easy access. - */ -namespace random -{ - -/*! \addtogroup predefined_random Random Number Engines with Predefined Parameters - * \ingroup random - * \{ - */ - -/*! \typedef ranlux24 - * \brief A random number engine with predefined parameters which implements the - * RANLUX level-3 random number generation algorithm. - * \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux24 - * shall produce the value \c 9901578 . - */ -typedef discard_block_engine ranlux24; - - -/*! \typedef ranlux48 - * \brief A random number engine with predefined parameters which implements the - * RANLUX level-4 random number generation algorithm. - * \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux48 - * shall produce the value \c 88229545517833 . - */ -typedef discard_block_engine ranlux48; - - -/*! \typedef taus88 - * \brief A random number engine with predefined parameters which implements - * L'Ecuyer's 1996 three-component Tausworthe random number generator. - * - * \note The 10000th consecutive invocation of a default-constructed object of type \p taus88 - * shall produce the value \c 3535848941 . 
- */ -typedef xor_combine_engine< - linear_feedback_shift_engine, - 0, - xor_combine_engine< - linear_feedback_shift_engine, 0, - linear_feedback_shift_engine, 0 - >, - 0 -> taus88; - -/*! \typedef default_random_engine - * \brief An implementation-defined "default" random number engine. - * \note \p default_random_engine is currently an alias for \p minstd_rand, and may change - * in a future version. - */ -typedef minstd_rand default_random_engine; - -/*! \} // end predefined_random - */ - -} // end random - - -/*! \} // end random - */ - -// import names into thrust:: -using random::ranlux24; -using random::ranlux48; -using random::taus88; -using random::default_random_engine; - -} // end thrust - diff --git a/compat/thrust/random/detail/discard_block_engine.inl b/compat/thrust/random/detail/discard_block_engine.inl deleted file mode 100644 index 5f01bd1816..0000000000 --- a/compat/thrust/random/detail/discard_block_engine.inl +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -namespace thrust -{ - -namespace random -{ - - -template - discard_block_engine - ::discard_block_engine() - : m_e(), m_n(0) -{} - - -template - discard_block_engine - ::discard_block_engine(result_type s) - : m_e(s), m_n(0) -{} - - -template - discard_block_engine - ::discard_block_engine(const base_type &urng) - : m_e(urng), m_n(0) -{} - - -template - void discard_block_engine - ::seed(void) -{ - m_e.seed(); - m_n = 0; -} - - -template - void discard_block_engine - ::seed(result_type s) -{ - m_e.seed(s); - m_n = 0; -} - - -template - typename discard_block_engine::result_type - discard_block_engine - ::operator()(void) -{ - if(m_n >= used_block) - { - m_e.discard(block_size - m_n); -// for(; m_n < block_size; ++m_n) -// m_e(); - m_n = 0; - } - - ++m_n; - - return m_e(); -} - - -template - void discard_block_engine - ::discard(unsigned long long z) -{ - // XXX this should be accelerated - for(; z > 0; --z) - { - this->operator()(); - } // end for -} - - -template - const typename discard_block_engine::base_type & - discard_block_engine - ::base(void) const -{ - return m_e; -} - - -template - template - std::basic_ostream& discard_block_engine - ::stream_out(std::basic_ostream &os) const -{ - typedef std::basic_ostream ostream_type; - typedef typename ostream_type::ios_base ios_base; - - // save old flags & fill character - const typename ios_base::fmtflags flags = os.flags(); - const CharT fill = os.fill(); - - const CharT space = os.widen(' '); - os.flags(ios_base::dec | ios_base::fixed | ios_base::left); - os.fill(space); - - // output the base engine followed by n - os << m_e << space << m_n; - - // restore flags & fill character - os.flags(flags); - os.fill(fill); - - return os; -} - - -template - template - std::basic_istream& discard_block_engine - ::stream_in(std::basic_istream &is) -{ - typedef std::basic_istream istream_type; - typedef typename istream_type::ios_base ios_base; - - // save old flags - 
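As a usage note for the engines deleted above, every predefined engine plugs into the distributions in the same way. A minimal host-side sketch with default_random_engine; the seed and the [1, 6] range are arbitrary:

#include <thrust/random.h>

// Hedged usage sketch: the seed and the range are arbitrary.
int roll_die()
{
    thrust::default_random_engine rng(1234);           // minstd_rand under the hood, per the typedef above
    thrust::uniform_int_distribution<int> dist(1, 6);
    return dist(rng);                                   // uniformly distributed in [1, 6]
}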
const typename ios_base::fmtflags flags = is.flags(); - - is.flags(ios_base::skipws); - - // input the base engine and then n - is >> m_e >> m_n; - - // restore old flags - is.flags(flags); - return is; -} - - -template - bool discard_block_engine - ::equal(const discard_block_engine &rhs) const -{ - return (m_e == rhs.m_e) && (m_n == rhs.m_n); -} - - -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const discard_block_engine &e) -{ - return thrust::random::detail::random_core_access::stream_out(os,e); -} - - -template -std::basic_istream& -operator>>(std::basic_istream &is, - discard_block_engine &e) -{ - return thrust::random::detail::random_core_access::stream_in(is,e); -} - - -template -bool operator==(const discard_block_engine &lhs, - const discard_block_engine &rhs) -{ - return thrust::random::detail::random_core_access::equal(lhs,rhs); -} - - -template -bool operator!=(const discard_block_engine &lhs, - const discard_block_engine &rhs) -{ - return !(lhs == rhs); -} - - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/linear_congruential_engine.inl b/compat/thrust/random/detail/linear_congruential_engine.inl deleted file mode 100644 index f040563cc2..0000000000 --- a/compat/thrust/random/detail/linear_congruential_engine.inl +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
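The discard_block_engine implementation deleted above returns only the first used_block values of every block_size values produced by the base engine. A standalone sketch of that policy, not the library code, with p and r standing in for block_size and used_block:

#include <cstddef>

// Standalone sketch (not the library code): of every block of p values produced by the
// base engine, return the first r and silently skip the rest, as discard_block_engine does.
template <typename Engine, std::size_t p, std::size_t r>
class block_discard_sketch
{
    Engine      m_e;
    std::size_t m_n;    // position within the current block

public:
    block_discard_sketch() : m_e(), m_n(0) {}

    typename Engine::result_type operator()()
    {
        if (m_n >= r)                  // the usable part of this block is exhausted
        {
            m_e.discard(p - m_n);      // skip to the start of the next block
            m_n = 0;
        }
        ++m_n;
        return m_e();
    }
};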
- */ - -#include -#include -#include - -namespace thrust -{ - -namespace random -{ - - -template - linear_congruential_engine - ::linear_congruential_engine(result_type s) -{ - seed(s); -} // end linear_congruential_engine::linear_congruential_engine() - - -template - void linear_congruential_engine - ::seed(result_type s) -{ - if((detail::mod(c) == 0) && - (detail::mod(s) == 0)) - m_x = detail::mod(1); - else - m_x = detail::mod(s); -} // end linear_congruential_engine::seed() - - -template - typename linear_congruential_engine::result_type - linear_congruential_engine - ::operator()(void) -{ - m_x = detail::mod(m_x); - return m_x; -} // end linear_congruential_engine::operator()() - - -template - void linear_congruential_engine - ::discard(unsigned long long z) -{ - thrust::random::detail::linear_congruential_engine_discard::discard(*this,z); -} // end linear_congruential_engine::discard() - - -template - template - std::basic_ostream& linear_congruential_engine - ::stream_out(std::basic_ostream &os) const -{ - typedef std::basic_ostream ostream_type; - typedef typename ostream_type::ios_base ios_base; - - // save old flags & fill character - const typename ios_base::fmtflags flags = os.flags(); - const CharT fill = os.fill(); - - os.flags(ios_base::dec | ios_base::fixed | ios_base::left); - os.fill(os.widen(' ')); - - // output one word of state - os << m_x; - - // restore flags & fill character - os.flags(flags); - os.fill(fill); - - return os; -} - - -template - template - std::basic_istream& linear_congruential_engine - ::stream_in(std::basic_istream &is) -{ - typedef std::basic_istream istream_type; - typedef typename istream_type::ios_base ios_base; - - // save old flags - const typename ios_base::fmtflags flags = is.flags(); - - is.flags(ios_base::dec); - - // input one word of state - is >> m_x; - - // restore flags - is.flags(flags); - - return is; -} - - -template -bool linear_congruential_engine - ::equal(const linear_congruential_engine &rhs) const -{ - return m_x == rhs.m_x; -} - - -template -__host__ __device__ -bool operator==(const linear_congruential_engine &lhs, - const linear_congruential_engine &rhs) -{ - return detail::random_core_access::equal(lhs,rhs); -} - - -template -bool operator!=(const linear_congruential_engine &lhs, - const linear_congruential_engine &rhs) -{ - return !(lhs == rhs); -} - - -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const linear_congruential_engine &e) -{ - return detail::random_core_access::stream_out(os,e); -} - - -template -std::basic_istream& -operator>>(std::basic_istream &is, - linear_congruential_engine &e) -{ - return detail::random_core_access::stream_in(is,e); -} - - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/linear_congruential_engine_discard.h b/compat/thrust/random/detail/linear_congruential_engine_discard.h deleted file mode 100644 index f4ec23364e..0000000000 --- a/compat/thrust/random/detail/linear_congruential_engine_discard.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
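The engine removed above advances by the classic linear congruential recurrence x <- (a*x + c) mod m, delegating the modular step to the static_mod helper that appears further below in this diff. A plain sketch of one step using the well-known minstd parameters (a = 48271, c = 0, m = 2^31 - 1), with 64-bit arithmetic standing in for the overflow-free helper:

#include <cstdint>

// One step of the recurrence implemented above: x <- (a*x + c) mod m.
// Shown with the classic minstd parameters; 64-bit arithmetic avoids overflow here,
// whereas the library relies on the overflow-free static_mod helper (mod.h, below).
inline std::uint32_t lcg_step(std::uint32_t x)
{
    const std::uint64_t a = 48271u;
    const std::uint64_t c = 0u;
    const std::uint64_t m = 2147483647u;   // 2^31 - 1
    return static_cast<std::uint32_t>((a * x + c) % m);
}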
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -namespace random -{ - -namespace detail -{ - - -template - struct linear_congruential_engine_discard_implementation -{ - __host__ __device__ - static void discard(UIntType &state, unsigned long long z) - { - for(; z > 0; --z) - { - state = detail::mod(state); - } - } -}; // end linear_congruential_engine_discard - - -// specialize for small integers and c == 0 -// XXX figure out a robust implemenation of this for any unsigned integer type later -template - struct linear_congruential_engine_discard_implementation -{ - __host__ __device__ - static void discard(thrust::detail::uint32_t &state, unsigned long long z) - { - const thrust::detail::uint32_t modulus = m; - - // XXX we need to use unsigned long long here or we will encounter overflow in the - // multiplies below - // figure out a robust implementation of this later - unsigned long long multiplier = a; - unsigned long long multiplier_to_z = 1; - - // see http://en.wikipedia.org/wiki/Modular_exponentiation - while(z > 0) - { - if(z & 1) - { - // multiply in this bit's contribution while using modulus to keep result small - multiplier_to_z = (multiplier_to_z * multiplier) % modulus; - } - - // move to the next bit of the exponent, square (and mod) the base accordingly - z >>= 1; - multiplier = (multiplier * multiplier) % modulus; - } - - state = static_cast((multiplier_to_z * state) % modulus); - } -}; // end linear_congruential_engine_discard - - -struct linear_congruential_engine_discard -{ - template - __host__ __device__ - static void discard(LinearCongruentialEngine &lcg, unsigned long long z) - { - typedef typename LinearCongruentialEngine::result_type result_type; - const result_type c = LinearCongruentialEngine::increment; - const result_type a = LinearCongruentialEngine::multiplier; - const result_type m = LinearCongruentialEngine::modulus; - - // XXX WAR unused variable warnings - (void) c; - (void) a; - (void) m; - - linear_congruential_engine_discard_implementation::discard(lcg.m_x, z); - } -}; // end linear_congruential_engine_discard - - -} // end detail - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/linear_feedback_shift_engine.inl b/compat/thrust/random/detail/linear_feedback_shift_engine.inl deleted file mode 100644 index 4e8dad5140..0000000000 --- a/compat/thrust/random/detail/linear_feedback_shift_engine.inl +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
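The discard specialisation above jumps an increment-free LCG forward z steps by computing a^z mod m with square-and-multiply, since x_z = a^z * x_0 mod m when c = 0. A standalone sketch of the same idea:

#include <cstdint>

// Square-and-multiply: compute base^exp mod m, as in the c == 0 specialisation above.
inline std::uint64_t modpow(std::uint64_t base, std::uint64_t exp, std::uint64_t m)
{
    std::uint64_t result = 1;
    base %= m;
    while (exp > 0)
    {
        if (exp & 1)
            result = (result * base) % m;   // fold in this bit's contribution
        exp >>= 1;
        base = (base * base) % m;           // square the base for the next bit
    }
    return result;
}

// Jumping an increment-free LCG ahead by z outputs: x_z = a^z * x_0 mod m.
inline std::uint32_t lcg_discard(std::uint32_t state, unsigned long long z,
                                 std::uint32_t a, std::uint32_t m)
{
    return static_cast<std::uint32_t>((modpow(a, z, m) * state) % m);
}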
- */ - -#include - -namespace thrust -{ - -namespace random -{ - -template - linear_feedback_shift_engine - ::linear_feedback_shift_engine(result_type value) -{ - seed(value); -} // end linear_feedback_shift_engine::linear_feedback_shift_engine() - -template - void linear_feedback_shift_engine - ::seed(result_type value) -{ - m_value = value; -} // end linear_feedback_shift_engine::seed() - -template - typename linear_feedback_shift_engine::result_type - linear_feedback_shift_engine - ::operator()(void) -{ - const UIntType b = (((m_value << q) ^ m_value) & wordmask) >> (k-s); - const UIntType mask = ( (~static_cast(0)) << (w-k) ) & wordmask; - m_value = ((m_value & mask) << s) ^ b; - return m_value; -} // end linear_feedback_shift_engine::operator()() - - -template - void linear_feedback_shift_engine - ::discard(unsigned long long z) -{ - for(; z > 0; --z) - { - this->operator()(); - } // end for -} // end linear_feedback_shift_engine::discard() - - -template - template - std::basic_ostream& linear_feedback_shift_engine - ::stream_out(std::basic_ostream &os) const -{ - typedef std::basic_ostream ostream_type; - typedef typename ostream_type::ios_base ios_base; - - // save old flags & fill character - const typename ios_base::fmtflags flags = os.flags(); - const CharT fill = os.fill(); - - os.flags(ios_base::dec | ios_base::fixed | ios_base::left); - os.fill(os.widen(' ')); - - // output one word of state - os << m_value; - - // restore flags & fill character - os.flags(flags); - os.fill(fill); - - return os; -} - - -template - template - std::basic_istream& linear_feedback_shift_engine - ::stream_in(std::basic_istream &is) -{ - typedef std::basic_istream istream_type; - typedef typename istream_type::ios_base ios_base; - - // save old flags - const typename ios_base::fmtflags flags = is.flags(); - - is.flags(ios_base::skipws); - - // input one word of state - is >> m_value; - - // restore flags - is.flags(flags); - - return is; -} - - -template - bool linear_feedback_shift_engine - ::equal(const linear_feedback_shift_engine &rhs) const -{ - return m_value == rhs.m_value; -} - - -template -bool operator==(const linear_feedback_shift_engine &lhs, - const linear_feedback_shift_engine &rhs) -{ - return thrust::random::detail::random_core_access::equal(lhs,rhs); -} - - -template -bool operator!=(const linear_feedback_shift_engine &lhs, - const linear_feedback_shift_engine &rhs) -{ - return !(lhs == rhs); -} - - -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const linear_feedback_shift_engine &e) -{ - return thrust::random::detail::random_core_access::stream_out(os,e); -} - - -template -std::basic_istream& -operator>>(std::basic_istream &is, - linear_feedback_shift_engine &e) -{ - return thrust::random::detail::random_core_access::stream_in(is,e); -} - - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/linear_feedback_shift_engine_wordmask.h b/compat/thrust/random/detail/linear_feedback_shift_engine_wordmask.h deleted file mode 100644 index ed9e51e925..0000000000 --- a/compat/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
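The operator() deleted above is one step of a Tausworthe (linear feedback shift) generator. Written out for a 32-bit word it reduces to the following sketch; taus88 XORs three such components, commonly parameterised with (k, q, s) = (31, 13, 12), (29, 2, 4) and (28, 3, 17):

#include <cstdint>

// One Tausworthe component step for a 32-bit word (w = 32), mirroring operator() above.
inline std::uint32_t lfsr_step(std::uint32_t &z, unsigned k, unsigned q, unsigned s)
{
    const std::uint32_t b    = ((z << q) ^ z) >> (k - s);
    const std::uint32_t mask = (~std::uint32_t(0)) << (32u - k);
    z = ((z & mask) << s) ^ b;
    return z;
}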
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace thrust -{ - -namespace random -{ - -namespace detail -{ - -template - struct linear_feedback_shift_engine_wordmask -{ - static const T value = - (T(1u) << i) | - linear_feedback_shift_engine_wordmask::value; -}; // end linear_feedback_shift_engine_wordmask - -template - struct linear_feedback_shift_engine_wordmask -{ - static const T value = 0; -}; // end linear_feedback_shift_engine_wordmask - -} // end detail - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/mod.h b/compat/thrust/random/detail/mod.h deleted file mode 100644 index ceb2191552..0000000000 --- a/compat/thrust/random/detail/mod.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace thrust -{ - -namespace random -{ - -namespace detail -{ - -template - struct static_mod -{ - static const T q = m / a; - static const T r = m % a; - - __host__ __device__ - T operator()(T x) const - { - if(a == 1) - { - x %= m; - } - else - { - T t1 = a * (x % q); - T t2 = r * (x / q); - if(t1 >= t2) - { - x = t1 - t2; - } - else - { - x = m - t2 + t1; - } - } - - if(c != 0) - { - const T d = m - x; - if(d > c) - { - x += c; - } - else - { - x = c - d; - } - } - - return x; - } -}; // end static_mod - - -// Rely on machine overflow handling -template - struct static_mod -{ - __host__ __device__ - T operator()(T x) const - { - return a * x + c; - } -}; // end static_mod - -template -__host__ __device__ - T mod(T x) -{ - static_mod f; - return f(x); -} // end static_mod - -} // end detail - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/normal_distribution.inl b/compat/thrust/random/detail/normal_distribution.inl deleted file mode 100644 index 1bb55d75b2..0000000000 --- a/compat/thrust/random/detail/normal_distribution.inl +++ /dev/null @@ -1,241 +0,0 @@ -/* - * - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
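static_mod, shown above, evaluates (a*x + c) mod m without intermediate overflow by Schrage's decomposition with q = m / a and r = m % a. A standalone sketch of the same arithmetic (valid in the usual case r < q; the a == 1 shortcut of the original is omitted):

#include <cstdint>

// Schrage-style evaluation of (a*x + c) mod m without intermediate overflow,
// following the decomposition used by static_mod above (q = m / a, r = m % a).
inline std::uint32_t mod_mul_add(std::uint32_t a, std::uint32_t c,
                                 std::uint32_t m, std::uint32_t x)
{
    const std::uint32_t q  = m / a;
    const std::uint32_t r  = m % a;
    const std::uint32_t t1 = a * (x % q);
    const std::uint32_t t2 = r * (x / q);

    x = (t1 >= t2) ? (t1 - t2) : (m - t2 + t1);

    if (c != 0)
    {
        const std::uint32_t d = m - x;        // headroom before wrapping past m
        x = (d > c) ? (x + c) : (c - d);
    }
    return x;
}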
- */ - -#include -#include -#include -#include - -// for floating point infinity -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#else -#include -#endif - -namespace thrust -{ - -namespace random -{ - - -template - normal_distribution - ::normal_distribution(RealType a, RealType b) - :super_t(),m_param(a,b) -{ -} // end normal_distribution::normal_distribution() - - -template - normal_distribution - ::normal_distribution(const param_type &parm) - :super_t(),m_param(parm) -{ -} // end normal_distribution::normal_distribution() - - -template - void normal_distribution - ::reset(void) -{ - super_t::reset(); -} // end normal_distribution::reset() - - -template - template - typename normal_distribution::result_type - normal_distribution - ::operator()(UniformRandomNumberGenerator &urng) -{ - return operator()(urng, m_param); -} // end normal_distribution::operator()() - - -template - template - typename normal_distribution::result_type - normal_distribution - ::operator()(UniformRandomNumberGenerator &urng, - const param_type &parm) -{ - return super_t::sample(urng, parm.first, parm.second); -} // end normal_distribution::operator()() - - -template - typename normal_distribution::param_type - normal_distribution - ::param(void) const -{ - return m_param; -} // end normal_distribution::param() - - -template - void normal_distribution - ::param(const param_type &parm) -{ - m_param = parm; -} // end normal_distribution::param() - - -template - typename normal_distribution::result_type - normal_distribution - ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const -{ - return -this->max(); -} // end normal_distribution::min() - - -template - typename normal_distribution::result_type - normal_distribution - ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const -{ - // XXX this solution is pretty terrible - // we can't use numeric_traits::max because nvcc will - // complain that it is a __host__ function - union - { - thrust::detail::uint32_t inf_as_int; - float result; - } hack; - - hack.inf_as_int = 0x7f800000u; - - return hack.result; -} // end normal_distribution::max() - - -template - typename normal_distribution::result_type - normal_distribution - ::mean(void) const -{ - return m_param.first; -} // end normal_distribution::mean() - - -template - typename normal_distribution::result_type - normal_distribution - ::stddev(void) const -{ - return m_param.second; -} // end normal_distribution::stddev() - - -template - bool normal_distribution - ::equal(const normal_distribution &rhs) const -{ - return m_param == rhs.param(); -} - - -template - template - std::basic_ostream& - normal_distribution - ::stream_out(std::basic_ostream &os) const -{ - typedef std::basic_ostream ostream_type; - typedef typename ostream_type::ios_base ios_base; - - // save old flags and fill character - const typename ios_base::fmtflags flags = os.flags(); - const CharT fill = os.fill(); - - const CharT space = os.widen(' '); - os.flags(ios_base::dec | ios_base::fixed | ios_base::left); - os.fill(space); - - os << mean() << space << stddev(); - - // restore old flags and fill character - os.flags(flags); - os.fill(fill); - return os; -} - - -template - template - std::basic_istream& - normal_distribution - ::stream_in(std::basic_istream &is) -{ - typedef std::basic_istream istream_type; - typedef typename istream_type::ios_base ios_base; - - // save old flags - const typename ios_base::fmtflags flags = is.flags(); - - is.flags(ios_base::skipws); - - is >> m_param.first >> m_param.second; - - // restore old 
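For reference, the (mean, stddev) parameterisation implemented above is used like any other Thrust distribution. A minimal host-side sketch with an arbitrary seed:

#include <thrust/random.h>

// Hedged usage sketch: arbitrary seed, mean 0 and standard deviation 2.
float sample_gaussian()
{
    thrust::default_random_engine rng(7);
    thrust::normal_distribution<float> dist(0.0f, 2.0f);
    return dist(rng);
}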
flags - is.flags(flags); - return is; -} - - -template -bool operator==(const normal_distribution &lhs, - const normal_distribution &rhs) -{ - return thrust::random::detail::random_core_access::equal(lhs,rhs); -} - - -template -bool operator!=(const normal_distribution &lhs, - const normal_distribution &rhs) -{ - return !(lhs == rhs); -} - - -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const normal_distribution &d) -{ - return thrust::random::detail::random_core_access::stream_out(os,d); -} - - -template -std::basic_istream& -operator>>(std::basic_istream &is, - normal_distribution &d) -{ - return thrust::random::detail::random_core_access::stream_in(is,d); -} - - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/normal_distribution_base.h b/compat/thrust/random/detail/normal_distribution_base.h deleted file mode 100644 index d9166112ad..0000000000 --- a/compat/thrust/random/detail/normal_distribution_base.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright Jens Maurer 2000-2001 - * Distributed under the Boost Software License, Version 1.0. (See - * accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace random -{ -namespace detail -{ - -// this version samples the normal distribution directly -// and uses the non-standard math function erfcinv -template - class normal_distribution_nvcc -{ - protected: - template - __host__ __device__ - RealType sample(UniformRandomNumberGenerator &urng, const RealType mean, const RealType stddev) - { - typedef typename UniformRandomNumberGenerator::result_type uint_type; - const uint_type urng_range = UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min; - - // Constants for conversion - const RealType S1 = static_cast(1) / urng_range; - const RealType S2 = S1 / 2; - - RealType S3 = static_cast(-1.4142135623730950488016887242097); // -sqrt(2) - - // Get the integer value - uint_type u = urng() - UniformRandomNumberGenerator::min; - - // Ensure the conversion to float will give a value in the range [0,0.5) - if(u > (urng_range / 2)) - { - u = urng_range - u; - S3 = -S3; - } - - // Convert to floating point in [0,0.5) - RealType p = u*S1 + S2; - - // Apply inverse error function - return mean + stddev * S3 * erfcinv(2 * p); - } - - // no-op - __host__ __device__ - void reset() {} -}; - -// this version samples the normal distribution using -// Marsaglia's "polar method" -template - class normal_distribution_portable -{ - protected: - normal_distribution_portable() - : m_valid(false) - {} - - normal_distribution_portable(const normal_distribution_portable &other) - : m_valid(other.m_valid) - {} - - void reset() - { - m_valid = false; - } - - // note that we promise to call this member function with the same mean and stddev - template - __host__ 
__device__ - RealType sample(UniformRandomNumberGenerator &urng, const RealType mean, const RealType stddev) - { - // implementation from Boost - // allow for Koenig lookup - using std::sqrt; using std::log; using std::sin; using std::cos; - - if(!m_valid) - { - uniform_real_distribution u01; - m_r1 = u01(urng); - m_r2 = u01(urng); - m_cached_rho = sqrt(-RealType(2) * log(RealType(1)-m_r2)); - - m_valid = true; - } - else - { - m_valid = false; - } - - const RealType pi = RealType(3.14159265358979323846); - - RealType result = m_cached_rho * (m_valid ? - cos(RealType(2)*pi*m_r1) : - sin(RealType(2)*pi*m_r1)); - - return result; - } - - private: - RealType m_r1, m_r2, m_cached_rho; - bool m_valid; -}; - -template - struct normal_distribution_base -{ -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - typedef normal_distribution_nvcc type; -#else - typedef normal_distribution_portable type; -#endif -}; - -} // end detail -} // end random -} // end thrust - diff --git a/compat/thrust/random/detail/random_core_access.h b/compat/thrust/random/detail/random_core_access.h deleted file mode 100644 index 81f58e2207..0000000000 --- a/compat/thrust/random/detail/random_core_access.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace thrust -{ - -namespace random -{ - -namespace detail -{ - -struct random_core_access -{ - -template -static OStream &stream_out(OStream &os, const EngineOrDistribution &x) -{ - return x.stream_out(os); -} - -template -static IStream &stream_in(IStream &is, EngineOrDistribution &x) -{ - return x.stream_in(is); -} - -template -__host__ __device__ -static bool equal(const EngineOrDistribution &lhs, const EngineOrDistribution &rhs) -{ - return lhs.equal(rhs); -} - -}; // end random_core_access - -} // end detail - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/subtract_with_carry_engine.inl b/compat/thrust/random/detail/subtract_with_carry_engine.inl deleted file mode 100644 index a58b2665b2..0000000000 --- a/compat/thrust/random/detail/subtract_with_carry_engine.inl +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
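Despite the "polar method" comment, the portable sampler above is the trigonometric Box-Muller transform: two uniforms give a pair of standard-normal values, and the second of the pair is served from the cache on the next call. A standalone sketch of that caching scheme, scaled by (mean, stddev) for completeness (the class and member names are illustrative):

#include <cmath>
#include <random>

// Sketch of the caching used by normal_distribution_portable::sample above.
class gaussian_pair_sketch
{
    double m_r1 = 0.0, m_rho = 0.0;
    bool   m_have_cached = false;

public:
    template <typename Urng>
    double operator()(Urng &urng, double mean, double stddev)
    {
        const double pi = 3.14159265358979323846;
        double z;

        if (!m_have_cached)
        {
            std::uniform_real_distribution<double> u01(0.0, 1.0);
            m_r1  = u01(urng);
            const double r2 = u01(urng);
            m_rho = std::sqrt(-2.0 * std::log(1.0 - r2));
            z = m_rho * std::cos(2.0 * pi * m_r1);   // first of the pair
            m_have_cached = true;
        }
        else
        {
            z = m_rho * std::sin(2.0 * pi * m_r1);   // second of the pair, from the cache
            m_have_cached = false;
        }
        return mean + stddev * z;
    }
};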
- */ - -#include -#include -#include -#include - -namespace thrust -{ - -namespace random -{ - - -template - subtract_with_carry_engine - ::subtract_with_carry_engine(result_type value) -{ - seed(value); -} // end subtract_with_carry_engine::subtract_with_carry_engine() - - -template - void subtract_with_carry_engine - ::seed(result_type value) -{ - thrust::random::linear_congruential_engine e(value == 0u ? default_seed : value); - - // initialize state - for(size_t i = 0; i < long_lag; ++i) - { - m_x[i] = detail::mod(e()); - } // end for i - - m_carry = (m_x[long_lag-1] == 0); - m_k = 0; -} // end subtract_with_carry_engine::seed() - - -template - typename subtract_with_carry_engine::result_type - subtract_with_carry_engine - ::operator()(void) -{ - // XXX we probably need to cache these m_x[m_k] in a register - // maybe we need to cache the use of all member variables - int short_index = m_k - short_lag; - if(short_index < 0) - short_index += long_lag; - result_type xi; - if (m_x[short_index] >= m_x[m_k] + m_carry) - { - // x(n) >= 0 - xi = m_x[short_index] - m_x[m_k] - m_carry; - m_carry = 0; - } - else - { - // x(n) < 0 - xi = modulus - m_x[m_k] - m_carry + m_x[short_index]; - m_carry = 1; - } - m_x[m_k] = xi; - ++m_k; - if(m_k >= long_lag) - m_k = 0; - return xi; -} // end subtract_with_carry_engine::operator()() - - -template - void subtract_with_carry_engine - ::discard(unsigned long long z) -{ - for(; z > 0; --z) - { - this->operator()(); - } // end for -} // end subtract_with_carry_engine::discard() - - -template - template - std::basic_ostream& subtract_with_carry_engine - ::stream_out(std::basic_ostream &os) const -{ - typedef std::basic_ostream ostream_type; - typedef typename ostream_type::ios_base ios_base; - - const typename ios_base::fmtflags flags = os.flags(); - const CharT fill = os.fill(); - const CharT space = os.widen(' '); - os.flags(ios_base::dec | ios_base::fixed | ios_base::left); - os.fill(space); - - const UIntType long_lag = r; - - for(size_t i = 0; i < r; ++i) - os << m_x[(i + m_k) % long_lag] << space; - os << m_carry; - - os.flags(flags); - os.fill(fill); - return os; -} - - -template - template - std::basic_istream& subtract_with_carry_engine - ::stream_in(std::basic_istream &is) -{ - typedef std::basic_istream istream_type; - typedef typename istream_type::ios_base ios_base; - - const typename ios_base::fmtflags flags = is.flags(); - is.flags(ios_base::dec | ios_base::skipws); - - for(size_t i = 0; i < r; ++i) - is >> m_x[i]; - is >> m_carry; - - m_k = 0; - - is.flags(flags); - return is; -} - - -template - bool subtract_with_carry_engine - ::equal(const subtract_with_carry_engine &rhs) const -{ - const UIntType long_lag = r; - - bool result = true; - for(size_t i = 0; i < r; ++i) - { - result &= (m_x[(i + m_k) % long_lag] == rhs.m_x[(i + rhs.m_k) % long_lag]); - } - - // XXX not sure if this last check is necessary - result &= (m_carry == rhs.m_carry); - - return result; -} - - -template - std::basic_ostream& - operator<<(std::basic_ostream &os, - const subtract_with_carry_engine &e) -{ - return thrust::random::detail::random_core_access::stream_out(os,e); -} - - -template - std::basic_istream& - operator>>(std::basic_istream &is, - subtract_with_carry_engine &e) -{ - return thrust::random::detail::random_core_access::stream_in(is,e); -} - - -template - bool operator==(const subtract_with_carry_engine &lhs, - const subtract_with_carry_engine &rhs) -{ - return thrust::random::detail::random_core_access::equal(lhs,rhs); -} - - -template - bool 
operator!=(const subtract_with_carry_engine &lhs, - const subtract_with_carry_engine &rhs) -{ - return !(lhs == rhs); -} - - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/uniform_int_distribution.inl b/compat/thrust/random/detail/uniform_int_distribution.inl deleted file mode 100644 index e92754c5e7..0000000000 --- a/compat/thrust/random/detail/uniform_int_distribution.inl +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -namespace thrust -{ - -namespace random -{ - - -template - uniform_int_distribution - ::uniform_int_distribution(IntType a, IntType b) - :m_param(a,b) -{ -} // end uniform_int_distribution::uniform_int_distribution() - - -template - uniform_int_distribution - ::uniform_int_distribution(const param_type &parm) - :m_param(parm) -{ -} // end uniform_int_distribution::uniform_int_distribution() - - -template - void uniform_int_distribution - ::reset(void) -{ -} // end uniform_int_distribution::reset() - - -template - template - typename uniform_int_distribution::result_type - uniform_int_distribution - ::operator()(UniformRandomNumberGenerator &urng) -{ - return operator()(urng, m_param); -} // end uniform_int_distribution::operator()() - - -template - template - typename uniform_int_distribution::result_type - uniform_int_distribution - ::operator()(UniformRandomNumberGenerator &urng, const param_type &parm) -{ - // XXX this implementation is somewhat hacky and will skip - // values if the range of the RNG is smaller than the range of the distribution - // we should improve this implementation in a later version - - typedef typename thrust::detail::largest_available_float::type float_type; - - const float_type real_min(parm.first); - const float_type real_max(parm.second); - - // add one to the right end of the interval because it is half-open - // XXX adding 1.0 to a potentially large floating point number seems like a bad idea - uniform_real_distribution real_dist(real_min, real_max + float_type(1)); - - return static_cast(real_dist(urng)); -} // end uniform_int_distribution::operator()() - - -template - typename uniform_int_distribution::result_type - uniform_int_distribution - ::a(void) const -{ - return m_param.first; -} // end uniform_int_distribution::a() - - -template - typename uniform_int_distribution::result_type - uniform_int_distribution - ::b(void) const -{ - return m_param.second; -} // end uniform_int_distribution::b() - - -template - typename uniform_int_distribution::param_type - uniform_int_distribution - ::param(void) const -{ - return m_param; -} // end uniform_int_distribution::param() - - -template - void uniform_int_distribution - ::param(const param_type &parm) -{ - m_param = parm; -} // end uniform_int_distribution::param() - - -template - typename uniform_int_distribution::result_type - uniform_int_distribution - ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const -{ - return a(); -} // end 
uniform_int_distribution::min() - - -template - typename uniform_int_distribution::result_type - uniform_int_distribution - ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const -{ - return b(); -} // end uniform_int_distribution::max() - - -template - bool uniform_int_distribution - ::equal(const uniform_int_distribution &rhs) const -{ - return param() == rhs.param(); -} - - -template - template - std::basic_ostream& - uniform_int_distribution - ::stream_out(std::basic_ostream &os) const -{ - typedef std::basic_ostream ostream_type; - typedef typename ostream_type::ios_base ios_base; - - // save old flags and fill character - const typename ios_base::fmtflags flags = os.flags(); - const CharT fill = os.fill(); - - const CharT space = os.widen(' '); - os.flags(ios_base::dec | ios_base::fixed | ios_base::left); - os.fill(space); - - os << a() << space << b(); - - // restore old flags and fill character - os.flags(flags); - os.fill(fill); - return os; -} - - -template - template - std::basic_istream& - uniform_int_distribution - ::stream_in(std::basic_istream &is) -{ - typedef std::basic_istream istream_type; - typedef typename istream_type::ios_base ios_base; - - // save old flags - const typename ios_base::fmtflags flags = is.flags(); - - is.flags(ios_base::skipws); - - is >> m_param.first >> m_param.second; - - // restore old flags - is.flags(flags); - return is; -} - - -template -bool operator==(const uniform_int_distribution &lhs, - const uniform_int_distribution &rhs) -{ - return thrust::random::detail::random_core_access::equal(lhs,rhs); -} - - -template -bool operator!=(const uniform_int_distribution &lhs, - const uniform_int_distribution &rhs) -{ - return !(lhs == rhs); -} - - -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const uniform_int_distribution &d) -{ - return thrust::random::detail::random_core_access::stream_out(os,d); -} - - -template -std::basic_istream& -operator>>(std::basic_istream &is, - uniform_int_distribution &d) -{ - return thrust::random::detail::random_core_access::stream_in(is,d); -} - - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/uniform_real_distribution.inl b/compat/thrust/random/detail/uniform_real_distribution.inl deleted file mode 100644 index 6f6d6b57b5..0000000000 --- a/compat/thrust/random/detail/uniform_real_distribution.inl +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
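uniform_int_distribution, deleted above, maps an integer range onto a real draw in [a, b + 1) and truncates, with the XXX comments noting the caveats (values can be skipped when the engine's range is small, and b + 1 can lose precision for large b). A sketch of the same mapping written against the standard <random> facilities:

#include <random>

// Sketch only: mirrors the idea above, not a production-quality integer mapping.
template <typename Urng>
int uniform_int_sketch(Urng &urng, int a, int b)
{
    std::uniform_real_distribution<double> real(static_cast<double>(a),
                                                static_cast<double>(b) + 1.0);
    return static_cast<int>(real(urng));   // truncation maps [a, b + 1) onto {a, ..., b}
}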
- */ - -#include - -namespace thrust -{ - -namespace random -{ - - -template - uniform_real_distribution - ::uniform_real_distribution(RealType a, RealType b) - :m_param(a,b) -{ -} // end uniform_real_distribution::uniform_real_distribution() - -template - uniform_real_distribution - ::uniform_real_distribution(const param_type &parm) - :m_param(parm) -{ -} // end uniform_real_distribution::uniform_real_distribution() - -template - void uniform_real_distribution - ::reset(void) -{ -} // end uniform_real_distribution::reset() - -template - template - typename uniform_real_distribution::result_type - uniform_real_distribution - ::operator()(UniformRandomNumberGenerator &urng) -{ - return operator()(urng, m_param); -} // end uniform_real::operator()() - -template - template - typename uniform_real_distribution::result_type - uniform_real_distribution - ::operator()(UniformRandomNumberGenerator &urng, - const param_type &parm) -{ - // call the urng & map its result to [0,1) - result_type result = static_cast(urng() - UniformRandomNumberGenerator::min); - - // adding one to the denominator ensures that the interval is half-open at 1.0 - // XXX adding 1.0 to a potentially large floating point number seems like a bad idea - // XXX OTOH adding 1 to what is potentially UINT_MAX also seems like a bad idea - // XXX we could statically check if 1u + (max - min) is representable and do that, otherwise use the current implementation - result /= (result_type(1) + static_cast(UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min)); - - return (result * (parm.second - parm.first)) + parm.first; -} // end uniform_real::operator()() - -template - typename uniform_real_distribution::result_type - uniform_real_distribution - ::a(void) const -{ - return m_param.first; -} // end uniform_real::a() - -template - typename uniform_real_distribution::result_type - uniform_real_distribution - ::b(void) const -{ - return m_param.second; -} // end uniform_real_distribution::b() - -template - typename uniform_real_distribution::param_type - uniform_real_distribution - ::param(void) const -{ - return m_param;; -} // end uniform_real_distribution::param() - -template - void uniform_real_distribution - ::param(const param_type &parm) -{ - m_param = parm; -} // end uniform_real_distribution::param() - -template - typename uniform_real_distribution::result_type - uniform_real_distribution - ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const -{ - return a(); -} // end uniform_real_distribution::min() - -template - typename uniform_real_distribution::result_type - uniform_real_distribution - ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const -{ - return b(); -} // end uniform_real_distribution::max() - - -template - bool uniform_real_distribution - ::equal(const uniform_real_distribution &rhs) const -{ - return m_param == rhs.param(); -} - - -template - template - std::basic_ostream& - uniform_real_distribution - ::stream_out(std::basic_ostream &os) const -{ - typedef std::basic_ostream ostream_type; - typedef typename ostream_type::ios_base ios_base; - - // save old flags and fill character - const typename ios_base::fmtflags flags = os.flags(); - const CharT fill = os.fill(); - - const CharT space = os.widen(' '); - os.flags(ios_base::dec | ios_base::fixed | ios_base::left); - os.fill(space); - - os << a() << space << b(); - - // restore old flags and fill character - os.flags(flags); - os.fill(fill); - return os; -} - - -template - template - std::basic_istream& - uniform_real_distribution - 
::stream_in(std::basic_istream &is) -{ - typedef std::basic_istream istream_type; - typedef typename istream_type::ios_base ios_base; - - // save old flags - const typename ios_base::fmtflags flags = is.flags(); - - is.flags(ios_base::skipws); - - is >> m_param.first >> m_param.second; - - // restore old flags - is.flags(flags); - return is; -} - - -template -bool operator==(const uniform_real_distribution &lhs, - const uniform_real_distribution &rhs) -{ - return thrust::random::detail::random_core_access::equal(lhs,rhs); -} - - -template -bool operator!=(const uniform_real_distribution &lhs, - const uniform_real_distribution &rhs) -{ - return !(lhs == rhs); -} - - -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const uniform_real_distribution &d) -{ - return thrust::random::detail::random_core_access::stream_out(os,d); -} - - -template -std::basic_istream& -operator>>(std::basic_istream &is, - uniform_real_distribution &d) -{ - return thrust::random::detail::random_core_access::stream_in(is,d); -} - - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/xor_combine_engine.inl b/compat/thrust/random/detail/xor_combine_engine.inl deleted file mode 100644 index b138722f8b..0000000000 --- a/compat/thrust/random/detail/xor_combine_engine.inl +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
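The uniform_real_distribution::operator() deleted just above normalizes the engine output to [0, 1) by dividing by one more than the engine's range, then scales into [a, b). A self-contained sketch of that normalize-then-scale step, with std::minstd_rand standing in for a thrust engine (illustrative names, not the removed code):

#include <iostream>
#include <random>

// Sketch of the normalize-then-scale step described in the removed .inl:
// dividing by one more than the engine's range keeps the result strictly
// below 1.0, so the final interval [a, b) stays half-open on the right.
template <typename Engine>
double uniform_real_via_engine(Engine &urng, double a, double b)
{
    const double range = static_cast<double>(Engine::max() - Engine::min());
    double r = static_cast<double>(urng() - Engine::min());
    r /= (1.0 + range);          // r is now in [0, 1)
    return r * (b - a) + a;      // scaled into [a, b)
}

int main()
{
    std::minstd_rand rng;        // standing in for a thrust engine
    std::cout << uniform_real_via_engine(rng, -7.0, 13.0) << std::endl;
    return 0;
}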
- */ - -#include -#include - -namespace thrust -{ - -namespace random -{ - -template - xor_combine_engine - ::xor_combine_engine(void) - :m_b1(),m_b2() -{ -} // end xor_combine_engine::xor_combine_engine() - -template - xor_combine_engine - ::xor_combine_engine(const base1_type &urng1, const base2_type &urng2) - :m_b1(urng1),m_b2(urng2) -{ -} // end xor_combine_engine::xor_combine_engine() - -template - xor_combine_engine - ::xor_combine_engine(result_type s) - :m_b1(s),m_b2(s) -{ -} // end xor_combine_engine::xor_combine_engine() - -template - void xor_combine_engine - ::seed(void) -{ - m_b1.seed(); - m_b2.seed(); -} // end xor_combine_engine::seed() - -template - void xor_combine_engine - ::seed(result_type s) -{ - m_b1.seed(s); - m_b2.seed(s); -} // end xor_combine_engine::seed() - -template - const typename xor_combine_engine::base1_type & - xor_combine_engine - ::base1(void) const -{ - return m_b1; -} // end xor_combine_engine::base1() - -template - const typename xor_combine_engine::base2_type & - xor_combine_engine - ::base2(void) const -{ - return m_b2; -} // end xor_combine_engine::base2() - -template - typename xor_combine_engine::result_type - xor_combine_engine - ::operator()(void) -{ - return (result_type(m_b1() - base1_type::min) << shift1) ^ - (result_type(m_b2() - base2_type::min) << shift2); -} // end xor_combine_engine::operator()() - -template - void xor_combine_engine - ::discard(unsigned long long z) -{ - for(; z > 0; --z) - { - this->operator()(); - } // end for -} // end xor_combine_engine::discard() - - -template - template - std::basic_ostream& xor_combine_engine - ::stream_out(std::basic_ostream &os) const -{ - typedef std::basic_ostream ostream_type; - typedef typename ostream_type::ios_base ios_base; - - // save old flags and fill character - const typename ios_base::fmtflags flags = os.flags(); - const CharT fill = os.fill(); - - const CharT space = os.widen(' '); - os.flags(ios_base::dec | ios_base::fixed | ios_base::left); - os.fill(space); - - // output each base engine in turn - os << base1() << space << base2(); - - // restore old flags and fill character - os.flags(flags); - os.fill(fill); - return os; -} - - -template - template - std::basic_istream& xor_combine_engine - ::stream_in(std::basic_istream &is) -{ - typedef std::basic_istream istream_type; - typedef typename istream_type::ios_base ios_base; - - // save old flags - const typename ios_base::fmtflags flags = is.flags(); - - is.flags(ios_base::skipws); - - // input each base engine in turn - is >> m_b1 >> m_b2; - - // restore old flags - is.flags(flags); - return is; -} - - -template - bool xor_combine_engine - ::equal(const xor_combine_engine &rhs) const -{ - return (m_b1 == rhs.m_b1) && (m_b2 == rhs.m_b2); -} - - -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const xor_combine_engine &e) -{ - return thrust::random::detail::random_core_access::stream_out(os,e); -} - - -template -std::basic_istream& -operator>>(std::basic_istream &is, - xor_combine_engine &e) -{ - return thrust::random::detail::random_core_access::stream_in(is,e); -} - - -template -bool operator==(const xor_combine_engine &lhs, - const xor_combine_engine &rhs) -{ - return thrust::random::detail::random_core_access::equal(lhs,rhs); -} - - -template -bool operator!=(const xor_combine_engine &lhs, - const xor_combine_engine &rhs) -{ - return !(lhs == rhs); -} - - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/detail/xor_combine_engine_max.h 
b/compat/thrust/random/detail/xor_combine_engine_max.h deleted file mode 100644 index 8bad9a462c..0000000000 --- a/compat/thrust/random/detail/xor_combine_engine_max.h +++ /dev/null @@ -1,324 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -namespace random -{ - -namespace detail -{ - - -namespace math = thrust::detail::mpl::math; - - -namespace detail -{ - -// two cases for this function avoids compile-time warnings of overflow -template - struct lshift_w -{ - static const UIntType value = 0; -}; - - -template - struct lshift_w -{ - static const UIntType value = lhs << rhs; -}; - -} // end detail - - -template - struct lshift_w -{ - static const bool shift_will_overflow = rhs >= w; - - static const UIntType value = detail::lshift_w::value; -}; - - -template - struct lshift - : lshift_w::digits, lhs, rhs> -{}; - - -template - struct two_to_the_power - : lshift -{}; - - -template - class xor_combine_engine_max_aux_constants -{ - public: - static const result_type two_to_the_d = two_to_the_power::value; - static const result_type c = lshift::value; - - static const result_type t = - math::max< - result_type, - c, - b - >::value; - - static const result_type u = - math::min< - result_type, - c, - b - >::value; - - static const result_type p = math::log2::value; - static const result_type two_to_the_p = two_to_the_power::value; - - static const result_type k = math::div::value; -}; - - -template struct xor_combine_engine_max_aux; - - -template - struct xor_combine_engine_max_aux_case4 -{ - typedef xor_combine_engine_max_aux_constants constants; - - static const result_type k_plus_1_times_two_to_the_p = - lshift< - result_type, - math::plus::value, - constants::p - >::value; - - static const result_type M = - xor_combine_engine_max_aux< - result_type, - math::div< - result_type, - math::mod< - result_type, - constants::u, - constants::two_to_the_p - >::value, - constants::two_to_the_p - >::value, - math::mod< - result_type, - constants::t, - constants::two_to_the_p - >::value, - d - >::value; - - static const result_type value = math::plus::value; -}; - - -template - struct xor_combine_engine_max_aux_case3 -{ - typedef xor_combine_engine_max_aux_constants constants; - - static const result_type k_plus_1_times_two_to_the_p = - lshift< - result_type, - math::plus::value, - constants::p - >::value; - - static const result_type M = - xor_combine_engine_max_aux< - result_type, - math::div< - result_type, - math::mod< - result_type, - constants::t, - constants::two_to_the_p - >::value, - constants::two_to_the_p - >::value, - math::mod< - result_type, - constants::u, - constants::two_to_the_p - >::value, - d - >::value; - - static const result_type value = math::plus::value; -}; - - - -template - struct xor_combine_engine_max_aux_case2 -{ - typedef xor_combine_engine_max_aux_constants constants; - - static const result_type k_plus_1_times_two_to_the_p = 
- lshift< - result_type, - math::plus::value, - constants::p - >::value; - - static const result_type value = - math::minus< - result_type, - k_plus_1_times_two_to_the_p, - 1 - >::value; -}; - - -template - struct xor_combine_engine_max_aux_case1 -{ - static const result_type c = lshift::value; - - static const result_type value = math::plus::value; -}; - - -template - struct xor_combine_engine_max_aux_2 -{ - typedef xor_combine_engine_max_aux_constants constants; - - static const result_type value = - thrust::detail::eval_if< - // if k is odd... - math::is_odd::value, - thrust::detail::identity_< - thrust::detail::integral_constant< - result_type, - xor_combine_engine_max_aux_case2::value - > - >, - thrust::detail::eval_if< - // otherwise if a * 2^3 >= b, then case 3 - a * constants::two_to_the_d >= b, - thrust::detail::identity_< - thrust::detail::integral_constant< - result_type, - xor_combine_engine_max_aux_case3::value - > - >, - // otherwise, case 4 - thrust::detail::identity_< - thrust::detail::integral_constant< - result_type, - xor_combine_engine_max_aux_case4::value - > - > - > - >::type::value; -}; - - -template::value)> - struct xor_combine_engine_max_aux_1 - : xor_combine_engine_max_aux_case1 -{}; - - -template - struct xor_combine_engine_max_aux_1 - : xor_combine_engine_max_aux_2 -{}; - - -template - struct xor_combine_engine_max_aux - : xor_combine_engine_max_aux_1 -{}; - - -template - struct xor_combine_engine_max -{ - static const size_t w = std::numeric_limits::digits; - - static const result_type m1 = - math::min< - result_type, - result_type(Engine1::max - Engine1::min), - two_to_the_power::value - 1 - >::value; - - static const result_type m2 = - math::min< - result_type, - result_type(Engine2::max - Engine2::min), - two_to_the_power::value - 1 - >::value; - - static const result_type s = s1 - s2; - - static const result_type M = - xor_combine_engine_max_aux< - result_type, - m1, - m2, - s - >::value; - - // the value is M(m1,m2,s) lshift_w s2 - static const result_type value = - lshift_w< - result_type, - w, - M, - s2 - >::value; -}; // end xor_combine_engine_max - -} // end detail - -} // end random - -} // end thrust - diff --git a/compat/thrust/random/discard_block_engine.h b/compat/thrust/random/discard_block_engine.h deleted file mode 100644 index c902c5863b..0000000000 --- a/compat/thrust/random/discard_block_engine.h +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file discard_block_engine.h - * \brief A random number engine which adapts a base engine and produces - * numbers by discarding all but a contiguous blocks of its values. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -namespace thrust -{ - -namespace random -{ - -/*! \addtogroup random_number_engine_adaptors Random Number Engine Adaptor Class Templates - * \ingroup random - * \{ - */ - -/*! 
\class discard_block_engine - * \brief A \p discard_block_engine adapts an existing base random number engine and produces - * random values by discarding some of the values returned by its base engine. - * Each cycle of the compound engine begins by returning \c r values successively produced - * by the base engine and ends by discarding p-r such values. The engine's state - * is the state of its base engine followed by the number of calls to operator() - * that have occurred since the beginning of the current cycle. - * - * \tparam Engine The type of the base random number engine to adapt. - * \tparam p The discard cycle length. - * \tparam r The number of values to return of the base engine. Because p-r will be - * discarded, r <= p. - * - * The following code snippet shows an example of using a \p discard_block_engine instance: - * - * \code - * #include - * #include - * #include - * - * int main(void) - * { - * // create a discard_block_engine from minstd_rand, with a cycle length of 13 - * // keep every first 10 values, and discard the next 3 - * thrust::discard_block_engine rng; - * - * // print a random number to standard output - * std::cout << rng() << std::endl; - * - * return 0; - * } - * \endcode - */ -template - class discard_block_engine -{ - public: - // types - - /*! \typedef base_type - * \brief The type of the adapted base random number engine. - */ - typedef Engine base_type; - - /*! \typedef result_type - * \brief The type of the unsigned integer produced by this \p linear_congruential_engine. - */ - typedef typename base_type::result_type result_type; - - // engine characteristics - - /*! The length of the production cycle. - */ - static const size_t block_size = p; - - /*! The number of used numbers per production cycle. - */ - static const size_t used_block = r; - - /*! The smallest value this \p discard_block_engine may potentially produce. - */ - static const result_type min = base_type::min; - - /*! The largest value this \p discard_block_engine may potentially produce. - */ - static const result_type max = base_type::max; - - // constructors and seeding functions - - /*! This constructor constructs a new \p discard_block_engine and constructs - * its \p base_type engine using its null constructor. - */ - __host__ __device__ - discard_block_engine(); - - /*! This constructor constructs a new \p discard_block_engine using - * a given \p base_type engine to initialize its adapted base engine. - * - * \param urng A \p base_type to use to initialize this \p discard_block_engine's - * adapted base engine. - */ - __host__ __device__ - explicit discard_block_engine(const base_type &urng); - - /*! This constructor initializes a new \p discard_block_engine with a given seed. - * - * \param s The seed used to intialize this \p discard_block_engine's adapted base engine. - */ - __host__ __device__ - explicit discard_block_engine(result_type s); - - /*! This method initializes the state of this \p discard_block_engine's adapted base engine - * by using its \p default_seed value. - */ - __host__ __device__ - void seed(void); - - /*! This method initializes the state of this \p discard_block_engine's adapted base engine - * by using the given seed. - * - * \param s The seed with which to intialize this \p discard_block_engine's adapted base engine. - */ - __host__ __device__ - void seed(result_type s); - - // generating functions - - /*! This member function produces a new random value and updates this \p discard_block_engine's state. - * \return A new random number. 
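The production cycle described above — return the first r values of each block of p values from the base engine, then discard the remaining p - r — can be mimicked with a small wrapper. The following is only an illustrative stand-in built on std::minstd_rand, not the removed adaptor:

#include <iostream>
#include <random>

// Illustrative discard-block wrapper: keep the first R values of every
// block of P values produced by the base engine and skip the rest.
template <typename Engine, unsigned long long P, unsigned long long R>
class discard_block_demo
{
    Engine             m_e;
    unsigned long long m_n = 0;   // position within the current block

public:
    typename Engine::result_type operator()()
    {
        if (m_n >= R) {           // used part of the block exhausted:
            m_e.discard(P - R);   // drop the remaining P - R values
            m_n = 0;
        }
        ++m_n;
        return m_e();
    }
};

int main()
{
    // keep every first 10 values, discard the next 3 (cycle length 13)
    discard_block_demo<std::minstd_rand, 13, 10> rng;
    std::cout << rng() << std::endl;
    return 0;
}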
- */ - __host__ __device__ - result_type operator()(void); - - /*! This member function advances this \p discard_block_engine's state a given number of times - * and discards the results. - * - * \param z The number of random values to discard. - * \note This function is provided because an implementation may be able to accelerate it. - */ - __host__ __device__ - void discard(unsigned long long z); - - // property functions - - /*! This member function returns a const reference to this \p discard_block_engine's - * adapted base engine. - * - * \return A const reference to the base engine this \p discard_block_engine adapts. - */ - __host__ __device__ - const base_type &base(void) const; - - /*! \cond - */ - private: - base_type m_e; - unsigned int m_n; - - friend struct thrust::random::detail::random_core_access; - - __host__ __device__ - bool equal(const discard_block_engine &rhs) const; - - template - std::basic_ostream& stream_out(std::basic_ostream &os) const; - - template - std::basic_istream& stream_in(std::basic_istream &is); - /*! \endcond - */ -}; // end discard_block_engine - - -/*! This function checks two \p discard_block_engines for equality. - * \param lhs The first \p discard_block_engine to test. - * \param rhs The second \p discard_block_engine to test. - * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator==(const discard_block_engine &lhs, - const discard_block_engine &rhs); - - -/*! This function checks two \p discard_block_engines for inequality. - * \param lhs The first \p discard_block_engine to test. - * \param rhs The second \p discard_block_engine to test. - * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator!=(const discard_block_engine &lhs, - const discard_block_engine &rhs); - - -/*! This function streams a discard_block_engine to a \p std::basic_ostream. - * \param os The \p basic_ostream to stream out to. - * \param e The \p discard_block_engine to stream out. - * \return \p os - */ -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const discard_block_engine &e); - - -/*! This function streams a discard_block_engine in from a std::basic_istream. - * \param is The \p basic_istream to stream from. - * \param e The \p discard_block_engine to stream in. - * \return \p is - */ -template -std::basic_istream& -operator>>(std::basic_istream &is, - discard_block_engine &e); - -/*! \} // end random_number_engine_adaptors - */ - -} // end random - -// import names into thrust:: -using random::discard_block_engine; - -} // end thrust - -#include - diff --git a/compat/thrust/random/linear_congruential_engine.h b/compat/thrust/random/linear_congruential_engine.h deleted file mode 100644 index 0added0069..0000000000 --- a/compat/thrust/random/linear_congruential_engine.h +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! 
\file linear_congruential_engine.h - * \brief A linear congruential pseudorandom number engine. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ - -namespace random -{ - -/*! \addtogroup random_number_engine_templates Random Number Engine Class Templates - * \ingroup random - * \{ - */ - -/*! \class linear_congruential_engine - * \brief A \p linear_congruential_engine random number engine produces unsigned integer - * random numbers using a linear congruential random number generation algorithm. - * - * The generation algorithm has the form x_i = (a * x_{i-1} + c) mod m. - * - * \tparam UIntType The type of unsigned integer to produce. - * \tparam a The multiplier used in the generation algorithm. - * \tparam c The increment used in the generation algorithm. - * \tparam m The modulus used in the generation algorithm. - * - * \note Inexperienced users should not use this class template directly. Instead, use - * \p minstd_rand or \p minstd_rand0. - * - * The following code snippet shows examples of use of a \p linear_congruential_engine instance: - * - * \code - * #include - * #include - * - * int main(void) - * { - * // create a minstd_rand object, which is an instance of linear_congruential_engine - * thrust::minstd_rand rng1; - * - * // output some random values to cout - * std::cout << rng1() << std::endl; - * - * // a random value is printed - * - * // create a new minstd_rand from a seed - * thrust::minstd_rand rng2(13); - * - * // discard some random values - * rng2.discard(13); - * - * // stream the object to an iostream - * std::cout << rng2 << std::endl; - * - * // rng2's current state is printed - * - * // print the minimum and maximum values that minstd_rand can produce - * std::cout << thrust::minstd_rand::min << std::endl; - * std::cout << thrust::minstd_rand::max << std::endl; - * - * // the range of minstd_rand is printed - * - * // save the state of rng2 to a different object - * thrust::minstd_rand rng3 = rng2; - * - * // compare rng2 and rng3 - * std::cout << (rng2 == rng3) << std::endl; - * - * // 1 is printed - * - * // re-seed rng2 with a different seed - * rng2.seed(7); - * - * // compare rng2 and rng3 - * std::cout << (rng2 == rng3) << std::endl; - * - * // 0 is printed - * - * return 0; - * } - * - * \endcode - * - * \see thrust::random::minstd_rand - * \see thrust::random::minstd_rand0 - */ -template - class linear_congruential_engine -{ - public: - // types - - /*! \typedef result_type - * \brief The type of the unsigned integer produced by this \p linear_congruential_engine. - */ - typedef UIntType result_type; - - // engine characteristics - - /*! The multiplier used in the generation algorithm. - */ - static const result_type multiplier = a; - - /*! The increment used in the generation algorithm. - */ - static const result_type increment = c; - - /*! The modulus used in the generation algorithm. - */ - static const result_type modulus = m; - - /*! The smallest value this \p linear_congruential_engine may potentially produce. - */ - static const result_type min = c == 0u ? 1u : 0u; - - /*! The largest value this \p linear_congruential_engine may potentially produce. - */ - static const result_type max = m - 1u; - - /*! The default seed of this \p linear_congruential_engine. - */ - static const result_type default_seed = 1u; - - // constructors and seeding functions - - /*! This constructor, which optionally accepts a seed, initializes a new - * \p linear_congruential_engine. 
- * - * \param s The seed used to intialize this \p linear_congruential_engine's state. - */ - __host__ __device__ - explicit linear_congruential_engine(result_type s = default_seed); - - /*! This method initializes this \p linear_congruential_engine's state, and optionally accepts - * a seed value. - * - * \param s The seed used to initializes this \p linear_congruential_engine's state. - */ - __host__ __device__ - void seed(result_type s = default_seed); - - // generating functions - - /*! This member function produces a new random value and updates this \p linear_congruential_engine's state. - * \return A new random number. - */ - __host__ __device__ - result_type operator()(void); - - /*! This member function advances this \p linear_congruential_engine's state a given number of times - * and discards the results. - * - * \param z The number of random values to discard. - * \note This function is provided because an implementation may be able to accelerate it. - */ - __host__ __device__ - void discard(unsigned long long z); - - /*! \cond - */ - private: - result_type m_x; - - static void transition(result_type &state); - - friend struct thrust::random::detail::random_core_access; - - friend struct thrust::random::detail::linear_congruential_engine_discard; - - __host__ __device__ - bool equal(const linear_congruential_engine &rhs) const; - - template - std::basic_ostream& stream_out(std::basic_ostream &os) const; - - template - std::basic_istream& stream_in(std::basic_istream &is); - - /*! \endcond - */ -}; // end linear_congruential_engine - - -/*! This function checks two \p linear_congruential_engines for equality. - * \param lhs The first \p linear_congruential_engine to test. - * \param rhs The second \p linear_congruential_engine to test. - * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator==(const linear_congruential_engine &lhs, - const linear_congruential_engine &rhs); - - -/*! This function checks two \p linear_congruential_engines for inequality. - * \param lhs The first \p linear_congruential_engine to test. - * \param rhs The second \p linear_congruential_engine to test. - * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator!=(const linear_congruential_engine &lhs, - const linear_congruential_engine &rhs); - - -/*! This function streams a linear_congruential_engine to a \p std::basic_ostream. - * \param os The \p basic_ostream to stream out to. - * \param e The \p linear_congruential_engine to stream out. - * \return \p os - */ -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const linear_congruential_engine &e); - - -/*! This function streams a linear_congruential_engine in from a std::basic_istream. - * \param is The \p basic_istream to stream from. - * \param e The \p linear_congruential_engine to stream in. - * \return \p is - */ -template -std::basic_istream& -operator>>(std::basic_istream &is, - linear_congruential_engine &e); - - -/*! \} // random_number_engine_templates - */ - - -/*! \addtogroup predefined_random - * \{ - */ - -// XXX the type N2111 used here was uint_fast32_t - -/*! \typedef minstd_rand0 - * \brief A random number engine with predefined parameters which implements a version of - * the Minimal Standard random number generation algorithm. - * \note The 10000th consecutive invocation of a default-constructed object of type \p minstd_rand0 - * shall produce the value \c 1043618065 . 
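The 1043618065 check value quoted above can be reproduced with a few lines implementing the x_i = (a * x_{i-1} + c) mod m recurrence, using the classic minimal-standard parameters (a = 16807, c = 0, m = 2^31 - 1, which are assumed here since the template arguments do not survive in this diff) and the default seed of 1. This is a verification sketch, not code from the removed header:

#include <cstdint>
#include <iostream>

// Iterate the minimal-standard recurrence 10000 times from the default
// seed and print the result, which should match the check value above.
int main()
{
    const std::uint64_t a = 16807;
    const std::uint64_t m = 2147483647;   // 2^31 - 1
    std::uint64_t x = 1;                  // default_seed

    for (int i = 0; i < 10000; ++i)
        x = (a * x) % m;                  // c == 0, so no increment term

    std::cout << x << std::endl;          // expected: 1043618065
    return 0;
}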
- */ -typedef linear_congruential_engine minstd_rand0; - - -/*! \typedef minstd_rand - * \brief A random number engine with predefined parameters which implements a version of - * the Minimal Standard random number generation algorithm. - * \note The 10000th consecutive invocation of a default-constructed object of type \p minstd_rand - * shall produce the value \c 399268537 . - */ -typedef linear_congruential_engine minstd_rand; - -/*! \} // predefined_random - */ - -} // end random - -// import names into thrust:: -using random::linear_congruential_engine; -using random::minstd_rand; -using random::minstd_rand0; - -} // end thrust - -#include - diff --git a/compat/thrust/random/linear_feedback_shift_engine.h b/compat/thrust/random/linear_feedback_shift_engine.h deleted file mode 100644 index f5646c9483..0000000000 --- a/compat/thrust/random/linear_feedback_shift_engine.h +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file linear_feedback_shift_engine.h - * \brief A linear feedback shift pseudorandom number generator. - */ - -/* - * Copyright Jens Maurer 2002 - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - -#pragma once - -#include -#include -#include -#include // for size_t -#include - -namespace thrust -{ - - -namespace random -{ - -/*! \addtogroup random_number_engine_templates - * \{ - */ - -/*! \class linear_feedback_shift_engine - * \brief A \p linear_feedback_shift_engine random number engine produces - * unsigned integer random values using a linear feedback shift random number - * generation algorithm. - * - * \tparam UIntType The type of unsigned integer to produce. - * \tparam w The word size of the produced values (w <= sizeof(UIntType)). - * \tparam k The k parameter of Tausworthe's 1965 algorithm. - * \tparam q The q exponent of Tausworthe's 1965 algorithm. - * \tparam s The step size of Tausworthe's 1965 algorithm. - * - * \note linear_feedback_shift_engine is based on the Boost Template Library's linear_feedback_shift. - */ -template - class linear_feedback_shift_engine -{ - public: - // types - - /*! \typedef result_type - * \brief The type of the unsigned integer produced by this \p linear_feedback_shift_engine. - */ - typedef UIntType result_type; - - // engine characteristics - - /*! The word size of the produced values. - */ - static const size_t word_size = w; - - /*! A constant used in the generation algorithm. - */ - static const size_t exponent1 = k; - - /*! A constant used in the generation algorithm. - */ - static const size_t exponent2 = q; - - /*! The step size used in the generation algorithm. - */ - static const size_t step_size = s; - - /*! \cond - */ - private: - static const result_type wordmask = - detail::linear_feedback_shift_engine_wordmask< - result_type, - w - >::value; - /*! \endcond - */ - - public: - - /*! 
The smallest value this \p linear_feedback_shift_engine may potentially produce. - */ - static const result_type min = 0; - - /*! The largest value this \p linear_feedback_shift_engine may potentially produce. - */ - static const result_type max = wordmask; - - /*! The default seed of this \p linear_feedback_shift_engine. - */ - static const result_type default_seed = 341u; - - // constructors and seeding functions - - /*! This constructor, which optionally accepts a seed, initializes a new - * \p linear_feedback_shift_engine. - * - * \param value The seed used to intialize this \p linear_feedback_shift_engine's state. - */ - __host__ __device__ - explicit linear_feedback_shift_engine(result_type value = default_seed); - - /*! This method initializes this \p linear_feedback_shift_engine's state, and optionally accepts - * a seed value. - * - * \param value The seed used to initializes this \p linear_feedback_shift_engine's state. - */ - __host__ __device__ - void seed(result_type value = default_seed); - - // generating functions - - /*! This member function produces a new random value and updates this \p linear_feedback_shift_engine's state. - * \return A new random number. - */ - __host__ __device__ - result_type operator()(void); - - /*! This member function advances this \p linear_feedback_shift_engine's state a given number of times - * and discards the results. - * - * \param z The number of random values to discard. - * \note This function is provided because an implementation may be able to accelerate it. - */ - __host__ __device__ - void discard(unsigned long long z); - - /*! \cond - */ - private: - result_type m_value; - - friend struct thrust::random::detail::random_core_access; - - __host__ __device__ - bool equal(const linear_feedback_shift_engine &rhs) const; - - template - std::basic_ostream& stream_out(std::basic_ostream &os) const; - - template - std::basic_istream& stream_in(std::basic_istream &is); - - /*! \endcond - */ -}; // end linear_feedback_shift_engine - - -/*! This function checks two \p linear_feedback_shift_engines for equality. - * \param lhs The first \p linear_feedback_shift_engine to test. - * \param rhs The second \p linear_feedback_shift_engine to test. - * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator==(const linear_feedback_shift_engine &lhs, - const linear_feedback_shift_engine &rhs); - - -/*! This function checks two \p linear_feedback_shift_engines for inequality. - * \param lhs The first \p linear_feedback_shift_engine to test. - * \param rhs The second \p linear_feedback_shift_engine to test. - * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator!=(const linear_feedback_shift_engine &lhs, - const linear_feedback_shift_engine &rhs); - - -/*! This function streams a linear_feedback_shift_engine to a \p std::basic_ostream. - * \param os The \p basic_ostream to stream out to. - * \param e The \p linear_feedback_shift_engine to stream out. - * \return \p os - */ -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const linear_feedback_shift_engine &e); - - -/*! This function streams a linear_feedback_shift_engine in from a std::basic_istream. - * \param is The \p basic_istream to stream from. - * \param e The \p linear_feedback_shift_engine to stream in. - * \return \p is - */ -template -std::basic_istream& -operator>>(std::basic_istream &is, - linear_feedback_shift_engine &e); - - -/*! 
\} // end random_number_engine_templates - */ - - -} // end random - -// import names into thrust:: -using random::linear_feedback_shift_engine; - -} // end thrust - -#include - diff --git a/compat/thrust/random/normal_distribution.h b/compat/thrust/random/normal_distribution.h deleted file mode 100644 index 5543f30a5f..0000000000 --- a/compat/thrust/random/normal_distribution.h +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file normal_distribution.h - * \brief A normal (Gaussian) distribution of real-valued numbers. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ - -namespace random -{ - - -/*! \addtogroup random_number_distributions - * \{ - */ - -/*! \class normal_distribution - * \brief A \p normal_distribution random number distribution produces floating point - * Normally distributed random numbers. - * - * \tparam RealType The type of floating point number to produce. - * - * The following code snippet demonstrates examples of using a \p normal_distribution with a - * random number engine to produce random values drawn from the Normal distribution with a given - * mean and variance: - * - * \code - * #include - * #include - * - * int main(void) - * { - * // create a minstd_rand object to act as our source of randomness - * thrust::minstd_rand rng; - * - * // create a normal_distribution to produce floats from the Normal distribution - * // with mean 2.0 and standard deviation 3.5 - * thrust::random::normal_distribution dist(2.0f, 3.5f); - * - * // write a random number to standard output - * std::cout << dist(rng) << std::endl; - * - * // write the mean of the distribution, just in case we forgot - * std::cout << dist.mean() << std::endl; - * - * // 2.0 is printed - * - * // and the standard deviation - * std::cout << dist.stddev() << std::endl; - * - * // 3.5 is printed - * - * return 0; - * } - * \endcode - */ -template - class normal_distribution - : public detail::normal_distribution_base::type -{ - private: - typedef typename detail::normal_distribution_base::type super_t; - - public: - // types - - /*! \typedef result_type - * \brief The type of the floating point number produced by this \p normal_distribution. - */ - typedef RealType result_type; - - /*! \typedef param_type - * \brief The type of the object encapsulating this \p normal_distribution's parameters. - */ - typedef thrust::pair param_type; - - // constructors and reset functions - - /*! This constructor creates a new \p normal_distribution from two values defining the - * half-open interval of the distribution. - * - * \param mean The mean (expected value) of the distribution. Defaults to \c 0.0. - * \param stddev The standard deviation of the distribution. Defaults to \c 1.0. - */ - __host__ __device__ - explicit normal_distribution(RealType mean = 0.0, RealType stddev = 1.0); - - /*! 
This constructor creates a new \p normal_distribution from a \p param_type object - * encapsulating the range of the distribution. - * - * \param parm A \p param_type object encapsulating the parameters (i.e., the mean and standard deviation) of the distribution. - */ - __host__ __device__ - explicit normal_distribution(const param_type &parm); - - /*! Calling this member function guarantees that subsequent uses of this - * \p normal_distribution do not depend on values produced by any random - * number generator prior to invoking this function. - */ - __host__ __device__ - void reset(void); - - // generating functions - - /*! This method produces a new Normal random integer drawn from this \p normal_distribution's - * range using a \p UniformRandomNumberGenerator as a source of randomness. - * - * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. - */ - template - __host__ __device__ - result_type operator()(UniformRandomNumberGenerator &urng); - - /*! This method produces a new Normal random integer as if by creating a new \p normal_distribution - * from the given \p param_type object, and calling its operator() method with the given - * \p UniformRandomNumberGenerator as a source of randomness. - * - * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. - * \param parm A \p param_type object encapsulating the parameters of the \p normal_distribution - * to draw from. - */ - template - __host__ __device__ - result_type operator()(UniformRandomNumberGenerator &urng, const param_type &parm); - - // property functions - - /*! This method returns the value of the parameter with which this \p normal_distribution - * was constructed. - * - * \return The mean (expected value) of this \p normal_distribution's output. - */ - __host__ __device__ - result_type mean(void) const; - - /*! This method returns the value of the parameter with which this \p normal_distribution - * was constructed. - * - * \return The standard deviation of this \p uniform_real_distribution's output. - */ - __host__ __device__ - result_type stddev(void) const; - - /*! This method returns a \p param_type object encapsulating the parameters with which this - * \p normal_distribution was constructed. - * - * \return A \p param_type object encapsulating the parameters (i.e., the mean and standard deviation) of this \p normal_distribution. - */ - __host__ __device__ - param_type param(void) const; - - /*! This method changes the parameters of this \p normal_distribution using the values encapsulated - * in a given \p param_type object. - * - * \param parm A \p param_type object encapsulating the new parameters (i.e., the mean and variance) of this \p normal_distribution. - */ - __host__ __device__ - void param(const param_type &parm); - - /*! This method returns the smallest floating point number this \p normal_distribution can potentially produce. - * - * \return The lower bound of this \p normal_distribution's half-open interval. - */ - __host__ __device__ - result_type min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; - - /*! This method returns the smallest number larger than largest floating point number this \p uniform_real_distribution can potentially produce. - * - * \return The upper bound of this \p normal_distribution's half-open interval. - */ - __host__ __device__ - result_type max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; - - /*! 
\cond - */ - private: - param_type m_param; - - friend struct thrust::random::detail::random_core_access; - - __host__ __device__ - bool equal(const normal_distribution &rhs) const; - - template - std::basic_ostream& stream_out(std::basic_ostream &os) const; - - template - std::basic_istream& stream_in(std::basic_istream &is); - /*! \endcond - */ -}; // end normal_distribution - - -/*! This function checks two \p normal_distributions for equality. - * \param lhs The first \p normal_distribution to test. - * \param rhs The second \p normal_distribution to test. - * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator==(const normal_distribution &lhs, - const normal_distribution &rhs); - - -/*! This function checks two \p normal_distributions for inequality. - * \param lhs The first \p normal_distribution to test. - * \param rhs The second \p normal_distribution to test. - * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator!=(const normal_distribution &lhs, - const normal_distribution &rhs); - - -/*! This function streams a normal_distribution to a \p std::basic_ostream. - * \param os The \p basic_ostream to stream out to. - * \param d The \p normal_distribution to stream out. - * \return \p os - */ -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const normal_distribution &d); - - -/*! This function streams a normal_distribution in from a std::basic_istream. - * \param is The \p basic_istream to stream from. - * \param d The \p normal_distribution to stream in. - * \return \p is - */ -template -std::basic_istream& -operator>>(std::basic_istream &is, - normal_distribution &d); - - -/*! \} // end random_number_distributions - */ - - -} // end random - -using random::normal_distribution; - -} // end thrust - -#include - diff --git a/compat/thrust/random/subtract_with_carry_engine.h b/compat/thrust/random/subtract_with_carry_engine.h deleted file mode 100644 index b88810097b..0000000000 --- a/compat/thrust/random/subtract_with_carry_engine.h +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file subtract_with_carry_engine.h - * \brief A subtract-with-carry pseudorandom number generator - * based on Marsaglia & Zaman. - */ - -#pragma once - -#include -#include - -#include -#include // for size_t -#include - -namespace thrust -{ - -namespace random -{ - - -/*! \addtogroup random_number_engine_templates - * \{ - */ - -/*! \class subtract_with_carry_engine - * \brief A \p subtract_with_carry_engine random number engine produces unsigned - * integer random numbers using the subtract with carry algorithm of Marsaglia & Zaman. - * - * The generation algorithm is performed as follows: - * -# Let Y = X_{i-s}- X_{i-r} - c. - * -# Set X_i to y = T mod m. Set \c c to \c 1 if Y < 0, otherwise set \c c to \c 0. 
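The two steps just listed (Y = X_{i-s} - X_{i-r} - c, then X_i = Y mod m with the carry taken from the sign of Y) can be written out as a small stand-alone step function over a circular buffer of the last r outputs. The parameters below are chosen to match the usual ranlux24_base configuration (24-bit words, lags 10 and 24) and everything here is an illustrative assumption, not the engine's member code:

#include <cstddef>
#include <cstdint>
#include <iostream>

// Parameters assumed for illustration (ranlux24_base-style):
constexpr std::size_t   W = 24;           // word size w
constexpr std::size_t   S = 10;           // short lag s
constexpr std::size_t   R = 24;           // long lag r
constexpr std::uint32_t M = 1u << W;      // modulus m = 2^w

// One subtract-with-carry step, following the description above:
//   Y   = X[i - s] - X[i - r] - carry
//   X_i = Y mod m, and carry becomes 1 if Y < 0, else 0.
// The state is a circular buffer of the last r outputs plus the carry bit.
struct swc_state
{
    std::uint32_t x[R];    // x[k] holds X_{i-r}, the oldest stored value
    std::size_t   k;
    int           carry;

    std::uint32_t next()
    {
        std::int64_t y = std::int64_t(x[(k + R - S) % R])  // X_{i-s}
                       - std::int64_t(x[k])                // X_{i-r}
                       - carry;
        carry = (y < 0) ? 1 : 0;
        if (y < 0) y += std::int64_t(M);                   // reduce modulo m
        x[k] = std::uint32_t(y);                           // store X_i
        k = (k + 1) % R;
        return std::uint32_t(y);
    }
};

int main()
{
    swc_state st;
    for (std::size_t j = 0; j < R; ++j)
        st.x[j] = std::uint32_t(j + 1);   // arbitrary demo values, not real seeding
    st.k = 0;
    st.carry = 0;
    std::cout << st.next() << std::endl;
    return 0;
}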
- * - * This algorithm corresponds to a modular linear function of the form - * - * TA(x_i) = (a * x_i) mod b, where \c b is of the form m^r - m^s + 1 and - * a = b - (b-1)/m. - * - * \tparam UIntType The type of unsigned integer to produce. - * \tparam w The word size of the produced values ( w <= sizeof(UIntType)). - * \tparam s The short lag of the generation algorithm. - * \tparam r The long lag of the generation algorithm. - * - * \note Inexperienced users should not use this class template directly. Instead, use - * \p ranlux24_base or \p ranlux48_base, which are instances of \p subtract_with_carry_engine. - * - * \see thrust::random::ranlux24_base - * \see thrust::random::ranlux48_base - */ -template - class subtract_with_carry_engine -{ - /*! \cond - */ - private: - static const UIntType modulus = UIntType(1) << w; - /*! \endcond - */ - - public: - // types - - /*! \typedef result_type - * \brief The type of the unsigned integer produced by this \p subtract_with_carry_engine. - */ - typedef UIntType result_type; - - // engine characteristics - - /*! The word size of the produced values. - */ - static const size_t word_size = w; - - /*! The size of the short lag used in the generation algorithm. - */ - static const size_t short_lag = s; - - /*! The size of the long lag used in the generation algorithm. - */ - static const size_t long_lag = r; - - /*! The smallest value this \p subtract_with_carry_engine may potentially produce. - */ - static const result_type min = 0; - - /*! The largest value this \p subtract_with_carry_engine may potentially produce. - */ - static const result_type max = modulus - 1; - - /*! The default seed of this \p subtract_with_carry_engine. - */ - static const result_type default_seed = 19780503u; - - // constructors and seeding functions - - /*! This constructor, which optionally accepts a seed, initializes a new - * \p subtract_with_carry_engine. - * - * \param value The seed used to intialize this \p subtract_with_carry_engine's state. - */ - __host__ __device__ - explicit subtract_with_carry_engine(result_type value = default_seed); - - /*! This method initializes this \p subtract_with_carry_engine's state, and optionally accepts - * a seed value. - * - * \param value The seed used to initializes this \p subtract_with_carry_engine's state. - */ - __host__ __device__ - void seed(result_type value = default_seed); - - // generating functions - - /*! This member function produces a new random value and updates this \p subtract_with_carry_engine's state. - * \return A new random number. - */ - __host__ __device__ - result_type operator()(void); - - /*! This member function advances this \p subtract_with_carry_engine's state a given number of times - * and discards the results. - * - * \param z The number of random values to discard. - * \note This function is provided because an implementation may be able to accelerate it. - */ - __host__ __device__ - void discard(unsigned long long z); - - /*! \cond - */ - private: - result_type m_x[long_lag]; - unsigned int m_k; - int m_carry; - - friend struct thrust::random::detail::random_core_access; - - __host__ __device__ - bool equal(const subtract_with_carry_engine &rhs) const; - - template - std::basic_ostream& stream_out(std::basic_ostream &os) const; - - template - std::basic_istream& stream_in(std::basic_istream &is); - - /*! \endcond - */ -}; // end subtract_with_carry_engine - - -/*! This function checks two \p subtract_with_carry_engines for equality. 
- * \param lhs The first \p subtract_with_carry_engine to test. - * \param rhs The second \p subtract_with_carry_engine to test. - * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator==(const subtract_with_carry_engine &lhs, - const subtract_with_carry_engine &rhs); - - -/*! This function checks two \p subtract_with_carry_engines for inequality. - * \param lhs The first \p subtract_with_carry_engine to test. - * \param rhs The second \p subtract_with_carry_engine to test. - * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator!=(const subtract_with_carry_engine&lhs, - const subtract_with_carry_engine&rhs); - - -/*! This function streams a subtract_with_carry_engine to a \p std::basic_ostream. - * \param os The \p basic_ostream to stream out to. - * \param e The \p subtract_with_carry_engine to stream out. - * \return \p os - */ -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const subtract_with_carry_engine &e); - - -/*! This function streams a subtract_with_carry_engine in from a std::basic_istream. - * \param is The \p basic_istream to stream from. - * \param e The \p subtract_with_carry_engine to stream in. - * \return \p is - */ -template -std::basic_istream& -operator>>(std::basic_istream &is, - subtract_with_carry_engine &e); - - -/*! \} // end random_number_engine_templates - */ - - -/*! \addtogroup predefined_random - * \{ - */ - -// XXX N2111 uses uint_fast32_t here - -/*! \typedef ranlux24_base - * \brief A random number engine with predefined parameters which implements the - * base engine of the \p ranlux24 random number engine. - * \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux24_base - * shall produce the value \c 7937952 . - */ -typedef subtract_with_carry_engine ranlux24_base; - - -// XXX N2111 uses uint_fast64_t here - -/*! \typedef ranlux48_base - * \brief A random number engine with predefined parameters which implements the - * base engine of the \p ranlux48 random number engine. - * \note The 10000th consecutive invocation of a default-constructed object of type \p ranlux48_base - * shall produce the value \c 192113843633948 . - */ -typedef subtract_with_carry_engine ranlux48_base; - -/*! \} // end predefined_random - */ - -} // end random - -// import names into thrust:: -using random::subtract_with_carry_engine; -using random::ranlux24_base; -using random::ranlux48_base; - -} // end thrust - -#include - diff --git a/compat/thrust/random/uniform_int_distribution.h b/compat/thrust/random/uniform_int_distribution.h deleted file mode 100644 index d05f7faaf3..0000000000 --- a/compat/thrust/random/uniform_int_distribution.h +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! 
\file uniform_int_distribution.h - * \brief A uniform distribution of integer-valued numbers - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ - -namespace random -{ - -/*! \addtogroup random_number_distributions Random Number Distributions Class Templates - * \ingroup random - * \{ - */ - -/*! \class uniform_int_distribution - * \brief A \p uniform_int_distribution random number distribution produces signed or unsigned integer - * uniform random numbers from a given range. - * - * \tparam IntType The type of integer to produce. - * - * The following code snippet demonstrates examples of using a \p uniform_int_distribution with a - * random number engine to produce random integers drawn from a given range: - * - * \code - * #include - * #include - * - * int main(void) - * { - * // create a minstd_rand object to act as our source of randomness - * thrust::minstd_rand rng; - * - * // create a uniform_int_distribution to produce ints from [-7,13] - * thrust::uniform_int_distribution dist(-7,13); - * - * // write a random number from the range [-7,13] to standard output - * std::cout << dist(rng) << std::endl; - * - * // write the range of the distribution, just in case we forgot - * std::cout << dist.min() << std::endl; - * - * // -7 is printed - * - * std::cout << dist.max() << std::endl; - * - * // 13 is printed - * - * // write the parameters of the distribution (which happen to be the bounds) to standard output - * std::cout << dist.a() << std::endl; - * - * // -7 is printed - * - * std::cout << dist.b() << std::endl; - * - * // 13 is printed - * - * return 0; - * } - * \endcode - */ -template - class uniform_int_distribution -{ - public: - // types - - /*! \typedef result_type - * \brief The type of the integer produced by this \p uniform_int_distribution. - */ - typedef IntType result_type; - - /*! \typedef param_type - * \brief The type of the object encapsulating this \p uniform_int_distribution's parameters. - */ - typedef thrust::pair param_type; - - // constructors and reset functions - - /*! This constructor creates a new \p uniform_int_distribution from two values defining the - * range of the distribution. - * - * \param a The smallest integer to potentially produce. Defaults to \c 0. - * \param b The largest integer to potentially produce. Defaults to the largest representable integer in - * the platform. - */ - __host__ __device__ - explicit uniform_int_distribution(IntType a = 0, IntType b = thrust::detail::integer_traits::const_max); - - /*! This constructor creates a new \p uniform_int_distribution from a \p param_type object - * encapsulating the range of the distribution. - * - * \param parm A \p param_type object encapsulating the parameters (i.e., the range) of the distribution. - */ - __host__ __device__ - explicit uniform_int_distribution(const param_type &parm); - - /*! This does nothing. It is included to conform to the requirements of the RandomDistribution concept. - */ - __host__ __device__ - void reset(void); - - // generating functions - - /*! This method produces a new uniform random integer drawn from this \p uniform_int_distribution's - * range using a \p UniformRandomNumberGenerator as a source of randomness. - * - * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. - */ - template - __host__ __device__ - result_type operator()(UniformRandomNumberGenerator &urng); - - /*! 
This method produces a new uniform random integer as if by creating a new \p uniform_int_distribution - * from the given \p param_type object, and calling its operator() method with the given - * \p UniformRandomNumberGenerator as a source of randomness. - * - * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. - * \param parm A \p param_type object encapsulating the parameters of the \p uniform_int_distribution - * to draw from. - */ - template - __host__ __device__ - result_type operator()(UniformRandomNumberGenerator &urng, const param_type &parm); - - // property functions - - /*! This method returns the value of the parameter with which this \p uniform_int_distribution - * was constructed. - * - * \return The lower bound of this \p uniform_int_distribution's range. - */ - __host__ __device__ - result_type a(void) const; - - /*! This method returns the value of the parameter with which this \p uniform_int_distribution - * was constructed. - * - * \return The upper bound of this \p uniform_int_distribution's range. - */ - __host__ __device__ - result_type b(void) const; - - /*! This method returns a \p param_type object encapsulating the parameters with which this - * \p uniform_int_distribution was constructed. - * - * \return A \p param_type object enapsulating the range of this \p uniform_int_distribution. - */ - __host__ __device__ - param_type param(void) const; - - /*! This method changes the parameters of this \p uniform_int_distribution using the values encapsulated - * in a given \p param_type object. - * - * \param parm A \p param_type object encapsulating the new range of this \p uniform_int_distribution. - */ - __host__ __device__ - void param(const param_type &parm); - - /*! This method returns the smallest integer this \p uniform_int_distribution can potentially produce. - * - * \return The lower bound of this \p uniform_int_distribution's range. - */ - __host__ __device__ - result_type min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; - - /*! This method returns the largest integer this \p uniform_int_distribution can potentially produce. - * - * \return The upper bound of this \p uniform_int_distribution's range. - */ - __host__ __device__ - result_type max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; - - /*! \cond - */ - private: - param_type m_param; - - friend struct thrust::random::detail::random_core_access; - - __host__ __device__ - bool equal(const uniform_int_distribution &rhs) const; - - template - std::basic_ostream& stream_out(std::basic_ostream &os) const; - - template - std::basic_istream& stream_in(std::basic_istream &is); - /*! \endcond - */ -}; // end uniform_int_distribution - - -/*! This function checks two \p uniform_int_distributions for equality. - * \param lhs The first \p uniform_int_distribution to test. - * \param rhs The second \p uniform_int_distribution to test. - * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator==(const uniform_int_distribution &lhs, - const uniform_int_distribution &rhs); - - -/*! This function checks two \p uniform_int_distributions for inequality. - * \param lhs The first \p uniform_int_distribution to test. - * \param rhs The second \p uniform_int_distribution to test. - * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator!=(const uniform_int_distribution &lhs, - const uniform_int_distribution &rhs); - - -/*! 
This function streams a uniform_int_distribution to a \p std::basic_ostream. - * \param os The \p basic_ostream to stream out to. - * \param d The \p uniform_int_distribution to stream out. - * \return \p os - */ -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const uniform_int_distribution &d); - - -/*! This function streams a uniform_int_distribution in from a std::basic_istream. - * \param is The \p basic_istream to stream from. - * \param d The \p uniform_int_distribution to stream in. - * \return \p is - */ -template -std::basic_istream& -operator>>(std::basic_istream &is, - uniform_int_distribution &d); - - -/*! \} // end random_number_distributions - */ - - -} // end random - -using random::uniform_int_distribution; - -} // end thrust - -#include - diff --git a/compat/thrust/random/uniform_real_distribution.h b/compat/thrust/random/uniform_real_distribution.h deleted file mode 100644 index ab85ab33dc..0000000000 --- a/compat/thrust/random/uniform_real_distribution.h +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file uniform_real_distribution.h - * \brief A uniform distribution of real-valued numbers - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -namespace random -{ - - -/*! \addtogroup random_number_distributions - * \{ - */ - -/*! \class uniform_real_distribution - * \brief A \p uniform_real_distribution random number distribution produces floating point - * uniform random numbers from a half-open interval. - * - * \tparam RealType The type of floating point number to produce. - * - * The following code snippet demonstrates examples of using a \p uniform_real_distribution with a - * random number engine to produce random integers drawn from a given range: - * - * \code - * #include - * #include - * - * int main(void) - * { - * // create a minstd_rand object to act as our source of randomness - * thrust::minstd_rand rng; - * - * // create a uniform_real_distribution to produce floats from [-7,13) - * thrust::uniform_real_distribution dist(-7,13); - * - * // write a random number from the range [-7,13) to standard output - * std::cout << dist(rng) << std::endl; - * - * // write the range of the distribution, just in case we forgot - * std::cout << dist.min() << std::endl; - * - * // -7.0 is printed - * - * std::cout << dist.max() << std::endl; - * - * // 13.0 is printed - * - * // write the parameters of the distribution (which happen to be the bounds) to standard output - * std::cout << dist.a() << std::endl; - * - * // -7.0 is printed - * - * std::cout << dist.b() << std::endl; - * - * // 13.0 is printed - * - * return 0; - * } - * \endcode - */ -template - class uniform_real_distribution -{ - public: - // types - - /*! \typedef result_type - * \brief The type of the floating point number produced by this \p uniform_real_distribution. - */ - typedef RealType result_type; - - /*! 
\typedef param_type - * \brief The type of the object encapsulating this \p uniform_real_distribution's parameters. - */ - typedef thrust::pair param_type; - - // constructors and reset functions - - /*! This constructor creates a new \p uniform_real_distribution from two values defining the - * half-open interval of the distribution. - * - * \param a The smallest floating point number to potentially produce. Defaults to \c 0.0. - * \param b The smallest number larger than the largest floating point number to potentially produce. Defaults to \c 1.0. - */ - __host__ __device__ - explicit uniform_real_distribution(RealType a = 0.0, RealType b = 1.0); - - /*! This constructor creates a new \p uniform_real_distribution from a \p param_type object - * encapsulating the range of the distribution. - * - * \param parm A \p param_type object encapsulating the parameters (i.e., the range) of the distribution. - */ - __host__ __device__ - explicit uniform_real_distribution(const param_type &parm); - - /*! This does nothing. It is included to conform to the requirements of the RandomDistribution concept. - */ - __host__ __device__ - void reset(void); - - // generating functions - - /*! This method produces a new uniform random floating point number drawn from this \p uniform_real_distribution's - * range using a \p UniformRandomNumberGenerator as a source of randomness. - * - * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. - */ - template - __host__ __device__ - result_type operator()(UniformRandomNumberGenerator &urng); - - /*! This method produces a new uniform random floating point number as if by creating a new \p uniform_real_distribution - * from the given \p param_type object, and calling its operator() method with the given - * \p UniformRandomNumberGenerator as a source of randomness. - * - * \param urng The \p UniformRandomNumberGenerator to use as a source of randomness. - * \param parm A \p param_type object encapsulating the parameters of the \p uniform_real_distribution - * to draw from. - */ - template - __host__ __device__ - result_type operator()(UniformRandomNumberGenerator &urng, const param_type &parm); - - // property functions - - /*! This method returns the value of the parameter with which this \p uniform_real_distribution - * was constructed. - * - * \return The lower bound of this \p uniform_real_distribution's half-open interval. - */ - __host__ __device__ - result_type a(void) const; - - /*! This method returns the value of the parameter with which this \p uniform_real_distribution - * was constructed. - * - * \return The upper bound of this \p uniform_real_distribution's half-open interval. - */ - __host__ __device__ - result_type b(void) const; - - /*! This method returns a \p param_type object encapsulating the parameters with which this - * \p uniform_real_distribution was constructed. - * - * \return A \p param_type object encapsulating the half-open interval of this \p uniform_real_distribution. - */ - __host__ __device__ - param_type param(void) const; - - /*! This method changes the parameters of this \p uniform_real_distribution using the values encapsulated - * in a given \p param_type object. - * - * \param parm A \p param_type object encapsulating the new half-open interval of this \p uniform_real_distribution. - */ - __host__ __device__ - void param(const param_type &parm); - - /*! This method returns the smallest floating point number this \p uniform_real_distribution can potentially produce.
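As a quick illustration of the interface documented above, here is a minimal host-side sketch, assuming <thrust/random.h> is on the include path; the seed value 1234 is an arbitrary choice. It draws a few floats from [0,1) and queries the bounds:

#include <thrust/random.h>
#include <iostream>

int main(void)
{
  // engine seeded with an arbitrary value; any UniformRandomNumberGenerator works
  thrust::default_random_engine rng(1234);

  // produce floats uniformly distributed over the half-open interval [0,1)
  thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);

  for (int i = 0; i < 3; ++i)
    std::cout << dist(rng) << std::endl;

  // a() and b() return the bounds the distribution was constructed with
  std::cout << dist.a() << " " << dist.b() << std::endl; // prints 0 1
  return 0;
}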
- * - * \return The lower bound of this \p uniform_real_distribution's half-open interval. - */ - __host__ __device__ - result_type min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; - - /*! This method returns the smallest number larger than largest floating point number this \p uniform_real_distribution can potentially produce. - * - * \return The upper bound of this \p uniform_real_distribution's half-open interval. - */ - __host__ __device__ - result_type max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const; - - /*! \cond - */ - private: - param_type m_param; - - friend struct thrust::random::detail::random_core_access; - - __host__ __device__ - bool equal(const uniform_real_distribution &rhs) const; - - template - std::basic_ostream& stream_out(std::basic_ostream &os) const; - - template - std::basic_istream& stream_in(std::basic_istream &is); - /*! \endcond - */ -}; // end uniform_real_distribution - - -/*! This function checks two \p uniform_real_distributions for equality. - * \param lhs The first \p uniform_real_distribution to test. - * \param rhs The second \p uniform_real_distribution to test. - * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator==(const uniform_real_distribution &lhs, - const uniform_real_distribution &rhs); - - -/*! This function checks two \p uniform_real_distributions for inequality. - * \param lhs The first \p uniform_real_distribution to test. - * \param rhs The second \p uniform_real_distribution to test. - * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator!=(const uniform_real_distribution &lhs, - const uniform_real_distribution &rhs); - - -/*! This function streams a uniform_real_distribution to a \p std::basic_ostream. - * \param os The \p basic_ostream to stream out to. - * \param d The \p uniform_real_distribution to stream out. - * \return \p os - */ -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const uniform_real_distribution &d); - - -/*! This function streams a uniform_real_distribution in from a std::basic_istream. - * \param is The \p basic_istream to stream from. - * \param d The \p uniform_real_distribution to stream in. - * \return \p is - */ -template -std::basic_istream& -operator>>(std::basic_istream &is, - uniform_real_distribution &d); - - -/*! \} // end random_number_distributions - */ - - -} // end random - -using random::uniform_real_distribution; - -} // end thrust - -#include - diff --git a/compat/thrust/random/xor_combine_engine.h b/compat/thrust/random/xor_combine_engine.h deleted file mode 100644 index 61eb5a50c2..0000000000 --- a/compat/thrust/random/xor_combine_engine.h +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! 
\file xor_combine_engine.h - * \brief A pseudorandom number generator which produces pseudorandom - * numbers from two integer base engines by merging their - * pseudorandom numbers with bitwise exclusive-or. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include // for size_t - -namespace thrust -{ - -namespace random -{ - -/*! \addtogroup random_number_engine_adaptors - * \{ - */ - -/*! \class xor_combine_engine - * \brief An \p xor_combine_engine adapts two existing base random number engines and - * produces random values by combining the values produced by each. - * - * \tparam Engine1 The type of the first base random number engine to adapt. - * \tparam s1 The size of the first shift to use in the generation algorithm. - * \tparam Engine2 The type of the second base random number engine to adapt. - * \tparam s2 The size of the second shift to use in the generation algorithm. Defaults to \c 0. - * - * The following code snippet shows an example of using an \p xor_combine_engine instance: - * - * \code - * #include - * #include - * #include - * - * int main(void) - * { - * // create an xor_combine_engine from minstd_rand and minstd_rand0 - * // use a shift of 0 for each - * thrust::xor_combine_engine rng; - * - * // print a random number to standard output - * std::cout << rng() << std::endl; - * - * return 0; - * } - * \endcode - */ -template - class xor_combine_engine -{ - public: - // types - - /*! \typedef base1_type - * \brief The type of the first adapted base random number engine. - */ - typedef Engine1 base1_type; - - /*! \typedef base2_type - * \brief The type of the second adapted base random number engine. - */ - typedef Engine2 base2_type; - - /*! \typedef result_type - * \brief The type of the unsigned integer produced by this \p xor_combine_engine. - */ - typedef typename thrust::detail::eval_if< - (sizeof(typename base2_type::result_type) > sizeof(typename base1_type::result_type)), - thrust::detail::identity_, - thrust::detail::identity_ - >::type result_type; - - /*! The size of the first shift used in the generation algorithm. - */ - static const size_t shift1 = s1; - - /*! The size of the second shift used in the generation algorithm. - */ - static const size_t shift2 = s2; - - /*! The smallest value this \p xor_combine_engine may potentially produce. - */ - static const result_type min = 0; - - /*! The largest value this \p xor_combine_engine may potentially produce. - */ - static const result_type max = - detail::xor_combine_engine_max< - Engine1, s1, Engine2, s2, result_type - >::value; - - // constructors and seeding functions - - /*! This constructor constructs a new \p xor_combine_engine and constructs - * its adapted engines using their null constructors. - */ - __host__ __device__ - xor_combine_engine(void); - - /*! This constructor constructs a new \p xor_combine_engine using - * given \p base1_type and \p base2_type engines to initialize its adapted base engines. - * - * \param urng1 A \p base1_type to use to initialize this \p xor_combine_engine's - * first adapted base engine. - * \param urng2 A \p base2_type to use to initialize this \p xor_combine_engine's - * second adapted base engine. - */ - __host__ __device__ - xor_combine_engine(const base1_type &urng1, const base2_type &urng2); - - /*! This constructor initializes a new \p xor_combine_engine with a given seed. - * - * \param s The seed used to initialize this \p xor_combine_engine's adapted base engines.
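A minimal usage sketch for the adaptor documented above, assuming <thrust/random.h> is available; the choice of base engines, the zero shifts, the seed and the discard count are all arbitrary:

#include <thrust/random.h>
#include <iostream>

int main(void)
{
  // combine minstd_rand and minstd_rand0 with a shift of 0 for each
  typedef thrust::xor_combine_engine<thrust::minstd_rand, 0,
                                     thrust::minstd_rand0, 0> engine_type;

  engine_type rng;   // both base engines default-constructed
  rng.seed(2016u);   // reseed both base engines from a single value
  rng.discard(5);    // advance the state and throw away five values

  std::cout << rng() << std::endl;
  return 0;
}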
- */ - __host__ __device__ - xor_combine_engine(result_type s); - - /*! This method initializes the state of this \p xor_combine_engine's adapted base engines - * by using their \p default_seed values. - */ - __host__ __device__ - void seed(void); - - /*! This method initializes the state of this \p xor_combine_engine's adapted base engines - * by using the given seed. - * - * \param s The seed with which to intialize this \p xor_combine_engine's adapted base engines. - */ - __host__ __device__ - void seed(result_type s); - - // generating functions - - /*! This member function produces a new random value and updates this \p xor_combine_engine's state. - * \return A new random number. - */ - __host__ __device__ - result_type operator()(void); - - /*! This member function advances this \p xor_combine_engine's state a given number of times - * and discards the results. - * - * \param z The number of random values to discard. - * \note This function is provided because an implementation may be able to accelerate it. - */ - __host__ __device__ - void discard(unsigned long long z); - - // property functions - - /*! This member function returns a const reference to this \p xor_combine_engine's - * first adapted base engine. - * - * \return A const reference to the first base engine this \p xor_combine_engine adapts. - */ - __host__ __device__ - const base1_type &base1(void) const; - - /*! This member function returns a const reference to this \p xor_combine_engine's - * second adapted base engine. - * - * \return A const reference to the second base engine this \p xor_combine_engine adapts. - */ - __host__ __device__ - const base2_type &base2(void) const; - - /*! \cond - */ - private: - base1_type m_b1; - base2_type m_b2; - - friend struct thrust::random::detail::random_core_access; - - __host__ __device__ - bool equal(const xor_combine_engine &rhs) const; - - template - std::basic_istream& stream_in(std::basic_istream &is); - - template - std::basic_ostream& stream_out(std::basic_ostream &os) const; - - /*! \endcond - */ -}; // end xor_combine_engine - - -/*! This function checks two \p xor_combine_engines for equality. - * \param lhs The first \p xor_combine_engine to test. - * \param rhs The second \p xor_combine_engine to test. - * \return \c true if \p lhs is equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator==(const xor_combine_engine &lhs, - const xor_combine_engine &rhs); - - -/*! This function checks two \p xor_combine_engines for inequality. - * \param lhs The first \p xor_combine_engine to test. - * \param rhs The second \p xor_combine_engine to test. - * \return \c true if \p lhs is not equal to \p rhs; \c false, otherwise. - */ -template -__host__ __device__ -bool operator!=(const xor_combine_engine &lhs, - const xor_combine_engine &rhs); - - -/*! This function streams a xor_combine_engine to a \p std::basic_ostream. - * \param os The \p basic_ostream to stream out to. - * \param e The \p xor_combine_engine to stream out. - * \return \p os - */ -template -std::basic_ostream& -operator<<(std::basic_ostream &os, - const xor_combine_engine &e); - - -/*! This function streams a xor_combine_engine in from a std::basic_istream. - * \param is The \p basic_istream to stream from. - * \param e The \p xor_combine_engine to stream in. - * \return \p is - */ -template -std::basic_istream& -operator>>(std::basic_istream &is, - xor_combine_engine &e); - - -/*! 
\} // end random_number_engine_adaptors - */ - - -} // end random - -// import names into thrust:: -using random::xor_combine_engine; - -} // end thrust - -#include - diff --git a/compat/thrust/reduce.h b/compat/thrust/reduce.h deleted file mode 100644 index 1dc931f9a1..0000000000 --- a/compat/thrust/reduce.h +++ /dev/null @@ -1,779 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce.h - * \brief Functions for reducing a range to a single value - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup reductions - * \{ - */ - - -/*! \p reduce is a generalization of summation: it computes the sum (or some - * other binary operation) of all the elements in the range [first, - * last). This version of \p reduce uses \c 0 as the initial value of the - * reduction. \p reduce is similar to the C++ Standard Template Library's - * std::accumulate. The primary difference between the two functions - * is that std::accumulate guarantees the order of summation, while - * \p reduce requires associativity of the binary operation to parallelize - * the reduction. - * - * Note that \p reduce also assumes that the binary reduction operator (in this - * case operator+) is commutative. If the reduction operator is not commutative - * then \p thrust::reduce should not be used. Instead, one could use - * \p inclusive_scan (which does not require commutativity) and select the - * last element of the output array. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \return The result of the reduction. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and if \c x and \c y are objects of \p InputIterator's \c value_type, - * then x + y is defined and is convertible to \p InputIterator's - * \c value_type. If \c T is \c InputIterator's \c value_type, then - * T(0) is defined. - * - * The following code snippet demonstrates how to use \p reduce to compute - * the sum of a sequence of integers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * int result = thrust::reduce(thrust::host, data, data + 6); - * - * // result == 9 - * \endcode - * - * \see http://www.sgi.com/tech/stl/accumulate.html - */ -template - typename thrust::iterator_traits::value_type - reduce(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last); - - -/*! \p reduce is a generalization of summation: it computes the sum (or some - * other binary operation) of all the elements in the range [first, - * last). This version of \p reduce uses \c 0 as the initial value of the - * reduction. 
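For reference, a short sketch of the zero-initialised summation just described, applied to a thrust::device_vector so that the device overload is selected by the iterator type; it assumes a CUDA-capable build:

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <iostream>

int main(void)
{
  int data[6] = {1, 0, 2, 2, 1, 3};
  thrust::device_vector<int> d(data, data + 6);

  // 0 is the implicit initial value and operator+ the implicit reduction
  int sum = thrust::reduce(d.begin(), d.end());

  std::cout << sum << std::endl; // prints 9
  return 0;
}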
\p reduce is similar to the C++ Standard Template Library's - * std::accumulate. The primary difference between the two functions - * is that std::accumulate guarantees the order of summation, while - * \p reduce requires associativity of the binary operation to parallelize - * the reduction. - * - * Note that \p reduce also assumes that the binary reduction operator (in this - * case operator+) is commutative. If the reduction operator is not commutative - * then \p thrust::reduce should not be used. Instead, one could use - * \p inclusive_scan (which does not require commutativity) and select the - * last element of the output array. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \return The result of the reduction. - * - * \tparam InputIterator is a model of Input Iterator - * and if \c x and \c y are objects of \p InputIterator's \c value_type, - * then x + y is defined and is convertible to \p InputIterator's - * \c value_type. If \c T is \c InputIterator's \c value_type, then - * T(0) is defined. - * - * The following code snippet demonstrates how to use \p reduce to compute - * the sum of a sequence of integers. - * - * \code - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * int result = thrust::reduce(data, data + 6); - * - * // result == 9 - * \endcode - * - * \see http://www.sgi.com/tech/stl/accumulate.html - */ -template typename - thrust::iterator_traits::value_type reduce(InputIterator first, InputIterator last); - - -/*! \p reduce is a generalization of summation: it computes the sum (or some - * other binary operation) of all the elements in the range [first, - * last). This version of \p reduce uses \p init as the initial value of the - * reduction. \p reduce is similar to the C++ Standard Template Library's - * std::accumulate. The primary difference between the two functions - * is that std::accumulate guarantees the order of summation, while - * \p reduce requires associativity of the binary operation to parallelize - * the reduction. - * - * Note that \p reduce also assumes that the binary reduction operator (in this - * case operator+) is commutative. If the reduction operator is not commutative - * then \p thrust::reduce should not be used. Instead, one could use - * \p inclusive_scan (which does not require commutativity) and select the - * last element of the output array. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param init The initial value. - * \return The result of the reduction. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and if \c x and \c y are objects of \p InputIterator's \c value_type, - * then x + y is defined and is convertible to \p T. - * \tparam T is convertible to \p InputIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p reduce to compute - * the sum of a sequence of integers including an intialization value using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * int data[6] = {1, 0, 2, 2, 1, 3}; - * int result = thrust::reduce(thrust::host, data, data + 6, 1); - * - * // result == 10 - * \endcode - * - * \see http://www.sgi.com/tech/stl/accumulate.html - */ -template - T reduce(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - T init); - - -/*! \p reduce is a generalization of summation: it computes the sum (or some - * other binary operation) of all the elements in the range [first, - * last). This version of \p reduce uses \p init as the initial value of the - * reduction. \p reduce is similar to the C++ Standard Template Library's - * std::accumulate. The primary difference between the two functions - * is that std::accumulate guarantees the order of summation, while - * \p reduce requires associativity of the binary operation to parallelize - * the reduction. - * - * Note that \p reduce also assumes that the binary reduction operator (in this - * case operator+) is commutative. If the reduction operator is not commutative - * then \p thrust::reduce should not be used. Instead, one could use - * \p inclusive_scan (which does not require commutativity) and select the - * last element of the output array. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param init The initial value. - * \return The result of the reduction. - * - * \tparam InputIterator is a model of Input Iterator - * and if \c x and \c y are objects of \p InputIterator's \c value_type, - * then x + y is defined and is convertible to \p T. - * \tparam T is convertible to \p InputIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p reduce to compute - * the sum of a sequence of integers including an intialization value. - * - * \code - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * int result = thrust::reduce(data, data + 6, 1); - * - * // result == 10 - * \endcode - * - * \see http://www.sgi.com/tech/stl/accumulate.html - */ -template - T reduce(InputIterator first, - InputIterator last, - T init); - - -/*! \p reduce is a generalization of summation: it computes the sum (or some - * other binary operation) of all the elements in the range [first, - * last). This version of \p reduce uses \p init as the initial value of the - * reduction and \p binary_op as the binary function used for summation. \p reduce - * is similar to the C++ Standard Template Library's std::accumulate. - * The primary difference between the two functions is that std::accumulate - * guarantees the order of summation, while \p reduce requires associativity of - * \p binary_op to parallelize the reduction. - * - * Note that \p reduce also assumes that the binary reduction operator (in this - * case \p binary_op) is commutative. If the reduction operator is not commutative - * then \p thrust::reduce should not be used. Instead, one could use - * \p inclusive_scan (which does not require commutativity) and select the - * last element of the output array. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param init The initial value. - * \param binary_op The binary function used to 'sum' values. - * \return The result of the reduction. - * - * \tparam DerivedPolicy The name of the derived execution policy. 
- * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to \c T. - * \tparam T is a model of Assignable, - * and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type. - * \tparam BinaryFunction is a model of Binary Function, - * and \p BinaryFunction's \c result_type is convertible to \p OutputType. - * - * The following code snippet demonstrates how to use \p reduce to - * compute the maximum value of a sequence of integers using the \p thrust::host execution policy - * for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * int result = thrust::reduce(thrust::host, - * data, data + 6, - * -1, - * thrust::maximum()); - * // result == 3 - * \endcode - * - * \see http://www.sgi.com/tech/stl/accumulate.html - * \see transform_reduce - */ -template - T reduce(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - T init, - BinaryFunction binary_op); - - -/*! \p reduce is a generalization of summation: it computes the sum (or some - * other binary operation) of all the elements in the range [first, - * last). This version of \p reduce uses \p init as the initial value of the - * reduction and \p binary_op as the binary function used for summation. \p reduce - * is similar to the C++ Standard Template Library's std::accumulate. - * The primary difference between the two functions is that std::accumulate - * guarantees the order of summation, while \p reduce requires associativity of - * \p binary_op to parallelize the reduction. - * - * Note that \p reduce also assumes that the binary reduction operator (in this - * case \p binary_op) is commutative. If the reduction operator is not commutative - * then \p thrust::reduce should not be used. Instead, one could use - * \p inclusive_scan (which does not require commutativity) and select the - * last element of the output array. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param init The initial value. - * \param binary_op The binary function used to 'sum' values. - * \return The result of the reduction. - * - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to \c T. - * \tparam T is a model of Assignable, - * and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type. - * \tparam BinaryFunction is a model of Binary Function, - * and \p BinaryFunction's \c result_type is convertible to \p OutputType. - * - * The following code snippet demonstrates how to use \p reduce to - * compute the maximum value of a sequence of integers. - * - * \code - * #include - * #include - * ... - * int data[6] = {1, 0, 2, 2, 1, 3}; - * int result = thrust::reduce(data, data + 6, - * -1, - * thrust::maximum()); - * // result == 3 - * \endcode - * - * \see http://www.sgi.com/tech/stl/accumulate.html - * \see transform_reduce - */ -template - T reduce(InputIterator first, - InputIterator last, - T init, - BinaryFunction binary_op); - - -/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p reduce_by_key copies the first element of the group to the - * \c keys_output. The corresponding values in the range are reduced using the - * \c plus and the result copied to \c values_output. 
- * - * This version of \p reduce_by_key uses the function object \c equal_to - * to test for equality and \c plus to reduce values with equal keys. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first The beginning of the input key range. - * \param keys_last The end of the input key range. - * \param values_first The beginning of the input value range. - * \param keys_output The beginning of the output key range. - * \param values_output The beginning of the output value range. - * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputIterator1 is a model of Output Iterator and - * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. - * \tparam OutputIterator2 is a model of Output Iterator and - * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. - * - * \pre The input ranges shall not overlap either output range. - * - * The following code snippet demonstrates how to use \p reduce_by_key to - * compact a sequence of key/value pairs and sum values with equal keys using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values - * int C[N]; // output keys - * int D[N]; // output values - * - * thrust::pair new_end; - * new_end = thrust::reduce_by_key(thrust::host, A, A + N, B, C, D); - * - * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. - * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. - * \endcode - * - * \see reduce - * \see unique_copy - * \see unique_by_key - * \see unique_by_key_copy - */ -template - thrust::pair - reduce_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output); - - -/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p reduce_by_key copies the first element of the group to the - * \c keys_output. The corresponding values in the range are reduced using the - * \c plus and the result copied to \c values_output. - * - * This version of \p reduce_by_key uses the function object \c equal_to - * to test for equality and \c plus to reduce values with equal keys. - * - * \param keys_first The beginning of the input key range. - * \param keys_last The end of the input key range. - * \param values_first The beginning of the input value range. - * \param keys_output The beginning of the output key range. - * \param values_output The beginning of the output value range. - * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). 
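A compact device_vector sketch of this form of \p reduce_by_key, reusing the keys and values from the documentation examples so the reduced output matches the comments there:

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/pair.h>
#include <iostream>

int main(void)
{
  int keys[7] = {1, 3, 3, 3, 2, 2, 1};
  int vals[7] = {9, 8, 7, 6, 5, 4, 3};
  thrust::device_vector<int> K(keys, keys + 7), V(vals, vals + 7);
  thrust::device_vector<int> K_out(7), V_out(7);

  typedef thrust::device_vector<int>::iterator iter;
  thrust::pair<iter, iter> end =
    thrust::reduce_by_key(K.begin(), K.end(), V.begin(),
                          K_out.begin(), V_out.begin());

  // four segments remain: keys {1, 3, 2, 1} with sums {9, 21, 9, 3}
  size_t n = end.first - K_out.begin();
  for (size_t i = 0; i < n; ++i)
    std::cout << K_out[i] << ":" << V_out[i] << " ";
  std::cout << std::endl;
  return 0;
}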
- * - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputIterator1 is a model of Output Iterator and - * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. - * \tparam OutputIterator2 is a model of Output Iterator and - * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. - * - * \pre The input ranges shall not overlap either output range. - * - * The following code snippet demonstrates how to use \p reduce_by_key to - * compact a sequence of key/value pairs and sum values with equal keys. - * - * \code - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values - * int C[N]; // output keys - * int D[N]; // output values - * - * thrust::pair new_end; - * new_end = thrust::reduce_by_key(A, A + N, B, C, D); - * - * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. - * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. - * \endcode - * - * \see reduce - * \see unique_copy - * \see unique_by_key - * \see unique_by_key_copy - */ -template - thrust::pair - reduce_by_key(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output); - - -/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p reduce_by_key copies the first element of the group to the - * \c keys_output. The corresponding values in the range are reduced using the - * \c plus and the result copied to \c values_output. - * - * This version of \p reduce_by_key uses the function object \c binary_pred - * to test for equality and \c plus to reduce values with equal keys. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first The beginning of the input key range. - * \param keys_last The end of the input key range. - * \param values_first The beginning of the input value range. - * \param keys_output The beginning of the output key range. - * \param values_output The beginning of the output value range. - * \param binary_pred The binary predicate used to determine equality. - * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputIterator1 is a model of Output Iterator and - * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. - * \tparam OutputIterator2 is a model of Output Iterator and - * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * \pre The input ranges shall not overlap either output range. - * - * The following code snippet demonstrates how to use \p reduce_by_key to - * compact a sequence of key/value pairs and sum values with equal keys using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values - * int C[N]; // output keys - * int D[N]; // output values - * - * thrust::pair new_end; - * thrust::equal_to binary_pred; - * new_end = thrust::reduce_by_key(thrust::host, A, A + N, B, C, D, binary_pred); - * - * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. - * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. - * \endcode - * - * \see reduce - * \see unique_copy - * \see unique_by_key - * \see unique_by_key_copy - */ -template - thrust::pair - reduce_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred); - - -/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p reduce_by_key copies the first element of the group to the - * \c keys_output. The corresponding values in the range are reduced using the - * \c plus and the result copied to \c values_output. - * - * This version of \p reduce_by_key uses the function object \c binary_pred - * to test for equality and \c plus to reduce values with equal keys. - * - * \param keys_first The beginning of the input key range. - * \param keys_last The end of the input key range. - * \param values_first The beginning of the input value range. - * \param keys_output The beginning of the output key range. - * \param values_output The beginning of the output value range. - * \param binary_pred The binary predicate used to determine equality. - * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). - * - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputIterator1 is a model of Output Iterator and - * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. - * \tparam OutputIterator2 is a model of Output Iterator and - * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * \pre The input ranges shall not overlap either output range. - * - * The following code snippet demonstrates how to use \p reduce_by_key to - * compact a sequence of key/value pairs and sum values with equal keys. - * - * \code - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values - * int C[N]; // output keys - * int D[N]; // output values - * - * thrust::pair new_end; - * thrust::equal_to binary_pred; - * new_end = thrust::reduce_by_key(A, A + N, B, C, D, binary_pred); - * - * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. - * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. - * \endcode - * - * \see reduce - * \see unique_copy - * \see unique_by_key - * \see unique_by_key_copy - */ -template - thrust::pair - reduce_by_key(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred); - - -/*! 
\p reduce_by_key is a generalization of \p reduce to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p reduce_by_key copies the first element of the group to the - * \c keys_output. The corresponding values in the range are reduced using the - * \c BinaryFunction \c binary_op and the result copied to \c values_output. - * Specifically, if consecutive key iterators \c i and \c (i + 1) are - * such that binary_pred(*i, *(i+1)) is \c true, then the corresponding - * values are reduced to a single value with \c binary_op. - * - * This version of \p reduce_by_key uses the function object \c binary_pred - * to test for equality and \c binary_op to reduce values with equal keys. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first The beginning of the input key range. - * \param keys_last The end of the input key range. - * \param values_first The beginning of the input value range. - * \param keys_output The beginning of the output key range. - * \param values_output The beginning of the output value range. - * \param binary_pred The binary predicate used to determine equality. - * \param binary_op The binary function used to accumulate values. - * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputIterator1 is a model of Output Iterator and - * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. - * \tparam OutputIterator2 is a model of Output Iterator and - * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * \tparam BinaryFunction is a model of Binary Function - * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator2's \c value_type. - * - * \pre The input ranges shall not overlap either output range. - * - * The following code snippet demonstrates how to use \p reduce_by_key to - * compact a sequence of key/value pairs and sum values with equal keys using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values - * int C[N]; // output keys - * int D[N]; // output values - * - * thrust::pair new_end; - * thrust::equal_to binary_pred; - * thrust::plus binary_op; - * new_end = thrust::reduce_by_key(thrust::host, A, A + N, B, C, D, binary_pred, binary_op); - * - * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. - * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. - * \endcode - * - * \see reduce - * \see unique_copy - * \see unique_by_key - * \see unique_by_key_copy - */ -template - thrust::pair - reduce_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op); - - -/*! \p reduce_by_key is a generalization of \p reduce to key-value pairs. 
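To illustrate the fully general form, a sketch that keeps the same keys and values but reduces each segment with thrust::maximum instead of the default plus; raw host pointers dispatch to the host backend, as in the documentation examples:

#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/pair.h>
#include <iostream>

int main(void)
{
  const int N = 7;
  int A[N] = {1, 3, 3, 3, 2, 2, 1};  // input keys
  int B[N] = {9, 8, 7, 6, 5, 4, 3};  // input values
  int C[N];                          // output keys
  int D[N];                          // output values (per-segment maxima)

  thrust::pair<int*, int*> new_end =
    thrust::reduce_by_key(A, A + N, B, C, D,
                          thrust::equal_to<int>(),
                          thrust::maximum<int>());

  // segments {1},{3,3,3},{2,2},{1} yield maxima 9, 8, 5, 3
  for (int *k = C, *v = D; k != new_end.first; ++k, ++v)
    std::cout << *k << ":" << *v << " ";
  std::cout << std::endl;
  return 0;
}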
- * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p reduce_by_key copies the first element of the group to the - * \c keys_output. The corresponding values in the range are reduced using the - * \c BinaryFunction \c binary_op and the result copied to \c values_output. - * Specifically, if consecutive key iterators \c i and \c (i + 1) are - * such that binary_pred(*i, *(i+1)) is \c true, then the corresponding - * values are reduced to a single value with \c binary_op. - * - * This version of \p reduce_by_key uses the function object \c binary_pred - * to test for equality and \c binary_op to reduce values with equal keys. - * - * \param keys_first The beginning of the input key range. - * \param keys_last The end of the input key range. - * \param values_first The beginning of the input value range. - * \param keys_output The beginning of the output key range. - * \param values_output The beginning of the output value range. - * \param binary_pred The binary predicate used to determine equality. - * \param binary_op The binary function used to accumulate values. - * \return A pair of iterators at end of the ranges [keys_output, keys_output_last) and [values_output, values_output_last). - * - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputIterator1 is a model of Output Iterator and - * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. - * \tparam OutputIterator2 is a model of Output Iterator and - * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * \tparam BinaryFunction is a model of Binary Function - * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator2's \c value_type. - * - * \pre The input ranges shall not overlap either output range. - * - * The following code snippet demonstrates how to use \p reduce_by_key to - * compact a sequence of key/value pairs and sum values with equal keys. - * - * \code - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values - * int C[N]; // output keys - * int D[N]; // output values - * - * thrust::pair new_end; - * thrust::equal_to binary_pred; - * thrust::plus binary_op; - * new_end = thrust::reduce_by_key(A, A + N, B, C, D, binary_pred, binary_op); - * - * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. - * // The first four values in D are now {9, 21, 9, 3} and new_end.second - D is 4. - * \endcode - * - * \see reduce - * \see unique_copy - * \see unique_by_key - * \see unique_by_key_copy - */ -template - thrust::pair - reduce_by_key(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op); - - -/*! \} // end reductions - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/remove.h b/compat/thrust/remove.h deleted file mode 100644 index c538776258..0000000000 --- a/compat/thrust/remove.h +++ /dev/null @@ -1,800 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file remove.h - * \brief Functions for removing elements from a range - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup stream_compaction Stream Compaction - * \ingroup reordering - * \{ - * - */ - - -/*! \p remove removes from the range [first, last) all elements that are - * equal to \p value. That is, \p remove returns an iterator \p new_last such - * that the range [first, new_last) contains no elements equal to - * \p value. The iterators in the range [new_first,last) are all still - * dereferenceable, but the elements that they point to are unspecified. \p remove - * is stable, meaning that the relative order of elements that are not equal to - * \p value is unchanged. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param value The value to remove from the range [first, last). - * Elements which are equal to value are removed from the sequence. - * \return A \p ForwardIterator pointing to the end of the resulting range of - * elements which are not equal to \p value. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam T is a model of Equality Comparable, - * and objects of type \p T can be compared for equality with objects of \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p remove to remove a number - * of interest from a range using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 6; - * int A[N] = {3, 1, 4, 1, 5, 9}; - * int *new_end = thrust::remove(A, A + N, 1); - * // The first four values of A are now {3, 4, 5, 9} - * // Values beyond new_end are unspecified - * \endcode - * - * \note The meaning of "removal" is somewhat subtle. \p remove does not destroy any - * iterators, and does not change the distance between \p first and \p last. - * (There's no way that it could do anything of the sort.) So, for example, if - * \c V is a device_vector, remove(V.begin(), V.end(), 0) does not - * change V.size(): \c V will contain just as many elements as it did - * before. \p remove returns an iterator that points to the end of the resulting - * range after elements have been removed from it; it follows that the elements - * after that iterator are of no interest, and may be discarded. If you are - * removing elements from a - * Sequence, you may - * simply erase them. That is, a reasonable way of removing elements from a - * Sequence is - * S.erase(remove(S.begin(), S.end(), x), S.end()). 
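The erase-remove idiom mentioned in the note above, sketched for a thrust::device_vector; the removed value 0 and the six-element input are arbitrary:

#include <thrust/device_vector.h>
#include <thrust/remove.h>

int main(void)
{
  int init[6] = {-2, 0, -1, 0, 1, 2};
  thrust::device_vector<int> V(init, init + 6);

  // remove() only moves the kept elements to the front; erase() actually shrinks V
  V.erase(thrust::remove(V.begin(), V.end(), 0), V.end());

  // V.size() is now 4 and V holds {-2, -1, 1, 2}
  return 0;
}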
- * - * \see http://www.sgi.com/tech/stl/remove.html - * \see remove_if - * \see remove_copy - * \see remove_copy_if - */ -template - ForwardIterator remove(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T &value); - - -/*! \p remove removes from the range [first, last) all elements that are - * equal to \p value. That is, \p remove returns an iterator \p new_last such - * that the range [first, new_last) contains no elements equal to - * \p value. The iterators in the range [new_first,last) are all still - * dereferenceable, but the elements that they point to are unspecified. \p remove - * is stable, meaning that the relative order of elements that are not equal to - * \p value is unchanged. - * - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param value The value to remove from the range [first, last). - * Elements which are equal to value are removed from the sequence. - * \return A \p ForwardIterator pointing to the end of the resulting range of - * elements which are not equal to \p value. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam T is a model of Equality Comparable, - * and objects of type \p T can be compared for equality with objects of \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p remove to remove a number - * of interest from a range. - * - * \code - * #include - * ... - * const int N = 6; - * int A[N] = {3, 1, 4, 1, 5, 9}; - * int *new_end = thrust::remove(A, A + N, 1); - * // The first four values of A are now {3, 4, 5, 9} - * // Values beyond new_end are unspecified - * \endcode - * - * \note The meaning of "removal" is somewhat subtle. \p remove does not destroy any - * iterators, and does not change the distance between \p first and \p last. - * (There's no way that it could do anything of the sort.) So, for example, if - * \c V is a device_vector, remove(V.begin(), V.end(), 0) does not - * change V.size(): \c V will contain just as many elements as it did - * before. \p remove returns an iterator that points to the end of the resulting - * range after elements have been removed from it; it follows that the elements - * after that iterator are of no interest, and may be discarded. If you are - * removing elements from a - * Sequence, you may - * simply erase them. That is, a reasonable way of removing elements from a - * Sequence is - * S.erase(remove(S.begin(), S.end(), x), S.end()). - * - * \see http://www.sgi.com/tech/stl/remove.html - * \see remove_if - * \see remove_copy - * \see remove_copy_if - */ -template - ForwardIterator remove(ForwardIterator first, - ForwardIterator last, - const T &value); - - -/*! \p remove_copy copies elements that are not equal to \p value from the range - * [first, last) to a range beginning at \p result. The return value is - * the end of the resulting range. This operation is stable, meaning that the - * relative order of the elements that are copied is the same as in - * the range [first, last). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param result The resulting range is copied to the sequence beginning at this - * location. 
- * \param value The value to omit from the copied range. - * \return An OutputIterator pointing to the end of the resulting range of elements - * which are not equal to \p value. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam T is a model of Equality Comparable, - * and objects of type \p T can be compared for equality with objects of \p InputIterator's \c value_type. - * - * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). - * - * The following code snippet demonstrates how to use \p remove_copy to copy - * a sequence of numbers to an output range while omitting a value of interest using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 6; - * int V[N] = {-2, 0, -1, 0, 1, 2}; - * int result[N-2]; - * thrust::remove_copy(thrust::host, V, V + N, result, 0); - * // V remains {-2, 0, -1, 0, 1, 2} - * // result is now {-2, -1, 1, 2} - * \endcode - * - * \see http://www.sgi.com/tech/stl/remove_copy.html - * \see remove - * \see remove_if - * \see remove_copy_if - */ -template - OutputIterator remove_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - const T &value); - - -/*! \p remove_copy copies elements that are not equal to \p value from the range - * [first, last) to a range beginning at \p result. The return value is - * the end of the resulting range. This operation is stable, meaning that the - * relative order of the elements that are copied is the same as in - * the range [first, last). - * - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param result The resulting range is copied to the sequence beginning at this - * location. - * \param value The value to omit from the copied range. - * \return An OutputIterator pointing to the end of the resulting range of elements - * which are not equal to \p value. - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam T is a model of Equality Comparable, - * and objects of type \p T can be compared for equality with objects of \p InputIterator's \c value_type. - * - * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). - * - * The following code snippet demonstrates how to use \p remove_copy to copy - * a sequence of numbers to an output range while omitting a value of interest. - * - * \code - * #include - * ... - * const int N = 6; - * int V[N] = {-2, 0, -1, 0, 1, 2}; - * int result[N-2]; - * thrust::remove_copy(V, V + N, result, 0); - * // V remains {-2, 0, -1, 0, 1, 2} - * // result is now {-2, -1, 1, 2} - * \endcode - * - * \see http://www.sgi.com/tech/stl/remove_copy.html - * \see remove - * \see remove_if - * \see remove_copy_if - */ -template - OutputIterator remove_copy(InputIterator first, - InputIterator last, - OutputIterator result, - const T &value); - - -/*! \p remove_if removes from the range [first, last) every element \p x - * such that pred(x) is \c true. 
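[Illustrative sketch for the remove_copy overloads above, not part of the deleted header: pre-compute the number of surviving elements with thrust::count so the output range can be sized exactly. Assumes host-side data and the thrust::host policy.]

    #include <thrust/count.h>
    #include <thrust/remove.h>
    #include <thrust/execution_policy.h>
    #include <vector>

    int main()
    {
        const int N = 6;
        int V[N] = {-2, 0, -1, 0, 1, 2};

        // Elements equal to 0 are omitted, so the output needs N - count slots.
        std::vector<int> result(N - thrust::count(thrust::host, V, V + N, 0));

        thrust::remove_copy(thrust::host, V, V + N, result.data(), 0);
        // result is {-2, -1, 1, 2}; V is unchanged.
        return 0;
    }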
That is, \p remove_if returns an - * iterator \c new_last such that the range [first,new_last) contains - * no elements for which \p pred is \c true. The iterators in the range - * [new_last,last) are all still dereferenceable, but the elements that - * they point to are unspecified. \p remove_if is stable, meaning that the - * relative order of elements that are not removed is unchanged. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param pred A predicate to evaluate for each element of the range - * [first,last). Elements for which \p pred evaluates to - * \c false are removed from the sequence. - * \return A ForwardIterator pointing to the end of the resulting range of - * elements for which \p pred evaluated to \c true. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, - * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * The following code snippet demonstrates how to use \p remove_if to remove - * all even numbers from an array of integers using the \p thrust::host execution policy for - * parallelization: - * - * \code - * #include - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * int *new_end = thrust::remove_if(thrust::host, A, A + N, is_even()); - * // The first three values of A are now {1, 5, 7} - * // Values beyond new_end are unspecified - * \endcode - * - * \note The meaning of "removal" is somewhat subtle. \p remove_if does not - * destroy any iterators, and does not change the distance between \p first and - * \p last. (There's no way that it could do anything of the sort.) So, for - * example, if \c V is a device_vector, - * remove_if(V.begin(), V.end(), pred) does not change - * V.size(): \c V will contain just as many elements as it did before. - * \p remove_if returns an iterator that points to the end of the resulting - * range after elements have been removed from it; it follows that the elements - * after that iterator are of no interest, and may be discarded. If you are - * removing elements from a - * Sequence, you may - * simply erase them. That is, a reasonable way of removing elements from a - * Sequence is - * S.erase(remove_if(S.begin(), S.end(), pred), S.end()). - * - * \see http://www.sgi.com/tech/stl/remove_if.html - * \see remove - * \see remove_copy - * \see remove_copy_if - */ -template - ForwardIterator remove_if(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - - -/*! \p remove_if removes from the range [first, last) every element \p x - * such that pred(x) is \c true. That is, \p remove_if returns an - * iterator \c new_last such that the range [first,new_last) contains - * no elements for which \p pred is \c true. The iterators in the range - * [new_last,last) are all still dereferenceable, but the elements that - * they point to are unspecified. \p remove_if is stable, meaning that the - * relative order of elements that are not removed is unchanged. 
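[Illustrative sketch, not part of the deleted header: shrinking a device_vector after remove_if by computing the new size with thrust::distance. Uses an is_even functor like the one in the snippet above; assumes compilation with nvcc.]

    #include <thrust/device_vector.h>
    #include <thrust/remove.h>
    #include <thrust/distance.h>

    struct is_even
    {
        __host__ __device__
        bool operator()(int x) const { return (x % 2) == 0; }
    };

    int main()
    {
        int data[6] = {1, 4, 2, 8, 5, 7};
        thrust::device_vector<int> V(data, data + 6);

        thrust::device_vector<int>::iterator new_end =
            thrust::remove_if(V.begin(), V.end(), is_even());

        // Shrink the container to the surviving elements.
        V.resize(thrust::distance(V.begin(), new_end));
        // V now holds {1, 5, 7}.
        return 0;
    }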
- * - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param pred A predicate to evaluate for each element of the range - * [first,last). Elements for which \p pred evaluates to - * \c false are removed from the sequence. - * \return A ForwardIterator pointing to the end of the resulting range of - * elements for which \p pred evaluated to \c true. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, - * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * The following code snippet demonstrates how to use \p remove_if to remove - * all even numbers from an array of integers. - * - * \code - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * int *new_end = thrust::remove_if(A, A + N, is_even()); - * // The first three values of A are now {1, 5, 7} - * // Values beyond new_end are unspecified - * \endcode - * - * \note The meaning of "removal" is somewhat subtle. \p remove_if does not - * destroy any iterators, and does not change the distance between \p first and - * \p last. (There's no way that it could do anything of the sort.) So, for - * example, if \c V is a device_vector, - * remove_if(V.begin(), V.end(), pred) does not change - * V.size(): \c V will contain just as many elements as it did before. - * \p remove_if returns an iterator that points to the end of the resulting - * range after elements have been removed from it; it follows that the elements - * after that iterator are of no interest, and may be discarded. If you are - * removing elements from a - * Sequence, you may - * simply erase them. That is, a reasonable way of removing elements from a - * Sequence is - * S.erase(remove_if(S.begin(), S.end(), pred), S.end()). - * - * \see http://www.sgi.com/tech/stl/remove_if.html - * \see remove - * \see remove_copy - * \see remove_copy_if - */ -template - ForwardIterator remove_if(ForwardIterator first, - ForwardIterator last, - Predicate pred); - - -/*! \p remove_copy_if copies elements from the range [first,last) to a - * range beginning at \p result, except that elements for which \p pred is - * \c true are not copied. The return value is the end of the resulting range. - * This operation is stable, meaning that the relative order of the elements that - * are copied is the same as the range [first,last). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param result The resulting range is copied to the sequence beginning at this - * location. - * \param pred A predicate to evaluate for each element of the range [first,last). - * Elements for which \p pred evaluates to \c false are not copied - * to the resulting sequence. - * \return An OutputIterator pointing to the end of the resulting range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. 
- * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). - * - * The following code snippet demonstrates how to use \p remove_copy_if to copy - * a sequence of numbers to an output range while omitting even numbers using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * const int N = 6; - * int V[N] = {-2, 0, -1, 0, 1, 2}; - * int result[2]; - * thrust::remove_copy_if(thrust::host, V, V + N, result, is_even()); - * // V remains {-2, 0, -1, 0, 1, 2} - * // result is now {-1, 1} - * \endcode - * - * \see http://www.sgi.com/tech/stl/remove_copy_if.html - * \see remove - * \see remove_copy - * \see remove_if - */ -template - OutputIterator remove_copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred); - - -/*! \p remove_copy_if copies elements from the range [first,last) to a - * range beginning at \p result, except that elements for which \p pred is - * \c true are not copied. The return value is the end of the resulting range. - * This operation is stable, meaning that the relative order of the elements that - * are copied is the same as the range [first,last). - * - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param result The resulting range is copied to the sequence beginning at this - * location. - * \param pred A predicate to evaluate for each element of the range [first,last). - * Elements for which \p pred evaluates to \c false are not copied - * to the resulting sequence. - * \return An OutputIterator pointing to the end of the resulting range. - * - * \tparam InputIterator is a model of Input Iterator, - * \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). - * - * The following code snippet demonstrates how to use \p remove_copy_if to copy - * a sequence of numbers to an output range while omitting even numbers. - * - * \code - * #include - * ... - * struct is_even - * { - * __host__ __device__ - * bool operator()(const int x) - * { - * return (x % 2) == 0; - * } - * }; - * ... - * const int N = 6; - * int V[N] = {-2, 0, -1, 0, 1, 2}; - * int result[2]; - * thrust::remove_copy_if(V, V + N, result, is_even()); - * // V remains {-2, 0, -1, 0, 1, 2} - * // result is now {-1, 1} - * \endcode - * - * \see http://www.sgi.com/tech/stl/remove_copy_if.html - * \see remove - * \see remove_copy - * \see remove_if - */ -template - OutputIterator remove_copy_if(InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred); - - -/*! \p remove_if removes from the range [first, last) every element \p x - * such that pred(x) is \c true. That is, \p remove_if returns an - * iterator \c new_last such that the range [first, new_last) contains - * no elements for which \p pred of the corresponding stencil value is \c true. 
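[Illustrative sketch for the remove_copy_if overloads above, not part of the deleted header: instead of sizing the output up front, copy into an over-sized buffer and use the returned iterator to find how many elements were written. Assumes host data and the thrust::host policy.]

    #include <thrust/remove.h>
    #include <thrust/execution_policy.h>

    struct is_even
    {
        __host__ __device__
        bool operator()(int x) const { return (x % 2) == 0; }
    };

    int main()
    {
        const int N = 6;
        int V[N] = {-2, 0, -1, 0, 1, 2};
        int result[N];  // over-sized on purpose

        int *result_end =
            thrust::remove_copy_if(thrust::host, V, V + N, result, is_even());

        int copied = (int)(result_end - result);
        // copied == 2 and result begins with {-1, 1}; V is unchanged.
        (void)copied;
        return 0;
    }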
- * The iterators in the range [new_last,last) are all still dereferenceable, - * but the elements that they point to are unspecified. \p remove_if is stable, - * meaning that the relative order of elements that are not removed is unchanged. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param stencil The beginning of the stencil sequence. - * \param pred A predicate to evaluate for each element of the range - * [stencil, stencil + (last - first)). Elements for which \p pred evaluates to - * \c false are removed from the sequence [first, last) - * \return A ForwardIterator pointing to the end of the resulting range of - * elements for which \p pred evaluated to \c true. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator - * and \p ForwardIterator is mutable. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). - * \pre The range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)). - * - * The following code snippet demonstrates how to use \p remove_if to remove - * specific elements from an array of integers using the \p thrust::host execution policy for - * parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * int S[N] = {0, 1, 1, 1, 0, 0}; - * - * int *new_end = thrust::remove(thrust::host, A, A + N, S, thrust::identity()); - * // The first three values of A are now {1, 5, 7} - * // Values beyond new_end are unspecified - * \endcode - * - * \note The range [first, last) is not permitted to overlap with the range [stencil, stencil + (last - first)). - * - * \see http://www.sgi.com/tech/stl/remove_if.html - * \see remove - * \see remove_copy - * \see remove_copy_if - */ -template - ForwardIterator remove_if(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - - -/*! \p remove_if removes from the range [first, last) every element \p x - * such that pred(x) is \c true. That is, \p remove_if returns an - * iterator \c new_last such that the range [first, new_last) contains - * no elements for which \p pred of the corresponding stencil value is \c true. - * The iterators in the range [new_last,last) are all still dereferenceable, - * but the elements that they point to are unspecified. \p remove_if is stable, - * meaning that the relative order of elements that are not removed is unchanged. - * - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param stencil The beginning of the stencil sequence. - * \param pred A predicate to evaluate for each element of the range - * [stencil, stencil + (last - first)). Elements for which \p pred evaluates to - * \c false are removed from the sequence [first, last) - * \return A ForwardIterator pointing to the end of the resulting range of - * elements for which \p pred evaluated to \c true. 
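[Illustrative sketch for the stencil overload above, not part of the deleted header; the snippet there presumably means thrust::remove_if rather than thrust::remove. Elements of A whose stencil entry is nonzero are removed. Assumes compilation with nvcc.]

    #include <thrust/device_vector.h>
    #include <thrust/remove.h>
    #include <thrust/functional.h>

    int main()
    {
        int a[6] = {1, 4, 2, 8, 5, 7};
        int s[6] = {0, 1, 1, 1, 0, 0};

        thrust::device_vector<int> A(a, a + 6);
        thrust::device_vector<int> S(s, s + 6);

        // Remove A[i] wherever S[i] is nonzero, then shrink A.
        A.erase(thrust::remove_if(A.begin(), A.end(), S.begin(),
                                  thrust::identity<int>()),
                A.end());
        // A now holds {1, 5, 7}.
        return 0;
    }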
- * - * \tparam ForwardIterator is a model of Forward Iterator - * and \p ForwardIterator is mutable. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * - * \pre The range [first, last) shall not overlap the range [result, result + (last - first)). - * \pre The range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)). - * - * The following code snippet demonstrates how to use \p remove_if to remove - * specific elements from an array of integers. - * - * \code - * #include - * ... - * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * int S[N] = {0, 1, 1, 1, 0, 0}; - * - * int *new_end = thrust::remove(A, A + N, S, thrust::identity()); - * // The first three values of A are now {1, 5, 7} - * // Values beyond new_end are unspecified - * \endcode - * - * \note The range [first, last) is not permitted to overlap with the range [stencil, stencil + (last - first)). - * - * \see http://www.sgi.com/tech/stl/remove_if.html - * \see remove - * \see remove_copy - * \see remove_copy_if - */ -template - ForwardIterator remove_if(ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - - -/*! \p remove_copy_if copies elements from the range [first,last) to a - * range beginning at \p result, except that elements for which \p pred of the - * corresponding stencil value is \c true are not copied. The return value is - * the end of the resulting range. This operation is stable, meaning that the - * relative order of the elements that are copied is the same as the - * range [first,last). - * - * The algorithm's execution policy is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param stencil The beginning of the stencil sequence. - * \param result The resulting range is copied to the sequence beginning at this - * location. - * \param pred A predicate to evaluate for each element of the range [first,last). - * Elements for which \p pred evaluates to \c false are not copied - * to the resulting sequence. - * \return An OutputIterator pointing to the end of the resulting range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)). - * - * The following code snippet demonstrates how to use \p remove_copy_if to copy - * a sequence of numbers to an output range while omitting specific elements using the \p thrust::host - * execution policy for parallelization. - * - * \code - * #include - * #include - * ... 
- * const int N = 6; - * int V[N] = {-2, 0, -1, 0, 1, 2}; - * int S[N] = { 1, 1, 0, 1, 0, 1}; - * int result[2]; - * thrust::remove_copy_if(thrust::host, V, V + N, S, result, thrust::identity()); - * // V remains {-2, 0, -1, 0, 1, 2} - * // result is now {-1, 1} - * \endcode - * - * \see http://www.sgi.com/tech/stl/remove_copy_if.html - * \see remove - * \see remove_copy - * \see remove_if - * \see copy_if - */ -template - OutputIterator remove_copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - - -/*! \p remove_copy_if copies elements from the range [first,last) to a - * range beginning at \p result, except that elements for which \p pred of the - * corresponding stencil value is \c true are not copied. The return value is - * the end of the resulting range. This operation is stable, meaning that the - * relative order of the elements that are copied is the same as the - * range [first,last). - * - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param stencil The beginning of the stencil sequence. - * \param result The resulting range is copied to the sequence beginning at this - * location. - * \param pred A predicate to evaluate for each element of the range [first,last). - * Elements for which \p pred evaluates to \c false are not copied - * to the resulting sequence. - * \return An OutputIterator pointing to the end of the resulting range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * - * \pre The range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)). - * - * The following code snippet demonstrates how to use \p remove_copy_if to copy - * a sequence of numbers to an output range while omitting specific elements. - * - * \code - * #include - * ... - * const int N = 6; - * int V[N] = {-2, 0, -1, 0, 1, 2}; - * int S[N] = { 1, 1, 0, 1, 0, 1}; - * int result[2]; - * thrust::remove_copy_if(V, V + N, S, result, thrust::identity()); - * // V remains {-2, 0, -1, 0, 1, 2} - * // result is now {-1, 1} - * \endcode - * - * \see http://www.sgi.com/tech/stl/remove_copy_if.html - * \see remove - * \see remove_copy - * \see remove_if - * \see copy_if - */ -template - OutputIterator remove_copy_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - - -/*! \} // end stream_compaction - */ - - -} // end thrust - -#include - diff --git a/compat/thrust/replace.h b/compat/thrust/replace.h deleted file mode 100644 index 48e3e49e6c..0000000000 --- a/compat/thrust/replace.h +++ /dev/null @@ -1,817 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file replace.h - * \brief Functions for replacing elements in a range with a particular value - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup transformations - * \addtogroup replacing - * \ingroup transformations - * \{ - */ - - -/*! \p replace replaces every element in the range [first, last) equal to \p old_value - * with \p new_value. That is: for every iterator \c i, if *i == old_value - * then it performs the assignment *i = new_value. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence of interest. - * \param last The end of the sequence of interest. - * \param old_value The value to replace. - * \param new_value The new value to replace \p old_value. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam T is a model of Assignable, - * \p T is a model of EqualityComparable, - * objects of \p T may be compared for equality with objects of - * \p ForwardIterator's \c value_type, - * and \p T is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p replace to replace - * a value of interest in a \c device_vector with another using the \p thrust::device - * execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * - * ... - * - * thrust::device_vector A(4); - * A[0] = 1; - * A[1] = 2; - * A[2] = 3; - * A[3] = 1; - * - * thrust::replace(thrust::device, A.begin(), A.end(), 1, 99); - * - * // A contains [99, 2, 3, 99] - * \endcode - * - * \see http://www.sgi.com/tech/stl/replace.html - * \see \c replace_if - * \see \c replace_copy - * \see \c replace_copy_if - */ -template - void replace(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, ForwardIterator last, - const T &old_value, - const T &new_value); - - -/*! \p replace replaces every element in the range [first, last) equal to \p old_value - * with \p new_value. That is: for every iterator \c i, if *i == old_value - * then it performs the assignment *i = new_value. - * - * \param first The beginning of the sequence of interest. - * \param last The end of the sequence of interest. - * \param old_value The value to replace. - * \param new_value The new value to replace \p old_value. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam T is a model of Assignable, - * \p T is a model of EqualityComparable, - * objects of \p T may be compared for equality with objects of - * \p ForwardIterator's \c value_type, - * and \p T is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p replace to replace - * a value of interest in a \c device_vector with another. - * - * \code - * #include - * #include - * - * ... 
- * - * thrust::device_vector A(4); - * A[0] = 1; - * A[1] = 2; - * A[2] = 3; - * A[3] = 1; - * - * thrust::replace(A.begin(), A.end(), 1, 99); - * - * // A contains [99, 2, 3, 99] - * \endcode - * - * \see http://www.sgi.com/tech/stl/replace.html - * \see \c replace_if - * \see \c replace_copy - * \see \c replace_copy_if - */ -template - void replace(ForwardIterator first, ForwardIterator last, const T &old_value, - const T &new_value); - - -/*! \p replace_if replaces every element in the range [first, last) for which - * \p pred returns \c true with \p new_value. That is: for every iterator \c i, if - * pred(*i) is \c true then it performs the assignment *i = new_value. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence of interest. - * \param last The end of the sequence of interest. - * \param pred The predicate to test on every value of the range [first,last). - * \param new_value The new value to replace elements which pred(*i) evaluates - * to \c true. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, - * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p replace_if to replace - * a \c device_vector's negative elements with \c 0 using the \p thrust::device execution policy - * for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * struct is_less_than_zero - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x < 0; - * } - * }; - * - * ... - * - * thrust::device_vector A(4); - * A[0] = 1; - * A[1] = -3; - * A[2] = 2; - * A[3] = -1; - * - * is_less_than_zero pred; - * - * thrust::replace_if(thrust::device, A.begin(), A.end(), pred, 0); - * - * // A contains [1, 0, 2, 0] - * \endcode - * - * \see http://www.sgi.com/tech/stl/replace_if.html - * \see \c replace - * \see \c replace_copy - * \see \c replace_copy_if - */ -template - void replace_if(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, ForwardIterator last, - Predicate pred, - const T &new_value); - - -/*! \p replace_if replaces every element in the range [first, last) for which - * \p pred returns \c true with \p new_value. That is: for every iterator \c i, if - * pred(*i) is \c true then it performs the assignment *i = new_value. - * - * \param first The beginning of the sequence of interest. - * \param last The end of the sequence of interest. - * \param pred The predicate to test on every value of the range [first,last). - * \param new_value The new value to replace elements which pred(*i) evaluates - * to \c true. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, - * and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p replace_if to replace - * a \c device_vector's negative elements with \c 0. - * - * \code - * #include - * #include - * ... 
- * struct is_less_than_zero - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x < 0; - * } - * }; - * - * ... - * - * thrust::device_vector A(4); - * A[0] = 1; - * A[1] = -3; - * A[2] = 2; - * A[3] = -1; - * - * is_less_than_zero pred; - * - * thrust::replace_if(A.begin(), A.end(), pred, 0); - * - * // A contains [1, 0, 2, 0] - * \endcode - * - * \see http://www.sgi.com/tech/stl/replace_if.html - * \see \c replace - * \see \c replace_copy - * \see \c replace_copy_if - */ -template - void replace_if(ForwardIterator first, ForwardIterator last, - Predicate pred, - const T &new_value); - - -/*! \p replace_if replaces every element in the range [first, last) for which - * pred(*s) returns \c true with \p new_value. That is: for every iterator - * \c i in the range [first, last), and \c s in the range [stencil, stencil + (last - first)), - * if pred(*s) is \c true then it performs the assignment *i = new_value. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence of interest. - * \param last The end of the sequence of interest. - * \param stencil The beginning of the stencil sequence. - * \param pred The predicate to test on every value of the range [first,last). - * \param new_value The new value to replace elements which pred(*i) evaluates - * to \c true. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p replace_if to replace - * a \c device_vector's element with \c 0 when its corresponding stencil element is less than zero - * using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * - * struct is_less_than_zero - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x < 0; - * } - * }; - * - * ... - * - * thrust::device_vector A(4); - * A[0] = 10; - * A[1] = 20; - * A[2] = 30; - * A[3] = 40; - * - * thrust::device_vector S(4); - * S[0] = -1; - * S[1] = 0; - * S[2] = -1; - * S[3] = 0; - * - * is_less_than_zero pred; - * thrust::replace_if(thrust::device, A.begin(), A.end(), S.begin(), pred, 0); - * - * // A contains [0, 20, 0, 40] - * \endcode - * - * \see http://www.sgi.com/tech/stl/replace_if.html - * \see \c replace - * \see \c replace_copy - * \see \c replace_copy_if - */ -template - void replace_if(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, ForwardIterator last, - InputIterator stencil, - Predicate pred, - const T &new_value); - - -/*! \p replace_if replaces every element in the range [first, last) for which - * pred(*s) returns \c true with \p new_value. That is: for every iterator - * \c i in the range [first, last), and \c s in the range [stencil, stencil + (last - first)), - * if pred(*s) is \c true then it performs the assignment *i = new_value. - * - * \param first The beginning of the sequence of interest. - * \param last The end of the sequence of interest. - * \param stencil The beginning of the stencil sequence. 
- * \param pred The predicate to test on every value of the range [first,last). - * \param new_value The new value to replace elements which pred(*i) evaluates - * to \c true. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam Predicate is a model of Predicate. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p replace_if to replace - * a \c device_vector's element with \c 0 when its corresponding stencil element is less than zero. - * - * \code - * #include - * #include - * - * struct is_less_than_zero - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x < 0; - * } - * }; - * - * ... - * - * thrust::device_vector A(4); - * A[0] = 10; - * A[1] = 20; - * A[2] = 30; - * A[3] = 40; - * - * thrust::device_vector S(4); - * S[0] = -1; - * S[1] = 0; - * S[2] = -1; - * S[3] = 0; - * - * is_less_than_zero pred; - * thrust::replace_if(A.begin(), A.end(), S.begin(), pred, 0); - * - * // A contains [0, 20, 0, 40] - * \endcode - * - * \see http://www.sgi.com/tech/stl/replace_if.html - * \see \c replace - * \see \c replace_copy - * \see \c replace_copy_if - */ -template - void replace_if(ForwardIterator first, ForwardIterator last, - InputIterator stencil, - Predicate pred, - const T &new_value); - - -/*! \p replace_copy copies elements from the range [first, last) to the range - * [result, result + (last-first)), except that any element equal to \p old_value - * is not copied; \p new_value is copied instead. - * - * More precisely, for every integer \c n such that 0 <= n < last-first, \p replace_copy - * performs the assignment *(result+n) = new_value if *(first+n) == old_value, - * and *(result+n) = *(first+n) otherwise. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence to copy from. - * \param last The end of the sequence to copy from. - * \param result The beginning of the sequence to copy to. - * \param old_value The value to replace. - * \param new_value The replacement value for which *i == old_value evaluates to \c true. - * \return result + (last-first) - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam T is a model of Assignable, - * \p T is a model of Equality Comparable, - * \p T may be compared for equality with \p InputIterator's \c value_type, - * and \p T is convertible to \p OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise. - * - * \code - * #include - * #include - * #include - * ... 
- * thrust::device_vector A(4); - * A[0] = 1; - * A[1] = 2; - * A[2] = 3; - * A[3] = 1; - * - * thrust::device_vector B(4); - * - * thrust::replace_copy(thrust::device, A.begin(), A.end(), B.begin(), 1, 99); - * - * // B contains [99, 2, 3, 99] - * \endcode - * - * \see http://www.sgi.com/tech/stl/replace_copy.html - * \see \c copy - * \see \c replace - * \see \c replace_if - * \see \c replace_copy_if - */ -template - OutputIterator replace_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - OutputIterator result, - const T &old_value, - const T &new_value); - - -/*! \p replace_copy copies elements from the range [first, last) to the range - * [result, result + (last-first)), except that any element equal to \p old_value - * is not copied; \p new_value is copied instead. - * - * More precisely, for every integer \c n such that 0 <= n < last-first, \p replace_copy - * performs the assignment *(result+n) = new_value if *(first+n) == old_value, - * and *(result+n) = *(first+n) otherwise. - * - * \param first The beginning of the sequence to copy from. - * \param last The end of the sequence to copy from. - * \param result The beginning of the sequence to copy to. - * \param old_value The value to replace. - * \param new_value The replacement value for which *i == old_value evaluates to \c true. - * \return result + (last-first) - * - * \tparam InputIterator is a model of Input Iterator. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam T is a model of Assignable, - * \p T is a model of Equality Comparable, - * \p T may be compared for equality with \p InputIterator's \c value_type, - * and \p T is convertible to \p OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise. - * - * \code - * #include - * #include - * ... - * thrust::device_vector A(4); - * A[0] = 1; - * A[1] = 2; - * A[2] = 3; - * A[3] = 1; - * - * thrust::device_vector B(4); - * - * thrust::replace_copy(A.begin(), A.end(), B.begin(), 1, 99); - * - * // B contains [99, 2, 3, 99] - * \endcode - * - * \see http://www.sgi.com/tech/stl/replace_copy.html - * \see \c copy - * \see \c replace - * \see \c replace_if - * \see \c replace_copy_if - */ -template - OutputIterator replace_copy(InputIterator first, InputIterator last, - OutputIterator result, const T &old_value, - const T &new_value); - - -/*! \p replace_copy_if copies elements from the range [first, last) to the range - * [result, result + (last-first)), except that any element for which \p pred - * is \c true is not copied; \p new_value is copied instead. - * - * More precisely, for every integer \c n such that 0 <= n < last-first, - * \p replace_copy_if performs the assignment *(result+n) = new_value if - * pred(*(first+n)), and *(result+n) = *(first+n) otherwise. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence to copy from. - * \param last The end of the sequence to copy from. - * \param result The beginning of the sequence to copy to. - * \param pred The predicate to test on every value of the range [first,last). - * \param new_value The replacement value to assign pred(*i) evaluates to \c true. - * \return result + (last-first) - * - * \tparam DerivedPolicy The name of the derived execution policy. 
- * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise. - * - * \code - * #include - * #include - * #include - * - * struct is_less_than_zero - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x < 0; - * } - * }; - * - * ... - * - * thrust::device_vector A(4); - * A[0] = 1; - * A[1] = -3; - * A[2] = 2; - * A[3] = -1; - - * thrust::device_vector B(4); - * is_less_than_zero pred; - * - * thrust::replace_copy_if(thrust::device, A.begin(), A.end(), B.begin(), pred, 0); - * - * // B contains [1, 0, 2, 0] - * \endcode - * - * \see http://www.sgi.com/tech/stl/replace_copy_if.html - * \see \c replace - * \see \c replace_if - * \see \c replace_copy - */ -template - OutputIterator replace_copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - OutputIterator result, - Predicate pred, - const T &new_value); - - -/*! \p replace_copy_if copies elements from the range [first, last) to the range - * [result, result + (last-first)), except that any element for which \p pred - * is \c true is not copied; \p new_value is copied instead. - * - * More precisely, for every integer \c n such that 0 <= n < last-first, - * \p replace_copy_if performs the assignment *(result+n) = new_value if - * pred(*(first+n)), and *(result+n) = *(first+n) otherwise. - * - * \param first The beginning of the sequence to copy from. - * \param last The end of the sequence to copy from. - * \param result The beginning of the sequence to copy to. - * \param pred The predicate to test on every value of the range [first,last). - * \param new_value The replacement value to assign pred(*i) evaluates to \c true. - * \return result + (last-first) - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise. - * - * \code - * #include - * #include - * - * struct is_less_than_zero - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x < 0; - * } - * }; - * - * ... - * - * thrust::device_vector A(4); - * A[0] = 1; - * A[1] = -3; - * A[2] = 2; - * A[3] = -1; - - * thrust::device_vector B(4); - * is_less_than_zero pred; - * - * thrust::replace_copy_if(A.begin(), A.end(), B.begin(), pred, 0); - * - * // B contains [1, 0, 2, 0] - * \endcode - * - * \see http://www.sgi.com/tech/stl/replace_copy_if.html - * \see \c replace - * \see \c replace_if - * \see \c replace_copy - */ -template - OutputIterator replace_copy_if(InputIterator first, InputIterator last, - OutputIterator result, - Predicate pred, - const T &new_value); - - -/*! 
This version of \p replace_copy_if copies elements from the range [first, last) to the range - * [result, result + (last-first)), except that any element whose corresponding stencil - * element causes \p pred to be \c true is not copied; \p new_value is copied instead. - * - * More precisely, for every integer \c n such that 0 <= n < last-first, - * \p replace_copy_if performs the assignment *(result+n) = new_value if - * pred(*(stencil+n)), and *(result+n) = *(first+n) otherwise. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence to copy from. - * \param last The end of the sequence to copy from. - * \param stencil The beginning of the stencil sequence. - * \param result The beginning of the sequence to copy to. - * \param pred The predicate to test on every value of the range [stencil, stencil + (last - first)). - * \param new_value The replacement value to assign when pred(*s) evaluates to \c true. - * \return result + (last-first) - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator. - * \tparam InputIterator2 is a model of Input Iterator - * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise. - * \pre \p stencil may equal \p result, but the ranges [stencil, stencil + (last - first)) and [result, result + (last - first)) shall not overlap otherwise. - * - * \code - * #include - * #include - * #include - * - * struct is_less_than_zero - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x < 0; - * } - * }; - * - * ... - * - * thrust::device_vector A(4); - * A[0] = 10; - * A[1] = 20; - * A[2] = 30; - * A[3] = 40; - * - * thrust::device_vector S(4); - * S[0] = -1; - * S[1] = 0; - * S[2] = -1; - * S[3] = 0; - * - * thrust::device_vector B(4); - * is_less_than_zero pred; - * - * thrust::replace_if(thrust::device, A.begin(), A.end(), S.begin(), B.begin(), pred, 0); - * - * // B contains [0, 20, 0, 40] - * \endcode - * - * \see \c replace_copy - * \see \c replace_if - */ -template - OutputIterator replace_copy_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred, - const T &new_value); - - -/*! This version of \p replace_copy_if copies elements from the range [first, last) to the range - * [result, result + (last-first)), except that any element whose corresponding stencil - * element causes \p pred to be \c true is not copied; \p new_value is copied instead. - * - * More precisely, for every integer \c n such that 0 <= n < last-first, - * \p replace_copy_if performs the assignment *(result+n) = new_value if - * pred(*(stencil+n)), and *(result+n) = *(first+n) otherwise. - * - * \param first The beginning of the sequence to copy from. - * \param last The end of the sequence to copy from. - * \param stencil The beginning of the stencil sequence. - * \param result The beginning of the sequence to copy to. 
- * \param pred The predicate to test on every value of the range [stencil, stencil + (last - first)). - * \param new_value The replacement value to assign when pred(*s) evaluates to \c true. - * \return result + (last-first) - * - * \tparam InputIterator1 is a model of Input Iterator. - * \tparam InputIterator2 is a model of Input Iterator - * and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam Predicate is a model of Predicate. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the ranges [first, last) and [result, result + (last - first)) shall not overlap otherwise. - * \pre \p stencil may equal \p result, but the ranges [stencil, stencil + (last - first)) and [result, result + (last - first)) shall not overlap otherwise. - * - * \code - * #include - * #include - * - * struct is_less_than_zero - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x < 0; - * } - * }; - * - * ... - * - * thrust::device_vector A(4); - * A[0] = 10; - * A[1] = 20; - * A[2] = 30; - * A[3] = 40; - * - * thrust::device_vector S(4); - * S[0] = -1; - * S[1] = 0; - * S[2] = -1; - * S[3] = 0; - * - * thrust::device_vector B(4); - * is_less_than_zero pred; - * - * thrust::replace_if(A.begin(), A.end(), S.begin(), B.begin(), pred, 0); - * - * // B contains [0, 20, 0, 40] - * \endcode - * - * \see \c replace_copy - * \see \c replace_if - */ -template - OutputIterator replace_copy_if(InputIterator1 first, InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred, - const T &new_value); - - -/*! \} // end replacing - * \} // transformations - */ - - -} // end thrust - -#include - diff --git a/compat/thrust/reverse.h b/compat/thrust/reverse.h deleted file mode 100644 index ba50c5d05a..0000000000 --- a/compat/thrust/reverse.h +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reverse.h - * \brief Reverses the order of a range - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup reordering - * \ingroup algorithms - */ - - -/*! \p reverse reverses a range. That is: for every i such that - * 0 <= i <= (last - first) / 2, it exchanges *(first + i) - * and *(last - (i + 1)). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range to reverse. - * \param last The end of the range to reverse. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam BidirectionalIterator is a model of Bidirectional Iterator and - * \p BidirectionalIterator is mutable. 
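[Before the reverse snippets below, an illustrative sketch for the stencil form of replace_copy_if documented above; it is not part of the deleted header, and the snippets there presumably mean thrust::replace_copy_if rather than thrust::replace_if. Assumes compilation with nvcc.]

    #include <thrust/device_vector.h>
    #include <thrust/replace.h>

    struct is_less_than_zero
    {
        __host__ __device__
        bool operator()(int x) const { return x < 0; }
    };

    int main()
    {
        int a[4] = {10, 20, 30, 40};
        int s[4] = {-1, 0, -1, 0};

        thrust::device_vector<int> A(a, a + 4);
        thrust::device_vector<int> S(s, s + 4);
        thrust::device_vector<int> B(4);

        // Where the stencil is negative, write 0 into B instead of copying from A.
        thrust::replace_copy_if(A.begin(), A.end(), S.begin(), B.begin(),
                                is_less_than_zero(), 0);
        // B holds {0, 20, 0, 40}; A and S are unchanged.
        return 0;
    }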
- * - * The following code snippet demonstrates how to use \p reverse to reverse a - * \p device_vector of integers using the \p thrust::device execution policy for - * parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 6; - * int data[N] = {0, 1, 2, 3, 4, 5}; - * thrust::device_vector v(data, data + N); - * thrust::reverse(thrust::device, v.begin(), v.end()); - * // v is now {5, 4, 3, 2, 1, 0} - * \endcode - * - * \see http://www.sgi.com/tech/stl/reverse.html - * \see \p reverse_copy - * \see \p reverse_iterator - */ -template - void reverse(const thrust::detail::execution_policy_base &exec, - BidirectionalIterator first, - BidirectionalIterator last); - - -/*! \p reverse reverses a range. That is: for every i such that - * 0 <= i <= (last - first) / 2, it exchanges *(first + i) - * and *(last - (i + 1)). - * - * \param first The beginning of the range to reverse. - * \param last The end of the range to reverse. - * - * \tparam BidirectionalIterator is a model of Bidirectional Iterator and - * \p BidirectionalIterator is mutable. - * - * The following code snippet demonstrates how to use \p reverse to reverse a - * \p device_vector of integers. - * - * \code - * #include - * ... - * const int N = 6; - * int data[N] = {0, 1, 2, 3, 4, 5}; - * thrust::device_vector v(data, data + N); - * thrust::reverse(v.begin(), v.end()); - * // v is now {5, 4, 3, 2, 1, 0} - * \endcode - * - * \see http://www.sgi.com/tech/stl/reverse.html - * \see \p reverse_copy - * \see \p reverse_iterator - */ -template - void reverse(BidirectionalIterator first, - BidirectionalIterator last); - - -/*! \p reverse_copy differs from \ref reverse only in that the reversed range - * is written to a different output range, rather than inplace. - * - * \p reverse_copy copies elements from the range [first, last) to the - * range [result, result + (last - first)) such that the copy is a - * reverse of the original range. Specifically: for every i such that - * 0 <= i < (last - first), \p reverse_copy performs the assignment - * *(result + (last - first) - i) = *(first + i). - * - * The return value is result + (last - first)). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range to reverse. - * \param last The end of the range to reverse. - * \param result The beginning of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam BidirectionalIterator is a model of Bidirectional Iterator, - * and \p BidirectionalIterator's \p value_type is convertible to \p OutputIterator's \p value_type. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The range [first, last) and the range [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p reverse_copy to reverse - * an input \p device_vector of integers to an output \p device_vector using the \p thrust::device - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * const int N = 6; - * int data[N] = {0, 1, 2, 3, 4, 5}; - * thrust::device_vector input(data, data + N); - * thrust::device_vector output(N); - * thrust::reverse_copy(thrust::device, v.begin(), v.end(), output.begin()); - * // input is still {0, 1, 2, 3, 4, 5} - * // output is now {5, 4, 3, 2, 1, 0} - * \endcode - * - * \see http://www.sgi.com/tech/stl/reverse_copy.html - * \see \p reverse - * \see \p reverse_iterator - */ -template - OutputIterator reverse_copy(const thrust::detail::execution_policy_base &exec, - BidirectionalIterator first, - BidirectionalIterator last, - OutputIterator result); - - -/*! \p reverse_copy differs from \ref reverse only in that the reversed range - * is written to a different output range, rather than inplace. - * - * \p reverse_copy copies elements from the range [first, last) to the - * range [result, result + (last - first)) such that the copy is a - * reverse of the original range. Specifically: for every i such that - * 0 <= i < (last - first), \p reverse_copy performs the assignment - * *(result + (last - first) - i) = *(first + i). - * - * The return value is result + (last - first)). - * - * \param first The beginning of the range to reverse. - * \param last The end of the range to reverse. - * \param result The beginning of the output range. - * - * \tparam BidirectionalIterator is a model of Bidirectional Iterator, - * and \p BidirectionalIterator's \p value_type is convertible to \p OutputIterator's \p value_type. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The range [first, last) and the range [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p reverse_copy to reverse - * an input \p device_vector of integers to an output \p device_vector. - * - * \code - * #include - * ... - * const int N = 6; - * int data[N] = {0, 1, 2, 3, 4, 5}; - * thrust::device_vector input(data, data + N); - * thrust::device_vector output(N); - * thrust::reverse_copy(v.begin(), v.end(), output.begin()); - * // input is still {0, 1, 2, 3, 4, 5} - * // output is now {5, 4, 3, 2, 1, 0} - * \endcode - * - * \see http://www.sgi.com/tech/stl/reverse_copy.html - * \see \p reverse - * \see \p reverse_iterator - */ -template - OutputIterator reverse_copy(BidirectionalIterator first, - BidirectionalIterator last, - OutputIterator result); - - -/*! \} // end reordering - */ - - -} // end thrust - -#include - diff --git a/compat/thrust/scan.h b/compat/thrust/scan.h deleted file mode 100644 index 95074e6b90..0000000000 --- a/compat/thrust/scan.h +++ /dev/null @@ -1,1552 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file scan.h - * \brief Functions for computing prefix sums - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup algorithms - */ - - -/*! \addtogroup prefixsums Prefix Sums - * \ingroup algorithms - * \{ - */ - - -/*! 
\p inclusive_scan computes an inclusive prefix sum operation. The - * term 'inclusive' means that each result includes the corresponding - * input operand in the partial sum. More precisely, *first is - * assigned to *result and the sum of *first and - * *(first + 1) is assigned to *(result + 1), and so on. - * This version of \p inclusive_scan assumes plus as the associative operator. - * When the input and output sequences are the same, the scan is performed - * in-place. - - * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary - * difference between the two functions is that \c std::partial_sum guarantees - * a serial summation order, while \p inclusive_scan requires associativity of - * the binary operation to parallelize the prefix sum. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to - * \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's - * \c value_type, then x + y is defined. If \c T is - * \c OutputIterator's \c value_type, then T(0) is - * defined. - * - * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p inclusive_scan to compute an in-place - * prefix sum using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * - * int data[6] = {1, 0, 2, 2, 1, 3}; - * - * thrust::inclusive_scan(thrust::host, data, data + 6, data); // in-place scan - * - * // data is now {1, 1, 3, 5, 6, 9} - * \endcode - * - * \see http://www.sgi.com/tech/stl/partial_sum.html - * - */ -template - OutputIterator inclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -/*! \p inclusive_scan computes an inclusive prefix sum operation. The - * term 'inclusive' means that each result includes the corresponding - * input operand in the partial sum. More precisely, *first is - * assigned to *result and the sum of *first and - * *(first + 1) is assigned to *(result + 1), and so on. - * This version of \p inclusive_scan assumes plus as the associative operator. - * When the input and output sequences are the same, the scan is performed - * in-place. - - * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary - * difference between the two functions is that \c std::partial_sum guarantees - * a serial summation order, while \p inclusive_scan requires associativity of - * the binary operation to parallelize the prefix sum. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \return The end of the output sequence. - * - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to - * \c OutputIterator's \c value_type. 
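A minimal runnable version of the in-place inclusive_scan example described above, with the assumed headers <thrust/scan.h> and <thrust/execution_policy.h> filled in:

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

int main()
{
    int data[6] = {1, 0, 2, 2, 1, 3};

    // in-place inclusive prefix sum on the host; each output includes its own input operand
    thrust::inclusive_scan(thrust::host, data, data + 6, data);

    // data is now {1, 1, 3, 5, 6, 9}
    return 0;
}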
- * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's - * \c value_type, then x + y is defined. If \c T is - * \c OutputIterator's \c value_type, then T(0) is - * defined. - * - * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p inclusive_scan - * - * \code - * #include - * - * int data[6] = {1, 0, 2, 2, 1, 3}; - * - * thrust::inclusive_scan(data, data + 6, data); // in-place scan - * - * // data is now {1, 1, 3, 5, 6, 9} - * \endcode - * - * \see http://www.sgi.com/tech/stl/partial_sum.html - * - */ -template - OutputIterator inclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result); - - -/*! \p inclusive_scan computes an inclusive prefix sum operation. The - * term 'inclusive' means that each result includes the corresponding - * input operand in the partial sum. When the input and output sequences - * are the same, the scan is performed in-place. - * - * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary - * difference between the two functions is that \c std::partial_sum guarantees - * a serial summation order, while \p inclusive_scan requires associativity of - * the binary operation to parallelize the prefix sum. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param binary_op The associatve operator used to 'sum' values. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to - * \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator - * and \c OutputIterator's \c value_type is convertible to - * both \c AssociativeOperator's \c first_argument_type and - * \c second_argument_type. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is - * convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p inclusive_scan to compute an in-place - * prefix sum using the \p thrust::host execution policy for parallelization: - * - * \code - * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * - * thrust::maximum binary_op; - * - * thrust::inclusive_scan(thrust::host, data, data + 10, data, binary_op); // in-place scan - * - * // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8} - * \endcode - * - * \see http://www.sgi.com/tech/stl/partial_sum.html - */ -template - OutputIterator inclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op); - - -/*! \p inclusive_scan computes an inclusive prefix sum operation. The - * term 'inclusive' means that each result includes the corresponding - * input operand in the partial sum. When the input and output sequences - * are the same, the scan is performed in-place. 
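The operator-taking overload documented here accepts any associative binary operator; a short sketch with thrust::maximum<int> (headers assumed to be <thrust/scan.h> and <thrust/functional.h>) computes a running maximum:

#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
    thrust::maximum<int> binary_op;

    // in-place inclusive scan that uses max as the 'sum': a running maximum of each prefix
    thrust::inclusive_scan(thrust::host, data, data + 10, data, binary_op);

    // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8}
    return 0;
}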
- * - * \p inclusive_scan is similar to \c std::partial_sum in the STL. The primary - * difference between the two functions is that \c std::partial_sum guarantees - * a serial summation order, while \p inclusive_scan requires associativity of - * the binary operation to parallelize the prefix sum. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param binary_op The associatve operator used to 'sum' values. - * \return The end of the output sequence. - * - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to - * \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator - * and \c OutputIterator's \c value_type is convertible to - * both \c AssociativeOperator's \c first_argument_type and - * \c second_argument_type. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is - * convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p inclusive_scan - * - * \code - * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * - * thrust::maximum binary_op; - * - * thrust::inclusive_scan(data, data + 10, data, binary_op); // in-place scan - * - * // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8} - * \endcode - * - * \see http://www.sgi.com/tech/stl/partial_sum.html - */ -template - OutputIterator inclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op); - - -/*! \p exclusive_scan computes an exclusive prefix sum operation. The - * term 'exclusive' means that each result does not include the - * corresponding input operand in the partial sum. More precisely, - * 0 is assigned to *result and the sum of - * 0 and *first is assigned to *(result + 1), - * and so on. This version of \p exclusive_scan assumes plus as the - * associative operator and \c 0 as the initial value. When the input and - * output sequences are the same, the scan is performed in-place. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to - * \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's - * \c value_type, then x + y is defined. If \c T is - * \c OutputIterator's \c value_type, then T(0) is - * defined. - * - * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan to compute an in-place - * prefix sum using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * - * int data[6] = {1, 0, 2, 2, 1, 3}; - * - * thrust::exclusive_scan(thrust::host, data, data + 6, data); // in-place scan - * - * // data is now {0, 1, 1, 3, 5, 6} - * \endcode - * - * \see http://www.sgi.com/tech/stl/partial_sum.html - */ -template - OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -/*! \p exclusive_scan computes an exclusive prefix sum operation. The - * term 'exclusive' means that each result does not include the - * corresponding input operand in the partial sum. More precisely, - * 0 is assigned to *result and the sum of - * 0 and *first is assigned to *(result + 1), - * and so on. This version of \p exclusive_scan assumes plus as the - * associative operator and \c 0 as the initial value. When the input and - * output sequences are the same, the scan is performed in-place. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \return The end of the output sequence. - * - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to - * \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's - * \c value_type, then x + y is defined. If \c T is - * \c OutputIterator's \c value_type, then T(0) is - * defined. - * - * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan - * - * \code - * #include - * - * int data[6] = {1, 0, 2, 2, 1, 3}; - * - * thrust::exclusive_scan(data, data + 6, data); // in-place scan - * - * // data is now {0, 1, 1, 3, 5, 6} - * \endcode - * - * \see http://www.sgi.com/tech/stl/partial_sum.html - */ -template - OutputIterator exclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result); - - -/*! \p exclusive_scan computes an exclusive prefix sum operation. The - * term 'exclusive' means that each result does not include the - * corresponding input operand in the partial sum. More precisely, - * \p init is assigned to *result and the sum of \p init and - * *first is assigned to *(result + 1), and so on. - * This version of \p exclusive_scan assumes plus as the associative - * operator but requires an initial value \p init. When the input and - * output sequences are the same, the scan is performed in-place. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param init The initial value. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to - * \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's - * \c value_type, then x + y is defined. - * \tparam T is convertible to \c OutputIterator's \c value_type. 
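For the plain exclusive form, a compilable sketch under the same header assumptions; the first output is 0 and each result excludes its own input operand:

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

int main()
{
    int data[6] = {1, 0, 2, 2, 1, 3};

    // in-place exclusive prefix sum with the default initial value 0
    thrust::exclusive_scan(thrust::host, data, data + 6, data);

    // data is now {0, 1, 1, 3, 5, 6}
    return 0;
}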
- * - * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan to compute an in-place - * prefix sum using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * - * int data[6] = {1, 0, 2, 2, 1, 3}; - * - * thrust::exclusive_scan(thrust::host, data, data + 6, data, 4); // in-place scan - * - * // data is now {4, 5, 5, 7, 9, 10} - * \endcode - * - * \see http://www.sgi.com/tech/stl/partial_sum.html - */ -template - OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init); - - -/*! \p exclusive_scan computes an exclusive prefix sum operation. The - * term 'exclusive' means that each result does not include the - * corresponding input operand in the partial sum. More precisely, - * \p init is assigned to *result and the sum of \p init and - * *first is assigned to *(result + 1), and so on. - * This version of \p exclusive_scan assumes plus as the associative - * operator but requires an initial value \p init. When the input and - * output sequences are the same, the scan is performed in-place. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param init The initial value. - * \return The end of the output sequence. - * - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to - * \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's - * \c value_type, then x + y is defined. - * \tparam T is convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan - * - * \code - * #include - * - * int data[6] = {1, 0, 2, 2, 1, 3}; - * - * thrust::exclusive_scan(data, data + 6, data, 4); // in-place scan - * - * // data is now {4, 5, 5, 7, 9, 10} - * \endcode - * - * \see http://www.sgi.com/tech/stl/partial_sum.html - */ -template - OutputIterator exclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - T init); - - -/*! \p exclusive_scan computes an exclusive prefix sum operation. The - * term 'exclusive' means that each result does not include the - * corresponding input operand in the partial sum. More precisely, - * \p init is assigned to \*result and the value - * binary_op(init, \*first) is assigned to \*(result + 1), - * and so on. This version of the function requires both and associative - * operator and an initial value \p init. When the input and output - * sequences are the same, the scan is performed in-place. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param init The initial value. - * \param binary_op The associatve operator used to 'sum' values. - * \return The end of the output sequence. 
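With an explicit initial value, the first output becomes init rather than 0; a sketch of the init-taking overload (headers assumed as before):

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

int main()
{
    int data[6] = {1, 0, 2, 2, 1, 3};

    // exclusive scan seeded with 4: result[0] == 4, then 4+1, 4+1+0, ...
    thrust::exclusive_scan(thrust::host, data, data + 6, data, 4);

    // data is now {4, 5, 5, 7, 9, 10}
    return 0;
}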
- * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to - * \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator - * and \c OutputIterator's \c value_type is convertible to - * both \c AssociativeOperator's \c first_argument_type and - * \c second_argument_type. - * \tparam T is convertible to \c OutputIterator's \c value_type. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is - * convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan to compute an in-place - * prefix sum using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * - * thrust::maximum binary_op; - * - * thrust::exclusive_scan(thrust::host, data, data + 10, data, 1, binary_op); // in-place scan - * - * // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 } - * \endcode - * - * \see http://www.sgi.com/tech/stl/partial_sum.html - */ -template - OutputIterator exclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init, - AssociativeOperator binary_op); - - -/*! \p exclusive_scan computes an exclusive prefix sum operation. The - * term 'exclusive' means that each result does not include the - * corresponding input operand in the partial sum. More precisely, - * \p init is assigned to \*result and the value - * binary_op(init, \*first) is assigned to \*(result + 1), - * and so on. This version of the function requires both and associative - * operator and an initial value \p init. When the input and output - * sequences are the same, the scan is performed in-place. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param init The initial value. - * \param binary_op The associatve operator used to 'sum' values. - * \return The end of the output sequence. - * - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to - * \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator - * and \c OutputIterator's \c value_type is convertible to - * both \c AssociativeOperator's \c first_argument_type and - * \c second_argument_type. - * \tparam T is convertible to \c OutputIterator's \c value_type. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is - * convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. 
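The most general exclusive_scan takes both an initial value and an associative operator; a sketch with init = 1 and thrust::maximum<int>, matching the example values above:

#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};
    thrust::maximum<int> binary_op;

    // exclusive running maximum, seeded with 1
    thrust::exclusive_scan(thrust::host, data, data + 10, data, 1, binary_op);

    // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4}
    return 0;
}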
- * - * The following code snippet demonstrates how to use \p exclusive_scan - * - * \code - * #include - * #include - * - * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * - * thrust::maximum binary_op; - * - * thrust::exclusive_scan(data, data + 10, data, 1, binary_op); // in-place scan - * - * // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 } - * \endcode - * - * \see http://www.sgi.com/tech/stl/partial_sum.html - */ -template - OutputIterator exclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - T init, - AssociativeOperator binary_op); - - -/*! \addtogroup segmentedprefixsums Segmented Prefix Sums - * \ingroup prefixsums - * \{ - */ - - -/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix - * sum operation. The term 'inclusive' means that each result includes - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate inclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p inclusive_scan_by_key assumes \c equal_to as the binary - * predicate used to compare adjacent keys. Specifically, consecutive iterators - * i and i+1 in the range [first1, last1) - * belong to the same segment if *i == *(i+1), and belong to - * different segments otherwise. - * - * This version of \p inclusive_scan_by_key assumes \c plus as the associative - * operator used to perform the prefix sum. When the input and output sequences - * are the same, the scan is performed in-place. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then - * binary_op(x,y) is defined. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p inclusive_scan_by_key using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * - * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * - * thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals); // in-place scan - * - * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4}; - * \endcode - * - * \see inclusive_scan - * \see exclusive_scan_by_key - * - */ -template - OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result); - - -/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix - * sum operation. The term 'inclusive' means that each result includes - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate inclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p inclusive_scan_by_key assumes \c equal_to as the binary - * predicate used to compare adjacent keys. Specifically, consecutive iterators - * i and i+1 in the range [first1, last1) - * belong to the same segment if *i == *(i+1), and belong to - * different segments otherwise. - * - * This version of \p inclusive_scan_by_key assumes \c plus as the associative - * operator used to perform the prefix sum. When the input and output sequences - * are the same, the scan is performed in-place. - * - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \return The end of the output sequence. - * - * \tparam InputIterator1 is a model of Input Iterator - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then - * binary_op(x,y) is defined. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p inclusive_scan_by_key - * - * \code - * #include - * - * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * - * thrust::inclusive_scan_by_key(keys, keys + 10, vals, vals); // in-place scan - * - * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4}; - * \endcode - * - * \see inclusive_scan - * \see exclusive_scan_by_key - * - */ -template - OutputIterator inclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result); - - -/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix - * sum operation. The term 'inclusive' means that each result includes - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate inclusive scan operation is computed. - * Refer to the code sample below for example usage. 
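A runnable sketch of the segmented inclusive scan above; the snippet declares data but scans vals, so the value array is consistently named vals here (headers assumed):

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

int main()
{
    int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
    int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

    // in-place segmented prefix sum: the running sum restarts whenever the key changes
    thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals);

    // vals is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4}
    return 0;
}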
- * - * This version of \p inclusive_scan_by_key uses the binary predicate - * \c pred to compare adjacent keys. Specifically, consecutive iterators - * i and i+1 in the range [first1, last1) - * belong to the same segment if binary_pred(*i, *(i+1)) is true, and belong to - * different segments otherwise. - * - * This version of \p inclusive_scan_by_key assumes \c plus as the associative - * operator used to perform the prefix sum. When the input and output sequences - * are the same, the scan is performed in-place. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \param binary_pred The binary predicate used to determine equality of keys. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then - * binary_op(x,y) is defined. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p inclusive_scan_by_key using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * - * thrust::equal_to binary_pred; - * - * thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, binary_pred); // in-place scan - * - * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4}; - * \endcode - * - * \see inclusive_scan - * \see exclusive_scan_by_key - * - */ -template - OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred); - - -/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix - * sum operation. The term 'inclusive' means that each result includes - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate inclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p inclusive_scan_by_key uses the binary predicate - * \c pred to compare adjacent keys. Specifically, consecutive iterators - * i and i+1 in the range [first1, last1) - * belong to the same segment if binary_pred(*i, *(i+1)) is true, and belong to - * different segments otherwise. - * - * This version of \p inclusive_scan_by_key assumes \c plus as the associative - * operator used to perform the prefix sum. 
When the input and output sequences - * are the same, the scan is performed in-place. - * - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \param binary_pred The binary predicate used to determine equality of keys. - * \return The end of the output sequence. - * - * \tparam InputIterator1 is a model of Input Iterator - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then - * binary_op(x,y) is defined. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p inclusive_scan_by_key - * - * \code - * #include - * #include - * - * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * - * thrust::equal_to binary_pred; - * - * thrust::inclusive_scan_by_key(keys, keys + 10, vals, vals, binary_pred); // in-place scan - * - * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4}; - * \endcode - * - * \see inclusive_scan - * \see exclusive_scan_by_key - * - */ -template - OutputIterator inclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred); - - -/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix - * sum operation. The term 'inclusive' means that each result includes - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate inclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p inclusive_scan_by_key uses the binary predicate - * \c pred to compare adjacent keys. Specifically, consecutive iterators - * i and i+1 in the range [first1, last1) - * belong to the same segment if binary_pred(*i, *(i+1)) is true, and belong to - * different segments otherwise. - * - * This version of \p inclusive_scan_by_key uses the associative operator - * \c binary_op to perform the prefix sum. When the input and output sequences - * are the same, the scan is performed in-place. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \param binary_pred The binary predicate used to determine equality of keys. - * \param binary_op The associatve operator used to 'sum' values. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. 
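The same computation with the segment boundaries decided by an explicit key-equality predicate, here thrust::equal_to<int> (assumed to come from <thrust/functional.h>):

#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
    int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    thrust::equal_to<int> binary_pred;

    // adjacent keys for which binary_pred is true belong to the same segment
    thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, binary_pred);

    // vals is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4}
    return 0;
}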
- * \tparam InputIterator1 is a model of Input Iterator - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then - * binary_op(x,y) is defined. - * \tparam BinaryPredicate is a model of Binary Predicate. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is - * convertible to \c OutputIterator's \c value_type. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p inclusive_scan_by_key using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * - * thrust::equal_to binary_pred; - * thrust::plus binary_op; - * - * thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, binary_pred, binary_op); // in-place scan - * - * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4}; - * \endcode - * - * \see inclusive_scan - * \see exclusive_scan_by_key - * - */ -template - OutputIterator inclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred, - AssociativeOperator binary_op); - - -/*! \p inclusive_scan_by_key computes an inclusive key-value or 'segmented' prefix - * sum operation. The term 'inclusive' means that each result includes - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate inclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p inclusive_scan_by_key uses the binary predicate - * \c pred to compare adjacent keys. Specifically, consecutive iterators - * i and i+1 in the range [first1, last1) - * belong to the same segment if binary_pred(*i, *(i+1)) is true, and belong to - * different segments otherwise. - * - * This version of \p inclusive_scan_by_key uses the associative operator - * \c binary_op to perform the prefix sum. When the input and output sequences - * are the same, the scan is performed in-place. - * - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \param binary_pred The binary predicate used to determine equality of keys. - * \param binary_op The associatve operator used to 'sum' values. - * \return The end of the output sequence. - * - * \tparam InputIterator1 is a model of Input Iterator - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type. 
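The fully general inclusive_scan_by_key also names the associative operator used within each segment; a sketch with thrust::plus<int>, which reproduces the default behaviour:

#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
    int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    thrust::equal_to<int> binary_pred;
    thrust::plus<int>     binary_op;

    // per-segment scan combined with binary_op; restarts where binary_pred reports a new key
    thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, binary_pred, binary_op);

    // vals is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4}
    return 0;
}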
- * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then - * binary_op(x,y) is defined. - * \tparam BinaryPredicate is a model of Binary Predicate. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is - * convertible to \c OutputIterator's \c value_type. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p inclusive_scan_by_key - * - * \code - * #include - * #include - * - * int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * - * thrust::equal_to binary_pred; - * thrust::plus binary_op; - * - * thrust::inclusive_scan_by_key(keys, keys + 10, vals, vals, binary_pred, binary_op); // in-place scan - * - * // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4}; - * \endcode - * - * \see inclusive_scan - * \see exclusive_scan_by_key - * - */ -template - OutputIterator inclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred, - AssociativeOperator binary_op); - - -/*! \p exclusive_scan_by_key computes an exclusive segmented prefix - * - * This version of \p exclusive_scan_by_key uses the value \c 0 to - * initialize the exclusive scan operation. - * - * This version of \p exclusive_scan_by_key assumes \c plus as the associative - * operator used to perform the prefix sum. When the input and output sequences - * are the same, the scan is performed in-place. - * - * This version of \p exclusive_scan_by_key assumes \c equal_to as the binary - * predicate used to compare adjacent keys. Specifically, consecutive iterators - * i and i+1 in the range [first1, last1 - * belong to the same segment if *i == *(i+1), and belong to - * different segments otherwise. - * - * Refer to the most general form of \p exclusive_scan_by_key for additional details. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan_by_key using the - * \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * - * thrust::exclusive_scan_by_key(thrust::host, key, key + 10, vals, vals); // in-place scan - * - * // vals is now {0, 1, 2, 0, 1, 0, 0, 1, 2, 3}; - * \endcode - * - * \see exclusive_scan - * - */ -template - OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result); - - -/*! \p exclusive_scan_by_key computes an exclusive segmented prefix - * - * This version of \p exclusive_scan_by_key uses the value \c 0 to - * initialize the exclusive scan operation. - * - * This version of \p exclusive_scan_by_key assumes \c plus as the associative - * operator used to perform the prefix sum. When the input and output sequences - * are the same, the scan is performed in-place. - * - * This version of \p exclusive_scan_by_key assumes \c equal_to as the binary - * predicate used to compare adjacent keys. Specifically, consecutive iterators - * i and i+1 in the range [first1, last1 - * belong to the same segment if *i == *(i+1), and belong to - * different segments otherwise. - * - * Refer to the most general form of \p exclusive_scan_by_key for additional details. - * - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan_by_key. - * - * \code - * #include - * - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * - * thrust::exclusive_scan_by_key(key, key + 10, vals, vals); // in-place scan - * - * // vals is now {0, 1, 2, 0, 1, 0, 0, 1, 2, 3}; - * \endcode - * - * \see exclusive_scan - * - */ -template - OutputIterator exclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result); - - -/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix - * sum operation. The term 'exclusive' means that each result does not include - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate exclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p exclusive_scan_by_key uses the value \c init to - * initialize the exclusive scan operation. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \param init The initial of the exclusive sum value. - * \return The end of the output sequence. 
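A compilable sketch of the default exclusive_scan_by_key above; the snippet passes key where the array is declared keys, so the names agree here:

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

int main()
{
    int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
    int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

    // each segment's exclusive scan starts from the default initial value 0
    thrust::exclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals);

    // vals is now {0, 1, 2, 0, 1, 0, 0, 1, 2, 3}
    return 0;
}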
- * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan_by_key using the \p - * thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * - * int init = 5; - * - * thrust::exclusive_scan_by_key(thrust::host, key, key + 10, vals, vals, init); // in-place scan - * - * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8}; - * \endcode - * - * \see exclusive_scan - * \see inclusive_scan_by_key - * - */ -template - OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init); - - -/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix - * sum operation. The term 'exclusive' means that each result does not include - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate exclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p exclusive_scan_by_key uses the value \c init to - * initialize the exclusive scan operation. - * - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \param init The initial of the exclusive sum value. - * \return The end of the output sequence. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan_by_key - * - * \code - * #include - * #include - * - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * - * int init = 5; - * - * thrust::exclusive_scan_by_key(key, key + 10, vals, vals, init); // in-place scan - * - * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8}; - * \endcode - * - * \see exclusive_scan - * \see inclusive_scan_by_key - * - */ -template - OutputIterator exclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init); - - -/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix - * sum operation. The term 'exclusive' means that each result does not include - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate exclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p exclusive_scan_by_key uses the value \c init to - * initialize the exclusive scan operation. 
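With an explicit init, every segment's exclusive scan is seeded with that value instead of 0; a short sketch with init = 5:

#include <thrust/scan.h>
#include <thrust/execution_policy.h>

int main()
{
    int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
    int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    int init = 5;

    // every segment restarts from init
    thrust::exclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, init);

    // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8}
    return 0;
}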
- * - * This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred - * to compare adjacent keys. Specifically, consecutive iterators i and - * i+1 in the range [first1, last1) belong to the same segment if - * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \param init The initial of the exclusive sum value. - * \param binary_pred The binary predicate used to determine equality of keys. - * \return The end of the output sequence. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. - * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan_by_key using the - * \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * - * int init = 5; - * - * thrust::equal_to binary_pred; - * - * thrust::exclusive_scan_by_key(thrust::host, key, key + 10, vals, vals, init, binary_pred); // in-place scan - * - * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8}; - * \endcode - * - * \see exclusive_scan - * \see inclusive_scan_by_key - * - */ -template - OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred); - - -/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix - * sum operation. The term 'exclusive' means that each result does not include - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate exclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p exclusive_scan_by_key uses the value \c init to - * initialize the exclusive scan operation. - * - * This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred - * to compare adjacent keys. Specifically, consecutive iterators i and - * i+1 in the range [first1, last1) belong to the same segment if - * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise. - * - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \param init The initial of the exclusive sum value. - * \param binary_pred The binary predicate used to determine equality of keys. - * \return The end of the output sequence. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. 
- * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan_by_key - * - * \code - * #include - * #include - * - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * - * int init = 5; - * - * thrust::equal_to binary_pred; - * - * thrust::exclusive_scan_by_key(key, key + 10, vals, vals, init, binary_pred); // in-place scan - * - * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8}; - * \endcode - * - * \see exclusive_scan - * \see inclusive_scan_by_key - * - */ -template - OutputIterator exclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred); - - -/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix - * sum operation. The term 'exclusive' means that each result does not include - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate exclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p exclusive_scan_by_key uses the value \c init to - * initialize the exclusive scan operation. - * - * This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred - * to compare adjacent keys. Specifically, consecutive iterators i and - * i+1 in the range [first1, last1) belong to the same segment if - * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise. - * - * This version of \p exclusive_scan_by_key uses the associative operator - * \c binary_op to perform the prefix sum. When the input and output sequences - * are the same, the scan is performed in-place. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \param init The initial of the exclusive sum value. - * \param binary_pred The binary predicate used to determine equality of keys. - * \param binary_op The associatve operator used to 'sum' values. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then - * binary_op(x,y) is defined. - * \tparam T is convertible to \c OutputIterator's \c value_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. 
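Adding the key-equality predicate on top of init, again with thrust::equal_to<int> standing in for the default comparison:

#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
    int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    int init = 5;
    thrust::equal_to<int> binary_pred;

    // segments are delimited by binary_pred; each is seeded with init
    thrust::exclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, init, binary_pred);

    // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8}
    return 0;
}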
- * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan_by_key using the - * \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * - * int init = 5; - * - * thrust::equal_to binary_pred; - * thrust::plus binary_op; - * - * thrust::exclusive_scan_by_key(thrust::host, key, key + 10, vals, vals, init, binary_pred, binary_op); // in-place scan - * - * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8}; - * \endcode - * - * \see exclusive_scan - * \see inclusive_scan_by_key - * - */ -template - OutputIterator exclusive_scan_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred, - AssociativeOperator binary_op); - - -/*! \p exclusive_scan_by_key computes an exclusive key-value or 'segmented' prefix - * sum operation. The term 'exclusive' means that each result does not include - * the corresponding input operand in the partial sum. The term 'segmented' - * means that the partial sums are broken into distinct segments. In other - * words, within each segment a separate exclusive scan operation is computed. - * Refer to the code sample below for example usage. - * - * This version of \p exclusive_scan_by_key uses the value \c init to - * initialize the exclusive scan operation. - * - * This version of \p exclusive_scan_by_key uses the binary predicate \c binary_pred - * to compare adjacent keys. Specifically, consecutive iterators i and - * i+1 in the range [first1, last1) belong to the same segment if - * binary_pred(*i, *(i+1)) is true, and belong to different segments otherwise. - * - * This version of \p exclusive_scan_by_key uses the associative operator - * \c binary_op to perform the prefix sum. When the input and output sequences - * are the same, the scan is performed in-place. - * - * \param first1 The beginning of the key sequence. - * \param last1 The end of the key sequence. - * \param first2 The beginning of the input value sequence. - * \param result The beginning of the output value sequence. - * \param init The initial of the exclusive sum value. - * \param binary_pred The binary predicate used to determine equality of keys. - * \param binary_op The associatve operator used to 'sum' values. - * \return The end of the output sequence. - * - * \tparam InputIterator1 is a model of Input Iterator - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type. - * \tparam OutputIterator is a model of Output Iterator, - * and if \c x and \c y are objects of \c OutputIterator's \c value_type, then - * binary_op(x,y) is defined. - * \tparam T is convertible to \c OutputIterator's \c value_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type. - * - * \pre \p first1 may equal \p result but the range [first1, last1) and the range [result, result + (last1 - first1)) shall not overlap otherwise. 
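The fully general exclusive_scan_by_key documented here names init, the predicate, and the associative operator; a sketch with init = 5, thrust::equal_to<int> and thrust::plus<int>, which matches the defaults:

#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
    int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    int init = 5;
    thrust::equal_to<int> binary_pred;
    thrust::plus<int>     binary_op;

    // per-segment exclusive scan: seeded with init, values combined with binary_op
    thrust::exclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, init, binary_pred, binary_op);

    // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8}
    return 0;
}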
- * \pre \p first2 may equal \p result but the range [first2, first2 + (last1 - first1) and range [result, result + (last1 - first1)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p exclusive_scan_by_key - * - * \code - * #include - * #include - * - * int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3}; - * int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - * - * int init = 5; - * - * thrust::equal_to binary_pred; - * thrust::plus binary_op; - * - * thrust::exclusive_scan_by_key(key, key + 10, vals, vals, init, binary_pred, binary_op); // in-place scan - * - * // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8}; - * \endcode - * - * \see exclusive_scan - * \see inclusive_scan_by_key - * - */ -template - OutputIterator exclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred, - AssociativeOperator binary_op); - - -/*! \} // end segmentedprefixsums - */ - - -/*! \} // end prefix sums - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/scatter.h b/compat/thrust/scatter.h deleted file mode 100644 index 59604ca170..0000000000 --- a/compat/thrust/scatter.h +++ /dev/null @@ -1,420 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file scatter.h - * \brief Irregular copying to a destination range - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup scattering - * \ingroup copying - * \{ - */ - - -/*! \p scatter copies elements from a source range into an output array - * according to a map. For each iterator \c i in the range [\p first, \p last), - * the value \c *i is assigned to output[*(map + (i - first))]. The - * output iterator must permit random access. If the same index - * appears more than once in the range [map, map + (last - first)), - * the result is undefined. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first Beginning of the sequence of values to scatter. - * \param last End of the sequence of values to scatter. - * \param map Beginning of the sequence of output indices. - * \param result Destination of the source elements. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. - * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam RandomAccessIterator must be a model of Random Access iterator. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. 
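Likewise, a sketch of the binary_pred plus binary_op overloads of exclusive_scan_by_key declared above; the same call is documented with and without a leading thrust::host policy argument, and the headers and int template arguments are assumptions here.

#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
    int vals[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    int init = 5;

    thrust::equal_to<int> binary_pred;   // keys that compare equal share a segment
    thrust::plus<int>     binary_op;     // operator used to 'sum' within a segment

    // in-place scan; drop the thrust::host argument to use default dispatch
    thrust::exclusive_scan_by_key(thrust::host, keys, keys + 10,
                                  vals, vals, init, binary_pred, binary_op);

    // vals is now {5, 6, 7, 5, 6, 5, 5, 6, 7, 8}
    return 0;
}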
- * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The expression `result[*i]` shall be valid for all iterators in the range `[map,map + (last - first))`. - * - * The following code snippet demonstrates how to use \p scatter to - * reorder a range using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * // mark even indices with a 1; odd indices with a 0 - * int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - * thrust::device_vector d_values(values, values + 10); - * - * // scatter all even indices into the first half of the - * // range, and odd indices vice versa - * int map[10] = {0, 5, 1, 6, 2, 7, 3, 8, 4, 9}; - * thrust::device_vector d_map(map, map + 10); - * - * thrust::device_vector d_output(10); - * thrust::scatter(thrust::device, - * d_values.begin(), d_values.end(), - * d_map.begin(), d_output.begin()); - * // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} - * \endcode - * - * \note \p scatter is the inverse of thrust::gather. - */ -template - void scatter(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - RandomAccessIterator result); - - -/*! \p scatter copies elements from a source range into an output array - * according to a map. For each iterator \c i in the range [\p first, \p last), - * the value \c *i is assigned to output[*(map + (i - first))]. The - * output iterator must permit random access. If the same index - * appears more than once in the range [map, map + (last - first)), - * the result is undefined. - * - * \param first Beginning of the sequence of values to scatter. - * \param last End of the sequence of values to scatter. - * \param map Beginning of the sequence of output indices. - * \param result Destination of the source elements. - * - * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. - * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam RandomAccessIterator must be a model of Random Access iterator. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The expression `result[*i]` shall be valid for all iterators in the range `[map,map + (last - first))`. - * - * The following code snippet demonstrates how to use \p scatter to - * reorder a range. - * - * \code - * #include - * #include - * ... 
- * // mark even indices with a 1; odd indices with a 0 - * int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - * thrust::device_vector d_values(values, values + 10); - * - * // scatter all even indices into the first half of the - * // range, and odd indices vice versa - * int map[10] = {0, 5, 1, 6, 2, 7, 3, 8, 4, 9}; - * thrust::device_vector d_map(map, map + 10); - * - * thrust::device_vector d_output(10); - * thrust::scatter(d_values.begin(), d_values.end(), - * d_map.begin(), d_output.begin()); - * // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0} - * \endcode - * - * \note \p scatter is the inverse of thrust::gather. - */ -template - void scatter(InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - RandomAccessIterator result); - - -/*! \p scatter_if conditionally copies elements from a source range into an - * output array according to a map. For each iterator \c i in the - * range [first, last) such that *(stencil + (i - first)) is - * true, the value \c *i is assigned to output[*(map + (i - first))]. - * The output iterator must permit random access. If the same index - * appears more than once in the range [map, map + (last - first)) - * the result is undefined. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first Beginning of the sequence of values to scatter. - * \param last End of the sequence of values to scatter. - * \param map Beginning of the sequence of output indices. - * \param stencil Beginning of the sequence of predicate values. - * \param output Beginning of the destination range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. - * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam InputIterator3 must be a model of Input Iterator and \c InputIterator3's \c value_type must be convertible to \c bool. - * \tparam RandomAccessIterator must be a model of Random Access iterator. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `*(stencil + i) != false`. - * - * \code - * #include - * #include - * ... - * int V[8] = {10, 20, 30, 40, 50, 60, 70, 80}; - * int M[8] = {0, 5, 1, 6, 2, 7, 3, 4}; - * int S[8] = {1, 0, 1, 0, 1, 0, 1, 0}; - * int D[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - * - * thrust::scatter_if(thrust::host, V, V + 8, M, S, D); - * - * // D contains [10, 30, 50, 70, 0, 0, 0, 0]; - * \endcode - * - * \note \p scatter_if is the inverse of thrust::gather_if. 
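A compilable variant of the scatter example above. It assumes the source is built with nvcc against the CUDA backend; the device_vector element type and the header names are restored here as assumptions.

#include <thrust/scatter.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

int main()
{
    // mark even indices with a 1, odd indices with a 0
    int values[10] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
    thrust::device_vector<int> d_values(values, values + 10);

    // send even source positions to the first half of the output, odd to the second
    int map[10] = {0, 5, 1, 6, 2, 7, 3, 8, 4, 9};
    thrust::device_vector<int> d_map(map, map + 10);

    thrust::device_vector<int> d_output(10);
    thrust::scatter(thrust::device,
                    d_values.begin(), d_values.end(),
                    d_map.begin(), d_output.begin());
    // d_output is now {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}
    return 0;
}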
- */ -template - void scatter_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output); - - -/*! \p scatter_if conditionally copies elements from a source range into an - * output array according to a map. For each iterator \c i in the - * range [first, last) such that *(stencil + (i - first)) is - * true, the value \c *i is assigned to output[*(map + (i - first))]. - * The output iterator must permit random access. If the same index - * appears more than once in the range [map, map + (last - first)) - * the result is undefined. - * - * \param first Beginning of the sequence of values to scatter. - * \param last End of the sequence of values to scatter. - * \param map Beginning of the sequence of output indices. - * \param stencil Beginning of the sequence of predicate values. - * \param output Beginning of the destination range. - * - * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. - * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam InputIterator3 must be a model of Input Iterator and \c InputIterator3's \c value_type must be convertible to \c bool. - * \tparam RandomAccessIterator must be a model of Random Access iterator. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `*(stencil + i) != false`. - * - * \code - * #include - * ... - * int V[8] = {10, 20, 30, 40, 50, 60, 70, 80}; - * int M[8] = {0, 5, 1, 6, 2, 7, 3, 4}; - * int S[8] = {1, 0, 1, 0, 1, 0, 1, 0}; - * int D[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - * - * thrust::scatter_if(V, V + 8, M, S, D); - * - * // D contains [10, 30, 50, 70, 0, 0, 0, 0]; - * \endcode - * - * \note \p scatter_if is the inverse of thrust::gather_if. - */ -template - void scatter_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output); - - -/*! \p scatter_if conditionally copies elements from a source range into an - * output array according to a map. For each iterator \c i in the - * range [first, last) such that pred(*(stencil + (i - first))) is - * \c true, the value \c *i is assigned to output[*(map + (i - first))]. - * The output iterator must permit random access. If the same index - * appears more than once in the range [map, map + (last - first)) - * the result is undefined. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first Beginning of the sequence of values to scatter. 
- * \param last End of the sequence of values to scatter. - * \param map Beginning of the sequence of output indices. - * \param stencil Beginning of the sequence of predicate values. - * \param output Beginning of the destination range. - * \param pred Predicate to apply to the stencil values. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. - * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam InputIterator3 must be a model of Input Iterator and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type. - * \tparam RandomAccessIterator must be a model of Random Access iterator. - * \tparam Predicate must be a model of Predicate. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `pred(*(stencil + i)) != false`. - * - * \code - * #include - * #include - * - * struct is_even - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return (x % 2) == 0; - * } - * }; - * - * ... - * - * int V[8] = {10, 20, 30, 40, 50, 60, 70, 80}; - * int M[8] = {0, 5, 1, 6, 2, 7, 3, 4}; - * int S[8] = {2, 1, 2, 1, 2, 1, 2, 1}; - * int D[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - * - * is_even pred; - * thrust::scatter_if(thrust::host, V, V + 8, M, S, D, pred); - * - * // D contains [10, 30, 50, 70, 0, 0, 0, 0]; - * \endcode - * - * \note \p scatter_if is the inverse of thrust::gather_if. - */ -template - void scatter_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output, - Predicate pred); - - -/*! \p scatter_if conditionally copies elements from a source range into an - * output array according to a map. For each iterator \c i in the - * range [first, last) such that pred(*(stencil + (i - first))) is - * \c true, the value \c *i is assigned to output[*(map + (i - first))]. - * The output iterator must permit random access. If the same index - * appears more than once in the range [map, map + (last - first)) - * the result is undefined. - * - * \param first Beginning of the sequence of values to scatter. - * \param last End of the sequence of values to scatter. - * \param map Beginning of the sequence of output indices. - * \param stencil Beginning of the sequence of predicate values. - * \param output Beginning of the destination range. - * \param pred Predicate to apply to the stencil values. 
- * - * \tparam InputIterator1 must be a model of Input Iterator and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type. - * \tparam InputIterator2 must be a model of Input Iterator and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type. - * \tparam InputIterator3 must be a model of Input Iterator and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type. - * \tparam RandomAccessIterator must be a model of Random Access iterator. - * \tparam Predicate must be a model of Predicate. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[map,map + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[stencil,stencil + (last - first))` for all iterators `i` in the range `[map,map + (last - first))`. - * - * \pre The expression `result[*i]` shall be valid for all iterators `i` in the range `[map,map + (last - first))` for which the following condition holds: `pred(*(stencil + i)) != false`. - * - * \code - * #include - * - * struct is_even - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return (x % 2) == 0; - * } - * }; - * - * ... - * - * int V[8] = {10, 20, 30, 40, 50, 60, 70, 80}; - * int M[8] = {0, 5, 1, 6, 2, 7, 3, 4}; - * int S[8] = {2, 1, 2, 1, 2, 1, 2, 1}; - * int D[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - * - * is_even pred; - * thrust::scatter_if(V, V + 8, M, S, D, pred); - * - * // D contains [10, 30, 50, 70, 0, 0, 0, 0]; - * \endcode - * - * \note \p scatter_if is the inverse of thrust::gather_if. - */ -template - void scatter_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output, - Predicate pred); - - -/*! \} // end scattering - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/sequence.h b/compat/thrust/sequence.h deleted file mode 100644 index 6c54a5bbf4..0000000000 --- a/compat/thrust/sequence.h +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file sequence.h - * \brief Fills a range with a sequence of numbers - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup transformations - * \{ - */ - - -/*! \p sequence fills the range [first, last) with a sequence of numbers. - * - * For each iterator \c i in the range [first, last), this version of - * \p sequence performs the assignment *i = (i - first). - * - * The algorithm's execution is parallelized as determined by \p exec. 
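Referring back to the predicated scatter_if overloads documented above, a self-contained sketch using the V/M/S/D data and the is_even functor from those snippets; the headers, the functor body, and main() are assumptions, and thrust::host keeps the example host-only.

#include <thrust/scatter.h>
#include <thrust/execution_policy.h>

struct is_even
{
    __host__ __device__
    bool operator()(int x) const { return (x % 2) == 0; }
};

int main()
{
    int V[8] = {10, 20, 30, 40, 50, 60, 70, 80};
    int M[8] = {0, 5, 1, 6, 2, 7, 3, 4};
    int S[8] = {2, 1, 2, 1, 2, 1, 2, 1};
    int D[8] = {0, 0, 0, 0, 0, 0, 0, 0};

    // V[i] is written to D[M[i]] only where is_even(S[i]) holds
    thrust::scatter_if(thrust::host, V, V + 8, M, S, D, is_even());

    // D now contains {10, 30, 50, 70, 0, 0, 0, 0}
    return 0;
}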
- * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, - * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. - * - * The following code snippet demonstrates how to use \p sequence to fill a range - * with a sequence of numbers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 10; - * int A[N]; - * thrust::sequence(thrust::host, A, A + 10); - * // A is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} - * \endcode - * - * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no - * guarantee on order of execution. - * - * \see http://www.sgi.com/tech/stl/iota.html - */ -template - void sequence(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last); - - -/*! \p sequence fills the range [first, last) with a sequence of numbers. - * - * For each iterator \c i in the range [first, last), this version of - * \p sequence performs the assignment *i = (i - first). - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, - * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. - * - * The following code snippet demonstrates how to use \p sequence to fill a range - * with a sequence of numbers. - * - * \code - * #include - * ... - * const int N = 10; - * int A[N]; - * thrust::sequence(A, A + 10); - * // A is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} - * \endcode - * - * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no - * guarantee on order of execution. - * - * \see http://www.sgi.com/tech/stl/iota.html - */ -template - void sequence(ForwardIterator first, - ForwardIterator last); - - -/*! \p sequence fills the range [first, last) with a sequence of numbers. - * - * For each iterator \c i in the range [first, last), this version of - * \p sequence performs the assignment *i = init + (i - first). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param init The first value of the sequence of numbers. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, - * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p sequence to fill a range - * with a sequence of numbers starting from the value 1 using the \p thrust::host execution - * policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * const int N = 10; - * int A[N]; - * thrust::sequence(thrust::host, A, A + 10, 1); - * // A is now {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - * \endcode - * - * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no - * guarantee on order of execution. - * - * \see http://www.sgi.com/tech/stl/iota.html - */ -template - void sequence(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - T init); - - -/*! \p sequence fills the range [first, last) with a sequence of numbers. - * - * For each iterator \c i in the range [first, last), this version of - * \p sequence performs the assignment *i = init + (i - first). - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param init The first value of the sequence of numbers. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, - * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p sequence to fill a range - * with a sequence of numbers starting from the value 1. - * - * \code - * #include - * ... - * const int N = 10; - * int A[N]; - * thrust::sequence(A, A + 10, 1); - * // A is now {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} - * \endcode - * - * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no - * guarantee on order of execution. - * - * \see http://www.sgi.com/tech/stl/iota.html - */ -template - void sequence(ForwardIterator first, - ForwardIterator last, - T init); - - -/*! \p sequence fills the range [first, last) with a sequence of numbers. - * - * For each iterator \c i in the range [first, last), this version of - * \p sequence performs the assignment *i = init + step * (i - first). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param init The first value of the sequence of numbers - * \param step The difference between consecutive elements. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, - * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p sequence to fill a range - * with a sequence of numbers starting from the value 1 with a step size of 3 using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 10; - * int A[N]; - * thrust::sequence(thrust::host, A, A + 10, 1, 3); - * // A is now {1, 4, 7, 10, 13, 16, 19, 22, 25, 28} - * \endcode - * - * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no - * guarantee on order of execution. 
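The three documented sequence forms side by side, as a minimal host-side sketch (int data, thrust::host, and the header names are assumptions here):

#include <thrust/sequence.h>
#include <thrust/execution_policy.h>

int main()
{
    const int N = 10;
    int A[N];

    thrust::sequence(thrust::host, A, A + N);         // A = 0, 1, 2, ..., 9
    thrust::sequence(thrust::host, A, A + N, 1);      // A = 1, 2, 3, ..., 10
    thrust::sequence(thrust::host, A, A + N, 1, 3);   // A = 1, 4, 7, ..., 28
    return 0;
}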
- * - * \see http://www.sgi.com/tech/stl/iota.html - */ -template - void sequence(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - T init, - T step); - - -/*! \p sequence fills the range [first, last) with a sequence of numbers. - * - * For each iterator \c i in the range [first, last), this version of - * \p sequence performs the assignment *i = init + step * (i - first). - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param init The first value of the sequence of numbers - * \param step The difference between consecutive elements. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, - * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. - * \tparam T is a model of Assignable, - * and \p T is convertible to \p ForwardIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p sequence to fill a range - * with a sequence of numbers starting from the value 1 with a step size of 3. - * - * \code - * #include - * ... - * const int N = 10; - * int A[N]; - * thrust::sequence(A, A + 10, 1, 3); - * // A is now {1, 4, 7, 10, 13, 16, 19, 22, 25, 28} - * \endcode - * - * \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no - * guarantee on order of execution. - * - * \see http://www.sgi.com/tech/stl/iota.html - */ -template - void sequence(ForwardIterator first, - ForwardIterator last, - T init, - T step); - - -/*! \} // end transformations - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/set_operations.h b/compat/thrust/set_operations.h deleted file mode 100644 index a7ee624f92..0000000000 --- a/compat/thrust/set_operations.h +++ /dev/null @@ -1,2947 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file set_operations.h - * \brief Set theoretic operations for sorted ranges - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup set_operations Set Operations - * \ingroup algorithms - * \{ - */ - - -/*! \p set_difference constructs a sorted range that is the set difference of the sorted - * ranges [first1, last1) and [first2, last2). The return value is the - * end of the output range. - * - * In the simplest case, \p set_difference performs the "difference" operation from set - * theory: the output range contains a copy of every element that is contained in - * [first1, last1) and not contained in [first2, last1). The general case - * is more complicated, because the input ranges may contain duplicate elements. 
- * The generalization is that if [first1, last1) contains \c m elements - * that are equivalent to each other and if [first2, last2) contains \c n - * elements that are equivalent to them, the last max(m-n,0) elements from - * [first1, last1) range shall be copied to the output range. - * - * This version of \p set_difference compares elements using \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \return The end of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_difference to compute the - * set difference of two sets of integers sorted in ascending order using the \p thrust::host execution - * policy for parallelization: - * - * \code - * #include - * #include - * ... - * int A1[6] = {0, 1, 3, 4, 5, 6, 9}; - * int A2[5] = {1, 3, 5, 7, 9}; - * - * int result[3]; - * - * int *result_end = thrust::set_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result); - * // result is now {0, 4, 6} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_difference.html - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_difference(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -/*! \p set_difference constructs a sorted range that is the set difference of the sorted - * ranges [first1, last1) and [first2, last2). The return value is the - * end of the output range. - * - * In the simplest case, \p set_difference performs the "difference" operation from set - * theory: the output range contains a copy of every element that is contained in - * [first1, last1) and not contained in [first2, last1). The general case - * is more complicated, because the input ranges may contain duplicate elements. 
- * The generalization is that if [first1, last1) contains \c m elements - * that are equivalent to each other and if [first2, last2) contains \c n - * elements that are equivalent to them, the last max(m-n,0) elements from - * [first1, last1) range shall be copied to the output range. - * - * This version of \p set_difference compares elements using \c operator<. - * - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \return The end of the output range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_difference to compute the - * set difference of two sets of integers sorted in ascending order. - * - * \code - * #include - * ... - * int A1[6] = {0, 1, 3, 4, 5, 6, 9}; - * int A2[5] = {1, 3, 5, 7, 9}; - * - * int result[3]; - * - * int *result_end = thrust::set_difference(A1, A1 + 6, A2, A2 + 5, result); - * // result is now {0, 4, 6} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_difference.html - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_difference(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -/*! \p set_difference constructs a sorted range that is the set difference of the sorted - * ranges [first1, last1) and [first2, last2). The return value is the - * end of the output range. - * - * In the simplest case, \p set_difference performs the "difference" operation from set - * theory: the output range contains a copy of every element that is contained in - * [first1, last1) and not contained in [first2, last1). The general case - * is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [first1, last1) contains \c m elements - * that are equivalent to each other and if [first2, last2) contains \c n - * elements that are equivalent to them, the last max(m-n,0) elements from - * [first1, last1) range shall be copied to the output range. - * - * This version of \p set_difference compares elements using a function object \p comp. 
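A compilable sketch of the set_difference overloads above, using the ascending data from the documented snippets; thrust::host, the header names, and sizing the first array for all seven initializers are assumptions made here.

#include <thrust/set_operations.h>
#include <thrust/execution_policy.h>

int main()
{
    int A1[7] = {0, 1, 3, 4, 5, 6, 9};   // sorted ascending
    int A2[5] = {1, 3, 5, 7, 9};         // sorted ascending
    int result[3];

    int *result_end = thrust::set_difference(thrust::host,
                                             A1, A1 + 7, A2, A2 + 5, result);
    // result is now {0, 4, 6}; result_end - result == 3
    (void)result_end;
    return 0;
}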
- * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \param comp Comparison operator. - * \return The end of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_difference to compute the - * set difference of two sets of integers sorted in descending order using the \p thrust::host execution - * policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * int A1[6] = {9, 6, 5, 4, 3, 1, 0}; - * int A2[5] = {9, 7, 5, 3, 1}; - * - * int result[3]; - * - * int *result_end = thrust::set_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result, thrust::greater()); - * // result is now {6, 4, 0} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_difference.html - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_difference(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp); - - -/*! \p set_difference constructs a sorted range that is the set difference of the sorted - * ranges [first1, last1) and [first2, last2). The return value is the - * end of the output range. - * - * In the simplest case, \p set_difference performs the "difference" operation from set - * theory: the output range contains a copy of every element that is contained in - * [first1, last1) and not contained in [first2, last1). The general case - * is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [first1, last1) contains \c m elements - * that are equivalent to each other and if [first2, last2) contains \c n - * elements that are equivalent to them, the last max(m-n,0) elements from - * [first1, last1) range shall be copied to the output range. - * - * This version of \p set_difference compares elements using a function object \p comp. - * - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. 
- * \param result The beginning of the output range. - * \param comp Comparison operator. - * \return The end of the output range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_difference to compute the - * set difference of two sets of integers sorted in descending order. - * - * \code - * #include - * #include - * ... - * int A1[6] = {9, 6, 5, 4, 3, 1, 0}; - * int A2[5] = {9, 7, 5, 3, 1}; - * - * int result[3]; - * - * int *result_end = thrust::set_difference(A1, A1 + 6, A2, A2 + 5, result, thrust::greater()); - * // result is now {6, 4, 0} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_difference.html - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_difference(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp); - - -/*! \p set_intersection constructs a sorted range that is the - * intersection of sorted ranges [first1, last1) and - * [first2, last2). The return value is the end of the - * output range. - * - * In the simplest case, \p set_intersection performs the - * "intersection" operation from set theory: the output range - * contains a copy of every element that is contained in both - * [first1, last1) and [first2, last2). The - * general case is more complicated, because the input ranges may - * contain duplicate elements. The generalization is that if a value - * appears \c m times in [first1, last1) and \c n times in - * [first2, last2) (where \c m may be zero), then it - * appears min(m,n) times in the output range. - * \p set_intersection is stable, meaning that both elements are - * copied from the first range rather than the second, and that the - * relative order of elements in the output range is the same as in - * the first input range. - * - * This version of \p set_intersection compares objects using - * \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \return The end of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. 
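And the comparator form of set_difference documented above, with the thrust::greater<int> template argument restored; thrust::host and the seven-element first array are assumptions.

#include <thrust/set_operations.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    int A1[7] = {9, 6, 5, 4, 3, 1, 0};   // sorted descending
    int A2[5] = {9, 7, 5, 3, 1};         // sorted descending
    int result[3];

    int *result_end = thrust::set_difference(thrust::host,
                                             A1, A1 + 7, A2, A2 + 5,
                                             result, thrust::greater<int>());
    // result is now {6, 4, 0}
    (void)result_end;
    return 0;
}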
- * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_intersection to compute the - * set intersection of two sets of integers sorted in ascending order using the \p thrust::host execution - * policy for parallelization: - * - * \code - * #include - * #include - * ... - * int A1[6] = {1, 3, 5, 7, 9, 11}; - * int A2[7] = {1, 1, 2, 3, 5, 8, 13}; - * - * int result[7]; - * - * int *result_end = thrust::set_intersection(thrust::host, A1, A1 + 6, A2, A2 + 7, result); - * // result is now {1, 3, 5} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_intersection.html - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_intersection(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -/*! \p set_intersection constructs a sorted range that is the - * intersection of sorted ranges [first1, last1) and - * [first2, last2). The return value is the end of the - * output range. - * - * In the simplest case, \p set_intersection performs the - * "intersection" operation from set theory: the output range - * contains a copy of every element that is contained in both - * [first1, last1) and [first2, last2). The - * general case is more complicated, because the input ranges may - * contain duplicate elements. The generalization is that if a value - * appears \c m times in [first1, last1) and \c n times in - * [first2, last2) (where \c m may be zero), then it - * appears min(m,n) times in the output range. - * \p set_intersection is stable, meaning that both elements are - * copied from the first range rather than the second, and that the - * relative order of elements in the output range is the same as in - * the first input range. - * - * This version of \p set_intersection compares objects using - * \c operator<. - * - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \return The end of the output range. 
- * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_intersection to compute the - * set intersection of two sets of integers sorted in ascending order. - * - * \code - * #include - * ... - * int A1[6] = {1, 3, 5, 7, 9, 11}; - * int A2[7] = {1, 1, 2, 3, 5, 8, 13}; - * - * int result[7]; - * - * int *result_end = thrust::set_intersection(A1, A1 + 6, A2, A2 + 7, result); - * // result is now {1, 3, 5} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_intersection.html - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_intersection(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -/*! \p set_intersection constructs a sorted range that is the - * intersection of sorted ranges [first1, last1) and - * [first2, last2). The return value is the end of the - * output range. - * - * In the simplest case, \p set_intersection performs the - * "intersection" operation from set theory: the output range - * contains a copy of every element that is contained in both - * [first1, last1) and [first2, last2). The - * general case is more complicated, because the input ranges may - * contain duplicate elements. The generalization is that if a value - * appears \c m times in [first1, last1) and \c n times in - * [first2, last2) (where \c m may be zero), then it - * appears min(m,n) times in the output range. - * \p set_intersection is stable, meaning that both elements are - * copied from the first range rather than the second, and that the - * relative order of elements in the output range is the same as in - * the first input range. - * - * This version of \p set_intersection compares elements using a function object \p comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \param comp Comparison operator. - * \return The end of the output range. 
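For the set_intersection overloads above, a minimal sketch showing the default operator< form and the comparator form together (thrust::host, the int template arguments, and the B1/B2 names are assumptions):

#include <thrust/set_operations.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    // ascending inputs, default operator< comparison
    int A1[6] = {1, 3, 5, 7, 9, 11};
    int A2[7] = {1, 1, 2, 3, 5, 8, 13};
    int result[6];
    int *end = thrust::set_intersection(thrust::host, A1, A1 + 6, A2, A2 + 7, result);
    // result is now {1, 3, 5}; the copies are taken from the first range

    // descending inputs, explicit comparator
    int B1[6] = {11, 9, 7, 5, 3, 1};
    int B2[7] = {13, 8, 5, 3, 2, 1, 1};
    int result2[6];
    end = thrust::set_intersection(thrust::host, B1, B1 + 6, B2, B2 + 7,
                                   result2, thrust::greater<int>());
    // result2 is now {5, 3, 1}
    (void)end;
    return 0;
}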
- * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. - * \pre The resulting range shall not overlap with either input range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * The following code snippet demonstrates how to use \p set_intersection to compute - * the set intersection of sets of integers sorted in descending order using the \p thrust::host execution - * policy for parallelization: - * - * \code - * #include - * #include - * ... - * int A1[6] = {11, 9, 7, 5, 3, 1}; - * int A2[7] = {13, 8, 5, 3, 2, 1, 1}; - * - * int result[3]; - * - * int *result_end = thrust::set_intersection(thrust::host, A1, A1 + 6, A2, A2 + 7, result, thrust::greater()); - * // result is now {5, 3, 1} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_intersection.html - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_intersection(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp); - - -/*! \p set_intersection constructs a sorted range that is the - * intersection of sorted ranges [first1, last1) and - * [first2, last2). The return value is the end of the - * output range. - * - * In the simplest case, \p set_intersection performs the - * "intersection" operation from set theory: the output range - * contains a copy of every element that is contained in both - * [first1, last1) and [first2, last2). The - * general case is more complicated, because the input ranges may - * contain duplicate elements. The generalization is that if a value - * appears \c m times in [first1, last1) and \c n times in - * [first2, last2) (where \c m may be zero), then it - * appears min(m,n) times in the output range. - * \p set_intersection is stable, meaning that both elements are - * copied from the first range rather than the second, and that the - * relative order of elements in the output range is the same as in - * the first input range. - * - * This version of \p set_intersection compares elements using a function object \p comp. - * - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. 
- * \param comp Comparison operator. - * \return The end of the output range. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. - * \pre The resulting range shall not overlap with either input range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * The following code snippet demonstrates how to use \p set_intersection to compute - * the set intersection of sets of integers sorted in descending order. - * - * \code - * #include - * ... - * int A1[6] = {11, 9, 7, 5, 3, 1}; - * int A2[7] = {13, 8, 5, 3, 2, 1, 1}; - * - * int result[3]; - * - * int *result_end = thrust::set_intersection(A1, A1 + 6, A2, A2 + 7, result, thrust::greater()); - * // result is now {5, 3, 1} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_intersection.html - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_intersection(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp); - - -/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric - * difference of the sorted ranges [first1, last1) and [first2, last2). - * The return value is the end of the output range. - * - * In the simplest case, \p set_symmetric_difference performs a set theoretic calculation: - * it constructs the union of the two sets A - B and B - A, where A and B are the two - * input ranges. That is, the output range contains a copy of every element that is - * contained in [first1, last1) but not [first2, last1), and a copy of - * every element that is contained in [first2, last2) but not [first1, last1). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [first1, last1) contains \c m elements that are - * equivalent to each other and [first2, last1) contains \c n elements that are - * equivalent to them, then |m - n| of those elements shall be copied to the output - * range: the last m - n elements from [first1, last1) if m > n, and - * the last n - m of these elements from [first2, last2) if m < n. - * - * This version of \p set_union compares elements using \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. 
- * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \return The end of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_symmetric_difference to compute - * the symmetric difference of two sets of integers sorted in ascending order using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * int A1[6] = {0, 1, 2, 2, 4, 6, 7}; - * int A2[5] = {1, 1, 2, 5, 8}; - * - * int result[6]; - * - * int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result); - * // result = {0, 4, 5, 6, 7, 8} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_symmetric_difference.html - * \see \p merge - * \see \p includes - * \see \p set_difference - * \see \p set_union - * \see \p set_intersection - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric - * difference of the sorted ranges [first1, last1) and [first2, last2). - * The return value is the end of the output range. - * - * In the simplest case, \p set_symmetric_difference performs a set theoretic calculation: - * it constructs the union of the two sets A - B and B - A, where A and B are the two - * input ranges. That is, the output range contains a copy of every element that is - * contained in [first1, last1) but not [first2, last1), and a copy of - * every element that is contained in [first2, last2) but not [first1, last1). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [first1, last1) contains \c m elements that are - * equivalent to each other and [first2, last1) contains \c n elements that are - * equivalent to them, then |m - n| of those elements shall be copied to the output - * range: the last m - n elements from [first1, last1) if m > n, and - * the last n - m of these elements from [first2, last2) if m < n. 
- *
- * This version of \p set_symmetric_difference compares elements using \c operator<.
- *
- * \param first1 The beginning of the first input range.
- * \param last1 The end of the first input range.
- * \param first2 The beginning of the second input range.
- * \param last2 The end of the second input range.
- * \param result The beginning of the output range.
- * \return The end of the output range.
- *
- * \tparam InputIterator1 is a model of Input Iterator,
- * \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- * \p InputIterator1's \c value_type is a model of LessThan Comparable,
- * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
- * and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- * \tparam InputIterator2 is a model of Input Iterator,
- * \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- * \p InputIterator2's \c value_type is a model of LessThan Comparable,
- * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
- * and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- * \tparam OutputIterator is a model of Output Iterator.
- *
- * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<.
- * \pre The resulting range shall not overlap with either input range.
- *
- * The following code snippet demonstrates how to use \p set_symmetric_difference to compute
- * the symmetric difference of two sets of integers sorted in ascending order.
- *
- * \code
- * #include <thrust/set_operations.h>
- * ...
- * int A1[7] = {0, 1, 2, 2, 4, 6, 7};
- * int A2[5] = {1, 1, 2, 5, 8};
- *
- * int result[8];
- *
- * int *result_end = thrust::set_symmetric_difference(A1, A1 + 7, A2, A2 + 5, result);
- * // result = {0, 1, 2, 4, 5, 6, 7, 8}
- * \endcode
- *
- * \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
- * \see \p merge
- * \see \p includes
- * \see \p set_difference
- * \see \p set_union
- * \see \p set_intersection
- * \see \p sort
- * \see \p is_sorted
- */
-template<typename InputIterator1, typename InputIterator2, typename OutputIterator>
-  OutputIterator set_symmetric_difference(InputIterator1 first1,
-                                           InputIterator1 last1,
-                                           InputIterator2 first2,
-                                           InputIterator2 last2,
-                                           OutputIterator result);
-
-
-/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric
- * difference of the sorted ranges [first1, last1) and [first2, last2).
- * The return value is the end of the output range.
- *
- * In the simplest case, \p set_symmetric_difference performs a set theoretic calculation:
- * it constructs the union of the two sets A - B and B - A, where A and B are the two
- * input ranges. That is, the output range contains a copy of every element that is
- * contained in [first1, last1) but not [first2, last2), and a copy of
- * every element that is contained in [first2, last2) but not [first1, last1).
- * The general case is more complicated, because the input ranges may contain duplicate elements.
- * The generalization is that if [first1, last1) contains \c m elements that are
- * equivalent to each other and [first2, last2) contains \c n elements that are
- * equivalent to them, then |m - n| of those elements shall be copied to the output
- * range: the last m - n elements from [first1, last1) if m > n, and
- * the last n - m of these elements from [first2, last2) if m < n.
- * - * This version of \p set_union compares elements using a function object \p comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \param comp Comparison operator. - * \return The end of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_symmetric_difference to compute - * the symmetric difference of two sets of integers sorted in descending order using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * int A1[6] = {7, 6, 4, 2, 2, 1, 0}; - * int A2[5] = {8, 5, 2, 1, 1}; - * - * int result[6]; - * - * int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result); - * // result = {8, 7, 6, 5, 4, 0} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_symmetric_difference.html - * \see \p merge - * \see \p includes - * \see \p set_difference - * \see \p set_union - * \see \p set_intersection - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_symmetric_difference(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp); - - -/*! \p set_symmetric_difference constructs a sorted range that is the set symmetric - * difference of the sorted ranges [first1, last1) and [first2, last2). - * The return value is the end of the output range. - * - * In the simplest case, \p set_symmetric_difference performs a set theoretic calculation: - * it constructs the union of the two sets A - B and B - A, where A and B are the two - * input ranges. That is, the output range contains a copy of every element that is - * contained in [first1, last1) but not [first2, last1), and a copy of - * every element that is contained in [first2, last2) but not [first1, last1). - * The general case is more complicated, because the input ranges may contain duplicate elements. 
- * The generalization is that if [first1, last1) contains \c m elements that are - * equivalent to each other and [first2, last1) contains \c n elements that are - * equivalent to them, then |m - n| of those elements shall be copied to the output - * range: the last m - n elements from [first1, last1) if m > n, and - * the last n - m of these elements from [first2, last2) if m < n. - * - * This version of \p set_union compares elements using a function object \p comp. - * - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \param comp Comparison operator. - * \return The end of the output range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_symmetric_difference to compute - * the symmetric difference of two sets of integers sorted in descending order. - * - * \code - * #include - * ... - * int A1[6] = {7, 6, 4, 2, 2, 1, 0}; - * int A2[5] = {8, 5, 2, 1, 1}; - * - * int result[6]; - * - * int *result_end = thrust::set_symmetric_difference(A1, A1 + 6, A2, A2 + 5, result); - * // result = {8, 7, 6, 5, 4, 0} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_symmetric_difference.html - * \see \p merge - * \see \p includes - * \see \p set_difference - * \see \p set_union - * \see \p set_intersection - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_symmetric_difference(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp); - - -/*! \p set_union constructs a sorted range that is the union of the sorted ranges - * [first1, last1) and [first2, last2). The return value is the - * end of the output range. - * - * In the simplest case, \p set_union performs the "union" operation from set - * theory: the output range contains a copy of every element that is contained in - * [first1, last1), [first2, last1), or both. The general case - * is more complicated, because the input ranges may contain duplicate elements. 
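To make the |m - n| duplicate rule above concrete, here is a small host-side sketch of thrust::set_symmetric_difference on illustrative data (not taken from the header):

#include <thrust/set_operations.h>
#include <thrust/execution_policy.h>
#include <cstdio>

int main()
{
  // Keep the elements that appear in exactly one of the two sorted inputs;
  // the single 2 shared by both ranges cancels out.
  int A1[4] = {0, 2, 4, 6};
  int A2[3] = {1, 2, 3};
  int result[5];

  int *result_end = thrust::set_symmetric_difference(thrust::host,
                                                     A1, A1 + 4,
                                                     A2, A2 + 3,
                                                     result);

  // Expect {0, 1, 3, 4, 6}.
  for (int *p = result; p != result_end; ++p)
    std::printf("%d ", *p);
  std::printf("\n");
  return 0;
}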
- * The generalization is that if [first1, last1) contains \c m elements
- * that are equivalent to each other and if [first2, last2) contains \c n
- * elements that are equivalent to them, then all \c m elements from the first
- * range shall be copied to the output range, in order, and then max(n - m, 0)
- * elements from the second range shall be copied to the output, in order.
- *
- * This version of \p set_union compares elements using \c operator<.
- *
- * The algorithm's execution is parallelized as determined by \p exec.
- *
- * \param exec The execution policy to use for parallelization.
- * \param first1 The beginning of the first input range.
- * \param last1 The end of the first input range.
- * \param first2 The beginning of the second input range.
- * \param last2 The end of the second input range.
- * \param result The beginning of the output range.
- * \return The end of the output range.
- *
- * \tparam DerivedPolicy The name of the derived execution policy.
- * \tparam InputIterator1 is a model of Input Iterator,
- * \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- * \p InputIterator1's \c value_type is a model of LessThan Comparable,
- * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
- * and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- * \tparam InputIterator2 is a model of Input Iterator,
- * \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- * \p InputIterator2's \c value_type is a model of LessThan Comparable,
- * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
- * and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- * \tparam OutputIterator is a model of Output Iterator.
- *
- * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<.
- * \pre The resulting range shall not overlap with either input range.
- *
- * The following code snippet demonstrates how to use \p set_union to compute the union of
- * two sets of integers sorted in ascending order using the \p thrust::host execution policy for
- * parallelization:
- *
- * \code
- * #include <thrust/set_operations.h>
- * #include <thrust/execution_policy.h>
- * ...
- * int A1[7] = {0, 2, 4, 6, 8, 10, 12};
- * int A2[5] = {1, 3, 5, 7, 9};
- *
- * int result[12];
- *
- * int *result_end = thrust::set_union(thrust::host, A1, A1 + 7, A2, A2 + 5, result);
- * // result = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
- * \endcode
- *
- * \see http://www.sgi.com/tech/stl/set_union.html
- * \see \p merge
- * \see \p includes
- * \see \p set_union
- * \see \p set_intersection
- * \see \p set_symmetric_difference
- * \see \p sort
- * \see \p is_sorted
- */
-template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator>
-  OutputIterator set_union(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                           InputIterator1 first1,
-                           InputIterator1 last1,
-                           InputIterator2 first2,
-                           InputIterator2 last2,
-                           OutputIterator result);
-
-
-/*! \p set_union constructs a sorted range that is the union of the sorted ranges
- * [first1, last1) and [first2, last2). The return value is the
- * end of the output range.
- *
- * In the simplest case, \p set_union performs the "union" operation from set
- * theory: the output range contains a copy of every element that is contained in
- * [first1, last1), [first2, last2), or both.
The general case - * is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [first1, last1) contains \c m elements - * that are equivalent to each other and if [first2, last2) contains \c n - * elements that are equivalent to them, then all \c m elements from the first - * range shall be copied to the output range, in order, and then max(n - m, 0) - * elements from the second range shall be copied to the output, in order. - * - * This version of \p set_union compares elements using \c operator<. - * - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \return The end of the output range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to operator<. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_union to compute the union of - * two sets of integers sorted in ascending order. - * - * \code - * #include - * ... - * int A1[6] = {0, 2, 4, 6, 8, 10, 12}; - * int A2[5] = {1, 3, 5, 7, 9}; - * - * int result[11]; - * - * int *result_end = thrust::set_union(A1, A1 + 6, A2, A2 + 5, result); - * // result = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_union.html - * \see \p merge - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_union(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -/*! \p set_union constructs a sorted range that is the union of the sorted ranges - * [first1, last1) and [first2, last2). The return value is the - * end of the output range. - * - * In the simplest case, \p set_union performs the "union" operation from set - * theory: the output range contains a copy of every element that is contained in - * [first1, last1), [first2, last1), or both. The general case - * is more complicated, because the input ranges may contain duplicate elements. 
- * The generalization is that if [first1, last1) contains \c m elements - * that are equivalent to each other and if [first2, last2) contains \c n - * elements that are equivalent to them, then all \c m elements from the first - * range shall be copied to the output range, in order, and then max(n - m, 0) - * elements from the second range shall be copied to the output, in order. - * - * This version of \p set_union compares elements using a function object \p comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \param comp Comparison operator. - * \return The end of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_union to compute the union of - * two sets of integers sorted in ascending order using the \p thrust::host execution policy for - * parallelization: - * - * \code - * #include - * #include - * #include - * ... - * int A1[6] = {12, 10, 8, 6, 4, 2, 0}; - * int A2[5] = {9, 7, 5, 3, 1}; - * - * int result[11]; - * - * int *result_end = thrust::set_union(thrust::host, A1, A1 + 6, A2, A2 + 5, result, thrust::greater()); - * // result = {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_union.html - * \see \p merge - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_union(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp); - - -/*! \p set_union constructs a sorted range that is the union of the sorted ranges - * [first1, last1) and [first2, last2). The return value is the - * end of the output range. - * - * In the simplest case, \p set_union performs the "union" operation from set - * theory: the output range contains a copy of every element that is contained in - * [first1, last1), [first2, last1), or both. The general case - * is more complicated, because the input ranges may contain duplicate elements. 
- * The generalization is that if [first1, last1) contains \c m elements - * that are equivalent to each other and if [first2, last2) contains \c n - * elements that are equivalent to them, then all \c m elements from the first - * range shall be copied to the output range, in order, and then max(n - m, 0) - * elements from the second range shall be copied to the output, in order. - * - * This version of \p set_union compares elements using a function object \p comp. - * - * \param first1 The beginning of the first input range. - * \param last1 The end of the first input range. - * \param first2 The beginning of the second input range. - * \param last2 The end of the second input range. - * \param result The beginning of the output range. - * \param comp Comparison operator. - * \return The end of the output range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type. - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type. - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [first1, last1) and [first2, last2) shall be sorted with respect to \p comp. - * \pre The resulting range shall not overlap with either input range. - * - * The following code snippet demonstrates how to use \p set_union to compute the union of - * two sets of integers sorted in ascending order. - * - * \code - * #include - * #include - * ... - * int A1[6] = {12, 10, 8, 6, 4, 2, 0}; - * int A2[5] = {9, 7, 5, 3, 1}; - * - * int result[11]; - * - * int *result_end = thrust::set_union(A1, A1 + 6, A2, A2 + 5, result, thrust::greater()); - * // result = {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0} - * \endcode - * - * \see http://www.sgi.com/tech/stl/set_union.html - * \see \p merge - * \see \p includes - * \see \p set_union - * \see \p set_intersection - * \see \p set_symmetric_difference - * \see \p sort - * \see \p is_sorted - */ -template - OutputIterator set_union(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakCompare comp); - - -/*! \p set_difference_by_key performs a key-value difference operation from set theory. - * \p set_difference_by_key constructs a sorted range that is the difference of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_difference_by_key performs the "difference" operation from set - * theory: the keys output range contains a copy of every element that is contained in - * [keys_first1, keys_last1) and not contained in [keys_first2, keys_last2). - * The general case is more complicated, because the input ranges may contain duplicate elements. 
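A short host-side sketch, on illustrative data, of how the "all m elements from the first range, then max(n - m, 0) from the second" rule for thrust::set_union plays out when a value is duplicated in both inputs:

#include <thrust/set_operations.h>
#include <thrust/execution_policy.h>
#include <cstdio>

int main()
{
  // 3 appears twice in A1 (m = 2) and three times in A2 (n = 3),
  // so the union contains it m + max(n - m, 0) = 3 times.
  int A1[4] = {1, 3, 3, 5};
  int A2[4] = {3, 3, 3, 7};
  int result[6];

  int *result_end = thrust::set_union(thrust::host,
                                      A1, A1 + 4,
                                      A2, A2 + 4,
                                      result);

  // Expect {1, 3, 3, 3, 5, 7}.
  for (int *p = result; p != result_end; ++p)
    std::printf("%d ", *p);
  std::printf("\n");
  return 0;
}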
- * The generalization is that if [keys_first1, keys_last1) contains \c m elements - * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n - * elements that are equivalent to them, the last max(m-n,0) elements from - * [keys_first1, keys_last1) range shall be copied to the output range. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_difference_by_key compares key elements using \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_difference_by_key to compute the - * set difference of two sets of integers sorted in ascending order with their values using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * int A_keys[6] = {0, 1, 3, 4, 5, 6, 9}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; - * - * int B_keys[5] = {1, 3, 5, 7, 9}; - * int B_vals[5] = {1, 1, 1, 1, 1}; - * - * int keys_result[3]; - * int vals_result[3]; - * - * thrust::pair end = thrust::set_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result); - * // keys_result is now {0, 4, 6} - * // vals_result is now {0, 0, 0} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_intersection_by_key - * \see \p set_symmetric_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_difference_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p set_difference_by_key performs a key-value difference operation from set theory. - * \p set_difference_by_key constructs a sorted range that is the difference of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_difference_by_key performs the "difference" operation from set - * theory: the keys output range contains a copy of every element that is contained in - * [keys_first1, keys_last1) and not contained in [keys_first2, keys_last2). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [keys_first1, keys_last1) contains \c m elements - * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n - * elements that are equivalent to them, the last max(m-n,0) elements from - * [keys_first1, keys_last1) range shall be copied to the output range. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_difference_by_key compares key elements using \c operator<. - * - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. 
- * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_difference_by_key to compute the - * set difference of two sets of integers sorted in ascending order with their values. - * - * \code - * #include - * ... - * int A_keys[6] = {0, 1, 3, 4, 5, 6, 9}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; - * - * int B_keys[5] = {1, 3, 5, 7, 9}; - * int B_vals[5] = {1, 1, 1, 1, 1}; - * - * int keys_result[3]; - * int vals_result[3]; - * - * thrust::pair end = thrust::set_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result); - * // keys_result is now {0, 4, 6} - * // vals_result is now {0, 0, 0} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_intersection_by_key - * \see \p set_symmetric_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_difference_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p set_difference_by_key performs a key-value difference operation from set theory. - * \p set_difference_by_key constructs a sorted range that is the difference of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_difference_by_key performs the "difference" operation from set - * theory: the keys output range contains a copy of every element that is contained in - * [keys_first1, keys_last1) and not contained in [keys_first2, keys_last2). - * The general case is more complicated, because the input ranges may contain duplicate elements. 
- * The generalization is that if [keys_first1, keys_last1) contains \c m elements - * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n - * elements that are equivalent to them, the last max(m-n,0) elements from - * [keys_first1, keys_last1) range shall be copied to the output range. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_difference_by_key compares key elements using a function object \p comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \param comp Comparison operator. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. - * \pre The resulting ranges shall not overlap with any input range. 
- * - * The following code snippet demonstrates how to use \p set_difference_by_key to compute the - * set difference of two sets of integers sorted in descending order with their values using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * int A_keys[6] = {9, 6, 5, 4, 3, 1, 0}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; - * - * int B_keys[5] = {9, 7, 5, 3, 1}; - * int B_vals[5] = {1, 1, 1, 1, 1}; - * - * int keys_result[3]; - * int vals_result[3]; - * - * thrust::pair end = thrust::set_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater()); - * // keys_result is now {0, 4, 6} - * // vals_result is now {0, 0, 0} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_intersection_by_key - * \see \p set_symmetric_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_difference_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp); - - -/*! \p set_difference_by_key performs a key-value difference operation from set theory. - * \p set_difference_by_key constructs a sorted range that is the difference of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_difference_by_key performs the "difference" operation from set - * theory: the keys output range contains a copy of every element that is contained in - * [keys_first1, keys_last1) and not contained in [keys_first2, keys_last2). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [keys_first1, keys_last1) contains \c m elements - * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n - * elements that are equivalent to them, the last max(m-n,0) elements from - * [keys_first1, keys_last1) range shall be copied to the output range. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_difference_by_key compares key elements using a function object \p comp. - * - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \param comp Comparison operator. 
- * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_difference_by_key to compute the - * set difference of two sets of integers sorted in descending order with their values. - * - * \code - * #include - * #include - * ... - * int A_keys[6] = {9, 6, 5, 4, 3, 1, 0}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; - * - * int B_keys[5] = {9, 7, 5, 3, 1}; - * int B_vals[5] = {1, 1, 1, 1, 1}; - * - * int keys_result[3]; - * int vals_result[3]; - * - * thrust::pair end = thrust::set_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater()); - * // keys_result is now {0, 4, 6} - * // vals_result is now {0, 0, 0} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_intersection_by_key - * \see \p set_symmetric_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_difference_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp); - - -/*! \p set_intersection_by_key performs a key-value intersection operation from set theory. - * \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. 
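A minimal host-side sketch of thrust::set_difference_by_key on illustrative keys and values, following the argument order documented above; each key kept from the first range carries its value from A_vals:

#include <thrust/set_operations.h>
#include <thrust/execution_policy.h>
#include <thrust/pair.h>
#include <cstdio>

int main()
{
  // Keys present in A but not in B survive, together with their A values.
  int A_keys[4] = {0, 1, 3, 4};
  int A_vals[4] = {10, 11, 13, 14};
  int B_keys[2] = {1, 3};
  int B_vals[2] = {91, 93};

  int keys_result[2];
  int vals_result[2];

  thrust::pair<int*, int*> end =
      thrust::set_difference_by_key(thrust::host,
                                    A_keys, A_keys + 4,
                                    B_keys, B_keys + 2,
                                    A_vals, B_vals,
                                    keys_result, vals_result);

  // Expect keys {0, 4} paired with values {10, 14}.
  for (int *k = keys_result, *v = vals_result; k != end.first; ++k, ++v)
    std::printf("(%d, %d) ", *k, *v);
  std::printf("\n");
  return 0;
}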
- * - * In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set - * theory: the keys output range contains a copy of every element that is contained in both - * [keys_first1, keys_last1) [keys_first2, keys_last2). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if an element appears \c m times in [keys_first1, keys_last1) - * and \c n times in [keys_first2, keys_last2) (where \c m may be zero), then it - * appears min(m,n) times in the keys output range. - * \p set_intersection_by_key is stable, meaning both that elements are copied from the first - * input range rather than the second, and that the relative order of elements in the output range - * is the same as the first input range. - * - * Each time a key element is copied from [keys_first1, keys_last1) to the keys output range, - * the corresponding value element is copied from [values_first1, values_last1) to the values - * output range. - * - * This version of \p set_intersection_by_key compares objects using \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no - * \c values_first2 parameter because elements from the second input range are never copied to the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. 
- * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_intersection_by_key to compute the - * set intersection of two sets of integers sorted in ascending order with their values using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * int A_keys[6] = {1, 3, 5, 7, 9, 11}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0}; - * - * int B_keys[7] = {1, 1, 2, 3, 5, 8, 13}; - * - * int keys_result[7]; - * int vals_result[7]; - * - * thrust::pair end = thrust::set_intersection_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result); - * - * // keys_result is now {1, 3, 5} - * // vals_result is now {0, 0, 0} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_difference_by_key - * \see \p set_symmetric_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_intersection_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p set_intersection_by_key performs a key-value intersection operation from set theory. - * \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set - * theory: the keys output range contains a copy of every element that is contained in both - * [keys_first1, keys_last1) [keys_first2, keys_last2). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if an element appears \c m times in [keys_first1, keys_last1) - * and \c n times in [keys_first2, keys_last2) (where \c m may be zero), then it - * appears min(m,n) times in the keys output range. - * \p set_intersection_by_key is stable, meaning both that elements are copied from the first - * input range rather than the second, and that the relative order of elements in the output range - * is the same as the first input range. - * - * Each time a key element is copied from [keys_first1, keys_last1) to the keys output range, - * the corresponding value element is copied from [values_first1, values_last1) to the values - * output range. - * - * This version of \p set_intersection_by_key compares objects using \c operator<. - * - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. 
- * - * \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no - * \c values_first2 parameter because elements from the second input range are never copied to the output range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_intersection_by_key to compute the - * set intersection of two sets of integers sorted in ascending order with their values. - * - * \code - * #include - * ... - * int A_keys[6] = {1, 3, 5, 7, 9, 11}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0}; - * - * int B_keys[7] = {1, 1, 2, 3, 5, 8, 13}; - * - * int keys_result[7]; - * int vals_result[7]; - * - * thrust::pair end = thrust::set_intersection_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result); - * - * // keys_result is now {1, 3, 5} - * // vals_result is now {0, 0, 0} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_difference_by_key - * \see \p set_symmetric_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_intersection_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p set_intersection_by_key performs a key-value intersection operation from set theory. - * \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set - * theory: the keys output range contains a copy of every element that is contained in both - * [keys_first1, keys_last1) [keys_first2, keys_last2). - * The general case is more complicated, because the input ranges may contain duplicate elements. 
- * The generalization is that if an element appears \c m times in [keys_first1, keys_last1) - * and \c n times in [keys_first2, keys_last2) (where \c m may be zero), then it - * appears min(m,n) times in the keys output range. - * \p set_intersection_by_key is stable, meaning both that elements are copied from the first - * input range rather than the second, and that the relative order of elements in the output range - * is the same as the first input range. - * - * Each time a key element is copied from [keys_first1, keys_last1) to the keys output range, - * the corresponding value element is copied from [values_first1, values_last1) to the values - * output range. - * - * This version of \p set_intersection_by_key compares objects using a function object \p comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \param comp Comparison operator. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no - * \c values_first2 parameter because elements from the second input range are never copied to the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. - * \pre The resulting ranges shall not overlap with any input range. 
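The host snippet that follows uses raw arrays and an explicit execution policy. As a complementary, minimal sketch (data and names invented for illustration), the same min(m,n) duplicate behaviour can be observed with thrust::device_vector, where the backend is deduced from the iterator types instead of an explicit policy:

#include <thrust/set_operations.h>
#include <thrust/device_vector.h>
#include <thrust/pair.h>
#include <cstdio>

int main(void)
{
  // key 3 appears twice in A and once in B, so it survives min(2,1) = 1 time
  int a_keys[5] = {1, 3, 3, 5, 9};
  int a_vals[5] = {10, 30, 31, 50, 90};
  int b_keys[4] = {3, 5, 7, 9};

  thrust::device_vector<int> A_keys(a_keys, a_keys + 5), A_vals(a_vals, a_vals + 5);
  thrust::device_vector<int> B_keys(b_keys, b_keys + 4);
  thrust::device_vector<int> keys_out(5), vals_out(5);   // worst-case size

  typedef thrust::device_vector<int>::iterator Iter;
  thrust::pair<Iter, Iter> end =
    thrust::set_intersection_by_key(A_keys.begin(), A_keys.end(),
                                    B_keys.begin(), B_keys.end(),
                                    A_vals.begin(),
                                    keys_out.begin(), vals_out.begin());

  int n = end.first - keys_out.begin();
  for (int i = 0; i < n; i++)              // prints 3:30, 5:50, 9:90
    printf("%d:%d\n", (int) keys_out[i], (int) vals_out[i]);
  return 0;
}

The returned pair is the only way to learn how many elements were produced, so the output vectors are sized for the worst case here and only the first n entries are read back.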
- * - * The following code snippet demonstrates how to use \p set_intersection_by_key to compute the - * set intersection of two sets of integers sorted in descending order with their values using the - * \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * int A_keys[6] = {11, 9, 7, 5, 3, 1}; - * int A_vals[6] = { 0, 0, 0, 0, 0, 0}; - * - * int B_keys[7] = {13, 8, 5, 3, 2, 1, 1}; - * - * int keys_result[7]; - * int vals_result[7]; - * - * thrust::pair end = thrust::set_intersection_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result, thrust::greater()); - * - * // keys_result is now {5, 3, 1} - * // vals_result is now {0, 0, 0} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_difference_by_key - * \see \p set_symmetric_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_intersection_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp); - - -/*! \p set_intersection_by_key performs a key-value intersection operation from set theory. - * \p set_intersection_by_key constructs a sorted range that is the intersection of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_intersection_by_key performs the "intersection" operation from set - * theory: the keys output range contains a copy of every element that is contained in both - * [keys_first1, keys_last1) [keys_first2, keys_last2). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if an element appears \c m times in [keys_first1, keys_last1) - * and \c n times in [keys_first2, keys_last2) (where \c m may be zero), then it - * appears min(m,n) times in the keys output range. - * \p set_intersection_by_key is stable, meaning both that elements are copied from the first - * input range rather than the second, and that the relative order of elements in the output range - * is the same as the first input range. - * - * Each time a key element is copied from [keys_first1, keys_last1) to the keys output range, - * the corresponding value element is copied from [values_first1, values_last1) to the values - * output range. - * - * This version of \p set_intersection_by_key compares objects using a function object \p comp. - * - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \param comp Comparison operator. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. 
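Both key ranges must already be ordered by the same comparator that is handed to the algorithm. A brief sketch of a typical call site that establishes this precondition first (the data and names are illustrative, not part of the interface):

#include <thrust/set_operations.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>
#include <thrust/pair.h>

void sorted_then_intersected(void)
{
  // unsorted inputs: sort them (keys together with their values) with the
  // same comparator the set operation will use, then intersect
  int A_keys[4] = {7, 1, 9, 5};
  int A_vals[4] = {70, 10, 90, 50};
  int B_keys[3] = {5, 9, 2};

  thrust::greater<int> comp;                       // descending order throughout
  thrust::sort_by_key(thrust::host, A_keys, A_keys + 4, A_vals, comp);
  thrust::sort(thrust::host, B_keys, B_keys + 3, comp);

  int keys_out[3];
  int vals_out[3];
  thrust::pair<int*,int*> end =
    thrust::set_intersection_by_key(thrust::host,
                                    A_keys, A_keys + 4,
                                    B_keys, B_keys + 3,
                                    A_vals,
                                    keys_out, vals_out, comp);
  // keys_out is now {9, 5}, vals_out is now {90, 50}; end.first == keys_out + 2
}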
- * - * \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no - * \c values_first2 parameter because elements from the second input range are never copied to the output range. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_intersection_by_key to compute the - * set intersection of two sets of integers sorted in descending order with their values. - * - * \code - * #include - * #include - * ... - * int A_keys[6] = {11, 9, 7, 5, 3, 1}; - * int A_vals[6] = { 0, 0, 0, 0, 0, 0}; - * - * int B_keys[7] = {13, 8, 5, 3, 2, 1, 1}; - * - * int keys_result[7]; - * int vals_result[7]; - * - * thrust::pair end = thrust::set_intersection_by_key(A_keys, A_keys + 6, B_keys, B_keys + 7, A_vals, keys_result, vals_result, thrust::greater()); - * - * // keys_result is now {5, 3, 1} - * // vals_result is now {0, 0, 0} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_difference_by_key - * \see \p set_symmetric_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_intersection_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp); - - -/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory. - * \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation: - * it constructs the union of the two sets A - B and B - A, where A and B are the two - * input ranges. 
That is, the output range contains a copy of every element that is - * contained in [keys_first1, keys_last1) but not [keys_first2, keys_last1), and a copy of - * every element that is contained in [keys_first2, keys_last2) but not [keys_first1, keys_last1). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [keys_first1, keys_last1) contains \c m elements that are - * equivalent to each other and [keys_first2, keys_last1) contains \c n elements that are - * equivalent to them, then |m - n| of those elements shall be copied to the output - * range: the last m - n elements from [keys_first1, keys_last1) if m > n, and - * the last n - m of these elements from [keys_first2, keys_last2) if m < n. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_symmetric_difference_by_key compares key elements using \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. 
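Because the number of produced key/value pairs is only known once the call returns, a common pattern is to over-allocate the result by the combined input size and then shrink it using the returned iterator pair. A hedged sketch of that pattern with thrust::device_vector (the helper name and the choice of int payloads are illustrative):

#include <thrust/set_operations.h>
#include <thrust/device_vector.h>
#include <thrust/pair.h>

void symmetric_difference_trimmed(const thrust::device_vector<int>& A_keys,
                                  const thrust::device_vector<int>& A_vals,
                                  const thrust::device_vector<int>& B_keys,
                                  const thrust::device_vector<int>& B_vals,
                                  thrust::device_vector<int>& keys_out,
                                  thrust::device_vector<int>& vals_out)
{
  // the symmetric difference can never be larger than |A| + |B|
  keys_out.resize(A_keys.size() + B_keys.size());
  vals_out.resize(keys_out.size());

  typedef thrust::device_vector<int>::iterator Iter;
  thrust::pair<Iter, Iter> end =
    thrust::set_symmetric_difference_by_key(A_keys.begin(), A_keys.end(),
                                            B_keys.begin(), B_keys.end(),
                                            A_vals.begin(), B_vals.begin(),
                                            keys_out.begin(), vals_out.begin());

  // drop the unused tail so the vectors hold exactly the result
  keys_out.erase(end.first, keys_out.end());
  vals_out.erase(end.second, vals_out.end());
}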
- * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the - * symmetric difference of two sets of integers sorted in ascending order with their values using the - * \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * int A_keys[6] = {0, 1, 2, 2, 4, 6, 7}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; - * - * int B_keys[5] = {1, 1, 2, 5, 8}; - * int B_vals[5] = {1, 1, 1, 1, 1}; - * - * int keys_result[6]; - * int vals_result[6]; - * - * thrust::pair end = thrust::set_symmetric_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result); - * // keys_result is now {0, 4, 5, 6, 7, 8} - * // vals_result is now {0, 0, 1, 0, 0, 1} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_intersection_by_key - * \see \p set_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_symmetric_difference_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory. - * \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation: - * it constructs the union of the two sets A - B and B - A, where A and B are the two - * input ranges. That is, the output range contains a copy of every element that is - * contained in [keys_first1, keys_last1) but not [keys_first2, keys_last1), and a copy of - * every element that is contained in [keys_first2, keys_last2) but not [keys_first1, keys_last1). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [keys_first1, keys_last1) contains \c m elements that are - * equivalent to each other and [keys_first2, keys_last1) contains \c n elements that are - * equivalent to them, then |m - n| of those elements shall be copied to the output - * range: the last m - n elements from [keys_first1, keys_last1) if m > n, and - * the last n - m of these elements from [keys_first2, keys_last2) if m < n. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_symmetric_difference_by_key compares key elements using \c operator<. - * - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. 
- * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the - * symmetric difference of two sets of integers sorted in ascending order with their values. - * - * \code - * #include - * ... - * int A_keys[6] = {0, 1, 2, 2, 4, 6, 7}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; - * - * int B_keys[5] = {1, 1, 2, 5, 8}; - * int B_vals[5] = {1, 1, 1, 1, 1}; - * - * int keys_result[6]; - * int vals_result[6]; - * - * thrust::pair end = thrust::set_symmetric_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result); - * // keys_result is now {0, 4, 5, 6, 7, 8} - * // vals_result is now {0, 0, 1, 0, 0, 1} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_intersection_by_key - * \see \p set_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_symmetric_difference_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory. - * \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). 
Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation: - * it constructs the union of the two sets A - B and B - A, where A and B are the two - * input ranges. That is, the output range contains a copy of every element that is - * contained in [keys_first1, keys_last1) but not [keys_first2, keys_last1), and a copy of - * every element that is contained in [keys_first2, keys_last2) but not [keys_first1, keys_last1). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [keys_first1, keys_last1) contains \c m elements that are - * equivalent to each other and [keys_first2, keys_last1) contains \c n elements that are - * equivalent to them, then |m - n| of those elements shall be copied to the output - * range: the last m - n elements from [keys_first1, keys_last1) if m > n, and - * the last n - m of these elements from [keys_first2, keys_last2) if m < n. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_symmetric_difference_by_key compares key elements using a function object \c comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \param comp Comparison operator. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. 
- * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the - * symmetric difference of two sets of integers sorted in descending order with their values using the - * \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * int A_keys[6] = {7, 6, 4, 2, 2, 1, 0}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; - * - * int B_keys[5] = {8, 5, 2, 1, 1}; - * int B_vals[5] = {1, 1, 1, 1, 1}; - * - * int keys_result[6]; - * int vals_result[6]; - * - * thrust::pair end = thrust::set_symmetric_difference_by_key(thrust::host, A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result); - * // keys_result is now {8, 7, 6, 5, 4, 0} - * // vals_result is now {1, 0, 0, 1, 0, 0} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_intersection_by_key - * \see \p set_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_symmetric_difference_by_key(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp); - - -/*! \p set_symmetric_difference_by_key performs a key-value symmetric difference operation from set theory. - * \p set_difference_by_key constructs a sorted range that is the symmetric difference of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_symmetric_difference_by_key performs a set theoretic calculation: - * it constructs the union of the two sets A - B and B - A, where A and B are the two - * input ranges. That is, the output range contains a copy of every element that is - * contained in [keys_first1, keys_last1) but not [keys_first2, keys_last1), and a copy of - * every element that is contained in [keys_first2, keys_last2) but not [keys_first1, keys_last1). - * The general case is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [keys_first1, keys_last1) contains \c m elements that are - * equivalent to each other and [keys_first2, keys_last1) contains \c n elements that are - * equivalent to them, then |m - n| of those elements shall be copied to the output - * range: the last m - n elements from [keys_first1, keys_last1) if m > n, and - * the last n - m of these elements from [keys_first2, keys_last2) if m < n. 
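To make the |m - n| rule just described concrete, here is a small illustrative sketch (invented data, using the operator< overload for brevity) in which the key 2 occurs three times in the first range and once in the second:

#include <thrust/set_operations.h>
#include <thrust/pair.h>

void abs_difference_of_counts(void)
{
  int A_keys[4] = {2, 2, 2, 5};
  int A_vals[4] = {10, 11, 12, 13};
  int B_keys[2] = {2, 7};
  int B_vals[2] = {20, 21};

  int keys_out[4];
  int vals_out[4];

  // key 2: m = 3 in A, n = 1 in B, so |m - n| = 2 copies survive,
  // taken from the last two of A's equivalent elements (values 11 and 12)
  thrust::pair<int*,int*> end =
    thrust::set_symmetric_difference_by_key(A_keys, A_keys + 4,
                                            B_keys, B_keys + 2,
                                            A_vals, B_vals,
                                            keys_out, vals_out);
  // keys_out is now {2, 2, 5, 7}
  // vals_out is now {11, 12, 13, 21}
  // end.first == keys_out + 4
}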
- * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_symmetric_difference_by_key compares key elements using a function object \c comp. - * - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \param comp Comparison operator. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the - * symmetric difference of two sets of integers sorted in descending order with their values. - * - * \code - * #include - * #include - * ... 
- * int A_keys[6] = {7, 6, 4, 2, 2, 1, 0}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; - * - * int B_keys[5] = {8, 5, 2, 1, 1}; - * int B_vals[5] = {1, 1, 1, 1, 1}; - * - * int keys_result[6]; - * int vals_result[6]; - * - * thrust::pair end = thrust::set_symmetric_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result); - * // keys_result is now {8, 7, 6, 5, 4, 0} - * // vals_result is now {1, 0, 0, 1, 0, 0} - * \endcode - * - * \see \p set_union_by_key - * \see \p set_intersection_by_key - * \see \p set_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_symmetric_difference_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp); - - -/*! \p set_union_by_key performs a key-value union operation from set theory. - * \p set_union_by_key constructs a sorted range that is the union of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_union_by_key performs the "union" operation from set theory: - * the output range contains a copy of every element that is contained in - * [keys_first1, keys_last1), [keys_first2, keys_last1), or both. The general case - * is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [keys_first1, keys_last1) contains \c m elements - * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n - * elements that are equivalent to them, then all \c m elements from the first - * range shall be copied to the output range, in order, and then max(n - m, 0) - * elements from the second range shall be copied to the output, in order. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_union_by_key compares key elements using \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam DerivedPolicy The name of the derived execution policy. 
- * \tparam InputIterator1 is a model of Input Iterator,
- *         \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *         \p InputIterator1's \c value_type is a model of LessThan Comparable,
- *         the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
- *         and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
- * \tparam InputIterator2 is a model of Input Iterator,
- *         \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *         \p InputIterator2's \c value_type is a model of LessThan Comparable,
- *         the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
- *         and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
- * \tparam InputIterator3 is a model of Input Iterator,
- *         and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- * \tparam InputIterator4 is a model of Input Iterator,
- *         and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- * \tparam OutputIterator1 is a model of Output Iterator.
- * \tparam OutputIterator2 is a model of Output Iterator.
- *
- * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<.
- * \pre The resulting ranges shall not overlap with any input range.
- *
- * The following code snippet demonstrates how to use \p set_union_by_key to compute the
- * union of two sets of integers sorted in ascending order with their values using the
- * \p thrust::host execution policy for parallelization:
- *
- * \code
- * #include <thrust/set_operations.h>
- * #include <thrust/execution_policy.h>
- * ...
- * int A_keys[7] = {0, 2, 4, 6, 8, 10, 12};
- * int A_vals[7] = {0, 0, 0, 0, 0,  0,  0};
- *
- * int B_keys[5] = {1, 3, 5, 7, 9};
- * int B_vals[5] = {1, 1, 1, 1, 1};
- *
- * int keys_result[12];
- * int vals_result[12];
- *
- * thrust::pair<int*,int*> end = thrust::set_union_by_key(thrust::host, A_keys, A_keys + 7, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result);
- * // keys_result is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
- * // vals_result is now {0, 1, 0, 1, 0, 1, 0, 1, 0, 1,  0,  0}
- * \endcode
- *
- * \see \p set_symmetric_difference_by_key
- * \see \p set_intersection_by_key
- * \see \p set_difference_by_key
- * \see \p sort_by_key
- * \see \p is_sorted
- */
-template<typename DerivedPolicy,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename InputIterator3,
-         typename InputIterator4,
-         typename OutputIterator1,
-         typename OutputIterator2>
-  thrust::pair<OutputIterator1, OutputIterator2>
-    set_union_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                     InputIterator1 keys_first1,
-                     InputIterator1 keys_last1,
-                     InputIterator2 keys_first2,
-                     InputIterator2 keys_last2,
-                     InputIterator3 values_first1,
-                     InputIterator4 values_first2,
-                     OutputIterator1 keys_result,
-                     OutputIterator2 values_result);
-
-
-/*! \p set_union_by_key performs a key-value union operation from set theory.
- * \p set_union_by_key constructs a sorted range that is the union of the sorted
- * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated
- * with each element from the input and output key ranges is a value element. The associated input
- * value ranges need not be sorted.
- *
- * In the simplest case, \p set_union_by_key performs the "union" operation from set theory:
- * the output range contains a copy of every element that is contained in
- * [keys_first1, keys_last1), [keys_first2, keys_last2), or both.
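The duplicate-key rule spelled out in the next paragraph differs from the intersection and symmetric-difference rules: all equivalent keys of the first range survive, plus any surplus from the second. A small illustrative sketch (invented data) in which the key 3 occurs twice in the first range and three times in the second:

#include <thrust/set_operations.h>
#include <thrust/execution_policy.h>
#include <thrust/pair.h>

void union_duplicate_rule(void)
{
  int A_keys[3] = {1, 3, 3};
  int A_vals[3] = {10, 30, 31};
  int B_keys[4] = {3, 3, 3, 4};
  int B_vals[4] = {90, 91, 92, 40};

  int keys_out[5];
  int vals_out[5];

  // key 3: m = 2 copies from A (values 30, 31), then max(3 - 2, 0) = 1 copy from B
  thrust::pair<int*,int*> end =
    thrust::set_union_by_key(thrust::host,
                             A_keys, A_keys + 3,
                             B_keys, B_keys + 4,
                             A_vals, B_vals,
                             keys_out, vals_out);
  // keys_out is now {1, 3, 3, 3, 4}
  // vals_out is now {10, 30, 31, 92, 40}
}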
The general case - * is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [keys_first1, keys_last1) contains \c m elements - * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n - * elements that are equivalent to them, then all \c m elements from the first - * range shall be copied to the output range, in order, and then max(n - m, 0) - * elements from the second range shall be copied to the output, in order. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_union_by_key compares key elements using \c operator<. - * - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to operator<. - * \pre The resulting ranges shall not overlap with any input range. - * - * The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the - * symmetric difference of two sets of integers sorted in ascending order with their values. - * - * \code - * #include - * ... 
- * int A_keys[6] = {0, 2, 4, 6, 8, 10, 12}; - * int A_vals[6] = {0, 0, 0, 0, 0, 0, 0}; - * - * int B_keys[5] = {1, 3, 5, 7, 9}; - * int B_vals[5] = {1, 1, 1, 1, 1}; - * - * int keys_result[11]; - * int vals_result[11]; - * - * thrust::pair end = thrust::set_symmetric_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result); - * // keys_result is now {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12} - * // vals_result is now {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0} - * \endcode - * - * \see \p set_symmetric_difference_by_key - * \see \p set_intersection_by_key - * \see \p set_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_union_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p set_union_by_key performs a key-value union operation from set theory. - * \p set_union_by_key constructs a sorted range that is the union of the sorted - * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated - * with each element from the input and output key ranges is a value element. The associated input - * value ranges need not be sorted. - * - * In the simplest case, \p set_union_by_key performs the "union" operation from set theory: - * the output range contains a copy of every element that is contained in - * [keys_first1, keys_last1), [keys_first2, keys_last1), or both. The general case - * is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [keys_first1, keys_last1) contains \c m elements - * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n - * elements that are equivalent to them, then all \c m elements from the first - * range shall be copied to the output range, in order, and then max(n - m, 0) - * elements from the second range shall be copied to the output, in order. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_union_by_key compares key elements using a function object \c comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \param comp Comparison operator. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam DerivedPolicy The name of the derived execution policy. 
- * \tparam InputIterator1 is a model of Input Iterator,
- *         \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *         \p InputIterator1's \c value_type is a model of LessThan Comparable,
- *         the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
- *         and \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
- * \tparam InputIterator2 is a model of Input Iterator,
- *         \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *         \p InputIterator2's \c value_type is a model of LessThan Comparable,
- *         the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements,
- *         and \p InputIterator2's \c value_type is convertible to a type in \p OutputIterator1's set of \c value_types.
- * \tparam InputIterator3 is a model of Input Iterator,
- *         and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- * \tparam InputIterator4 is a model of Input Iterator,
- *         and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- * \tparam OutputIterator1 is a model of Output Iterator.
- * \tparam OutputIterator2 is a model of Output Iterator.
- * \tparam StrictWeakCompare is a model of Strict Weak Ordering.
- *
- * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp.
- * \pre The resulting ranges shall not overlap with any input range.
- *
- * The following code snippet demonstrates how to use \p set_union_by_key to compute the
- * union of two sets of integers sorted in descending order with their values using the
- * \p thrust::host execution policy for parallelization:
- *
- * \code
- * #include <thrust/set_operations.h>
- * #include <thrust/functional.h>
- * #include <thrust/execution_policy.h>
- * ...
- * int A_keys[7] = {12, 10, 8, 6, 4, 2, 0};
- * int A_vals[7] = { 0,  0, 0, 0, 0, 0, 0};
- *
- * int B_keys[5] = {9, 7, 5, 3, 1};
- * int B_vals[5] = {1, 1, 1, 1, 1};
- *
- * int keys_result[12];
- * int vals_result[12];
- *
- * thrust::pair<int*,int*> end = thrust::set_union_by_key(thrust::host, A_keys, A_keys + 7, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater<int>());
- * // keys_result is now {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
- * // vals_result is now { 0,  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}
- * \endcode
- *
- * \see \p set_symmetric_difference_by_key
- * \see \p set_intersection_by_key
- * \see \p set_difference_by_key
- * \see \p sort_by_key
- * \see \p is_sorted
- */
-template<typename DerivedPolicy,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename InputIterator3,
-         typename InputIterator4,
-         typename OutputIterator1,
-         typename OutputIterator2,
-         typename StrictWeakCompare>
-  thrust::pair<OutputIterator1, OutputIterator2>
-    set_union_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                     InputIterator1 keys_first1,
-                     InputIterator1 keys_last1,
-                     InputIterator2 keys_first2,
-                     InputIterator2 keys_last2,
-                     InputIterator3 values_first1,
-                     InputIterator4 values_first2,
-                     OutputIterator1 keys_result,
-                     OutputIterator2 values_result,
-                     StrictWeakCompare comp);
-
-
-/*! \p set_union_by_key performs a key-value union operation from set theory.
- * \p set_union_by_key constructs a sorted range that is the union of the sorted
- * ranges [keys_first1, keys_last1) and [keys_first2, keys_last2). Associated
- * with each element from the input and output key ranges is a value element. The associated input
- * value ranges need not be sorted.
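For the comparator overload declared above, the comparison object has to agree with the order of both key ranges. A short sketch with descending inputs and thrust::greater (data and names invented for illustration):

#include <thrust/set_operations.h>
#include <thrust/functional.h>
#include <thrust/pair.h>

void descending_union(void)
{
  int A_keys[4] = {9, 5, 5, 2};
  int A_vals[4] = {1, 2, 3, 4};
  int B_keys[3] = {8, 5, 3};
  int B_vals[3] = {7, 8, 9};

  int keys_out[6];
  int vals_out[6];

  // key 5 appears twice in A and once in B, so both copies come from A
  thrust::pair<int*,int*> end =
    thrust::set_union_by_key(A_keys, A_keys + 4,
                             B_keys, B_keys + 3,
                             A_vals, B_vals,
                             keys_out, vals_out,
                             thrust::greater<int>());
  // keys_out is now {9, 8, 5, 5, 3, 2}
  // vals_out is now {1, 7, 2, 3, 9, 4}
}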
- * - * In the simplest case, \p set_union_by_key performs the "union" operation from set theory: - * the output range contains a copy of every element that is contained in - * [keys_first1, keys_last1), [keys_first2, keys_last1), or both. The general case - * is more complicated, because the input ranges may contain duplicate elements. - * The generalization is that if [keys_first1, keys_last1) contains \c m elements - * that are equivalent to each other and if [keys_first2, keys_last2) contains \c n - * elements that are equivalent to them, then all \c m elements from the first - * range shall be copied to the output range, in order, and then max(n - m, 0) - * elements from the second range shall be copied to the output, in order. - * - * Each time a key element is copied from [keys_first1, keys_last1) or - * [keys_first2, keys_last2) is copied to the keys output range, the - * corresponding value element is copied from the corresponding values input range (beginning at - * \p values_first1 or \p values_first2) to the values output range. - * - * This version of \p set_union_by_key compares key elements using a function object \c comp. - * - * \param keys_first1 The beginning of the first input range of keys. - * \param keys_last1 The end of the first input range of keys. - * \param keys_first2 The beginning of the second input range of keys. - * \param keys_last2 The end of the second input range of keys. - * \param values_first1 The beginning of the first input range of values. - * \param values_first2 The beginning of the first input range of values. - * \param keys_result The beginning of the output range of keys. - * \param values_result The beginning of the output range of values. - * \param comp Comparison operator. - * \return A \p pair \c p such that p.first is the end of the output range of keys, - * and such that p.second is the end of the output range of values. - * - * \tparam InputIterator1 is a model of Input Iterator, - * \p InputIterator1 and \p InputIterator2 have the same \c value_type, - * \p InputIterator1's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator2 is a model of Input Iterator, - * \p InputIterator2 and \p InputIterator1 have the same \c value_type, - * \p InputIterator2's \c value_type is a model of LessThan Comparable, - * the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the LessThan Comparable requirements, - * and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types. - * \tparam InputIterator3 is a model of Input Iterator, - * and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam InputIterator4 is a model of Input Iterator, - * and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types. - * \tparam OutputIterator1 is a model of Output Iterator. - * \tparam OutputIterator2 is a model of Output Iterator. - * \tparam StrictWeakCompare is a model of Strict Weak Ordering. - * - * \pre The ranges [keys_first1, keys_last1) and [keys_first2, keys_last2) shall be sorted with respect to \p comp. - * \pre The resulting ranges shall not overlap with any input range. 
- * - * The following code snippet demonstrates how to use \p set_symmetric_difference_by_key to compute the - * symmetric difference of two sets of integers sorted in descending order with their values. - * - * \code - * #include - * #include - * ... - * int A_keys[6] = {12, 10, 8, 6, 4, 2, 0}; - * int A_vals[6] = { 0, 0, 0, 0, 0, 0, 0}; - * - * int B_keys[5] = {9, 7, 5, 3, 1}; - * int B_vals[5] = {1, 1, 1, 1, 1}; - * - * int keys_result[11]; - * int vals_result[11]; - * - * thrust::pair end = thrust::set_symmetric_difference_by_key(A_keys, A_keys + 6, B_keys, B_keys + 5, A_vals, B_vals, keys_result, vals_result, thrust::greater()); - * // keys_result is now {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0} - * // vals_result is now { 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0} - * \endcode - * - * \see \p set_symmetric_difference_by_key - * \see \p set_intersection_by_key - * \see \p set_difference_by_key - * \see \p sort_by_key - * \see \p is_sorted - */ -template - thrust::pair - set_union_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakCompare comp); - - -/*! \} // end set_operations - */ - - -} // end thrust - -#include - diff --git a/compat/thrust/sort.h b/compat/thrust/sort.h deleted file mode 100644 index e8edfcd876..0000000000 --- a/compat/thrust/sort.h +++ /dev/null @@ -1,1349 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file sort.h - * \brief Functions for reorganizing ranges into sorted order - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup sorting - * \ingroup algorithms - * \{ - */ - - -/*! \p sort sorts the elements in [first, last) into - * ascending order, meaning that if \c i and \c j are any two valid - * iterators in [first, last) such that \c i precedes \c j, - * then \c *j is not less than \c *i. Note: \c sort is not guaranteed - * to be stable. That is, suppose that \c *i and \c *j are equivalent: - * neither one is less than the other. It is not guaranteed that the - * relative order of these two elements will be preserved by \p sort. - * - * This version of \p sort compares objects using \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam RandomAccessIterator is a model of Random Access Iterator, - * \p RandomAccessIterator is mutable, - * and \p RandomAccessIterator's \c value_type is a model of LessThan Comparable, - * and the ordering relation on \p RandomAccessIterator's \c value_type is a strict weak ordering, as defined in the - * LessThan Comparable requirements. 
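In addition to the raw-array host snippet that follows, \p sort is routinely applied to a thrust::device_vector, in which case the CUDA backend is selected from the iterator type rather than an explicit policy; a minimal sketch (names illustrative):

#include <thrust/sort.h>
#include <thrust/device_vector.h>

void sort_device_vector(void)
{
  int raw[6] = {1, 4, 2, 8, 5, 7};
  thrust::device_vector<int> d_vec(raw, raw + 6);

  // sorts in place on the device; not guaranteed to be stable
  thrust::sort(d_vec.begin(), d_vec.end());
  // d_vec is now {1, 2, 4, 5, 7, 8}
}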
- * - * The following code snippet demonstrates how to use \p sort to sort - * a sequence of integers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * thrust::sort(thrust::host, A, A + N); - * // A is now {1, 2, 4, 5, 7, 8} - * \endcode - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p stable_sort - * \see \p sort_by_key - */ -template - void sort(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator first, - RandomAccessIterator last); - - -/*! \p sort sorts the elements in [first, last) into - * ascending order, meaning that if \c i and \c j are any two valid - * iterators in [first, last) such that \c i precedes \c j, - * then \c *j is not less than \c *i. Note: \c sort is not guaranteed - * to be stable. That is, suppose that \c *i and \c *j are equivalent: - * neither one is less than the other. It is not guaranteed that the - * relative order of these two elements will be preserved by \p sort. - * - * This version of \p sort compares objects using \c operator<. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * - * \tparam RandomAccessIterator is a model of Random Access Iterator, - * \p RandomAccessIterator is mutable, - * and \p RandomAccessIterator's \c value_type is a model of LessThan Comparable, - * and the ordering relation on \p RandomAccessIterator's \c value_type is a strict weak ordering, as defined in the - * LessThan Comparable requirements. - * - * The following code snippet demonstrates how to use \p sort to sort - * a sequence of integers. - * - * \code - * #include - * ... - * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * thrust::sort(A, A + N); - * // A is now {1, 2, 4, 5, 7, 8} - * \endcode - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p stable_sort - * \see \p sort_by_key - */ -template - void sort(RandomAccessIterator first, - RandomAccessIterator last); - - -/*! \p sort sorts the elements in [first, last) into - * ascending order, meaning that if \c i and \c j are any two valid - * iterators in [first, last) such that \c i precedes \c j, - * then \c *j is not less than \c *i. Note: \c sort is not guaranteed - * to be stable. That is, suppose that \c *i and \c *j are equivalent: - * neither one is less than the other. It is not guaranteed that the - * relative order of these two elements will be preserved by \p sort. - * - * This version of \p sort compares objects using a function object - * \p comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp Comparison operator. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam RandomAccessIterator is a model of Random Access Iterator, - * \p RandomAccessIterator is mutable, - * and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's - * \c first_argument_type and \c second_argument_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code demonstrates how to sort integers in descending order - * using the greater comparison operator using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... 
- * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * thrust::sort(thrust::host, A, A + N, thrust::greater()); - * // A is now {8, 7, 5, 4, 2, 1}; - * \endcode - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p stable_sort - * \see \p sort_by_key - */ -template - void sort(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - - -/*! \p sort sorts the elements in [first, last) into - * ascending order, meaning that if \c i and \c j are any two valid - * iterators in [first, last) such that \c i precedes \c j, - * then \c *j is not less than \c *i. Note: \c sort is not guaranteed - * to be stable. That is, suppose that \c *i and \c *j are equivalent: - * neither one is less than the other. It is not guaranteed that the - * relative order of these two elements will be preserved by \p sort. - * - * This version of \p sort compares objects using a function object - * \p comp. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp Comparison operator. - * - * \tparam RandomAccessIterator is a model of Random Access Iterator, - * \p RandomAccessIterator is mutable, - * and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's - * \c first_argument_type and \c second_argument_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code demonstrates how to sort integers in descending order - * using the greater comparison operator. - * - * \code - * #include - * #include - * ... - * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * thrust::sort(A, A + N, thrust::greater()); - * // A is now {8, 7, 5, 4, 2, 1}; - * \endcode - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p stable_sort - * \see \p sort_by_key - */ -template - void sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - - -/*! \p stable_sort is much like \c sort: it sorts the elements in - * [first, last) into ascending order, meaning that if \c i - * and \c j are any two valid iterators in [first, last) such - * that \c i precedes \c j, then \c *j is not less than \c *i. - * - * As the name suggests, \p stable_sort is stable: it preserves the - * relative ordering of equivalent elements. That is, if \c x and \c y - * are elements in [first, last) such that \c x precedes \c y, - * and if the two elements are equivalent (neither x < y nor - * y < x) then a postcondition of \p stable_sort is that \c x - * still precedes \c y. - * - * This version of \p stable_sort compares objects using \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam RandomAccessIterator is a model of Random Access Iterator, - * \p RandomAccessIterator is mutable, - * and \p RandomAccessIterator's \c value_type is a model of LessThan Comparable, - * and the ordering relation on \p RandomAccessIterator's \c value_type is a strict weak ordering, as defined in the - * LessThan Comparable requirements. - * - * The following code snippet demonstrates how to use \p sort to sort - * a sequence of integers using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * thrust::stable_sort(thrust::host, A, A + N); - * // A is now {1, 2, 4, 5, 7, 8} - * \endcode - * - * \see http://www.sgi.com/tech/stl/stable_sort.html - * \see \p sort - * \see \p stable_sort_by_key - */ -template - void stable_sort(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator first, - RandomAccessIterator last); - - -/*! \p stable_sort is much like \c sort: it sorts the elements in - * [first, last) into ascending order, meaning that if \c i - * and \c j are any two valid iterators in [first, last) such - * that \c i precedes \c j, then \c *j is not less than \c *i. - * - * As the name suggests, \p stable_sort is stable: it preserves the - * relative ordering of equivalent elements. That is, if \c x and \c y - * are elements in [first, last) such that \c x precedes \c y, - * and if the two elements are equivalent (neither x < y nor - * y < x) then a postcondition of \p stable_sort is that \c x - * still precedes \c y. - * - * This version of \p stable_sort compares objects using \c operator<. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * - * \tparam RandomAccessIterator is a model of Random Access Iterator, - * \p RandomAccessIterator is mutable, - * and \p RandomAccessIterator's \c value_type is a model of LessThan Comparable, - * and the ordering relation on \p RandomAccessIterator's \c value_type is a strict weak ordering, as defined in the - * LessThan Comparable requirements. - * - * The following code snippet demonstrates how to use \p sort to sort - * a sequence of integers. - * - * \code - * #include - * ... - * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * thrust::stable_sort(A, A + N); - * // A is now {1, 2, 4, 5, 7, 8} - * \endcode - * - * \see http://www.sgi.com/tech/stl/stable_sort.html - * \see \p sort - * \see \p stable_sort_by_key - */ -template - void stable_sort(RandomAccessIterator first, - RandomAccessIterator last); - - -/*! \p stable_sort is much like \c sort: it sorts the elements in - * [first, last) into ascending order, meaning that if \c i - * and \c j are any two valid iterators in [first, last) such - * that \c i precedes \c j, then \c *j is not less than \c *i. - * - * As the name suggests, \p stable_sort is stable: it preserves the - * relative ordering of equivalent elements. That is, if \c x and \c y - * are elements in [first, last) such that \c x precedes \c y, - * and if the two elements are equivalent (neither x < y nor - * y < x) then a postcondition of \p stable_sort is that \c x - * still precedes \c y. - * - * This version of \p stable_sort compares objects using a function object - * \p comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp Comparison operator. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam RandomAccessIterator is a model of Random Access Iterator, - * \p RandomAccessIterator is mutable, - * and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's - * \c first_argument_type and \c second_argument_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. 
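The guarantee that separates stable_sort from plain sort is easiest to see with duplicated keys. Below is a minimal host-side sketch using a hypothetical Item record and KeyLess comparator that are not part of the library:

#include <thrust/sort.h>
#include <cstdio>

// Hypothetical record type: 'tag' is payload, only 'key' participates in ordering.
struct Item { int key; char tag; };

// Comparator that inspects the key only, so records with equal keys are
// "equivalent" and stable_sort must preserve their input order.
struct KeyLess
{
    bool operator()(const Item &a, const Item &b) const { return a.key < b.key; }
};

int main()
{
    Item items[5] = { {2,'a'}, {1,'b'}, {2,'c'}, {1,'d'}, {2,'e'} };

    thrust::stable_sort(items, items + 5, KeyLess());

    // Expected: 1b 1d 2a 2c 2e -- ties appear in the order they were given.
    for (int i = 0; i < 5; ++i)
        printf("%d%c ", items[i].key, items[i].tag);
    printf("\n");
    return 0;
}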
- * - * The following code demonstrates how to sort integers in descending order - * using the greater comparison operator using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * thrust::sort(A, A + N, thrust::greater()); - * // A is now {8, 7, 5, 4, 2, 1}; - * \endcode - * - * \see http://www.sgi.com/tech/stl/stable_sort.html - * \see \p sort - * \see \p stable_sort_by_key - */ -template - void stable_sort(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - - -/*! \p stable_sort is much like \c sort: it sorts the elements in - * [first, last) into ascending order, meaning that if \c i - * and \c j are any two valid iterators in [first, last) such - * that \c i precedes \c j, then \c *j is not less than \c *i. - * - * As the name suggests, \p stable_sort is stable: it preserves the - * relative ordering of equivalent elements. That is, if \c x and \c y - * are elements in [first, last) such that \c x precedes \c y, - * and if the two elements are equivalent (neither x < y nor - * y < x) then a postcondition of \p stable_sort is that \c x - * still precedes \c y. - * - * This version of \p stable_sort compares objects using a function object - * \p comp. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp Comparison operator. - * - * \tparam RandomAccessIterator is a model of Random Access Iterator, - * \p RandomAccessIterator is mutable, - * and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's - * \c first_argument_type and \c second_argument_type. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * The following code demonstrates how to sort integers in descending order - * using the greater comparison operator. - * - * \code - * #include - * #include - * ... - * const int N = 6; - * int A[N] = {1, 4, 2, 8, 5, 7}; - * thrust::sort(A, A + N, thrust::greater()); - * // A is now {8, 7, 5, 4, 2, 1}; - * \endcode - * - * \see http://www.sgi.com/tech/stl/stable_sort.html - * \see \p sort - * \see \p stable_sort_by_key - */ -template - void stable_sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - - -/////////////// -// Key Value // -/////////////// - - -/*! \p sort_by_key performs a key-value sort. That is, \p sort_by_key sorts the - * elements in [keys_first, keys_last) and [values_first, - * values_first + (keys_last - keys_first)) into ascending key order, - * meaning that if \c i and \c j are any two valid iterators in [keys_first, - * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators - * in [values_first, values_first + (keys_last - keys_first)) - * corresponding to \c i and \c j respectively, then \c *j is not less than - * \c *i. - * - * Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that - * \c *i and \c *j are equivalent: neither one is less than the other. It is not - * guaranteed that the relative order of these two keys or the relative - * order of their corresponding values will be preserved by \p sort_by_key. - * - * This version of \p sort_by_key compares key objects using \c operator<. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. 
- * \param keys_first The beginning of the key sequence. - * \param keys_last The end of the key sequence. - * \param values_first The beginning of the value sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam RandomAccessIterator1 is a model of Random Access Iterator, - * \p RandomAccessIterator1 is mutable, - * and \p RandomAccessIterator1's \c value_type is a model of LessThan Comparable, - * and the ordering relation on \p RandomAccessIterator1's \c value_type is a strict weak ordering, as defined in the - * LessThan Comparable requirements. - * \tparam RandomAccessIterator2 is a model of Random Access Iterator, - * and \p RandomAccessIterator2 is mutable. - * - * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). - * - * The following code snippet demonstrates how to use \p sort_by_key to sort - * an array of character values using integers as sorting keys using the \p thrust::host execution policy - * for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 6; - * int keys[N] = { 1, 4, 2, 8, 5, 7}; - * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; - * thrust::sort_by_key(thrust::host, keys, keys + N, values); - * // keys is now { 1, 2, 4, 5, 7, 8} - * // values is now {'a', 'c', 'b', 'e', 'f', 'd'} - * \endcode - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p stable_sort_by_key - * \see \p sort - */ -template - void sort_by_key(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first); - - -/*! \p sort_by_key performs a key-value sort. That is, \p sort_by_key sorts the - * elements in [keys_first, keys_last) and [values_first, - * values_first + (keys_last - keys_first)) into ascending key order, - * meaning that if \c i and \c j are any two valid iterators in [keys_first, - * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators - * in [values_first, values_first + (keys_last - keys_first)) - * corresponding to \c i and \c j respectively, then \c *j is not less than - * \c *i. - * - * Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that - * \c *i and \c *j are equivalent: neither one is less than the other. It is not - * guaranteed that the relative order of these two keys or the relative - * order of their corresponding values will be preserved by \p sort_by_key. - * - * This version of \p sort_by_key compares key objects using \c operator<. - * - * \param keys_first The beginning of the key sequence. - * \param keys_last The end of the key sequence. - * \param values_first The beginning of the value sequence. - * - * \tparam RandomAccessIterator1 is a model of Random Access Iterator, - * \p RandomAccessIterator1 is mutable, - * and \p RandomAccessIterator1's \c value_type is a model of LessThan Comparable, - * and the ordering relation on \p RandomAccessIterator1's \c value_type is a strict weak ordering, as defined in the - * LessThan Comparable requirements. - * \tparam RandomAccessIterator2 is a model of Random Access Iterator, - * and \p RandomAccessIterator2 is mutable. - * - * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). - * - * The following code snippet demonstrates how to use \p sort_by_key to sort - * an array of character values using integers as sorting keys. 
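As a compilable counterpart to the key/value example above, a sketch that performs the same sort on the GPU through device_vector (assuming nvcc and the standard thrust/device_vector.h and thrust/sort.h headers):

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <cstdio>

int main()
{
    int  h_keys[6]   = { 1,   4,   2,   8,   5,   7 };
    char h_values[6] = {'a', 'b', 'c', 'd', 'e', 'f'};

    thrust::device_vector<int>  keys(h_keys, h_keys + 6);
    thrust::device_vector<char> values(h_values, h_values + 6);

    // Reorder both sequences so the keys end up in ascending order.
    thrust::sort_by_key(keys.begin(), keys.end(), values.begin());

    // keys   -> 1 2 4 5 7 8
    // values -> a c b e f d
    for (int i = 0; i < 6; ++i)
        printf("%d:%c ", (int)keys[i], (char)values[i]);   // each read copies one element back to the host
    printf("\n");
    return 0;
}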
- * - * \code - * #include - * ... - * const int N = 6; - * int keys[N] = { 1, 4, 2, 8, 5, 7}; - * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; - * thrust::sort_by_key(keys, keys + N, values); - * // keys is now { 1, 2, 4, 5, 7, 8} - * // values is now {'a', 'c', 'b', 'e', 'f', 'd'} - * \endcode - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p stable_sort_by_key - * \see \p sort - */ -template - void sort_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first); - - -/*! \p sort_by_key performs a key-value sort. That is, \p sort_by_key sorts the - * elements in [keys_first, keys_last) and [values_first, - * values_first + (keys_last - keys_first)) into ascending key order, - * meaning that if \c i and \c j are any two valid iterators in [keys_first, - * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators - * in [values_first, values_first + (keys_last - keys_first)) - * corresponding to \c i and \c j respectively, then \c *j is not less than - * \c *i. - * - * Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that - * \c *i and \c *j are equivalent: neither one is less than the other. It is not - * guaranteed that the relative order of these two keys or the relative - * order of their corresponding values will be preserved by \p sort_by_key. - * - * This version of \p sort_by_key compares key objects using a function object - * \c comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first The beginning of the key sequence. - * \param keys_last The end of the key sequence. - * \param values_first The beginning of the value sequence. - * \param comp Comparison operator. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam RandomAccessIterator1 is a model of Random Access Iterator, - * \p RandomAccessIterator1 is mutable, - * and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's - * \c first_argument_type and \c second_argument_type. - * \tparam RandomAccessIterator2 is a model of Random Access Iterator, - * and \p RandomAccessIterator2 is mutable. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). - * - * The following code snippet demonstrates how to use \p sort_by_key to sort - * an array of character values using integers as sorting keys using the \p thrust::host execution policy - * for parallelization.The keys are sorted in descending order using the greater comparison operator. - * - * \code - * #include - * #include - * ... - * const int N = 6; - * int keys[N] = { 1, 4, 2, 8, 5, 7}; - * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; - * thrust::sort_by_key(thrust::host, keys, keys + N, values, thrust::greater()); - * // keys is now { 8, 7, 5, 4, 2, 1} - * // values is now {'d', 'f', 'e', 'b', 'c', 'a'} - * \endcode - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p stable_sort_by_key - * \see \p sort - */ -template - void sort_by_key(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp); - - -/*! \p sort_by_key performs a key-value sort. 
That is, \p sort_by_key sorts the - * elements in [keys_first, keys_last) and [values_first, - * values_first + (keys_last - keys_first)) into ascending key order, - * meaning that if \c i and \c j are any two valid iterators in [keys_first, - * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators - * in [values_first, values_first + (keys_last - keys_first)) - * corresponding to \c i and \c j respectively, then \c *j is not less than - * \c *i. - * - * Note: \c sort_by_key is not guaranteed to be stable. That is, suppose that - * \c *i and \c *j are equivalent: neither one is less than the other. It is not - * guaranteed that the relative order of these two keys or the relative - * order of their corresponding values will be preserved by \p sort_by_key. - * - * This version of \p sort_by_key compares key objects using a function object - * \c comp. - * - * \param keys_first The beginning of the key sequence. - * \param keys_last The end of the key sequence. - * \param values_first The beginning of the value sequence. - * \param comp Comparison operator. - * - * \tparam RandomAccessIterator1 is a model of Random Access Iterator, - * \p RandomAccessIterator1 is mutable, - * and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's - * \c first_argument_type and \c second_argument_type. - * \tparam RandomAccessIterator2 is a model of Random Access Iterator, - * and \p RandomAccessIterator2 is mutable. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). - * - * The following code snippet demonstrates how to use \p sort_by_key to sort - * an array of character values using integers as sorting keys. The keys - * are sorted in descending order using the greater comparison operator. - * - * \code - * #include - * ... - * const int N = 6; - * int keys[N] = { 1, 4, 2, 8, 5, 7}; - * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; - * thrust::sort_by_key(keys, keys + N, values, thrust::greater()); - * // keys is now { 8, 7, 5, 4, 2, 1} - * // values is now {'d', 'f', 'e', 'b', 'c', 'a'} - * \endcode - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p stable_sort_by_key - * \see \p sort - */ -template - void sort_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp); - - -/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key - * sorts the elements in [keys_first, keys_last) and [values_first, - * values_first + (keys_last - keys_first)) into ascending key order, - * meaning that if \c i and \c j are any two valid iterators in [keys_first, - * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators - * in [values_first, values_first + (keys_last - keys_first)) - * corresponding to \c i and \c j respectively, then \c *j is not less than - * \c *i. - * - * As the name suggests, \p stable_sort_by_key is stable: it preserves the - * relative ordering of equivalent elements. That is, if \c x and \c y - * are elements in [keys_first, keys_last) such that \c x precedes \c y, - * and if the two elements are equivalent (neither x < y nor - * y < x) then a postcondition of \p stable_sort_by_key is that \c x - * still precedes \c y. - * - * This version of \p stable_sort_by_key compares key objects using \c operator<. 
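For the comparator overload just declared, a complete host-side sketch of a descending key/value sort, assuming thrust/functional.h for thrust::greater and mirroring the documentation's data:

#include <thrust/sort.h>
#include <thrust/functional.h>
#include <cstdio>

int main()
{
    const int N = 6;
    int  keys[N]   = { 1,   4,   2,   8,   5,   7 };
    char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};

    // Sort keys in descending order and permute the values along with them.
    thrust::sort_by_key(keys, keys + N, values, thrust::greater<int>());

    // keys   -> 8 7 5 4 2 1
    // values -> d f e b c a
    for (int i = 0; i < N; ++i)
        printf("%d:%c ", keys[i], values[i]);
    printf("\n");
    return 0;
}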
- * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first The beginning of the key sequence. - * \param keys_last The end of the key sequence. - * \param values_first The beginning of the value sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam RandomAccessIterator1 is a model of Random Access Iterator, - * \p RandomAccessIterator1 is mutable, - * and \p RandomAccessIterator1's \c value_type is a model of LessThan Comparable, - * and the ordering relation on \p RandomAccessIterator1's \c value_type is a strict weak ordering, as defined in the - * LessThan Comparable requirements. - * \tparam RandomAccessIterator2 is a model of Random Access Iterator, - * and \p RandomAccessIterator2 is mutable. - * - * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). - * - * The following code snippet demonstrates how to use \p stable_sort_by_key to sort - * an array of characters using integers as sorting keys using the \p thrust::host execution policy for - * parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 6; - * int keys[N] = { 1, 4, 2, 8, 5, 7}; - * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; - * thrust::stable_sort_by_key(thrust::host, keys, keys + N, values); - * // keys is now { 1, 2, 4, 5, 7, 8} - * // values is now {'a', 'c', 'b', 'e', 'f', 'd'} - * \endcode - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p sort_by_key - * \see \p stable_sort - */ -template - void stable_sort_by_key(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first); - - -/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key - * sorts the elements in [keys_first, keys_last) and [values_first, - * values_first + (keys_last - keys_first)) into ascending key order, - * meaning that if \c i and \c j are any two valid iterators in [keys_first, - * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators - * in [values_first, values_first + (keys_last - keys_first)) - * corresponding to \c i and \c j respectively, then \c *j is not less than - * \c *i. - * - * As the name suggests, \p stable_sort_by_key is stable: it preserves the - * relative ordering of equivalent elements. That is, if \c x and \c y - * are elements in [keys_first, keys_last) such that \c x precedes \c y, - * and if the two elements are equivalent (neither x < y nor - * y < x) then a postcondition of \p stable_sort_by_key is that \c x - * still precedes \c y. - * - * This version of \p stable_sort_by_key compares key objects using \c operator<. - * - * \param keys_first The beginning of the key sequence. - * \param keys_last The end of the key sequence. - * \param values_first The beginning of the value sequence. - * - * \tparam RandomAccessIterator1 is a model of Random Access Iterator, - * \p RandomAccessIterator1 is mutable, - * and \p RandomAccessIterator1's \c value_type is a model of LessThan Comparable, - * and the ordering relation on \p RandomAccessIterator1's \c value_type is a strict weak ordering, as defined in the - * LessThan Comparable requirements. - * \tparam RandomAccessIterator2 is a model of Random Access Iterator, - * and \p RandomAccessIterator2 is mutable. 
- * - * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). - * - * The following code snippet demonstrates how to use \p stable_sort_by_key to sort - * an array of characters using integers as sorting keys. - * - * \code - * #include - * ... - * const int N = 6; - * int keys[N] = { 1, 4, 2, 8, 5, 7}; - * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; - * thrust::stable_sort_by_key(keys, keys + N, values); - * // keys is now { 1, 2, 4, 5, 7, 8} - * // values is now {'a', 'c', 'b', 'e', 'f', 'd'} - * \endcode - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p sort_by_key - * \see \p stable_sort - */ -template - void stable_sort_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first); - - -/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key - * sorts the elements in [keys_first, keys_last) and [values_first, - * values_first + (keys_last - keys_first)) into ascending key order, - * meaning that if \c i and \c j are any two valid iterators in [keys_first, - * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators - * in [values_first, values_first + (keys_last - keys_first)) - * corresponding to \c i and \c j respectively, then \c *j is not less than - * \c *i. - * - * As the name suggests, \p stable_sort_by_key is stable: it preserves the - * relative ordering of equivalent elements. That is, if \c x and \c y - * are elements in [keys_first, keys_last) such that \c x precedes \c y, - * and if the two elements are equivalent (neither x < y nor - * y < x) then a postcondition of \p stable_sort_by_key is that \c x - * still precedes \c y. - * - * This version of \p stable_sort_by_key compares key objects using the function - * object \p comp. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first The beginning of the key sequence. - * \param keys_last The end of the key sequence. - * \param values_first The beginning of the value sequence. - * \param comp Comparison operator. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam RandomAccessIterator1 is a model of Random Access Iterator, - * \p RandomAccessIterator1 is mutable, - * and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's - * \c first_argument_type and \c second_argument_type. - * \tparam RandomAccessIterator2 is a model of Random Access Iterator, - * and \p RandomAccessIterator2 is mutable. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). - * - * The following code snippet demonstrates how to use \p sort_by_key to sort - * an array of character values using integers as sorting keys using the \p thrust::host execution policy for - * parallelization. The keys are sorted in descending order using the greater comparison operator. - * - * \code - * #include - * #include - * ... 
- * const int N = 6; - * int keys[N] = { 1, 4, 2, 8, 5, 7}; - * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; - * thrust::stable_sort_by_key(thrust::host, keys, keys + N, values, thrust::greater()); - * // keys is now { 8, 7, 5, 4, 2, 1} - * // values is now {'d', 'f', 'e', 'b', 'c', 'a'} - * \endcode - * - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p sort_by_key - * \see \p stable_sort - */ -template - void stable_sort_by_key(const thrust::detail::execution_policy_base &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp); - - -/*! \p stable_sort_by_key performs a key-value sort. That is, \p stable_sort_by_key - * sorts the elements in [keys_first, keys_last) and [values_first, - * values_first + (keys_last - keys_first)) into ascending key order, - * meaning that if \c i and \c j are any two valid iterators in [keys_first, - * keys_last) such that \c i precedes \c j, and \c p and \c q are iterators - * in [values_first, values_first + (keys_last - keys_first)) - * corresponding to \c i and \c j respectively, then \c *j is not less than - * \c *i. - * - * As the name suggests, \p stable_sort_by_key is stable: it preserves the - * relative ordering of equivalent elements. That is, if \c x and \c y - * are elements in [keys_first, keys_last) such that \c x precedes \c y, - * and if the two elements are equivalent (neither x < y nor - * y < x) then a postcondition of \p stable_sort_by_key is that \c x - * still precedes \c y. - * - * This version of \p stable_sort_by_key compares key objects using the function - * object \p comp. - * - * \param keys_first The beginning of the key sequence. - * \param keys_last The end of the key sequence. - * \param values_first The beginning of the value sequence. - * \param comp Comparison operator. - * - * \tparam RandomAccessIterator1 is a model of Random Access Iterator, - * \p RandomAccessIterator1 is mutable, - * and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's - * \c first_argument_type and \c second_argument_type. - * \tparam RandomAccessIterator2 is a model of Random Access Iterator, - * and \p RandomAccessIterator2 is mutable. - * \tparam StrictWeakOrdering is a model of Strict Weak Ordering. - * - * \pre The range [keys_first, keys_last)) shall not overlap the range [values_first, values_first + (keys_last - keys_first)). - * - * The following code snippet demonstrates how to use \p sort_by_key to sort - * an array of character values using integers as sorting keys. The keys - * are sorted in descending order using the greater comparison operator. - * - * \code - * #include - * ... - * const int N = 6; - * int keys[N] = { 1, 4, 2, 8, 5, 7}; - * char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'}; - * thrust::stable_sort_by_key(keys, keys + N, values, thrust::greater()); - * // keys is now { 8, 7, 5, 4, 2, 1} - * // values is now {'d', 'f', 'e', 'b', 'c', 'a'} - * \endcode - * - * - * \see http://www.sgi.com/tech/stl/sort.html - * \see \p sort_by_key - * \see \p stable_sort - */ -template - void stable_sort_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp); - - -/*! \} // end sorting - */ - - -/*! \addtogroup reductions - * \{ - * \addtogroup predicates - * \{ - */ - - -/*! \p is_sorted returns \c true if the range [first, last) is - * sorted in ascending order, and \c false otherwise. 
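To make the stability guarantee of the by-key variants concrete, here is a small host-side sketch with deliberately repeated keys (the data set is illustrative, not taken from the documentation):

#include <thrust/sort.h>
#include <cstdio>

int main()
{
    const int N = 6;
    int  keys[N]   = { 2,   1,   2,   1,   2,   1 };
    char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};

    // stable_sort_by_key keeps the relative order of values whose keys compare equal.
    thrust::stable_sort_by_key(keys, keys + N, values);

    // keys   -> 1 1 1 2 2 2
    // values -> b d f a c e   (each run of equal keys preserves input order)
    for (int i = 0; i < N; ++i)
        printf("%d:%c ", keys[i], values[i]);
    printf("\n");
    return 0;
}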
- * - * Specifically, this version of \p is_sorted returns \c false if for - * some iterator \c i in the range [first, last - 1) the - * expression *(i + 1) < *i is \c true. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \return \c true, if the sequence is sorted; \c false, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator's \c value_type is a model of LessThan Comparable, - * and the ordering on objects of \p ForwardIterator's \c value_type is a strict weak ordering, as defined - * in the LessThan Comparable requirements. - * - * - * The following code demonstrates how to use \p is_sorted to test whether the - * contents of a \c device_vector are stored in ascending order using the \p thrust::device execution policy - * for parallelization: - * - * \code - * #include - * #include - * #include - * #include - * ... - * thrust::device_vector v(6); - * v[0] = 1; - * v[1] = 4; - * v[2] = 2; - * v[3] = 8; - * v[4] = 5; - * v[5] = 7; - * - * bool result = thrust::is_sorted(thrust::device, v.begin(), v.end()); - * - * // result == false - * - * thrust::sort(v.begin(), v.end()); - * result = thrust::is_sorted(thrust::device, v.begin(), v.end()); - * - * // result == true - * \endcode - * - * \see http://www.sgi.com/tech/stl/is_sorted.html - * \see is_sorted_until - * \see \c sort - * \see \c stable_sort - * \see \c less - */ -template - bool is_sorted(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last); - - -/*! \p is_sorted returns \c true if the range [first, last) is - * sorted in ascending order, and \c false otherwise. - * - * Specifically, this version of \p is_sorted returns \c false if for - * some iterator \c i in the range [first, last - 1) the - * expression *(i + 1) < *i is \c true. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \return \c true, if the sequence is sorted; \c false, otherwise. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator's \c value_type is a model of LessThan Comparable, - * and the ordering on objects of \p ForwardIterator's \c value_type is a strict weak ordering, as defined - * in the LessThan Comparable requirements. - * - * - * The following code demonstrates how to use \p is_sorted to test whether the - * contents of a \c device_vector are stored in ascending order. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector v(6); - * v[0] = 1; - * v[1] = 4; - * v[2] = 2; - * v[3] = 8; - * v[4] = 5; - * v[5] = 7; - * - * bool result = thrust::is_sorted(v.begin(), v.end()); - * - * // result == false - * - * thrust::sort(v.begin(), v.end()); - * result = thrust::is_sorted(v.begin(), v.end()); - * - * // result == true - * \endcode - * - * \see http://www.sgi.com/tech/stl/is_sorted.html - * \see is_sorted_until - * \see \c sort - * \see \c stable_sort - * \see \c less - */ -template - bool is_sorted(ForwardIterator first, - ForwardIterator last); - - -/*! \p is_sorted returns \c true if the range [first, last) is sorted in ascending - * order accoring to a user-defined comparison operation, and \c false otherwise. 
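A compilable sketch of the is_sorted check described above, run against a device_vector (assuming nvcc; the headers are the standard thrust/device_vector.h and thrust/sort.h):

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <cstdio>

int main()
{
    int h[6] = {1, 4, 2, 8, 5, 7};
    thrust::device_vector<int> v(h, h + 6);

    bool before = thrust::is_sorted(v.begin(), v.end());   // false: 4 precedes 2

    thrust::sort(v.begin(), v.end());
    bool after = thrust::is_sorted(v.begin(), v.end());    // true after sorting

    printf("before=%d after=%d\n", (int)before, (int)after);
    return 0;
}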
- * - * Specifically, this version of \p is_sorted returns \c false if for some iterator \c i in - * the range [first, last - 1) the expression comp(*(i + 1), *i) is \c true. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp Comparison operator. - * \return \c true, if the sequence is sorted according to comp; \c false, otherwise. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to both \c StrictWeakOrdering's \c first_argument_type - * and \c second_argument_type. - * \tparam Compare is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p is_sorted to test whether the - * contents of a \c device_vector are stored in descending order using the \p thrust::device execution - * policy for parallelization: - * - * \code - * #include - * #include - * #include - * #include - * ... - * thrust::device_vector v(6); - * v[0] = 1; - * v[1] = 4; - * v[2] = 2; - * v[3] = 8; - * v[4] = 5; - * v[5] = 7; - * - * thrust::greater comp; - * bool result = thrust::is_sorted(thrust::device, v.begin(), v.end(), comp); - * - * // result == false - * - * thrust::sort(v.begin(), v.end(), comp); - * result = thrust::is_sorted(thrust::device, v.begin(), v.end(), comp); - * - * // result == true - * \endcode - * - * \see http://www.sgi.com/tech/stl/is_sorted.html - * \see \c sort - * \see \c stable_sort - * \see \c less - */ -template - bool is_sorted(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Compare comp); - - -/*! \p is_sorted returns \c true if the range [first, last) is sorted in ascending - * order accoring to a user-defined comparison operation, and \c false otherwise. - * - * Specifically, this version of \p is_sorted returns \c false if for some iterator \c i in - * the range [first, last - 1) the expression comp(*(i + 1), *i) is \c true. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param comp Comparison operator. - * \return \c true, if the sequence is sorted according to comp; \c false, otherwise. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator's \c value_type is convertible to both \c StrictWeakOrdering's \c first_argument_type - * and \c second_argument_type. - * \tparam Compare is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p is_sorted to test whether the - * contents of a \c device_vector are stored in descending order. - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector v(6); - * v[0] = 1; - * v[1] = 4; - * v[2] = 2; - * v[3] = 8; - * v[4] = 5; - * v[5] = 7; - * - * thrust::greater comp; - * bool result = thrust::is_sorted(v.begin(), v.end(), comp); - * - * // result == false - * - * thrust::sort(v.begin(), v.end(), comp); - * result = thrust::is_sorted(v.begin(), v.end(), comp); - * - * // result == true - * \endcode - * - * \see http://www.sgi.com/tech/stl/is_sorted.html - * \see \c sort - * \see \c stable_sort - * \see \c less - */ -template - bool is_sorted(ForwardIterator first, - ForwardIterator last, - Compare comp); - - -/*! 
This version of \p is_sorted_until returns the last iterator \c i in [first,last] for - * which the range [first,last) is sorted using \c operator<. If distance(first,last) < 2, - * \p is_sorted_until simply returns \p last. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \return The last iterator in the input range for which it is sorted. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator and - * \p ForwardIterator's \c value_type is a model of LessThan Comparable. - * - * The following code snippet demonstrates how to use \p is_sorted_until to find the first position - * in an array where the data becomes unsorted using the \p thrust::host execution policy for - * parallelization: - * - * \code - * #include - * #include - * - * ... - * - * int A[8] = {0, 1, 2, 3, 0, 1, 2, 3}; - * - * int * B = thrust::is_sorted_until(thrust::host, A, A + 8); - * - * // B - A is 4 - * // [A, B) is sorted - * \endcode - * - * \see \p is_sorted - * \see \p sort - * \see \p sort_by_key - * \see \p stable_sort - * \see \p stable_sort_by_key - */ -template - ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last); - - -/*! This version of \p is_sorted_until returns the last iterator \c i in [first,last] for - * which the range [first,last) is sorted using \c operator<. If distance(first,last) < 2, - * \p is_sorted_until simply returns \p last. - * - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \return The last iterator in the input range for which it is sorted. - * - * \tparam ForwardIterator is a model of Forward Iterator and - * \p ForwardIterator's \c value_type is a model of LessThan Comparable. - * - * The following code snippet demonstrates how to use \p is_sorted_until to find the first position - * in an array where the data becomes unsorted: - * - * \code - * #include - * - * ... - * - * int A[8] = {0, 1, 2, 3, 0, 1, 2, 3}; - * - * int * B = thrust::is_sorted_until(A, A + 8); - * - * // B - A is 4 - * // [A, B) is sorted - * \endcode - * - * \see \p is_sorted - * \see \p sort - * \see \p sort_by_key - * \see \p stable_sort - * \see \p stable_sort_by_key - */ -template - ForwardIterator is_sorted_until(ForwardIterator first, - ForwardIterator last); - - -/*! This version of \p is_sorted_until returns the last iterator \c i in [first,last] for - * which the range [first,last) is sorted using the function object \c comp. If distance(first,last) < 2, - * \p is_sorted_until simply returns \p last. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization: - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param comp The function object to use for comparison. - * \return The last iterator in the input range for which it is sorted. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator and - * \p ForwardIterator's \c value_type is convertible to \p Compare's \c argument_type. - * \tparam Compare is a model of Strict Weak Ordering. 
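A minimal host-side sketch of using is_sorted_until to measure the sorted prefix of an array, mirroring the documentation's data:

#include <thrust/sort.h>
#include <cstdio>

int main()
{
    int A[8] = {0, 1, 2, 3, 0, 1, 2, 3};

    // B points one past the longest sorted prefix [A, B).
    int *B = thrust::is_sorted_until(A, A + 8);

    printf("sorted prefix length = %d\n", (int)(B - A));   // prints 4
    return 0;
}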
- * - * The following code snippet demonstrates how to use \p is_sorted_until to find the first position - * in an array where the data becomes unsorted in descending order using the \p thrust::host execution - * policy for parallelization: - * - * \code - * #include - * #include - * #include - * - * ... - * - * int A[8] = {3, 2, 1, 0, 3, 2, 1, 0}; - * - * thrust::greater comp; - * int * B = thrust::is_sorted_until(thrust::host, A, A + 8, comp); - * - * // B - A is 4 - * // [A, B) is sorted in descending order - * \endcode - * - * \see \p is_sorted - * \see \p sort - * \see \p sort_by_key - * \see \p stable_sort - * \see \p stable_sort_by_key - */ -template - ForwardIterator is_sorted_until(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - Compare comp); - - -/*! This version of \p is_sorted_until returns the last iterator \c i in [first,last] for - * which the range [first,last) is sorted using the function object \c comp. If distance(first,last) < 2, - * \p is_sorted_until simply returns \p last. - * - * \param first The beginning of the range of interest. - * \param last The end of the range of interest. - * \param comp The function object to use for comparison. - * \return The last iterator in the input range for which it is sorted. - * - * \tparam ForwardIterator is a model of Forward Iterator and - * \p ForwardIterator's \c value_type is convertible to \p Compare's \c argument_type. - * \tparam Compare is a model of Strict Weak Ordering. - * - * The following code snippet demonstrates how to use \p is_sorted_until to find the first position - * in an array where the data becomes unsorted in descending order: - * - * \code - * #include - * #include - * - * ... - * - * int A[8] = {3, 2, 1, 0, 3, 2, 1, 0}; - * - * thrust::greater comp; - * int * B = thrust::is_sorted_until(A, A + 8, comp); - * - * // B - A is 4 - * // [A, B) is sorted in descending order - * \endcode - * - * \see \p is_sorted - * \see \p sort - * \see \p sort_by_key - * \see \p stable_sort - * \see \p stable_sort_by_key - */ -template - ForwardIterator is_sorted_until(ForwardIterator first, - ForwardIterator last, - Compare comp); - - -/*! \} // end predicates - * \} // end reductions - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/swap.h b/compat/thrust/swap.h deleted file mode 100644 index 085e546930..0000000000 --- a/compat/thrust/swap.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file swap.h - * \brief Functions for swapping the value of elements - */ - -#pragma once - -#include -#include - -// empty Doxygen comment below so namespace thrust's documentation will be extracted - -/*! - */ -namespace thrust -{ - -/*! \addtogroup utility - * \{ - */ - -/*! \addtogroup swap - * \{ - */ - -/*! \p swap assigns the contents of \c a to \c b and the - * contents of \c b to \c a. 
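Ahead of the library's own swap example below, a tiny compilable sketch of the call; note that the two arguments are simply the variables being exchanged:

#include <thrust/swap.h>
#include <cstdio>

int main()
{
    int x = 1;
    int y = 2;

    thrust::swap(x, y);   // exchanges the contents of x and y

    printf("x=%d y=%d\n", x, y);   // x=2 y=1
    return 0;
}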
This is used as a primitive operation - * by many other algorithms. - * - * \param a The first value of interest. After completion, - * the value of b will be returned here. - * \param b The second value of interest. After completion, - * the value of a will be returned here. - * - * \tparam Assignable is a model of Assignable. - * - * The following code snippet demonstrates how to use \p swap to - * swap the contents of two variables. - * - * \code - * #include - * ... - * int x = 1; - * int y = 2; - * thrust::swap(x,h); - * - * // x == 2, y == 1 - * \endcode - */ -template -__host__ __device__ -inline void swap(Assignable1 &a, Assignable2 &b); - -/*! \} // swap - */ - -/*! \} // utility - */ - - -/*! \addtogroup copying - * \{ - */ - - -/*! \p swap_ranges swaps each of the elements in the range [first1, last1) - * with the corresponding element in the range [first2, first2 + (last1 - first1)). - * That is, for each integer \c n such that 0 <= n < (last1 - first1), it swaps - * *(first1 + n) and *(first2 + n). The return value is - * first2 + (last1 - first1). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first sequence to swap. - * \param last1 One position past the last element of the first sequence to swap. - * \param first2 The beginning of the second sequence to swap. - * \return An iterator pointing to one position past the last element of the second - * sequence to swap. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator1 is a model of Forward Iterator, - * and \p ForwardIterator1's \c value_type must be convertible to \p ForwardIterator2's \c value_type. - * \tparam ForwardIterator2 is a model of Forward Iterator, - * and \p ForwardIterator2's \c value_type must be convertible to \p ForwardIterator1's \c value_type. - * - * \pre \p first1 may equal \p first2, but the range [first1, last1) shall not overlap the range [first2, first2 + (last1 - first1)) otherwise. - * - * The following code snippet demonstrates how to use \p swap_ranges to - * swap the contents of two \c thrust::device_vectors using the \p thrust::device execution - * policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * thrust::device_vector v1(2), v2(2); - * v1[0] = 1; - * v1[1] = 2; - * v2[0] = 3; - * v2[1] = 4; - * - * thrust::swap_ranges(thrust::device, v1.begin(), v1.end(), v2.begin()); - * - * // v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2 - * \endcode - * - * \see http://www.sgi.com/tech/stl/swap_ranges.html - * \see \c swap - */ -template - ForwardIterator2 swap_ranges(const thrust::detail::execution_policy_base &exec, - ForwardIterator1 first1, - ForwardIterator1 last1, - ForwardIterator2 first2); - - -/*! \p swap_ranges swaps each of the elements in the range [first1, last1) - * with the corresponding element in the range [first2, first2 + (last1 - first1)). - * That is, for each integer \c n such that 0 <= n < (last1 - first1), it swaps - * *(first1 + n) and *(first2 + n). The return value is - * first2 + (last1 - first1). - * - * \param first1 The beginning of the first sequence to swap. - * \param last1 One position past the last element of the first sequence to swap. - * \param first2 The beginning of the second sequence to swap. - * \return An iterator pointing to one position past the last element of the second - * sequence to swap. 
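As a compilable form of the swap_ranges usage documented here, a device_vector sketch (assuming nvcc and the standard thrust/device_vector.h and thrust/swap.h headers):

#include <thrust/device_vector.h>
#include <thrust/swap.h>
#include <cstdio>

int main()
{
    int a[2] = {1, 2};
    int b[2] = {3, 4};

    thrust::device_vector<int> v1(a, a + 2);
    thrust::device_vector<int> v2(b, b + 2);

    // Exchange the two ranges element by element.
    thrust::swap_ranges(v1.begin(), v1.end(), v2.begin());

    // v1 -> 3 4, v2 -> 1 2
    printf("v1 = %d %d, v2 = %d %d\n",
           (int)v1[0], (int)v1[1], (int)v2[0], (int)v2[1]);
    return 0;
}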
- * - * \tparam ForwardIterator1 is a model of Forward Iterator, - * and \p ForwardIterator1's \c value_type must be convertible to \p ForwardIterator2's \c value_type. - * \tparam ForwardIterator2 is a model of Forward Iterator, - * and \p ForwardIterator2's \c value_type must be convertible to \p ForwardIterator1's \c value_type. - * - * \pre \p first1 may equal \p first2, but the range [first1, last1) shall not overlap the range [first2, first2 + (last1 - first1)) otherwise. - * - * The following code snippet demonstrates how to use \p swap_ranges to - * swap the contents of two \c thrust::device_vectors. - * - * \code - * #include - * #include - * ... - * thrust::device_vector v1(2), v2(2); - * v1[0] = 1; - * v1[1] = 2; - * v2[0] = 3; - * v2[1] = 4; - * - * thrust::swap_ranges(v1.begin(), v1.end(), v2.begin()); - * - * // v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2 - * \endcode - * - * \see http://www.sgi.com/tech/stl/swap_ranges.html - * \see \c swap - */ -template - ForwardIterator2 swap_ranges(ForwardIterator1 first1, - ForwardIterator1 last1, - ForwardIterator2 first2); - - -/*! \} // copying - */ - - -} // end thrust - -#include - diff --git a/compat/thrust/system/cpp/detail/adjacent_difference.h b/compat/thrust/system/cpp/detail/adjacent_difference.h deleted file mode 100644 index ea212ffcd9..0000000000 --- a/compat/thrust/system/cpp/detail/adjacent_difference.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file adjacent_difference.h - * \brief C++ implementation of adjacent_difference. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template -OutputIterator adjacent_difference(execution_policy &, - InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - return thrust::system::detail::internal::scalar::adjacent_difference(first, last, result, binary_op); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/assign_value.h b/compat/thrust/system/cpp/detail/assign_value.h deleted file mode 100644 index 847fc97afd..0000000000 --- a/compat/thrust/system/cpp/detail/assign_value.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template -__host__ __device__ - void assign_value(thrust::system::cpp::detail::execution_policy &, Pointer1 dst, Pointer2 src) -{ - *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src); -} // end assign_value() - -} // end detail -} // end cpp -} // end system -} // end thrust - diff --git a/compat/thrust/system/cpp/detail/binary_search.h b/compat/thrust/system/cpp/detail/binary_search.h deleted file mode 100644 index 37af539e0d..0000000000 --- a/compat/thrust/system/cpp/detail/binary_search.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file binary_search.h - * \brief C++ implementation of binary search algorithms. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template -ForwardIterator lower_bound(tag, - ForwardIterator first, - ForwardIterator last, - const T& val, - StrictWeakOrdering comp) -{ - return thrust::system::detail::internal::scalar::lower_bound(first, last, val, comp); -} - - -template -ForwardIterator upper_bound(tag, - ForwardIterator first, - ForwardIterator last, - const T& val, - StrictWeakOrdering comp) -{ - return thrust::system::detail::internal::scalar::upper_bound(first, last, val, comp); -} - -template -bool binary_search(tag, - ForwardIterator first, - ForwardIterator last, - const T& val, - StrictWeakOrdering comp) -{ - return thrust::system::detail::internal::scalar::binary_search(first, last, val, comp); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/copy.h b/compat/thrust/system/cpp/detail/copy.h deleted file mode 100644 index 7299bbbd7a..0000000000 --- a/compat/thrust/system/cpp/detail/copy.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file copy.h - * \brief C++ implementations of copy functions. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template - OutputIterator copy(tag, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - return thrust::system::detail::internal::scalar::copy(first, last, result); -} - -template - OutputIterator copy_n(tag, - InputIterator first, - Size n, - OutputIterator result) -{ - return thrust::system::detail::internal::scalar::copy_n(first, n, result); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/copy_if.h b/compat/thrust/system/cpp/detail/copy_if.h deleted file mode 100644 index 2faadfa1b7..0000000000 --- a/compat/thrust/system/cpp/detail/copy_if.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - - -template - OutputIterator copy_if(tag, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - return thrust::system::detail::internal::scalar::copy_if(first, last, stencil, result, pred); -} - -} // end detail -} // end cpp -} // end system -} // end thrust - diff --git a/compat/thrust/system/cpp/detail/count.h b/compat/thrust/system/cpp/detail/count.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/count.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/dispatch/sort.h b/compat/thrust/system/cpp/detail/dispatch/sort.h deleted file mode 100644 index 2a03cf62bc..0000000000 --- a/compat/thrust/system/cpp/detail/dispatch/sort.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ -namespace dispatch -{ - -//////////////// -// Radix Sort // -//////////////// - -template -void stable_sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp, - thrust::detail::true_type) -{ - thrust::system::detail::internal::scalar::stable_radix_sort(first, last); - - // if comp is greater then reverse the keys - typedef typename thrust::iterator_traits::value_type KeyType; - const static bool reverse = thrust::detail::is_same >::value; - - if (reverse) - thrust::reverse(first, last); -} - -template -void stable_sort_by_key(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp, - thrust::detail::true_type) -{ - // if comp is greater then reverse the keys and values - typedef typename thrust::iterator_traits::value_type KeyType; - const static bool reverse = thrust::detail::is_same >::value; - - // note, we also have to reverse the (unordered) input to preserve stability - if (reverse) - { - thrust::reverse(first1, last1); - thrust::reverse(first2, first2 + (last1 - first1)); - } - - thrust::system::detail::internal::scalar::stable_radix_sort_by_key(first1, last1, first2); - - if (reverse) - { - thrust::reverse(first1, last1); - thrust::reverse(first2, first2 + (last1 - first1)); - } -} - -//////////////// -// Merge Sort // -//////////////// - -template -void stable_sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp, - thrust::detail::false_type) -{ - thrust::system::detail::internal::scalar::stable_merge_sort(first, last, comp); -} - -template -void stable_sort_by_key(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp, - thrust::detail::false_type) -{ - thrust::system::detail::internal::scalar::stable_merge_sort_by_key(first1, last1, first2, comp); -} - -} // end namespace dispatch -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/equal.h b/compat/thrust/system/cpp/detail/equal.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/equal.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
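A minimal illustrative sketch (not part of the original diff): the deleted cpp/detail/dispatch/sort.h above selects radix sort when the key type and comparator permit it (the true_type overloads) and merge sort otherwise (false_type); for thrust::greater it radix-sorts ascending and then reverses, and the by-key variant also reverses the unsorted input first so the result stays stable. The stand-alone C++ sketch below shows the same compile-time dispatch shape, with std::stable_sort standing in for the scalar radix and merge implementations that this diff removes.

// Illustrative sketch only: tag dispatch between a "radix" and a "merge" path,
// mirroring the deleted dispatch/sort.h. std::stable_sort is a stand-in here.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <iterator>
#include <type_traits>
#include <vector>

// "Radix" path: sort ascending, then reverse when the comparator is greater<Key>.
// For primitive keys equal elements are indistinguishable, so the reversal does
// not break stability, which is the same argument the deleted code relies on.
template <typename It, typename Comp>
void stable_sort_impl(It first, It last, Comp, std::true_type)
{
    typedef typename std::iterator_traits<It>::value_type Key;
    std::stable_sort(first, last);                       // ascending stand-in
    if (std::is_same<Comp, std::greater<Key> >::value)
        std::reverse(first, last);                       // greater<> == reversed ascending
}

// "Merge" path: a plain comparison sort with the user-supplied comparator.
template <typename It, typename Comp>
void stable_sort_impl(It first, It last, Comp comp, std::false_type)
{
    std::stable_sort(first, last, comp);
}

int main()
{
    std::vector<int> v;
    v.push_back(3); v.push_back(1); v.push_back(2);

    // Primitive key with greater<int>: take the "radix" path.
    stable_sort_impl(v.begin(), v.end(), std::greater<int>(), std::true_type());

    for (std::size_t i = 0; i < v.size(); ++i) std::cout << v[i] << ' ';
    std::cout << '\n';                                   // prints: 3 2 1
    return 0;
}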
- */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/execution_policy.h b/compat/thrust/system/cpp/detail/execution_policy.h deleted file mode 100644 index 229ff5c6c8..0000000000 --- a/compat/thrust/system/cpp/detail/execution_policy.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -// put the canonical tag in the same ns as the backend's entry points -namespace cpp -{ -namespace detail -{ - -// this awkward sequence of definitions arise -// from the desire both for tag to derive -// from execution_policy and for execution_policy -// to convert to tag (when execution_policy is not -// an ancestor of tag) - -// forward declaration of tag -struct tag; - -// forward declaration of execution_policy -template struct execution_policy; - -// specialize execution_policy for tag -template<> - struct execution_policy - : thrust::execution_policy -{}; - -// tag's definition comes before the -// generic definition of execution_policy -struct tag : execution_policy {}; - -// allow conversion to tag when it is not a successor -template - struct execution_policy - : thrust::execution_policy -{ - // allow conversion to tag - inline operator tag () const - { - return tag(); - } -}; - -} // end detail - -// alias execution_policy and tag here -using thrust::system::cpp::detail::execution_policy; -using thrust::system::cpp::detail::tag; - -} // end cpp -} // end system - -// alias items at top-level -namespace cpp -{ - -using thrust::system::cpp::execution_policy; -using thrust::system::cpp::tag; - -} // end cpp -} // end thrust - diff --git a/compat/thrust/system/cpp/detail/extrema.h b/compat/thrust/system/cpp/detail/extrema.h deleted file mode 100644 index 3eab6d406d..0000000000 --- a/compat/thrust/system/cpp/detail/extrema.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file extrema.h - * \brief C++ implementations of extrema functions. 
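A minimal illustrative sketch (not part of the original diff): the deleted cpp/detail/execution_policy.h orders its definitions so that tag can derive from execution_policy<tag> while every other execution_policy<Derived> still converts to tag, which is what its "awkward sequence of definitions" comment refers to. The self-contained sketch below reproduces that forward-declaration pattern; base_policy, my_policy and dispatch are invented names for illustration only.

// Illustrative sketch only: the tag / execution_policy mutual-dependency trick.
#include <iostream>

template <typename T> struct base_policy {};          // stand-in for thrust::execution_policy

struct tag;                                           // forward declaration of tag

template <typename Derived> struct execution_policy;  // forward declaration of the primary template

// The specialization for tag must not convert to tag (it *is* the tag),
// so it derives directly from the base.
template <> struct execution_policy<tag> : base_policy<tag> {};

// tag's definition can now use its own specialization as a base.
struct tag : execution_policy<tag> {};

// Every other policy derived from execution_policy converts to tag,
// so algorithms can be dispatched on the common backend tag.
template <typename Derived>
struct execution_policy : base_policy<Derived>
{
    operator tag() const { return tag(); }
};

struct my_policy : execution_policy<my_policy> {};

void dispatch(tag) { std::cout << "dispatched on the cpp tag\n"; }

int main()
{
    dispatch(my_policy());   // my_policy converts to tag
    dispatch(tag());         // exact match
    return 0;
}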
- */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template -ForwardIterator min_element(execution_policy &, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - return thrust::system::detail::internal::scalar::min_element(first, last, comp); -} - - -template -ForwardIterator max_element(execution_policy &, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - return thrust::system::detail::internal::scalar::max_element(first, last, comp); -} - - -template -thrust::pair minmax_element(execution_policy &, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - return thrust::system::detail::internal::scalar::minmax_element(first, last, comp); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/fill.h b/compat/thrust/system/cpp/detail/fill.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/fill.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/find.h b/compat/thrust/system/cpp/detail/find.h deleted file mode 100644 index 9698524ed8..0000000000 --- a/compat/thrust/system/cpp/detail/find.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file find.h - * \brief C++ implementation of find_if. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template -InputIterator find_if(tag, - InputIterator first, - InputIterator last, - Predicate pred) -{ - return thrust::system::detail::internal::scalar::find_if(first, last, pred); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/for_each.h b/compat/thrust/system/cpp/detail/for_each.h deleted file mode 100644 index 8d4e1c730b..0000000000 --- a/compat/thrust/system/cpp/detail/for_each.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - - -template -InputIterator for_each(thrust::system::cpp::detail::execution_policy &, - InputIterator first, - InputIterator last, - UnaryFunction f) -{ - return thrust::system::detail::internal::scalar::for_each(first, last, f); -} - -template -InputIterator for_each_n(thrust::system::cpp::detail::execution_policy &, - InputIterator first, - Size n, - UnaryFunction f) -{ - return thrust::system::detail::internal::scalar::for_each_n(first, n, f); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/gather.h b/compat/thrust/system/cpp/detail/gather.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/gather.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/generate.h b/compat/thrust/system/cpp/detail/generate.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/generate.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/get_value.h b/compat/thrust/system/cpp/detail/get_value.h deleted file mode 100644 index 5ddb2c8349..0000000000 --- a/compat/thrust/system/cpp/detail/get_value.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - - -template -__host__ __device__ - typename thrust::iterator_value::type - get_value(thrust::system::cpp::detail::execution_policy &, Pointer ptr) -{ - return *thrust::raw_pointer_cast(ptr); -} // end get_value() - - -} // end detail -} // end cpp -} // end system -} // end thrust - diff --git a/compat/thrust/system/cpp/detail/inner_product.h b/compat/thrust/system/cpp/detail/inner_product.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/inner_product.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/iter_swap.h b/compat/thrust/system/cpp/detail/iter_swap.h deleted file mode 100644 index 257276ffea..0000000000 --- a/compat/thrust/system/cpp/detail/iter_swap.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - - -template -__host__ __device__ - void iter_swap(tag, Pointer1 a, Pointer2 b) -{ - using thrust::swap; - swap(*thrust::raw_pointer_cast(a), *thrust::raw_pointer_cast(b)); -} // end iter_swap() - - -} // end detail -} // end cpp -} // end system -} // end thrust - diff --git a/compat/thrust/system/cpp/detail/logical.h b/compat/thrust/system/cpp/detail/logical.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/logical.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/malloc_and_free.h b/compat/thrust/system/cpp/detail/malloc_and_free.h deleted file mode 100644 index 4f8ae82092..0000000000 --- a/compat/thrust/system/cpp/detail/malloc_and_free.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include // for malloc & free -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - - -// note that malloc returns a raw pointer to avoid -// depending on the heavyweight thrust/system/cpp/memory.h header -template - void *malloc(execution_policy &, std::size_t n) -{ - return std::malloc(n); -} // end malloc() - - -template - void free(execution_policy &, Pointer ptr) -{ - std::free(thrust::raw_pointer_cast(ptr)); -} // end free() - - -} // end detail -} // end cpp -} // end system -} // end thrust - diff --git a/compat/thrust/system/cpp/detail/memory.inl b/compat/thrust/system/cpp/detail/memory.inl deleted file mode 100644 index 7f9a48dee7..0000000000 --- a/compat/thrust/system/cpp/detail/memory.inl +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace thrust -{ - -// XXX WAR an issue with MSVC 2005 (cl v14.00) incorrectly implementing -// pointer_raw_pointer for pointer by specializing it here -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) -namespace detail -{ - -template - struct pointer_raw_pointer< thrust::cpp::pointer > -{ - typedef typename thrust::cpp::pointer::raw_pointer type; -}; // end pointer_raw_pointer - -} // end detail -#endif - -namespace system -{ -namespace cpp -{ - - -template - template - reference & - reference - ::operator=(const reference &other) -{ - return super_t::operator=(other); -} // end reference::operator=() - -template - reference & - reference - ::operator=(const value_type &x) -{ - return super_t::operator=(x); -} // end reference::operator=() - -template -__host__ __device__ -void swap(reference a, reference b) -{ - a.swap(b); -} // end swap() - -pointer malloc(std::size_t n) -{ - tag t; - return pointer(thrust::system::cpp::detail::malloc(t, n)); -} // end malloc() - -template -pointer malloc(std::size_t n) -{ - pointer raw_ptr = thrust::system::cpp::malloc(sizeof(T) * n); - return pointer(reinterpret_cast(raw_ptr.get())); -} // end malloc() - -void free(pointer ptr) -{ - tag t; - return thrust::system::cpp::detail::free(t, ptr); -} // end free() - -} // end cpp -} // end system -} // end thrust - diff --git a/compat/thrust/system/cpp/detail/merge.h b/compat/thrust/system/cpp/detail/merge.h deleted file mode 100644 index 7f01c0713a..0000000000 --- a/compat/thrust/system/cpp/detail/merge.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template -OutputIterator merge(execution_policy &, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - return thrust::system::detail::internal::scalar::merge(first1, last1, first2, last2, result, comp); -} - -template -thrust::pair - merge_by_key(execution_policy &, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - return thrust::system::detail::internal::scalar::merge_by_key(keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, comp); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/mismatch.h b/compat/thrust/system/cpp/detail/mismatch.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/mismatch.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/par.h b/compat/thrust/system/cpp/detail/par.h deleted file mode 100644 index 953e5274db..0000000000 --- a/compat/thrust/system/cpp/detail/par.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - - -struct par_t : thrust::system::cpp::detail::execution_policy -{ - par_t() : thrust::system::cpp::detail::execution_policy() {} - - template - thrust::detail::execute_with_allocator - operator()(Allocator &alloc) const - { - return thrust::detail::execute_with_allocator(alloc); - } -}; - - -} // end detail - - -static const detail::par_t par; - - -} // end cpp -} // end system - - -// alias par here -namespace cpp -{ - - -using thrust::system::cpp::par; - - -} // end cpp -} // end thrust - diff --git a/compat/thrust/system/cpp/detail/partition.h b/compat/thrust/system/cpp/detail/partition.h deleted file mode 100644 index 25a4f1c346..0000000000 --- a/compat/thrust/system/cpp/detail/partition.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file partition.h - * \brief cpp implementations of partition functions - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template - ForwardIterator stable_partition(tag, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - return thrust::system::detail::internal::scalar::stable_partition(first, last, pred); -} - -template - ForwardIterator stable_partition(tag, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - return thrust::system::detail::internal::scalar::stable_partition(first, last, stencil, pred); -} - -template - thrust::pair - stable_partition_copy(tag, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - return thrust::system::detail::internal::scalar::stable_partition_copy(first, last, out_true, out_false, pred); -} - -template - thrust::pair - stable_partition_copy(tag, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - return thrust::system::detail::internal::scalar::stable_partition_copy(first, last, stencil, out_true, out_false, pred); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/reduce.h b/compat/thrust/system/cpp/detail/reduce.h deleted file mode 100644 index 5428206ba3..0000000000 --- a/compat/thrust/system/cpp/detail/reduce.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
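A minimal illustrative sketch (not part of the original diff): the deleted par.h defines thrust::cpp::par (plus an operator() that wraps a user allocator via execute_with_allocator), which lets callers dispatch an algorithm to the standard C++ backend explicitly, even on raw host arrays. A usage sketch, assuming the vendored headers are reachable as <thrust/...>:

// Illustrative sketch only: explicit dispatch with thrust::cpp::par.
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/system/cpp/execution_policy.h>
#include <iostream>

int main()
{
    int data[4] = {1, 2, 3, 4};

    // The first argument selects the standard C++ backend directly, so no
    // iterator-tag lookup (and no copy into a backend container) is involved.
    int sum = thrust::reduce(thrust::cpp::par, data, data + 4, 0, thrust::plus<int>());

    std::cout << sum << '\n';   // prints 10
    return 0;
}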
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce.h - * \brief C++ implementation of reduce algorithms. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template - OutputType reduce(execution_policy &, - InputIterator begin, - InputIterator end, - OutputType init, - BinaryFunction binary_op) -{ - return thrust::system::detail::internal::scalar::reduce(begin, end, init, binary_op); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/reduce_by_key.h b/compat/thrust/system/cpp/detail/reduce_by_key.h deleted file mode 100644 index 22dc2d9d3d..0000000000 --- a/compat/thrust/system/cpp/detail/reduce_by_key.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template - thrust::pair - reduce_by_key(execution_policy &, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - return thrust::system::detail::internal::scalar::reduce_by_key(keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/remove.h b/compat/thrust/system/cpp/detail/remove.h deleted file mode 100644 index cf2202bee3..0000000000 --- a/compat/thrust/system/cpp/detail/remove.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template - ForwardIterator remove_if(tag, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - return thrust::system::detail::internal::scalar::remove_if(first, last, pred); -} - - -template - ForwardIterator remove_if(tag, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - return thrust::system::detail::internal::scalar::remove_if(first, last, stencil, pred); -} - - -template - OutputIterator remove_copy_if(tag, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred) -{ - return thrust::system::detail::internal::scalar::remove_copy_if(first, last, result, pred); -} - - - -template - OutputIterator remove_copy_if(tag, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - return thrust::system::detail::internal::scalar::remove_copy_if(first, last, stencil, result, pred); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/replace.h b/compat/thrust/system/cpp/detail/replace.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/replace.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/reverse.h b/compat/thrust/system/cpp/detail/reverse.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/reverse.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/scan.h b/compat/thrust/system/cpp/detail/scan.h deleted file mode 100644 index d4bae1e739..0000000000 --- a/compat/thrust/system/cpp/detail/scan.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file scan.h - * \brief C++ implementations of scan functions. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template - OutputIterator inclusive_scan(execution_policy &, - InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - return thrust::system::detail::internal::scalar::inclusive_scan(first, last, result, binary_op); -} - - -template - OutputIterator exclusive_scan(execution_policy &, - InputIterator first, - InputIterator last, - OutputIterator result, - T init, - BinaryFunction binary_op) -{ - return thrust::system::detail::internal::scalar::exclusive_scan(first, last, result, init, binary_op); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/scan_by_key.h b/compat/thrust/system/cpp/detail/scan_by_key.h deleted file mode 100644 index 4165d842fd..0000000000 --- a/compat/thrust/system/cpp/detail/scan_by_key.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template - OutputIterator inclusive_scan_by_key(tag, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - return thrust::system::detail::internal::scalar::inclusive_scan_by_key(first1, last1, first2, result, binary_pred, binary_op); -} - - -template - OutputIterator exclusive_scan_by_key(tag, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - return thrust::system::detail::internal::scalar::exclusive_scan_by_key(first1, last1, first2, result, init, binary_pred, binary_op); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/scatter.h b/compat/thrust/system/cpp/detail/scatter.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/scatter.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
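A minimal illustrative sketch (not part of the original diff): the deleted scan.h simply forwards both scan flavors to the scalar implementations; the practical difference is that inclusive_scan's i-th output includes element i, while exclusive_scan starts from the supplied init value and excludes element i. A usage sketch, assuming the vendored headers are reachable as <thrust/...>:

// Illustrative sketch only: inclusive vs. exclusive scan on the cpp backend.
#include <thrust/scan.h>
#include <thrust/system/cpp/execution_policy.h>
#include <iostream>

int main()
{
    int data[4] = {1, 2, 3, 4};
    int inc[4];
    int exc[4];

    thrust::inclusive_scan(thrust::cpp::par, data, data + 4, inc);      // 1 3 6 10
    thrust::exclusive_scan(thrust::cpp::par, data, data + 4, exc, 10);  // 10 11 13 16

    for (int i = 0; i < 4; ++i) std::cout << inc[i] << ' ';
    std::cout << '\n';
    for (int i = 0; i < 4; ++i) std::cout << exc[i] << ' ';
    std::cout << '\n';
    return 0;
}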
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/sequence.h b/compat/thrust/system/cpp/detail/sequence.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/sequence.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/set_operations.h b/compat/thrust/system/cpp/detail/set_operations.h deleted file mode 100644 index 07ce71257b..0000000000 --- a/compat/thrust/system/cpp/detail/set_operations.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - - -template - OutputIterator set_difference(execution_policy &, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - return thrust::system::detail::internal::scalar::set_difference(first1, last1, first2, last2, result, comp); -} - - -template - OutputIterator set_intersection(execution_policy &, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - return thrust::system::detail::internal::scalar::set_intersection(first1, last1, first2, last2, result, comp); -} - - -template - OutputIterator set_symmetric_difference(execution_policy &, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - return thrust::system::detail::internal::scalar::set_symmetric_difference(first1, last1, first2, last2, result, comp); -} - - -template - OutputIterator set_union(execution_policy &, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - return thrust::system::detail::internal::scalar::set_union(first1, last1, first2, last2, result, comp); -} - - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/sort.h b/compat/thrust/system/cpp/detail/sort.h deleted file mode 100644 index 60244e22a3..0000000000 --- a/compat/thrust/system/cpp/detail/sort.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template - void stable_sort(execution_policy &, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - thrust::system::detail::internal::scalar::stable_sort(first, last, comp); -} - -template - void stable_sort_by_key(execution_policy &, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - thrust::system::detail::internal::scalar::stable_sort_by_key(keys_first, keys_last, values_first, comp); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/swap_ranges.h b/compat/thrust/system/cpp/detail/swap_ranges.h deleted file mode 100644 index a834a2c0ed..0000000000 --- a/compat/thrust/system/cpp/detail/swap_ranges.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// cpp has no special swap_ranges - diff --git a/compat/thrust/system/cpp/detail/tabulate.h b/compat/thrust/system/cpp/detail/tabulate.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/tabulate.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/temporary_buffer.h b/compat/thrust/system/cpp/detail/temporary_buffer.h deleted file mode 100644 index 628bd75719..0000000000 --- a/compat/thrust/system/cpp/detail/temporary_buffer.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -// this system has no special temporary buffer functions - diff --git a/compat/thrust/system/cpp/detail/transform.h b/compat/thrust/system/cpp/detail/transform.h deleted file mode 100644 index 5909d4a4fb..0000000000 --- a/compat/thrust/system/cpp/detail/transform.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// cpp has no special transform - diff --git a/compat/thrust/system/cpp/detail/transform_reduce.h b/compat/thrust/system/cpp/detail/transform_reduce.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/transform_reduce.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/transform_scan.h b/compat/thrust/system/cpp/detail/transform_scan.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/transform_scan.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/uninitialized_copy.h b/compat/thrust/system/cpp/detail/uninitialized_copy.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/uninitialized_copy.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/uninitialized_fill.h b/compat/thrust/system/cpp/detail/uninitialized_fill.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cpp/detail/uninitialized_fill.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cpp/detail/unique.h b/compat/thrust/system/cpp/detail/unique.h deleted file mode 100644 index cf740498e1..0000000000 --- a/compat/thrust/system/cpp/detail/unique.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template - ForwardIterator unique(execution_policy &, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred) -{ - return thrust::system::detail::internal::scalar::unique(first, last, binary_pred); -} - -template - OutputIterator unique_copy(execution_policy &, - InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate binary_pred) -{ - return thrust::system::detail::internal::scalar::unique_copy(first, last, output, binary_pred); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/unique_by_key.h b/compat/thrust/system/cpp/detail/unique_by_key.h deleted file mode 100644 index a9f13d6a27..0000000000 --- a/compat/thrust/system/cpp/detail/unique_by_key.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template - thrust::pair - unique_by_key(execution_policy &, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred) -{ - return thrust::system::detail::internal::scalar::unique_by_key(keys_first, keys_last, values_first, binary_pred); -} - - -template - thrust::pair - unique_by_key_copy(execution_policy &, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred) -{ - return thrust::system::detail::internal::scalar::unique_by_key_copy(keys_first, keys_last, values_first, keys_output, values_output, binary_pred); -} - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cpp/detail/vector.inl b/compat/thrust/system/cpp/detail/vector.inl deleted file mode 100644 index 03bffcd8aa..0000000000 --- a/compat/thrust/system/cpp/detail/vector.inl +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ - -template - vector - ::vector() - : super_t() -{} - -template - vector - ::vector(size_type n) - : super_t(n) -{} - -template - vector - ::vector(size_type n, const value_type &value) - : super_t(n,value) -{} - -template - vector - ::vector(const vector &x) - : super_t(x) -{} - -template - template - vector - ::vector(const thrust::detail::vector_base &x) - : super_t(x) -{} - -template - template - vector - ::vector(const std::vector &x) - : super_t(x) -{} - -template - template - vector - ::vector(InputIterator first, InputIterator last) - : super_t(first,last) -{} - -template - template - vector & - vector - ::operator=(const std::vector &x) -{ - super_t::operator=(x); - return *this; -} - -template - template - vector & - vector - ::operator=(const thrust::detail::vector_base &x) -{ - super_t::operator=(x); - return *this; -} - -} // end cpp -} // end system -} // end thrust - diff --git a/compat/thrust/system/cpp/execution_policy.h b/compat/thrust/system/cpp/execution_policy.h deleted file mode 100644 index f192eb9659..0000000000 --- a/compat/thrust/system/cpp/execution_policy.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -/*! \file thrust/system/cpp/execution_policy.h - * \brief Execution policies for Thrust's standard C++ system. - */ - -#include - -// get the execution policies definitions first -#include - -// get the definition of par -#include - -// now get all the algorithm definitions - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -// define these entities here for the purpose of Doxygenating them -// they are actually defined elsewhere -#if 0 -namespace thrust -{ -namespace system -{ -namespace cpp -{ - - -/*! \addtogroup execution_policies - * \{ - */ - - -/*! \p thrust::system::cpp::execution_policy is the base class for all Thrust parallel execution - * policies which are derived from Thrust's standard C++ backend system. - */ -template -struct execution_policy : thrust::execution_policy -{}; - - -/*! \p thrust::system::cpp::tag is a type representing Thrust's standard C++ backend system in C++'s type system. - * Iterators "tagged" with a type which is convertible to \p cpp::tag assert that they may be - * "dispatched" to algorithm implementations in the \p cpp system. - */ -struct tag : thrust::system::cpp::execution_policy { unspecified }; - - -/*! - * \p thrust::system::cpp::par is the parallel execution policy associated with Thrust's standard - * C++ backend system. 
- * - * Instead of relying on implicit algorithm dispatch through iterator system tags, users may - * directly target Thrust's C++ backend system by providing \p thrust::cpp::par as an algorithm - * parameter. - * - * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such - * as \p thrust::cpp::vector. - * - * The type of \p thrust::cpp::par is implementation-defined. - * - * The following code snippet demonstrates how to use \p thrust::cpp::par to explicitly dispatch an - * invocation of \p thrust::for_each to the standard C++ backend system: - * - * \code - * #include <thrust/for_each.h> - * #include <thrust/system/cpp/execution_policy.h> - * #include <cstdio> - * - * struct printf_functor - * { - * __host__ __device__ - * void operator()(int x) - * { - * printf("%d\n", x); - * } - * }; - * ... - * int vec[3]; - * vec[0] = 0; vec[1] = 1; vec[2] = 2; - * - * thrust::for_each(thrust::cpp::par, vec, vec + 3, printf_functor()); - * - * // 0 1 2 is printed to standard output in some unspecified order - * \endcode - */ -static const unspecified par; - - -/*! \} - */ - - -} // end cpp -} // end system -} // end thrust -#endif - - diff --git a/compat/thrust/system/cpp/memory.h b/compat/thrust/system/cpp/memory.h deleted file mode 100644 index f3a58b8c32..0000000000 --- a/compat/thrust/system/cpp/memory.h +++ /dev/null @@ -1,414 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/system/cpp/memory.h - * \brief Managing memory associated with Thrust's standard C++ system. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ - -template class pointer; - -} // end cpp -} // end system -} // end thrust - - -/*! \cond - */ - -// specialize std::iterator_traits to avoid problems with the name of -// pointer's constructor shadowing its nested pointer type -// do this before pointer is defined so the specialization is correctly -// used inside the definition -namespace std -{ - -template - struct iterator_traits > -{ - private: - typedef thrust::system::cpp::pointer ptr; - - public: - typedef typename ptr::iterator_category iterator_category; - typedef typename ptr::value_type value_type; - typedef typename ptr::difference_type difference_type; - typedef ptr pointer; - typedef typename ptr::reference reference; -}; // end iterator_traits - -} // end std - -/*! \endcond - */ - - -namespace thrust -{ -namespace system -{ - -/*! \addtogroup system_backends Systems - * \ingroup system - * \{ - */ - -/*! \namespace thrust::system::cpp - * \brief \p thrust::system::cpp is the namespace containing functionality for allocating, manipulating, - * and deallocating memory available to Thrust's standard C++ backend system. - * The identifiers are provided in a separate namespace underneath thrust::system - * for import convenience but are also aliased in the top-level thrust::cpp - * namespace for easy access.
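To make the allocation interface described above concrete, here is a minimal sketch (not part of the deleted header; it assumes a standard Thrust installation, e.g. the copy shipped with the CUDA toolkit) that round-trips an allocation through the top-level thrust::cpp aliases:

    #include <thrust/system/cpp/memory.h>
    #include <thrust/memory.h>
    #include <thrust/fill.h>

    int main()
    {
      // allocate ten ints in memory owned by the cpp system
      thrust::cpp::pointer<int> p = thrust::cpp::malloc<int>(10);

      // the tagged pointer is a valid iterator, so this fill dispatches
      // to the standard C++ backend without an explicit policy argument
      thrust::fill(p, p + 10, 13);

      // recover the raw pointer whenever plain C++ code needs it
      int *raw = thrust::raw_pointer_cast(p);
      (void) raw;

      // cpp::pointer is not a smart pointer; release the memory explicitly
      thrust::cpp::free(p);
      return 0;
    }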
- * - */ -namespace cpp -{ - -// forward declaration of reference for pointer -template class reference; - -/*! \cond - */ - -// XXX nvcc + msvc have trouble instantiating reference below -// this is a workaround -namespace detail -{ - -template - struct reference_msvc_workaround -{ - typedef thrust::system::cpp::reference type; -}; // end reference_msvc_workaround - -} // end detail - -/*! \endcond - */ - - -/*! \p pointer stores a pointer to an object allocated in memory available to the cpp system. - * This type provides type safety when dispatching standard algorithms on ranges resident - * in cpp memory. - * - * \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic. - * - * \p pointer can be created with the function \p cpp::malloc, or by explicitly calling its constructor - * with a raw pointer. - * - * The raw pointer encapsulated by a \p pointer may be obtained by eiter its get member function - * or the \p raw_pointer_cast function. - * - * \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory - * pointed to by \p pointer. - * - * \tparam T specifies the type of the pointee. - * - * \see cpp::malloc - * \see cpp::free - * \see raw_pointer_cast - */ -template - class pointer - : public thrust::pointer< - T, - thrust::system::cpp::tag, - thrust::system::cpp::reference, - thrust::system::cpp::pointer - > -{ - /*! \cond - */ - - private: - typedef thrust::pointer< - T, - thrust::system::cpp::tag, - //thrust::system::cpp::reference, - typename detail::reference_msvc_workaround::type, - thrust::system::cpp::pointer - > super_t; - - /*! \endcond - */ - - public: - // note that cpp::pointer's member functions need __host__ __device__ - // to interoperate with nvcc + iterators' dereference member function - - /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0. - */ - __host__ __device__ - pointer() : super_t() {} - - /*! This constructor allows construction of a pointer from a T*. - * - * \param ptr A raw pointer to copy from, presumed to point to a location in memory - * accessible by the \p cpp system. - * \tparam OtherT \p OtherT shall be convertible to \p T. - */ - template - __host__ __device__ - explicit pointer(OtherT *ptr) : super_t(ptr) {} - - /*! This constructor allows construction from another pointer-like object with related type. - * - * \param other The \p OtherPointer to copy. - * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible - * to \p thrust::system::cpp::tag and its element type shall be convertible to \p T. - */ - template - __host__ __device__ - pointer(const OtherPointer &other, - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer - >::type * = 0) : super_t(other) {} - - /*! Assignment operator allows assigning from another pointer-like object with related type. - * - * \param other The other pointer-like object to assign from. - * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible - * to \p thrust::system::cpp::tag and its element type shall be convertible to \p T. - */ - template - __host__ __device__ - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer, - pointer & - >::type - operator=(const OtherPointer &other) - { - return super_t::operator=(other); - } -}; // end pointer - - -/*! \p reference is a wrapped reference to an object stored in memory available to the \p cpp system. 
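The same tagging idea works for memory that was not obtained from cpp::malloc: wrapping an existing raw pointer in a cpp::pointer is enough to steer algorithm dispatch to the cpp system. A small illustrative sketch (fill_with_indices is our name, not Thrust's):

    #include <thrust/system/cpp/memory.h>
    #include <thrust/sequence.h>
    #include <cstddef>

    // illustrative helper: fill an existing host buffer with 0, 1, 2, ...
    void fill_with_indices(int *raw, std::size_t n)
    {
      // wrap the raw pointer; this only attaches the cpp tag, no copy is made
      thrust::cpp::pointer<int> first(raw);

      // the range [first, first + n) now dispatches to the standard C++ system
      thrust::sequence(first, first + n);
    }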
- * \p reference is the type of the result of dereferencing a \p cpp::pointer. - * - * \tparam T Specifies the type of the referenced object. - */ -template - class reference - : public thrust::reference< - T, - thrust::system::cpp::pointer, - thrust::system::cpp::reference - > -{ - /*! \cond - */ - - private: - typedef thrust::reference< - T, - thrust::system::cpp::pointer, - thrust::system::cpp::reference - > super_t; - - /*! \endcond - */ - - public: - /*! \cond - */ - - typedef typename super_t::value_type value_type; - typedef typename super_t::pointer pointer; - - /*! \endcond - */ - - /*! This constructor initializes this \p reference to refer to an object - * pointed to by the given \p pointer. After this \p reference is constructed, - * it shall refer to the object pointed to by \p ptr. - * - * \param ptr A \p pointer to copy from. - */ - __host__ __device__ - explicit reference(const pointer &ptr) - : super_t(ptr) - {} - - /*! This constructor accepts a const reference to another \p reference of related type. - * After this \p reference is constructed, it shall refer to the same object as \p other. - * - * \param other A \p reference to copy from. - * \tparam OtherT The element type of the other \p reference. - * - * \note This constructor is templated primarily to allow initialization of reference - * from reference. - */ - template - __host__ __device__ - reference(const reference &other, - typename thrust::detail::enable_if_convertible< - typename reference::pointer, - pointer - >::type * = 0) - : super_t(other) - {} - - /*! Copy assignment operator copy assigns from another \p reference of related type. - * - * \param other The other \p reference to assign from. - * \return *this - * \tparam OtherT The element type of the other \p reference. - */ - template - reference &operator=(const reference &other); - - /*! Assignment operator assigns from a \p value_type. - * - * \param x The \p value_type to assign from. - * \return *this - */ - reference &operator=(const value_type &x); -}; // end reference - -/*! Exchanges the values of two objects referred to by \p reference. - * \p x The first \p reference of interest. - * \p y The second \p reference ot interest. - */ -template -__host__ __device__ -void swap(reference x, reference y); - -/*! Allocates an area of memory available to Thrust's cpp system. - * \param n Number of bytes to allocate. - * \return A cpp::pointer pointing to the beginning of the newly - * allocated memory. A null cpp::pointer is returned if - * an error occurs. - * \note The cpp::pointer returned by this function must be - * deallocated with \p cpp::free. - * \see cpp::free - * \see std::malloc - */ -inline pointer malloc(std::size_t n); - -/*! Allocates a typed area of memory available to Thrust's cpp system. - * \param n Number of elements to allocate. - * \return A cpp::pointer pointing to the beginning of the newly - * allocated elements. A null cpp::pointer is returned if - * an error occurs. - * \note The cpp::pointer returned by this function must be - * deallocated with \p cpp::free. - * \see cpp::free - * \see std::malloc - */ -template -inline pointer malloc(std::size_t n); - -/*! Deallocates an area of memory previously allocated by cpp::malloc. - * \param ptr A cpp::pointer pointing to the beginning of an area - * of memory previously allocated with cpp::malloc. - * \see cpp::malloc - * \see std::free - */ -inline void free(pointer ptr); - -// XXX upon c++11 -// template using allocator = thrust::detail::malloc_allocator >; - -/*! 
\p cpp::allocator is the default allocator used by the \p cpp system's containers such as - * cpp::vector if no user-specified allocator is provided. \p cpp::allocator allocates - * (deallocates) storage with \p cpp::malloc (\p cpp::free). - */ -template - struct allocator - : thrust::detail::malloc_allocator< - T, - tag, - pointer - > -{ - /*! The \p rebind metafunction provides the type of an \p allocator - * instantiated with another type. - * - * \tparam U The other type to use for instantiation. - */ - template - struct rebind - { - /*! The typedef \p other gives the type of the rebound \p allocator. - */ - typedef allocator other; - }; - - /*! No-argument constructor has no effect. - */ - __host__ __device__ - inline allocator() {} - - /*! Copy constructor has no effect. - */ - __host__ __device__ - inline allocator(const allocator &) {} - - /*! Constructor from other \p allocator has no effect. - */ - template - __host__ __device__ - inline allocator(const allocator &) {} - - /*! Destructor has no effect. - */ - __host__ __device__ - inline ~allocator() {} -}; // end allocator - -} // end cpp - -/*! \} - */ - -} // end system - -/*! \namespace thrust::cpp - * \brief \p thrust::cpp is a top-level alias for thrust::system::cpp. - */ -namespace cpp -{ - -using thrust::system::cpp::pointer; -using thrust::system::cpp::reference; -using thrust::system::cpp::malloc; -using thrust::system::cpp::free; -using thrust::system::cpp::allocator; - -} // end cpp - -} // end thrust - -#include - diff --git a/compat/thrust/system/cpp/vector.h b/compat/thrust/system/cpp/vector.h deleted file mode 100644 index 4282df991a..0000000000 --- a/compat/thrust/system/cpp/vector.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/system/cpp/vector.h - * \brief A dynamically-sizable array of elements which reside in memory available to - * Thrust's standard C++ system. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -// forward declaration of host_vector -template class host_vector; - -namespace system -{ -namespace cpp -{ - -// XXX upon c++11 -// template > using vector = thrust::detail::vector_base; - -/*! \p cpp::vector is a container that supports random access to elements, - * constant time removal of elements at the end, and linear time insertion - * and removal of elements at the beginning or in the middle. The number of - * elements in a \p cpp::vector may vary dynamically; memory management is - * automatic. The elements contained in a \p cpp::vector reside in memory - * available to the \p cpp system. - * - * \tparam T The element type of the \p cpp::vector. - * \tparam Allocator The allocator type of the \p cpp::vector. Defaults to \p cpp::allocator. 
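A short usage sketch for the container documented here (illustrative only, assuming the Thrust headers are available): construction from a std::vector performs an element-wise copy, and the resulting iterators carry the cpp tag, so algorithms dispatch to the standard C++ backend.

    #include <thrust/system/cpp/vector.h>
    #include <thrust/sort.h>
    #include <vector>

    // illustrative helper, not part of Thrust
    thrust::cpp::vector<int> sorted_copy(const std::vector<int> &input)
    {
      // element-wise copy via the documented converting constructor
      thrust::cpp::vector<int> v(input);

      // iterators are tagged with the cpp system, so this sorts in place
      // using Thrust's standard C++ backend
      thrust::sort(v.begin(), v.end());
      return v;
    }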
- * - * \see http://www.sgi.com/tech/stl/Vector.html - * \see host_vector For the documentation of the complete interface which is - * shared by \p cpp::vector - * \see device_vector - */ -template > - class vector - : public thrust::detail::vector_base -{ - /*! \cond - */ - private: - typedef thrust::detail::vector_base super_t; - /*! \endcond - */ - - public: - - /*! \cond - */ - typedef typename super_t::size_type size_type; - typedef typename super_t::value_type value_type; - - /*! \endcond - */ - - /*! This constructor creates an empty \p cpp::vector. - */ - vector(); - - /*! This constructor creates a \p cpp::vector with \p n default-constructed elements. - * \param n The size of the \p cpp::vector to create. - */ - explicit vector(size_type n); - - /*! This constructor creates a \p cpp::vector with \p n copies of \p value. - * \param n The size of the \p cpp::vector to create. - * \param value An element to copy. - */ - explicit vector(size_type n, const value_type &value); - - /*! Copy constructor copies from another \p cpp::vector. - * \param x The other \p cpp::vector to copy. - */ - vector(const vector &x); - - /*! This constructor copies from another Thrust vector-like object. - * \param x The other object to copy from. - */ - template - vector(const thrust::detail::vector_base &x); - - /*! This constructor copies from a \c std::vector. - * \param x The \c std::vector to copy from. - */ - template - vector(const std::vector &x); - - /*! This constructor creates a \p cpp::vector by copying from a range. - * \param first The beginning of the range. - * \param last The end of the range. - */ - template - vector(InputIterator first, InputIterator last); - - // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns - - /*! Assignment operator assigns from a \c std::vector. - * \param x The \c std::vector to assign from. - * \return *this - */ - template - vector &operator=(const std::vector &x); - - /*! Assignment operator assigns from another Thrust vector-like object. - * \param x The other object to assign from. - * \return *this - */ - template - vector &operator=(const thrust::detail::vector_base &x); -}; // end vector - -} // end cpp -} // end system - -// alias system::cpp names at top-level -namespace cpp -{ - -using thrust::system::cpp::vector; - -} // end cpp - -} // end thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/adjacent_difference.h b/compat/thrust/system/cuda/detail/adjacent_difference.h deleted file mode 100644 index ec51794ff9..0000000000 --- a/compat/thrust/system/cuda/detail/adjacent_difference.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file adjacent_difference.h - * \brief CUDA implementation of adjacent_difference. 
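Before reading the CUDA implementation, it helps to recall the contract adjacent_difference has to satisfy. A serial reference sketch (illustrative, not part of the header):

    #include <iterator>

    // Serial reference for adjacent_difference semantics:
    // result[0] = input[0], result[i] = op(input[i], input[i-1]) for i > 0.
    template <typename InputIt, typename OutputIt, typename BinaryOp>
    OutputIt adjacent_difference_reference(InputIt first, InputIt last,
                                           OutputIt result, BinaryOp op)
    {
      if (first == last) return result;

      typename std::iterator_traits<InputIt>::value_type prev = *first;
      *result = prev;
      ++first; ++result;

      for (; first != last; ++first, ++result)
      {
        typename std::iterator_traits<InputIt>::value_type curr = *first;
        *result = op(curr, prev);
        prev = curr;
      }
      return result;
    }

The CUDA version below reproduces exactly this: each block first gathers the element just to the left of its interval into temporary storage, so threads can read their left neighbour without racing against the threads that overwrite it.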
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template -OutputIterator adjacent_difference(execution_policy &exec, - InputIterator first, InputIterator last, - OutputIterator result, - BinaryFunction binary_op); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/adjacent_difference.inl b/compat/thrust/system/cuda/detail/adjacent_difference.inl deleted file mode 100644 index 9e4756a5a3..0000000000 --- a/compat/thrust/system/cuda/detail/adjacent_difference.inl +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -template -struct last_index_in_each_interval : public thrust::unary_function -{ - typedef typename Decomposition::index_type index_type; - - Decomposition decomp; - - last_index_in_each_interval(Decomposition decomp) : decomp(decomp) {} - - __host__ __device__ - index_type operator()(index_type interval) - { - return decomp[interval].end() - 1; - } -}; - -template -struct adjacent_difference_closure -{ - InputIterator1 input; - InputIterator2 input_copy; - OutputIterator output; - BinaryFunction binary_op; - Decomposition decomp; - Context context; - - typedef Context context_type; - - adjacent_difference_closure(InputIterator1 input, - InputIterator2 input_copy, - OutputIterator output, - BinaryFunction binary_op, - Decomposition decomp, - Context context = Context()) - : input(input), input_copy(input_copy), output(output), binary_op(binary_op), decomp(decomp), context(context) {} - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename thrust::iterator_value::type InputType; - typedef typename Decomposition::index_type index_type; - - // this block processes results in [range.begin(), range.end()) - thrust::system::detail::internal::index_range range = decomp[context.block_index()]; - - input_copy += context.block_index() - 1; - - // prime the temp values for all threads so we don't need to launch a default constructor - InputType next_left = (context.block_index() == 0) ? 
*input : *input_copy; - - index_type base = range.begin(); - index_type i = range.begin() + context.thread_index(); - - if (i < range.end()) - { - if (context.thread_index() > 0) - { - InputIterator1 temp = input + (i - 1); - next_left = *temp; - } - } - - input += i; - output += i; - - while (base < range.end()) - { - InputType curr_left = next_left; - - if (i + context.block_dimension() < range.end()) - { - InputIterator1 temp = input + (context.block_dimension() - 1); - next_left = *temp; - } - - context.barrier(); - - if (i < range.end()) - { - if (i == 0) - *output = *input; - else - { - InputType x = *input; - *output = binary_op(x, curr_left); - } - } - - i += context.block_dimension(); - base += context.block_dimension(); - input += context.block_dimension(); - output += context.block_dimension(); - } - } -}; - -} // end namespace detail - - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN - -template -OutputIterator adjacent_difference(execution_policy &exec, - InputIterator first, InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - typedef typename thrust::iterator_value::type InputType; - typedef typename thrust::iterator_difference::type IndexType; - typedef thrust::system::detail::internal::uniform_decomposition Decomposition; - - IndexType n = last - first; - - if (n == 0) - return result; - - Decomposition decomp = default_decomposition(last - first); - - // allocate temporary storage - thrust::detail::temporary_array temp(exec, decomp.size() - 1); - - // gather last value in each interval - detail::last_index_in_each_interval unary_op(decomp); - thrust::gather(exec, - thrust::make_transform_iterator(thrust::counting_iterator(0), unary_op), - thrust::make_transform_iterator(thrust::counting_iterator(0), unary_op) + (decomp.size() - 1), - first, - temp.begin()); - - - typedef typename thrust::detail::temporary_array::iterator InputIterator2; - typedef detail::blocked_thread_array Context; - typedef detail::adjacent_difference_closure Closure; - - Closure closure(first, temp.begin(), result, binary_op, decomp); - - detail::launch_closure(closure, decomp.size()); - - return result + n; -} - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/assign_value.h b/compat/thrust/system/cuda/detail/assign_value.h deleted file mode 100644 index c90cf65b79..0000000000 --- a/compat/thrust/system/cuda/detail/assign_value.h +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -// XXX WAR an issue with msvc 2005 (cl v14.00) which creates multiply-defined -// symbols resulting from assign_value -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) - -namespace -{ - -template -inline __host__ __device__ - void assign_value_msvc2005_war(thrust::cuda::execution_policy &exec, Pointer1 dst, Pointer2 src) -{ - // XXX war nvbugs/881631 - struct war_nvbugs_881631 - { - __host__ inline static void host_path(thrust::cuda::execution_policy &exec, Pointer1 dst, Pointer2 src) - { - thrust::copy(exec, src, src + 1, dst); - } - - __device__ inline static void device_path(thrust::cuda::execution_policy &, Pointer1 dst, Pointer2 src) - { - *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src); - } - }; - -#ifndef __CUDA_ARCH__ - war_nvbugs_881631::host_path(exec,dst,src); -#else - war_nvbugs_881631::device_path(exec,dst,src); -#endif // __CUDA_ARCH__ -} // end assign_value_msvc2005_war() - -} // end anon namespace - -template -inline __host__ __device__ - void assign_value(thrust::cuda::execution_policy &exec, Pointer1 dst, Pointer2 src) -{ - return assign_value_msvc2005_war(exec,dst,src); -} // end assign_value() - -#else - -template -inline __host__ __device__ - void assign_value(thrust::cuda::execution_policy &exec, Pointer1 dst, Pointer2 src) -{ - // XXX war nvbugs/881631 - struct war_nvbugs_881631 - { - __host__ inline static void host_path(thrust::cuda::execution_policy &exec, Pointer1 dst, Pointer2 src) - { - thrust::copy(exec, src, src + 1, dst); - } - - __device__ inline static void device_path(thrust::cuda::execution_policy &, Pointer1 dst, Pointer2 src) - { - *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src); - } - }; - -#ifndef __CUDA_ARCH__ - war_nvbugs_881631::host_path(exec,dst,src); -#else - war_nvbugs_881631::device_path(exec,dst,src); -#endif // __CUDA_ARCH__ -} // end assign_value() - -#endif // msvc 2005 WAR - - -// XXX WAR an issue with msvc 2005 (cl v14.00) which creates multiply-defined -// symbols resulting from assign_value -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) - -namespace -{ - - -template -inline __host__ __device__ - void assign_value_msvc2005_war(cross_system &systems, Pointer1 dst, Pointer2 src) -{ - // XXX war nvbugs/881631 - struct war_nvbugs_881631 - { - __host__ inline static void host_path(cross_system &systems, Pointer1 dst, Pointer2 src) - { - // rotate the systems so that they are ordered the same as (src, dst) - // for the call to thrust::copy - cross_system rotated_systems = systems.rotate(); - thrust::copy(rotated_systems, src, src + 1, dst); - } - - __device__ inline static void device_path(cross_system &systems, Pointer1 dst, Pointer2 src) - { - // XXX forward the true cuda::execution_policy inside systems here - // instead of materializing a tag - thrust::cuda::tag cuda_tag; - thrust::system::cuda::detail::assign_value(cuda_tag, dst, src); - } - }; - -#if __CUDA_ARCH__ - war_nvbugs_881631::device_path(systems,dst,src); -#else - war_nvbugs_881631::host_path(systems,dst,src); -#endif -} // end assign_value_msvc2005_war - - -} // end anon namespace - - -template -inline __host__ __device__ - void assign_value(cross_system &systems, Pointer1 dst, Pointer2 src) -{ - return assign_value_msvc2005_war(systems,dst,src); -} // end assign_value() - - -#else - - -template -inline __host__ __device__ - void 
assign_value(cross_system &systems, Pointer1 dst, Pointer2 src) -{ - // XXX war nvbugs/881631 - struct war_nvbugs_881631 - { - __host__ inline static void host_path(cross_system &systems, Pointer1 dst, Pointer2 src) - { - // rotate the systems so that they are ordered the same as (src, dst) - // for the call to thrust::copy - cross_system rotated_systems = systems.rotate(); - thrust::copy(rotated_systems, src, src + 1, dst); - } - - __device__ inline static void device_path(cross_system &systems, Pointer1 dst, Pointer2 src) - { - // XXX forward the true cuda::execution_policy inside systems here - // instead of materializing a tag - thrust::cuda::tag cuda_tag; - thrust::system::cuda::detail::assign_value(cuda_tag, dst, src); - } - }; - -#if __CUDA_ARCH__ - war_nvbugs_881631::device_path(systems,dst,src); -#else - war_nvbugs_881631::host_path(systems,dst,src); -#endif -} // end assign_value() - - -#endif // msvc 2005 WAR - - -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/binary_search.h b/compat/thrust/system/cuda/detail/binary_search.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/binary_search.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/block/copy.h b/compat/thrust/system/cuda/detail/block/copy.h deleted file mode 100644 index 9cc786bfcf..0000000000 --- a/compat/thrust/system/cuda/detail/block/copy.h +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file copy.h - * \brief CUDA implementation of device-to-device copy, - * based on Gregory Diamos' memcpy code. 
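The core of this header is a block-strided loop: each thread of a block copies elements thread_index, thread_index + block_dimension, and so on, and trivial_copy additionally splits an arbitrary byte count into 8-byte (int2-sized) chunks plus a byte remainder when both pointers are suitably aligned. A plain-CUDA sketch of the strided building block (illustrative only; the real code is parameterized on a Context type):

    // Block-strided element copy, the pattern behind aligned_copy below.
    template <typename T>
    __device__ void block_strided_copy(T *dst, const T *src, unsigned int num_elements)
    {
      // thread t handles elements t, t + blockDim.x, t + 2*blockDim.x, ...
      for (unsigned int i = threadIdx.x; i < num_elements; i += blockDim.x)
        dst[i] = src[i];
    }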
- */ - -#pragma once - -#include - -#include - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace block -{ - -namespace trivial_copy_detail -{ - - -template - inline __device__ thrust::pair quotient_and_remainder(Size n, Size d) -{ - Size quotient = n / d; - Size remainder = n - d * quotient; - return thrust::make_pair(quotient,remainder); -} // end quotient_and_remainder() - - -// assumes the addresses dst & src are aligned to T boundaries -template -__device__ __thrust_forceinline__ -void aligned_copy(Context context, T *dst, const T *src, unsigned int num_elements) -{ - for(unsigned int i = context.thread_index(); - i < num_elements; - i += context.block_dimension()) - { - dst[i] = src[i]; - } -} // end aligned_copy() - - -} // end namespace trivial_copy_detail - - -template -__device__ __thrust_forceinline__ -void trivial_copy(Context context, void* destination_, const void* source_, size_t num_bytes) -{ - // reinterpret at bytes - char* destination = reinterpret_cast(destination_); - const char* source = reinterpret_cast(source_); - - // TODO replace this with uint64 -#if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC - typedef long long int2; - typedef long long uint2; -#endif // THRUST_DEVICE_COMPILER_NVCC - - // check alignment - // XXX can we do this in three steps? - // 1. copy until alignment is met - // 2. go hog wild - // 3. get the remainder - if(reinterpret_cast(destination) % sizeof(uint2) != 0 || reinterpret_cast(source) % sizeof(uint2) != 0) - { - for(unsigned int i = context.thread_index(); i < num_bytes; i += context.block_dimension()) - { - destination[i] = source[i]; - } - } - else - { - // it's aligned; do a wide copy - - // this pair stores the number of int2s in the aligned portion of the arrays - // and the number of bytes in the remainder - const thrust::pair num_wide_elements_and_remainder_bytes = trivial_copy_detail::quotient_and_remainder(num_bytes, sizeof(int2)); - - // copy int2 elements - trivial_copy_detail::aligned_copy(context, - reinterpret_cast(destination), - reinterpret_cast(source), - num_wide_elements_and_remainder_bytes.first); - - // XXX we could copy int elements here - - // copy remainder byte by byte - - // to find the beginning of the remainder arrays, we need to point at the beginning, and then skip the number of bytes in the aligned portion - // this is sizeof(int2) times the number of int2s comprising the aligned portion - const char *remainder_first = reinterpret_cast(source + sizeof(int2) * num_wide_elements_and_remainder_bytes.first); - char *remainder_result = reinterpret_cast(destination + sizeof(int2) * num_wide_elements_and_remainder_bytes.first); - - trivial_copy_detail::aligned_copy(context, remainder_result, remainder_first, num_wide_elements_and_remainder_bytes.second); - } -} // end trivial_copy() - - -namespace detail -{ -namespace dispatch -{ - -template - __thrust_forceinline__ __device__ - RandomAccessIterator2 copy(Context context, - RandomAccessIterator1 first, - RandomAccessIterator1 last, - RandomAccessIterator2 result, - thrust::detail::true_type is_trivial_copy) -{ - typedef typename thrust::iterator_value::type T; - - const T *src = &thrust::raw_reference_cast(*first); - T *dst = &thrust::raw_reference_cast(*result); - - size_t n = (last - first); - thrust::system::cuda::detail::block::trivial_copy(context, dst, src, n * sizeof(T)); - return result + n; -} // end copy() - -template - __thrust_forceinline__ __device__ - 
RandomAccessIterator2 copy(Context context, - RandomAccessIterator1 first, - RandomAccessIterator1 last, - RandomAccessIterator2 result, - thrust::detail::false_type is_trivial_copy) -{ - RandomAccessIterator2 end_of_output = result + (last - first); - - // advance iterators - first += context.thread_index(); - result += context.thread_index(); - - for(; - first < last; - first += context.block_dimension(), - result += context.block_dimension()) - { - *result = *first; - } // end for - - return end_of_output; -} // end copy() - -} // end namespace dispatch -} // end namespace detail - -template - __thrust_forceinline__ __device__ - RandomAccessIterator2 copy(Context context, - RandomAccessIterator1 first, - RandomAccessIterator1 last, - RandomAccessIterator2 result) -{ - return detail::dispatch::copy(context, first, last, result, -#if __CUDA_ARCH__ < 200 - // does not work reliably on pre-Fermi due to "Warning: ... assuming global memory space" issues - thrust::detail::false_type() -#else - typename thrust::detail::dispatch::is_trivial_copy::type() -#endif - ); -} // end copy() - - -template -inline __device__ -RandomAccessIterator2 copy_n(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result) -{ - for(Size i = ctx.thread_index(); i < n; i += ctx.block_dimension()) - { - result[i] = first[i]; - } - - ctx.barrier(); - - return result + n; -} - - -} // end namespace block -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/block/exclusive_scan.h b/compat/thrust/system/cuda/detail/block/exclusive_scan.h deleted file mode 100644 index 580a7578bf..0000000000 --- a/compat/thrust/system/cuda/detail/block/exclusive_scan.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace block -{ - - -template -inline __device__ -typename thrust::iterator_value::type - inplace_exclusive_scan(Context &ctx, RandomAccessIterator first, T init, BinaryFunction op) -{ - // perform an inclusive scan, then shift right - block::inplace_inclusive_scan(ctx, first, op); - - typename thrust::iterator_value::type carry = first[ctx.block_dimension() - 1]; - - ctx.barrier(); - - typename thrust::iterator_value::type left = (ctx.thread_index() == 0) ? 
init : first[ctx.thread_index() - 1]; - - ctx.barrier(); - - first[ctx.thread_index()] = left; - - ctx.barrier(); - - return carry; -} - - -template -inline __device__ - typename thrust::iterator_value::type - inplace_exclusive_scan(Context &ctx, Iterator first, T init) -{ - return block::inplace_exclusive_scan(ctx, first, init, thrust::plus::type>()); -} - - -} // end namespace block -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/block/inclusive_scan.h b/compat/thrust/system/cuda/detail/block/inclusive_scan.h deleted file mode 100644 index 012f7cd2f7..0000000000 --- a/compat/thrust/system/cuda/detail/block/inclusive_scan.h +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace block -{ - -template -__device__ __thrust_forceinline__ -void inclusive_scan(Context context, - InputIterator first, - BinaryFunction binary_op) -{ - // TODO generalize to arbitrary n - // TODO support dynamic block_size - const unsigned int block_size = Context::ThreadsPerBlock::value; - - typename thrust::iterator_value::type val = first[context.thread_index()]; - - if(block_size > 1) { if (context.thread_index() >= 1) { val = binary_op(first[context.thread_index() - 1], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } - if(block_size > 2) { if (context.thread_index() >= 2) { val = binary_op(first[context.thread_index() - 2], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } - if(block_size > 4) { if (context.thread_index() >= 4) { val = binary_op(first[context.thread_index() - 4], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } - if(block_size > 8) { if (context.thread_index() >= 8) { val = binary_op(first[context.thread_index() - 8], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } - if(block_size > 16) { if (context.thread_index() >= 16) { val = binary_op(first[context.thread_index() - 16], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } - if(block_size > 32) { if (context.thread_index() >= 32) { val = binary_op(first[context.thread_index() - 32], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } - if(block_size > 64) { if (context.thread_index() >= 64) { val = binary_op(first[context.thread_index() - 64], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } - if(block_size > 128) { if (context.thread_index() >= 128) { val = binary_op(first[context.thread_index() - 128], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } - if(block_size > 256) { if (context.thread_index() >= 256) { val = 
binary_op(first[context.thread_index() - 256], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } - if(block_size > 512) { if (context.thread_index() >= 512) { val = binary_op(first[context.thread_index() - 512], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } - if(block_size > 1024) { if (context.thread_index() >= 1024) { val = binary_op(first[context.thread_index() - 1024], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } -} // end inclusive_scan() - - -template -__device__ __thrust_forceinline__ -void inclusive_scan_n(Context context, - InputIterator first, - Size n, - BinaryFunction binary_op) -{ - // TODO support n > context.block_dimension() - typename thrust::iterator_value::type val = first[context.thread_index()]; - - for (unsigned int i = 1; i < n; i <<= 1) - { - if (context.thread_index() < n && context.thread_index() >= i) - val = binary_op(first[context.thread_index() - i], val); - - context.barrier(); - - first[context.thread_index()] = val; - - context.barrier(); - } -} // end inclusive_scan() - - -template -__device__ __thrust_forceinline__ -void inclusive_scan_by_flag(Context context, - InputIterator1 first1, - InputIterator2 first2, - BinaryFunction binary_op) -{ - // TODO generalize to arbitrary n - // TODO support dynamic block_size - const unsigned int block_size = Context::ThreadsPerBlock::value; - - typename thrust::iterator_value::type flg = first1[context.thread_index()]; - typename thrust::iterator_value::type val = first2[context.thread_index()]; - - if(block_size > 1) { if (context.thread_index() >= 1) { if (!flg) { flg |= first1[context.thread_index() - 1]; val = binary_op(first2[context.thread_index() - 1], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } - if(block_size > 2) { if (context.thread_index() >= 2) { if (!flg) { flg |= first1[context.thread_index() - 2]; val = binary_op(first2[context.thread_index() - 2], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } - if(block_size > 4) { if (context.thread_index() >= 4) { if (!flg) { flg |= first1[context.thread_index() - 4]; val = binary_op(first2[context.thread_index() - 4], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } - if(block_size > 8) { if (context.thread_index() >= 8) { if (!flg) { flg |= first1[context.thread_index() - 8]; val = binary_op(first2[context.thread_index() - 8], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } - if(block_size > 16) { if (context.thread_index() >= 16) { if (!flg) { flg |= first1[context.thread_index() - 16]; val = binary_op(first2[context.thread_index() - 16], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } - if(block_size > 32) { if (context.thread_index() >= 32) { if (!flg) { flg |= first1[context.thread_index() - 32]; val = binary_op(first2[context.thread_index() - 32], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } - if(block_size > 64) { if (context.thread_index() >= 64) { if (!flg) { flg |= first1[context.thread_index() - 64]; val = binary_op(first2[context.thread_index() - 64], val); } } 
context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } - if(block_size > 128) { if (context.thread_index() >= 128) { if (!flg) { flg |= first1[context.thread_index() - 128]; val = binary_op(first2[context.thread_index() - 128], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } - if(block_size > 256) { if (context.thread_index() >= 256) { if (!flg) { flg |= first1[context.thread_index() - 256]; val = binary_op(first2[context.thread_index() - 256], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } - if(block_size > 512) { if (context.thread_index() >= 512) { if (!flg) { flg |= first1[context.thread_index() - 512]; val = binary_op(first2[context.thread_index() - 512], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } - if(block_size > 1024) { if (context.thread_index() >= 1024) { if (!flg) { flg |= first1[context.thread_index() - 1024]; val = binary_op(first2[context.thread_index() - 1024], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } -} // end inclusive_scan_by_flag() - - -template -__device__ __thrust_forceinline__ -void inclusive_scan_by_flag_n(Context context, - InputIterator1 first1, - InputIterator2 first2, - Size n, - BinaryFunction binary_op) -{ - // TODO support n > context.block_dimension() - typename thrust::iterator_value::type flg = first1[context.thread_index()]; - typename thrust::iterator_value::type val = first2[context.thread_index()]; - - for (unsigned int i = 1; i < n; i <<= 1) - { - if (context.thread_index() < n && context.thread_index() >= i) - { - if (!flg) - { - flg |= first1[context.thread_index() - i]; - val = binary_op(first2[context.thread_index() - i], val); - } - } - - context.barrier(); - - first1[context.thread_index()] = flg; - first2[context.thread_index()] = val; - - context.barrier(); - } -} // end inclusive_scan_by_flag() - - -template -__device__ __thrust_forceinline__ -void inplace_inclusive_scan(Context &ctx, RandomAccessIterator first, BinaryFunction op) -{ - typename thrust::iterator_value::type x = first[ctx.thread_index()]; - - for(unsigned int offset = 1; offset < ctx.block_dimension(); offset *= 2) - { - if(ctx.thread_index() >= offset) - { - x = op(first[ctx.thread_index() - offset], x); - } - - ctx.barrier(); - - first[ctx.thread_index()] = x; - - ctx.barrier(); - } -} - - -template -__device__ __thrust_forceinline__ -void inplace_inclusive_scan(Context &ctx, RandomAccessIterator first) -{ - block::inplace_inclusive_scan(ctx, first, thrust::plus::type>()); -} - - -} // end namespace block -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/block/merge.h b/compat/thrust/system/cuda/detail/block/merge.h deleted file mode 100644 index 9af0b7bfbb..0000000000 --- a/compat/thrust/system/cuda/detail/block/merge.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace block -{ - -template -__device__ __thrust_forceinline__ - RandomAccessIterator3 merge(Context context, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - StrictWeakOrdering comp); - -// XXX assumes that context.block_dimension() <= n1 and -// context.block_dimension() <= n2 -// This algorithm is analogous to inplace_merge -// but instead of working on the ranges -// [first, middle) and [middle, last) -// it works on the ranges -// [first, first + n1) and [first + n1, first + n1 + n2) -template -__device__ __thrust_forceinline__ - void inplace_merge_by_key_n(Context context, - RandomAccessIterator1 keys_first, - RandomAccessIterator2 values_first, - Size1 n1, - Size2 n2, - StrictWeakOrdering comp); - -} // end namespace block -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/block/merge.inl b/compat/thrust/system/cuda/detail/block/merge.inl deleted file mode 100644 index 5eae2b58f3..0000000000 --- a/compat/thrust/system/cuda/detail/block/merge.inl +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace block -{ - -template -__device__ __thrust_forceinline__ - RandomAccessIterator3 merge(Context context, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_difference::type difference1; - typedef typename thrust::iterator_difference::type difference2; - - difference1 n1 = last1 - first1; - difference2 n2 = last2 - first2; - - // find the rank of each element in the other array - difference2 rank2 = 0; - if(context.thread_index() < n1) - { - RandomAccessIterator1 x = first1; - x += context.thread_index(); - - // lower_bound ensures that x sorts before any equivalent element of input2 - // this ensures stability - rank2 = thrust::system::detail::generic::scalar::lower_bound(first2, last2, raw_reference_cast(*x), comp) - first2; - } // end if - - difference1 rank1 = 0; - if(context.thread_index() < n2) - { - RandomAccessIterator2 x = first2 + context.thread_index(); - - // upper_bound ensures that x sorts before any equivalent element of input1 - // this ensures stability - rank1 = thrust::system::detail::generic::scalar::upper_bound(first1, last1, raw_reference_cast(*x), comp) - first1; - } // end if - - if(context.thread_index() < n1) - { - // scatter each element from input1 - RandomAccessIterator1 src = first1 + context.thread_index(); - RandomAccessIterator3 dst = result + context.thread_index() + rank2; - - *dst = *src; - } - - if(context.thread_index() < n2) - { - // scatter each element from input2 - RandomAccessIterator2 src = first2 + context.thread_index(); - RandomAccessIterator3 dst = result + context.thread_index() + rank1; - - *dst = *src; - } - - return result + n1 + n2; -} // end merge - - -template -__device__ __thrust_forceinline__ - void inplace_merge_by_key_n(Context context, - RandomAccessIterator1 keys_first, - RandomAccessIterator2 values_first, - Size1 n1, - Size2 n2, - StrictWeakOrdering comp) -{ - RandomAccessIterator1 input1 = keys_first; - RandomAccessIterator1 input2 = keys_first + n1; - - RandomAccessIterator2 input1val = values_first; - RandomAccessIterator2 input2val = values_first + n1; - - typedef typename thrust::iterator_value::type KeyType; - typedef typename thrust::iterator_value::type ValueType; - - // XXX use uninitialized here - KeyType inp1 = input1[context.thread_index()]; ValueType inp1val = input1val[context.thread_index()]; - KeyType inp2 = input2[context.thread_index()]; ValueType inp2val = input2val[context.thread_index()]; - - // to merge input1 and input2, use binary search to find the rank of inp1 & inp2 in arrays input2 & input1, respectively - // as before, the "end" variables point to one element after the last element of the arrays - - // start by looking through input2 for inp1's rank - unsigned int start_1 = 0; - - // don't do the search if our value is beyond the end of input1 - if(context.thread_index() < n1) - { - start_1 = thrust::system::detail::generic::scalar::lower_bound_n(input2, n2, inp1, comp) - input2; - } // end if - - // now look through input1 for inp2's rank - unsigned int start_2 = 0; - - // don't do the search if our value is beyond the end of input2 - if(context.thread_index() < n2) - { - // upper_bound ensures that equivalent elements in the first range sort before the second - start_2 = 
thrust::system::detail::generic::scalar::upper_bound_n(input1, n1, inp2, comp) - input1; - } // end if - - context.barrier(); - - // Write back into the right position to the input arrays; can be done in place since we read in - // the input arrays into registers before. - if(context.thread_index() < n1) - { - input1[start_1 + context.thread_index()] = inp1; - input1val[start_1 + context.thread_index()] = inp1val; - } // end if - - if(context.thread_index() < n2) - { - input1[start_2 + context.thread_index()] = inp2; - input1val[start_2 + context.thread_index()] = inp2val; - } // end if -} // end inplace_merge_by_key_n() - - -} // end namespace block -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/block/merging_sort.h b/compat/thrust/system/cuda/detail/block/merging_sort.h deleted file mode 100644 index 8f8f999ec5..0000000000 --- a/compat/thrust/system/cuda/detail/block/merging_sort.h +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file merging_sort.h - * \brief Block version of merge sort - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace block -{ - - -template -__device__ void conditional_swap(RandomAccessIterator1 keys_first, - RandomAccessIterator2 values_first, - const unsigned int i, - const unsigned int end, - bool pred, - Compare comp) -{ - typedef typename thrust::iterator_traits::value_type KeyType; - typedef typename thrust::iterator_traits::value_type ValueType; - - if(pred && i+1 -__device__ void transposition_sort(Context context, - RandomAccessIterator1 keys_first, - RandomAccessIterator2 values_first, - const unsigned int i, - const unsigned int end, - const unsigned int size, - Compare comp) -{ - const bool is_odd = i&0x1; - - for(unsigned int round=size/2; round>0; --round) - { - // ODDS - conditional_swap(keys_first, values_first, i, end, is_odd, comp); - context.barrier(); - - // EVENS - conditional_swap(keys_first, values_first, i, end, !is_odd, comp); - context.barrier(); - } -} - -template -__device__ void merge(Context context, - RandomAccessIterator1 keys_first, - RandomAccessIterator2 values_first, - const unsigned int i, - const unsigned int n, - unsigned int begin, - unsigned int end, - unsigned int h, - StrictWeakOrdering cmp) -{ - // INVARIANT: Every element i resides within a sequence [begin,end) - // of length h which is already sorted - while( h::value_type KeyType; - typedef typename thrust::iterator_traits::value_type ValueType; - - KeyType key; - ValueType value; - - unsigned int rank = i - begin; - - // prevent out-of-bounds access - if(i < new_end) - { - key = keys_first[i]; - - if(begin==new_begin) // in the left side of merging pair - { - RandomAccessIterator1 result = thrust::system::detail::generic::scalar::lower_bound_n(keys_first+end, new_end-end, 
key, cmp); - rank += (result - (keys_first+end)); - } - else // in the right side of merging pair - { - RandomAccessIterator1 result = thrust::system::detail::generic::scalar::upper_bound_n(keys_first+new_begin, begin-new_begin, key, cmp); - rank += (result - (keys_first+new_begin)); - } - - value = values_first[i]; - } - - context.barrier(); - - if(i < new_end) - { - keys_first[new_begin+rank] = key; - values_first[new_begin+rank] = value; - } - - context.barrier(); - - begin = new_begin; - end = new_end; - } -} - - -/*! Block-wise implementation of merge sort. - * It provides the same external interface as odd_even_sort. - */ -template -__device__ void merging_sort(Context context, - RandomAccessIterator1 keys_first, - RandomAccessIterator2 values_first, - const unsigned int n, - StrictWeakOrdering comp) -{ - // Phase 1: Sort subsequences of length 32 using odd-even - // transposition sort. The code below assumes that h is a - // power of 2. Empirically, 32 delivers best results, - // which is not surprising since that's the warp width. - unsigned int i = context.thread_index(); - unsigned int h = 32; - unsigned int begin=i&(~(h-1)), end=min(n,begin+h); - - transposition_sort(context, keys_first, values_first, i, end, h, comp); - - // Phase 2: Apply merge tree to produce final sorted results - merge(context, keys_first, values_first, i, n, begin, end, h, comp); -} // end merging_sort() - - -} // end namespace block -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/block/odd_even_sort.h b/compat/thrust/system/cuda/detail/block/odd_even_sort.h deleted file mode 100644 index 0fa0ea069a..0000000000 --- a/compat/thrust/system/cuda/detail/block/odd_even_sort.h +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file odd_even_sort.h - * \brief Block versions of Batcher's Odd-Even Merge Sort - */ - -#pragma once - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace block -{ - - -/*! Block-wise implementation of Batcher's Odd-Even Merge Sort - * This implementation is based on Nadathur Satish's. - */ -template - __device__ void odd_even_sort(KeyType *keys, - ValueType *data, - const unsigned int n, - StrictWeakOrdering comp) -{ - for(unsigned int p = blockDim.x>>1; p > 0; p >>= 1) - { - unsigned int q = blockDim.x>>1, r = 0, d = p; - - while(q >= p) - { - unsigned int j = threadIdx.x + d; - - // if j lies beyond the end of the array, we consider it "sorted" wrt i - // regardless of whether i lies beyond the end of the array - if(threadIdx.x < (blockDim.x-d) && (threadIdx.x & p) == r && j < n) - { - KeyType xikey = keys[threadIdx.x]; - KeyType xjkey = keys[j]; - - ValueType xivalue = data[threadIdx.x]; - ValueType xjvalue = data[j]; - - // does xj sort before xi? 
- if(comp(xjkey, xikey)) - { - keys[threadIdx.x] = xjkey; - keys[j] = xikey; - - data[threadIdx.x] = xjvalue; - data[j] = xivalue; - } // end if - } // end if - - d = q - p; - q >>= 1; - r = p; - - __syncthreads(); - } // end while - } // end for p -} // end odd_even_sort() - -template - __device__ void stable_odd_even_sort(KeyType *keys, - ValueType *data, - const unsigned int n, - StrictWeakOrdering comp) -{ - for(unsigned int i = 0; - i < blockDim.x>>1; - ++i) - { - bool thread_is_odd = threadIdx.x & 0x1; - - // do odds first - if(thread_is_odd && threadIdx.x + 1 < n) - { - KeyType xikey = keys[threadIdx.x]; - KeyType xjkey = keys[threadIdx.x + 1]; - - ValueType xivalue = data[threadIdx.x]; - ValueType xjvalue = data[threadIdx.x + 1]; - - // does xj sort before xi? - if(comp(xjkey, xikey)) - { - keys[threadIdx.x] = xjkey; - keys[threadIdx.x + 1] = xikey; - - data[threadIdx.x] = xjvalue; - data[threadIdx.x + 1] = xivalue; - } // end if - } // end if - - __syncthreads(); - - // do evens second - if(!thread_is_odd && threadIdx.x + 1 < n) - { - KeyType xikey = keys[threadIdx.x]; - KeyType xjkey = keys[threadIdx.x + 1]; - - ValueType xivalue = data[threadIdx.x]; - ValueType xjvalue = data[threadIdx.x + 1]; - - // does xj sort before xi? - if(comp(xjkey, xikey)) - { - keys[threadIdx.x] = xjkey; - keys[threadIdx.x + 1] = xikey; - - data[threadIdx.x] = xjvalue; - data[threadIdx.x + 1] = xivalue; - } // end if - } // end if - - __syncthreads(); - } // end for i -} // end stable_odd_even_sort() - - -} // end namespace block -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/block/reduce.h b/compat/thrust/system/cuda/detail/block/reduce.h deleted file mode 100644 index e0a1901b28..0000000000 --- a/compat/thrust/system/cuda/detail/block/reduce.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace block -{ - -/* Reduces [data, data + n) using binary_op and stores the result in data[0] - * - * Upon return the elements in [data + 1, data + n) have unspecified values. 
- */ -template -__device__ __thrust_forceinline__ -void reduce_n(Context context, ValueIterator data, unsigned int n, BinaryFunction binary_op) -{ - if (context.block_dimension() < n) - { - for (unsigned int i = context.block_dimension() + context.thread_index(); i < n; i += context.block_dimension()) - data[context.thread_index()] = binary_op(data[context.thread_index()], data[i]); - - context.barrier(); - } - - while (n > 1) - { - unsigned int half = n / 2; - - if (context.thread_index() < half) - data[context.thread_index()] = binary_op(data[context.thread_index()], data[n - context.thread_index() - 1]); - - context.barrier(); - - n = n - half; - } -} - -} // end namespace block -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/copy.h b/compat/thrust/system/cuda/detail/copy.h deleted file mode 100644 index 8f7ee97c24..0000000000 --- a/compat/thrust/system/cuda/detail/copy.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -template - OutputIterator copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -template - OutputIterator copy(cross_system exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -template - OutputIterator copy_n(execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result); - - -template - OutputIterator copy_n(cross_system exec, - InputIterator first, - Size n, - OutputIterator result); - - -} // end detail -} // end cuda -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/copy.inl b/compat/thrust/system/cuda/detail/copy.inl deleted file mode 100644 index 125eebdaa5..0000000000 --- a/compat/thrust/system/cuda/detail/copy.inl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
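reduce_n above first folds any elements beyond the block size into the leading slots, then repeatedly halves the active range until one value remains. The standalone kernel below sketches the same shared-memory block reduction for a sum; the kernel name, TPB and the choice of addition are assumptions for illustration.

#include <cstdio>
#include <cuda_runtime.h>

#define TPB 128

__global__ void block_reduce_sum(const int *in, int *out, unsigned int n)
{
    __shared__ int sdata[TPB];
    const unsigned int tid = threadIdx.x;

    // Each thread folds a strided slice of the input into its slot.
    int sum = 0;
    for (unsigned int i = tid; i < n; i += blockDim.x)
        sum += in[i];
    sdata[tid] = sum;
    __syncthreads();

    // Tree reduction: halve the active range each step.
    for (unsigned int half = blockDim.x / 2; half > 0; half /= 2)
    {
        if (tid < half)
            sdata[tid] += sdata[tid + half];
        __syncthreads();
    }

    if (tid == 0)
        *out = sdata[0];
}

int main()
{
    const unsigned int n = 1000;
    int h[n], *d_in, *d_out, result;
    for (unsigned int i = 0; i < n; ++i) h[i] = 1;
    cudaMalloc(&d_in, n * sizeof(int));
    cudaMalloc(&d_out, sizeof(int));
    cudaMemcpy(d_in, h, n * sizeof(int), cudaMemcpyHostToDevice);
    block_reduce_sum<<<1, TPB>>>(d_in, d_out, n);
    cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("sum = %d (expected %u)\n", result, n);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}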
- */ - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -template - OutputIterator copy(execution_policy &system, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - return thrust::system::cuda::detail::copy_device_to_device(system,first,last,result); -} // end copy() - - -template - OutputIterator copy(cross_system systems, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - return thrust::system::cuda::detail::copy_cross_system(systems,first,last,result); -} // end copy() - - -template - OutputIterator copy_n(execution_policy &system, - InputIterator first, - Size n, - OutputIterator result) -{ - return thrust::system::cuda::detail::copy_device_to_device(system,first,first+n,result); -} // end copy_n() - - -template - OutputIterator copy_n(cross_system systems, - InputIterator first, - Size n, - OutputIterator result) -{ - return thrust::system::cuda::detail::copy_cross_system_n(systems,first,n,result); -} // end copy_n() - - -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/copy_cross_system.h b/compat/thrust/system/cuda/detail/copy_cross_system.h deleted file mode 100644 index f68ea3c88a..0000000000 --- a/compat/thrust/system/cuda/detail/copy_cross_system.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -template - OutputIterator copy_cross_system(cross_system systems, - InputIterator begin, - InputIterator end, - OutputIterator result); - - -template - OutputIterator copy_cross_system_n(cross_system systems, - InputIterator begin, - Size n, - OutputIterator result); - - -} // end detail -} // end cuda -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/copy_cross_system.inl b/compat/thrust/system/cuda/detail/copy_cross_system.inl deleted file mode 100644 index 861cb2c2cd..0000000000 --- a/compat/thrust/system/cuda/detail/copy_cross_system.inl +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -// XXX WAR circular #inclusion problem -template class temporary_array; - -} // end detail - -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -// general input to random access case -template - RandomAccessIterator copy_cross_system(cross_system systems, - InputIterator begin, - InputIterator end, - RandomAccessIterator result, - thrust::incrementable_traversal_tag, - thrust::random_access_traversal_tag) -{ - //std::cerr << std::endl; - //std::cerr << "general copy_host_to_device(): InputIterator: " << typeid(InputIterator).name() << std::endl; - //std::cerr << "general copy_host_to_device(): OutputIterator: " << typeid(OutputIterator).name() << std::endl; - - typedef typename thrust::iterator_value::type InputType; - - // allocate temporary storage in System1 - thrust::detail::temporary_array temp(systems.system1,begin,end); - return thrust::copy(systems, temp.begin(), temp.end(), result); -} - -template - RandomAccessIterator copy_cross_system_n(cross_system systems, - InputIterator first, - Size n, - RandomAccessIterator result, - thrust::incrementable_traversal_tag, - thrust::random_access_traversal_tag) -{ - typedef typename thrust::iterator_value::type InputType; - - // allocate and copy to temporary storage System1 - thrust::detail::temporary_array temp(systems.system1, first, n); - - // recurse - return copy_cross_system(systems, temp.begin(), temp.end(), result); -} - - -// random access to general output case -template - OutputIterator copy_cross_system(cross_system systems, - RandomAccessIterator begin, - RandomAccessIterator end, - OutputIterator result, - thrust::random_access_traversal_tag, - thrust::incrementable_traversal_tag) -{ - typedef typename thrust::iterator_value::type InputType; - - // copy to temporary storage in System2 - thrust::detail::temporary_array temp(systems.system2, systems.system1, begin, end); - - return thrust::copy(systems.system2, temp.begin(), temp.end(), result); -} - -template - OutputIterator copy_cross_system_n(cross_system systems, - RandomAccessIterator first, - Size n, - OutputIterator result, - thrust::random_access_traversal_tag, - thrust::incrementable_traversal_tag) -{ - typedef typename thrust::iterator_value::type InputType; - - // copy to temporary storage in System2 - thrust::detail::temporary_array temp(systems.system2, systems.system1, first, n); - - // copy temp to result - return thrust::copy(systems.system2, temp.begin(), temp.end(), result); -} - - -// trivial copy -template - RandomAccessIterator2 copy_cross_system(cross_system systems, - RandomAccessIterator1 begin, - RandomAccessIterator1 end, - RandomAccessIterator2 result, - thrust::random_access_traversal_tag, - thrust::random_access_traversal_tag, - thrust::detail::true_type) // trivial copy -{ -// std::cerr << std::endl; -// std::cerr << "random access copy_device_to_host(): trivial" << std::endl; -// std::cerr << "general copy_device_to_host(): RandomAccessIterator1: " << typeid(RandomAccessIterator1).name() << std::endl; -// std::cerr << "general copy_device_to_host(): RandomAccessIterator2: " << typeid(RandomAccessIterator2).name() << std::endl; - - // how many elements to copy? 
- typename thrust::iterator_traits::difference_type n = end - begin; - - thrust::system::cuda::detail::trivial_copy_n(systems, begin, n, result); - - return result + n; -} - - -namespace detail -{ - -// random access non-trivial iterator to random access iterator -template - RandomAccessIterator2 non_trivial_random_access_copy_cross_system(cross_system systems, - RandomAccessIterator1 begin, - RandomAccessIterator1 end, - RandomAccessIterator2 result, - thrust::detail::false_type) // InputIterator is non-trivial -{ - // copy the input to a temporary input system buffer of OutputType - typedef typename thrust::iterator_value::type OutputType; - - // allocate temporary storage in System1 - thrust::detail::temporary_array temp(systems.system1, begin, end); - - // recurse - return copy_cross_system(systems, temp.begin(), temp.end(), result); -} - -template - RandomAccessIterator2 non_trivial_random_access_copy_cross_system(cross_system systems, - RandomAccessIterator1 begin, - RandomAccessIterator1 end, - RandomAccessIterator2 result, - thrust::detail::true_type) // InputIterator is trivial -{ - typename thrust::iterator_difference::type n = thrust::distance(begin, end); - - // allocate temporary storage in System2 - // retain the input's type for the intermediate storage - // do not initialize the storage (the 0 does this) - typedef typename thrust::iterator_value::type InputType; - thrust::detail::temporary_array temp(0, systems.system2, n); - - // force a trivial (memcpy) copy of the input to the temporary - // note that this will not correctly account for copy constructors - // but there's nothing we can do about that - // XXX one thing we might try is to use pinned memory for the temporary storage - // this might allow us to correctly account for copy constructors - thrust::system::cuda::detail::trivial_copy_n(systems, begin, n, temp.begin()); - - // finally, copy to the result - return thrust::copy(systems.system2, temp.begin(), temp.end(), result); -} - -} // end detail - - -// random access iterator to random access host iterator with non-trivial copy -template - RandomAccessIterator2 copy_cross_system(cross_system systems, - RandomAccessIterator1 begin, - RandomAccessIterator1 end, - RandomAccessIterator2 result, - thrust::random_access_traversal_tag, - thrust::random_access_traversal_tag, - thrust::detail::false_type) // is_trivial_copy -{ - // dispatch a non-trivial random access cross system copy based on whether or not the InputIterator is trivial - return detail::non_trivial_random_access_copy_cross_system(systems, begin, end, result, - typename thrust::detail::is_trivial_iterator::type()); -} - -// random access iterator to random access iterator -template - RandomAccessIterator2 copy_cross_system(cross_system systems, - RandomAccessIterator1 begin, - RandomAccessIterator1 end, - RandomAccessIterator2 result, - thrust::random_access_traversal_tag input_traversal, - thrust::random_access_traversal_tag output_traversal) -{ - // dispatch on whether this is a trivial copy - return copy_cross_system(systems, begin, end, result, input_traversal, output_traversal, - typename thrust::detail::dispatch::is_trivial_copy::type()); -} - -template - RandomAccessIterator2 copy_cross_system_n(cross_system systems, - RandomAccessIterator1 first, - Size n, - RandomAccessIterator2 result, - thrust::random_access_traversal_tag input_traversal, - thrust::random_access_traversal_tag output_traversal) -{ - // implement with copy_cross_system - return copy_cross_system(systems, first, first + n, 
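The cross-system paths above stage data through temporary storage and finish with a trivial, memcpy-style transfer whenever the iterators allow it. From the caller's side this machinery is reached simply by mixing host and device iterators in thrust::copy; a minimal sketch of that user-facing behaviour, not the removed dispatch code:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <cstdio>

int main()
{
    thrust::host_vector<float> h(4);
    h[0] = 1.f; h[1] = 2.f; h[2] = 3.f; h[3] = 4.f;

    thrust::device_vector<float> d(4);

    // Host iterators on the left, device iterators on the right: Thrust
    // routes this through its cross-system copy path, which degenerates to
    // a staged cudaMemcpy when the copy is trivial.
    thrust::copy(h.begin(), h.end(), d.begin());

    thrust::host_vector<float> back = d;   // device-to-host copy on construction
    std::printf("%f %f %f %f\n", back[0], back[1], back[2], back[3]);
    return 0;
}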
result, input_traversal, output_traversal); -} - -///////////////// -// Entry Point // -///////////////// - -template - OutputIterator copy_cross_system(cross_system systems, - InputIterator begin, - InputIterator end, - OutputIterator result) -{ - return copy_cross_system(systems, begin, end, result, - typename thrust::iterator_traversal::type(), - typename thrust::iterator_traversal::type()); -} - -template - OutputIterator copy_cross_system_n(cross_system systems, - InputIterator begin, - Size n, - OutputIterator result) -{ - return copy_cross_system_n(systems, begin, n, result, - typename thrust::iterator_traversal::type(), - typename thrust::iterator_traversal::type()); -} - -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/copy_device_to_device.h b/compat/thrust/system/cuda/detail/copy_device_to_device.h deleted file mode 100644 index a7d8df8613..0000000000 --- a/compat/thrust/system/cuda/detail/copy_device_to_device.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file copy_device_to_device.h - * \brief Device implementations for copying on the device. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -template - OutputIterator copy_device_to_device(execution_policy &exec, - InputIterator begin, - InputIterator end, - OutputIterator result); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/copy_device_to_device.inl b/compat/thrust/system/cuda/detail/copy_device_to_device.inl deleted file mode 100644 index c8263c5d55..0000000000 --- a/compat/thrust/system/cuda/detail/copy_device_to_device.inl +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -template - OutputIterator copy_device_to_device(execution_policy &exec, - InputIterator begin, - InputIterator end, - OutputIterator result, - thrust::detail::false_type) -{ - // general case (mixed types) - typedef typename thrust::iterator_traits::value_type InputType; - -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - return thrust::transform(exec, begin, end, result, thrust::identity()); -#else - // we're not compiling with nvcc: copy [begin, end) to temp host memory - typename thrust::iterator_traits::difference_type n = thrust::distance(begin, end); - - thrust::host_system_tag temp_exec; - thrust::detail::temporary_array temp1(temp_exec, begin, end); - - // transform temp1 to OutputType in host memory - typedef typename thrust::iterator_traits::value_type OutputType; - - thrust::detail::temporary_array temp2(temp_exec, temp1.begin(), temp1.end()); - - // copy temp2 to device - result = thrust::system::cuda::detail::copy_cross_system(temp2.begin(), temp2.end(), result); - - return result; -#endif // THRUST_DEVICE_COMPILER_NVCC -} - - -template - OutputIterator copy_device_to_device(execution_policy &exec, - InputIterator begin, - InputIterator end, - OutputIterator result, - thrust::detail::true_type) -{ - // specialization for device to device when the value_types match, operator= is not overloaded, - // and the iterators are pointers - - // how many elements to copy? - typename thrust::iterator_traits::difference_type n = end - begin; - - thrust::system::cuda::detail::trivial_copy_n(exec, begin, n, result); - - return result + n; -} - -} // end namespace detail - -///////////////// -// Entry Point // -///////////////// - -template - OutputIterator copy_device_to_device(execution_policy &exec, - InputIterator begin, - InputIterator end, - OutputIterator result) -{ - typedef typename thrust::iterator_traits::value_type InputType; - typedef typename thrust::iterator_traits::value_type OutputType; - - const bool use_trivial_copy = - thrust::detail::is_same::value - && thrust::detail::is_trivial_iterator::value - && thrust::detail::is_trivial_iterator::value; - - // XXX WAR unused variable warning - (void) use_trivial_copy; - - return detail::copy_device_to_device(exec, begin, end, result, - thrust::detail::integral_constant()); - -} - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/copy_if.h b/compat/thrust/system/cuda/detail/copy_if.h deleted file mode 100644 index 5ed0f6c9c4..0000000000 --- a/compat/thrust/system/cuda/detail/copy_if.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
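copy_device_to_device above picks between a trivial byte-wise copy (matching value types, trivial iterators) and an element-wise transform with an identity functor for everything else. The small standalone example below shows both outcomes through the public Thrust API rather than the removed internals.

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <iostream>

int main()
{
    thrust::device_vector<int> src(4);
    src[0] = 1; src[1] = 2; src[2] = 3; src[3] = 4;

    // Same value_type, contiguous storage: a trivial (memcpy-style) copy.
    thrust::device_vector<int> same(4);
    thrust::copy(src.begin(), src.end(), same.begin());

    // Different value_type: the copy becomes a transform with an identity
    // functor, letting the normal conversion rules produce the output type.
    thrust::device_vector<float> converted(4);
    thrust::transform(src.begin(), src.end(), converted.begin(),
                      thrust::identity<float>());

    std::cout << converted[3] << std::endl;   // prints 4
    return 0;
}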
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template - OutputIterator copy_if(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/copy_if.inl b/compat/thrust/system/cuda/detail/copy_if.inl deleted file mode 100644 index 15ea7faa82..0000000000 --- a/compat/thrust/system/cuda/detail/copy_if.inl +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template -struct copy_if_intervals_closure -{ - InputIterator1 input; - InputIterator2 stencil; - InputIterator3 offsets; - Decomposition decomp; - OutputIterator output; - - typedef Context context_type; - context_type context; - - copy_if_intervals_closure(InputIterator1 input, - InputIterator2 stencil, - InputIterator3 offsets, - Decomposition decomp, - OutputIterator output, - Context context = Context()) - : input(input), stencil(stencil), offsets(offsets), decomp(decomp), output(output), context(context) {} - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename thrust::iterator_value::type OutputType; - - typedef unsigned int PredicateType; - - const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value; - - thrust::plus binary_op; - - __shared__ PredicateType sdata[CTA_SIZE]; context.barrier(); - - typedef typename Decomposition::index_type IndexType; - - // this block processes results in [range.begin(), range.end()) - thrust::system::detail::internal::index_range range = decomp[context.block_index()]; - - IndexType base = range.begin(); - - PredicateType predicate = 0; - - // advance input iterators to this thread's starting position - input += base + context.thread_index(); - stencil += base + context.thread_index(); - - // advance output to this interval's starting position - if (context.block_index() != 0) - { - InputIterator3 temp = offsets + (context.block_index() - 1); - output += *temp; - } - - // process full blocks - while(base + CTA_SIZE <= range.end()) - { - // read data - sdata[context.thread_index()] = predicate = *stencil; - - context.barrier(); - - // scan block - block::inclusive_scan(context, sdata, binary_op); - - // write data - if (predicate) - { - OutputIterator temp2 = output + (sdata[context.thread_index()] - 1); - *temp2 = *input; - } - - // advance inputs by CTA_SIZE - base += CTA_SIZE; - input += CTA_SIZE; - stencil += CTA_SIZE; - - // advance output by number of true 
predicates - output += sdata[CTA_SIZE - 1]; - - context.barrier(); - } - - // process partially full block at end of input (if necessary) - if (base < range.end()) - { - // read data - if (base + context.thread_index() < range.end()) - sdata[context.thread_index()] = predicate = *stencil; - else - sdata[context.thread_index()] = predicate = 0; - - context.barrier(); - - // scan block - block::inclusive_scan(context, sdata, binary_op); - - // write data - if (predicate) // expects predicate=false for >= interval_end - { - OutputIterator temp2 = output + (sdata[context.thread_index()] - 1); - *temp2 = *input; - } - } - } -}; // copy_if_intervals_closure - - -template - OutputIterator copy_if(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator output, - Predicate pred) -{ - typedef typename thrust::iterator_difference::type IndexType; - typedef typename thrust::iterator_value::type OutputType; - - if (first == last) - return output; - - typedef thrust::system::detail::internal::uniform_decomposition Decomposition; - typedef thrust::detail::temporary_array IndexArray; - - Decomposition decomp = default_decomposition(last - first); - - // storage for per-block predicate counts - IndexArray block_results(exec, decomp.size()); - - // convert stencil into an iterator that produces integral values in {0,1} - typedef typename thrust::detail::predicate_to_integral PredicateToIndexTransform; - typedef thrust::transform_iterator PredicateToIndexIterator; - - PredicateToIndexIterator predicate_stencil(stencil, PredicateToIndexTransform(pred)); - - // compute number of true values in each interval - thrust::system::cuda::detail::reduce_intervals(exec, predicate_stencil, block_results.begin(), thrust::plus(), decomp); - - // scan the partial sums - thrust::inclusive_scan(exec, block_results.begin(), block_results.end(), block_results.begin(), thrust::plus()); - - // copy values to output - const unsigned int ThreadsPerBlock = 256; - typedef typename IndexArray::iterator InputIterator3; - typedef detail::statically_blocked_thread_array Context; - typedef copy_if_intervals_closure Closure; - Closure closure(first, predicate_stencil, block_results.begin(), decomp, output); - detail::launch_closure(closure, decomp.size(), ThreadsPerBlock); - - return output + block_results[decomp.size() - 1]; -} // end copy_if() - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END - diff --git a/compat/thrust/system/cuda/detail/count.h b/compat/thrust/system/cuda/detail/count.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/count.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
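The copy_if implementation above is a three-step stream compaction: count the selected elements per block, scan those counts to get output offsets, then scatter the selected values. The user-facing call that drives it looks like the sketch below; the predicate and data are made up for illustration.

#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <iostream>

struct is_positive
{
    __host__ __device__ bool operator()(int x) const { return x > 0; }
};

int main()
{
    thrust::device_vector<int> values(6);
    thrust::device_vector<int> stencil(6);
    int v[6] = { 10, 20, 30, 40, 50, 60 };
    int s[6] = {  1,  0,  1,  0,  0,  1 };
    thrust::copy(v, v + 6, values.begin());
    thrust::copy(s, s + 6, stencil.begin());

    thrust::device_vector<int> out(6);
    // Keep values whose stencil entry satisfies the predicate.
    auto end = thrust::copy_if(values.begin(), values.end(),
                               stencil.begin(), out.begin(), is_positive());

    std::cout << (end - out.begin()) << " values kept" << std::endl;   // 3
    return 0;
}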
- */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/cuda_launch_config.h b/compat/thrust/system/cuda/detail/cuda_launch_config.h deleted file mode 100644 index b7f0ca2409..0000000000 --- a/compat/thrust/system/cuda/detail/cuda_launch_config.h +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -// XXX define our own device_properties_t to avoid errors when #including -// this file in the absence of a CUDA installation -struct device_properties_t -{ - // mirror the type and spelling of cudaDeviceProp's members - // keep these alphabetized - int major; - int maxGridSize[3]; - int maxThreadsPerBlock; - int maxThreadsPerMultiProcessor; - int minor; - int multiProcessorCount; - int regsPerBlock; - size_t sharedMemPerBlock; - int warpSize; -}; - - -// XXX define our own device_properties_t to avoid errors when #including -// this file in the absence of a CUDA installation -struct function_attributes_t -{ - // mirror the type and spelling of cudaFuncAttributes' members - // keep these alphabetized - size_t constSizeBytes; - size_t localSizeBytes; - int maxThreadsPerBlock; - int numRegs; - size_t sharedSizeBytes; -}; - - -/*! Computes a block size in number of threads for a CUDA kernel using a occupancy-promoting heuristic. - * \param attributes The cudaFuncAttributes corresponding to a __global__ function of interest on a GPU of interest. - * \param properties The cudaDeviceProp corresponding to a GPU on which to launch the __global__ function of interest. - * \return A CUDA block size, in number of threads, which the resources of the GPU's streaming multiprocessor can - * accomodate and which is intended to promote occupancy. The result is equivalent to the one performed by - * the "CUDA Occupancy Calculator". - * \note The __global__ function of interest is presumed to use 0 bytes of dynamically-allocated __shared__ memory. - */ -inline __host__ __device__ -std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes, - const device_properties_t &properties); - -/*! Computes a block size in number of threads for a CUDA kernel using a occupancy-promoting heuristic. - * Use this version of the function when a CUDA block's dynamically-allocated __shared__ memory requirements - * vary with the size of the block. - * \param attributes The cudaFuncAttributes corresponding to a __global__ function of interest on a GPU of interest. - * \param properties The cudaDeviceProp corresponding to a GPU on which to launch the __global__ function of interest. - * \param block_size_to_dynamic_smem_bytes A unary function which maps an integer CUDA block size to the number of bytes - * of dynamically-allocated __shared__ memory required by a CUDA block of that size. 
- * \return A CUDA block size, in number of threads, which the resources of the GPU's streaming multiprocessor can - * accomodate and which is intended to promote occupancy. The result is equivalent to the one performed by - * the "CUDA Occupancy Calculator". - */ -template -inline __host__ __device__ -std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes, - const device_properties_t &properties, - UnaryFunction block_size_to_dynamic_smem_size); - - -/*! Returns the maximum amount of dynamic shared memory each block - * can utilize without reducing thread occupancy. - * - * \param properties CUDA device properties - * \param attributes CUDA function attributes - * \param blocks_per_processor Number of blocks per streaming multiprocessor - */ -inline __host__ __device__ -size_t proportional_smem_allocation(const device_properties_t &properties, - const function_attributes_t &attributes, - size_t blocks_per_processor); - - -template -inline __host__ __device__ -size_t max_blocksize_subject_to_smem_usage(const device_properties_t &properties, - const function_attributes_t &attributes, - UnaryFunction blocksize_to_dynamic_smem_usage); - - - -namespace cuda_launch_config_detail -{ - -using std::size_t; - -namespace util -{ - - -template -inline __host__ __device__ -T min_(const T &lhs, const T &rhs) -{ - return rhs < lhs ? rhs : lhs; -} - - -template -struct zero_function -{ - inline __host__ __device__ - T operator()(T) - { - return 0; - } -}; - - -// x/y rounding towards +infinity for integers, used to determine # of blocks/warps etc. -template - inline __host__ __device__ L divide_ri(const L x, const R y) -{ - return (x + (y - 1)) / y; -} - -// x/y rounding towards zero for integers, used to determine # of blocks/warps etc. -template - inline __host__ __device__ L divide_rz(const L x, const R y) -{ - return x / y; -} - -// round x towards infinity to the next multiple of y -template - inline __host__ __device__ L round_i(const L x, const R y){ return y * divide_ri(x, y); } - -// round x towards zero to the next multiple of y -template - inline __host__ __device__ L round_z(const L x, const R y){ return y * divide_rz(x, y); } - -} // end namespace util - - - -// granularity of shared memory allocation -inline __host__ __device__ -size_t smem_allocation_unit(const device_properties_t &properties) -{ - switch(properties.major) - { - case 1: return 512; - case 2: return 128; - case 3: return 256; - default: return 256; // unknown GPU; have to guess - } -} - - -// granularity of register allocation -inline __host__ __device__ -size_t reg_allocation_unit(const device_properties_t &properties, const size_t regsPerThread) -{ - switch(properties.major) - { - case 1: return (properties.minor <= 1) ? 256 : 512; - case 2: switch(regsPerThread) - { - case 21: - case 22: - case 29: - case 30: - case 37: - case 38: - case 45: - case 46: - return 128; - default: - return 64; - } - case 3: return 256; - default: return 256; // unknown GPU; have to guess - } -} - - -// granularity of warp allocation -inline __host__ __device__ -size_t warp_allocation_multiple(const device_properties_t &properties) -{ - return (properties.major <= 1) ? 
2 : 1; -} - -// number of "sides" into which the multiprocessor is partitioned -inline __host__ __device__ -size_t num_sides_per_multiprocessor(const device_properties_t &properties) -{ - switch(properties.major) - { - case 1: return 1; - case 2: return 2; - case 3: return 4; - default: return 4; // unknown GPU; have to guess - } -} - - -inline __host__ __device__ -size_t max_blocks_per_multiprocessor(const device_properties_t &properties) -{ - return (properties.major <= 2) ? 8 : 16; -} - - -inline __host__ __device__ -size_t max_active_blocks_per_multiprocessor(const device_properties_t &properties, - const function_attributes_t &attributes, - int CTA_SIZE, - size_t dynamic_smem_bytes) -{ - // Determine the maximum number of CTAs that can be run simultaneously per SM - // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet - - ////////////////////////////////////////// - // Limits due to threads/SM or blocks/SM - ////////////////////////////////////////// - const size_t maxThreadsPerSM = properties.maxThreadsPerMultiProcessor; // 768, 1024, 1536, etc. - const size_t maxBlocksPerSM = max_blocks_per_multiprocessor(properties); - - // Calc limits - const size_t ctaLimitThreads = (CTA_SIZE <= properties.maxThreadsPerBlock) ? maxThreadsPerSM / CTA_SIZE : 0; - const size_t ctaLimitBlocks = maxBlocksPerSM; - - ////////////////////////////////////////// - // Limits due to shared memory/SM - ////////////////////////////////////////// - const size_t smemAllocationUnit = smem_allocation_unit(properties); - const size_t smemBytes = attributes.sharedSizeBytes + dynamic_smem_bytes; - const size_t smemPerCTA = util::round_i(smemBytes, smemAllocationUnit); - - // Calc limit - const size_t ctaLimitSMem = smemPerCTA > 0 ? properties.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM; - - ////////////////////////////////////////// - // Limits due to registers/SM - ////////////////////////////////////////// - const size_t regAllocationUnit = reg_allocation_unit(properties, attributes.numRegs); - const size_t warpAllocationMultiple = warp_allocation_multiple(properties); - const size_t numWarps = util::round_i(util::divide_ri(CTA_SIZE, properties.warpSize), warpAllocationMultiple); - - // Calc limit - size_t ctaLimitRegs; - if(properties.major <= 1) - { - // GPUs of compute capability 1.x allocate registers to CTAs - // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit - const size_t regsPerCTA = util::round_i(attributes.numRegs * properties.warpSize * numWarps, regAllocationUnit); - ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerBlock / regsPerCTA : maxBlocksPerSM; - } - else - { - // GPUs of compute capability 2.x and higher allocate registers to warps - // Number of regs per warp is regs per thread times times warp size, rounded up to allocation unit - const size_t regsPerWarp = util::round_i(attributes.numRegs * properties.warpSize, regAllocationUnit); - const size_t numSides = num_sides_per_multiprocessor(properties); - const size_t numRegsPerSide = properties.regsPerBlock / numSides; - ctaLimitRegs = regsPerWarp > 0 ? 
((numRegsPerSide / regsPerWarp) * numSides) / numWarps : maxBlocksPerSM; - } - - ////////////////////////////////////////// - // Overall limit is min() of limits due to above reasons - ////////////////////////////////////////// - return util::min_(ctaLimitRegs, util::min_(ctaLimitSMem, util::min_(ctaLimitThreads, ctaLimitBlocks))); -} - - -} // end namespace cuda_launch_config_detail - - -template -inline __host__ __device__ -std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes, - const device_properties_t &properties, - UnaryFunction block_size_to_dynamic_smem_size) -{ - size_t max_occupancy = properties.maxThreadsPerMultiProcessor; - size_t largest_blocksize = cuda_launch_config_detail::util::min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock); - size_t granularity = properties.warpSize; - size_t max_blocksize = 0; - size_t highest_occupancy = 0; - - for(size_t blocksize = largest_blocksize; blocksize != 0; blocksize -= granularity) - { - size_t occupancy = blocksize * cuda_launch_config_detail::max_active_blocks_per_multiprocessor(properties, attributes, blocksize, block_size_to_dynamic_smem_size(blocksize)); - - if(occupancy > highest_occupancy) - { - max_blocksize = blocksize; - highest_occupancy = occupancy; - } - - // early out, can't do better - if(highest_occupancy == max_occupancy) - break; - } - - return max_blocksize; -} - - -inline __host__ __device__ -std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes, - const device_properties_t &properties) -{ - return block_size_with_maximum_potential_occupancy(attributes, properties, cuda_launch_config_detail::util::zero_function()); -} - - -inline __host__ __device__ -size_t proportional_smem_allocation(const device_properties_t &properties, - const function_attributes_t &attributes, - size_t blocks_per_processor) -{ - size_t smem_per_processor = properties.sharedMemPerBlock; - size_t smem_allocation_unit = cuda_launch_config_detail::smem_allocation_unit(properties); - - size_t total_smem_per_block = cuda_launch_config_detail::util::round_z(smem_per_processor / blocks_per_processor, smem_allocation_unit); - size_t static_smem_per_block = attributes.sharedSizeBytes; - - return total_smem_per_block - static_smem_per_block; -} - - -template -inline __host__ __device__ -size_t max_blocksize_subject_to_smem_usage(const device_properties_t &properties, - const function_attributes_t &attributes, - UnaryFunction blocksize_to_dynamic_smem_usage) -{ - size_t largest_blocksize = (thrust::min)(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock); - size_t granularity = properties.warpSize; - - for(int blocksize = largest_blocksize; blocksize > 0; blocksize -= granularity) - { - size_t total_smem_usage = blocksize_to_dynamic_smem_usage(blocksize) + attributes.sharedSizeBytes; - - if(total_smem_usage <= properties.sharedMemPerBlock) - { - return blocksize; - } - } - - return 0; -} - - -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/default_decomposition.h b/compat/thrust/system/cuda/detail/default_decomposition.h deleted file mode 100644 index 1ed6bcfe20..0000000000 --- a/compat/thrust/system/cuda/detail/default_decomposition.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
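The occupancy heuristic above reproduces the CUDA Occupancy Calculator by intersecting the per-multiprocessor thread, block, shared-memory and register limits, then picking the block size with the highest product of block size and resident blocks. Current CUDA runtimes expose the same computation directly; the standalone example below, with a hypothetical saxpy kernel, shows the equivalent queries.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void saxpy(float a, const float *x, float *y, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}

int main()
{
    int min_grid_size = 0, block_size = 0;

    // Block size that maximizes occupancy for this kernel, assuming no
    // dynamically allocated shared memory (the trailing 0 arguments).
    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, saxpy, 0, 0);

    int blocks_per_sm = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, saxpy,
                                                  block_size, 0);

    std::printf("block size %d, %d resident blocks per SM\n",
                block_size, blocks_per_sm);
    return 0;
}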
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file default_decomposition.h - * \brief Return a decomposition that is appropriate for the CUDA backend. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template -thrust::system::detail::internal::uniform_decomposition default_decomposition(IndexType n); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/default_decomposition.inl b/compat/thrust/system/cuda/detail/default_decomposition.inl deleted file mode 100644 index 3f0879ac93..0000000000 --- a/compat/thrust/system/cuda/detail/default_decomposition.inl +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template -thrust::system::detail::internal::uniform_decomposition default_decomposition(IndexType n) -{ - // TODO eliminate magical constant - device_properties_t properties = device_properties(); - return thrust::system::detail::internal::uniform_decomposition(n, properties.maxThreadsPerBlock, 10 * properties.multiProcessorCount); -} - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/alignment.h b/compat/thrust/system/cuda/detail/detail/alignment.h deleted file mode 100644 index 31fdaaf422..0000000000 --- a/compat/thrust/system/cuda/detail/detail/alignment.h +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
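default_decomposition above splits n elements into uniform intervals sized by the device's block limit, with the interval count tied to the multiprocessor count. A rough host-side sketch of such a uniform decomposition follows; the names and the clamping policy are assumptions, not the removed implementation.

#include <cstdio>
#include <cstddef>

struct interval { std::size_t begin, end; };

// i-th interval of a uniform decomposition of n elements: interval sizes are
// a multiple of `granularity`, and there are at most `max_intervals` of them.
interval uniform_interval(std::size_t n, std::size_t granularity,
                          std::size_t max_intervals, std::size_t i)
{
    std::size_t granules     = (n + granularity - 1) / granularity;        // round up
    std::size_t per_interval = (granules + max_intervals - 1) / max_intervals;
    std::size_t chunk        = per_interval * granularity;

    interval r;
    r.begin = (i * chunk < n) ? i * chunk : n;
    r.end   = (r.begin + chunk < n) ? r.begin + chunk : n;
    return r;
}

int main()
{
    // 1000 elements, block size 256, at most 3 intervals:
    // prints [0,512), [512,1000), [1000,1000).
    for (std::size_t i = 0; i < 3; ++i)
    {
        interval r = uniform_interval(1000, 256, 3, i);
        std::printf("interval %zu: [%zu, %zu)\n", i, r.begin, r.end);
    }
    return 0;
}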
- */ - -#pragma once - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ -namespace alignment_of_detail -{ - - -template class alignment_of_impl; - -template - struct helper -{ - static const std::size_t value = size_diff; -}; - -template - class helper -{ - public: - static const std::size_t value = alignment_of_impl::value; -}; - -template - class alignment_of_impl -{ - private: - struct big { T x; char c; }; - - public: - static const std::size_t value = helper::value; -}; - - -} // end alignment_of_detail - - -template - struct alignment_of - : alignment_of_detail::alignment_of_impl -{}; - - -template struct aligned_type; - -// __align__ is CUDA-specific, so guard it -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - -// implementing aligned_type portably is tricky: - -# if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC -// implement aligned_type with specialization because MSVC -// requires literals as arguments to declspec(align(n)) -template<> struct aligned_type<1> -{ - struct __align__(1) type { }; -}; - -template<> struct aligned_type<2> -{ - struct __align__(2) type { }; -}; - -template<> struct aligned_type<4> -{ - struct __align__(4) type { }; -}; - -template<> struct aligned_type<8> -{ - struct __align__(8) type { }; -}; - -template<> struct aligned_type<16> -{ - struct __align__(16) type { }; -}; - -template<> struct aligned_type<32> -{ - struct __align__(32) type { }; -}; - -template<> struct aligned_type<64> -{ - struct __align__(64) type { }; -}; - -template<> struct aligned_type<128> -{ - struct __align__(128) type { }; -}; - -template<> struct aligned_type<256> -{ - struct __align__(256) type { }; -}; - -template<> struct aligned_type<512> -{ - struct __align__(512) type { }; -}; - -template<> struct aligned_type<1024> -{ - struct __align__(1024) type { }; -}; - -template<> struct aligned_type<2048> -{ - struct __align__(2048) type { }; -}; - -template<> struct aligned_type<4096> -{ - struct __align__(4096) type { }; -}; - -template<> struct aligned_type<8192> -{ - struct __align__(8192) type { }; -}; -# elif (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40300) -// implement aligned_type with specialization because gcc 4.2 -// requires literals as arguments to __attribute__(aligned(n)) -template<> struct aligned_type<1> -{ - struct __align__(1) type { }; -}; - -template<> struct aligned_type<2> -{ - struct __align__(2) type { }; -}; - -template<> struct aligned_type<4> -{ - struct __align__(4) type { }; -}; - -template<> struct aligned_type<8> -{ - struct __align__(8) type { }; -}; - -template<> struct aligned_type<16> -{ - struct __align__(16) type { }; -}; - -template<> struct aligned_type<32> -{ - struct __align__(32) type { }; -}; - -template<> struct aligned_type<64> -{ - struct __align__(64) type { }; -}; - -template<> struct aligned_type<128> -{ - struct __align__(128) type { }; -}; - -# else -// assume the compiler allows template parameters as -// arguments to __align__ -template struct aligned_type -{ - struct __align__(Align) type { }; -}; -# endif // THRUST_HOST_COMPILER -#else -template struct aligned_type -{ - struct type { }; -}; -#endif // THRUST_DEVICE_COMPILER - - -template - struct aligned_storage -{ - union type - { - unsigned char data[Len]; - - typename aligned_type::type align; - }; -}; - - -} // end detail -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/detail/b40c/kernel_utils.h 
b/compat/thrust/system/cuda/detail/detail/b40c/kernel_utils.h deleted file mode 100644 index e2c5a44941..0000000000 --- a/compat/thrust/system/cuda/detail/detail/b40c/kernel_utils.h +++ /dev/null @@ -1,284 +0,0 @@ -/** - * Copyright 2010 Duane Merrill - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * For more information, see our Google Code project site: - * http://code.google.com/p/back40computing/ - * - * Thanks! - */ - - -//------------------------------------------------------------------------------ -// Common B40C Defines, Properties, and Routines -//------------------------------------------------------------------------------ - - -#pragma once - -#include -#include - -namespace thrust { -namespace system { -namespace cuda { -namespace detail { -namespace detail { -namespace b40c_thrust { - -//------------------------------------------------------------------------------ -// Device properties -//------------------------------------------------------------------------------ - - -#ifndef __CUDA_ARCH__ - #define __CUDA_ARCH__ 0 -#endif - -#define B40C_FERMI(version) (version >= 200) -#define B40C_LOG_WARP_THREADS 5 // 32 threads in a warp -#define B40C_WARP_THREADS (1 << B40C_LOG_WARP_THREADS) -#define B40C_LOG_MEM_BANKS(version) ((version >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla -#define B40C_MEM_BANKS(version) (1 << B40C_LOG_MEM_BANKS(version)) - -// TODO refactor these -#if __CUDA_ARCH__ >= 200 - #define FastMul(a, b) (a * b) -#else - #define FastMul(a, b) (__umul24(a, b)) -#endif - -#if __CUDA_ARCH__ >= 120 - #define WarpVoteAll(active_threads, predicate) (__all(predicate)) -#else - #define WarpVoteAll(active_threads, predicate) (EmulatedWarpVoteAll(predicate)) -#endif - -#if __CUDA_ARCH__ >= 200 - #define TallyWarpVote(active_threads, predicate, storage) (__popc(__ballot(predicate))) -#else - #define TallyWarpVote(active_threads, predicate, storage) (TallyWarpVoteSm10(predicate, storage)) -#endif - -#ifdef __LP64__ - #define _B40C_LP64_ true -#else - #define _B40C_LP64_ false -#endif - -#define _B40C_REG_MISER_QUALIFIER_ __shared__ - - -//------------------------------------------------------------------------------ -// Handy routines -//------------------------------------------------------------------------------ - - -/** - * Select maximum - */ -#define B40C_MAX(a, b) ((a > b) ? a : b) - - -/** - * MagnitudeShift(). Allows you to shift left for positive magnitude values, - * right for negative. - * - * N.B. This code is a little strange; we are using this meta-programming - * pattern of partial template specialization for structures in order to - * decide whether to shift left or right. Normally we would just use a - * conditional to decide if something was negative or not and then shift - * accordingly, knowing that the compiler will elide the untaken branch, - * i.e., the out-of-bounds shift during dead code elimination. 
However, - * the pass for bounds-checking shifts seems to happen before the DCE - * phase, which results in a an unsightly number of compiler warnings, so - * we force the issue earlier using structural template specialization. - */ - -template struct MagnitudeShiftOp; - -template -struct MagnitudeShiftOp { - __device__ __forceinline__ static K Shift(K key) { - return key << magnitude; - } -}; - -template -struct MagnitudeShiftOp { - __device__ __forceinline__ static K Shift(K key) { - return key >> magnitude; - } -}; - -template -__device__ __forceinline__ K MagnitudeShift(K key) { - return MagnitudeShiftOp 0) ? magnitude : magnitude * -1, (magnitude > 0)>::Shift(key); -} - - -/** - * Supress warnings for unused constants - */ -template -__device__ __forceinline__ void SuppressUnusedConstantWarning(const T) {} - - - - -//------------------------------------------------------------------------------ -// Common device routines -//------------------------------------------------------------------------------ - - -/** - * Perform a warp-synchrounous prefix scan. Allows for diverting a warp's - * threads into separate scan problems (multi-scan). - */ -template -__device__ __forceinline__ int WarpScan( - volatile int warpscan[][NUM_ELEMENTS], - int partial_reduction, - int copy_section) { - - int warpscan_idx; - if (MULTI_SCAN) { - warpscan_idx = threadIdx.x & (NUM_ELEMENTS - 1); - } else { - warpscan_idx = threadIdx.x; - } - - warpscan[1][warpscan_idx] = partial_reduction; - - if (NUM_ELEMENTS > 1) warpscan[1][warpscan_idx] = partial_reduction = - partial_reduction + warpscan[1][warpscan_idx - 1]; - if (NUM_ELEMENTS > 2) warpscan[1][warpscan_idx] = partial_reduction = - partial_reduction + warpscan[1][warpscan_idx - 2]; - if (NUM_ELEMENTS > 4) warpscan[1][warpscan_idx] = partial_reduction = - partial_reduction + warpscan[1][warpscan_idx - 4]; - if (NUM_ELEMENTS > 8) warpscan[1][warpscan_idx] = partial_reduction = - partial_reduction + warpscan[1][warpscan_idx - 8]; - if (NUM_ELEMENTS > 16) warpscan[1][warpscan_idx] = partial_reduction = - partial_reduction + warpscan[1][warpscan_idx - 16]; - - if (copy_section > 0) { - warpscan[1 + copy_section][warpscan_idx] = partial_reduction; - } - - return warpscan[1][warpscan_idx - 1]; -} - -/** - * Perform a warp-synchronous reduction - */ -template -__device__ __forceinline__ void WarpReduce( - int idx, - volatile int *storage, - int partial_reduction) -{ - storage[idx] = partial_reduction; - - if (NUM_ELEMENTS > 16) storage[idx] = partial_reduction = partial_reduction + storage[idx + 16]; - if (NUM_ELEMENTS > 8) storage[idx] = partial_reduction = partial_reduction + storage[idx + 8]; - if (NUM_ELEMENTS > 4) storage[idx] = partial_reduction = partial_reduction + storage[idx + 4]; - if (NUM_ELEMENTS > 2) storage[idx] = partial_reduction = partial_reduction + storage[idx + 2]; - if (NUM_ELEMENTS > 1) storage[idx] = partial_reduction = partial_reduction + storage[idx + 1]; -} - - -/** - * Tally a warp-vote regarding the given predicate using the supplied storage - */ -template -__device__ __forceinline__ int TallyWarpVoteSm10(int predicate, int storage[]) { - WarpReduce(threadIdx.x, storage, predicate); - return storage[0]; -} - - -/** - * Tally a warp-vote regarding the given predicate - */ -template -__device__ __forceinline__ int TallyWarpVoteSm10(int predicate) { - __shared__ int vote_reduction[B40C_WARP_THREADS]; - return TallyWarpVoteSm10(predicate, vote_reduction); -} - -/** - * Emulate the __all() warp vote instruction - */ -template -__device__ 
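WarpScan above computes an inclusive prefix sum with warp-synchronous shared-memory steps of stride 1, 2, 4, 8, 16. On current architectures the same Kogge-Stone scan is normally written with shuffle intrinsics; a standalone sketch with a hypothetical kernel over a single warp of 32 threads:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_inclusive_scan(int *data)
{
    const unsigned int lane = threadIdx.x & 31;
    int value = data[threadIdx.x];

    // Kogge-Stone scan within the warp: each step adds the value held
    // `offset` lanes to the left, doubling the offset every step.
    for (int offset = 1; offset < 32; offset <<= 1)
    {
        int other = __shfl_up_sync(0xffffffffu, value, offset);
        if (lane >= offset)
            value += other;
    }

    data[threadIdx.x] = value;
}

int main()
{
    int h[32], *d;
    for (int i = 0; i < 32; ++i) h[i] = 1;
    cudaMalloc(&d, sizeof(h));
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
    warp_inclusive_scan<<<1, 32>>>(d);
    cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
    std::printf("last prefix sum = %d (expected 32)\n", h[31]);
    cudaFree(d);
    return 0;
}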
__forceinline__ int EmulatedWarpVoteAll(int predicate) { - return (TallyWarpVoteSm10(predicate) == ACTIVE_THREADS); -} - - -/** - * Have each thread concurrently perform a serial reduction over its specified segment - */ -template -__device__ __forceinline__ int -SerialReduce(int segment[]) { - - int reduce = segment[0]; - - #pragma unroll - for (int i = 1; i < (int) LENGTH; i++) { - reduce += segment[i]; - } - - return reduce; -} - - -/** - * Have each thread concurrently perform a serial scan over its specified segment - */ -template -__device__ __forceinline__ -void SerialScan(int segment[], int seed0) { - - int seed1; - - #pragma unroll - for (int i = 0; i < (int) LENGTH; i += 2) { - seed1 = segment[i] + seed0; - segment[i] = seed0; - seed0 = seed1 + segment[i + 1]; - segment[i + 1] = seed1; - } -} - - - - -//------------------------------------------------------------------------------ -// Empty Kernels -//------------------------------------------------------------------------------ - -template -__global__ void FlushKernel(void) -{ -} - - -} // end namespace b40c_thrust -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_api.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_api.h deleted file mode 100644 index 2b199bb08a..0000000000 --- a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_api.h +++ /dev/null @@ -1,807 +0,0 @@ -/****************************************************************************** - * Copyright 2010 Duane Merrill - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * - * - * - * AUTHORS' REQUEST: - * - * If you use|reference|benchmark this code, please cite our Technical - * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): - * - * @TechReport{ Merrill:Sorting:2010, - * author = "Duane Merrill and Andrew Grimshaw", - * title = "Revisiting Sorting for GPGPU Stream Architectures", - * year = "2010", - * institution = "University of Virginia, Department of Computer Science", - * address = "Charlottesville, VA, USA", - * number = "CS2010-03" - * } - * - * For more information, see our Google Code project site: - * http://code.google.com/p/back40computing/ - * - * Thanks! - ******************************************************************************/ - - - -/****************************************************************************** - * Radix Sorting API - * - * USAGE: - * - * Using the B40C radix sorting implementation is easy. Just #include this API - * file and its kernel include dependencies within your source. 
Below are two - * examples for using: - * - * (1) A keys-only example for sorting floats: - * - * // Create storage-management structure - * RadixSortStorage device_storage(d_float_keys); - * - * // Create and enact sorter - * RadixSortingEnactor sorter(d_float_keys_len); - * sorter.EnactSort(device_storage); - * - * // Re-acquire pointer to sorted keys, free unused/temp storage - * d_float_keys = device_storage.d_keys; - * device_storage.CleanupTempStorage(); - * - * (2) And a key-value example for sorting ints paired with doubles: - * - * // Create storage-management structure - * RadixSortStorage device_storage(d_int_keys, d_double_values); - * - * // Create and enact sorter - * RadixSortingEnactor sorter(d_int_keys_len); - * sorter.EnactSort(device_storage); - * - * // Re-acquire pointer to sorted keys and values, free unused/temp storage - * d_int_keys = device_storage.d_keys; - * d_double_values = device_storage.d_values; - * device_storage.CleanupTempStorage(); - * - * - ******************************************************************************/ - -#pragma once - -#include -#include -#include -#include -#include - -#include "radixsort_reduction_kernel.h" -#include "radixsort_spine_kernel.h" -#include "radixsort_scanscatter_kernel.h" - -#include - -namespace thrust { -namespace system { -namespace cuda { -namespace detail { -namespace detail { -namespace b40c_thrust { - - -/****************************************************************************** - * Debugging options - ******************************************************************************/ - -static bool RADIXSORT_DEBUG = false; - - - -/****************************************************************************** - * Structures for mananging device-side sorting state - ******************************************************************************/ - -/** - * Sorting storage-management structure for device vectors - */ -template -struct RadixSortStorage { - - // Device vector of keys to sort - K* d_keys; - - // Device vector of values to sort - V* d_values; - - // Ancillary device vector for key storage - K* d_alt_keys; - - // Ancillary device vector for value storage - V* d_alt_values; - - // Temporary device storage needed for radix sorting histograms - int *d_spine; - - // Flip-flopping temporary device storage denoting which digit place - // pass should read from which input source (i.e., false if reading from - // keys, true if reading from alternate_keys - bool *d_from_alt_storage; - - // Host-side boolean whether or not an odd number of sorting passes left the - // results in alternate storage. If so, the d_keys (and d_values) pointers - // will have been swapped with the d_alt_keys (and d_alt_values) pointers in order to - // point to the final results. 
- bool using_alternate_storage; - - // Constructor - RadixSortStorage(K* keys = NULL, V* values = NULL) - { - d_keys = keys; - d_values = values; - d_alt_keys = NULL; - d_alt_values = NULL; - d_spine = NULL; - d_from_alt_storage = NULL; - - using_alternate_storage = false; - } - - // Clean up non-results storage (may include freeing original storage if - // primary pointers were swizzled as per using_alternate_storage) - cudaError_t CleanupTempStorage() - { - if (d_alt_keys) cudaFree(d_alt_keys); - if (d_alt_values) cudaFree(d_alt_values); - if (d_spine) cudaFree(d_spine); - if (d_from_alt_storage) cudaFree(d_from_alt_storage); - - return cudaSuccess; - } -}; - - - -/****************************************************************************** - * Base class for sorting enactors - ******************************************************************************/ - - -/** - * Base class for SRTS radix sorting enactors. - */ -template -class BaseRadixSortingEnactor -{ -public: - - // Unsigned integer type suitable for radix sorting of keys - typedef typename KeyConversion::UnsignedBits ConvertedKeyType; - -protected: - - // - // Information about our problem configuration - // - - bool _keys_only; - unsigned int _num_elements; - int _cycle_elements; - int _spine_elements; - int _grid_size; - CtaDecomposition _work_decomposition; - int _passes; - bool _swizzle_pointers_for_odd_passes; - - // Information about our target device - cudaDeviceProp _device_props; - int _device_sm_version; - - // Information about our kernel assembly - int _kernel_ptx_version; - cudaFuncAttributes _spine_scan_kernel_attrs; - -protected: - - /** - * Constructor. - */ - BaseRadixSortingEnactor(int passes, int radix_bits, unsigned int num_elements, int max_grid_size, bool swizzle_pointers_for_odd_passes = true); - - /** - * Heuristic for determining the number of CTAs to launch. - * - * @param[in] max_grid_size - * Maximum allowable number of CTAs to launch. A value of 0 indicates - * that the default value should be used. - * - * @return The actual number of CTAs that should be launched - */ - int GridSize(int max_grid_size); - - /** - * Performs a distribution sorting pass over a single digit place - */ - template - cudaError_t DigitPlacePass(const RadixSortStorage &converted_storage); - - /** - * Enacts a sorting operation by performing the the appropriate - * digit-place passes. To be overloaded by specialized subclasses. - */ - virtual cudaError_t EnactDigitPlacePasses(const RadixSortStorage &converted_storage) = 0; - -public: - - /** - * Returns the length (in unsigned ints) of the device vector needed for - * temporary storage of the reduction spine. Useful if pre-allocating - * your own device storage (as opposed to letting EnactSort() allocate it - * for you). - */ - int SpineElements() { return _spine_elements; } - - /** - * Returns whether or not the problem will fit on the device. - */ - bool CanFit(); - - /** - * Enacts a radix sorting operation on the specified device data. - * - * IMPORTANT NOTES: The device storage backing the specified input vectors of - * keys (and data) will be modified. (I.e., treat this as an in-place sort.) - * - * Additionally, the pointers in the problem_storage structure may be updated - * (a) depending upon the number of digit-place sorting passes needed, and (b) - * whether or not the caller has already allocated temporary storage. - * - * The sorted results will always be referenced by problem_storage.d_keys (and - * problem_storage.d_values). 
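The USAGE notes above have lost their template argument lists in this copy; assuming both RadixSortStorage and RadixSortingEnactor are parameterized on the key type (as the surrounding class definitions suggest), the keys-only float example would look roughly like the sketch below. The wrapper function name and include path are illustrative only.

    #include "radixsort_api.h"   // assumed include name for the header removed above

    // Hypothetical host-side wrapper showing the keys-only call sequence.
    cudaError_t sort_float_keys(float *&d_float_keys, unsigned int d_float_keys_len)
    {
        using namespace thrust::system::cuda::detail::detail::b40c_thrust;

        RadixSortStorage<float>    device_storage(d_float_keys);
        RadixSortingEnactor<float> sorter(d_float_keys_len);
        cudaError_t retval = sorter.EnactSort(device_storage);

        // The sorted keys may sit behind a swizzled pointer; re-read it before cleanup.
        d_float_keys = device_storage.d_keys;
        device_storage.CleanupTempStorage();
        return retval;
    }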
However, for an odd number of sorting passes (uncommon) - * these results will actually be backed by the storage initially allocated for - * by problem_storage.d_alt_keys (and problem_storage.d_alt_values). If so, - * problem_storage.d_alt_keys and problem_storage.d_alt_keys will be updated to - * reference the original problem_storage.d_keys and problem_storage.d_values in order - * to facilitate cleanup. - * - * This means it is important to avoid keeping stale copies of device pointers - * to keys/data; you will want to re-reference the pointers in problem_storage. - * - * @param[in/out] problem_storage - * Device vectors of keys and values to sort, and ancillary storage - * needed by the sorting kernels. See the IMPORTANT NOTES above. - * - * The problem_storage.[alternate_keys|alternate_values|d_spine] fields are - * temporary storage needed by the sorting kernels. To facilitate - * speed, callers are welcome to re-use this storage for same-sized - * (or smaller) sortign problems. If NULL, these storage vectors will be - * allocated by this routine (and must be subsequently cuda-freed by - * the caller). - * - * @return cudaSuccess on success, error enumeration otherwise - */ - cudaError_t EnactSort(RadixSortStorage &problem_storage); - - /* - * Destructor - */ - virtual ~BaseRadixSortingEnactor() {} -}; - - - -template -BaseRadixSortingEnactor::BaseRadixSortingEnactor( - int passes, - int max_radix_bits, - unsigned int num_elements, - int max_grid_size, - bool swizzle_pointers_for_odd_passes) -{ - // - // Get current device properties - // - - int current_device; - cudaGetDevice(¤t_device); - cudaGetDeviceProperties(&_device_props, current_device); - _device_sm_version = _device_props.major * 100 + _device_props.minor * 10; - - - // - // Get SM version of compiled kernel assembly - // - cudaFuncGetAttributes(&_spine_scan_kernel_attrs, SrtsScanSpine); - _kernel_ptx_version = _spine_scan_kernel_attrs.ptxVersion * 10; - - - // - // Determine number of CTAs to launch, shared memory, cycle elements, etc. 
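The constructor above pairs the device's SM version with the SM version the kernel binaries were actually compiled for, via cudaFuncGetAttributes. A minimal standalone sketch of that probing pattern (using a placeholder kernel, since SrtsScanSpine's template arguments are not reproducible here):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void ProbeKernel() {}

    int main()
    {
        int device = 0;
        cudaDeviceProp props;
        cudaGetDevice(&device);
        cudaGetDeviceProperties(&props, device);
        int device_sm_version = props.major * 100 + props.minor * 10;   // e.g. 200 for SM 2.0

        cudaFuncAttributes attrs;
        cudaFuncGetAttributes(&attrs, ProbeKernel);
        int kernel_ptx_version = attrs.ptxVersion * 10;                 // PTX target the kernel was built for

        printf("device SM %d, kernel PTX %d\n", device_sm_version, kernel_ptx_version);
        return 0;
    }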
- // - - _passes = passes; - _num_elements = num_elements; - _keys_only = IsKeysOnly(); - _cycle_elements = B40C_RADIXSORT_CYCLE_ELEMENTS(_kernel_ptx_version , ConvertedKeyType, V); - _grid_size = GridSize(max_grid_size); - _swizzle_pointers_for_odd_passes = swizzle_pointers_for_odd_passes; - - int total_cycles = _num_elements / _cycle_elements; - unsigned int cycles_per_block = total_cycles / _grid_size; - unsigned int extra_cycles = total_cycles - (cycles_per_block * _grid_size); - - CtaDecomposition work_decomposition = { - extra_cycles, // num_big_blocks - (cycles_per_block + 1) * _cycle_elements, // big_block_elements - cycles_per_block * _cycle_elements, // normal_block_elements - _num_elements - (total_cycles * _cycle_elements), // extra_elements_last_block - _num_elements}; // num_elements - - _work_decomposition = work_decomposition; - - int spine_cycles = ((_grid_size * (1 << max_radix_bits)) + B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS - 1) / B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS; - _spine_elements = spine_cycles * B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS; -} - - - -template -int BaseRadixSortingEnactor::GridSize(int max_grid_size) -{ - const int SINGLE_CTA_CUTOFF = 0; // right now zero; we have no single-cta sorting - - // find maximum number of threadblocks if "use-default" - if (max_grid_size == 0) { - - if (_num_elements <= static_cast(SINGLE_CTA_CUTOFF)) { - - // The problem size is too small to warrant a two-level reduction: - // use only one stream-processor - max_grid_size = 1; - - } else { - - if (_device_sm_version <= 120) { - - // G80/G90 - max_grid_size = _device_props.multiProcessorCount * 4; - - } else if (_device_sm_version < 200) { - - // GT200 (has some kind of TLB or icache drama) - int orig_max_grid_size = _device_props.multiProcessorCount * B40C_RADIXSORT_SCAN_SCATTER_CTA_OCCUPANCY(_kernel_ptx_version); - if (_keys_only) { - orig_max_grid_size *= (_num_elements + (1024 * 1024 * 96) - 1) / (1024 * 1024 * 96); - } else { - orig_max_grid_size *= (_num_elements + (1024 * 1024 * 64) - 1) / (1024 * 1024 * 64); - } - max_grid_size = orig_max_grid_size; - - if (_num_elements / _cycle_elements > static_cast(max_grid_size)) { - - double multiplier1 = 4.0; - double multiplier2 = 16.0; - - double delta1 = 0.068; - double delta2 = 0.127; - - int dividend = (_num_elements + _cycle_elements - 1) / _cycle_elements; - - while(true) { - - double quotient = ((double) dividend) / (multiplier1 * max_grid_size); - quotient -= (int) quotient; - - if ((quotient > delta1) && (quotient < 1 - delta1)) { - - quotient = ((double) dividend) / (multiplier2 * max_grid_size / 3.0); - quotient -= (int) quotient; - - if ((quotient > delta2) && (quotient < 1 - delta2)) { - break; - } - } - - if (max_grid_size == orig_max_grid_size - 2) { - max_grid_size = orig_max_grid_size - 30; - } else { - max_grid_size -= 1; - } - } - } - } else { - - // GF100 - max_grid_size = 418; - } - } - } - - // Calculate the actual number of threadblocks to launch. Initially - // assume that each threadblock will do only one cycle_elements worth - // of work, but then clamp it by the "max" restriction derived above - // in order to accomodate the "single-sp" and "saturated" cases. 
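The work decomposition computed above splits the input into "big" CTAs (one extra cycle each), "normal" CTAs, and a remainder that only the last CTA touches. A host-side sketch with one worked case in the comments (the concrete numbers are illustrative):

    #include <cstdio>

    // Example: num_elements = 10000, cycle_elements = 512, grid_size = 4
    //   total_cycles = 19, cycles_per_block = 4, extra_cycles = 3
    //   -> 3 big CTAs of 5*512 = 2560 elements, 1 normal CTA of 2048,
    //      and 10000 - 19*512 = 272 extra elements guarded by the last CTA.
    int main()
    {
        unsigned int num_elements = 10000;
        int cycle_elements = 512;
        int grid_size = 4;

        int total_cycles              = num_elements / cycle_elements;
        unsigned int cycles_per_block = total_cycles / grid_size;
        unsigned int extra_cycles     = total_cycles - (cycles_per_block * grid_size);

        printf("big: %u CTAs x %u elems, normal: %u elems, extra in last CTA: %u\n",
               extra_cycles,
               (cycles_per_block + 1) * cycle_elements,
               cycles_per_block * cycle_elements,
               num_elements - (total_cycles * cycle_elements));
        return 0;
    }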
- - int grid_size = _num_elements / _cycle_elements; - if (grid_size == 0) { - grid_size = 1; - } - if (grid_size > max_grid_size) { - grid_size = max_grid_size; - } - - return grid_size; -} - - - -template -bool BaseRadixSortingEnactor:: -CanFit() -{ - long long bytes = (_num_elements * sizeof(K) * 2) + (_spine_elements * sizeof(int)); - if (!_keys_only) bytes += _num_elements * sizeof(V) * 2; - - if (_device_props.totalGlobalMem < 1024 * 1024 * 513) { - return (bytes < ((double) _device_props.totalGlobalMem) * 0.81); // allow up to 81% capacity for 512MB - } - - return (bytes < ((double) _device_props.totalGlobalMem) * 0.89); // allow up to 90% capacity -} - - - -template -template -cudaError_t BaseRadixSortingEnactor:: -DigitPlacePass(const RadixSortStorage &converted_storage) -{ - int threads = B40C_RADIXSORT_THREADS; - int dynamic_smem; - - cudaFuncAttributes reduce_kernel_attrs, scan_scatter_attrs; - cudaFuncGetAttributes(&reduce_kernel_attrs, RakingReduction); - cudaFuncGetAttributes(&scan_scatter_attrs, ScanScatterDigits); - - // - // Counting Reduction - // - - // Run tesla flush kernel if we have two or more threadblocks for each of the SMs - if ((_device_sm_version == 130) && (_work_decomposition.num_elements > static_cast(_device_props.multiProcessorCount * _cycle_elements * 2))) { - FlushKernel<<<_grid_size, B40C_RADIXSORT_THREADS, scan_scatter_attrs.sharedSizeBytes>>>(); - synchronize_if_enabled("FlushKernel"); - } - - // GF100 and GT200 get the same smem allocation for every kernel launch (pad the reduction/top-level-scan kernels) - dynamic_smem = (_kernel_ptx_version >= 130) ? scan_scatter_attrs.sharedSizeBytes - reduce_kernel_attrs.sharedSizeBytes : 0; - - RakingReduction <<<_grid_size, threads, dynamic_smem>>>( - converted_storage.d_from_alt_storage, - converted_storage.d_spine, - converted_storage.d_keys, - converted_storage.d_alt_keys, - _work_decomposition); - synchronize_if_enabled("RakingReduction"); - - - // - // Spine - // - - // GF100 and GT200 get the same smem allocation for every kernel launch (pad the reduction/top-level-scan kernels) - dynamic_smem = (_kernel_ptx_version >= 130) ? 
scan_scatter_attrs.sharedSizeBytes - _spine_scan_kernel_attrs.sharedSizeBytes : 0; - - SrtsScanSpine<<<_grid_size, B40C_RADIXSORT_SPINE_THREADS, dynamic_smem>>>( - converted_storage.d_spine, - converted_storage.d_spine, - _spine_elements); - synchronize_if_enabled("SrtsScanSpine"); - - - // - // Scanning Scatter - // - - // Run tesla flush kernel if we have two or more threadblocks for each of the SMs - if ((_device_sm_version == 130) && (_work_decomposition.num_elements > static_cast(_device_props.multiProcessorCount * _cycle_elements * 2))) { - FlushKernel<<<_grid_size, B40C_RADIXSORT_THREADS, scan_scatter_attrs.sharedSizeBytes>>>(); - synchronize_if_enabled("FlushKernel"); - } - - ScanScatterDigits <<<_grid_size, threads, 0>>>( - converted_storage.d_from_alt_storage, - converted_storage.d_spine, - converted_storage.d_keys, - converted_storage.d_alt_keys, - converted_storage.d_values, - converted_storage.d_alt_values, - _work_decomposition); - synchronize_if_enabled("ScanScatterDigits"); - - return cudaSuccess; -} - - - -template -cudaError_t BaseRadixSortingEnactor:: -EnactSort(RadixSortStorage &problem_storage) -{ - // - // Allocate device memory for temporary storage (if necessary) - // - - if (problem_storage.d_alt_keys == NULL) { - cudaMalloc((void**) &problem_storage.d_alt_keys, _num_elements * sizeof(K)); - } - if (!_keys_only && (problem_storage.d_alt_values == NULL)) { - cudaMalloc((void**) &problem_storage.d_alt_values, _num_elements * sizeof(V)); - } - if (problem_storage.d_spine == NULL) { - cudaMalloc((void**) &problem_storage.d_spine, _spine_elements * sizeof(int)); - } - if (problem_storage.d_from_alt_storage == NULL) { - cudaMalloc((void**) &problem_storage.d_from_alt_storage, 2 * sizeof(bool)); - } - - // Determine suitable type of unsigned byte storage to use for keys - typedef typename KeyConversion::UnsignedBits ConvertedKeyType; - - // Copy storage pointers to an appropriately typed stucture - RadixSortStorage converted_storage; - memcpy(&converted_storage, &problem_storage, sizeof(RadixSortStorage)); - - // - // Enact the sorting operation - // - - if (RADIXSORT_DEBUG) { - - printf("_device_sm_version: %d, _kernel_ptx_version: %d\n", _device_sm_version, _kernel_ptx_version); - printf("Bottom-level reduction & scan kernels:\n\tgrid_size: %d, \n\tthreads: %d, \n\tcycle_elements: %d, \n\tnum_big_blocks: %d, \n\tbig_block_elements: %d, \n\tnormal_block_elements: %d\n\textra_elements_last_block: %d\n\n", - _grid_size, B40C_RADIXSORT_THREADS, _cycle_elements, _work_decomposition.num_big_blocks, _work_decomposition.big_block_elements, _work_decomposition.normal_block_elements, _work_decomposition.extra_elements_last_block); - printf("Top-level spine scan:\n\tgrid_size: %d, \n\tthreads: %d, \n\tspine_block_elements: %d\n\n", - _grid_size, B40C_RADIXSORT_SPINE_THREADS, _spine_elements); - } - - cudaError_t retval = EnactDigitPlacePasses(converted_storage); - - - // - // Swizzle pointers if we left our sorted output in temp storage - // - - if (_swizzle_pointers_for_odd_passes) { - - cudaMemcpy( - &problem_storage.using_alternate_storage, - &problem_storage.d_from_alt_storage[_passes & 0x1], - sizeof(bool), - cudaMemcpyDeviceToHost); - - if (problem_storage.using_alternate_storage) { - thrust::swap(problem_storage.d_keys, problem_storage.d_alt_keys); - if (!_keys_only) { - thrust::swap(problem_storage.d_values, problem_storage.d_alt_values); - } - } - } - - return retval; -} - - - - - -/****************************************************************************** - * Sorting 
enactor classes - ******************************************************************************/ - -/** - * Generic sorting enactor class. Simply create an instance of this class - * with your key-type K (and optionally value-type V if sorting with satellite - * values). - * - * Template specialization provides the appropriate enactor instance to handle - * the specified data types. - * - * @template-param K - * Type of keys to be sorted - * - * @template-param V - * Type of values to be sorted. - * - * @template-param ConvertedKeyType - * Leave as default to effect necessary enactor specialization. - */ -template ::UnsignedBits> -class RadixSortingEnactor; - - - -/** - * Sorting enactor that is specialized for for 8-bit key types - */ -template -class RadixSortingEnactor : public BaseRadixSortingEnactor -{ -protected: - - typedef BaseRadixSortingEnactor Base; - typedef typename Base::ConvertedKeyType ConvertedKeyType; - - cudaError_t EnactDigitPlacePasses(const RadixSortStorage &converted_storage) - { - Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<1, 4, 4, NopFunctor, PostprocessKeyFunctor > (converted_storage); - - return cudaSuccess; - } - -public: - - /** - * Constructor. - * - * @param[in] num_elements - * Length (in elements) of the input to a sorting operation - * - * @param[in] max_grid_size - * Maximum allowable number of CTAs to launch. The default value of 0 indicates - * that the dispatch logic should select an appropriate value for the target device. - */ - RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(2, 4, num_elements, max_grid_size) {} - -}; - - - -/** - * Sorting enactor that is specialized for for 16-bit key types - */ -template -class RadixSortingEnactor : public BaseRadixSortingEnactor -{ -protected: - - typedef BaseRadixSortingEnactor Base; - typedef typename Base::ConvertedKeyType ConvertedKeyType; - - cudaError_t EnactDigitPlacePasses(const RadixSortStorage &converted_storage) - { - Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<1, 4, 4, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<2, 4, 8, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<3, 4, 12, NopFunctor, PostprocessKeyFunctor > (converted_storage); - - return cudaSuccess; - } - -public: - - /** - * Constructor. - * - * @param[in] num_elements - * Length (in elements) of the input to a sorting operation - * - * @param[in] max_grid_size - * Maximum allowable number of CTAs to launch. The default value of 0 indicates - * that the dispatch logic should select an appropriate value for the target device. 
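Each specialized enactor above strings together 4-bit digit-place passes at increasing bit offsets (two passes for 8-bit keys, four for 16-bit keys, and so on). This is not the GPU code path, but a plain CPU least-significant-digit radix sort makes that pass structure concrete:

    #include <cstdint>
    #include <vector>

    // Eight 4-bit digit-place passes over 32-bit keys (bits 0, 4, ..., 28),
    // ping-ponging between two buffers like d_keys / d_alt_keys above.
    static void lsd_radix_sort_u32(std::vector<uint32_t> &keys)
    {
        std::vector<uint32_t> alt(keys.size());
        for (int bit = 0; bit < 32; bit += 4) {
            size_t count[17] = {0};
            for (uint32_t k : keys) count[((k >> bit) & 0xF) + 1]++;      // histogram
            for (int d = 0; d < 16; d++) count[d + 1] += count[d];        // exclusive scan -> digit offsets
            for (uint32_t k : keys) alt[count[(k >> bit) & 0xF]++] = k;   // stable scatter
            keys.swap(alt);
        }
    }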
- */ - RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(4, 4, num_elements, max_grid_size) {} - -}; - - -/** - * Sorting enactor that is specialized for for 32-bit key types - */ -template -class RadixSortingEnactor : public BaseRadixSortingEnactor -{ -protected: - - typedef BaseRadixSortingEnactor Base; - typedef typename Base::ConvertedKeyType ConvertedKeyType; - - cudaError_t EnactDigitPlacePasses(const RadixSortStorage &converted_storage) - { - Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<1, 4, 4, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<2, 4, 8, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<3, 4, 12, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<4, 4, 16, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<5, 4, 20, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<6, 4, 24, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<7, 4, 28, NopFunctor, PostprocessKeyFunctor > (converted_storage); - - return cudaSuccess; - } - -public: - - /** - * Constructor. - * - * @param[in] num_elements - * Length (in elements) of the input to a sorting operation - * - * @param[in] max_grid_size - * Maximum allowable number of CTAs to launch. The default value of 0 indicates - * that the dispatch logic should select an appropriate value for the target device. - */ - RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(8, 4, num_elements, max_grid_size) {} - -}; - - - -/** - * Sorting enactor that is specialized for for 64-bit key types - */ -template -class RadixSortingEnactor : public BaseRadixSortingEnactor -{ -protected: - - typedef BaseRadixSortingEnactor Base; - typedef typename Base::ConvertedKeyType ConvertedKeyType; - - cudaError_t EnactDigitPlacePasses(const RadixSortStorage &converted_storage) - { - Base::template DigitPlacePass<0, 4, 0, PreprocessKeyFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<1, 4, 4, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<2, 4, 8, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<3, 4, 12, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<4, 4, 16, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<5, 4, 20, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<6, 4, 24, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<7, 4, 28, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<8, 4, 32, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<9, 4, 36, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<10, 4, 40, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<11, 4, 44, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<12, 4, 48, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<13, 4, 52, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<14, 4, 56, NopFunctor, NopFunctor >(converted_storage); - Base::template DigitPlacePass<15, 4, 60, NopFunctor, PostprocessKeyFunctor > (converted_storage); - - return cudaSuccess; - } - -public: - - /** - * 
Constructor. - * - * @param[in] num_elements - * Length (in elements) of the input to a sorting operation - * - * @param[in] max_grid_size - * Maximum allowable number of CTAs to launch. The default value of 0 indicates - * that the dispatch logic should select an appropriate value for the target device. - */ - RadixSortingEnactor(unsigned int num_elements, int max_grid_size = 0) : Base::BaseRadixSortingEnactor(16, 4, num_elements, max_grid_size) {} - -}; - - -} // end namespace b40c_thrust -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_kernel_common.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_kernel_common.h deleted file mode 100644 index 7899dc3c0e..0000000000 --- a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_kernel_common.h +++ /dev/null @@ -1,173 +0,0 @@ -/****************************************************************************** - * - * Copyright 2010 Duane Merrill - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * - * - * - * AUTHORS' REQUEST: - * - * If you use|reference|benchmark this code, please cite our Technical - * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): - * - * @TechReport{ Merrill:Sorting:2010, - * author = "Duane Merrill and Andrew Grimshaw", - * title = "Revisiting Sorting for GPGPU Stream Architectures", - * year = "2010", - * institution = "University of Virginia, Department of Computer Science", - * address = "Charlottesville, VA, USA", - * number = "CS2010-03" - * } - * - * For more information, see our Google Code project site: - * http://code.google.com/p/back40computing/ - * - * Thanks! - * - ******************************************************************************/ - - -/****************************************************************************** - * Configuration management for B40C radix sorting kernels - ******************************************************************************/ - -#pragma once - -#include "kernel_utils.h" -#include "vector_types.h" -#include "radixsort_key_conversion.h" - -namespace thrust { -namespace system { -namespace cuda { -namespace detail { -namespace detail { -namespace b40c_thrust { - - -/****************************************************************************** - * Radix sorting configuration - ******************************************************************************/ - -// 128 threads -#define B40C_RADIXSORT_LOG_THREADS 7 -#define B40C_RADIXSORT_THREADS (1 << B40C_RADIXSORT_LOG_THREADS) - -// Target threadblock occupancy for counting/reduction kernel -#define B40C_SM20_REDUCE_CTA_OCCUPANCY() (8) // 8 threadblocks on GF100 -#define B40C_SM12_REDUCE_CTA_OCCUPANCY() (5) // 5 threadblocks on GT200 -#define B40C_SM10_REDUCE_CTA_OCCUPANCY() (3) // 4 threadblocks on G80 -#define B40C_RADIXSORT_REDUCE_CTA_OCCUPANCY(version) ((version >= 200) ? 
B40C_SM20_REDUCE_CTA_OCCUPANCY() : \ - (version >= 120) ? B40C_SM12_REDUCE_CTA_OCCUPANCY() : \ - B40C_SM10_REDUCE_CTA_OCCUPANCY()) - -// Target threadblock occupancy for bulk scan/scatter kernel -#define B40C_SM20_SCAN_SCATTER_CTA_OCCUPANCY() (7) // 7 threadblocks on GF100 -#define B40C_SM12_SCAN_SCATTER_CTA_OCCUPANCY() (5) // 5 threadblocks on GT200 -#define B40C_SM10_SCAN_SCATTER_CTA_OCCUPANCY() (2) // 2 threadblocks on G80 -#define B40C_RADIXSORT_SCAN_SCATTER_CTA_OCCUPANCY(version) ((version >= 200) ? B40C_SM20_SCAN_SCATTER_CTA_OCCUPANCY() : \ - (version >= 120) ? B40C_SM12_SCAN_SCATTER_CTA_OCCUPANCY() : \ - B40C_SM10_SCAN_SCATTER_CTA_OCCUPANCY()) - -// Number of 256-element sets to rake per raking pass -#define B40C_SM20_LOG_SETS_PER_PASS() (1) // 2 sets on GF100 -#define B40C_SM12_LOG_SETS_PER_PASS() (0) // 1 set on GT200 -#define B40C_SM10_LOG_SETS_PER_PASS() (1) // 2 sets on G80 -#define B40C_RADIXSORT_LOG_SETS_PER_PASS(version) ((version >= 200) ? B40C_SM20_LOG_SETS_PER_PASS() : \ - (version >= 120) ? B40C_SM12_LOG_SETS_PER_PASS() : \ - B40C_SM10_LOG_SETS_PER_PASS()) - -// Number of raking passes per cycle -#define B40C_SM20_LOG_PASSES_PER_CYCLE(K, V) (((B40C_MAX(sizeof(K), sizeof(V)) > 4) || _B40C_LP64_) ? 0 : 1) // 2 passes on GF100 (only one for large keys/values, or for 64-bit device pointers) -#define B40C_SM12_LOG_PASSES_PER_CYCLE(K, V) (B40C_MAX(sizeof(K), sizeof(V)) > 4 ? 0 : 1) // 2 passes on GT200 (only for large keys/values) -#define B40C_SM10_LOG_PASSES_PER_CYCLE(K, V) (0) // 1 pass on G80 -#define B40C_RADIXSORT_LOG_PASSES_PER_CYCLE(version, K, V) ((version >= 200) ? B40C_SM20_LOG_PASSES_PER_CYCLE(K, V) : \ - (version >= 120) ? B40C_SM12_LOG_PASSES_PER_CYCLE(K, V) : \ - B40C_SM10_LOG_PASSES_PER_CYCLE(K, V)) - - -// Number of raking threads per raking pass -#define B40C_SM20_LOG_RAKING_THREADS_PER_PASS() (B40C_LOG_WARP_THREADS + 1) // 2 raking warps on GF100 -#define B40C_SM12_LOG_RAKING_THREADS_PER_PASS() (B40C_LOG_WARP_THREADS) // 1 raking warp on GT200 -#define B40C_SM10_LOG_RAKING_THREADS_PER_PASS() (B40C_LOG_WARP_THREADS + 2) // 4 raking warps on G80 -#define B40C_RADIXSORT_LOG_RAKING_THREADS_PER_PASS(version) ((version >= 200) ? B40C_SM20_LOG_RAKING_THREADS_PER_PASS() : \ - (version >= 120) ? 
B40C_SM12_LOG_RAKING_THREADS_PER_PASS() : \ - B40C_SM10_LOG_RAKING_THREADS_PER_PASS()) - - -// Number of elements per cycle -#define B40C_RADIXSORT_LOG_CYCLE_ELEMENTS(version, K, V) (B40C_RADIXSORT_LOG_SETS_PER_PASS(version) + B40C_RADIXSORT_LOG_PASSES_PER_CYCLE(version, K, V) + B40C_RADIXSORT_LOG_THREADS + 1) -#define B40C_RADIXSORT_CYCLE_ELEMENTS(version, K, V) (1 << B40C_RADIXSORT_LOG_CYCLE_ELEMENTS(version, K, V)) - -// Number of warps per CTA -#define B40C_RADIXSORT_LOG_WARPS (B40C_RADIXSORT_LOG_THREADS - B40C_LOG_WARP_THREADS) -#define B40C_RADIXSORT_WARPS (1 << B40C_RADIXSORT_LOG_WARPS) - -// Number of threads for spine-scanning kernel -#define B40C_RADIXSORT_LOG_SPINE_THREADS 7 // 128 threads -#define B40C_RADIXSORT_SPINE_THREADS (1 << B40C_RADIXSORT_LOG_SPINE_THREADS) - -// Number of elements per spine-scanning cycle -#define B40C_RADIXSORT_LOG_SPINE_CYCLE_ELEMENTS 9 // 512 elements -#define B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS (1 << B40C_RADIXSORT_LOG_SPINE_CYCLE_ELEMENTS) - - - -/****************************************************************************** - * SRTS Control Structures - ******************************************************************************/ - - -/** - * Value-type structure denoting keys-only sorting - */ -struct KeysOnlyType {}; - -/** - * Returns whether or not the templated type indicates keys-only sorting - */ -template -inline __host__ __device__ bool IsKeysOnly() {return false;} - - -/** - * Returns whether or not the templated type indicates keys-only sorting - */ -template <> -inline __host__ __device__ bool IsKeysOnly() {return true;} - - -/** - * A given threadblock may receive one of three different amounts of - * work: "big", "normal", and "last". The big workloads are one - * cycle_elements greater than the normal, and the last workload - * does the extra (problem-size % cycle_elements) work. - */ -struct CtaDecomposition { - unsigned int num_big_blocks; - unsigned int big_block_elements; - unsigned int normal_block_elements; - unsigned int extra_elements_last_block; - unsigned int num_elements; -}; - - -} // end namespace b40c_thrust -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h deleted file mode 100644 index a170f95e6c..0000000000 --- a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h +++ /dev/null @@ -1,352 +0,0 @@ -/****************************************************************************** - * - * Copyright 2010 Duane Merrill - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
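Putting the tuning macros above together for one configuration: SM 2.0, 128 threads per CTA, 4-byte keys, keys-only sorting, and a 32-bit build (so _B40C_LP64_ is false; that last point is an assumption about the build). The per-CTA tile size then falls out of the logarithms:

    #include <cstdio>

    // Inlined evaluation of B40C_RADIXSORT_CYCLE_ELEMENTS for the configuration above.
    int main()
    {
        const int log_threads          = 7;   // B40C_RADIXSORT_LOG_THREADS: 128 threads
        const int log_sets_per_pass    = 1;   // SM20: 2 sets per raking pass
        const int log_passes_per_cycle = 1;   // SM20, small keys/values, not LP64: 2 passes per cycle
        const int log_cycle_elements   = log_sets_per_pass + log_passes_per_cycle + log_threads + 1;

        printf("elements per cycle = %d\n", 1 << log_cycle_elements);   // 1 << 10 = 1024
        return 0;
    }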
- * - * - * - * - * AUTHORS' REQUEST: - * - * If you use|reference|benchmark this code, please cite our Technical - * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): - * - * @TechReport{ Merrill:Sorting:2010, - * author = "Duane Merrill and Andrew Grimshaw", - * title = "Revisiting Sorting for GPGPU Stream Architectures", - * year = "2010", - * institution = "University of Virginia, Department of Computer Science", - * address = "Charlottesville, VA, USA", - * number = "CS2010-03" - * } - * - * For more information, see our Google Code project site: - * http://code.google.com/p/back40computing/ - * - * Thanks! - * - ******************************************************************************/ - - -/****************************************************************************** - * Functors for converting signed and floating point types to unsigned types - * suitable for radix sorting - ******************************************************************************/ - -#pragma once - -namespace thrust { -namespace system { -namespace cuda { -namespace detail { -namespace detail { -namespace b40c_thrust { - - -// -// Do-nothing functors -// - -template -struct NopFunctor{ - template - __device__ __host__ __forceinline__ void operator()(ConvertedKeyType &converted_key) {} - __device__ __host__ __forceinline__ static bool MustApply(){ return false;} -}; - -// -// Do-nothing functors that indicate a mandatory pass -// - -template -struct MandatoryPassNopFunctor{ - template - __device__ __host__ __forceinline__ void operator()(ConvertedKeyType &converted_key) {} - __device__ __host__ __forceinline__ static bool MustApply(){ return false;} -}; - - -// -// Conversion for generic unsigned types -// - -template struct KeyConversion { - typedef T UnsignedBits; -}; - -template -struct PreprocessKeyFunctor{ - template - __device__ __host__ __forceinline__ void operator()(ConvertedKeyType &converted_key) {} - __device__ __host__ __forceinline__ static bool MustApply(){ return false;} -}; - -template -struct PostprocessKeyFunctor { - template - __device__ __host__ __forceinline__ void operator()(ConvertedKeyType &converted_key) {} - __device__ __host__ __forceinline__ static bool MustApply(){ return false;} -}; - - - -// -// Conversion for floats -// - -template <> struct KeyConversion { - typedef unsigned int UnsignedBits; -}; - -template <> -struct PreprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned int &converted_key) { - - unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000; - converted_key ^= mask; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - -template <> -struct PostprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned int &converted_key) { - - unsigned int mask = (converted_key & 0x80000000) ? 0x80000000 : 0xffffffff; - converted_key ^= mask; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - - - -// -// Conversion for doubles -// - -template <> struct KeyConversion { - typedef unsigned long long UnsignedBits; -}; - -template <> -struct PreprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned long long &converted_key) { - - unsigned long long mask = (converted_key & 0x8000000000000000) ? 
0xffffffffffffffff : 0x8000000000000000; - converted_key ^= mask; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - -template <> -struct PostprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned long long &converted_key) { - unsigned long long mask = (converted_key & 0x8000000000000000) ? 0x8000000000000000 : 0xffffffffffffffff; - converted_key ^= mask; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - - -// -// Conversion for signed chars -// - -template <> struct KeyConversion { - typedef unsigned char UnsignedBits; -}; - -template <> -struct PreprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned char &converted_key) { - // char is unsigned on some platforms, so we have to check - if(std::numeric_limits::is_signed) - { - const unsigned int SIGN_MASK = 1u << ((sizeof(char) * 8) - 1); - converted_key ^= SIGN_MASK; - } - } - __device__ __host__ __forceinline__ static bool MustApply(){ return std::numeric_limits::is_signed;} -}; - -template <> -struct PostprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned char &converted_key) { - // char is unsigned on some platforms, so we have to check - if(std::numeric_limits::is_signed) - { - const unsigned int SIGN_MASK = 1u << ((sizeof(char) * 8) - 1); - converted_key ^= SIGN_MASK; - } - } - __device__ __host__ __forceinline__ static bool MustApply(){ return std::numeric_limits::is_signed;} -}; - - -// TODO handle this more gracefully -template <> struct KeyConversion { - typedef unsigned char UnsignedBits; -}; - -template <> -struct PreprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned char &converted_key) { - const unsigned int SIGN_MASK = 1u << ((sizeof(char) * 8) - 1); - converted_key ^= SIGN_MASK; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - -template <> -struct PostprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned char &converted_key) { - const unsigned int SIGN_MASK = 1u << ((sizeof(char) * 8) - 1); - converted_key ^= SIGN_MASK; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - - -// -// Conversion for signed shorts -// - -template <> struct KeyConversion { - typedef unsigned short UnsignedBits; -}; - -template <> -struct PreprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned short &converted_key) { - const unsigned int SIGN_MASK = 1u << ((sizeof(short) * 8) - 1); - converted_key ^= SIGN_MASK; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - -template <> -struct PostprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned short &converted_key) { - const unsigned int SIGN_MASK = 1u << ((sizeof(short) * 8) - 1); - converted_key ^= SIGN_MASK; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - - - -// -// Conversion for signed ints -// - -template <> struct KeyConversion { - typedef unsigned int UnsignedBits; -}; - -template <> -struct PreprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned int &converted_key) { - const unsigned int SIGN_MASK = 1u << ((sizeof(int) * 8) - 1); - converted_key ^= SIGN_MASK; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - -template <> -struct PostprocessKeyFunctor { - __device__ __host__ __forceinline__ void 
operator()(unsigned int &converted_key) { - const unsigned int SIGN_MASK = 1u << ((sizeof(int) * 8) - 1); - converted_key ^= SIGN_MASK; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - - - -// -// Conversion for signed longs -// - -// TODO rework this with metaprogramming -template <> struct KeyConversion { -#if ULONG_MAX == UINT_MAX - typedef unsigned int UnsignedBits; -#else - typedef unsigned long long UnsignedBits; -#endif -}; - -// TODO rework this with metaprogramming -template <> struct KeyConversion { -#if ULONG_MAX == UINT_MAX - typedef unsigned int UnsignedBits; -#else - typedef unsigned long long UnsignedBits; -#endif -}; - -template <> -struct PreprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(typename KeyConversion::UnsignedBits& converted_key) { - const typename KeyConversion::UnsignedBits SIGN_MASK = 1ul << ((sizeof(long) * 8) - 1); - converted_key ^= SIGN_MASK; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - -template <> -struct PostprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(typename KeyConversion::UnsignedBits& converted_key) { - const typename KeyConversion::UnsignedBits SIGN_MASK = 1ul << ((sizeof(long) * 8) - 1); - converted_key ^= SIGN_MASK; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - - - -// -// Conversion for signed long longs -// - -template <> struct KeyConversion { - typedef unsigned long long UnsignedBits; -}; - -template <> -struct PreprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned long long &converted_key) { - const unsigned long long SIGN_MASK = 1ull << ((sizeof(long long) * 8) - 1); - converted_key ^= SIGN_MASK; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - -template <> -struct PostprocessKeyFunctor { - __device__ __host__ __forceinline__ void operator()(unsigned long long &converted_key) { - const unsigned long long SIGN_MASK = 1ull << ((sizeof(long long) * 8) - 1); - converted_key ^= SIGN_MASK; - } - __device__ __host__ __forceinline__ static bool MustApply(){ return true;} -}; - - -} // end namespace b40c_thrust -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_reduction_kernel.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_reduction_kernel.h deleted file mode 100644 index a8f91d3d24..0000000000 --- a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_reduction_kernel.h +++ /dev/null @@ -1,439 +0,0 @@ -/****************************************************************************** - * - * Copyright 2010 Duane Merrill - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
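For reference, the order-preserving bit transforms implemented by the key-conversion functors above can be exercised on the host: a negative float has every bit flipped, a non-negative one only its sign bit, and the post-process transform undoes it. A compact round-trip sketch:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Map a float to a uint32_t whose unsigned ordering matches the float ordering,
    // following the same mask rule as the float preprocess functor above.
    static uint32_t float_to_sortable(float f)
    {
        uint32_t u;
        std::memcpy(&u, &f, sizeof(u));
        uint32_t mask = (u & 0x80000000u) ? 0xffffffffu : 0x80000000u;
        return u ^ mask;
    }

    // Inverse transform, matching the float postprocess functor above.
    static float sortable_to_float(uint32_t u)
    {
        u ^= (u & 0x80000000u) ? 0x80000000u : 0xffffffffu;
        float f;
        std::memcpy(&f, &u, sizeof(f));
        return f;
    }

    int main()
    {
        // Unsigned comparison now agrees with float comparison, and the mapping round-trips.
        assert(float_to_sortable(-2.5f) < float_to_sortable(-1.0f));
        assert(float_to_sortable(-1.0f) < float_to_sortable(0.0f));
        assert(float_to_sortable(0.0f)  < float_to_sortable(3.5f));
        assert(sortable_to_float(float_to_sortable(-2.5f)) == -2.5f);
        return 0;
    }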
- * - * - * - * - * AUTHORS' REQUEST: - * - * If you use|reference|benchmark this code, please cite our Technical - * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): - * - * @TechReport{ Merrill:Sorting:2010, - * author = "Duane Merrill and Andrew Grimshaw", - * title = "Revisiting Sorting for GPGPU Stream Architectures", - * year = "2010", - * institution = "University of Virginia, Department of Computer Science", - * address = "Charlottesville, VA, USA", - * number = "CS2010-03" - * } - * - * For more information, see our Google Code project site: - * http://code.google.com/p/back40computing/ - * - * Thanks! - * - ******************************************************************************/ - - -/****************************************************************************** - * Bottom-level digit-reduction/counting kernel - ******************************************************************************/ - -#pragma once - -#include "radixsort_kernel_common.h" - -namespace thrust { -namespace system { -namespace cuda { -namespace detail { -namespace detail { -namespace b40c_thrust { - -/****************************************************************************** - * Defines - ******************************************************************************/ - -const int BYTE_ENCODE_SHIFT = 0x3; - - -/****************************************************************************** - * Cycle-processing Routines - ******************************************************************************/ - -__device__ __forceinline__ int DecodeInt(int encoded, int quad_byte){ - return (encoded >> quad_byte) & 0xff; // shift right 8 bits per digit and return rightmost 8 bits -} - - -__device__ __forceinline__ int EncodeInt(int count, int quad_byte) { - return count << quad_byte; // shift left 8 bits per digit -} - - -template -__device__ __forceinline__ void DecodeDigit( - K key, - int &lane, - int &quad_shift) -{ - const K DIGIT_MASK = RADIX_DIGITS - 1; - lane = (key & (DIGIT_MASK << BIT)) >> (BIT + 2); - - const K QUAD_MASK = (RADIX_DIGITS < 4) ? 0x1 : 0x3; - if (BIT == 32) { - // N.B.: This takes one more instruction than the code below it, but - // otherwise the compiler goes nuts and shoves hundreds of bytes - // to lmem when bit = 32 on 64-bit keys. 
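EncodeInt/DecodeInt above pack four 8-bit digit counters into the bytes of a single int (BYTE_ENCODE_SHIFT turns a quad index into a bit offset). A host-side illustration of the packing, with the overflow constraint the load loop works around noted in the comments:

    #include <cassert>

    // Quad byte n lives at bit offset n << 3, matching BYTE_ENCODE_SHIFT above.
    static inline int encode(int count, int quad)  { return count << (quad << 3); }
    static inline int decode(int packed, int quad) { return (packed >> (quad << 3)) & 0xff; }

    int main()
    {
        int packed = 0;
        packed += encode(1, 0);   // one key whose digit maps to quad byte 0
        packed += encode(1, 2);
        packed += encode(1, 2);
        packed += encode(1, 3);

        assert(decode(packed, 0) == 1);
        assert(decode(packed, 1) == 0);
        assert(decode(packed, 2) == 2);
        assert(decode(packed, 3) == 1);
        // Each byte can only reach 255 before spilling into its neighbour, which is
        // why the reduction kernel periodically unpacks these into wider registers.
        return 0;
    }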
- quad_shift = ((key >> BIT) & QUAD_MASK) << BYTE_ENCODE_SHIFT; - } else { - quad_shift = MagnitudeShift(key & (QUAD_MASK << BIT)); - } -} - - -template -__device__ __forceinline__ void ReduceEncodedCounts( - int local_counts[LANES_PER_WARP][4], - int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) -{ - const int LOG_PARTIALS_PER_THREAD = B40C_RADIXSORT_LOG_THREADS - B40C_LOG_WARP_THREADS; - const int PARTIALS_PER_THREAD = 1 << LOG_PARTIALS_PER_THREAD; - - int encoded; - int idx = threadIdx.x & (B40C_WARP_THREADS - 1); - - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < (int) LANES_PER_WARP; j++) { - - int warp_id = (threadIdx.x >> B40C_LOG_WARP_THREADS) + (j * B40C_RADIXSORT_WARPS); - if (warp_id < SCAN_LANES) { - - // rest of my elements - #pragma unroll - for (int i = 0; i < (int) PARTIALS_PER_THREAD; i++) { - encoded = encoded_carry[warp_id][idx + (i * B40C_WARP_THREADS)]; - local_counts[j][0] += DecodeInt(encoded, 0u << BYTE_ENCODE_SHIFT); - local_counts[j][1] += DecodeInt(encoded, 1u << BYTE_ENCODE_SHIFT); - local_counts[j][2] += DecodeInt(encoded, 2u << BYTE_ENCODE_SHIFT); - local_counts[j][3] += DecodeInt(encoded, 3u << BYTE_ENCODE_SHIFT); - } - - if (FINAL_REDUCE) { - // reduce all four packed fields, leaving them in the first four elements of our row - WarpReduce(idx, &encoded_carry[warp_id][0], local_counts[j][0]); - WarpReduce(idx, &encoded_carry[warp_id][1], local_counts[j][1]); - WarpReduce(idx, &encoded_carry[warp_id][2], local_counts[j][2]); - WarpReduce(idx, &encoded_carry[warp_id][3], local_counts[j][3]); - } - } - } - - __syncthreads(); - -} - - -template -__device__ __forceinline__ void Bucket( - K input, - int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS], - PreprocessFunctor preprocess = PreprocessFunctor()) -{ - int lane, quad_shift; - preprocess(input); - DecodeDigit(input, lane, quad_shift); - encoded_carry[lane][threadIdx.x] += EncodeInt(1, quad_shift); -} - - -template -struct LoadOp; - -template -struct LoadOp -{ - static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) - { - K key = d_in_keys[offset + threadIdx.x]; - Bucket(key, encoded_carry); - } -}; - -template -struct LoadOp -{ - static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) - { - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 1), encoded_carry); - } -}; - -template -struct LoadOp -{ - static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) - { - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 2), encoded_carry); - } -}; - -template -struct LoadOp -{ - static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) - { - K keys[8]; - - keys[0] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 0) + threadIdx.x]; - keys[1] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 1) + threadIdx.x]; - keys[2] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 2) + threadIdx.x]; - keys[3] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 3) + threadIdx.x]; - - if (B40C_FERMI(__CUDA_ARCH__)) __syncthreads(); - - keys[4] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 4) + threadIdx.x]; - keys[5] = 
d_in_keys[offset + (B40C_RADIXSORT_THREADS * 5) + threadIdx.x]; - keys[6] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 6) + threadIdx.x]; - keys[7] = d_in_keys[offset + (B40C_RADIXSORT_THREADS * 7) + threadIdx.x]; - - Bucket(keys[0], encoded_carry); - Bucket(keys[1], encoded_carry); - Bucket(keys[2], encoded_carry); - Bucket(keys[3], encoded_carry); - Bucket(keys[4], encoded_carry); - Bucket(keys[5], encoded_carry); - Bucket(keys[6], encoded_carry); - Bucket(keys[7], encoded_carry); - } -}; - -template -struct LoadOp { - - static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) - { - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 8), encoded_carry); - } -}; - -template -struct LoadOp { - - static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) - { - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 16), encoded_carry); - } -}; - -template -struct LoadOp { - - static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) - { - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 32), encoded_carry); - } -}; - -template -struct LoadOp { - - static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) - { - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 64), encoded_carry); - } -}; - -template -struct LoadOp { - - static __device__ __forceinline__ void BlockOfLoads(K *d_in_keys, int offset, int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) - { - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 0), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 128), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 192), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 224), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 240), encoded_carry); - LoadOp::BlockOfLoads(d_in_keys, offset + (B40C_RADIXSORT_THREADS * 248), encoded_carry); - } -}; - - -template -__device__ __forceinline__ void ResetEncodedCarry( - int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]) -{ - #pragma unroll - for (int SCAN_LANE = 0; SCAN_LANE < (int) SCAN_LANES; SCAN_LANE++) { - encoded_carry[SCAN_LANE][threadIdx.x] = 0; - } -} - - -template -__device__ __forceinline__ int ProcessLoads( - K *d_in_keys, - int loads, - int &offset, - int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS], - int local_counts[LANES_PER_WARP][4]) -{ - // Unroll batches of loads with occasional reduction to avoid overflow - while (loads >= 32) { - - LoadOp::BlockOfLoads(d_in_keys, offset, encoded_carry); - offset += B40C_RADIXSORT_THREADS * 32; - loads -= 32; - - // Reduce int local count registers to prevent overflow - ReduceEncodedCounts( - local_counts, - encoded_carry); - - // Reset encoded counters - ResetEncodedCarry(encoded_carry); - } - - int retval = loads; - - // Wind 
down loads in decreasing batch sizes - - while (loads >= 4) { - LoadOp::BlockOfLoads(d_in_keys, offset, encoded_carry); - offset += B40C_RADIXSORT_THREADS * 4; - loads -= 4; - } - - while (loads) { - LoadOp::BlockOfLoads(d_in_keys, offset, encoded_carry); - offset += B40C_RADIXSORT_THREADS * 1; - loads--; - } - - return retval; -} - - -/****************************************************************************** - * Reduction/counting Kernel Entry Point - ******************************************************************************/ - -template -__launch_bounds__ (B40C_RADIXSORT_THREADS, B40C_RADIXSORT_REDUCE_CTA_OCCUPANCY(__CUDA_ARCH__)) -__global__ -void RakingReduction( - bool *d_from_alt_storage, - int *d_spine, - K *d_in_keys, - K *d_out_keys, - CtaDecomposition work_decomposition) -{ - const int RADIX_DIGITS = 1 << RADIX_BITS; - - const int LOG_SCAN_LANES = (RADIX_BITS >= 2) ? RADIX_BITS - 2 : 0; // Always at least one fours group - const int SCAN_LANES = 1 << LOG_SCAN_LANES; - - const int LOG_LANES_PER_WARP = (SCAN_LANES > B40C_RADIXSORT_WARPS) ? LOG_SCAN_LANES - B40C_RADIXSORT_LOG_WARPS : 0; // Always at least one fours group per warp - const int LANES_PER_WARP = 1 << LOG_LANES_PER_WARP; - - - // Each thread gets its own column of fours-groups (for conflict-free updates) - __shared__ int encoded_carry[SCAN_LANES][B40C_RADIXSORT_THREADS]; - - // Each thread is also responsible for aggregating an unencoded segment of a fours-group - int local_counts[LANES_PER_WARP][4]; - - // Determine where to read our input - bool from_alt_storage = (PASS == 0) ? false : d_from_alt_storage[PASS & 0x1]; - if (from_alt_storage) d_in_keys = d_out_keys; - - // Calculate our threadblock's range - int offset, block_elements; - if (blockIdx.x < work_decomposition.num_big_blocks) { - offset = work_decomposition.big_block_elements * blockIdx.x; - block_elements = work_decomposition.big_block_elements; - } else { - offset = (work_decomposition.normal_block_elements * blockIdx.x) + (work_decomposition.num_big_blocks * B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V)); - block_elements = work_decomposition.normal_block_elements; - } - - // Initialize local counts - #pragma unroll - for (int LANE = 0; LANE < (int) LANES_PER_WARP; LANE++) { - local_counts[LANE][0] = 0; - local_counts[LANE][1] = 0; - local_counts[LANE][2] = 0; - local_counts[LANE][3] = 0; - } - - // Reset encoded counters - ResetEncodedCarry(encoded_carry); - - // Process loads - int loads = block_elements >> B40C_RADIXSORT_LOG_THREADS; - int unreduced_loads = ProcessLoads( - d_in_keys, - loads, - offset, - encoded_carry, - local_counts); - - // Cleanup if we're the last block - if ((blockIdx.x == gridDim.x - 1) && (work_decomposition.extra_elements_last_block)) { - - const int LOADS_PER_CYCLE = B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V) / B40C_RADIXSORT_THREADS; - - // If extra guarded loads may cause overflow, reduce now and reset counters - if (unreduced_loads + LOADS_PER_CYCLE > 255) { - - ReduceEncodedCounts( - local_counts, - encoded_carry); - - ResetEncodedCarry(encoded_carry); - } - - // perform up to LOADS_PER_CYCLE extra guarded loads - #pragma unroll - for (int EXTRA_LOAD = 0; EXTRA_LOAD < (int) LOADS_PER_CYCLE; EXTRA_LOAD++) { - if (threadIdx.x + (B40C_RADIXSORT_THREADS * EXTRA_LOAD) < work_decomposition.extra_elements_last_block) { - K key = d_in_keys[offset + (B40C_RADIXSORT_THREADS * EXTRA_LOAD) + threadIdx.x]; - Bucket(key, encoded_carry); - } - } - } - - // Aggregate - ReduceEncodedCounts( - local_counts, - 
encoded_carry); - - // Write carry in parallel (carries per row are in the first four bytes of each row) - if (threadIdx.x < RADIX_DIGITS) { - - int row = threadIdx.x >> 2; - int col = threadIdx.x & 3; - d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = encoded_carry[row][col]; - } -} - -} // end namespace b40c_thrust -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_scanscatter_kernel.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_scanscatter_kernel.h deleted file mode 100644 index 1377999c76..0000000000 --- a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_scanscatter_kernel.h +++ /dev/null @@ -1,1207 +0,0 @@ -/****************************************************************************** - * - * Copyright 2010 Duane Merrill - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * - * - * - * AUTHORS' REQUEST: - * - * If you use|reference|benchmark this code, please cite our Technical - * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): - * - * @TechReport{ Merrill:Sorting:2010, - * author = "Duane Merrill and Andrew Grimshaw", - * title = "Revisiting Sorting for GPGPU Stream Architectures", - * year = "2010", - * institution = "University of Virginia, Department of Computer Science", - * address = "Charlottesville, VA, USA", - * number = "CS2010-03" - * } - * - * For more information, see our Google Code project site: - * http://code.google.com/p/back40computing/ - * - * Thanks! 
- * - ******************************************************************************/ - - -/****************************************************************************** -// Bottom-level digit scanning/scattering kernel - ******************************************************************************/ - -#pragma once - -#include "radixsort_kernel_common.h" - -namespace thrust { -namespace system { -namespace cuda { -namespace detail { -namespace detail { -namespace b40c_thrust { - -/****************************************************************************** - * Appropriate substitutes to use for out-of-bounds key (and value) offsets - ******************************************************************************/ - -template -__device__ __forceinline__ T DefaultextraValue() { - return T(); -} - -template <> -__device__ __forceinline__ unsigned char DefaultextraValue() { - return (unsigned char) -1; -} - -template <> -__device__ __forceinline__ unsigned short DefaultextraValue() { - return (unsigned short) -1; -} - -template <> -__device__ __forceinline__ unsigned int DefaultextraValue() { - return (unsigned int) -1u; -} - -template <> -__device__ __forceinline__ unsigned long DefaultextraValue() { - return (unsigned long) -1ul; -} - -template <> -__device__ __forceinline__ unsigned long long DefaultextraValue() { - return (unsigned long long) -1ull; -} - - -/****************************************************************************** - * Cycle-processing Routines - ******************************************************************************/ - -template -__device__ __forceinline__ int DecodeDigit(K key) -{ - const K DIGIT_MASK = RADIX_DIGITS - 1; - return (key >> BIT) & DIGIT_MASK; -} - - -template -__device__ __forceinline__ void DecodeDigit( - K key, - int &digit, - int &flag_offset, // in bytes - const int SET_OFFSET) -{ - const int PADDED_BYTES_PER_LANE = PADDED_PARTIALS_PER_LANE * 4; - const int SET_OFFSET_BYTES = SET_OFFSET * 4; - const K QUAD_MASK = (RADIX_DIGITS < 4) ? 0x1 : 0x3; - - digit = DecodeDigit(key); - int lane = digit >> 2; - int quad_byte = digit & QUAD_MASK; - - flag_offset = SET_OFFSET_BYTES + FastMul(lane, PADDED_BYTES_PER_LANE) + quad_byte; -} - - -template -__device__ __forceinline__ void DecodeDigits( - typename VecType::Type keypairs[SETS_PER_PASS], - int2 digits[SETS_PER_PASS], - int2 flag_offsets[SETS_PER_PASS]) // in bytes -{ - - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - - const int SET_OFFSET = SET * SCAN_LANES_PER_SET * PADDED_PARTIALS_PER_LANE; - - DecodeDigit( - keypairs[SET].x, digits[SET].x, flag_offsets[SET].x, SET_OFFSET); - - DecodeDigit( - keypairs[SET].y, digits[SET].y, flag_offsets[SET].y, SET_OFFSET); - } -} - - -template -__device__ __forceinline__ void GuardedReadSet( - T *in, - typename VecType::Type &pair, - int offset, - int extra[1], - PreprocessFunctor preprocess = PreprocessFunctor()) -{ - if (offset - extra[0] < 0) { - pair.x = in[offset]; - preprocess(pair.x); - } else { - pair.x = DefaultextraValue(); - } - - if (offset + 1 - extra[0] < 0) { - pair.y = in[offset + 1]; - preprocess(pair.y); - } else { - pair.y = DefaultextraValue(); - } -} - - -template -__device__ __forceinline__ void ReadSets( - typename VecType::Type *d_in, - typename VecType::Type pairs[SETS_PER_PASS], - const int BASE2, - int extra[1], - PreprocessFunctor preprocess = PreprocessFunctor()) -{ - if (UNGUARDED_IO) { - - // N.B. 
-- I wish we could do some pragma unrolling here too, but the compiler makes it 1% slower - if (SETS_PER_PASS > 0) pairs[0] = d_in[threadIdx.x + BASE2 + (B40C_RADIXSORT_THREADS * 0)]; - if (SETS_PER_PASS > 1) pairs[1] = d_in[threadIdx.x + BASE2 + (B40C_RADIXSORT_THREADS * 1)]; - if (SETS_PER_PASS > 2) pairs[2] = d_in[threadIdx.x + BASE2 + (B40C_RADIXSORT_THREADS * 2)]; - if (SETS_PER_PASS > 3) pairs[3] = d_in[threadIdx.x + BASE2 + (B40C_RADIXSORT_THREADS * 3)]; - - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - preprocess(pairs[SET].x); - preprocess(pairs[SET].y); - } - - } else { - - T* in = (T*) d_in; - - // N.B. -- I wish we could do some pragma unrolling here, but the compiler won't let - // us with user-defined value types (e.g., Fribbitz): "Advisory: Loop was not unrolled, cannot deduce loop trip count" - - if (SETS_PER_PASS > 0) GuardedReadSet(in, pairs[0], (threadIdx.x << 1) + (BASE2 << 1) + (B40C_RADIXSORT_THREADS * 2 * 0), extra); - if (SETS_PER_PASS > 1) GuardedReadSet(in, pairs[1], (threadIdx.x << 1) + (BASE2 << 1) + (B40C_RADIXSORT_THREADS * 2 * 1), extra); - if (SETS_PER_PASS > 2) GuardedReadSet(in, pairs[2], (threadIdx.x << 1) + (BASE2 << 1) + (B40C_RADIXSORT_THREADS * 2 * 2), extra); - if (SETS_PER_PASS > 3) GuardedReadSet(in, pairs[3], (threadIdx.x << 1) + (BASE2 << 1) + (B40C_RADIXSORT_THREADS * 2 * 3), extra); - } -} - - -template -__device__ __forceinline__ void PlacePartials( - unsigned char * base_partial, - int2 digits[SETS_PER_PASS], - int2 flag_offsets[SETS_PER_PASS]) -{ - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - base_partial[flag_offsets[SET].x] = 1; - base_partial[flag_offsets[SET].y] = 1 + (digits[SET].x == digits[SET].y); - } -} - - -template -__device__ __forceinline__ void ExtractRanks( - unsigned char * base_partial, - int2 digits[SETS_PER_PASS], - int2 flag_offsets[SETS_PER_PASS], - int2 ranks[SETS_PER_PASS]) -{ - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - ranks[SET].x = base_partial[flag_offsets[SET].x]; - ranks[SET].y = base_partial[flag_offsets[SET].y] + (digits[SET].x == digits[SET].y); - } -} - - -template -__device__ __forceinline__ void UpdateRanks( - int2 digits[SETS_PER_PASS], - int2 ranks[SETS_PER_PASS], - int digit_counts[SETS_PER_PASS][RADIX_DIGITS]) -{ - // N.B.: I wish we could pragma unroll here, but doing so currently - // results in the 3.1 compilier on 64-bit platforms generating bad - // code for SM1.3, resulting in incorrect sorting (e.g., problem size 16) - - if (SETS_PER_PASS > 0) { - ranks[0].x += digit_counts[0][digits[0].x]; - ranks[0].y += digit_counts[0][digits[0].y]; - } - if (SETS_PER_PASS > 1) { - ranks[1].x += digit_counts[1][digits[1].x]; - ranks[1].y += digit_counts[1][digits[1].y]; - } - if (SETS_PER_PASS > 2) { - ranks[2].x += digit_counts[2][digits[2].x]; - ranks[2].y += digit_counts[2][digits[2].y]; - } - if (SETS_PER_PASS > 3) { - ranks[3].x += digit_counts[3][digits[3].x]; - ranks[3].y += digit_counts[3][digits[3].y]; - } -} - -template -__device__ __forceinline__ void UpdateRanks( - int2 digits[PASSES_PER_CYCLE][SETS_PER_PASS], - int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS], - int digit_counts[PASSES_PER_CYCLE][SETS_PER_PASS][RADIX_DIGITS]) -{ - // N.B.: I wish we could pragma unroll here, but doing so currently - // results in the 3.1 compilier on 64-bit platforms generating bad - // code for SM1.3, resulting in incorrect sorting (e.g., problem size 16) - - if (PASSES_PER_CYCLE > 0) UpdateRanks(digits[0], ranks[0], 
digit_counts[0]); - if (PASSES_PER_CYCLE > 1) UpdateRanks(digits[1], ranks[1], digit_counts[1]); - if (PASSES_PER_CYCLE > 2) UpdateRanks(digits[2], ranks[2], digit_counts[2]); - if (PASSES_PER_CYCLE > 3) UpdateRanks(digits[3], ranks[3], digit_counts[3]); -} - - - -template -__device__ __forceinline__ void PrefixScanOverLanes( - int raking_segment[], - int warpscan[SCAN_LANES_PER_PASS][3][RAKING_THREADS_PER_LANE], - int copy_section) -{ - // Upsweep rake - int partial_reduction = SerialReduce(raking_segment); - - // Warpscan reduction in digit warpscan_lane - int warpscan_lane = threadIdx.x >> LOG_RAKING_THREADS_PER_LANE; - int group_prefix = WarpScan( - warpscan[warpscan_lane], - partial_reduction, - copy_section); - - // Downsweep rake - SerialScan(raking_segment, group_prefix); - -} - - -template -__device__ __forceinline__ void RecoverDigitCounts( - int warpscan[SCAN_LANES_PER_PASS][3][RAKING_THREADS_PER_LANE], - int counts[SETS_PER_PASS], - int copy_section) -{ - int my_lane = threadIdx.x >> 2; - int my_quad_byte = threadIdx.x & 3; - - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - unsigned char *warpscan_count = (unsigned char *) &warpscan[my_lane + (SCAN_LANES_PER_SET * SET)][1 + copy_section][RAKING_THREADS_PER_LANE - 1]; - counts[SET] = warpscan_count[my_quad_byte]; - } -} - -template -__device__ __forceinline__ void CorrectUnguardedSetOverflow( - int2 set_digits, - int &set_count) -{ - if (WarpVoteAll(RADIX_DIGITS, set_count <= 1)) { - // All first-pass, first set keys have same digit. - set_count = (threadIdx.x == set_digits.x) ? 256 : 0; - } -} - -template -__device__ __forceinline__ void CorrectUnguardedPassOverflow( - int2 pass_digits[SETS_PER_PASS], - int pass_counts[SETS_PER_PASS]) -{ - // N.B. -- I wish we could do some pragma unrolling here too, but the compiler won't comply, - // telling me "Advisory: Loop was not unrolled, unexpected call OPs" - - if (SETS_PER_PASS > 0) CorrectUnguardedSetOverflow(pass_digits[0], pass_counts[0]); - if (SETS_PER_PASS > 1) CorrectUnguardedSetOverflow(pass_digits[1], pass_counts[1]); - if (SETS_PER_PASS > 2) CorrectUnguardedSetOverflow(pass_digits[2], pass_counts[2]); - if (SETS_PER_PASS > 3) CorrectUnguardedSetOverflow(pass_digits[3], pass_counts[3]); -} - - -template -__device__ __forceinline__ void CorrectUnguardedCycleOverflow( - int2 cycle_digits[PASSES_PER_CYCLE][SETS_PER_PASS], - int cycle_counts[PASSES_PER_CYCLE][SETS_PER_PASS]) -{ - // N.B. 
-- I wish we could do some pragma unrolling here too, but the compiler won't comply, - // telling me "Advisory: Loop was not unrolled, unexpected call OPs" - - if (PASSES_PER_CYCLE > 0) CorrectUnguardedPassOverflow(cycle_digits[0], cycle_counts[0]); - if (PASSES_PER_CYCLE > 1) CorrectUnguardedPassOverflow(cycle_digits[1], cycle_counts[1]); -} - - -template -__device__ __forceinline__ void CorrectLastLaneOverflow(int &count, int extra[1]) -{ - if (WarpVoteAll(RADIX_DIGITS, count == 0) && (threadIdx.x == RADIX_DIGITS - 1)) { - // We're 'f' and we overflowed b/c of invalid 'f' placemarkers; the number of valid items in this set is the count of valid f's - count = extra[0] & 255; - } -} - - -template -__device__ __forceinline__ void CorrectForOverflows( - int2 digits[PASSES_PER_CYCLE][SETS_PER_PASS], - int counts[PASSES_PER_CYCLE][SETS_PER_PASS], - int extra[1]) -{ - if (!UNGUARDED_IO) { - - // Correct any overflow in the partially-filled last lane - int *linear_counts = (int *) counts; - CorrectLastLaneOverflow(linear_counts[SETS_PER_CYCLE - 1], extra); - } - - CorrectUnguardedCycleOverflow(digits, counts); -} - - -template < - typename K, - int BIT, - int RADIX_DIGITS, - int SCAN_LANES_PER_SET, - int SETS_PER_PASS, - int RAKING_THREADS_PER_PASS, - int SCAN_LANES_PER_PASS, - int LOG_RAKING_THREADS_PER_LANE, - int RAKING_THREADS_PER_LANE, - int PARTIALS_PER_SEG, - int PADDED_PARTIALS_PER_LANE, - int PASSES_PER_CYCLE> -__device__ __forceinline__ void ScanPass( - int *base_partial, - int *raking_partial, - int warpscan[SCAN_LANES_PER_PASS][3][RAKING_THREADS_PER_LANE], - typename VecType::Type keypairs[SETS_PER_PASS], - int2 digits[SETS_PER_PASS], - int2 flag_offsets[SETS_PER_PASS], - int2 ranks[SETS_PER_PASS], - int copy_section) -{ - // Reset smem - #pragma unroll - for (int SCAN_LANE = 0; SCAN_LANE < (int) SCAN_LANES_PER_PASS; SCAN_LANE++) { - base_partial[SCAN_LANE * PADDED_PARTIALS_PER_LANE] = 0; - } - - // Decode digits for first pass - DecodeDigits( - keypairs, digits, flag_offsets); - - // Encode counts into smem for first pass - PlacePartials( - (unsigned char *) base_partial, - digits, - flag_offsets); - - __syncthreads(); - - // Intra-group prefix scans for first pass - if (threadIdx.x < RAKING_THREADS_PER_PASS) { - - PrefixScanOverLanes( // first pass is offset right by one - raking_partial, - warpscan, - copy_section); - } - - __syncthreads(); - - // Extract ranks - ExtractRanks( - (unsigned char *) base_partial, - digits, - flag_offsets, - ranks); -} - - -/****************************************************************************** - * SM1.3 Local Exchange Routines - * - * Routines for exchanging keys (and values) in shared memory (i.e., local - * scattering) in order to to facilitate coalesced global scattering - ******************************************************************************/ - -template -__device__ __forceinline__ void ScatterSets( - T *d_out, - typename VecType::Type pairs[SETS_PER_PASS], - int2 offsets[SETS_PER_PASS], - const int BASE4, - int extra[1], - PostprocessFunctor postprocess = PostprocessFunctor()) -{ - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - postprocess(pairs[SET].x); - postprocess(pairs[SET].y); - } - - // N.B. 
-- I wish we could do some pragma unrolling here too, but the compiler makes it 1% slower - - if (SETS_PER_PASS > 0) { - if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 0) < extra[0])) - d_out[offsets[0].x] = pairs[0].x; - if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 1) < extra[0])) - d_out[offsets[0].y] = pairs[0].y; - } - - if (SETS_PER_PASS > 1) { - if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 2) < extra[0])) - d_out[offsets[1].x] = pairs[1].x; - if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 3) < extra[0])) - d_out[offsets[1].y] = pairs[1].y; - } - - if (SETS_PER_PASS > 2) { - if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 4) < extra[0])) - d_out[offsets[2].x] = pairs[2].x; - if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 5) < extra[0])) - d_out[offsets[2].y] = pairs[2].y; - } - - if (SETS_PER_PASS > 3) { - if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 6) < extra[0])) - d_out[offsets[3].x] = pairs[3].x; - if (UNGUARDED_IO || (threadIdx.x + BASE4 + (B40C_RADIXSORT_THREADS * 7) < extra[0])) - d_out[offsets[3].y] = pairs[3].y; - } -} - -template -__device__ __forceinline__ void PushPairs( - T *swap, - typename VecType::Type pairs[PASSES_PER_CYCLE][SETS_PER_PASS], - int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS]) -{ - #pragma unroll - for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { - - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - swap[ranks[PASS][SET].x] = pairs[PASS][SET].x; - swap[ranks[PASS][SET].y] = pairs[PASS][SET].y; - } - } -} - -template -__device__ __forceinline__ void ExchangePairs( - T *swap, - typename VecType::Type pairs[PASSES_PER_CYCLE][SETS_PER_PASS], - int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS]) -{ - // Push in Pairs - PushPairs(swap, pairs, ranks); - - __syncthreads(); - - // Extract pairs - #pragma unroll - for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { - - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - const int BLOCK = ((PASS * SETS_PER_PASS) + SET) * 2; - pairs[PASS][SET].x = swap[threadIdx.x + (B40C_RADIXSORT_THREADS * (BLOCK + 0))]; - pairs[PASS][SET].y = swap[threadIdx.x + (B40C_RADIXSORT_THREADS * (BLOCK + 1))]; - } - } -} - - -template < - typename K, - typename V, - int RADIX_DIGITS, - int BIT, - int PASSES_PER_CYCLE, - int SETS_PER_PASS, - bool UNGUARDED_IO, - typename PostprocessFunctor> -__device__ __forceinline__ void SwapAndScatterSm13( - typename VecType::Type keypairs[PASSES_PER_CYCLE][SETS_PER_PASS], - int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS], - int4 *exchange, - typename VecType::Type *d_in_values, - K *d_out_keys, - V *d_out_values, - int carry[RADIX_DIGITS], - int extra[1]) -{ - int2 offsets[PASSES_PER_CYCLE][SETS_PER_PASS]; - - // Swap keys according to ranks - ExchangePairs((K*) exchange, keypairs, ranks); - - // Calculate scatter offsets (re-decode digits from keys: it's less work than making a second exchange of digits) - #pragma unroll - for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { - - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - const int BLOCK = ((PASS * SETS_PER_PASS) + SET) * 2; - offsets[PASS][SET].x = threadIdx.x + (B40C_RADIXSORT_THREADS * (BLOCK + 0)) + carry[DecodeDigit(keypairs[PASS][SET].x)]; - offsets[PASS][SET].y = threadIdx.x + (B40C_RADIXSORT_THREADS * (BLOCK + 1)) + carry[DecodeDigit(keypairs[PASS][SET].y)]; - } - } - - // Scatter keys - #pragma unroll - 
for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { - const int BLOCK = PASS * SETS_PER_PASS * 2; - ScatterSets(d_out_keys, keypairs[PASS], offsets[PASS], B40C_RADIXSORT_THREADS * BLOCK, extra); - } - - if (!IsKeysOnly()) { - - __syncthreads(); - - // Read input data - typename VecType::Type datapairs[PASSES_PER_CYCLE][SETS_PER_PASS]; - - // N.B. -- I wish we could do some pragma unrolling here too, but the compiler won't comply, - // telling me "Advisory: Loop was not unrolled, unexpected control flow" - - if (PASSES_PER_CYCLE > 0) ReadSets >(d_in_values, datapairs[0], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 0, extra); - if (PASSES_PER_CYCLE > 1) ReadSets >(d_in_values, datapairs[1], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 1, extra); - - // Swap data according to ranks - ExchangePairs((V*) exchange, datapairs, ranks); - - // Scatter data - #pragma unroll - for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { - const int BLOCK = PASS * SETS_PER_PASS * 2; - ScatterSets >(d_out_values, datapairs[PASS], offsets[PASS], B40C_RADIXSORT_THREADS * BLOCK, extra); - } - } -} - - -/****************************************************************************** - * SM1.0 Local Exchange Routines - * - * Routines for exchanging keys (and values) in shared memory (i.e., local - * scattering) in order to to facilitate coalesced global scattering - ******************************************************************************/ - -template < - typename T, - int RADIX_DIGITS, - bool UNGUARDED_IO, - typename PostprocessFunctor> -__device__ __forceinline__ void ScatterPass( - T *swapmem, - T *d_out, - int digit_scan[2][RADIX_DIGITS], - int carry[RADIX_DIGITS], - int extra[1], - int base_digit, - PostprocessFunctor postprocess = PostprocessFunctor()) -{ - const int LOG_STORE_TXN_THREADS = B40C_LOG_MEM_BANKS(__CUDA_ARCH__); - const int STORE_TXN_THREADS = 1 << LOG_STORE_TXN_THREADS; - - int store_txn_idx = threadIdx.x & (STORE_TXN_THREADS - 1); - int store_txn_digit = threadIdx.x >> LOG_STORE_TXN_THREADS; - - int my_digit = base_digit + store_txn_digit; - if (my_digit < RADIX_DIGITS) { - - int my_exclusive_scan = digit_scan[1][my_digit - 1]; - int my_inclusive_scan = digit_scan[1][my_digit]; - int my_digit_count = my_inclusive_scan - my_exclusive_scan; - - int my_carry = carry[my_digit] + my_exclusive_scan; - int my_aligned_offset = store_txn_idx - (my_carry & (STORE_TXN_THREADS - 1)); - - while (my_aligned_offset < my_digit_count) { - - if ((my_aligned_offset >= 0) && (UNGUARDED_IO || (my_exclusive_scan + my_aligned_offset < extra[0]))) { - - T datum = swapmem[my_exclusive_scan + my_aligned_offset]; - postprocess(datum); - d_out[my_carry + my_aligned_offset] = datum; - } - my_aligned_offset += STORE_TXN_THREADS; - } - } -} - -template < - typename T, - int RADIX_DIGITS, - int PASSES_PER_CYCLE, - int SETS_PER_PASS, - bool UNGUARDED_IO, - typename PostprocessFunctor> -__device__ __forceinline__ void SwapAndScatterPairs( - typename VecType::Type pairs[PASSES_PER_CYCLE][SETS_PER_PASS], - int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS], - T *exchange, - T *d_out, - int carry[RADIX_DIGITS], - int digit_scan[2][RADIX_DIGITS], - int extra[1]) -{ - const int SCATTER_PASS_DIGITS = B40C_RADIXSORT_WARPS * (B40C_WARP_THREADS / B40C_MEM_BANKS(__CUDA_ARCH__)); - const int SCATTER_PASSES = RADIX_DIGITS / SCATTER_PASS_DIGITS; - - // Push in pairs - PushPairs(exchange, pairs, ranks); - - __syncthreads(); - - // N.B. 
-- I wish we could do some pragma unrolling here too, but the compiler won't comply, - // telling me "Advisory: Loop was not unrolled, not an innermost loop" - - if (SCATTER_PASSES > 0) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 0); - if (SCATTER_PASSES > 1) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 1); - if (SCATTER_PASSES > 2) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 2); - if (SCATTER_PASSES > 3) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 3); - if (SCATTER_PASSES > 4) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 4); - if (SCATTER_PASSES > 5) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 5); - if (SCATTER_PASSES > 6) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 6); - if (SCATTER_PASSES > 7) ScatterPass(exchange, d_out, digit_scan, carry, extra, SCATTER_PASS_DIGITS * 7); -} - - -template < - typename K, - typename V, - int RADIX_DIGITS, - int PASSES_PER_CYCLE, - int SETS_PER_PASS, - bool UNGUARDED_IO, - typename PostprocessFunctor> -__device__ __forceinline__ void SwapAndScatterSm10( - typename VecType::Type keypairs[PASSES_PER_CYCLE][SETS_PER_PASS], - int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS], - int4 *exchange, - typename VecType::Type *d_in_values, - K *d_out_keys, - V *d_out_values, - int carry[RADIX_DIGITS], - int digit_scan[2][RADIX_DIGITS], - int extra[1]) -{ - // Swap and scatter keys - SwapAndScatterPairs( - keypairs, ranks, (K*) exchange, d_out_keys, carry, digit_scan, extra); - - if (!IsKeysOnly()) { - - __syncthreads(); - - // N.B. -- I wish we could do some pragma unrolling here too, but the compiler won't comply, - // telling me "Advisory: Loop was not unrolled, unexpected control flow" - - // Read input data - typename VecType::Type datapairs[PASSES_PER_CYCLE][SETS_PER_PASS]; - if (PASSES_PER_CYCLE > 0) ReadSets >(d_in_values, datapairs[0], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 0, extra); - if (PASSES_PER_CYCLE > 1) ReadSets >(d_in_values, datapairs[1], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 1, extra); - - // Swap and scatter data - SwapAndScatterPairs >( - datapairs, ranks, (V*) exchange, d_out_values, carry, digit_scan, extra); - } -} - - -/****************************************************************************** - * Cycle of RADIXSORT_CYCLE_ELEMENTS keys (and values) - ******************************************************************************/ - -template < - typename K, - typename V, - int BIT, - bool UNGUARDED_IO, - int RADIX_DIGITS, - int LOG_SCAN_LANES_PER_SET, - int SCAN_LANES_PER_SET, - int SETS_PER_PASS, - int PASSES_PER_CYCLE, - int LOG_SCAN_LANES_PER_PASS, - int SCAN_LANES_PER_PASS, - int LOG_PARTIALS_PER_LANE, - int LOG_PARTIALS_PER_PASS, - int LOG_RAKING_THREADS_PER_PASS, - int RAKING_THREADS_PER_PASS, - int LOG_RAKING_THREADS_PER_LANE, - int RAKING_THREADS_PER_LANE, - int LOG_PARTIALS_PER_SEG, - int PARTIALS_PER_SEG, - int LOG_PARTIALS_PER_ROW, - int PARTIALS_PER_ROW, - int LOG_SEGS_PER_ROW, - int SEGS_PER_ROW, - int LOG_ROWS_PER_SET, - int LOG_ROWS_PER_LANE, - int ROWS_PER_LANE, - int LOG_ROWS_PER_PASS, - int ROWS_PER_PASS, - int MAX_EXCHANGE_BYTES, - typename PreprocessFunctor, - typename PostprocessFunctor> - -__device__ __forceinline__ void SrtsScanDigitCycle( - typename VecType::Type *d_in_keys, - typename VecType::Type *d_in_values, - K *d_out_keys, - V *d_out_values, - int4 *exchange, - int 
warpscan[SCAN_LANES_PER_PASS][3][RAKING_THREADS_PER_LANE], - int carry[RADIX_DIGITS], - int digit_scan[2][RADIX_DIGITS], - int digit_counts[PASSES_PER_CYCLE][SETS_PER_PASS][RADIX_DIGITS], - int extra[1], - int *base_partial, - int *raking_partial) -{ - - const int PADDED_PARTIALS_PER_LANE = ROWS_PER_LANE * (PARTIALS_PER_ROW + 1); - const int SETS_PER_CYCLE = PASSES_PER_CYCLE * SETS_PER_PASS; - - // N.B.: We use the following voodoo incantations to elide the compiler's miserable - // "declared but never referenced" warnings for these (which are actually used for - // template instantiation) - SuppressUnusedConstantWarning(PADDED_PARTIALS_PER_LANE); - SuppressUnusedConstantWarning(SETS_PER_CYCLE); - - typename VecType::Type keypairs[PASSES_PER_CYCLE][SETS_PER_PASS]; - int2 digits[PASSES_PER_CYCLE][SETS_PER_PASS]; - int2 flag_offsets[PASSES_PER_CYCLE][SETS_PER_PASS]; // a byte offset - int2 ranks[PASSES_PER_CYCLE][SETS_PER_PASS]; - - - //------------------------------------------------------------------------- - // Read keys - //------------------------------------------------------------------------- - - // N.B. -- I wish we could do some pragma unrolling here too, but the compiler won't comply, - // telling me "Advisory: Loop was not unrolled, unexpected control flow construct" - - // Read Keys - if (PASSES_PER_CYCLE > 0) ReadSets(d_in_keys, keypairs[0], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 0, extra); - if (PASSES_PER_CYCLE > 1) ReadSets(d_in_keys, keypairs[1], B40C_RADIXSORT_THREADS * SETS_PER_PASS * 1, extra); - - //------------------------------------------------------------------------- - // Lane-scanning Passes - //------------------------------------------------------------------------- - - #pragma unroll - for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { - - // First Pass - ScanPass( - base_partial, - raking_partial, - warpscan, - keypairs[PASS], - digits[PASS], - flag_offsets[PASS], - ranks[PASS], - PASSES_PER_CYCLE - PASS - 1); // lower passes get copied right - } - - //------------------------------------------------------------------------- - // Digit-scanning - //------------------------------------------------------------------------- - - // Recover second-half digit-counts, scan across all digit-counts - if (threadIdx.x < RADIX_DIGITS) { - - int counts[PASSES_PER_CYCLE][SETS_PER_PASS]; - - // Recover digit-counts - - #pragma unroll - for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { - RecoverDigitCounts( // first pass, offset by 1 - warpscan, - counts[PASS], - PASSES_PER_CYCLE - PASS - 1); // lower passes get copied right - } - - // Check for overflows - CorrectForOverflows( - digits, counts, extra); - - // Scan across my digit counts for each set - int exclusive_total = 0; - int inclusive_total = 0; - - #pragma unroll - for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { - - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - inclusive_total += counts[PASS][SET]; - counts[PASS][SET] = exclusive_total; - exclusive_total = inclusive_total; - } - } - - // second half of carry update - int my_carry = carry[threadIdx.x] + digit_scan[1][threadIdx.x]; - - // Perform overflow-free SIMD Kogge-Stone across digits - int digit_prefix = WarpScan( - digit_scan, - inclusive_total, - 0); - - // first-half of carry update - carry[threadIdx.x] = my_carry - digit_prefix; - - #pragma unroll - for (int PASS = 0; PASS < (int) PASSES_PER_CYCLE; PASS++) { - - #pragma unroll - for (int SET = 0; SET < (int) SETS_PER_PASS; SET++) { - 
digit_counts[PASS][SET][threadIdx.x] = counts[PASS][SET] + digit_prefix; - } - } - } - - __syncthreads(); - - //------------------------------------------------------------------------- - // Update Ranks - //------------------------------------------------------------------------- - - UpdateRanks(digits, ranks, digit_counts); - - - //------------------------------------------------------------------------- - // Scatter - //------------------------------------------------------------------------- - -#if ((__CUDA_ARCH__ < 130) || FERMI_ECC) - - SwapAndScatterSm10( - keypairs, - ranks, - exchange, - d_in_values, - d_out_keys, - d_out_values, - carry, - digit_scan, - extra); - -#else - - SwapAndScatterSm13( - keypairs, - ranks, - exchange, - d_in_values, - d_out_keys, - d_out_values, - carry, - extra); - -#endif - - __syncthreads(); - -} - - - -/****************************************************************************** - * Scan/Scatter Kernel Entry Point - ******************************************************************************/ - -template < - typename K, - typename V, - int PASS, - int RADIX_BITS, - int BIT, - typename PreprocessFunctor, - typename PostprocessFunctor> -__launch_bounds__ (B40C_RADIXSORT_THREADS, B40C_RADIXSORT_SCAN_SCATTER_CTA_OCCUPANCY(__CUDA_ARCH__)) -__global__ -void ScanScatterDigits( - bool *d_from_alt_storage, - int* d_spine, - K* d_in_keys, - K* d_out_keys, - V* d_in_values, - V* d_out_values, - CtaDecomposition work_decomposition) -{ - - const int RADIX_DIGITS = 1 << RADIX_BITS; - - const int LOG_SCAN_LANES_PER_SET = (RADIX_BITS > 2) ? RADIX_BITS - 2 : 0; // Always at one lane per set - const int SCAN_LANES_PER_SET = 1 << LOG_SCAN_LANES_PER_SET; // N.B.: we have "declared but never referenced" warnings for these, but they're actually used for template instantiation - - const int LOG_SETS_PER_PASS = B40C_RADIXSORT_LOG_SETS_PER_PASS(__CUDA_ARCH__); - const int SETS_PER_PASS = 1 << LOG_SETS_PER_PASS; - - const int LOG_PASSES_PER_CYCLE = B40C_RADIXSORT_LOG_PASSES_PER_CYCLE(__CUDA_ARCH__, K, V); - const int PASSES_PER_CYCLE = 1 << LOG_PASSES_PER_CYCLE; - - const int LOG_SCAN_LANES_PER_PASS = LOG_SETS_PER_PASS + LOG_SCAN_LANES_PER_SET; - const int SCAN_LANES_PER_PASS = 1 << LOG_SCAN_LANES_PER_PASS; - - const int LOG_PARTIALS_PER_LANE = B40C_RADIXSORT_LOG_THREADS; - - const int LOG_PARTIALS_PER_PASS = LOG_SCAN_LANES_PER_PASS + LOG_PARTIALS_PER_LANE; - - const int LOG_RAKING_THREADS_PER_PASS = B40C_RADIXSORT_LOG_RAKING_THREADS_PER_PASS(__CUDA_ARCH__); - const int RAKING_THREADS_PER_PASS = 1 << LOG_RAKING_THREADS_PER_PASS; - - const int LOG_RAKING_THREADS_PER_LANE = LOG_RAKING_THREADS_PER_PASS - LOG_SCAN_LANES_PER_PASS; - const int RAKING_THREADS_PER_LANE = 1 << LOG_RAKING_THREADS_PER_LANE; - - const int LOG_PARTIALS_PER_SEG = LOG_PARTIALS_PER_LANE - LOG_RAKING_THREADS_PER_LANE; - const int PARTIALS_PER_SEG = 1 << LOG_PARTIALS_PER_SEG; - - const int LOG_PARTIALS_PER_ROW = (LOG_PARTIALS_PER_SEG < B40C_LOG_MEM_BANKS(__CUDA_ARCH__)) ? 
B40C_LOG_MEM_BANKS(__CUDA_ARCH__) : LOG_PARTIALS_PER_SEG; // floor of MEM_BANKS partials per row - const int PARTIALS_PER_ROW = 1 << LOG_PARTIALS_PER_ROW; - const int PADDED_PARTIALS_PER_ROW = PARTIALS_PER_ROW + 1; - - const int LOG_SEGS_PER_ROW = LOG_PARTIALS_PER_ROW - LOG_PARTIALS_PER_SEG; - const int SEGS_PER_ROW = 1 << LOG_SEGS_PER_ROW; - - const int LOG_ROWS_PER_SET = LOG_PARTIALS_PER_PASS - LOG_PARTIALS_PER_ROW; - - const int LOG_ROWS_PER_LANE = LOG_PARTIALS_PER_LANE - LOG_PARTIALS_PER_ROW; - const int ROWS_PER_LANE = 1 << LOG_ROWS_PER_LANE; - - const int LOG_ROWS_PER_PASS = LOG_SCAN_LANES_PER_PASS + LOG_ROWS_PER_LANE; - const int ROWS_PER_PASS = 1 << LOG_ROWS_PER_PASS; - - const int SCAN_LANE_BYTES = ROWS_PER_PASS * PADDED_PARTIALS_PER_ROW * sizeof(int); - const int MAX_EXCHANGE_BYTES = (sizeof(K) > sizeof(V)) ? - B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V) * sizeof(K) : - B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V) * sizeof(V); - const int SCAN_LANE_INT4S = (B40C_MAX(MAX_EXCHANGE_BYTES, SCAN_LANE_BYTES) + sizeof(int4) - 1) / sizeof(int4); - - - // N.B.: We use the following voodoo incantations to elide the compiler's miserable - // "declared but never referenced" warnings for these (which are actually used for - // template instantiation) - SuppressUnusedConstantWarning(SCAN_LANES_PER_SET); - SuppressUnusedConstantWarning(PARTIALS_PER_SEG); - SuppressUnusedConstantWarning(LOG_ROWS_PER_SET); - SuppressUnusedConstantWarning(ROWS_PER_LANE); - - // scan_lanes is a int4[] to avoid alignment issues when casting to (K *) and/or (V *) - __shared__ int4 scan_lanes[SCAN_LANE_INT4S]; - __shared__ int warpscan[SCAN_LANES_PER_PASS][3][RAKING_THREADS_PER_LANE]; // One warpscan per fours-group - __shared__ int carry[RADIX_DIGITS]; - __shared__ int digit_scan[2][RADIX_DIGITS]; - __shared__ int digit_counts[PASSES_PER_CYCLE][SETS_PER_PASS][RADIX_DIGITS]; - __shared__ bool non_trivial_digit_pass; - __shared__ bool from_alt_storage; - - _B40C_REG_MISER_QUALIFIER_ int extra[1]; - _B40C_REG_MISER_QUALIFIER_ int oob[1]; - - extra[0] = (blockIdx.x == gridDim.x - 1) ? 
work_decomposition.extra_elements_last_block : 0; - - // calculate our threadblock's range - int block_elements, block_offset; - if (blockIdx.x < work_decomposition.num_big_blocks) { - block_offset = work_decomposition.big_block_elements * blockIdx.x; - block_elements = work_decomposition.big_block_elements; - } else { - block_offset = (work_decomposition.normal_block_elements * blockIdx.x) + (work_decomposition.num_big_blocks * B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V)); - block_elements = work_decomposition.normal_block_elements; - } - oob[0] = block_offset + block_elements; // out-of-bounds - - - // location for placing 2-element partial reductions in the first lane of a pass - int row = threadIdx.x >> LOG_PARTIALS_PER_ROW; - int col = threadIdx.x & (PARTIALS_PER_ROW - 1); - int *base_partial = reinterpret_cast(scan_lanes) + (row * PADDED_PARTIALS_PER_ROW) + col; - - // location for raking across all sets within a pass - int *raking_partial = 0; - - if (threadIdx.x < RAKING_THREADS_PER_PASS) { - - // initalize lane warpscans - if (threadIdx.x < RAKING_THREADS_PER_LANE) { - - #pragma unroll - for (int SCAN_LANE = 0; SCAN_LANE < (int) SCAN_LANES_PER_PASS; SCAN_LANE++) { - warpscan[SCAN_LANE][0][threadIdx.x] = 0; - } - } - - // initialize digit warpscans - if (threadIdx.x < RADIX_DIGITS) { - - // Initialize digit_scan - digit_scan[0][threadIdx.x] = 0; - digit_scan[1][threadIdx.x] = 0; - - // Determine where to read our input - from_alt_storage = (PASS == 0) ? false : d_from_alt_storage[PASS & 0x1]; - - // Read carry in parallel - int spine_digit_offset = FastMul(gridDim.x, threadIdx.x); - int my_digit_carry = d_spine[spine_digit_offset + blockIdx.x]; - carry[threadIdx.x] = my_digit_carry; - - // Determine whether or not we have work to do and setup the next round - // accordingly. Everybody but the first threadblock can determine this - // from the number of non-zero-and-non-oob digit carries. First block - // needs someone else's because he always writes the zero offset. 
- - int predicate; - if (PreprocessFunctor::MustApply() || PostprocessFunctor::MustApply()) { - - non_trivial_digit_pass = true; - - } else { - - if (blockIdx.x > 0) { - // Non-first CTA : use digit-carry from first block - my_digit_carry = d_spine[spine_digit_offset]; - } - - predicate = ((my_digit_carry > 0) && (my_digit_carry < work_decomposition.num_elements)); - non_trivial_digit_pass = (TallyWarpVote(RADIX_DIGITS, predicate, reinterpret_cast(scan_lanes)) > 0); - } - - // Let the next round know which set of buffers to use - if (blockIdx.x == 0) d_from_alt_storage[(PASS + 1) & 0x1] = from_alt_storage ^ non_trivial_digit_pass; - } - - // initialize raking segment - row = threadIdx.x >> LOG_SEGS_PER_ROW; - col = (threadIdx.x & (SEGS_PER_ROW - 1)) << LOG_PARTIALS_PER_SEG; - raking_partial = reinterpret_cast(scan_lanes) + (row * PADDED_PARTIALS_PER_ROW) + col; - } - - // Sync to acquire non_trivial_digit_pass and from_temp_storage - __syncthreads(); - - // Short-circuit this entire pass - if (!non_trivial_digit_pass) return; - - if (!from_alt_storage) { - - // Scan in tiles of cycle_elements - while (block_offset < oob[0]) { - - SrtsScanDigitCycle( - reinterpret_cast::Type *>((void *) &d_in_keys[block_offset]), - reinterpret_cast::Type *>((void *) &d_in_values[block_offset]), - d_out_keys, - d_out_values, - scan_lanes, - warpscan, - carry, - digit_scan, - digit_counts, - extra, - base_partial, - raking_partial); - - block_offset += B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V); - } - - if (extra[0]) { - - SrtsScanDigitCycle( - reinterpret_cast::Type *>((void *) &d_in_keys[block_offset]), - reinterpret_cast::Type *>((void *) &d_in_values[block_offset]), - d_out_keys, - d_out_values, - scan_lanes, - warpscan, - carry, - digit_scan, - digit_counts, - extra, - base_partial, - raking_partial); - } - - } else { - - // Scan in tiles of cycle_elements - while (block_offset < oob[0]) { - - SrtsScanDigitCycle( - reinterpret_cast::Type *>((void *) &d_out_keys[block_offset]), - reinterpret_cast::Type *>((void *) &d_out_values[block_offset]), - d_in_keys, - d_in_values, - scan_lanes, - warpscan, - carry, - digit_scan, - digit_counts, - extra, - base_partial, - raking_partial); - - block_offset += B40C_RADIXSORT_CYCLE_ELEMENTS(__CUDA_ARCH__, K, V); - } - - if (extra[0]) { - - SrtsScanDigitCycle( - reinterpret_cast::Type *>((void *) &d_out_keys[block_offset]), - reinterpret_cast::Type *>((void *) &d_out_values[block_offset]), - d_in_keys, - d_in_values, - scan_lanes, - warpscan, - carry, - digit_scan, - digit_counts, - extra, - base_partial, - raking_partial); - } - - } -} - -} // end namespace b40c_thrust -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_spine_kernel.h b/compat/thrust/system/cuda/detail/detail/b40c/radixsort_spine_kernel.h deleted file mode 100644 index 3d20f4aa79..0000000000 --- a/compat/thrust/system/cuda/detail/detail/b40c/radixsort_spine_kernel.h +++ /dev/null @@ -1,187 +0,0 @@ -/****************************************************************************** - * - * Copyright 2010 Duane Merrill - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * - * - * - * AUTHORS' REQUEST: - * - * If you use|reference|benchmark this code, please cite our Technical - * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): - * - * @TechReport{ Merrill:Sorting:2010, - * author = "Duane Merrill and Andrew Grimshaw", - * title = "Revisiting Sorting for GPGPU Stream Architectures", - * year = "2010", - * institution = "University of Virginia, Department of Computer Science", - * address = "Charlottesville, VA, USA", - * number = "CS2010-03" - * } - * - * For more information, see our Google Code project site: - * http://code.google.com/p/back40computing/ - * - * Thanks! - * - ******************************************************************************/ - - -/****************************************************************************** - * Top-level histogram/spine scanning kernel - ******************************************************************************/ - -#pragma once - -#include "radixsort_kernel_common.h" - -namespace thrust { -namespace system { -namespace cuda { -namespace detail { -namespace detail { -namespace b40c_thrust { - -/****************************************************************************** - * Scans a cycle of RADIXSORT_CYCLE_ELEMENTS elements - ******************************************************************************/ - -template -__device__ __forceinline__ void SrtsScanCycle( - int *smem_offset, - int *smem_segment, - int warpscan[2][B40C_WARP_THREADS], - int4 *in, - int4 *out, - int &carry) -{ - int4 datum; - - // read input data - datum = in[threadIdx.x]; - - smem_offset[0] = datum.x + datum.y + datum.z + datum.w; - - __syncthreads(); - - if (threadIdx.x < B40C_WARP_THREADS) { - - int partial_reduction = SerialReduce(smem_segment); - - int seed = WarpScan(warpscan, partial_reduction, 0); - seed += carry; - - SerialScan(smem_segment, seed); - - carry += warpscan[1][B40C_WARP_THREADS - 1]; - } - - __syncthreads(); - - int part0 = smem_offset[0]; - int part1; - - part1 = datum.x + part0; - datum.x = part0; - part0 = part1 + datum.y; - datum.y = part1; - - part1 = datum.z + part0; - datum.z = part0; - part0 = part1 + datum.w; - datum.w = part1; - - out[threadIdx.x] = datum; -} - - -/****************************************************************************** - * Spine/histogram Scan Kernel Entry Point - ******************************************************************************/ - -template -__global__ void SrtsScanSpine( - int *d_ispine, - int *d_ospine, - int normal_block_elements) -{ - const int LOG_PARTIALS = B40C_RADIXSORT_LOG_THREADS; - const int PARTIALS = 1 << LOG_PARTIALS; - - const int LOG_PARTIALS_PER_SEG = LOG_PARTIALS - B40C_LOG_WARP_THREADS; - const int PARTIALS_PER_SEG = 1 << LOG_PARTIALS_PER_SEG; - - const int LOG_PARTIALS_PER_ROW = (LOG_PARTIALS_PER_SEG < B40C_LOG_MEM_BANKS(__CUDA_ARCH__)) ? 
B40C_LOG_MEM_BANKS(__CUDA_ARCH__) : LOG_PARTIALS_PER_SEG; // floor of 32 elts per row - const int PARTIALS_PER_ROW = 1 << LOG_PARTIALS_PER_ROW; - - const int LOG_SEGS_PER_ROW = LOG_PARTIALS_PER_ROW - LOG_PARTIALS_PER_SEG; - const int SEGS_PER_ROW = 1 << LOG_SEGS_PER_ROW; - - const int SMEM_ROWS = PARTIALS / PARTIALS_PER_ROW; - - __shared__ int smem[SMEM_ROWS][PARTIALS_PER_ROW + 1]; - __shared__ int warpscan[2][B40C_WARP_THREADS]; - - // WAR spurious unused constant warning - SuppressUnusedConstantWarning(PARTIALS_PER_SEG); - - int *smem_segment = 0; - int carry = 0; - - int row = threadIdx.x >> LOG_PARTIALS_PER_ROW; - int col = threadIdx.x & (PARTIALS_PER_ROW - 1); - int *smem_offset = &smem[row][col]; - - if (blockIdx.x > 0) { - return; - } - - if (threadIdx.x < B40C_WARP_THREADS) { - - // two segs per row, odd segs are offset by 8 - row = threadIdx.x >> LOG_SEGS_PER_ROW; - col = (threadIdx.x & (SEGS_PER_ROW - 1)) << LOG_PARTIALS_PER_SEG; - smem_segment = &smem[row][col]; - - if (threadIdx.x < B40C_WARP_THREADS) { - carry = 0; - warpscan[0][threadIdx.x] = 0; - } - } - - // scan the spine in blocks of cycle_elements - int block_offset = 0; - while (block_offset < normal_block_elements) { - - SrtsScanCycle( - smem_offset, - smem_segment, - warpscan, - reinterpret_cast((void *) &d_ispine[block_offset]), - reinterpret_cast((void *) &d_ospine[block_offset]), - carry); - - block_offset += B40C_RADIXSORT_SPINE_CYCLE_ELEMENTS; - } -} - - -} // end namespace b40c_thrust -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/b40c/vector_types.h b/compat/thrust/system/cuda/detail/detail/b40c/vector_types.h deleted file mode 100644 index 6db7931078..0000000000 --- a/compat/thrust/system/cuda/detail/detail/b40c/vector_types.h +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Copyright 2010 Duane Merrill - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * For more information, see our Google Code project site: - * http://code.google.com/p/back40computing/ - * - * Thanks! 
- */ - -#pragma once - -#include - -namespace thrust { -namespace system { -namespace cuda { -namespace detail { -namespace detail { -namespace b40c_thrust { - -//------------------------------------------------------------------------------ -// Vector types -//------------------------------------------------------------------------------ - -template struct VecType; - - -// -// Define general vector types -// - -template -struct VecType { - K x; - typedef K Type; -}; - -template -struct VecType { - K x; - K y; - typedef VecType Type; -}; - -template -struct VecType { - K x; - K y; - K z; - K w; - typedef VecType Type; -}; - -// -// Specialize certain built-in vector types -// - -#define B40C_DEFINE_VECTOR_TYPE(base_type,short_type) \ - template<> struct VecType { typedef short_type##1 Type; }; \ - template<> struct VecType { typedef short_type##2 Type; }; \ - template<> struct VecType { typedef short_type##4 Type; }; - -B40C_DEFINE_VECTOR_TYPE(char, char) -B40C_DEFINE_VECTOR_TYPE(short, short) -B40C_DEFINE_VECTOR_TYPE(int, int) -B40C_DEFINE_VECTOR_TYPE(long, long) -B40C_DEFINE_VECTOR_TYPE(long long, longlong) -B40C_DEFINE_VECTOR_TYPE(unsigned char, uchar) -B40C_DEFINE_VECTOR_TYPE(unsigned short, ushort) -B40C_DEFINE_VECTOR_TYPE(unsigned int, uint) -B40C_DEFINE_VECTOR_TYPE(unsigned long, ulong) -B40C_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) -B40C_DEFINE_VECTOR_TYPE(float, float) -B40C_DEFINE_VECTOR_TYPE(double, double) - -#undef B40C_DEFINE_VECTOR_TYPE - -} // end namespace b40c_thrust -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/balanced_path.h b/compat/thrust/system/cuda/detail/detail/balanced_path.h deleted file mode 100644 index 51e4f5b767..0000000000 --- a/compat/thrust/system/cuda/detail/detail/balanced_path.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ -namespace balanced_path_detail -{ - -template -__host__ __device__ void BinarySearchIteration(It data, int& begin, int& end, - T key, int shift, Comp comp) { - - IntT scale = (1<< shift) - 1; - int mid = (int)((begin + scale * end)>> shift); - - T key2 = data[mid]; - bool pred = UpperBound ? 
!comp(key, key2) : comp(key2, key); - if(pred) begin = (int)mid + 1; - else end = mid; -} - -template -__host__ __device__ int BinarySearch(It data, int count, T key, Comp comp) { - int begin = 0; - int end = count; - while(begin < end) - BinarySearchIteration(data, begin, end, key, 1, comp); - return begin; -} - -template -__host__ __device__ int BiasedBinarySearch(It data, int count, T key, - IntT levels, Comp comp) { - int begin = 0; - int end = count; - - if(levels >= 4 && begin < end) - BinarySearchIteration(data, begin, end, key, 9, comp); - if(levels >= 3 && begin < end) - BinarySearchIteration(data, begin, end, key, 7, comp); - if(levels >= 2 && begin < end) - BinarySearchIteration(data, begin, end, key, 5, comp); - if(levels >= 1 && begin < end) - BinarySearchIteration(data, begin, end, key, 4, comp); - - while(begin < end) - BinarySearchIteration(data, begin, end, key, 1, comp); - return begin; -} - -template -__host__ __device__ int MergePath(It1 a, int aCount, It2 b, int bCount, int diag, Comp comp) -{ - typedef typename thrust::iterator_traits::value_type T; - - int begin = thrust::max(0, diag - bCount); - int end = thrust::min(diag, aCount); - - while(begin < end) - { - int mid = (begin + end)>> 1; - T aKey = a[mid]; - T bKey = b[diag - 1 - mid]; - bool pred = UpperBound ? comp(aKey, bKey) : !comp(bKey, aKey); - if(pred) begin = mid + 1; - else end = mid; - } - return begin; -} - - -} // end namespace balanced_path_detail - - -template -__host__ __device__ -thrust::pair - balanced_path(RandomAccessIterator1 first1, Size1 n1, - RandomAccessIterator2 first2, Size1 n2, - Size1 diag, - Size2 levels, - Compare comp) -{ - typedef typename thrust::iterator_traits::value_type T; - - Size1 aIndex = balanced_path_detail::MergePath(first1, n1, first2, n2, diag, comp); - Size1 bIndex = diag - aIndex; - - bool star = false; - if(bIndex < n2) - { - T x = first2[bIndex]; - - // Search for the beginning of the duplicate run in both A and B. - Size1 aStart = balanced_path_detail::BiasedBinarySearch(first1, aIndex, x, levels, comp); - Size1 bStart = balanced_path_detail::BiasedBinarySearch(first2, bIndex, x, levels, comp); - - // The distance between x's merge path and its lower_bound is its rank. - // We add up the a and b ranks and evenly distribute them to - // get a stairstep path. - Size1 aRun = aIndex - aStart; - Size1 bRun = bIndex - bStart; - Size1 xCount = aRun + bRun; - - // Attempt to advance b and regress a. 
- Size1 bAdvance = thrust::max(xCount >> 1, xCount - aRun); - Size1 bEnd = thrust::min(n2, bStart + bAdvance + 1); - Size1 bRunEnd = balanced_path_detail::BinarySearch(first2 + bIndex, bEnd - bIndex, x, comp) + bIndex; - bRun = bRunEnd - bStart; - - bAdvance = thrust::min(bAdvance, bRun); - Size1 aAdvance = xCount - bAdvance; - - bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun); - aIndex = aStart + aAdvance; - - if(roundUp) star = true; - } - - return thrust::make_pair(aIndex, (diag - aIndex) + star); -} - - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/cached_temporary_allocator.h b/compat/thrust/system/cuda/detail/detail/cached_temporary_allocator.h deleted file mode 100644 index 2bbd658456..0000000000 --- a/compat/thrust/system/cuda/detail/detail/cached_temporary_allocator.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - - -template class BasePolicy> - class cached_temporary_allocator - : public BasePolicy > -{ - private: - typedef thrust::detail::temporary_allocator base_allocator_type; - typedef thrust::detail::allocator_traits traits; - typedef typename traits::pointer allocator_pointer; - typedef std::multimap free_blocks_type; - typedef std::map allocated_blocks_type; - - base_allocator_type m_base_allocator; - free_blocks_type free_blocks; - allocated_blocks_type allocated_blocks; - - void free_all() - { - // deallocate all outstanding blocks in both lists - for(free_blocks_type::iterator i = free_blocks.begin(); - i != free_blocks.end(); - ++i) - { - // transform the pointer to allocator_pointer before calling deallocate - traits::deallocate(m_base_allocator, allocator_pointer(reinterpret_cast(i->second)), i->first); - } - - for(allocated_blocks_type::iterator i = allocated_blocks.begin(); - i != allocated_blocks.end(); - ++i) - { - // transform the pointer to allocator_pointer before calling deallocate - traits::deallocate(m_base_allocator, allocator_pointer(reinterpret_cast(i->first)), i->second); - } - } - - public: - cached_temporary_allocator(thrust::execution_policy &system) - : m_base_allocator(system) - {} - - ~cached_temporary_allocator() - { - // free all allocations when cached_allocator goes out of scope - free_all(); - } - - void *allocate(std::ptrdiff_t num_bytes) - { - void *result = 0; - - // search the cache for a free block - free_blocks_type::iterator free_block = free_blocks.find(num_bytes); - - if(free_block != free_blocks.end()) - { - // get the pointer - result = free_block->second; - - // erase from the free_blocks map - free_blocks.erase(free_block); - } - else - { - // no allocation of the right size exists - // create a new one 
with m_base_allocator - // allocate memory and convert to raw pointer - result = thrust::raw_pointer_cast(traits::allocate(m_base_allocator, num_bytes)); - } - - // insert the allocated pointer into the allocated_blocks map - allocated_blocks.insert(std::make_pair(result, num_bytes)); - - return result; - } - - void deallocate(void *ptr) - { - // erase the allocated block from the allocated blocks map - allocated_blocks_type::iterator iter = allocated_blocks.find(ptr); - std::ptrdiff_t num_bytes = iter->second; - allocated_blocks.erase(iter); - - // insert the block into the free blocks map - free_blocks.insert(std::make_pair(num_bytes, ptr)); - } -}; - - -// overload get_temporary_buffer on cached_temporary_allocator -// note that we take a reference to cached_temporary_allocator -template class BasePolicy> - thrust::pair - get_temporary_buffer(cached_temporary_allocator &alloc, std::ptrdiff_t n) -{ - // ask the allocator for sizeof(T) * n bytes - T* result = reinterpret_cast(alloc.allocate(sizeof(T) * n)); - - // return the pointer and the number of elements allocated - return thrust::make_pair(result,n); -} - - -// overload return_temporary_buffer on cached_temporary_allocator -// an overloaded return_temporary_buffer should always accompany -// an overloaded get_temporary_buffer -template class BasePolicy> - void return_temporary_buffer(cached_temporary_allocator &alloc, Pointer p) -{ - // return the pointer to the allocator - alloc.deallocate(thrust::raw_pointer_cast(p)); -} - - -} // end detail -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/detail/fast_scan.h b/compat/thrust/system/cuda/detail/detail/fast_scan.h deleted file mode 100644 index d095a4a2db..0000000000 --- a/compat/thrust/system/cuda/detail/detail/fast_scan.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file fast_scan.h - * \brief A fast scan for primitive types. 
- */ - -#pragma once - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ -namespace fast_scan -{ - -template -OutputIterator inclusive_scan(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - BinaryFunction binary_op); - -template -OutputIterator exclusive_scan(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - const T init, - BinaryFunction binary_op); - -} // end namespace fast_scan -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include "fast_scan.inl" - diff --git a/compat/thrust/system/cuda/detail/detail/fast_scan.inl b/compat/thrust/system/cuda/detail/detail/fast_scan.inl deleted file mode 100644 index b02763d8a9..0000000000 --- a/compat/thrust/system/cuda/detail/detail/fast_scan.inl +++ /dev/null @@ -1,753 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN - - -namespace thrust -{ -namespace detail -{ - -// forward declaration of temporary_array -template class temporary_array; - -} // end detail - -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ -namespace fast_scan -{ -namespace fast_scan_detail -{ - - -// TODO tune this -template -struct inclusive_scan_block_size -{ - private: - static const unsigned int max_memory = 16384 - 256 - 2 * sizeof(ValueType); - static const unsigned int max_block_size = max_memory / sizeof(ValueType); - static const unsigned int default_block_size = 7 * 32; - static const unsigned int block_size = (max_block_size < default_block_size) ? max_block_size : default_block_size; - - public: - static const unsigned int pass1 = block_size; - static const unsigned int pass2 = block_size; - static const unsigned int pass3 = block_size; -}; - -// TODO tune this -template -struct exclusive_scan_block_size -{ - private: - static const unsigned int max_memory = 16384 - 256 - 2 * sizeof(ValueType); - static const unsigned int max_block_size = max_memory / sizeof(ValueType); - static const unsigned int default_block_size = 5 * 32; - static const unsigned int block_size = (max_block_size < default_block_size) ? 
max_block_size : default_block_size; - - public: - static const unsigned int pass1 = block_size; - static const unsigned int pass2 = block_size; - static const unsigned int pass3 = block_size; -}; - - -template -__device__ __thrust_forceinline__ -void scan_block(Context context, SharedArray array, BinaryFunction binary_op) -{ - typedef typename thrust::iterator_value::type T; - - T val = array[context.thread_index()]; - - if (CTA_SIZE > 1) { if(context.thread_index() >= 1) { T tmp = array[context.thread_index() - 1]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 2) { if(context.thread_index() >= 2) { T tmp = array[context.thread_index() - 2]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 4) { if(context.thread_index() >= 4) { T tmp = array[context.thread_index() - 4]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 8) { if(context.thread_index() >= 8) { T tmp = array[context.thread_index() - 8]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 16) { if(context.thread_index() >= 16) { T tmp = array[context.thread_index() - 16]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 32) { if(context.thread_index() >= 32) { T tmp = array[context.thread_index() - 32]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 64) { if(context.thread_index() >= 64) { T tmp = array[context.thread_index() - 64]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 128) { if(context.thread_index() >= 128) { T tmp = array[context.thread_index() - 128]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 256) { if(context.thread_index() >= 256) { T tmp = array[context.thread_index() - 256]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 512) { if(context.thread_index() >= 512) { T tmp = array[context.thread_index() - 512]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 1024) { if(context.thread_index() >= 1024) { T tmp = array[context.thread_index() - 1024]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } -} - -template -__device__ __thrust_forceinline__ -void scan_block_n(Context context, SharedArray array, const unsigned int n, BinaryFunction binary_op) -{ - typedef typename thrust::iterator_value::type T; - - T val = array[context.thread_index()]; - - if (CTA_SIZE > 1) { if(context.thread_index() < n && context.thread_index() >= 1) { T tmp = array[context.thread_index() - 1]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 2) { if(context.thread_index() < n && context.thread_index() >= 2) { T tmp = array[context.thread_index() - 2]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 4) { if(context.thread_index() < n && context.thread_index() >= 4) { T tmp = 
array[context.thread_index() - 4]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 8) { if(context.thread_index() < n && context.thread_index() >= 8) { T tmp = array[context.thread_index() - 8]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 16) { if(context.thread_index() < n && context.thread_index() >= 16) { T tmp = array[context.thread_index() - 16]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 32) { if(context.thread_index() < n && context.thread_index() >= 32) { T tmp = array[context.thread_index() - 32]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 64) { if(context.thread_index() < n && context.thread_index() >= 64) { T tmp = array[context.thread_index() - 64]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 128) { if(context.thread_index() < n && context.thread_index() >= 128) { T tmp = array[context.thread_index() - 128]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 256) { if(context.thread_index() < n && context.thread_index() >= 256) { T tmp = array[context.thread_index() - 256]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 512) { if(context.thread_index() < n && context.thread_index() >= 512) { T tmp = array[context.thread_index() - 512]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } - if (CTA_SIZE > 1024) { if(context.thread_index() < n && context.thread_index() >= 1024) { T tmp = array[context.thread_index() - 1024]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); } -} - -template -__device__ __thrust_forceinline__ -void load_block(Context context, - const unsigned int n, - InputIterator input, - ValueType (&sdata)[K][CTA_SIZE + 1]) -{ - for(unsigned int k = 0; k < K; k++) - { - const unsigned int offset = k*CTA_SIZE + context.thread_index(); - - if (FullBlock || offset < n) - { - InputIterator temp = input + offset; - sdata[offset % K][offset / K] = *temp; - } - } - - context.barrier(); -} - -template -__device__ __thrust_forceinline__ -void store_block(Context context, - const unsigned int n, - OutputIterator output, - ValueType (&sdata)[K][CTA_SIZE + 1], - ValueType& carry) -{ - if (Inclusive) - { - for(unsigned int k = 0; k < K; k++) - { - const unsigned int offset = k*CTA_SIZE + context.thread_index(); - - if (FullBlock || offset < n) - { - OutputIterator temp = output + offset; - *temp = sdata[offset % K][offset / K]; - } - } - } - else - { - for(unsigned int k = 0; k < K; k++) - { - const unsigned int offset = k*CTA_SIZE + context.thread_index(); - - if (FullBlock || offset < n) - { - OutputIterator temp = output + offset; - *temp = (offset == 0) ? 
carry : sdata[(offset - 1) % K][(offset - 1) / K]; - } - } - } -} - -template -__device__ __thrust_forceinline__ -void upsweep_body(Context context, - const unsigned int n, - const bool carry_in, - InputIterator input, - BinaryFunction binary_op, - ValueType (&sdata)[K][CTA_SIZE + 1], - ValueType& carry) -{ - // read data - load_block(context, n, input, sdata); - - // copy into local array - ValueType ldata[K]; - for (unsigned int k = 0; k < K; k++) - ldata[k] = sdata[k][context.thread_index()]; - - // carry in - if (context.thread_index() == 0 && carry_in) - { - // XXX WAR sm_10 issue - ValueType tmp = carry; - ldata[0] = binary_op(tmp, ldata[0]); - } - - // scan local values - for(unsigned int k = 1; k < K; k++) - { - const unsigned int offset = K * context.thread_index() + k; - - if (FullBlock || offset < n) - ldata[k] = binary_op(ldata[k-1],ldata[k]); - } - - sdata[K - 1][context.thread_index()] = ldata[K - 1]; - - context.barrier(); - - // second level scan - if (FullBlock && sizeof(ValueType) > 1) // TODO investigate why this WAR is necessary - scan_block(context, sdata[K - 1], binary_op); - else - scan_block_n(context, sdata[K - 1], n / K, binary_op); - - // store carry out - if (FullBlock) - { - if (context.thread_index() == CTA_SIZE - 1) - carry = sdata[K - 1][context.thread_index()]; - } - else - { - if (context.thread_index() == (n - 1) / K) - { - ValueType sum; - - for (unsigned int k = 0; k < K; k++) - if ((n - 1) % K == k) - sum = ldata[k]; - - if (context.thread_index() > 0) - { - // WAR sm_10 issue - ValueType tmp = sdata[K - 1][context.thread_index() - 1]; - sum = binary_op(tmp, sum); - } - - carry = sum; - } - } - - context.barrier(); -} - -template -__device__ __thrust_forceinline__ -void scan_body(Context context, - const unsigned int n, - const bool carry_in, - InputIterator input, - OutputIterator output, - BinaryFunction binary_op, - ValueType (&sdata)[K][CTA_SIZE + 1], - ValueType& carry) -{ - // read data - load_block(context, n, input, sdata); - - // copy into local array - ValueType ldata[K]; - for (unsigned int k = 0; k < K; k++) - ldata[k] = sdata[k][context.thread_index()]; - - // carry in - if (context.thread_index() == 0 && carry_in) - { - // XXX WAR sm_10 issue - ValueType tmp = carry; - ldata[0] = binary_op(tmp, ldata[0]); - } - - // scan local values - for(unsigned int k = 1; k < K; k++) - { - const unsigned int offset = K * context.thread_index() + k; - - if (FullBlock || offset < n) - ldata[k] = binary_op(ldata[k-1],ldata[k]); - } - - sdata[K - 1][context.thread_index()] = ldata[K - 1]; - - context.barrier(); - - // second level scan - if (FullBlock) - scan_block(context, sdata[K - 1], binary_op); - else - scan_block_n(context, sdata[K - 1], n / K, binary_op); - - // update local values - if (context.thread_index() > 0) - { - ValueType left = sdata[K - 1][context.thread_index() - 1]; - - for(unsigned int k = 0; k < K; k++) - { - const unsigned int offset = K * context.thread_index() + k; - - if (FullBlock || offset < n) - ldata[k] = binary_op(left, ldata[k]); - } - } - - for (unsigned int k = 0; k < K; k++) - sdata[k][context.thread_index()] = ldata[k]; - - context.barrier(); - - // write data - store_block(context, n, output, sdata, carry); - - // store carry out - if (context.thread_index() == 0) - { - if (FullBlock) - carry = sdata[K - 1][CTA_SIZE - 1]; - else - carry = sdata[(n - 1) % K][(n - 1) / K]; // note: this must come after the local update - } - - context.barrier(); -} - -template -struct upsweep_intervals_closure -{ - InputIterator input; - 
ValueType * block_results; // TODO change this to ValueIterator - BinaryFunction binary_op; - Decomposition decomp; - Context context; - - typedef Context context_type; - - upsweep_intervals_closure(InputIterator input, - ValueType * block_results, - BinaryFunction binary_op, - Decomposition decomp, - Context context = Context()) - : input(input), block_results(block_results), binary_op(binary_op), decomp(decomp), context(context) {} - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename Decomposition::index_type IndexType; - - const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value; - -#if __CUDA_ARCH__ >= 200 - const unsigned int SMEM = (48 * 1024); -#else - const unsigned int SMEM = (16 * 1024) - 256; -#endif - const unsigned int MAX_K = ((SMEM - 1 * sizeof(ValueType)) / (sizeof(ValueType) * (CTA_SIZE + 1))); - const unsigned int K = (MAX_K < 6) ? MAX_K : 6; - - __shared__ uninitialized sdata; // padded to avoid bank conflicts - - __shared__ uninitialized carry; // storage for carry out - if(context.thread_index() == 0) carry.construct(); - - context.barrier(); - - thrust::system::detail::internal::index_range interval = decomp[context.block_index()]; - - IndexType base = interval.begin(); - - input += base; - - const unsigned int unit_size = K * CTA_SIZE; - - bool carry_in = false; - - // process full units - while (base + unit_size <= interval.end()) - { - const unsigned int n = unit_size; - upsweep_body(context, n, carry_in, input, binary_op, sdata.get(), carry.get()); - base += unit_size; - input += unit_size; - carry_in = true; - } - - // process partially full unit at end of input (if necessary) - if (base < interval.end()) - { - const unsigned int n = interval.end() - base; - upsweep_body(context, n, carry_in, input, binary_op, sdata.get(), carry.get()); - } - - // write interval sum - if (context.thread_index() == 0) - block_results[context.block_index()] = carry; - } -}; - - -template -struct downsweep_intervals_closure -{ - InputIterator input; - OutputIterator output; - ValueType * block_results; - BinaryFunction binary_op; - Decomposition decomp; - Context context; - - typedef Context context_type; - - downsweep_intervals_closure(InputIterator input, - OutputIterator output, - ValueType * block_results, - BinaryFunction binary_op, - Decomposition decomp, - Context context = Context()) - : input(input), output(output), block_results(block_results), binary_op(binary_op), decomp(decomp), context(context) {} - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename Decomposition::index_type IndexType; - - const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value; - -#if __CUDA_ARCH__ >= 200 - const unsigned int SMEM = (48 * 1024); -#else - const unsigned int SMEM = (16 * 1024) - 256; -#endif - const unsigned int MAX_K = ((SMEM - 1 * sizeof(ValueType))/ (sizeof(ValueType) * (CTA_SIZE + 1))); - const unsigned int K = (MAX_K < 6) ? MAX_K : 6; - - __shared__ uninitialized sdata; // padded to avoid bank conflicts - - __shared__ uninitialized carry; // storage for carry in and carry out - if(context.thread_index() == 0) carry.construct(); - - context.barrier(); - - thrust::system::detail::internal::index_range interval = decomp[context.block_index()]; - - IndexType base = interval.begin(); - - input += base; - output += base; - - const unsigned int unit_size = K * CTA_SIZE; - - bool carry_in = (Inclusive && context.block_index() == 0) ? 
false : true; - - if (carry_in) - { - if (context.thread_index() == 0) - carry = block_results[context.block_index()]; - context.barrier(); - } - - // process full units - while (base + unit_size <= interval.end()) - { - const unsigned int n = unit_size; - scan_body(context, n, carry_in, input, output, binary_op, sdata.get(), carry.get()); - base += K * CTA_SIZE; - input += K * CTA_SIZE; - output += K * CTA_SIZE; - carry_in = true; - } - - // process partially full unit at end of input (if necessary) - if (base < interval.end()) - { - const unsigned int n = interval.end() - base; - scan_body(context, n, carry_in, input, output, binary_op, sdata.get(), carry.get()); - } - } -}; - - -} // end namespace fast_scan_detail - - -template -OutputIterator inclusive_scan(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - BinaryFunction binary_op) -{ - using namespace fast_scan_detail; - - // the pseudocode for deducing the type of the temporary used below: - // - // if BinaryFunction is AdaptableBinaryFunction - // TemporaryType = AdaptableBinaryFunction::result_type - // else if OutputIterator is a "pure" output iterator - // TemporaryType = InputIterator::value_type - // else - // TemporaryType = OutputIterator::value_type - // - // XXX upon c++0x, TemporaryType needs to be: - // result_of::type - - typedef typename thrust::detail::eval_if< - thrust::detail::has_result_type::value, - thrust::detail::result_type, - thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - >::type ValueType; - - typedef unsigned int IndexType; - typedef thrust::system::detail::internal::uniform_decomposition Decomposition; - typedef thrust::detail::temporary_array ValueArray; - - if (first == last) - return output; - - Decomposition decomp = thrust::system::cuda::detail::default_decomposition(last - first); - - ValueArray block_results(exec, decomp.size()); - - // compute sum over each interval - if (thrust::detail::is_commutative::value) - { - // use reduce_intervals for commutative operators - thrust::system::cuda::detail::reduce_intervals(exec, first, block_results.begin(), binary_op, decomp); - } - else - { - const static unsigned int ThreadsPerBlock = inclusive_scan_block_size::pass1; - typedef detail::statically_blocked_thread_array Context; - - typedef upsweep_intervals_closure Closure; - Closure closure(first, - thrust::raw_pointer_cast(&block_results[0]), - binary_op, - decomp); - detail::launch_closure(closure, decomp.size(), ThreadsPerBlock); - } - - // second level inclusive scan of per-block results - { - const static unsigned int ThreadsPerBlock = inclusive_scan_block_size::pass2; - typedef detail::statically_blocked_thread_array Context; - - typedef downsweep_intervals_closure Closure; - Closure closure(thrust::raw_pointer_cast(&block_results[0]), - thrust::raw_pointer_cast(&block_results[0]), - thrust::raw_pointer_cast(&block_results[0]), // not used - binary_op, - Decomposition(decomp.size(), 1, 1)); - detail::launch_closure(closure, 1, ThreadsPerBlock); - } - - // update intervals with result of second level scan - { - const static unsigned int ThreadsPerBlock = inclusive_scan_block_size::pass3; - typedef detail::statically_blocked_thread_array Context; - - typedef downsweep_intervals_closure Closure; - Closure closure(first, - output, - thrust::raw_pointer_cast(&block_results[0]) - 1, // shift block results - binary_op, - decomp); - detail::launch_closure(closure, decomp.size(), 
ThreadsPerBlock); - } - - return output + (last - first); -} - - -template -OutputIterator exclusive_scan(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - const T init, - BinaryFunction binary_op) -{ - using namespace fast_scan_detail; - - // the pseudocode for deducing the type of the temporary used below: - // - // if BinaryFunction is AdaptableBinaryFunction - // TemporaryType = AdaptableBinaryFunction::result_type - // else if OutputIterator is a "pure" output iterator - // TemporaryType = InputIterator::value_type - // else - // TemporaryType = OutputIterator::value_type - // - // XXX upon c++0x, TemporaryType needs to be: - // result_of::type - - typedef typename thrust::detail::eval_if< - thrust::detail::has_result_type::value, - thrust::detail::result_type, - thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - >::type ValueType; - - typedef unsigned int IndexType; - typedef thrust::system::detail::internal::uniform_decomposition Decomposition; - typedef thrust::detail::temporary_array ValueArray; - - if (first == last) - return output; - - Decomposition decomp = thrust::system::cuda::detail::default_decomposition(last - first); - - ValueArray block_results(exec, decomp.size() + 1); - - // compute sum over each interval - if (thrust::detail::is_commutative::value) - { - // use reduce_intervals for commutative operators - thrust::system::cuda::detail::reduce_intervals(exec, first, block_results.begin() + 1, binary_op, decomp); - } - else - { - const static unsigned int ThreadsPerBlock = exclusive_scan_block_size::pass1; - typedef detail::statically_blocked_thread_array Context; - - typedef upsweep_intervals_closure Closure; - Closure closure(first, - thrust::raw_pointer_cast(&block_results[0]) + 1, - binary_op, - decomp); - detail::launch_closure(closure, decomp.size(), ThreadsPerBlock); - } - - // place init before per-block results - block_results[0] = init; - - // second level inclusive scan of per-block results - { - const static unsigned int ThreadsPerBlock = exclusive_scan_block_size::pass2; - typedef detail::statically_blocked_thread_array Context; - - typedef downsweep_intervals_closure Closure; - Closure closure(thrust::raw_pointer_cast(&block_results[0]), - thrust::raw_pointer_cast(&block_results[0]), - thrust::raw_pointer_cast(&block_results[0]), // not used - binary_op, - Decomposition(decomp.size() + 1, 1, 1)); - detail::launch_closure(closure, 1, ThreadsPerBlock); - } - - // update intervals with result of second level scan - { - const static unsigned int ThreadsPerBlock = exclusive_scan_block_size::pass3; - typedef detail::statically_blocked_thread_array Context; - - typedef downsweep_intervals_closure Closure; - Closure closure(first, - output, - thrust::raw_pointer_cast(&block_results[0]), // shift block results - binary_op, - decomp); - detail::launch_closure(closure, decomp.size(), ThreadsPerBlock); - } - - return output + (last - first); -} - - -} // end namespace fast_scan -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END - diff --git a/compat/thrust/system/cuda/detail/detail/launch_calculator.h b/compat/thrust/system/cuda/detail/detail/launch_calculator.h deleted file mode 100644 index 5126aa6f1a..0000000000 --- a/compat/thrust/system/cuda/detail/detail/launch_calculator.h +++ /dev/null @@ -1,82 +0,0 @@ 
-/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -template -class launch_calculator -{ - device_properties_t properties; - function_attributes_t attributes; - - public: - - launch_calculator(void); - - launch_calculator(const device_properties_t& properties, const function_attributes_t& attributes); - - thrust::tuple with_variable_block_size(void) const; - - template - thrust::tuple with_variable_block_size(UnaryFunction block_size_to_smem_size) const; - - thrust::tuple with_variable_block_size_available_smem(void) const; - - private: - - /*! Returns a pair (num_threads_per_block, num_blocks_per_multiprocessor) - * where num_threads_per_block is a valid block size for an instance of Closure - * chosen by a heuristic and num_blocks_per_multiprocessor is the maximum - * number of such blocks that can execute on a streaming multiprocessor at once. - */ - thrust::pair default_block_configuration() const; - - /*! Returns a pair (num_threads_per_block, num_blocks_per_multiprocessor) - * where num_threads_per_block is a valid block size for an instance of Closure - * chosen by a heuristic and num_blocks_per_multiprocessor is the maximum - * number of such blocks that can execute on a streaming multiprocessor at once. - * - * \param block_size_to_smem_size Mapping from num_threads_per_block to number of - * dynamically-allocated bytes of shared memory - */ - template - thrust::pair default_block_configuration(UnaryFunction block_size_to_smem_size) const; -}; - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/detail/launch_calculator.inl b/compat/thrust/system/cuda/detail/detail/launch_calculator.inl deleted file mode 100644 index b851d5fe13..0000000000 --- a/compat/thrust/system/cuda/detail/detail/launch_calculator.inl +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// do not attempt to compile this file with any other compiler -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -template -launch_calculator::launch_calculator(void) - : properties(device_properties()), - attributes(closure_attributes()) -{} - -template -launch_calculator::launch_calculator(const device_properties_t& properties, const function_attributes_t& attributes) - : properties(properties), - attributes(attributes) -{} - -template - template -thrust::pair launch_calculator::default_block_configuration(UnaryFunction block_size_to_smem_size) const -{ - // choose a block size - std::size_t num_threads_per_block = block_size_with_maximum_potential_occupancy(attributes, properties, block_size_to_smem_size); - - // choose a subscription rate - std::size_t num_blocks_per_multiprocessor = properties.maxThreadsPerMultiProcessor / num_threads_per_block; - - return thrust::make_pair(num_threads_per_block, num_blocks_per_multiprocessor); -} - - -template -thrust::pair launch_calculator::default_block_configuration(void) const -{ - // choose a block size - std::size_t num_threads_per_block = block_size_with_maximum_potential_occupancy(attributes, properties); - - // choose a subscription rate - std::size_t num_blocks_per_multiprocessor = properties.maxThreadsPerMultiProcessor / num_threads_per_block; - - return thrust::make_pair(num_threads_per_block, num_blocks_per_multiprocessor); -} - -template -thrust::tuple launch_calculator::with_variable_block_size(void) const -{ - thrust::pair config = default_block_configuration(); - return thrust::tuple(config.second * properties.multiProcessorCount, config.first, 0); -} - -template - template -thrust::tuple launch_calculator::with_variable_block_size(UnaryFunction block_size_to_smem_size) const -{ - thrust::pair config = default_block_configuration(block_size_to_smem_size); - return thrust::tuple(config.second * properties.multiProcessorCount, config.first, block_size_to_smem_size(config.first)); -} - -template -thrust::tuple launch_calculator::with_variable_block_size_available_smem(void) const -{ - thrust::pair config = default_block_configuration(); - size_t smem_per_block = proportional_smem_allocation(properties, attributes, config.second); - return thrust::tuple(config.second * properties.multiProcessorCount, config.first, smem_per_block); -} - -} // end detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - diff --git a/compat/thrust/system/cuda/detail/detail/launch_closure.h b/compat/thrust/system/cuda/detail/detail/launch_closure.h deleted file mode 100644 index c2e6c4344f..0000000000 --- a/compat/thrust/system/cuda/detail/detail/launch_closure.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -template -struct launch_bounds -{ - typedef thrust::detail::integral_constant ThreadsPerBlock; - typedef thrust::detail::integral_constant BlocksPerMultiprocessor; -}; - -struct thread_array : public launch_bounds<> -{ -// CUDA built-in variables require nvcc -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return threadIdx.x; } - __device__ __thrust_forceinline__ unsigned int thread_count(void) const { return blockDim.x * gridDim.x; } -#else - __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return 0; } - __device__ __thrust_forceinline__ unsigned int thread_count(void) const { return 0; } -#endif // THRUST_DEVICE_COMPILER_NVCC -}; - -struct blocked_thread_array : public launch_bounds<> -{ -// CUDA built-in variables require nvcc -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return threadIdx.x; } - __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return blockDim.x; } - __device__ __thrust_forceinline__ unsigned int block_index(void) const { return blockIdx.x; } - __device__ __thrust_forceinline__ unsigned int grid_dimension(void) const { return gridDim.x; } - __device__ __thrust_forceinline__ unsigned int linear_index(void) const { return block_dimension() * block_index() + thread_index(); } - __device__ __thrust_forceinline__ void barrier(void) { __syncthreads(); } -#else - __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return 0; } - __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return 0; } - __device__ __thrust_forceinline__ unsigned int block_index(void) const { return 0; } - __device__ __thrust_forceinline__ unsigned int grid_dimension(void) const { return 0; } - __device__ __thrust_forceinline__ unsigned int linear_index(void) const { return 0; } - __device__ __thrust_forceinline__ void barrier(void) { } -#endif // THRUST_DEVICE_COMPILER_NVCC -}; - -template -struct statically_blocked_thread_array : public launch_bounds<_ThreadsPerBlock,1> -{ -// CUDA built-in variables require nvcc -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return threadIdx.x; } - __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return _ThreadsPerBlock; } // minor optimization - __device__ __thrust_forceinline__ unsigned int block_index(void) const { return blockIdx.x; } - __device__ __thrust_forceinline__ unsigned int grid_dimension(void) const { return gridDim.x; } - __device__ __thrust_forceinline__ unsigned int linear_index(void) const { return block_dimension() * block_index() + thread_index(); } - __device__ __thrust_forceinline__ void barrier(void) { __syncthreads(); } -#else - __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return 0; } - __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return 0; } - __device__ __thrust_forceinline__ unsigned int block_index(void) const { return 0; } - __device__ __thrust_forceinline__ unsigned int grid_dimension(void) const { return 0; } - __device__ 
__thrust_forceinline__ unsigned int linear_index(void) const { return 0; } - __device__ __thrust_forceinline__ void barrier(void) { } -#endif // THRUST_DEVICE_COMPILER_NVCC -}; - -template - void launch_closure(Closure f, Size1 num_blocks, Size2 block_size); - -template - void launch_closure(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size); - -/*! Returns a copy of the cudaFuncAttributes structure - * that is associated with a given Closure - */ -template -function_attributes_t closure_attributes(void); - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/detail/launch_closure.inl b/compat/thrust/system/cuda/detail/detail/launch_closure.inl deleted file mode 100644 index ce39cfc136..0000000000 --- a/compat/thrust/system/cuda/detail/detail/launch_closure.inl +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace detail -{ - -// XXX WAR circular inclusion problems with this forward declaration -template class temporary_array; - -} // end detail - -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -template -__global__ __launch_bounds__(Closure::context_type::ThreadsPerBlock::value, Closure::context_type::BlocksPerMultiprocessor::value) -void launch_closure_by_value(Closure f) -{ - f(); -} - -template -__global__ __launch_bounds__(Closure::context_type::ThreadsPerBlock::value, Closure::context_type::BlocksPerMultiprocessor::value) -void launch_closure_by_pointer(const Closure *f) -{ - // copy to registers - Closure f_reg = *f; - f_reg(); -} -#else -template -void launch_closure_by_value(Closure) {} - -template -void launch_closure_by_pointer(const Closure *) {} - -#endif // THRUST_DEVICE_COMPILER_NVCC - -template - struct closure_launcher_base -{ - typedef void (*launch_function_t)(Closure); - - static launch_function_t get_launch_function(void) - { - return launch_closure_by_value; - } - - template - static void launch(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size) - { -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - if(num_blocks > 0) - { - launch_closure_by_value<<<(unsigned int) num_blocks, (unsigned int) block_size, (unsigned int) smem_size>>>(f); - synchronize_if_enabled("launch_closure_by_value"); - } -#endif // THRUST_DEVICE_COMPILER_NVCC - } -}; // end closure_launcher_base - - -template - struct closure_launcher_base -{ - typedef void (*launch_function_t)(const Closure *); - - static launch_function_t get_launch_function(void) - { - return launch_closure_by_pointer; - } - - template - static void launch(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size) - { -#if THRUST_DEVICE_COMPILER == 
THRUST_DEVICE_COMPILER_NVCC - if(num_blocks > 0) - { - // use temporary storage for the closure - // XXX use of cuda::tag is too specific here - thrust::cuda::tag cuda_tag; - thrust::host_system_tag host_tag; - thrust::detail::temporary_array closure_storage(cuda_tag, host_tag, &f, &f + 1); - - // launch - detail::launch_closure_by_pointer<<<(unsigned int) num_blocks, (unsigned int) block_size, (unsigned int) smem_size>>>((&closure_storage[0]).get()); - synchronize_if_enabled("launch_closure_by_pointer"); - } -#endif // THRUST_DEVICE_COMPILER_NVCC - } -}; - - -template - struct closure_launcher - : public closure_launcher_base -{ - typedef closure_launcher_base super_t; - - static inline const device_properties_t& device_properties(void) - { - return device_properties(); - } - - static inline function_attributes_t function_attributes(void) - { - return thrust::system::cuda::detail::function_attributes(super_t::get_launch_function()); - } - - template - static void launch(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size) - { - super_t::launch(f,num_blocks,block_size,smem_size); - } -}; - -template - void launch_closure(Closure f, Size num_blocks) -{ - launch_calculator calculator; - launch_closure(f, num_blocks, thrust::get<1>(calculator.with_variable_block_size())); -} // end launch_closure() - -template - void launch_closure(Closure f, Size1 num_blocks, Size2 block_size) -{ - launch_closure(f, num_blocks, block_size, 0u); -} // end launch_closure() - -template - void launch_closure(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size) -{ - closure_launcher::launch(f, num_blocks, block_size, smem_size); -} // end launch_closure() - - -template -function_attributes_t closure_attributes(void) -{ - typedef closure_launcher Launcher; - - // cache the result of function_attributes(), because it is slow - // only cache the first few devices - static const int max_num_devices = 16; - - static bool attributes_exist[max_num_devices] = {0}; - static function_attributes_t function_attributes[max_num_devices] = {}; - - // XXX device_id ought to be an argument to this function - int device_id = current_device(); - - if(device_id >= max_num_devices) - { - return thrust::system::cuda::detail::function_attributes(Launcher::get_launch_function()); - } - - if(!attributes_exist[device_id]) - { - function_attributes[device_id] = thrust::system::cuda::detail::function_attributes(Launcher::get_launch_function()); - - // disallow the compiler to move the write to attributes_exist[device_id] - // before the initialization of function_attributes[device_id] - __thrust_compiler_fence(); - - attributes_exist[device_id] = true; - } - - return function_attributes[device_id]; -} - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/set_operation.h b/compat/thrust/system/cuda/detail/detail/set_operation.h deleted file mode 100644 index 5475731edb..0000000000 --- a/compat/thrust/system/cuda/detail/detail/set_operation.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -template - RandomAccessIterator3 set_operation(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp, - SetOperation set_op); - - -} // end detail -} // end detail -} // end cuda -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/detail/set_operation.inl b/compat/thrust/system/cuda/detail/detail/set_operation.inl deleted file mode 100644 index 3f14379628..0000000000 --- a/compat/thrust/system/cuda/detail/detail/set_operation.inl +++ /dev/null @@ -1,639 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ -namespace set_operation_detail -{ - - -using thrust::system::cuda::detail::detail::statically_blocked_thread_array; -using thrust::detail::uint16_t; -using thrust::detail::uint32_t; - - -// empirically determined on sm_20 -// value_types larger than this will fail to launch if placed in smem -template - struct stage_through_smem -{ - static const bool value = sizeof(T) <= 6 * sizeof(uint32_t); -}; - - -// max_input_size <= 32 -template -inline __device__ - OutputIterator serial_bounded_copy_if(Size max_input_size, - InputIterator first, - uint32_t mask, - OutputIterator result) -{ - for(Size i = 0; i < max_input_size; ++i, ++first) - { - if((1< - struct find_partition_offsets_functor -{ - Size partition_size; - InputIterator1 first1; - InputIterator2 first2; - Size n1, n2; - Compare comp; - - find_partition_offsets_functor(Size partition_size, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - Compare comp) - : partition_size(partition_size), - first1(first1), first2(first2), - n1(last1 - first1), n2(last2 - first2), - comp(comp) - {} - - inline __host__ __device__ - thrust::pair operator()(Size i) const - { - Size diag = thrust::min(n1 + n2, i * partition_size); - - // XXX the correctness of balanced_path depends critically on the ll suffix below - // why??? 
- return balanced_path(first1, n1, first2, n2, diag, 4ll, comp); - } -}; - - -template - OutputIterator find_partition_offsets(thrust::cuda::execution_policy &exec, - Size num_partitions, - Size partition_size, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - Compare comp) -{ - find_partition_offsets_functor f(partition_size, first1, last1, first2, last2, comp); - - return thrust::transform(exec, - thrust::counting_iterator(0), - thrust::counting_iterator(num_partitions), - result, - f); -} - - -namespace block -{ - - -template -inline __device__ -T right_neighbor(statically_blocked_thread_array &ctx, const T &x, const T &boundary) -{ - // stage this shift to conserve smem - const unsigned int storage_size = block_size / 2; - __shared__ uninitialized_array shared; - - T result = x; - - unsigned int tid = ctx.thread_index(); - - if(0 < tid && tid <= storage_size) - { - shared[tid - 1] = x; - } - - ctx.barrier(); - - if(tid < storage_size) - { - result = shared[tid]; - } - - ctx.barrier(); - - tid -= storage_size; - if(0 < tid && tid <= storage_size) - { - shared[tid - 1] = x; - } - else if(tid == 0) - { - shared[storage_size-1] = boundary; - } - - ctx.barrier(); - - if(tid < storage_size) - { - result = shared[tid]; - } - - ctx.barrier(); - - return result; -} - - -template -inline __device__ - unsigned int bounded_count_set_operation_n(statically_blocked_thread_array &ctx, - InputIterator1 first1, uint16_t n1, - InputIterator2 first2, uint16_t n2, - Compare comp, - SetOperation set_op) -{ - unsigned int thread_idx = ctx.thread_index(); - - // find partition offsets - uint16_t diag = thrust::min(n1 + n2, thread_idx * work_per_thread); - thrust::pair thread_input_begin = balanced_path(first1, n1, first2, n2, diag, 2, comp); - thrust::pair thread_input_end = block::right_neighbor(ctx, thread_input_begin, thrust::make_pair(n1,n2)); - - __shared__ uint16_t s_thread_output_size[block_size]; - - // work_per_thread + 1 to accomodate a "starred" partition returned from balanced_path above - s_thread_output_size[thread_idx] = - set_op.count(work_per_thread + 1, - first1 + thread_input_begin.first, first1 + thread_input_end.first, - first2 + thread_input_begin.second, first2 + thread_input_end.second, - comp); - - ctx.barrier(); - - // reduce per-thread counts - thrust::system::cuda::detail::block::inplace_inclusive_scan(ctx, s_thread_output_size); - return s_thread_output_size[ctx.block_dimension() - 1]; -} - - -inline __device__ int pop_count(unsigned int x) -{ -// guard use of __popc from other compilers -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - return __popc(x); -#else - return x; -#endif -} - - - -template -inline __device__ - OutputIterator bounded_set_operation_n(statically_blocked_thread_array &ctx, - InputIterator1 first1, uint16_t n1, - InputIterator2 first2, uint16_t n2, - OutputIterator result, - Compare comp, - SetOperation set_op) -{ - unsigned int thread_idx = ctx.thread_index(); - - // find partition offsets - uint16_t diag = thrust::min(n1 + n2, thread_idx * work_per_thread); - thrust::pair thread_input_begin = balanced_path(first1, n1, first2, n2, diag, 2, comp); - thrust::pair thread_input_end = block::right_neighbor(ctx, thread_input_begin, thrust::make_pair(n1,n2)); - - typedef typename thrust::iterator_value::type value_type; - // +1 to accomodate a "starred" partition returned from balanced_path above - uninitialized_array sparse_result; - uint32_t active_mask = - set_op(work_per_thread + 
1, - first1 + thread_input_begin.first, first1 + thread_input_end.first, - first2 + thread_input_begin.second, first2 + thread_input_end.second, - sparse_result.begin(), - comp); - - __shared__ uint16_t s_thread_output_size[block_size]; - s_thread_output_size[thread_idx] = pop_count(active_mask); - - ctx.barrier(); - - // scan to turn per-thread counts into output indices - uint16_t block_output_size = thrust::system::cuda::detail::block::inplace_exclusive_scan(ctx, s_thread_output_size, 0u); - - serial_bounded_copy_if(work_per_thread + 1, sparse_result.begin(), active_mask, result + s_thread_output_size[thread_idx]); - - ctx.barrier(); - - return result + block_output_size; -} - - -template -inline __device__ - typename thrust::iterator_difference::type - count_set_operation(statically_blocked_thread_array &ctx, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - Compare comp, - SetOperation set_op) -{ - typedef typename thrust::iterator_difference::type difference; - - difference result = 0; - - thrust::pair remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2); - - // iterate until the input is consumed - while(remaining_input_size.first + remaining_input_size.second > 0) - { - // find the end of this subpartition's input - // -1 to accomodate "starred" partitions - uint16_t max_subpartition_size = block_size * work_per_thread - 1; - difference diag = thrust::min(remaining_input_size.first + remaining_input_size.second, max_subpartition_size); - thrust::pair subpartition_size = balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp); - - typedef typename thrust::iterator_value::type value_type; - if(stage_through_smem::value) - { - // load the input into __shared__ storage - __shared__ uninitialized_array s_input; - - value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first, s_input.begin()); - value_type *s_input_end2 = thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1); - - result += block::bounded_count_set_operation_n(ctx, - s_input.begin(), subpartition_size.first, - s_input_end1, subpartition_size.second, - comp, - set_op); - } - else - { - result += block::bounded_count_set_operation_n(ctx, - first1, subpartition_size.first, - first2, subpartition_size.second, - comp, - set_op); - } - - // advance input - first1 += subpartition_size.first; - first2 += subpartition_size.second; - - // decrement remaining size - remaining_input_size.first -= subpartition_size.first; - remaining_input_size.second -= subpartition_size.second; - } - - return result; -} - - -template -inline __device__ -OutputIterator set_operation(statically_blocked_thread_array &ctx, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - Compare comp, - SetOperation set_op) -{ - typedef typename thrust::iterator_difference::type difference; - - thrust::pair remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2); - - // iterate until the input is consumed - while(remaining_input_size.first + remaining_input_size.second > 0) - { - // find the end of this subpartition's input - // -1 to accomodate "starred" partitions - uint16_t max_subpartition_size = block_size * work_per_thread - 1; - difference diag = thrust::min(remaining_input_size.first + remaining_input_size.second, max_subpartition_size); - thrust::pair subpartition_size = 
balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp); - - typedef typename thrust::iterator_value::type value_type; - if(stage_through_smem::value) - { - // load the input into __shared__ storage - __shared__ uninitialized_array s_input; - - value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first, s_input.begin()); - value_type *s_input_end2 = thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1); - - result = block::bounded_set_operation_n(ctx, - s_input.begin(), subpartition_size.first, - s_input_end1, subpartition_size.second, - result, - comp, - set_op); - } - else - { - result = block::bounded_set_operation_n(ctx, - first1, subpartition_size.first, - first2, subpartition_size.second, - result, - comp, - set_op); - } - - // advance input - first1 += subpartition_size.first; - first2 += subpartition_size.second; - - // decrement remaining size - remaining_input_size.first -= subpartition_size.first; - remaining_input_size.second -= subpartition_size.second; - } - - return result; -} - - -} // end namespace block - - -template - inline __device__ void count_set_operation(statically_blocked_thread_array &ctx, - InputIterator1 input_partition_offsets, - Size num_partitions, - InputIterator2 first1, - InputIterator3 first2, - OutputIterator result, - Compare comp, - SetOperation set_op) -{ - // consume partitions - for(Size partition_idx = ctx.block_index(); - partition_idx < num_partitions; - partition_idx += ctx.grid_dimension()) - { - typedef typename thrust::iterator_difference::type difference; - - // find the partition - thrust::pair block_input_begin = input_partition_offsets[partition_idx]; - thrust::pair block_input_end = input_partition_offsets[partition_idx + 1]; - - // count the size of the set operation - difference count = block::count_set_operation(ctx, - first1 + block_input_begin.first, first1 + block_input_end.first, - first2 + block_input_begin.second, first2 + block_input_end.second, - comp, - set_op); - - if(ctx.thread_index() == 0) - { - result[partition_idx] = count; - } - } -} - - -template - struct count_set_operation_closure -{ - typedef statically_blocked_thread_array context_type; - - InputIterator1 input_partition_offsets; - Size num_partitions; - InputIterator2 first1; - InputIterator3 first2; - OutputIterator result; - Compare comp; - SetOperation set_op; - - count_set_operation_closure(InputIterator1 input_partition_offsets, - Size num_partitions, - InputIterator2 first1, - InputIterator3 first2, - OutputIterator result, - Compare comp, - SetOperation set_op) - : input_partition_offsets(input_partition_offsets), - num_partitions(num_partitions), - first1(first1), - first2(first2), - result(result), - comp(comp), - set_op(set_op) - {} - - inline __device__ void operator()() const - { - context_type ctx; - count_set_operation(ctx, input_partition_offsets, num_partitions, first1, first2, result, comp, set_op); - } -}; - - -template - count_set_operation_closure - make_count_set_operation_closure(InputIterator1 input_partition_offsets, - Size num_partitions, - InputIterator2 first1, - InputIterator3 first2, - OutputIterator result, - Compare comp, - SetOperation set_op) -{ - typedef count_set_operation_closure result_type; - return result_type(input_partition_offsets,num_partitions,first1,first2,result,comp,set_op); -} - - -template -inline __device__ - void set_operation(statically_blocked_thread_array &ctx, - 
InputIterator1 input_partition_offsets, - Size num_partitions, - InputIterator2 first1, - InputIterator3 first2, - InputIterator4 output_partition_offsets, - OutputIterator result, - Compare comp, - SetOperation set_op) -{ - // consume partitions - for(Size partition_idx = ctx.block_index(); - partition_idx < num_partitions; - partition_idx += ctx.grid_dimension()) - { - typedef typename thrust::iterator_difference::type difference; - - // find the partition - thrust::pair block_input_begin = input_partition_offsets[partition_idx]; - thrust::pair block_input_end = input_partition_offsets[partition_idx + 1]; - - // do the set operation across the partition - block::set_operation(ctx, - first1 + block_input_begin.first, first1 + block_input_end.first, - first2 + block_input_begin.second, first2 + block_input_end.second, - result + output_partition_offsets[partition_idx], - comp, - set_op); - } -} - - -template - struct set_operation_closure -{ - typedef statically_blocked_thread_array context_type; - - InputIterator1 input_partition_offsets; - Size num_partitions; - InputIterator2 first1; - InputIterator3 first2; - InputIterator4 output_partition_offsets; - OutputIterator result; - Compare comp; - SetOperation set_op; - - set_operation_closure(InputIterator1 input_partition_offsets, - Size num_partitions, - InputIterator2 first1, - InputIterator3 first2, - InputIterator4 output_partition_offsets, - OutputIterator result, - Compare comp, - SetOperation set_op) - : input_partition_offsets(input_partition_offsets), - num_partitions(num_partitions), - first1(first1), - first2(first2), - output_partition_offsets(output_partition_offsets), - result(result), - comp(comp), - set_op(set_op) - {} - - inline __device__ void operator()() const - { - context_type ctx; - set_operation(ctx, input_partition_offsets, num_partitions, first1, first2, output_partition_offsets, result, comp, set_op); - } -}; - - -template - set_operation_closure - make_set_operation_closure(InputIterator1 input_partition_offsets, - Size num_partitions, - InputIterator2 first1, - InputIterator3 first2, - InputIterator4 output_partition_offsets, - OutputIterator result, - Compare comp, - SetOperation set_op) -{ - typedef set_operation_closure result_type; - return result_type(input_partition_offsets,num_partitions,first1,first2,output_partition_offsets,result,comp,set_op); -} - - -} // end namespace set_operation_detail - - -template - OutputIterator set_operation(thrust::cuda::execution_policy &exec, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - Compare comp, - SetOperation set_op) -{ - using thrust::system::cuda::detail::device_properties; - using thrust::system::cuda::detail::detail::launch_closure; - namespace d = thrust::system::cuda::detail::detail::set_operation_detail; - - typedef typename thrust::iterator_difference::type difference; - - const difference n1 = last1 - first1; - const difference n2 = last2 - first2; - - // handle empty input - if(n1 == 0 && n2 == 0) - { - return result; - } - - const thrust::detail::uint16_t work_per_thread = 15; - const thrust::detail::uint16_t threads_per_block = 128; - const thrust::detail::uint16_t work_per_block = threads_per_block * work_per_thread; - - // -1 because balanced_path adds a single element to the end of a "starred" partition, increasing its size by one - const thrust::detail::uint16_t maximum_partition_size = work_per_block - 1; - const difference num_partitions = thrust::detail::util::divide_ri(n1 
+ n2, maximum_partition_size); - - // find input partition offsets - // +1 to handle the end of the input elegantly - thrust::detail::temporary_array, DerivedPolicy> input_partition_offsets(0, exec, num_partitions + 1); - d::find_partition_offsets(exec, input_partition_offsets.size(), maximum_partition_size, first1, last1, first2, last2, input_partition_offsets.begin(), comp); - - const difference num_blocks = thrust::min(device_properties().maxGridSize[0], num_partitions); - - // find output partition offsets - // +1 to store the total size of the total - thrust::detail::temporary_array output_partition_offsets(0, exec, num_partitions + 1); - launch_closure(d::make_count_set_operation_closure(input_partition_offsets.begin(), num_partitions, first1, first2, output_partition_offsets.begin(), comp, set_op), - num_blocks, - threads_per_block); - - // turn the output partition counts into offsets to output partitions - thrust::exclusive_scan(exec, output_partition_offsets.begin(), output_partition_offsets.end(), output_partition_offsets.begin()); - - // run the set op kernel - launch_closure(d::make_set_operation_closure(input_partition_offsets.begin(), num_partitions, first1, first2, output_partition_offsets.begin(), result, comp, set_op), - num_blocks, - threads_per_block); - - return result + output_partition_offsets[num_partitions]; -} - - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/stable_merge_sort.h b/compat/thrust/system/cuda/detail/detail/stable_merge_sort.h deleted file mode 100644 index 23f32545af..0000000000 --- a/compat/thrust/system/cuda/detail/detail/stable_merge_sort.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! 
\file stable_merge_sort_dev.h - * \brief Defines the interface for a stable merge implementation on CUDA - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -template -void stable_merge_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - -template -void stable_merge_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_begin, - RandomAccessIterator1 keys_end, - RandomAccessIterator2 values_begin, - StrictWeakOrdering comp); - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/detail/stable_merge_sort.inl b/compat/thrust/system/cuda/detail/detail/stable_merge_sort.inl deleted file mode 100644 index 0c69803294..0000000000 --- a/compat/thrust/system/cuda/detail/detail/stable_merge_sort.inl +++ /dev/null @@ -1,1103 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file stable_merge_sort.inl - * \brief Inline file for stable_merge_sort.h. - * \note This algorithm is based on the one described - * in "Designing Efficient Sorting Algorithms for - * Manycore GPUs", by Satish, Harris, and Garland. 
- */ - -#include - -#include -#include - -#include - -#include // for log2 -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN - - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ -namespace stable_merge_sort_detail -{ - - -template - struct is_block_size_valid -{ - // assume sm_10 limits - static const unsigned int max_num_smem_bytes = 16384; - - // CUDA steals 256 for itself for kernel parms - static const unsigned int num_reserved_smem_bytes = 256; - - // the number of bytes available to our kernels - static const unsigned int num_available_smem_bytes = max_num_smem_bytes - num_reserved_smem_bytes; - - // merge_small_tiles_by_key_closure is the hungriest kernel - // the block_size it uses is 2x the size of all the other kernels - // this merge_small_tiles_by_key_closure's smem requirements: - // 2 * block_size_x2 * sizeof(Key) - // + 2 * block_size_x2 * sizeof(Key) - // + 2 * block_size_x2 * sizeof(Value) - // ================================ - // 4 * (block_size) * (2 * sizeof(Key) + sizeof(Value)) - static const unsigned int num_needed_smem_bytes = 4 * (1 << log_block_size) * (2 * sizeof(Key) + sizeof(Value)); - - static const bool value = num_needed_smem_bytes <= num_available_smem_bytes; -}; - - - -// choose a (log) block_size to use for our kernels -template - struct select_log_block_size - : thrust::detail::eval_if< - is_block_size_valid::value, - thrust::detail::integral_constant, - select_log_block_size - >::type -{}; - - -// don't recurse lower than block_size < 128 -template - struct select_log_block_size<6, Key, Value> -{ - // no block size exists which can satisfy the storage demands -}; - - -template - struct block_size -{ - // prefer block_size == 512, go lower if we need to - static const unsigned int value = 1 << select_log_block_size<8, Key, Value>::value; -}; - - -template -inline unsigned int max_grid_size(Size block_size) -{ - const device_properties_t& properties = device_properties(); - - const unsigned int max_threads = properties.maxThreadsPerMultiProcessor * properties.multiProcessorCount; - const unsigned int max_blocks = properties.maxGridSize[0]; - - return std::min(max_blocks, 3 * max_threads / block_size); -} // end max_grid_size() - - -// Base case for the merge algorithm: merges data where tile_size <= block_size. -// Works by loading two or more tiles into shared memory and doing a binary search. 
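The base case described in the comment above merges two sorted tiles by giving every element its output position independently: its index within its own tile plus its rank, found by binary search, in the partner tile. A minimal host-side sketch of that co-ranking idea follows (illustrative only, not part of the patch; function and variable names are mine). Using lower_bound for one side and upper_bound for the other keeps equal keys stable, which is what the shared-memory kernel also relies on.

    // Illustrative host-side sketch of the co-ranking merge used by the
    // shared-memory base case: each element's output slot is its own index
    // plus its rank in the partner tile.
    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <vector>

    std::vector<int> corank_merge(const std::vector<int>& a, const std::vector<int>& b)
    {
        std::vector<int> out(a.size() + b.size());
        for (std::size_t i = 0; i < a.size(); ++i) {
            // rank of a[i] in b: lower_bound places a's copies of equal keys first
            std::size_t rank = std::lower_bound(b.begin(), b.end(), a[i]) - b.begin();
            out[i + rank] = a[i];
        }
        for (std::size_t j = 0; j < b.size(); ++j) {
            // rank of b[j] in a: upper_bound keeps b's equal keys after a's
            std::size_t rank = std::upper_bound(a.begin(), a.end(), b[j]) - a.begin();
            out[j + rank] = b[j];
        }
        return out;   // every slot written exactly once, result is sorted and stable
    }

    int main()
    {
        std::vector<int> a = {1, 3, 3, 7}, b = {2, 3, 8};
        std::vector<int> ref(a.size() + b.size());
        std::merge(a.begin(), a.end(), b.begin(), b.end(), ref.begin());
        assert(corank_merge(a, b) == ref);
        return 0;
    }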
-template -struct merge_small_tiles_by_key_closure -{ - typedef Context context_type; - - RandomAccessIterator1 keys_first; - RandomAccessIterator2 values_first; - const unsigned int n; - const unsigned int log_tile_size; - RandomAccessIterator3 keys_result; - RandomAccessIterator4 values_result; - StrictWeakOrdering comp; - context_type context; - - // these members are derivable from block_size, n, and log_tile_size - unsigned int index_of_last_block; - unsigned int index_of_last_tile_in_last_block; - unsigned int size_of_last_tile; - - merge_small_tiles_by_key_closure - (RandomAccessIterator1 keys_first, - RandomAccessIterator2 values_first, - const unsigned int n, - const unsigned int log_tile_size, - RandomAccessIterator3 keys_result, - RandomAccessIterator4 values_result, - StrictWeakOrdering comp, - Context context = Context()) - : keys_first(keys_first), values_first(values_first), - n(n), - log_tile_size(log_tile_size), - keys_result(keys_result), values_result(values_result), - comp(comp), - context(context) - { - // compute the number of tiles, including a possible partial tile - unsigned int tile_size = 1 << log_tile_size; - unsigned int num_tiles = thrust::detail::util::divide_ri(n, tile_size); - unsigned int partial_tile_size = n % tile_size; - - // compute the number of logical thread blocks, including a possible partial block - unsigned int tiles_per_block = block_size / tile_size; - unsigned int num_blocks = thrust::detail::util::divide_ri(num_tiles, tiles_per_block); - unsigned int partial_block_size = num_tiles % tiles_per_block; - - // compute the number of tiles in the last block, which might be of partial size - unsigned int number_of_tiles_in_last_block = partial_block_size ? partial_block_size : tiles_per_block; - - size_of_last_tile = partial_tile_size ? partial_tile_size : tile_size; - index_of_last_tile_in_last_block = number_of_tiles_in_last_block - 1; - index_of_last_block = num_blocks - 1; - } - - unsigned int grid_size() const - { - const unsigned int max_num_blocks = max_grid_size(block_size); - const unsigned int num_logical_blocks = index_of_last_block + 1; - return thrust::min(num_logical_blocks, max_num_blocks); - } - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename iterator_value::type KeyType; - typedef typename iterator_value::type ValueType; - - // load (2*block_size) elements into shared memory. These (2*block_size) elements belong to (2*block_size)/tile_size different tiles. 
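The closure below loads a window of 2*block_size consecutive elements and pairs up the tiles inside it with a shift and an XOR: the tile an element belongs to is its thread index shifted right by log_tile_size, and its merge partner is that tile index with the low bit flipped. A tiny standalone sketch of the indexing (hypothetical names, no CUDA needed) is:

    // Standalone sketch of the tile indexing: within a window of
    // 2 * block_size elements, tile 0 merges with tile 1, tile 2 with 3, ...
    #include <cstdio>

    int main()
    {
        const unsigned log_tile_size = 4;                      // tile_size = 16
        const unsigned window = 64;                            // a 2 * block_size window of elements
        for (unsigned thread = 0; thread < window; thread += (1u << log_tile_size)) {
            unsigned tile       = thread >> log_tile_size;     // tile this element belongs to
            unsigned other_tile = tile ^ 1;                    // its merge partner
            std::printf("tile %u merges with tile %u\n", tile, other_tile);
        }
        return 0;
    }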
- __shared__ uninitialized_array key; - __shared__ uninitialized_array outkey; - __shared__ uninitialized_array outvalue; - - const unsigned int grid_size = context.grid_dimension() * context.block_dimension(); - - unsigned int block_idx = context.block_index(); - - // the global index of this task - unsigned int i = context.thread_index() + context.block_index() * context.block_dimension(); - - // advance iterators - keys_first += i; - values_first += i; - keys_result += i; - values_result += i; - - for(; - block_idx <= index_of_last_block; - block_idx += context.grid_dimension(), i += grid_size, keys_first += grid_size, values_first += grid_size, keys_result += grid_size, values_result += grid_size) - { - KeyType my_key; - - // copy over inputs to shared memory - if(i < n) - { - key[context.thread_index()] = my_key = *keys_first; - } // end if - - // the tile to which the element belongs - unsigned int tile_index = context.thread_index()>>log_tile_size; - - // figure out the index and size of the other tile - unsigned int other_tile_index = tile_index^1; - unsigned int other_tile_size = (1< - class static_strided_integer_range -{ - // XXX cudafe doesn't like this private for some reason - //private: - public: - typedef typename thrust::counting_iterator counting_iterator; - - struct stride_functor - : public thrust::unary_function - { - inline __host__ __device__ - unsigned int operator()(unsigned int i) const - { - return stride * i; - } - }; - - public: - typedef typename thrust::transform_iterator iterator; - - static_strided_integer_range(unsigned int num_strides) - : m_begin(iterator(counting_iterator(0), stride_functor())), - m_end(iterator(counting_iterator(num_strides), stride_functor())) - {} - - iterator begin() const - { - return m_begin; - } - - iterator end() const - { - return m_end; - } - - private: - iterator m_begin, m_end; -}; - - -///////////////////// Find the rank of each extracted element in both arrays //////////////////////////////////////// -///////////////////// This breaks up the array into independent segments to merge //////////////////////////////////////// -// Inputs: d_splitters, d_splittes_pos: the merged array of splitters with corresponding positions. -// d_srcData: input data, datasize: number of entries in d_srcData -// N_SPLITTERS the number of splitters, log_blocksize: log of the size of each block of sorted data -// log_num_merged_splitters_per_tile = log of the number of merged splitters. ( = log_blocksize - 7). -// Output: d_rank1, d_rank2: ranks of each splitter in d_splitters in the block to which it belongs -// (say i) and its corresponding block (block i+1). 
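To make the above concrete, here is a host-side sketch of splitter ranking for one even/odd tile pair under simplified assumptions: one splitter is taken per block of each sorted tile and ranked in the partner tile, and those ranks cut the tile pair into independently mergeable segments. The deleted kernel first merges the splitters of both tiles so each binary search can be confined to a single block of the partner tile; this sketch searches the whole partner tile for brevity, and all names are mine.

    // Host-side sketch: rank one splitter per block from each tile in the
    // partner tile; these ranks delimit independently mergeable segments.
    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const std::size_t block = 4;                            // splitter spacing
        std::vector<int> even = {0, 2, 4, 6, 8, 10, 12, 14};    // sorted tile
        std::vector<int> odd  = {1, 3, 5, 7, 9, 11, 13, 15};    // its partner tile

        for (std::size_t i = 0; i < even.size(); i += block) {
            std::size_t rank_in_even = i;                       // its own position is its rank
            std::size_t rank_in_odd  =
                std::lower_bound(odd.begin(), odd.end(), even[i]) - odd.begin();
            std::printf("splitter %2d from even tile: ranks (%zu, %zu)\n",
                        even[i], rank_in_even, rank_in_odd);
        }
        for (std::size_t i = 0; i < odd.size(); i += block) {
            std::size_t rank_in_odd  = i;
            std::size_t rank_in_even =
                std::upper_bound(even.begin(), even.end(), odd[i]) - even.begin();
            std::printf("splitter %2d from odd tile:  ranks (%zu, %zu)\n",
                        odd[i], rank_in_even, rank_in_odd);
        }
        return 0;
    }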
-template -struct rank_splitters_closure -{ - typedef Context context_type; - - static const unsigned int log_block_size = thrust::detail::mpl::math::log2::value; - - RandomAccessIterator1 splitters_first; - RandomAccessIterator2 splitters_pos_first; - RandomAccessIterator3 keys_first; - RandomAccessIterator4 ranks_result1; - RandomAccessIterator4 ranks_result2; - unsigned int num_splitters; - unsigned int num_keys; - unsigned int log_tile_size; - thrust::detail::device_function< - StrictWeakOrdering, - bool - > comp; - context_type context; - - // this member is derivable from those received in the constructor - unsigned int log_num_merged_splitters_per_tile; - - rank_splitters_closure(RandomAccessIterator1 splitters_first, - RandomAccessIterator2 splitters_pos_first, - RandomAccessIterator3 keys_first, - unsigned int num_splitters, - unsigned int num_keys, - unsigned int log_tile_size, - RandomAccessIterator4 ranks_result1, - RandomAccessIterator4 ranks_result2, - StrictWeakOrdering comp, - context_type context = context_type()) - : splitters_first(splitters_first), splitters_pos_first(splitters_pos_first), - keys_first(keys_first), - ranks_result1(ranks_result1), ranks_result2(ranks_result2), - num_splitters(num_splitters), num_keys(num_keys), - log_tile_size(log_tile_size), - comp(comp), context(context) - { - // the number of splitters in each tile before merging - const unsigned int log_num_splitters_per_tile = log_tile_size - log_block_size; - - // the number of splitters in each merged tile - log_num_merged_splitters_per_tile = log_num_splitters_per_tile + 1; - } - - inline unsigned int grid_size() const - { - unsigned int num_blocks = num_splitters / block_size; - if(num_splitters % block_size) ++num_blocks; - - // compute the maximum number of block_size we can launch on this arch - const unsigned int max_num_blocks = max_grid_size(block_size); - - return min(num_blocks, max_num_blocks); - } - - /*! this member function returns the index of the (odd,even) block pair - * that the splitter of interest belongs to - * \param splitter_idx The index of the splitter in the splitters list - * \return The global index of the (odd,even) block pair - */ - __device__ __thrust_forceinline__ - unsigned int block_pair_idx(unsigned int splitter_idx) const - { - return splitter_idx >> log_num_merged_splitters_per_tile; - } - - /*! This member function returns the end of the search range in the other tile in - * which the splitter of interest needs to be ranked. - * \param splitter_idx The index of the splitter in the splitters array - * \param splitter_global_idx The index of the splitter in the global array of elements - * \param tile_idx The index of the tile to which the splitter belongs. - * \return The half-open interval in the other tile in which the splitter needs to be ranked. - * [first_index_to_search, size_of_interval) - */ - __device__ __thrust_forceinline__ - thrust::pair search_interval(unsigned int splitter_idx, unsigned int splitter_global_idx, unsigned int tile_idx) const - { - // We want to compute the ranks of the splitter in d_srcData1 and d_srcData2 - // for instance, if the splitter belongs to d_srcData1, then - // (1) the rank in d_srcData1 is simply given by its splitter_global_idx - // (2) to find the rank in d_srcData2, we first find the block in d_srcData2 where inp appears. 
- // We do this by noting that we have already merged/sorted splitters, and thus the rank - // of inp in the elements of d_srcData2 that are present in splitters is given by - // position of inp in d_splitters - rank of inp in elements of d_srcData1 in splitters - // = i - splitter_global_idx - // This also gives us the block of d_srcData2 that the splitter belongs in, since we have one - // element in splitters per block of d_srcData2. - - // We now perform a binary search over this block of d_srcData2 to find the rank of inp in d_srcData2. - // start and end are the start and end indices of this block in d_srcData2, forming the bounds of the binary search. - // Note that this binary search is in global memory with uncoalesced loads. However, we only find the ranks - // of a small set of elements, one per splitter: thus it is not the performance bottleneck. - - // the local index of the splitter within the (odd, even) block pair. - const unsigned int splitter_block_pair_idx = splitter_idx - (block_pair_idx(splitter_idx)<> log_block_size; - - // find the end of the search range in the other tile - unsigned int end = (( splitter_block_pair_idx - block_tile_idx) << log_block_size); - - // begin by assuming the search range is the size of a full block - unsigned int other_block_size = block_size; - - // the index of the other tile can be found with - const unsigned int other_tile_idx = tile_idx ^ 1; - - // the size of the other tile can be less than tile_size if the it is the last tile. - unsigned int other_tile_size = min(1 << log_tile_size, num_keys - (other_tile_idx< other_tile_size) - { - // the other block has partial size - end = other_tile_size; - other_block_size = num_keys % block_size; - } - else if(end == 0) - { - // when the search range is empty - // the other_block_size is 0 - other_block_size = 0; - } - - // the search range begins other_block_size elements before the end - unsigned int start = end - other_block_size; - - return thrust::make_pair(start,other_block_size); - } - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename iterator_value::type KeyType; - typedef typename iterator_value::type IndexType; - - const unsigned int grid_size = context.grid_dimension() * context.block_dimension(); - - unsigned int splitter_idx = context.thread_index() + context.block_index() * context.block_dimension(); - - // advance iterators - splitters_first += splitter_idx; - splitters_pos_first += splitter_idx; - ranks_result1 += splitter_idx; - ranks_result2 += splitter_idx; - - for(; - splitter_idx < num_splitters; - splitter_idx += grid_size, splitters_first += grid_size, splitters_pos_first += grid_size, ranks_result1 += grid_size, ranks_result2 += grid_size) - { - // the index of the splitter within the global array of elements - IndexType splitter_global_idx = *splitters_pos_first; - - // the tile to which the splitter belongs. - unsigned int tile_idx = (splitter_global_idx >> log_tile_size); - - // the index of the "other" tile which which tile_idx must be merged. 
- unsigned int other_tile_idx = tile_idx^1; - - // compute the interval in the other tile to search - unsigned int start, n; - thrust::tie(start,n) = search_interval(splitter_idx, splitter_global_idx, tile_idx); - - // point to the beginning of the other tile - RandomAccessIterator3 other_tile_begin = keys_first + (other_tile_idx< - void rank_splitters(RandomAccessIterator1 splitters_first, - RandomAccessIterator1 splitters_last, - RandomAccessIterator2 splitter_positions_first, - RandomAccessIterator3 keys_first, - RandomAccessIterator3 keys_last, - size_t log_tile_size, - RandomAccessIterator4 ranks_result1, - RandomAccessIterator4 ranks_result2, - StrictWeakOrdering comp) -{ - typedef rank_splitters_closure< - block_size, - RandomAccessIterator1, - RandomAccessIterator2, - RandomAccessIterator3, - RandomAccessIterator4, - StrictWeakOrdering, - detail::statically_blocked_thread_array - > Closure; - - Closure closure(splitters_first, - splitter_positions_first, - keys_first, - splitters_last - splitters_first, - keys_last - keys_first, - log_tile_size, - ranks_result1, - ranks_result2, - comp); - - detail::launch_closure(closure, closure.grid_size(), block_size); -} - - -template -__device__ - void copy_n(Context context, - RandomAccessIterator1 first1, - RandomAccessIterator2 first2, - Size n, - RandomAccessIterator3 result1, - RandomAccessIterator4 result2) -{ - for(Size i = context.thread_index(); - i < n; - i += context.block_dimension()) - { - result1[i] = first1[i]; - result2[i] = first2[i]; - } -} - - -///////////////////// MERGE TWO INDEPENDENT SEGMENTS USING BINARY SEARCH IN SHARED MEMORY //////////////////////////////////////// -// NOTE: This is the most compute-intensive part of the algorithm. -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Thread block i merges entries between rank[i] and rank[i+1]. These can be independently -// merged and concatenated, as noted above. -// Each thread in the thread block i does a binary search of one element between rank[i] -> rank[i+1] in the -// other array. - -// Inputs: srcdatakey, value: inputs -// log_blocksize, log_num_merged_splitters_per_tile: as in previous functions -// Outputs: resultdatakey, resultdatavalue: output merged arrays are written here. 
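A host-side sketch of the scheduling described above (illustrative, with hand-picked ranks standing in for the output of the splitter-ranking step): each segment pair delimited by consecutive ranks is merged on its own and written at offset rank1[k] + rank2[k], so the pieces concatenate directly into the full merge. std::merge stands in for the shared-memory merge a thread block performs.

    // Host-side sketch: merge matching segments independently and write each
    // at offset rank1[k] + rank2[k]. The ranks are hand-picked here but must
    // be merge-consistent, which is what the splitter-ranking step guarantees.
    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <vector>

    int main()
    {
        std::vector<int> a = {0, 2, 4, 6, 8, 10};
        std::vector<int> b = {1, 3, 5, 7, 9, 11};

        std::vector<std::size_t> rank1 = {0, 3, 6};   // segment boundaries in a
        std::vector<std::size_t> rank2 = {0, 3, 6};   // segment boundaries in b

        std::vector<int> out(a.size() + b.size());
        for (std::size_t k = 0; k + 1 < rank1.size(); ++k) {
            std::size_t dst = rank1[k] + rank2[k];    // output offset of this piece
            std::merge(a.begin() + rank1[k], a.begin() + rank1[k + 1],
                       b.begin() + rank2[k], b.begin() + rank2[k + 1],
                       out.begin() + dst);
        }

        // the concatenated pieces equal the full merge
        std::vector<int> ref(out.size());
        std::merge(a.begin(), a.end(), b.begin(), b.end(), ref.begin());
        assert(out == ref);
        return 0;
    }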
-template -struct merge_subtiles_by_key_closure -{ - typedef Context context_type; - static const unsigned int log_block_size = thrust::detail::mpl::math::log2::value; - - RandomAccessIterator1 keys_first; - RandomAccessIterator2 values_first; - unsigned int n; - RandomAccessIterator3 ranks_first1; - RandomAccessIterator4 ranks_first2; - const unsigned int tile_size; - const unsigned int num_splitters; - RandomAccessIterator5 keys_result; - RandomAccessIterator6 values_result; - StrictWeakOrdering comp; - Context context; - - // this member is derivable from the constructor parameters - unsigned int log_num_merged_splitters_per_tile; - - merge_subtiles_by_key_closure - (RandomAccessIterator1 keys_first, - RandomAccessIterator2 values_first, - unsigned int n, - RandomAccessIterator3 ranks_first1, - RandomAccessIterator4 ranks_first2, - const unsigned int log_tile_size, - const unsigned int num_splitters, - RandomAccessIterator5 keys_result, - RandomAccessIterator6 values_result, - StrictWeakOrdering comp, - Context context = Context()) - : keys_first(keys_first), values_first(values_first), n(n), - ranks_first1(ranks_first1), ranks_first2(ranks_first2), - tile_size(1 << log_tile_size), - num_splitters(num_splitters), - keys_result(keys_result), values_result(values_result), - comp(comp), context(context) - { - const unsigned int log_num_splitters_per_tile = log_tile_size - log_block_size; - log_num_merged_splitters_per_tile = log_num_splitters_per_tile + 1; - } - - unsigned int grid_size() const - { - const unsigned int max_num_blocks = max_grid_size(block_size); - return thrust::min(num_splitters, max_num_blocks); - } - - __device__ __thrust_forceinline__ - unsigned int even_offset(unsigned int oddeven_blockid) const - { - return oddeven_blockid << (log_num_merged_splitters_per_tile + log_block_size); - } - - __device__ __thrust_forceinline__ - void get_partition(unsigned int partition_idx, unsigned int oddeven_blockid, - unsigned int &rank1, unsigned int &size1, - unsigned int &rank2, unsigned int &size2) const - { - // XXX this logic would be much improved if we were guaranteed that there was - // an element at ranks_first[1] - // XXX we could eliminate the need for local_blockIdx, log_num_merged_splitters_per_block, tile_size, and n - - // the index of the merged splitter within the splitters for the odd-even block pair. - unsigned int local_blockIdx = partition_idx - (oddeven_blockid< n) - { - size2 = n - tile_size - even_offset(oddeven_blockid); - } // end if - - // measure each array relative to its beginning - size1 -= rank1; - size2 -= rank2; - } - - template - __device__ __thrust_forceinline__ - void do_it(KeyType *s_keys, ValueType *s_values) - { - // advance iterators - unsigned int i = context.block_index(); - ranks_first1 += i; - ranks_first2 += i; - - // Thread Block i merges the sub-block associated with splitter i: rank[i] -> rank[i+1] in a particular odd-even block pair. - for(; - i < num_splitters; - i += context.grid_dimension(), ranks_first1 += context.grid_dimension(), ranks_first2 += context.grid_dimension()) - { - // the (odd, even) block pair that the splitter belongs to. - unsigned int oddeven_blockid = i >> log_num_merged_splitters_per_tile; - - // start1 & start2 store rank[i] and rank[i+1] indices in arrays 1 and 2. - // size1 & size2 store the number of of elements between rank[i] & rank[i+1] in arrays 1 & 2. 
- unsigned int rank1, rank2, size1, size2; - get_partition(i, oddeven_blockid, rank1, size1, rank2, size2); - - // find where the odd,even arrays begin - RandomAccessIterator1 even_keys_first = keys_first + even_offset(oddeven_blockid); - RandomAccessIterator1 odd_keys_first = even_keys_first + tile_size; - - RandomAccessIterator2 even_values_first = values_first + even_offset(oddeven_blockid); - RandomAccessIterator2 odd_values_first = even_values_first + tile_size; - - // load tiles into smem - copy_n(context, even_keys_first + rank1, even_values_first + rank1, size1, s_keys, s_values); - copy_n(context, odd_keys_first + rank2, odd_values_first + rank2, size2, s_keys + size1, s_values + size1); - - context.barrier(); - - // merge the arrays in-place - block::inplace_merge_by_key_n(context, s_keys, s_values, size1, size2, comp); - - context.barrier(); - - // write tiles to gmem - unsigned int dst_offset = even_offset(oddeven_blockid) + rank1 + rank2; - copy_n(context, s_keys, s_values, size1 + size2, keys_result + dst_offset, values_result + dst_offset); - - context.barrier(); - } // end for i - } - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename iterator_value::type KeyType; - typedef typename iterator_value::type ValueType; - - __shared__ uninitialized_array s_keys; - __shared__ uninitialized_array s_values; - - do_it(s_keys.data(), s_values.data()); - } -}; // merge_subtiles_by_key_closure - -// merge_subtiles_by_key() merges each sub-tile independently. As explained in rank_splitters(), -// the sub-tiles are defined by the ranks of the splitter elements d_rank1 and d_rank2 in the odd and even tiles resp. -// It can be easily shown that each sub-tile cannot contain more than block_size elements of either the odd or even tile. - -// the function calls merge_subblocks_binarysearch_kernel() for the remaining N_splitterS sub-tiles -// We use 1 thread block per splitter: For instance, thread block 0 will merge rank1[0] -> rank1[1] of array i with -// rank2[0] -> rank2[1] of array i^1, with i being the thread block to which the splitter belongs. - -// We implement each sub-tile merge using a binary search. We compute the rank of each element belonging to a sub-tile -// of an odd numbered tile in the corresponding sub-tile of its even numbered pair. It then adds this rank to -// the index of the element in its own sub-tile to find the output index of the element in the merged sub-tile. 
- -template - void merge_subtiles_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - RandomAccessIterator3 splitters_pos_first, - RandomAccessIterator3 splitters_pos_last, - RandomAccessIterator4 ranks_first1, - RandomAccessIterator5 ranks_first2, - RandomAccessIterator6 keys_result, - RandomAccessIterator7 values_result, - unsigned int log_tile_size, - StrictWeakOrdering comp) -{ - typedef typename iterator_value::type KeyType; - typedef typename iterator_value::type ValueType; - - const unsigned int block_size = stable_merge_sort_detail::block_size::value; - - typedef merge_subtiles_by_key_closure< - block_size, - RandomAccessIterator1, - RandomAccessIterator2, - RandomAccessIterator4, - RandomAccessIterator5, - RandomAccessIterator6, - RandomAccessIterator7, - StrictWeakOrdering, - detail::statically_blocked_thread_array - > Closure; - - Closure closure(keys_first, - values_first, - keys_last - keys_first, - ranks_first1, - ranks_first2, - log_tile_size, - splitters_pos_last - splitters_pos_first, - keys_result, - values_result, - comp); - - detail::launch_closure(closure, closure.grid_size(), block_size); -} - - -template - void merge_small_tiles_by_key(execution_policy &, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - size_t log_tile_size, - RandomAccessIterator3 keys_result, - RandomAccessIterator4 values_result, - StrictWeakOrdering comp) -{ - typedef merge_small_tiles_by_key_closure< - block_size, - RandomAccessIterator1, - RandomAccessIterator2, - RandomAccessIterator3, - RandomAccessIterator4, - StrictWeakOrdering, - detail::statically_blocked_thread_array - > Closure; - - Closure closure(keys_first, values_first, keys_last - keys_first, log_tile_size, keys_result, values_result, comp); - - detail::launch_closure(closure, closure.grid_size(), block_size); -} // end merge_small_tiles_by_key() - - -template - void merge_tiles_by_key_recursive(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - RandomAccessIterator3 keys_result, - RandomAccessIterator4 values_result, - size_t log_tile_size, - StrictWeakOrdering comp) -{ - typedef typename iterator_value::type KeyType; - typedef typename iterator_value::type ValueType; - - const size_t tile_size = 1<::value; - - // Case (a): tile_size <= block_size - if(tile_size <= block_size) - { - return merge_small_tiles_by_key<2*block_size>(exec, keys_first, keys_last, values_first, log_tile_size, keys_result, values_result, comp); - } // end if - - // Case (b) tile_size >= block_size - - // step 1 of the recursive case: gather one splitter per block_size entries in each odd-even tile pair. 
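Step 1 above amounts to gathering every block_size-th key; the deleted code builds the strided index range with a counting_iterator wrapped in a transform_iterator (static_strided_integer_range) and passes it to thrust::gather. A plain host-side equivalent, with names of my own choosing, looks like this:

    // Plain host-side equivalent of step 1: gather every block_size-th key
    // as a splitter.
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    std::vector<int> gather_splitters(const std::vector<int>& keys, std::size_t block_size)
    {
        std::vector<int> splitters;
        splitters.reserve((keys.size() + block_size - 1) / block_size);
        for (std::size_t i = 0; i < keys.size(); i += block_size)
            splitters.push_back(keys[i]);
        return splitters;
    }

    int main()
    {
        std::vector<int> keys = {1, 4, 9, 16, 25, 36, 49, 64, 81};
        for (int s : gather_splitters(keys, 4))
            std::printf("%d ", s);                   // prints: 1 25 81
        std::printf("\n");
        return 0;
    }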
- thrust::detail::temporary_array splitters(exec, thrust::detail::util::divide_ri(keys_last - keys_first, block_size)); - static_strided_integer_range splitters_pos(splitters.size()); - thrust::gather(exec, splitters_pos.begin(), splitters_pos.end(), keys_first, splitters.begin()); - - // step 2 of the recursive case: merge the splitters & their positions - thrust::detail::temporary_array merged_splitters(exec, splitters.size()); - thrust::detail::temporary_array merged_splitters_pos(exec, splitters.size()); - - const unsigned int log_block_size = thrust::detail::mpl::math::log2::value; - size_t log_num_splitters_per_tile = log_tile_size - log_block_size; - merge_tiles_by_key_recursive(exec, - splitters.begin(), - splitters.end(), - splitters_pos.begin(), - merged_splitters.begin(), - merged_splitters_pos.begin(), - log_num_splitters_per_tile, - comp); - - // step 3 of the recursive case: find the ranks of each splitter in the respective two tiles. - // reuse the merged_splitters_pos storage - thrust::detail::temporary_array &rank1 = merged_splitters_pos; - thrust::detail::temporary_array rank2(exec, rank1.size()); - - rank_splitters(merged_splitters.begin(), - merged_splitters.end(), - merged_splitters_pos.begin(), - keys_first, - keys_last, - log_tile_size, - rank1.begin(), - rank2.begin(), - comp); - - // step 4 of the recursive case: merge each sub-tile independently in parallel. - merge_subtiles_by_key(keys_first, - keys_last, - values_first, - merged_splitters_pos.begin(), - merged_splitters_pos.end(), - rank1.begin(), - rank2.begin(), - keys_result, - values_result, - log_tile_size, - comp); -} - - -template - void merge_tiles_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator2 values_first, - size_t n, - RandomAccessIterator3 keys_result, - RandomAccessIterator4 values_result, - unsigned int log_tile_size, - StrictWeakOrdering comp) -{ - const unsigned int tile_size = 1 << log_tile_size; - const size_t num_tiles = thrust::detail::util::divide_ri(n, tile_size); - - // if there is an odd number of tiles, we should exclude the last one - // without a twin in merge_recursive - const size_t last_tile_offset = (num_tiles%2)?((num_tiles-1)*tile_size):n; - - merge_tiles_by_key_recursive(exec, - keys_first, - keys_first + last_tile_offset, - values_first, - keys_result, - values_result, - log_tile_size, - comp); - - // copy the last tile without a twin, should it exist - if(last_tile_offset < n) - { - thrust::copy(exec, keys_first + last_tile_offset, keys_first + n, keys_result + last_tile_offset); - thrust::copy(exec, values_first + last_tile_offset, values_first + n, values_result + last_tile_offset); - } // end if -} // end merge_tiles_by_key() - - -} // end stable_merge_sort_detail - - - -template -void stable_merge_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - // XXX it's potentially unsafe to pass the same array for keys & values - thrust::system::cuda::detail::detail::stable_merge_sort_by_key(exec, first, last, first, comp); -} - - -template - void stable_merge_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_traits::value_type KeyType; - typedef typename thrust::iterator_traits::value_type ValueType; - - // compute the block_size based on the types we're sorting - const unsigned int block_size = 
stable_merge_sort_detail::block_size::value; - - // XXX WAR unused variable warning issued by nvcc - (void) block_size; - - // first, sort each tile of block_size elements - stable_sort_by_count(exec, keys_first, keys_last, values_first, comp); - - // merge tiles if there is more than one - const size_t n = keys_last - keys_first; - if(n > block_size) - { - // allocate scratch space - using namespace thrust::detail; - using namespace stable_merge_sort_detail; - temporary_array temp_keys(exec, n); - temporary_array temp_values(exec, n); - - // use a caching allocator for the calls to merge_tiles_by_key - // XXX unfortunately g++-4.2 can't deal with this special execution policy -#if defined(THRUST_GCC_VERSION) && THRUST_GCC_VERSION < 40300 - execution_policy &merge_exec = exec; -#else - cached_temporary_allocator merge_exec(exec); -#endif - - // The log(n) iterations start here. Each call to 'merge' merges an odd-even pair of tiles - unsigned int log_tile_size = thrust::detail::mpl::math::log2::value; - bool ping = true; - for(; (1u << log_tile_size) < n; ++log_tile_size, ping = !ping) - { - // we ping-pong back and forth - if(ping) - { - merge_tiles_by_key(merge_exec, keys_first, values_first, n, temp_keys.begin(), temp_values.begin(), log_tile_size, comp); - } // end if - else - { - merge_tiles_by_key(merge_exec, temp_keys.begin(), temp_values.begin(), n, keys_first, values_first, log_tile_size, comp); - } // end else - } // end for - - // this is to make sure that our data is finally in the data and keys arrays - // and not in the temporary arrays - if(!ping) - { - thrust::copy(exec, temp_keys.begin(), temp_keys.end(), keys_first); - thrust::copy(exec, temp_values.begin(), temp_values.end(), values_first); - } // end if - } // end if -} // end stable_merge_sort_by_key() - - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END - diff --git a/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.h b/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.h deleted file mode 100644 index 8449a17b0d..0000000000 --- a/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -template -void stable_primitive_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last); - -template -void stable_primitive_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first); - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.inl b/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.inl deleted file mode 100644 index d6f4c775b6..0000000000 --- a/compat/thrust/system/cuda/detail/detail/stable_primitive_sort.inl +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ -namespace stable_primitive_sort_detail -{ - - -template - struct enable_if_bool_sort - : thrust::detail::enable_if< - thrust::detail::is_same< - bool, - typename thrust::iterator_value::type - >::value - > -{}; - - -template - struct disable_if_bool_sort - : thrust::detail::disable_if< - thrust::detail::is_same< - bool, - typename thrust::iterator_value::type - >::value - > -{}; - - -template - typename enable_if_bool_sort::type - stable_primitive_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last) -{ - // use stable_partition if we're sorting bool - // stable_partition puts true values first, so we need to logical_not - thrust::stable_partition(exec, first, last, thrust::logical_not()); -} - - -template - typename disable_if_bool_sort::type - stable_primitive_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last) -{ - // call stable_radix_sort - thrust::system::cuda::detail::detail::stable_radix_sort(exec,first,last); -} - - -struct logical_not_first -{ - template - __host__ __device__ - bool operator()(Tuple t) - { - return !thrust::get<0>(t); - } -}; - - -template - typename enable_if_bool_sort::type - stable_primitive_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - // use stable_partition if we're sorting bool - // stable_partition puts true values first, so we need to logical_not - thrust::stable_partition(exec, - thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)), - thrust::make_zip_iterator(thrust::make_tuple(keys_last, values_first)), - logical_not_first()); -} - - -template - typename disable_if_bool_sort::type - stable_primitive_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - 
RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - // call stable_radix_sort_by_key - thrust::system::cuda::detail::detail::stable_radix_sort_by_key(exec, keys_first, keys_last, values_first); -} - - - -} - -template -void stable_primitive_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last) -{ - thrust::system::cuda::detail::detail::stable_primitive_sort_detail::stable_primitive_sort(exec,first,last); -} - -template -void stable_primitive_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - thrust::system::cuda::detail::detail::stable_primitive_sort_detail::stable_primitive_sort_by_key(exec, keys_first, keys_last, values_first); -} - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/detail/stable_radix_sort.h b/compat/thrust/system/cuda/detail/detail/stable_radix_sort.h deleted file mode 100644 index 7a8b9964c0..0000000000 --- a/compat/thrust/system/cuda/detail/detail/stable_radix_sort.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file stable_radix_sort_dev.h - * \brief Defines the interface for a stable radix sort implementation on CUDA - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -template -void stable_radix_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last); - -template -void stable_radix_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first); - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/detail/stable_radix_sort.inl b/compat/thrust/system/cuda/detail/detail/stable_radix_sort.inl deleted file mode 100644 index 9ea197702c..0000000000 --- a/compat/thrust/system/cuda/detail/detail/stable_radix_sort.inl +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -// do not attempt to compile this file with any other compiler -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - -#include -#include -#include -#include - -#include -#include -#include -#include - - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN - - -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - -template -void stable_radix_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last) -{ - typedef typename thrust::iterator_value::type K; - - unsigned int num_elements = last - first; - - // ensure data is properly aligned - if (!thrust::detail::util::is_aligned(thrust::raw_pointer_cast(&*first), 2*sizeof(K))) - { - thrust::detail::temporary_array aligned_keys(exec, first, last); - stable_radix_sort(exec, aligned_keys.begin(), aligned_keys.end()); - thrust::copy(exec, aligned_keys.begin(), aligned_keys.end(), first); - return; - } - - thrust::system::cuda::detail::detail::b40c_thrust::RadixSortingEnactor sorter(num_elements); - thrust::system::cuda::detail::detail::b40c_thrust::RadixSortStorage storage; - - // allocate temporary buffers - thrust::detail::temporary_array temp_keys(exec, num_elements); - thrust::detail::temporary_array temp_spine(exec, sorter.SpineElements()); - thrust::detail::temporary_array temp_from_alt(exec, 2); - - // define storage - storage.d_keys = thrust::raw_pointer_cast(&*first); - storage.d_alt_keys = thrust::raw_pointer_cast(&temp_keys[0]); - storage.d_spine = thrust::raw_pointer_cast(&temp_spine[0]); - storage.d_from_alt_storage = thrust::raw_pointer_cast(&temp_from_alt[0]); - - // perform the sort - sorter.EnactSort(storage); - - // radix sort sometimes leaves results in the alternate buffers - if (storage.using_alternate_storage) - { - thrust::copy(exec, temp_keys.begin(), temp_keys.end(), first); - } -} - -/////////////////////// -// Key-Value Sorting // -/////////////////////// - -// sort values directly -template -void stable_radix_sort_by_key(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - thrust::detail::true_type) -{ - typedef typename thrust::iterator_value::type K; - typedef typename thrust::iterator_value::type V; - - unsigned int num_elements = last1 - first1; - - // ensure data is properly aligned - if (!thrust::detail::util::is_aligned(thrust::raw_pointer_cast(&*first1), 2*sizeof(K))) - { - thrust::detail::temporary_array aligned_keys(exec, first1, last1); - stable_radix_sort_by_key(exec, aligned_keys.begin(), aligned_keys.end(), first2); - thrust::copy(exec, aligned_keys.begin(), aligned_keys.end(), first1); - return; - } - if (!thrust::detail::util::is_aligned(thrust::raw_pointer_cast(&*first2), 2*sizeof(V))) - { - thrust::detail::temporary_array aligned_values(exec, first2, first2 + num_elements); - stable_radix_sort_by_key(exec, first1, last1, aligned_values.begin()); - thrust::copy(exec, aligned_values.begin(), aligned_values.end(), first2); - return; - } - - thrust::system::cuda::detail::detail::b40c_thrust::RadixSortingEnactor sorter(num_elements); - thrust::system::cuda::detail::detail::b40c_thrust::RadixSortStorage storage; - - // allocate temporary buffers - thrust::detail::temporary_array temp_keys(exec, num_elements); - thrust::detail::temporary_array temp_values(exec, num_elements); - thrust::detail::temporary_array temp_spine(exec, sorter.SpineElements()); - thrust::detail::temporary_array temp_from_alt(exec, 2); - - // define 
storage - storage.d_keys = thrust::raw_pointer_cast(&*first1); - storage.d_values = thrust::raw_pointer_cast(&*first2); - storage.d_alt_keys = thrust::raw_pointer_cast(&temp_keys[0]); - storage.d_alt_values = thrust::raw_pointer_cast(&temp_values[0]); - storage.d_spine = thrust::raw_pointer_cast(&temp_spine[0]); - storage.d_from_alt_storage = thrust::raw_pointer_cast(&temp_from_alt[0]); - - // perform the sort - sorter.EnactSort(storage); - - // radix sort sometimes leaves results in the alternate buffers - if (storage.using_alternate_storage) - { - thrust::copy(exec, temp_keys.begin(), temp_keys.end(), first1); - thrust::copy(exec, temp_values.begin(), temp_values.end(), first2); - } -} - - -// sort values indirectly -template -void stable_radix_sort_by_key(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - thrust::detail::false_type) -{ - typedef typename thrust::iterator_value::type V; - - unsigned int num_elements = last1 - first1; - - // sort with integer values and then permute the real values accordingly - thrust::detail::temporary_array permutation(exec, num_elements); - thrust::sequence(exec, permutation.begin(), permutation.end()); - - stable_radix_sort_by_key(exec, first1, last1, permutation.begin()); - - // copy values into temp vector and then permute - thrust::detail::temporary_array temp_values(exec, first2, first2 + num_elements); - - // permute values - thrust::gather(exec, - permutation.begin(), permutation.end(), - temp_values.begin(), - first2); -} - - -template -void stable_radix_sort_by_key(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2) -{ - typedef typename thrust::iterator_value::type V; - - // decide how to handle values - static const bool sort_values_directly = thrust::detail::is_trivial_iterator::value && - thrust::detail::is_arithmetic::value && - sizeof(V) <= 8; // TODO profile this - - // XXX WAR unused variable warning - (void) sort_values_directly; - - stable_radix_sort_by_key(exec, first1, last1, first2, - thrust::detail::integral_constant()); -} - -} // end namespace detail -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END - - -#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - diff --git a/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.h b/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.h deleted file mode 100644 index b563654895..0000000000 --- a/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - - -template -void stable_sort_by_count(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - Compare comp); - - -} // end detail -} // end detail -} // end cuda -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.inl b/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.inl deleted file mode 100644 index 5efb36b9b0..0000000000 --- a/compat/thrust/system/cuda/detail/detail/stable_sort_by_count.inl +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#include -#include -#include -#include -#include -#include -#include - - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ -namespace stable_sort_by_count_detail -{ - - -template -struct stable_sort_by_count_closure -{ - typedef Context context_type; - - RandomAccessIterator1 keys_first; - RandomAccessIterator2 values_first; - StrictWeakOrdering comp; // XXX this should probably be thrust::detail::device_function - const unsigned int n; - context_type context; - - stable_sort_by_count_closure(RandomAccessIterator1 keys_first, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp, - const unsigned int n, - context_type context = context_type()) - : keys_first(keys_first), - values_first(values_first), - comp(comp), - n(n), - context(context) - {} - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename iterator_value::type KeyType; - typedef typename iterator_value::type ValueType; - - __shared__ uninitialized_array s_keys; - __shared__ uninitialized_array s_data; - - const unsigned int grid_size = context.grid_dimension() * context.block_dimension(); - - // block_offset records the global index of this block's 0th thread - unsigned int block_offset = context.block_index() * block_size; - unsigned int i = context.thread_index() + block_offset; - - // advance iterators - keys_first += i; - values_first += i; - - for(; - block_offset < n; - block_offset += grid_size, i += grid_size, keys_first += grid_size, values_first += grid_size) - { - context.barrier(); - // copy input to shared - if(i < n) - { - s_keys[context.thread_index()] = *keys_first; - s_data[context.thread_index()] = *values_first; - } // end if - context.barrier(); - - // this block could be partially full - unsigned int length = block_size; - if(block_offset + block_size > n) - { - length = n - block_offset; - } // end if - - // run merge_sort over the block - block::merging_sort(context, s_keys.begin(), s_data.begin(), length, comp); - - // write result - if(i < n) - { - *keys_first = s_keys[context.thread_index()]; - *values_first = s_data[context.thread_index()]; - } // end if - } // end for i - 
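For orientation, a host-side stand-in for this per-block pre-sort (illustrative only): the input is cut into chunks of block_size and each chunk is stably sorted by key on its own, which is the state the merge passes in stable_merge_sort_by_key start from. The GPU closure above does the same thing per thread block in shared memory.

    // Host-side stand-in for the per-block pre-sort: stably sort each
    // block_size-sized chunk of (key, value) pairs on its own.
    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void sort_each_block(std::vector<int>& keys, std::vector<int>& values, std::size_t block_size)
    {
        for (std::size_t off = 0; off < keys.size(); off += block_size) {
            std::size_t len = std::min(block_size, keys.size() - off);

            // sort this chunk by key, carrying the values along (stable)
            std::vector<std::size_t> idx(len);
            for (std::size_t i = 0; i < len; ++i) idx[i] = i;
            std::stable_sort(idx.begin(), idx.end(),
                [&](std::size_t x, std::size_t y) { return keys[off + x] < keys[off + y]; });

            std::vector<int> k(len), v(len);
            for (std::size_t i = 0; i < len; ++i) {
                k[i] = keys[off + idx[i]];
                v[i] = values[off + idx[i]];
            }
            std::copy(k.begin(), k.end(), keys.begin() + off);
            std::copy(v.begin(), v.end(), values.begin() + off);
        }
    }

    int main()
    {
        std::vector<int> k = {5, 3, 4, 1, 9, 2, 8, 6};
        std::vector<int> v = {0, 1, 2, 3, 4, 5, 6, 7};
        sort_each_block(k, v, 4);   // chunks {5,3,4,1} and {9,2,8,6} sorted independently
        return 0;
    }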
} - - - static size_t max_grid_size() - { - const device_properties_t& properties = device_properties(); - - const unsigned int max_threads = properties.maxThreadsPerMultiProcessor * properties.multiProcessorCount; - const unsigned int max_blocks = properties.maxGridSize[0]; - - return thrust::min(max_blocks, 3 * max_threads / block_size); - } // end max_grid_size() - - - size_t grid_size() const - { - // compute the maximum number of blocks we can launch on this arch - const unsigned int max_num_blocks = max_grid_size(); - - // first, sort within each block - size_t num_blocks = n / block_size; - if(n % block_size) ++num_blocks; - - return thrust::min(num_blocks, max_num_blocks); - } // end grid_size() -}; // stable_sort_by_count_closure - - -} // end stable_sort_by_count_detail - - -template -void stable_sort_by_count(execution_policy &, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - Compare comp) -{ - typedef stable_sort_by_count_detail::stable_sort_by_count_closure< - count, - RandomAccessIterator1, - RandomAccessIterator2, - Compare, - detail::statically_blocked_thread_array - > Closure; - - Closure closure(keys_first, values_first, comp, keys_last - keys_first); - - // do an odd-even sort per block of data - detail::launch_closure(closure, closure.grid_size(), count); -} // end stable_sort_by_count() - - -} // end detail -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/detail/uninitialized.h b/compat/thrust/system/cuda/detail/detail/uninitialized.h deleted file mode 100644 index a3e3dd2e76..0000000000 --- a/compat/thrust/system/cuda/detail/detail/uninitialized.h +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - - -template - class uninitialized -{ - private: - typename aligned_storage< - sizeof(T), - alignment_of::value - >::type storage; - - __device__ __thrust_forceinline__ const T* ptr() const - { - return reinterpret_cast(storage.data); - } - - __device__ __thrust_forceinline__ T* ptr() - { - return reinterpret_cast(storage.data); - } - - public: - // copy assignment - __device__ __thrust_forceinline__ uninitialized &operator=(const T &other) - { - T& self = *this; - self = other; - return *this; - } - - __device__ __thrust_forceinline__ T& get() - { - return *ptr(); - } - - __device__ __thrust_forceinline__ const T& get() const - { - return *ptr(); - } - - __device__ __thrust_forceinline__ operator T& () - { - return get(); - } - - __device__ __thrust_forceinline__ operator const T&() const - { - return get(); - } - - __thrust_forceinline__ __device__ void construct() - { - ::new(ptr()) T(); - } - - template - __thrust_forceinline__ __device__ void construct(const Arg &a) - { - ::new(ptr()) T(a); - } - - template - __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2) - { - ::new(ptr()) T(a1,a2); - } - - template - __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3) - { - ::new(ptr()) T(a1,a2,a3); - } - - template - __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4) - { - ::new(ptr()) T(a1,a2,a3,a4); - } - - template - __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5) - { - ::new(ptr()) T(a1,a2,a3,a4,a5); - } - - template - __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6) - { - ::new(ptr()) T(a1,a2,a3,a4,a5,a6); - } - - template - __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7) - { - ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7); - } - - template - __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8) - { - ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8); - } - - template - __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9) - { - ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8,a9); - } - - template - __thrust_forceinline__ __device__ void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9, const Arg10 &a10) - { - ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10); - } - - __thrust_forceinline__ __device__ void destroy() - { - T& self = *this; - self.~T(); - } -}; - - -template - class uninitialized_array -{ - public: - typedef T value_type; - typedef T& reference; - typedef const T& const_reference; - typedef T* pointer; - typedef const T* const_pointer; - typedef pointer iterator; - typedef const_pointer const_iterator; - typedef std::size_t size_type; - - __thrust_forceinline__ __device__ iterator begin() - { - return data(); - } - - __thrust_forceinline__ __device__ 
const_iterator begin() const - { - return data(); - } - - __thrust_forceinline__ __device__ iterator end() - { - return begin() + size(); - } - - __thrust_forceinline__ __device__ const_iterator end() const - { - return begin() + size(); - } - - __thrust_forceinline__ __device__ const_iterator cbegin() const - { - return begin(); - } - - __thrust_forceinline__ __device__ const_iterator cend() const - { - return end(); - } - - __thrust_forceinline__ __device__ size_type size() const - { - return N; - } - - __thrust_forceinline__ __device__ bool empty() const - { - return false; - } - - __thrust_forceinline__ __device__ T* data() - { - return impl.get(); - } - - __thrust_forceinline__ __device__ const T* data() const - { - return impl.get(); - } - - // element access - __thrust_forceinline__ __device__ reference operator[](size_type n) - { - return data()[n]; - } - - __thrust_forceinline__ __device__ const_reference operator[](size_type n) const - { - return data()[n]; - } - - __thrust_forceinline__ __device__ reference front() - { - return *data(); - } - - __thrust_forceinline__ __device__ const_reference front() const - { - return *data(); - } - - __thrust_forceinline__ __device__ reference back() - { - return data()[size() - size_type(1)]; - } - - __thrust_forceinline__ __device__ const_reference back() const - { - return data()[size() - size_type(1)]; - } - - private: - uninitialized impl; -}; - - -} // end detail -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/equal.h b/compat/thrust/system/cuda/detail/equal.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/equal.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/error.inl b/compat/thrust/system/cuda/detail/error.inl deleted file mode 100644 index 41b928fa32..0000000000 --- a/compat/thrust/system/cuda/detail/error.inl +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -#pragma once - -#include -#include - -namespace thrust -{ - -namespace system -{ - - -error_code make_error_code(cuda::errc::errc_t e) -{ - return error_code(static_cast(e), cuda_category()); -} // end make_error_code() - - -error_condition make_error_condition(cuda::errc::errc_t e) -{ - return error_condition(static_cast(e), cuda_category()); -} // end make_error_condition() - - -namespace cuda -{ - -namespace detail -{ - - -class cuda_error_category - : public error_category -{ - public: - inline cuda_error_category(void) {} - - inline virtual const char *name(void) const - { - return "cuda"; - } - - inline virtual std::string message(int ev) const - { - static const std::string unknown_err("Unknown error"); - const char *c_str = ::cudaGetErrorString(static_cast(ev)); - return c_str ? std::string(c_str) : unknown_err; - } - - inline virtual error_condition default_error_condition(int ev) const - { - using namespace cuda::errc; - - if(ev < ::cudaErrorApiFailureBase) - { - return make_error_condition(static_cast(ev)); - } - - return system_category().default_error_condition(ev); - } -}; // end cuda_error_category - -} // end detail - -} // end namespace cuda - - -const error_category &cuda_category(void) -{ - static const cuda::detail::cuda_error_category result; - return result; -} - - -} // end namespace system - -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/execution_policy.h b/compat/thrust/system/cuda/detail/execution_policy.h deleted file mode 100644 index 7dae04c1eb..0000000000 --- a/compat/thrust/system/cuda/detail/execution_policy.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -// put the canonical tag in the same ns as the backend's entry points -namespace detail -{ - -// this awkward sequence of definitions arise -// from the desire both for tag to derive -// from execution_policy and for execution_policy -// to convert to tag (when execution_policy is not -// an ancestor of tag) - -// forward declaration of tag -struct tag; - -// forward declaration of execution_policy -template struct execution_policy; - -// specialize execution_policy for tag -template<> - struct execution_policy - : thrust::execution_policy -{}; - -// tag's definition comes before the -// generic definition of execution_policy -struct tag : execution_policy {}; - -// allow conversion to tag when it is not a successor -template - struct execution_policy - : thrust::execution_policy -{ - // allow conversion to tag - inline operator tag () const - { - return tag(); - } -}; - - -template - struct cross_system - : thrust::execution_policy > -{ - inline __host__ __device__ - cross_system(thrust::execution_policy &system1, - thrust::execution_policy &system2) - : system1(system1), system2(system2) - {} - - thrust::execution_policy &system1; - thrust::execution_policy &system2; - - inline __host__ __device__ - cross_system rotate() const - { - return cross_system(system2,system1); - } -}; - - -// overloads of select_system - -// cpp interop -template -inline __host__ __device__ -cross_system select_system(const execution_policy &system1, const thrust::cpp::execution_policy &system2) -{ - thrust::execution_policy &non_const_system1 = const_cast&>(system1); - thrust::cpp::execution_policy &non_const_system2 = const_cast&>(system2); - return cross_system(non_const_system1,non_const_system2); -} - - -template -inline __host__ __device__ -cross_system select_system(const thrust::cpp::execution_policy &system1, execution_policy &system2) -{ - thrust::cpp::execution_policy &non_const_system1 = const_cast&>(system1); - thrust::execution_policy &non_const_system2 = const_cast&>(system2); - return cross_system(non_const_system1,non_const_system2); -} - - -} // end detail - -// alias execution_policy and tag here -using thrust::system::cuda::detail::execution_policy; -using thrust::system::cuda::detail::tag; - -} // end cuda -} // end system - -// alias items at top-level -namespace cuda -{ - -using thrust::system::cuda::execution_policy; -using thrust::system::cuda::tag; - -} // end cuda -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/extern_shared_ptr.h b/compat/thrust/system/cuda/detail/extern_shared_ptr.h deleted file mode 100644 index 5f34cc89ba..0000000000 --- a/compat/thrust/system/cuda/detail/extern_shared_ptr.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template - class extern_shared_ptr -{ -// don't attempt to compile with any compiler other than nvcc -// due to use of __shared__ below -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - public: - __device__ - inline operator T * (void) - { - extern __shared__ int4 smem[]; - return reinterpret_cast(smem); - } - - __device__ - inline operator const T * (void) const - { - extern __shared__ int4 smem[]; - return reinterpret_cast(smem); - } -#endif // THRUST_DEVICE_COMPILER_NVCC -}; // end extern_shared_ptr - -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/extrema.h b/compat/thrust/system/cuda/detail/extrema.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/extrema.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/fill.h b/compat/thrust/system/cuda/detail/fill.h deleted file mode 100644 index 9c753bb9aa..0000000000 --- a/compat/thrust/system/cuda/detail/fill.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file fill.h - * \brief Device implementation of fill. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template - void fill(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &value); - -template - OutputIterator fill_n(execution_policy &exec, - OutputIterator first, - Size n, - const T &value); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/fill.inl b/compat/thrust/system/cuda/detail/fill.inl deleted file mode 100644 index 3c1feb8ac8..0000000000 --- a/compat/thrust/system/cuda/detail/fill.inl +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file fill.inl - * \brief Inline file for fill.h. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace detail -{ - - -template - WidePtr widen_raw_ptr(T *ptr) -{ - typedef thrust::detail::pointer_traits WideTraits; - typedef typename WideTraits::element_type WideT; - - // carefully widen the pointer to avoid warnings about conversions between differently aligned types on ARM - WideT *wide_raw_ptr = static_cast(static_cast(ptr)); - - return WideTraits::pointer_to(*wide_raw_ptr); -} - - -template - Pointer wide_fill_n(execution_policy &exec, - Pointer first, - Size n, - const T &value) -{ - typedef typename thrust::iterator_value::type OutputType; - - size_t ALIGNMENT_BOUNDARY = 128; // begin copying blocks at this byte boundary - - WideType wide_exemplar; - OutputType narrow_exemplars[sizeof(WideType) / sizeof(OutputType)]; - - for (size_t i = 0; i < sizeof(WideType) / sizeof(OutputType); i++) - narrow_exemplars[i] = static_cast(value); - - // cast through char * to avoid type punning warnings - for (size_t i = 0; i < sizeof(WideType); i++) - reinterpret_cast(&wide_exemplar)[i] = reinterpret_cast(narrow_exemplars)[i]; - - OutputType *first_raw = thrust::raw_pointer_cast(first); - OutputType *last_raw = first_raw + n; - - OutputType *block_first_raw = (thrust::min)(first_raw + n, thrust::detail::util::align_up(first_raw, ALIGNMENT_BOUNDARY)); - OutputType *block_last_raw = (thrust::max)(block_first_raw, thrust::detail::util::align_down(last_raw, sizeof(WideType))); - - // rebind Pointer to WideType - typedef typename thrust::detail::rebind_pointer::type WidePtr; - - // point to the widened range - // XXX since we've got an execution policy, we probably don't even need to deal with rebinding pointers - WidePtr block_first_wide = widen_raw_ptr(block_first_raw); - WidePtr block_last_wide = widen_raw_ptr(block_last_raw); - - thrust::generate(exec, first, Pointer(block_first_raw), thrust::detail::fill_functor(value)); - thrust::generate(exec, block_first_wide, block_last_wide, thrust::detail::fill_functor(wide_exemplar)); - thrust::generate(exec, Pointer(block_last_raw), first + n, thrust::detail::fill_functor(value)); - - return first + n; -} - -template - OutputIterator fill_n(execution_policy &exec, - OutputIterator first, - Size n, - const T &value, - thrust::detail::false_type) -{ - thrust::detail::fill_functor func(value); - return thrust::generate_n(exec, first, n, func); -} - -template - OutputIterator fill_n(execution_policy &exec, - OutputIterator first, - Size n, - const T &value, - thrust::detail::true_type) -{ - typedef typename thrust::iterator_traits::value_type OutputType; - - if ( thrust::detail::util::is_aligned(thrust::raw_pointer_cast(&*first)) ) - { - if (compute_capability() < 20) - { - // 32-bit writes are faster on G80 and GT200 - typedef unsigned int WideType; - wide_fill_n(exec, &*first, n, value); - } - else - { - // 64-bit writes are faster on Fermi - typedef unsigned long long WideType; 
- wide_fill_n(exec, &*first, n, value); - } - - return first + n; - } - else - { - return fill_n(exec, first, n, value, thrust::detail::false_type()); - } -} - -} // end detail - -template - OutputIterator fill_n(execution_policy &exec, - OutputIterator first, - Size n, - const T &value) -{ - typedef typename thrust::iterator_traits::value_type OutputType; - - // we're compiling with nvcc, launch a kernel - const bool use_wide_fill = thrust::detail::is_trivial_iterator::value - && thrust::detail::has_trivial_assign::value - && (sizeof(OutputType) == 1 || sizeof(OutputType) == 2 || sizeof(OutputType) == 4); - - // XXX WAR usused variable warning - (void)use_wide_fill; - - return detail::fill_n(exec, first, n, value, thrust::detail::integral_constant()); -} - -template - void fill(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &value) -{ - thrust::system::cuda::detail::fill_n(exec, first, thrust::distance(first,last), value); -} // end fill() - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/find.h b/compat/thrust/system/cuda/detail/find.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/find.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/for_each.h b/compat/thrust/system/cuda/detail/for_each.h deleted file mode 100644 index 56be13b177..0000000000 --- a/compat/thrust/system/cuda/detail/for_each.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file for_each.h - * \brief Defines the interface for a function that executes a - * function or functional for each value in a given range. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template - RandomAccessIterator for_each(execution_policy &s, - RandomAccessIterator first, - RandomAccessIterator last, - UnaryFunction f); - -template - RandomAccessIterator for_each_n(execution_policy &s, - RandomAccessIterator first, - Size n, - UnaryFunction f); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/for_each.inl b/compat/thrust/system/cuda/detail/for_each.inl deleted file mode 100644 index be6e56131f..0000000000 --- a/compat/thrust/system/cuda/detail/for_each.inl +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file for_each.inl - * \brief Inline file for for_each.h. - */ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace for_each_n_detail -{ - - -template -struct for_each_n_closure -{ - typedef void result_type; - typedef Context context_type; - - RandomAccessIterator first; - Size n; - thrust::detail::device_function f; - Context context; - - for_each_n_closure(RandomAccessIterator first, - Size n, - UnaryFunction f, - Context context = Context()) - : first(first), n(n), f(f), context(context) - {} - - __device__ __thrust_forceinline__ - result_type operator()(void) - { - const Size grid_size = context.block_dimension() * context.grid_dimension(); - - Size i = context.linear_index(); - - // advance iterator - first += i; - - while(i < n) - { - f(*first); - i += grid_size; - first += grid_size; - } - } -}; // end for_each_n_closure - - -template -thrust::tuple configure_launch(Size n) -{ - // calculate launch configuration - detail::launch_calculator calculator; - - thrust::tuple config = calculator.with_variable_block_size(); - size_t max_blocks = thrust::get<0>(config); - size_t block_size = thrust::get<1>(config); - size_t num_blocks = thrust::min(max_blocks, thrust::detail::util::divide_ri(n, block_size)); - - return thrust::make_tuple(num_blocks, block_size); -} - - -template -bool use_big_closure(Size n, unsigned int little_grid_size) -{ - // use the big closure when n will not fit within an unsigned int - // or if incrementing an unsigned int by little_grid_size would overflow - // the counter - - Size threshold = std::numeric_limits::max(); - - bool result = (sizeof(Size) > sizeof(unsigned int)) && (n > threshold); - - if(!result) - { - // check if we'd overflow the little closure's counter - unsigned int little_n = static_cast(n); - - if((little_n - 1u) + little_grid_size < little_n) - { - result = true; - } - } - - return result; -} - - -} // end for_each_n_detail - - -template -RandomAccessIterator for_each_n(execution_policy &, - RandomAccessIterator first, - 
Size n, - UnaryFunction f) -{ - // we're attempting to launch a kernel, assert we're compiling with nvcc - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to compile your code using nvcc, rather than g++ or cl.exe X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - - if(n <= 0) return first; // empty range - - // create two candidate closures to implement the for_each - // choose between them based on the whether we can fit n into a smaller integer - // and whether or not we'll overflow the closure's counter - - typedef detail::blocked_thread_array Context; - typedef for_each_n_detail::for_each_n_closure BigClosure; - typedef for_each_n_detail::for_each_n_closure LittleClosure; - - BigClosure big_closure(first, n, f); - LittleClosure little_closure(first, static_cast(n), f); - - thrust::tuple little_config = for_each_n_detail::configure_launch(n); - - unsigned int little_grid_size = thrust::get<0>(little_config) * thrust::get<1>(little_config); - - if(for_each_n_detail::use_big_closure(n, little_grid_size)) - { - // launch the big closure - thrust::tuple big_config = for_each_n_detail::configure_launch(n); - detail::launch_closure(big_closure, thrust::get<0>(big_config), thrust::get<1>(big_config)); - } - else - { - // launch the little closure - detail::launch_closure(little_closure, thrust::get<0>(little_config), thrust::get<1>(little_config)); - } - - return first + n; -} - - -template - InputIterator for_each(execution_policy &exec, - InputIterator first, - InputIterator last, - UnaryFunction f) -{ - return cuda::detail::for_each_n(exec, first, thrust::distance(first,last), f); -} // end for_each() - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/gather.h b/compat/thrust/system/cuda/detail/gather.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/gather.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/generate.h b/compat/thrust/system/cuda/detail/generate.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/generate.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
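The deleted `for_each_n_closure` runs a grid-stride loop: every thread starts at its global linear index and steps by `block_dimension() * grid_dimension()`, so one fixed launch covers any `n`. A minimal standalone CUDA sketch of that pattern (hypothetical kernel and functor names, plain increment in place of an arbitrary device function):

```cuda
#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

// Unary operation applied to each element, standing in for the closure's
// wrapped user function.
struct add_one
{
    __device__ void operator()(int &x) const { x += 1; }
};

// Grid-stride loop: each thread strides by the total number of launched
// threads, so the grid size does not have to depend on n.
template <typename T, typename UnaryFunction>
__global__ void for_each_n_kernel(T *first, size_t n, UnaryFunction f)
{
    size_t grid_size = (size_t)blockDim.x * gridDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += grid_size)
        f(first[i]);
}

int main()
{
    const size_t n = 1 << 20;
    int *d = 0;
    cudaMalloc(&d, n * sizeof(int));
    cudaMemset(d, 0, n * sizeof(int));

    for_each_n_kernel<<<128, 256>>>(d, n, add_one());   // modest fixed launch
    cudaDeviceSynchronize();

    int h = 0;
    cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("first element after for_each: %d\n", h);   // expect 1
    cudaFree(d);
}
```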
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/get_value.h b/compat/thrust/system/cuda/detail/get_value.h deleted file mode 100644 index 273023f612..0000000000 --- a/compat/thrust/system/cuda/detail/get_value.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -namespace -{ - - -template -inline __host__ __device__ - typename thrust::iterator_value::type - get_value_msvc2005_war(execution_policy &exec, Pointer ptr) -{ - typedef typename thrust::iterator_value::type result_type; - - // XXX war nvbugs/881631 - struct war_nvbugs_881631 - { - __host__ inline static result_type host_path(execution_policy &exec, Pointer ptr) - { - // when called from host code, implement with assign_value - // note that this requires a type with default constructor - result_type result; - - thrust::host_system_tag host_tag; - cross_system systems(host_tag, exec); - assign_value(systems, &result, ptr); - - return result; - } - - __device__ inline static result_type device_path(execution_policy &, Pointer ptr) - { - // when called from device code, just do simple deref - return *thrust::raw_pointer_cast(ptr); - } - }; - -#ifndef __CUDA_ARCH__ - return war_nvbugs_881631::host_path(exec, ptr); -#else - return war_nvbugs_881631::device_path(exec, ptr); -#endif // __CUDA_ARCH__ -} // end get_value_msvc2005_war() - - -} // end anon namespace - - -template -inline __host__ __device__ - typename thrust::iterator_value::type - get_value(execution_policy &exec, Pointer ptr) -{ - return get_value_msvc2005_war(exec,ptr); -} // end get_value() - - -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/guarded_cuda_runtime_api.h b/compat/thrust/system/cuda/detail/guarded_cuda_runtime_api.h deleted file mode 100644 index e6c0d2812e..0000000000 --- a/compat/thrust/system/cuda/detail/guarded_cuda_runtime_api.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to check for the existence of macros -// such as __host__ and __device__, which may already be defined by thrust -// and to undefine them before entering cuda_runtime_api.h (which will redefine them) - -// we only try to do this stuff if cuda/include/host_defines.h has been included -#if !defined(__HOST_DEFINES_H__) - -#ifdef __host__ -#undef __host__ -#endif // __host__ - -#ifdef __device__ -#undef __device__ -#endif // __device__ - -#endif // __HOST_DEFINES_H__ - -#include - diff --git a/compat/thrust/system/cuda/detail/inner_product.h b/compat/thrust/system/cuda/detail/inner_product.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/inner_product.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/iter_swap.h b/compat/thrust/system/cuda/detail/iter_swap.h deleted file mode 100644 index 9b2bcf069f..0000000000 --- a/compat/thrust/system/cuda/detail/iter_swap.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -template -inline __host__ __device__ -void iter_swap(tag, Pointer1 a, Pointer2 b) -{ - // XXX war nvbugs/881631 - struct war_nvbugs_881631 - { - __host__ inline static void host_path(Pointer1 a, Pointer2 b) - { - thrust::swap_ranges(a, a + 1, b); - } - - __device__ inline static void device_path(Pointer1 a, Pointer2 b) - { - using thrust::swap; - swap(*thrust::raw_pointer_cast(a), - *thrust::raw_pointer_cast(b)); - } - }; - -#ifndef __CUDA_ARCH__ - return war_nvbugs_881631::host_path(a,b); -#else - return war_nvbugs_881631::device_path(a,b); -#endif // __CUDA_ARCH__ -} // end iter_swap() - - -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/logical.h b/compat/thrust/system/cuda/detail/logical.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/logical.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/malloc_and_free.h b/compat/thrust/system/cuda/detail/malloc_and_free.h deleted file mode 100644 index 676dd7cd5a..0000000000 --- a/compat/thrust/system/cuda/detail/malloc_and_free.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -// note that malloc returns a raw pointer to avoid -// depending on the heavyweight thrust/system/cuda/memory.h header -template - void *malloc(execution_policy &, std::size_t n) -{ - void *result = 0; - - cudaError_t error = cudaMalloc(reinterpret_cast(&result), n); - - if(error) - { - throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(error).c_str()); - } // end if - - return result; -} // end malloc() - - -template - void free(execution_policy &, Pointer ptr) -{ - cudaError_t error = cudaFree(thrust::raw_pointer_cast(ptr)); - - if(error) - { - throw thrust::system_error(error, thrust::cuda_category()); - } // end error -} // end free() - - -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/memory.inl b/compat/thrust/system/cuda/detail/memory.inl deleted file mode 100644 index 998b54e345..0000000000 --- a/compat/thrust/system/cuda/detail/memory.inl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace thrust -{ - -// XXX WAR an issue with MSVC 2005 (cl v14.00) incorrectly implementing -// pointer_raw_pointer for pointer by specializing it here -// note that we specialize it here, before the use of raw_pointer_cast -// below, which causes pointer_raw_pointer's instantiation -#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400) -namespace detail -{ - -template - struct pointer_raw_pointer< thrust::cuda::pointer > -{ - typedef typename thrust::cuda::pointer::raw_pointer type; -}; // end pointer_raw_pointer - -} // end detail -#endif - -namespace system -{ -namespace cuda -{ - - -template - template - reference & - reference - ::operator=(const reference &other) -{ - return super_t::operator=(other); -} // end reference::operator=() - -template - reference & - reference - ::operator=(const value_type &x) -{ - return super_t::operator=(x); -} // end reference::operator=() - -template -__host__ __device__ -void swap(reference a, reference b) -{ - a.swap(b); -} // end swap() - -pointer malloc(std::size_t n) -{ - tag cuda_tag; - return pointer(thrust::system::cuda::detail::malloc(cuda_tag, n)); -} // end malloc() - -template -pointer malloc(std::size_t n) -{ - pointer raw_ptr = thrust::system::cuda::malloc(sizeof(T) * n); - return pointer(reinterpret_cast(raw_ptr.get())); -} // end malloc() - -void free(pointer ptr) -{ - tag cuda_tag; - return thrust::system::cuda::detail::free(cuda_tag, ptr.get()); -} // end free() - -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/merge.h b/compat/thrust/system/cuda/detail/merge.h deleted file mode 100644 index e01b705470..0000000000 --- a/compat/thrust/system/cuda/detail/merge.h +++ 
/dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template - RandomAccessIterator3 merge(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - StrictWeakOrdering comp); - -} // end detail -} // end cuda -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/merge.inl b/compat/thrust/system/cuda/detail/merge.inl deleted file mode 100644 index bf7516fde1..0000000000 --- a/compat/thrust/system/cuda/detail/merge.inl +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace merge_detail -{ - - -template -__device__ __thrust_forceinline__ -thrust::pair - partition_search(RandomAccessIterator1 first1, - RandomAccessIterator2 first2, - Size diag, - Size lower_bound1, - Size upper_bound1, - Size lower_bound2, - Size upper_bound2, - Compare comp) -{ - Size begin = thrust::max(lower_bound1, diag - upper_bound2); - Size end = thrust::min(diag - lower_bound2, upper_bound1); - - while(begin < end) - { - Size mid = (begin + end) / 2; - Size index1 = mid; - Size index2 = diag - mid - 1; - - if(comp(first2[index2], first1[index1])) - { - end = mid; - } - else - { - begin = mid + 1; - } - } - - return thrust::make_pair(begin, diag - begin); -} - - -template -__device__ __thrust_forceinline__ -void merge_n(Context &ctx, - RandomAccessIterator1 first1, - Size n1, - RandomAccessIterator2 first2, - Size n2, - RandomAccessIterator3 result, - Compare comp_, - unsigned int work_per_thread) -{ - const unsigned int block_size = ctx.block_dimension(); - thrust::detail::device_function comp(comp_); - typedef typename thrust::iterator_value::type value_type1; - typedef typename thrust::iterator_value::type value_type2; - - Size result_size = n1 + n2; - - // this is just oversubscription_rate * block_size * work_per_thread - // but it makes no sense to send oversubscription_rate as an extra parameter - Size work_per_block = thrust::detail::util::divide_ri(result_size, ctx.grid_dimension()); - - using thrust::system::cuda::detail::detail::uninitialized; - __shared__ uninitialized > s_block_input_begin; - - Size result_block_offset = ctx.block_index() * work_per_block; - - // find where this block's input begins in both input sequences - if(ctx.thread_index() == 0) - { - s_block_input_begin = (ctx.block_index() == 0) ? 
- thrust::pair(0,0) : - partition_search(first1, first2, - result_block_offset, - Size(0), n1, - Size(0), n2, - comp); - } - - ctx.barrier(); - - // iterate to consume this block's input - Size work_per_iteration = block_size * work_per_thread; - thrust::pair block_input_end = s_block_input_begin; - block_input_end.first += work_per_iteration; - block_input_end.second += work_per_iteration; - Size result_block_offset_last = result_block_offset + thrust::min(work_per_block, result_size - result_block_offset); - - for(; - result_block_offset < result_block_offset_last; - result_block_offset += work_per_iteration, - block_input_end.first += work_per_iteration, - block_input_end.second += work_per_iteration - ) - { - // find where this thread's input begins in both input sequences for this iteration - thrust::pair thread_input_begin = - partition_search(first1, first2, - Size(result_block_offset + ctx.thread_index() * work_per_thread), - s_block_input_begin.get().first, thrust::min(block_input_end.first , n1), - s_block_input_begin.get().second, thrust::min(block_input_end.second, n2), - comp); - - ctx.barrier(); - - // XXX the performance impact of not keeping x1 & x2 - // in registers is about 10% for int32 - uninitialized x1; - uninitialized x2; - - // XXX this is just a serial merge -- try to simplify or abstract this loop - Size i = result_block_offset + ctx.thread_index() * work_per_thread; - Size last_i = i + thrust::min(work_per_thread, result_size - thread_input_begin.first - thread_input_begin.second); - for(; - i < last_i; - ++i) - { - // optionally load x1 & x2 - bool output_x2 = true; - if(thread_input_begin.second < n2) - { - x2 = first2[thread_input_begin.second]; - } - else - { - output_x2 = false; - } - - if(thread_input_begin.first < n1) - { - x1 = first1[thread_input_begin.first]; - - if(output_x2) - { - output_x2 = comp(x2.get(), x1.get()); - } - } - - result[i] = output_x2 ? 
x2.get() : x1.get(); - - if(output_x2) - { - ++thread_input_begin.second; - } - else - { - ++thread_input_begin.first; - } - } // end for - - // the block's last thread has conveniently located the - // beginning of the next iteration's input - if(ctx.thread_index() == block_size-1) - { - s_block_input_begin = thread_input_begin; - } - ctx.barrier(); - } // end for -} // end merge_n - - -template - struct merge_n_closure -{ - typedef thrust::system::cuda::detail::detail::blocked_thread_array context_type; - - RandomAccessIterator1 first1; - Size n1; - RandomAccessIterator2 first2; - Size n2; - RandomAccessIterator3 result; - Compare comp; - Size work_per_thread; - - merge_n_closure(RandomAccessIterator1 first1, Size n1, RandomAccessIterator2 first2, Size n2, RandomAccessIterator3 result, Compare comp, Size work_per_thread) - : first1(first1), n1(n1), first2(first2), n2(n2), result(result), comp(comp), work_per_thread(work_per_thread) - {} - - __device__ __forceinline__ - void operator()() - { - context_type ctx; - merge_n(ctx, first1, n1, first2, n2, result, comp, work_per_thread); - } -}; - - -// returns (work_per_thread, threads_per_block, oversubscription_factor) -template - thrust::tuple - tunables(RandomAccessIterator1, RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator2, RandomAccessIterator3, Compare comp) -{ - // determined by empirical testing on GTX 480 - // ~4500 Mkeys/s on GTX 480 - const unsigned int work_per_thread = 5; - const unsigned int threads_per_block = 128; - const unsigned int oversubscription_factor = 30; - - return thrust::make_tuple(work_per_thread, threads_per_block, oversubscription_factor); -} - - -} // end merge_detail - - -template -RandomAccessIterator3 merge(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp) -{ - typedef typename thrust::iterator_difference::type Size; - Size n1 = last1 - first1; - Size n2 = last2 - first2; - typename thrust::iterator_difference::type n = n1 + n2; - - // empty result - if(n <= 0) return result; - - unsigned int work_per_thread = 0, threads_per_block = 0, oversubscription_factor = 0; - thrust::tie(work_per_thread,threads_per_block,oversubscription_factor) - = merge_detail::tunables(first1, last1, first2, last2, result, comp); - - const unsigned int work_per_block = work_per_thread * threads_per_block; - - const unsigned int num_processors = device_properties().multiProcessorCount; - const unsigned int num_blocks = thrust::min(oversubscription_factor * num_processors, thrust::detail::util::divide_ri(n, work_per_block)); - - typedef merge_detail::merge_n_closure closure_type; - closure_type closure(first1, n1, first2, n2, result, comp, work_per_thread); - - detail::launch_closure(closure, num_blocks, threads_per_block); - - return result + n1 + n2; -} // end merge() - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/mismatch.h b/compat/thrust/system/cuda/detail/mismatch.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/mismatch.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
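The deleted `partition_search` is the "merge path" trick: for an output diagonal `diag` it binary-searches how many elements each sorted input contributes to the first `diag` merged outputs, which lets blocks and threads carve up a merge independently. A serial host-side sketch of the same search (hypothetical names, `operator<` in place of the comparator):

```cuda
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Returns (i, j) with i + j == diag such that merging a[0..i) and b[0..j)
// produces exactly the first diag outputs of merge(a, b).
static std::pair<size_t, size_t>
partition_search(const std::vector<int> &a, const std::vector<int> &b, size_t diag)
{
    size_t lo = diag > b.size() ? diag - b.size() : 0;   // smallest feasible i
    size_t hi = std::min(diag, a.size());                // largest feasible i

    while (lo < hi)
    {
        size_t mid = (lo + hi) / 2;
        // b[diag - mid - 1] < a[mid] means too many elements taken from a: move left
        if (b[diag - mid - 1] < a[mid])
            hi = mid;
        else
            lo = mid + 1;
    }
    return std::make_pair(lo, diag - lo);
}

int main()
{
    std::vector<int> a = {1, 3, 5, 7, 9};
    std::vector<int> b = {2, 4, 6, 8};

    // Split the 9 merged outputs into blocks of 3, as a block-parallel merge would.
    for (size_t diag = 0; diag <= a.size() + b.size(); diag += 3)
    {
        std::pair<size_t, size_t> p = partition_search(a, b, diag);
        std::printf("diag %zu -> %zu from a, %zu from b\n", diag, p.first, p.second);
    }
}
```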
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/par.h b/compat/thrust/system/cuda/detail/par.h deleted file mode 100644 index e56128c1d0..0000000000 --- a/compat/thrust/system/cuda/detail/par.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -struct par_t : thrust::system::cuda::detail::execution_policy -{ - par_t() : thrust::system::cuda::detail::execution_policy() {} - - template - thrust::detail::execute_with_allocator - operator()(Allocator &alloc) const - { - return thrust::detail::execute_with_allocator(alloc); - } -}; - - -} // end detail - - -static const detail::par_t par; - - -} // end cuda -} // end system - - -// alias par here -namespace cuda -{ - - -using thrust::system::cuda::par; - - -} // end cuda -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/partition.h b/compat/thrust/system/cuda/detail/partition.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/partition.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/reduce.h b/compat/thrust/system/cuda/detail/reduce.h deleted file mode 100644 index d188f60f25..0000000000 --- a/compat/thrust/system/cuda/detail/reduce.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce.h - * \brief Reduce a sequence of elements with a given length. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -template - OutputType reduce(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputType init, - BinaryFunction binary_op); - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/reduce.inl b/compat/thrust/system/cuda/detail/reduce.inl deleted file mode 100644 index 66b4ac72c2..0000000000 --- a/compat/thrust/system/cuda/detail/reduce.inl +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce.inl - * \brief Inline file for reduce.h - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -namespace reduce_detail -{ - -/* - * Reduce a vector of n elements using binary_op() - * - * The order of reduction is not defined, so binary_op() should - * be a commutative (and associative) operator such as - * (integer) addition. Since floating point operations - * do not completely satisfy these criteria, the result is - * generally not the same as a consecutive reduction of - * the elements. 
- * - * Uses the same pattern as reduce6() in the CUDA SDK - * - */ -template -struct unordered_reduce_closure -{ - InputIterator input; - Size n; - T init; - OutputIterator output; - BinaryFunction binary_op; - unsigned int shared_array_size; - - typedef Context context_type; - context_type context; - - unordered_reduce_closure(InputIterator input, Size n, T init, OutputIterator output, BinaryFunction binary_op, unsigned int shared_array_size, Context context = Context()) - : input(input), n(n), init(init), output(output), binary_op(binary_op), shared_array_size(shared_array_size), context(context) {} - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename thrust::iterator_value::type OutputType; - extern_shared_ptr shared_array; - - Size grid_size = context.block_dimension() * context.grid_dimension(); - - Size i = context.linear_index(); - - input += i; - - // compute reduction with all blockDim.x threads - OutputType sum = thrust::raw_reference_cast(*input); - - i += grid_size; - input += grid_size; - - while (i < n) - { - OutputType val = thrust::raw_reference_cast(*input); - - sum = binary_op(sum, val); - - i += grid_size; - input += grid_size; - } - - // write first shared_array_size values into shared memory - if (context.thread_index() < shared_array_size) - shared_array[context.thread_index()] = sum; - - // accumulate remaining values (if any) to shared memory in stages - if (context.block_dimension() > shared_array_size) - { - unsigned int lb = shared_array_size; - unsigned int ub = shared_array_size + lb; - - while (lb < context.block_dimension()) - { - context.barrier(); - - if (lb <= context.thread_index() && context.thread_index() < ub) - { - OutputType tmp = shared_array[context.thread_index() - lb]; - shared_array[context.thread_index() - lb] = binary_op(tmp, sum); - } - - lb += shared_array_size; - ub += shared_array_size; - } - } - - context.barrier(); - - block::reduce_n(context, shared_array, thrust::min(context.block_dimension(), shared_array_size), binary_op); - - if (context.thread_index() == 0) - { - OutputType tmp = shared_array[0]; - - if (context.grid_dimension() == 1) - tmp = binary_op(init, tmp); - - output += context.block_index(); - *output = tmp; - } - } -}; - - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN - -template - OutputType reduce(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputType init, - BinaryFunction binary_op) -{ - // we're attempting to launch a kernel, assert we're compiling with nvcc - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to compile your code using nvcc, rather than g++ or cl.exe X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - - typedef typename thrust::iterator_difference::type difference_type; - - difference_type n = thrust::distance(first,last); - - if (n == 0) - return init; - - typedef thrust::detail::temporary_array OutputArray; - typedef typename OutputArray::iterator OutputIterator; - - typedef detail::blocked_thread_array Context; - typedef unordered_reduce_closure Closure; - - function_attributes_t attributes = detail::closure_attributes(); - - // TODO chose this in a more principled manner - size_t threshold = thrust::max(2 * attributes.maxThreadsPerBlock, 1024); - - device_properties_t properties = device_properties(); - 
- // launch configuration - size_t num_blocks; - size_t block_size; - size_t array_size; - size_t smem_bytes; - - // first level reduction - if (static_cast(n) < threshold) - { - num_blocks = 1; - block_size = thrust::min(static_cast(n), static_cast(attributes.maxThreadsPerBlock)); - array_size = thrust::min(block_size, (properties.sharedMemPerBlock - attributes.sharedSizeBytes) / sizeof(OutputType)); - smem_bytes = sizeof(OutputType) * array_size; - } - else - { - detail::launch_calculator calculator; - - thrust::tuple config = calculator.with_variable_block_size_available_smem(); - - num_blocks = thrust::min(thrust::get<0>(config), static_cast(n) / thrust::get<1>(config)); - block_size = thrust::get<1>(config); - array_size = thrust::min(block_size, thrust::get<2>(config) / sizeof(OutputType)); - smem_bytes = sizeof(OutputType) * array_size; - } - - // TODO assert(n <= num_blocks * block_size); - // TODO if (shared_array_size < 1) throw cuda exception "insufficient shared memory" - - OutputArray output(exec, num_blocks); - - Closure closure(first, n, init, output.begin(), binary_op, array_size); - - //std::cout << "Launching " << num_blocks << " blocks of kernel with " << block_size << " threads and " << smem_bytes << " shared memory per block " << std::endl; - - detail::launch_closure(closure, num_blocks, block_size, smem_bytes); - - // second level reduction - if (num_blocks > 1) - { - typedef detail::blocked_thread_array Context; - typedef unordered_reduce_closure Closure; - - function_attributes_t attributes = detail::closure_attributes(); - - num_blocks = 1; - block_size = thrust::min(output.size(), static_cast(attributes.maxThreadsPerBlock)); - array_size = thrust::min(block_size, (properties.sharedMemPerBlock - attributes.sharedSizeBytes) / sizeof(OutputType)); - smem_bytes = sizeof(OutputType) * array_size; - - // TODO if (shared_array_size < 1) throw cuda exception "insufficient shared memory" - - Closure closure(output.begin(), output.size(), init, output.begin(), binary_op, array_size); - - //std::cout << "Launching " << num_blocks << " blocks of kernel with " << block_size << " threads and " << smem_bytes << " shared memory per block " << std::endl; - - detail::launch_closure(closure, num_blocks, block_size, smem_bytes); - } - - return output[0]; -} // end reduce - -} // end reduce_detail - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END - -template - OutputType reduce(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputType init, - BinaryFunction binary_op) -{ - return reduce_detail::reduce(exec, first, last, init, binary_op); -} // end reduce() - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/reduce_by_key.h b/compat/thrust/system/cuda/detail/reduce_by_key.h deleted file mode 100644 index 9b8ec10936..0000000000 --- a/compat/thrust/system/cuda/detail/reduce_by_key.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
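The deleted `reduce.inl` reduces in two passes: a first kernel accumulates with a grid-stride loop and a shared-memory tree reduction, leaving one partial result per block, and a second single-block launch folds those partials. A minimal CUDA sketch of that scheme for integer addition (hypothetical names, a fixed launch configuration instead of Thrust's launch calculator):

```cuda
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Both passes use the same kernel: grid-stride accumulation into a register,
// then a shared-memory tree reduction, then one partial result per block.
template <unsigned int BLOCK>
__global__ void reduce_sum(const int *in, size_t n, int *out)
{
    __shared__ int smem[BLOCK];

    int sum = 0;
    size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
        sum += in[i];

    smem[threadIdx.x] = sum;
    __syncthreads();

    for (unsigned int s = BLOCK / 2; s > 0; s >>= 1)   // tree reduction
    {
        if (threadIdx.x < s)
            smem[threadIdx.x] += smem[threadIdx.x + s];
        __syncthreads();
    }

    if (threadIdx.x == 0)
        out[blockIdx.x] = smem[0];
}

int main()
{
    const size_t n = 1 << 20;
    const unsigned int block = 256, grid = 64;

    int *d_in, *d_partial;
    cudaMalloc(&d_in, n * sizeof(int));
    cudaMalloc(&d_partial, grid * sizeof(int));

    std::vector<int> ones(n, 1);
    cudaMemcpy(d_in, ones.data(), n * sizeof(int), cudaMemcpyHostToDevice);

    reduce_sum<block><<<grid, block>>>(d_in, n, d_partial);       // pass 1: one sum per block
    reduce_sum<block><<<1, block>>>(d_partial, grid, d_partial);  // pass 2: fold the partials
                                                                  // (in place is safe: the single
                                                                  // block reads before it writes)
    int total = 0;
    cudaMemcpy(&total, d_partial, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("sum = %d, expected %zu\n", total, n);

    cudaFree(d_in);
    cudaFree(d_partial);
}
```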
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce_by_key.h - * \brief CUDA implementation of reduce_by_key - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template - thrust::pair - reduce_by_key(execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/reduce_by_key.inl b/compat/thrust/system/cuda/detail/reduce_by_key.inl deleted file mode 100644 index 18dc1e4994..0000000000 --- a/compat/thrust/system/cuda/detail/reduce_by_key.inl +++ /dev/null @@ -1,705 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace reduce_by_key_detail -{ - -template -struct tail_flag_functor -{ - BinaryPredicate binary_pred; // NB: this must be the first member for performance reasons - IndexType n; - - typedef FlagType result_type; - - tail_flag_functor(IndexType n, BinaryPredicate binary_pred) - : n(n), binary_pred(binary_pred) - {} - - // XXX why is this noticably faster? 
(it may read past the end of input) - //FlagType operator()(const thrust::tuple& t) const - - template - __host__ __device__ __thrust_forceinline__ - FlagType operator()(const Tuple& t) - { - if (thrust::get<0>(t) == (n - 1) || !binary_pred(thrust::get<1>(t), thrust::get<2>(t))) - return 1; - else - return 0; - } -}; - - -template -__device__ __thrust_forceinline__ -FlagType load_flags(Context context, - const unsigned int n, - FlagIterator iflags, - FlagType (&sflag)[CTA_SIZE]) -{ - FlagType flag_bits = 0; - - // load flags in unordered fashion - for(unsigned int k = 0; k < K; k++) - { - const unsigned int offset = k*CTA_SIZE + context.thread_index(); - - if (FullBlock || offset < n) - { - FlagIterator temp = iflags + offset; - if (*temp) - flag_bits |= FlagType(1) << k; - } - } - - sflag[context.thread_index()] = flag_bits; - - context.barrier(); - - flag_bits = 0; - - // obtain flags for iflags[K * context.thread_index(), K * context.thread_index() + K) - for(unsigned int k = 0; k < K; k++) - { - const unsigned int offset = K * context.thread_index() + k; - - if (FullBlock || offset < n) - { - flag_bits |= ((sflag[offset % CTA_SIZE] >> (offset / CTA_SIZE)) & FlagType(1)) << k; - } - } - - context.barrier(); - - sflag[context.thread_index()] = flag_bits; - - context.barrier(); - - return flag_bits; -} - -template -__device__ __thrust_forceinline__ -void load_values(Context context, - const unsigned int n, - InputIterator2 ivals, - ValueType (&sdata)[K][CTA_SIZE + 1]) -{ - for(unsigned int k = 0; k < K; k++) - { - const unsigned int offset = k*CTA_SIZE + context.thread_index(); - - if (FullBlock || offset < n) - { - InputIterator2 temp = ivals + offset; - sdata[offset % K][offset / K] = *temp; - } - } - - context.barrier(); -} - - -template -__device__ __thrust_forceinline__ -void reduce_by_key_body(Context context, - const unsigned int n, - InputIterator1 ikeys, - InputIterator2 ivals, - OutputIterator1 okeys, - OutputIterator2 ovals, - BinaryPredicate binary_pred, - BinaryFunction binary_op, - FlagIterator iflags, - FlagType (&sflag)[CTA_SIZE], - ValueType (&sdata)[K][CTA_SIZE + 1], - bool& carry_in, - IndexType& carry_index, - ValueType& carry_value) -{ - // load flags - const FlagType flag_bits = load_flags(context, n, iflags, sflag); - const FlagType flag_count = __popc(flag_bits); // TODO hide this behind a template - const FlagType left_flag = (context.thread_index() == 0) ? 0 : sflag[context.thread_index() - 1]; - const FlagType head_flag = (context.thread_index() == 0 || flag_bits & ((1 << (K - 1)) - 1) || left_flag & (1 << (K - 1))) ? 1 : 0; - - context.barrier(); - - // scan flag counts - sflag[context.thread_index()] = flag_count; context.barrier(); - - block::inclusive_scan(context, sflag, thrust::plus()); - - const FlagType output_position = (context.thread_index() == 0) ? 
0 : sflag[context.thread_index() - 1]; - const FlagType num_outputs = sflag[CTA_SIZE - 1]; - - context.barrier(); - - // shuffle keys and write keys out - if (!thrust::detail::is_discard_iterator::value) - { - // XXX this could be improved - for (unsigned int i = 0; i < num_outputs; i += CTA_SIZE) - { - FlagType position = output_position; - - for(unsigned int k = 0; k < K; k++) - { - if (flag_bits & (FlagType(1) << k)) - { - if (i <= position && position < i + CTA_SIZE) - sflag[position - i] = K * context.thread_index() + k; - position++; - } - } - - context.barrier(); - - if (i + context.thread_index() < num_outputs) - { - InputIterator1 tmp1 = ikeys + sflag[context.thread_index()]; - OutputIterator1 tmp2 = okeys + (i + context.thread_index()); - *tmp2 = *tmp1; - } - - context.barrier(); - } - } - - // load values - load_values (context, n, ivals, sdata); - - ValueType ldata[K]; - for (unsigned int k = 0; k < K; k++) - ldata[k] = sdata[k][context.thread_index()]; - - // carry in (if necessary) - if (context.thread_index() == 0 && carry_in) - { - // XXX WAR sm_10 issue - ValueType tmp1 = carry_value; - ldata[0] = binary_op(tmp1, ldata[0]); - } - - context.barrier(); - - // sum local values - { - for(unsigned int k = 1; k < K; k++) - { - const unsigned int offset = K * context.thread_index() + k; - - if (FullBlock || offset < n) - { - if (!(flag_bits & (FlagType(1) << (k - 1)))) - ldata[k] = binary_op(ldata[k - 1], ldata[k]); - } - } - } - - // second level segmented scan - { - // use head flags for segmented scan - sflag[context.thread_index()] = head_flag; sdata[K - 1][context.thread_index()] = ldata[K - 1]; context.barrier(); - - if (FullBlock) - block::inclusive_scan_by_flag(context, sflag, sdata[K-1], binary_op); - else - block::inclusive_scan_by_flag_n(context, sflag, sdata[K-1], n, binary_op); - } - - // update local values - if (context.thread_index() > 0) - { - unsigned int update_bits = (flag_bits << 1) | (left_flag >> (K - 1)); -// TODO remove guard -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC - unsigned int update_count = __ffs(update_bits) - 1u; // NB: this might wrap around to UINT_MAX -#else - unsigned int update_count = 0; -#endif // THRUST_DEVICE_COMPILER_NVCC - - if (!FullBlock && (K + 1) * context.thread_index() > n) - update_count = thrust::min(n - K * context.thread_index(), update_count); - - ValueType left = sdata[K - 1][context.thread_index() - 1]; - - for(unsigned int k = 0; k < K; k++) - { - if (k < update_count) - ldata[k] = binary_op(left, ldata[k]); - } - } - - context.barrier(); - - // store carry out - if (FullBlock) - { - if (context.thread_index() == CTA_SIZE - 1) - { - carry_value = ldata[K - 1]; - carry_in = (flag_bits & (FlagType(1) << (K - 1))) ? false : true; - carry_index = num_outputs; - } - } - else - { - if (context.thread_index() == (n - 1) / K) - { - for (unsigned int k = 0; k < K; k++) - if (k == (n - 1) % K) - carry_value = ldata[k]; - carry_in = (flag_bits & (FlagType(1) << ((n - 1) % K))) ? 
false : true; - carry_index = num_outputs; - } - } - - // shuffle values - { - FlagType position = output_position; - - for(unsigned int k = 0; k < K; k++) - { - const unsigned int offset = K * context.thread_index() + k; - - if (FullBlock || offset < n) - { - if (flag_bits & (FlagType(1) << k)) - { - sdata[position / CTA_SIZE][position % CTA_SIZE] = ldata[k]; - position++; - } - } - } - } - - context.barrier(); - - - // write values out - for(unsigned int k = 0; k < K; k++) - { - const unsigned int offset = CTA_SIZE * k + context.thread_index(); - - if (offset < num_outputs) - { - OutputIterator2 tmp = ovals + offset; - *tmp = sdata[k][context.thread_index()]; - } - } - - context.barrier(); -} - -template -struct reduce_by_key_closure -{ - InputIterator1 ikeys; - InputIterator2 ivals; - OutputIterator1 okeys; - OutputIterator2 ovals; - BinaryPredicate binary_pred; - BinaryFunction binary_op; - FlagIterator iflags; - IndexIterator interval_counts; - ValueIterator interval_values; - BoolIterator interval_carry; - Decomposition decomp; - Context context; - - typedef Context context_type; - - reduce_by_key_closure(InputIterator1 ikeys, - InputIterator2 ivals, - OutputIterator1 okeys, - OutputIterator2 ovals, - BinaryPredicate binary_pred, - BinaryFunction binary_op, - FlagIterator iflags, - IndexIterator interval_counts, - ValueIterator interval_values, - BoolIterator interval_carry, - Decomposition decomp, - Context context = Context()) - : ikeys(ikeys), ivals(ivals), okeys(okeys), ovals(ovals), binary_pred(binary_pred), binary_op(binary_op), - iflags(iflags), interval_counts(interval_counts), interval_values(interval_values), interval_carry(interval_carry), - decomp(decomp), context(context) {} - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename thrust::iterator_value::type KeyType; - typedef typename thrust::iterator_value::type ValueType; - typedef typename Decomposition::index_type IndexType; - typedef typename thrust::iterator_value::type FlagType; - - const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value; - -// TODO centralize this mapping (__CUDA_ARCH__ -> smem bytes) -#if __CUDA_ARCH__ >= 200 - const unsigned int SMEM = (48 * 1024); -#else - const unsigned int SMEM = (16 * 1024) - 256; -#endif - const unsigned int SMEM_FIXED = CTA_SIZE * sizeof(FlagType) + sizeof(ValueType) + sizeof(IndexType) + sizeof(bool); - const unsigned int BOUND_1 = (SMEM - SMEM_FIXED) / ((CTA_SIZE + 1) * sizeof(ValueType)); - const unsigned int BOUND_2 = 8 * sizeof(FlagType); - const unsigned int BOUND_3 = 6; - - // TODO replace this with a static_min::value - const unsigned int K = (BOUND_1 < BOUND_2) ? (BOUND_1 < BOUND_3 ? BOUND_1 : BOUND_3) : (BOUND_2 < BOUND_3 ? 
BOUND_2 : BOUND_3); - - __shared__ detail::uninitialized sflag; - __shared__ detail::uninitialized sdata; // padded to avoid bank conflicts - - __shared__ detail::uninitialized carry_value; // storage for carry in and carry out - __shared__ detail::uninitialized carry_index; - __shared__ detail::uninitialized carry_in; - - typename Decomposition::range_type interval = decomp[context.block_index()]; - //thrust::system::detail::internal::index_range interval = decomp[context.block_index()]; - - - if (context.thread_index() == 0) - { - carry_in = false; // act as though the previous segment terminated just before us - - if (context.block_index() == 0) - { - carry_index = 0; - } - else - { - interval_counts += (context.block_index() - 1); - carry_index = *interval_counts; - } - } - - context.barrier(); - - IndexType base = interval.begin(); - - // advance input and output iterators - ikeys += base; - ivals += base; - iflags += base; - okeys += carry_index; - ovals += carry_index; - - const unsigned int unit_size = K * CTA_SIZE; - - // process full units - while (base + unit_size <= interval.end()) - { - const unsigned int n = unit_size; - reduce_by_key_body(context, n, ikeys, ivals, okeys, ovals, binary_pred, binary_op, iflags, sflag.get(), sdata.get(), carry_in.get(), carry_index.get(), carry_value.get()); - base += unit_size; - ikeys += unit_size; - ivals += unit_size; - iflags += unit_size; - okeys += carry_index; - ovals += carry_index; - } - - // process partially full unit at end of input (if necessary) - if (base < interval.end()) - { - const unsigned int n = interval.end() - base; - reduce_by_key_body(context, n, ikeys, ivals, okeys, ovals, binary_pred, binary_op, iflags, sflag.get(), sdata.get(), carry_in.get(), carry_index.get(), carry_value.get()); - } - - if (context.thread_index() == 0) - { - interval_values += context.block_index(); - interval_carry += context.block_index(); - *interval_values = carry_value; - *interval_carry = carry_in; - } - } -}; // end reduce_by_key_closure - -template -struct DefaultPolicy -{ - // typedefs - typedef unsigned int FlagType; - typedef typename thrust::iterator_traits::difference_type IndexType; - typedef typename thrust::iterator_traits::value_type KeyType; - typedef thrust::system::detail::internal::uniform_decomposition Decomposition; - - // the pseudocode for deducing the type of the temporary used below: - // - // if BinaryFunction is AdaptableBinaryFunction - // TemporaryType = AdaptableBinaryFunction::result_type - // else if OutputIterator2 is a "pure" output iterator - // TemporaryType = InputIterator2::value_type - // else - // TemporaryType = OutputIterator2::value_type - // - // XXX upon c++0x, TemporaryType needs to be: - // result_of::type - - typedef typename thrust::detail::eval_if< - thrust::detail::has_result_type::value, - thrust::detail::result_type, - thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - >::type ValueType; - - // XXX WAR problem on sm_11 - // TODO tune this - const static unsigned int ThreadsPerBlock = (thrust::detail::is_pod::value) ? 
256 : 192; - - DefaultPolicy(InputIterator1 first1, InputIterator1 last1) - : decomp(default_decomposition(last1 - first1)) - {} - - // member variables - Decomposition decomp; -}; - -template - thrust::pair - reduce_by_key(execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op, - Policy policy) -{ - typedef typename Policy::FlagType FlagType; - typedef typename Policy::Decomposition Decomposition; - typedef typename Policy::IndexType IndexType; - typedef typename Policy::KeyType KeyType; - typedef typename Policy::ValueType ValueType; - - // temporary arrays - typedef thrust::detail::temporary_array IndexArray; - typedef thrust::detail::temporary_array KeyArray; - typedef thrust::detail::temporary_array ValueArray; - typedef thrust::detail::temporary_array BoolArray; - - Decomposition decomp = policy.decomp; - - // input size - IndexType n = keys_last - keys_first; - - if (n == 0) - return thrust::make_pair(keys_output, values_output); - - IndexArray interval_counts(exec, decomp.size()); - ValueArray interval_values(exec, decomp.size()); - BoolArray interval_carry(exec, decomp.size()); - - // an ode to c++11 auto - typedef thrust::counting_iterator CountingIterator; - typedef thrust::transform_iterator< - tail_flag_functor, - thrust::zip_iterator< - thrust::tuple - > - > FlagIterator; - - FlagIterator iflag= thrust::make_transform_iterator - (thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), keys_first, keys_first + 1)), - tail_flag_functor(n, binary_pred)); - - // count number of tail flags per interval - thrust::system::cuda::detail::reduce_intervals(exec, iflag, interval_counts.begin(), thrust::plus(), decomp); - - thrust::inclusive_scan(exec, - interval_counts.begin(), interval_counts.end(), - interval_counts.begin(), - thrust::plus()); - - // determine output size - const IndexType N = interval_counts[interval_counts.size() - 1]; - - const static unsigned int ThreadsPerBlock = Policy::ThreadsPerBlock; - typedef typename IndexArray::iterator IndexIterator; - typedef typename ValueArray::iterator ValueIterator; - typedef typename BoolArray::iterator BoolIterator; - typedef detail::statically_blocked_thread_array Context; - typedef reduce_by_key_closure Closure; - Closure closure - (keys_first, values_first, - keys_output, values_output, - binary_pred, binary_op, - iflag, - interval_counts.begin(), - interval_values.begin(), - interval_carry.begin(), - decomp); - detail::launch_closure(closure, decomp.size(), ThreadsPerBlock); - - if (decomp.size() > 1) - { - ValueArray interval_values2(exec, decomp.size()); - IndexArray interval_counts2(exec, decomp.size()); - BoolArray interval_carry2(exec, decomp.size()); - - IndexType N2 = - thrust::reduce_by_key - (exec, - thrust::make_zip_iterator(thrust::make_tuple(interval_counts.begin(), interval_carry.begin())), - thrust::make_zip_iterator(thrust::make_tuple(interval_counts.end(), interval_carry.end())), - interval_values.begin(), - thrust::make_zip_iterator(thrust::make_tuple(interval_counts2.begin(), interval_carry2.begin())), - interval_values2.begin(), - thrust::equal_to< thrust::tuple >(), - binary_op).first - - - thrust::make_zip_iterator(thrust::make_tuple(interval_counts2.begin(), interval_carry2.begin())); - - thrust::transform_if - (exec, - interval_values2.begin(), interval_values2.begin() + N2, - 
thrust::make_permutation_iterator(values_output, interval_counts2.begin()), - interval_carry2.begin(), - thrust::make_permutation_iterator(values_output, interval_counts2.begin()), - binary_op, - thrust::identity()); - } - - return thrust::make_pair(keys_output + N, values_output + N); -} - -} // end namespace reduce_by_key_detail - - -template - thrust::pair - reduce_by_key(execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - return reduce_by_key_detail::reduce_by_key - (exec, - keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op, - reduce_by_key_detail::DefaultPolicy(keys_first, keys_last)); -} // end reduce_by_key() - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END - diff --git a/compat/thrust/system/cuda/detail/reduce_intervals.h b/compat/thrust/system/cuda/detail/reduce_intervals.h deleted file mode 100644 index 505d13635c..0000000000 --- a/compat/thrust/system/cuda/detail/reduce_intervals.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce_intervals.h - * \brief CUDA implementations of reduce_intervals algorithms. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template -void reduce_intervals(execution_policy &exec, - InputIterator input, - OutputIterator output, - BinaryFunction binary_op, - Decomposition decomp); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/reduce_intervals.inl b/compat/thrust/system/cuda/detail/reduce_intervals.inl deleted file mode 100644 index 2381769223..0000000000 --- a/compat/thrust/system/cuda/detail/reduce_intervals.inl +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
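The reduce_by_key.inl removed above splits the input per block and then stitches the per-block carries back together; the contract all of that kernel code has to reproduce is that of a simple serial reduce_by_key. A host-side reference sketch of those semantics follows (illustrative helper names, not Thrust's API).

// Serial reference for reduce_by_key semantics: runs of consecutive equal
// keys collapse to one output key, and their values are combined with op.
// This only models the behavior the deleted CUDA kernels compute.
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

template <typename Key, typename Value, typename Pred, typename Op>
std::pair<std::vector<Key>, std::vector<Value>>
reduce_by_key_reference(const std::vector<Key>& keys,
                        const std::vector<Value>& vals,
                        Pred equal, Op op)
{
    std::vector<Key> out_keys;
    std::vector<Value> out_vals;
    for (size_t i = 0; i < keys.size(); ++i) {
        if (i == 0 || !equal(keys[i - 1], keys[i])) {   // head of a new segment
            out_keys.push_back(keys[i]);
            out_vals.push_back(vals[i]);
        } else {                                        // continue current segment
            out_vals.back() = op(out_vals.back(), vals[i]);
        }
    }
    return std::make_pair(out_keys, out_vals);
}

int main()
{
    std::vector<int> keys = {1, 1, 2, 2, 2, 3};
    std::vector<int> vals = {1, 2, 3, 4, 5, 6};
    auto r = reduce_by_key_reference(keys, vals, std::equal_to<int>(), std::plus<int>());
    for (size_t i = 0; i < r.first.size(); ++i)
        printf("key %d -> %d\n", r.first[i], r.second[i]);   // 1->3, 2->12, 3->6
    return 0;
}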
- */ - -#include - -#include - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template -struct commutative_reduce_intervals_closure -{ - InputIterator input; - OutputIterator output; - BinaryFunction binary_op; - Decomposition decomposition; - unsigned int shared_array_size; - - typedef Context context_type; - context_type context; - - commutative_reduce_intervals_closure(InputIterator input, OutputIterator output, BinaryFunction binary_op, Decomposition decomposition, unsigned int shared_array_size, Context context = Context()) - : input(input), output(output), binary_op(binary_op), decomposition(decomposition), shared_array_size(shared_array_size), context(context) {} - - __device__ __thrust_forceinline__ - void operator()(void) - { - typedef typename thrust::iterator_value::type OutputType; - extern_shared_ptr shared_array; - - typedef typename Decomposition::index_type index_type; - - // this block processes results in [range.begin(), range.end()) - thrust::system::detail::internal::index_range range = decomposition[context.block_index()]; - - index_type i = range.begin() + context.thread_index(); - - input += i; - - if (range.size() < context.block_dimension()) - { - // compute reduction with the first shared_array_size threads - if (context.thread_index() < thrust::min(shared_array_size,range.size())) - { - OutputType sum = *input; - - i += shared_array_size; - input += shared_array_size; - - while (i < range.end()) - { - OutputType val = *input; - - sum = binary_op(sum, val); - - i += shared_array_size; - input += shared_array_size; - } - - shared_array[context.thread_index()] = sum; - } - } - else - { - // compute reduction with all blockDim.x threads - OutputType sum = *input; - - i += context.block_dimension(); - input += context.block_dimension(); - - while (i < range.end()) - { - OutputType val = *input; - - sum = binary_op(sum, val); - - i += context.block_dimension(); - input += context.block_dimension(); - } - - // write first shared_array_size values into shared memory - if (context.thread_index() < shared_array_size) - shared_array[context.thread_index()] = sum; - - // accumulate remaining values (if any) to shared memory in stages - if (context.block_dimension() > shared_array_size) - { - unsigned int lb = shared_array_size; - unsigned int ub = shared_array_size + lb; - - while (lb < context.block_dimension()) - { - context.barrier(); - - if (lb <= context.thread_index() && context.thread_index() < ub) - { - OutputType tmp = shared_array[context.thread_index() - lb]; - shared_array[context.thread_index() - lb] = binary_op(tmp, sum); - } - - lb += shared_array_size; - ub += shared_array_size; - } - } - } - - context.barrier(); - - block::reduce_n(context, shared_array, thrust::min(range.size(), shared_array_size), binary_op); - - if (context.thread_index() == 0) - { - output += context.block_index(); - *output = shared_array[0]; - } - } -}; - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN - -template -void reduce_intervals(execution_policy &, - InputIterator input, - OutputIterator output, - BinaryFunction binary_op, - Decomposition decomp) -{ - // we're attempting to launch a kernel, assert we're compiling with nvcc - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to compile your code using nvcc, rather than g++ or cl.exe X - // 
======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - - if (decomp.size() == 0) - return; - - // TODO if (decomp.size() > deviceProperties.maxGridSize[0]) throw cuda exception (or handle general case) - - typedef detail::blocked_thread_array Context; - typedef commutative_reduce_intervals_closure Closure; - typedef typename thrust::iterator_value::type OutputType; - - detail::launch_calculator calculator; - - thrust::tuple config = calculator.with_variable_block_size_available_smem(); - - //size_t max_blocks = thrust::get<0>(config); - size_t block_size = thrust::get<1>(config); - size_t max_memory = thrust::get<2>(config); - - // determine shared array size - size_t shared_array_size = thrust::min(max_memory / sizeof(OutputType), block_size); - size_t shared_array_bytes = sizeof(OutputType) * shared_array_size; - - // TODO if (shared_array_size < 1) throw cuda exception "insufficient shared memory" - - Closure closure(input, output, binary_op, decomp, shared_array_size); - detail::launch_closure(closure, decomp.size(), block_size, shared_array_bytes); -} - -__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/remove.h b/compat/thrust/system/cuda/detail/remove.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/remove.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/replace.h b/compat/thrust/system/cuda/detail/replace.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/replace.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
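The reduce_intervals implementation above assigns one block to each interval of a uniform decomposition of the input. The decomposition itself is simple bookkeeping; the sketch below shows one way n elements might be split into p near-equal contiguous ranges. It is only an illustration of the idea, and Thrust's own uniform_decomposition may distribute the remainder differently.

// Splitting n elements into p contiguous intervals, each handled by one
// block in the deleted reduce_intervals kernel. Illustrative helper only.
#include <cstdio>
#include <utility>

// returns [begin, end) of interval i when n elements are split into p parts
std::pair<long, long> interval(long n, long p, long i)
{
    long grain = (n + p - 1) / p;                    // elements per interval, rounded up
    long begin = i * grain;
    long end   = (begin + grain < n) ? begin + grain : n;
    return std::make_pair(begin, end);
}

int main()
{
    long n = 10, p = 3;
    for (long i = 0; i < p; ++i) {
        std::pair<long, long> r = interval(n, p, i);
        printf("interval %ld: [%ld, %ld)\n", i, r.first, r.second);
    }
    // interval 0: [0, 4)   interval 1: [4, 8)   interval 2: [8, 10)
    return 0;
}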
- */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/reverse.h b/compat/thrust/system/cuda/detail/reverse.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/reverse.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/runtime_introspection.h b/compat/thrust/system/cuda/detail/runtime_introspection.h deleted file mode 100644 index 39f6c9fadc..0000000000 --- a/compat/thrust/system/cuda/detail/runtime_introspection.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file runtime_introspection.h - * \brief Defines the interface to functions - * providing introspection into the architecture - * of CUDA devices. - */ - -#pragma once - -#include - -// #include this for device_properties_t and function_attributes_t -#include - -// #include this for size_t -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -/*! Returns the current device ordinal. - */ -inline int current_device(); - -/*! Returns a copy of the device_properties_t structure - * that is associated with a given device. - */ -inline device_properties_t device_properties(int device_id); - -/*! Returns a copy of the device_properties_t structure - * that is associated with the current device. - */ -inline device_properties_t device_properties(void); - -/*! Returns a copy of the function_attributes_t structure - * that is associated with a given __global__ function - */ -template -inline function_attributes_t function_attributes(KernelFunction kernel); - -/*! Returns the compute capability of a device in integer format. 
- * For example, returns 10 for sm_10 and 21 for sm_21 - * \return The compute capability as an integer - */ -inline size_t compute_capability(const device_properties_t &properties); -inline size_t compute_capability(void); - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/runtime_introspection.inl b/compat/thrust/system/cuda/detail/runtime_introspection.inl deleted file mode 100644 index a5cc382964..0000000000 --- a/compat/thrust/system/cuda/detail/runtime_introspection.inl +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace runtime_introspection_detail -{ - - -inline void get_device_properties(device_properties_t &p, int device_id) -{ - cudaDeviceProp properties; - - cudaError_t error = cudaGetDeviceProperties(&properties, device_id); - - if(error) - throw thrust::system_error(error, thrust::cuda_category()); - - // be careful about how this is initialized! - device_properties_t temp = { - properties.major, - { - properties.maxGridSize[0], - properties.maxGridSize[1], - properties.maxGridSize[2] - }, - properties.maxThreadsPerBlock, - properties.maxThreadsPerMultiProcessor, - properties.minor, - properties.multiProcessorCount, - properties.regsPerBlock, - properties.sharedMemPerBlock, - properties.warpSize - }; - - p = temp; -} // end get_device_properties() - - -} // end runtime_introspection_detail - - -inline device_properties_t device_properties(int device_id) -{ - // cache the result of get_device_properties, because it is slow - // only cache the first few devices - static const int max_num_devices = 16; - - static bool properties_exist[max_num_devices] = {0}; - static device_properties_t device_properties[max_num_devices] = {}; - - if(device_id >= max_num_devices) - { - device_properties_t result; - runtime_introspection_detail::get_device_properties(result, device_id); - return result; - } - - if(!properties_exist[device_id]) - { - runtime_introspection_detail::get_device_properties(device_properties[device_id], device_id); - - // disallow the compiler to move the write to properties_exist[device_id] - // before the initialization of device_properties[device_id] - __thrust_compiler_fence(); - - properties_exist[device_id] = true; - } - - return device_properties[device_id]; -} - -inline int current_device() -{ - int result = -1; - - cudaError_t error = cudaGetDevice(&result); - - if(error) - throw thrust::system_error(error, thrust::cuda_category()); - - if(result < 0) - throw thrust::system_error(cudaErrorNoDevice, thrust::cuda_category()); - - return result; -} - -inline device_properties_t device_properties(void) -{ - return device_properties(current_device()); -} - -template -inline function_attributes_t 
function_attributes(KernelFunction kernel) -{ -// cudaFuncGetAttributes(), used below, only exists when __CUDACC__ is defined -#ifdef __CUDACC__ - typedef void (*fun_ptr_type)(); - - fun_ptr_type fun_ptr = reinterpret_cast(kernel); - - cudaFuncAttributes attributes; - - cudaError_t error = cudaFuncGetAttributes(&attributes, fun_ptr); - - if(error) - { - throw thrust::system_error(error, thrust::cuda_category()); - } - - // be careful about how this is initialized! - function_attributes_t result = { - attributes.constSizeBytes, - attributes.localSizeBytes, - attributes.maxThreadsPerBlock, - attributes.numRegs, - attributes.sharedSizeBytes - }; - - return result; -#else - return function_attributes_t(); -#endif // __CUDACC__ -} - -inline size_t compute_capability(const device_properties_t &properties) -{ - return 10 * properties.major + properties.minor; -} - -inline size_t compute_capability(void) -{ - return compute_capability(device_properties()); -} - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/scan.h b/compat/thrust/system/cuda/detail/scan.h deleted file mode 100644 index 036c89a217..0000000000 --- a/compat/thrust/system/cuda/detail/scan.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file scan.h - * \brief Scan operations (parallel prefix-sum) [cuda] - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template - OutputIterator inclusive_scan(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op); - -template - OutputIterator exclusive_scan(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init, - AssociativeOperator binary_op); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/scan.inl b/compat/thrust/system/cuda/detail/scan.inl deleted file mode 100644 index 9d9c6d20ee..0000000000 --- a/compat/thrust/system/cuda/detail/scan.inl +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file scan.inl - * \brief Inline file for scan.h. 
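The runtime_introspection code removed above caches the result of cudaGetDeviceProperties() and encodes compute capability as 10 * major + minor (so sm_21 becomes 21). A minimal standalone query using the same CUDA runtime calls, without the caching or the Thrust error wrappers, looks like this:

// Query the current device and encode its compute capability the same way
// the deleted runtime_introspection.inl does: 10 * major + minor.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int dev = 0;
    if (cudaGetDevice(&dev) != cudaSuccess) {
        fprintf(stderr, "no CUDA device\n");
        return 1;
    }
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties failed\n");
        return 1;
    }
    int compute_capability = 10 * prop.major + prop.minor;   // e.g. sm_52 -> 52
    printf("device %d: %s, sm_%d, %zu bytes smem/block, %d threads/block max\n",
           dev, prop.name, compute_capability,
           prop.sharedMemPerBlock, prop.maxThreadsPerBlock);
    return 0;
}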
- */ - -#include -#include - -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template - OutputIterator inclusive_scan(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op) -{ - // we're attempting to launch a kernel, assert we're compiling with nvcc - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to compile your code using nvcc, rather than g++ or cl.exe X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - - return thrust::system::cuda::detail::detail::fast_scan::inclusive_scan(exec, first, last, result, binary_op); -} - -template - OutputIterator exclusive_scan(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init, - AssociativeOperator binary_op) -{ - // we're attempting to launch a kernel, assert we're compiling with nvcc - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to compile your code using nvcc, rather than g++ or cl.exe X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - - return thrust::system::cuda::detail::detail::fast_scan::exclusive_scan(exec, first, last, result, init, binary_op); -} - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/scan_by_key.h b/compat/thrust/system/cuda/detail/scan_by_key.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/scan_by_key.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/scatter.h b/compat/thrust/system/cuda/detail/scatter.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/scatter.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
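The scan.h/scan.inl pair removed above only forwards to the fast_scan kernels; the contract being implemented is the usual prefix sum, with the inclusive/exclusive distinction and an init seed for the exclusive form. A serial reference for that contract (a behavioral model only, not the Thrust implementation):

// inclusive_scan includes element i in result[i]; exclusive_scan shifts by
// one position and seeds the first output with init. Host-side model only.
#include <cstdio>
#include <vector>

template <typename T, typename Op>
std::vector<T> inclusive_scan_ref(const std::vector<T>& in, Op op)
{
    std::vector<T> out(in.size());
    T acc = T();
    for (size_t i = 0; i < in.size(); ++i)
        out[i] = acc = (i == 0) ? in[0] : op(acc, in[i]);
    return out;
}

template <typename T, typename Op>
std::vector<T> exclusive_scan_ref(const std::vector<T>& in, T init, Op op)
{
    std::vector<T> out(in.size());
    T acc = init;
    for (size_t i = 0; i < in.size(); ++i) {
        out[i] = acc;            // result excludes in[i]
        acc = op(acc, in[i]);
    }
    return out;
}

int main()
{
    std::vector<int> v = {3, 1, 4, 1, 5};
    auto add = [](int a, int b) { return a + b; };
    auto inc = inclusive_scan_ref(v, add);
    auto exc = exclusive_scan_ref(v, 0, add);
    for (size_t i = 0; i < v.size(); ++i)
        printf("%zu: inclusive %d, exclusive %d\n", i, inc[i], exc[i]);
    // inclusive: 3 4 8 9 14    exclusive: 0 3 4 8 9
    return 0;
}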
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/sequence.h b/compat/thrust/system/cuda/detail/sequence.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/sequence.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/set_difference.inl b/compat/thrust/system/cuda/detail/set_difference.inl deleted file mode 100644 index 33d9884730..0000000000 --- a/compat/thrust/system/cuda/detail/set_difference.inl +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace set_difference_detail -{ - - -struct serial_bounded_set_difference -{ - // max_input_size <= 32 - template - inline __device__ - thrust::detail::uint32_t operator()(Size max_input_size, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - Compare comp) - { - thrust::detail::uint32_t active_mask = 0; - thrust::detail::uint32_t active_bit = 1; - - while(first1 != last1 && first2 != last2) - { - if(comp(*first1,*first2)) - { - *result = *first1; - active_mask |= active_bit; - ++first1; - } // end if - else if(comp(*first2,*first1)) - { - ++first2; - } // end else if - else - { - ++first1; - ++first2; - } // end else - - ++result; - active_bit <<= 1; - } // end while - - while(first1 != last1) - { - *result = *first1; - ++first1; - ++result; - active_mask |= active_bit; - active_bit <<= 1; - } - - return active_mask; - } - - - template - inline __device__ - Size count(Size max_input_size, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - Compare comp) - { - Size result = 0; - - while(first1 != last1 && first2 != last2) - { - if(comp(*first1,*first2)) - { - ++first1; - ++result; - } // end if - else if(comp(*first2,*first1)) - { - ++first2; - } // end else if - else - { - ++first1; - ++first2; - } // end else - } // end while - - return result + last1 - first1; - } -}; // end serial_bounded_set_difference - - -} // end namespace set_difference_detail - - -template -RandomAccessIterator3 set_difference(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp) -{ - return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_difference_detail::serial_bounded_set_difference()); -} // end set_difference - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/set_intersection.inl b/compat/thrust/system/cuda/detail/set_intersection.inl deleted file mode 100644 index e4810b6d1b..0000000000 --- a/compat/thrust/system/cuda/detail/set_intersection.inl +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
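The serial_bounded_set_difference removed above handles at most 32 elements per thread and returns a 32-bit mask marking which output slots hold real results; a later compaction keeps only the flagged slots. The sketch below reproduces that mask-then-compact idea on the host with plain arrays and illustrative names.

// Bounded set_difference that records valid output slots in a 32-bit mask,
// mirroring the deleted device functor, plus the compaction a caller would do.
#include <cstdint>
#include <cstdio>
#include <vector>

uint32_t bounded_set_difference(const int* a, int na, const int* b, int nb, int* out)
{
    uint32_t mask = 0, bit = 1;
    int i = 0, j = 0, k = 0;
    while (i < na && j < nb) {
        if (a[i] < b[j])      { out[k] = a[i]; mask |= bit; ++i; }  // only in a -> keep
        else if (b[j] < a[i]) { ++j; }                              // only in b -> empty slot
        else                  { ++i; ++j; }                         // in both   -> empty slot
        ++k; bit <<= 1;                                             // slot advances every step
    }
    while (i < na) { out[k++] = a[i++]; mask |= bit; bit <<= 1; }   // tail of a is kept
    return mask;
}

int main()
{
    int a[] = {1, 3, 5, 7}, b[] = {3, 4, 5}, out[32];
    uint32_t mask = bounded_set_difference(a, 4, b, 3, out);

    std::vector<int> diff;                           // compact: keep flagged slots only
    for (int s = 0; s < 32; ++s)
        if (mask & (1u << s)) diff.push_back(out[s]);
    for (size_t i = 0; i < diff.size(); ++i) printf("%d ", diff[i]);   // prints: 1 7
    printf("\n");
    return 0;
}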
- */ - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace set_intersection_detail -{ - - -struct serial_bounded_set_intersection -{ - // max_input_size <= 32 - template - inline __device__ - thrust::detail::uint32_t operator()(Size max_input_size, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - Compare comp) - { - thrust::detail::uint32_t active_mask = 0; - thrust::detail::uint32_t active_bit = 1; - - while(first1 != last1 && first2 != last2) - { - if(comp(*first1,*first2)) - { - ++first1; - } // end if - else if(comp(*first2,*first1)) - { - ++first2; - } // end else if - else - { - *result = *first1; - ++first1; - ++first2; - active_mask |= active_bit; - } // end else - - ++result; - active_bit <<= 1; - } // end while - - return active_mask; - } - - - template - inline __device__ - Size count(Size max_input_size, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - Compare comp) - { - Size result = 0; - - while(first1 != last1 && first2 != last2) - { - if(comp(*first1,*first2)) - { - ++first1; - } // end if - else if(comp(*first2,*first1)) - { - ++first2; - } // end else if - else - { - ++result; - ++first1; - ++first2; - } // end else - } // end while - - return result; - } -}; // end serial_bounded_set_intersection - - -} // end namespace set_intersection_detail - - -template -RandomAccessIterator3 set_intersection(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp) -{ - return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_intersection_detail::serial_bounded_set_intersection()); -} // end set_intersection - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/set_operations.h b/compat/thrust/system/cuda/detail/set_operations.h deleted file mode 100644 index 040e3419ce..0000000000 --- a/compat/thrust/system/cuda/detail/set_operations.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -template -RandomAccessIterator3 set_difference(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp); - - -template -RandomAccessIterator3 set_intersection(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp); - - -template -RandomAccessIterator3 set_symmetric_difference(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp); - - -template -RandomAccessIterator3 set_union(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp); - - -} // end detail -} // end cuda -} // end system -} // end thrust - -#include -#include -#include -#include - diff --git a/compat/thrust/system/cuda/detail/set_symmetric_difference.inl b/compat/thrust/system/cuda/detail/set_symmetric_difference.inl deleted file mode 100644 index 112c955bc0..0000000000 --- a/compat/thrust/system/cuda/detail/set_symmetric_difference.inl +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace set_symmetric_difference_detail -{ - - -struct serial_bounded_set_symmetric_difference -{ - // max_input_size <= 32 - template - inline __device__ - thrust::detail::uint32_t operator()(Size max_input_size, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - Compare comp) - { - thrust::detail::uint32_t active_mask = 0; - thrust::detail::uint32_t active_bit = 1; - - while(first1 != last1 && first2 != last2) - { - if(comp(*first1,*first2)) - { - *result = *first1; - active_mask |= active_bit; - ++first1; - } // end if - else if(comp(*first2,*first1)) - { - *result = *first2; - active_mask |= active_bit; - ++first2; - } // end else if - else - { - ++first1; - ++first2; - } // end else - - ++result; - active_bit <<= 1; - } // end while - - while(first1 != last1) - { - *result = *first1; - ++first1; - ++result; - active_mask |= active_bit; - active_bit <<= 1; - } - - while(first2 != last2) - { - *result = *first2; - ++first2; - ++result; - active_mask |= active_bit; - active_bit <<= 1; - } - - return active_mask; - } - - - template - inline __device__ - Size count(Size max_input_size, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - Compare comp) - { - Size result = 0; - - while(first1 != last1 && first2 != last2) - { - if(comp(*first1,*first2)) - { - ++first1; - ++result; - } // end if - else if(comp(*first2,*first1)) - { - ++first2; - ++result; - } // end else if - else - { - ++first1; - ++first2; - } // end else - } // end while - - return result + thrust::max(last1 - first1,last2 - first2); - } -}; // end serial_bounded_set_symmetric_difference - - -} // end namespace set_symmetric_difference_detail - - -template -RandomAccessIterator3 set_symmetric_difference(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp) -{ - return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_symmetric_difference_detail::serial_bounded_set_symmetric_difference()); -} // end set_symmetric_difference - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/set_union.inl b/compat/thrust/system/cuda/detail/set_union.inl deleted file mode 100644 index 66cccab39b..0000000000 --- a/compat/thrust/system/cuda/detail/set_union.inl +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ -namespace set_union_detail -{ - - -struct serial_bounded_set_union -{ - // max_input_size <= 32 - template - inline __device__ - thrust::detail::uint32_t operator()(Size max_input_size, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - Compare comp) - { - thrust::detail::uint32_t active_mask = 0; - thrust::detail::uint32_t active_bit = 1; - - while(first1 != last1 && first2 != last2) - { - if(comp(*first1,*first2)) - { - *result = *first1; - ++first1; - } // end if - else if(comp(*first2,*first1)) - { - *result = *first2; - ++first2; - } // end else if - else - { - *result = *first1; - ++first1; - ++first2; - } // end else - - ++result; - active_mask |= active_bit; - active_bit <<= 1; - } // end while - - while(first1 != last1) - { - *result = *first1; - ++first1; - ++result; - active_mask |= active_bit; - active_bit <<= 1; - } - - while(first2 != last2) - { - *result = *first2; - ++first2; - ++result; - active_mask |= active_bit; - active_bit <<= 1; - } - - return active_mask; - } - - - template - inline __device__ - Size count(Size max_input_size, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - Compare comp) - { - Size result = 0; - - while(first1 != last1 && first2 != last2) - { - if(comp(*first1,*first2)) - { - ++first1; - } // end if - else if(comp(*first2,*first1)) - { - ++first2; - } // end else if - else - { - ++first1; - ++first2; - } // end else - - ++result; - } // end while - - return result + thrust::max(last1 - first1,last2 - first2); - } -}; // end serial_bounded_set_union - - -} // end namespace set_union_detail - - -template -RandomAccessIterator3 set_union(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - RandomAccessIterator2 last2, - RandomAccessIterator3 result, - Compare comp) -{ - return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_union_detail::serial_bounded_set_union()); -} // end set_union - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/sort.h b/compat/thrust/system/cuda/detail/sort.h deleted file mode 100644 index e78d36a76b..0000000000 --- a/compat/thrust/system/cuda/detail/sort.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template - void stable_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - -template - void stable_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/sort.inl b/compat/thrust/system/cuda/detail/sort.inl deleted file mode 100644 index d7e0a60c45..0000000000 --- a/compat/thrust/system/cuda/detail/sort.inl +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file sort.inl - * \brief Inline file for sort.h - */ - -#include -#include - -#include -#include -#include -#include -#include -#include - - -/* - * This file implements the following dispatch procedure for cuda::stable_sort() - * and cuda::stable_sort_by_key(). The first level inspects the KeyType - * and StrictWeakOrdering to determine whether a sort assuming primitive-typed - * data may be applied. - * - * If a sort assuming primitive-typed data can be applied (i.e., a radix sort), - * the input ranges are first trivialized (turned into simple contiguous ranges - * if they are not already). To implement descending orderings, an ascending - * sort will be reversed. - * - * If a sort assuming primitive-typed data cannot be applied, a comparison-based - * sort is used. Depending on the size of the key and value types, one level of - * indirection may be applied to their input ranges. This transformation - * may be applied to either range to convert an ill-suited problem (i.e. sorting with - * large keys or large value) into a problem more amenable to the underlying - * merge sort algorithm. 
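None of the dispatch machinery described in the comment above is visible to callers: whether the primitive (radix) path, the reverse-after-ascending-sort trick for a greater comparison, or the indirect comparison sort is taken depends only on the key type and comparator. A small sketch of the front end these overloads serve, assuming primitive integer keys:

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/functional.h>

int main()
{
  int  k[] = {3, 1, 4, 1, 5};
  char v[] = {'a', 'b', 'c', 'd', 'e'};

  thrust::device_vector<int>  keys(k, k + 5);
  thrust::device_vector<char> vals(v, v + 5);

  // primitive key type + thrust::greater<int>: per the comment above this is
  // realized as an ascending radix sort whose result is then reversed
  thrust::stable_sort_by_key(keys.begin(), keys.end(), vals.begin(),
                             thrust::greater<int>());
  // keys: {5, 4, 3, 1, 1}   vals: {'e', 'c', 'a', 'b', 'd'}
  // the two equal keys keep their original relative order (stability)

  return 0;
}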
- */ - - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -namespace stable_sort_detail -{ - - -template - struct can_use_primitive_sort - : thrust::detail::and_< - thrust::detail::is_arithmetic, - thrust::detail::or_< - thrust::detail::is_same >, - thrust::detail::is_same > - > - > -{}; - - -template - struct enable_if_primitive_sort - : thrust::detail::enable_if< - can_use_primitive_sort< - typename iterator_value::type, - StrictWeakCompare - >::value - > -{}; - - -template - struct enable_if_comparison_sort - : thrust::detail::disable_if< - can_use_primitive_sort< - typename iterator_value::type, - StrictWeakCompare - >::value - > -{}; - - -template - typename enable_if_primitive_sort::type - stable_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - // ensure sequence has trivial iterators - thrust::detail::trivial_sequence keys(exec, first, last); - - // CUDA path for thrust::stable_sort with primitive keys - // (e.g. int, float, short, etc.) and a less or greater comparison - // method is implemented with a primitive sort - thrust::system::cuda::detail::detail::stable_primitive_sort(exec, keys.begin(), keys.end()); - - // copy results back, if necessary - if(!thrust::detail::is_trivial_iterator::value) - { - thrust::copy(exec, keys.begin(), keys.end(), first); - } - - // if comp is greater then reverse the keys - typedef typename thrust::iterator_traits::value_type KeyType; - const static bool reverse = thrust::detail::is_same >::value; - - if(reverse) - { - thrust::reverse(first, last); - } -} - -template - typename enable_if_comparison_sort::type - stable_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - // decide whether to sort keys indirectly - typedef typename thrust::iterator_value::type KeyType; - typedef thrust::detail::integral_constant 8)> use_key_indirection; - - conditional_temporary_indirect_ordering potentially_indirect_keys(derived_cast(exec), first, last, comp); - - thrust::system::cuda::detail::detail::stable_merge_sort(exec, - potentially_indirect_keys.begin(), - potentially_indirect_keys.end(), - potentially_indirect_keys.comp()); -} - -template - typename enable_if_primitive_sort::type - stable_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - // path for thrust::stable_sort_by_key with primitive keys - // (e.g. int, float, short, etc.) 
and a less or greater comparison - // method is implemented with stable_primitive_sort_by_key - - // if comp is greater then reverse the keys and values - typedef typename thrust::iterator_traits::value_type KeyType; - const static bool reverse = thrust::detail::is_same >::value; - - // note, we also have to reverse the (unordered) input to preserve stability - if (reverse) - { - thrust::reverse(exec, keys_first, keys_last); - thrust::reverse(exec, values_first, values_first + (keys_last - keys_first)); - } - - // ensure sequences have trivial iterators - thrust::detail::trivial_sequence keys(exec, keys_first, keys_last); - thrust::detail::trivial_sequence values(exec, values_first, values_first + (keys_last - keys_first)); - - thrust::system::cuda::detail::detail::stable_primitive_sort_by_key(exec, keys.begin(), keys.end(), values.begin()); - - // copy results back, if necessary - if(!thrust::detail::is_trivial_iterator::value) - thrust::copy(exec, keys.begin(), keys.end(), keys_first); - if(!thrust::detail::is_trivial_iterator::value) - thrust::copy(exec, values.begin(), values.end(), values_first); - - if (reverse) - { - thrust::reverse(exec, keys_first, keys_last); - thrust::reverse(exec, values_first, values_first + (keys_last - keys_first)); - } -} - - -template - typename enable_if_comparison_sort::type - stable_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - // decide whether to apply indirection to either range - typedef typename thrust::iterator_value::type KeyType; - typedef typename thrust::iterator_value::type ValueType; - - typedef thrust::detail::integral_constant 8)> use_key_indirection; - typedef thrust::detail::integral_constant 4)> use_value_indirection; - - conditional_temporary_indirect_ordering< - use_key_indirection, - DerivedPolicy, - RandomAccessIterator1, - StrictWeakOrdering - > potentially_indirect_keys(derived_cast(exec), keys_first, keys_last, comp); - - conditional_temporary_indirect_permutation< - use_value_indirection, - DerivedPolicy, - RandomAccessIterator2 - > potentially_indirect_values(derived_cast(exec), values_first, values_first + (keys_last - keys_first)); - - thrust::system::cuda::detail::detail::stable_merge_sort_by_key(exec, - potentially_indirect_keys.begin(), - potentially_indirect_keys.end(), - potentially_indirect_values.begin(), - potentially_indirect_keys.comp()); -} - - -} // end namespace stable_sort_detail - - -template - void stable_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - // we're attempting to launch a kernel, assert we're compiling with nvcc - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to compile your code using nvcc, rather than g++ or cl.exe X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - - stable_sort_detail::stable_sort(exec, first, last, comp); -} - - -template - void stable_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - // we're attempting to launch a kernel, assert we're compiling with nvcc - // 
======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to compile your code using nvcc, rather than g++ or cl.exe X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - - stable_sort_detail::stable_sort_by_key(exec, keys_first, keys_last, values_first, comp); -} - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/swap_ranges.h b/compat/thrust/system/cuda/detail/swap_ranges.h deleted file mode 100644 index 9b1949e60f..0000000000 --- a/compat/thrust/system/cuda/detail/swap_ranges.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// cuda has no special swap_ranges - diff --git a/compat/thrust/system/cuda/detail/synchronize.h b/compat/thrust/system/cuda/detail/synchronize.h deleted file mode 100644 index 762f4a39c4..0000000000 --- a/compat/thrust/system/cuda/detail/synchronize.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -inline void synchronize(const char *message = ""); - -inline void synchronize_if_enabled(const char *message = ""); - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/synchronize.inl b/compat/thrust/system/cuda/detail/synchronize.inl deleted file mode 100644 index 5f70f799c4..0000000000 --- a/compat/thrust/system/cuda/detail/synchronize.inl +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -void synchronize(const char *message) -{ - cudaError_t error = cudaThreadSynchronize(); - if(error) - { - throw thrust::system_error(error, thrust::cuda_category(), std::string("synchronize: ") + message); - } // end if -} // end synchronize() - -void synchronize_if_enabled(const char *message) -{ -// XXX this could potentially be a runtime decision -#if __THRUST_SYNCHRONOUS - synchronize(message); -#else - // WAR "unused parameter" warning - (void) message; -#endif -} // end synchronize_if_enabled() - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/tabulate.h b/compat/thrust/system/cuda/detail/tabulate.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/tabulate.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/temporary_buffer.h b/compat/thrust/system/cuda/detail/temporary_buffer.h deleted file mode 100644 index 628bd75719..0000000000 --- a/compat/thrust/system/cuda/detail/temporary_buffer.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special temporary buffer functions - diff --git a/compat/thrust/system/cuda/detail/temporary_indirect_permutation.h b/compat/thrust/system/cuda/detail/temporary_indirect_permutation.h deleted file mode 100644 index 3d05f44155..0000000000 --- a/compat/thrust/system/cuda/detail/temporary_indirect_permutation.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
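synchronize() above just converts a failed device-wide synchronization into a thrust::system_error. A sketch of the same check-and-throw pattern in user code follows; the helper name is made up, and cudaDeviceSynchronize() stands in for the cudaThreadSynchronize() call above, which later CUDA releases deprecate in its favor.

#include <cuda_runtime_api.h>
#include <string>
#include <thrust/system_error.h>
#include <thrust/system/cuda/error.h>

// wait for all outstanding device work and surface any error as a C++ exception
void my_synchronize(const char *message = "")
{
  cudaError_t error = cudaDeviceSynchronize();
  if(error != cudaSuccess)
  {
    throw thrust::system_error(error, thrust::cuda_category(),
                               std::string("synchronize: ") + message);
  }
}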
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - - -template - struct temporary_indirect_permutation -{ - private: - typedef unsigned int size_type; - typedef thrust::detail::temporary_array array_type; - - public: - temporary_indirect_permutation(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last) - : m_exec(exec), - m_src_first(first), - m_src_last(last), - m_permutation(0, m_exec, last - first) - { - // generate sorted index sequence - thrust::sequence(exec, m_permutation.begin(), m_permutation.end()); - } - - ~temporary_indirect_permutation() - { - // permute the source array using the indices - typedef typename thrust::iterator_value::type value_type; - thrust::detail::temporary_array temp(m_exec, m_src_first, m_src_last); - thrust::gather(m_exec, m_permutation.begin(), m_permutation.end(), temp.begin(), m_src_first); - } - - typedef typename array_type::iterator iterator; - - iterator begin() - { - return m_permutation.begin(); - } - - iterator end() - { - return m_permutation.end(); - } - - private: - DerivedPolicy &m_exec; - RandomAccessIterator m_src_first, m_src_last; - thrust::detail::temporary_array m_permutation; -}; - - -template - struct iterator_range_with_execution_policy -{ - iterator_range_with_execution_policy(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last) - : m_exec(exec), m_first(first), m_last(last) - {} - - typedef RandomAccessIterator iterator; - - iterator begin() - { - return m_first; - } - - iterator end() - { - return m_last; - } - - DerivedPolicy &exec() - { - return m_exec; - } - - DerivedPolicy &m_exec; - RandomAccessIterator m_first, m_last; -}; - - -template - struct conditional_temporary_indirect_permutation - : thrust::detail::eval_if< - Condition::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - >::type -{ - typedef typename thrust::detail::eval_if< - Condition::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - >::type super_t; - - conditional_temporary_indirect_permutation(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last) - : super_t(exec, first, last) - {} -}; - - -template - struct temporary_indirect_ordering - : temporary_indirect_permutation -{ - private: - typedef temporary_indirect_permutation super_t; - - public: - temporary_indirect_ordering(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp) - : super_t(exec, first, last), - m_comp(first, comp) - {} - - struct compare - { - RandomAccessIterator first; - - thrust::detail::host_device_function< - Compare, - bool - > comp; - - compare(RandomAccessIterator first, Compare comp) - : first(first), comp(comp) - {} - - template - __host__ __device__ - bool operator()(Integral a, Integral b) - { - return comp(first[a], first[b]); - } - }; - - compare comp() const - { - return m_comp; - } - - private: - compare m_comp; -}; - - -template - struct iterator_range_with_execution_policy_and_compare - : iterator_range_with_execution_policy 
-{ - typedef iterator_range_with_execution_policy super_t; - - iterator_range_with_execution_policy_and_compare(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp) - : super_t(exec, first, last), m_comp(comp) - {} - - typedef Compare compare; - - compare comp() - { - return m_comp; - } - - Compare m_comp; -}; - - -template - struct conditional_temporary_indirect_ordering - : thrust::detail::eval_if< - Condition::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - >::type -{ - typedef typename thrust::detail::eval_if< - Condition::value, - thrust::detail::identity_ >, - thrust::detail::identity_ > - >::type super_t; - - conditional_temporary_indirect_ordering(DerivedPolicy &exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp) - : super_t(exec, first, last, comp) - {} -}; - - -} // end detail -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/detail/transform.h b/compat/thrust/system/cuda/detail/transform.h deleted file mode 100644 index 0af87056e7..0000000000 --- a/compat/thrust/system/cuda/detail/transform.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// cuda has no special transform - diff --git a/compat/thrust/system/cuda/detail/transform_reduce.h b/compat/thrust/system/cuda/detail/transform_reduce.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/transform_reduce.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/transform_scan.h b/compat/thrust/system/cuda/detail/transform_scan.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/transform_scan.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
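temporary_indirect_permutation and temporary_indirect_ordering above sort a temporary array of indices (comparing the original keys through those indices) and then apply the permutation with a single gather, so large elements are moved only once. The same idea can be expressed with public Thrust calls; this is only an illustration of the pattern, not the internal class:

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/gather.h>

int main()
{
  float big[] = {3.0f, 1.0f, 2.0f};          // stand-in for records that are expensive to move
  thrust::device_vector<float> keys(big, big + 3);
  thrust::device_vector<float> data(keys);   // payload we want to move only once

  // 1. index sequence 0..n-1
  thrust::device_vector<unsigned int> perm(keys.size());
  thrust::sequence(perm.begin(), perm.end());

  // 2. order the indices by key (the key copy is scratch space)
  thrust::device_vector<float> scratch(keys);
  thrust::sort_by_key(scratch.begin(), scratch.end(), perm.begin());

  // 3. apply the permutation to the payload with one gather
  thrust::device_vector<float> sorted(data.size());
  thrust::gather(perm.begin(), perm.end(), data.begin(), sorted.begin());

  return 0;
}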
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/trivial_copy.h b/compat/thrust/system/cuda/detail/trivial_copy.h deleted file mode 100644 index e0e898aad4..0000000000 --- a/compat/thrust/system/cuda/detail/trivial_copy.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -template - void trivial_copy_n(execution_policy &exec, - RandomAccessIterator1 first, - Size n, - RandomAccessIterator2 result); - -template - void trivial_copy_n(cross_system &exec, - RandomAccessIterator1 first, - Size n, - RandomAccessIterator2 result); - -} // end detail -} // end cuda -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/cuda/detail/trivial_copy.inl b/compat/thrust/system/cuda/detail/trivial_copy.inl deleted file mode 100644 index d23a4ef8c2..0000000000 --- a/compat/thrust/system/cuda/detail/trivial_copy.inl +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ -namespace detail -{ - -namespace trivial_copy_detail -{ - -inline void checked_cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) -{ - cudaError_t error = cudaMemcpy(dst,src,count,kind); - if(error) - { - throw thrust::system_error(error, thrust::cuda_category()); - } // end error -} // end checked_cudaMemcpy() - - -template - cudaMemcpyKind cuda_memcpy_kind(const thrust::cuda::execution_policy &, - const thrust::cpp::execution_policy &) -{ - return cudaMemcpyDeviceToHost; -} // end cuda_memcpy_kind() - - -template - cudaMemcpyKind cuda_memcpy_kind(const thrust::cpp::execution_policy &, - const thrust::cuda::execution_policy &) -{ - return cudaMemcpyHostToDevice; -} // end cuda_memcpy_kind() - - -} // end namespace trivial_copy_detail - - -template - void trivial_copy_n(execution_policy &exec, - RandomAccessIterator1 first, - Size n, - RandomAccessIterator2 result) -{ - typedef typename thrust::iterator_value::type T; - - void *dst = thrust::raw_pointer_cast(&*result); - const void *src = thrust::raw_pointer_cast(&*first); - - trivial_copy_detail::checked_cudaMemcpy(dst, src, n * sizeof(T), cudaMemcpyDeviceToDevice); -} - - -template - void trivial_copy_n(cross_system &systems, - RandomAccessIterator1 first, - Size n, - RandomAccessIterator2 result) -{ - typedef typename thrust::iterator_value::type T; - - void *dst = thrust::raw_pointer_cast(&*result); - const void *src = thrust::raw_pointer_cast(&*first); - - cudaMemcpyKind kind = trivial_copy_detail::cuda_memcpy_kind(thrust::detail::derived_cast(systems.system1), thrust::detail::derived_cast(systems.system2)); - - trivial_copy_detail::checked_cudaMemcpy(dst, src, n * sizeof(T), kind); -} - - -} // end namespace detail -} // end namespace cuda -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/cuda/detail/uninitialized_copy.h b/compat/thrust/system/cuda/detail/uninitialized_copy.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/uninitialized_copy.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/uninitialized_fill.h b/compat/thrust/system/cuda/detail/uninitialized_fill.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/uninitialized_fill.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
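trivial_copy_n above is where contiguous, trivially copyable copies bottom out: a single checked cudaMemcpy whose direction is chosen from the source and destination systems. From the public API that is, roughly speaking, just thrust::copy between host and device containers:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>

int main()
{
  thrust::host_vector<int>   h(16, 7);
  thrust::device_vector<int> d(16);

  // host -> device: contiguous and trivially copyable, so this lowers to one
  // cudaMemcpy with kind cudaMemcpyHostToDevice (the trivial_copy_n path above)
  thrust::copy(h.begin(), h.end(), d.begin());

  // device -> host: same idea with cudaMemcpyDeviceToHost
  thrust::copy(d.begin(), d.end(), h.begin());

  return 0;
}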
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/unique.h b/compat/thrust/system/cuda/detail/unique.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/unique.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/unique_by_key.h b/compat/thrust/system/cuda/detail/unique_by_key.h deleted file mode 100644 index a307fc5fc8..0000000000 --- a/compat/thrust/system/cuda/detail/unique_by_key.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special version of this algorithm - diff --git a/compat/thrust/system/cuda/detail/vector.inl b/compat/thrust/system/cuda/detail/vector.inl deleted file mode 100644 index 36598764b2..0000000000 --- a/compat/thrust/system/cuda/detail/vector.inl +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ccudaliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ - -template - vector - ::vector() - : super_t() -{} - -template - vector - ::vector(size_type n) - : super_t(n) -{} - -template - vector - ::vector(size_type n, const value_type &value) - : super_t(n,value) -{} - -template - vector - ::vector(const vector &x) - : super_t(x) -{} - -template - template - vector - ::vector(const thrust::detail::vector_base &x) - : super_t(x) -{} - -template - template - vector - ::vector(const std::vector &x) - : super_t(x) -{} - -template - template - vector - ::vector(InputIterator first, InputIterator last) - : super_t(first,last) -{} - -template - template - vector & - vector - ::operator=(const std::vector &x) -{ - super_t::operator=(x); - return *this; -} - -template - template - vector & - vector - ::operator=(const thrust::detail::vector_base &x) -{ - super_t::operator=(x); - return *this; -} - -} // end cuda -} // end system -} // end thrust - diff --git a/compat/thrust/system/cuda/error.h b/compat/thrust/system/cuda/error.h deleted file mode 100644 index 8d098538db..0000000000 --- a/compat/thrust/system/cuda/error.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file thrust/system/cuda/error.h - * \brief CUDA-specific error reporting - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -namespace system -{ - -namespace cuda -{ - -/*! \addtogroup system - * \{ - */ - -// To construct an error_code after a CUDA Runtime error: -// -// error_code(::cudaGetLastError(), cuda_category()) - -// XXX N3000 prefers enum class errc { ... } -namespace errc -{ - -/*! \p errc_t enumerates the kinds of CUDA Runtime errors. 
- */ -enum errc_t -{ - // from cuda/include/driver_types.h - // mirror their order - success = cudaSuccess, - missing_configuration = cudaErrorMissingConfiguration, - memory_allocation = cudaErrorMemoryAllocation, - initialization_error = cudaErrorInitializationError, - launch_failure = cudaErrorLaunchFailure, - prior_launch_failure = cudaErrorPriorLaunchFailure, - launch_timeout = cudaErrorLaunchTimeout, - launch_out_of_resources = cudaErrorLaunchOutOfResources, - invalid_device_function = cudaErrorInvalidDeviceFunction, - invalid_configuration = cudaErrorInvalidConfiguration, - invalid_device = cudaErrorInvalidDevice, - invalid_value = cudaErrorInvalidValue, - invalid_pitch_value = cudaErrorInvalidPitchValue, - invalid_symbol = cudaErrorInvalidSymbol, - map_buffer_object_failed = cudaErrorMapBufferObjectFailed, - unmap_buffer_object_failed = cudaErrorUnmapBufferObjectFailed, - invalid_host_pointer = cudaErrorInvalidHostPointer, - invalid_device_pointer = cudaErrorInvalidDevicePointer, - invalid_texture = cudaErrorInvalidTexture, - invalid_texture_binding = cudaErrorInvalidTextureBinding, - invalid_channel_descriptor = cudaErrorInvalidChannelDescriptor, - invalid_memcpy_direction = cudaErrorInvalidMemcpyDirection, - address_of_constant_error = cudaErrorAddressOfConstant, - texture_fetch_failed = cudaErrorTextureFetchFailed, - texture_not_bound = cudaErrorTextureNotBound, - synchronization_error = cudaErrorSynchronizationError, - invalid_filter_setting = cudaErrorInvalidFilterSetting, - invalid_norm_setting = cudaErrorInvalidNormSetting, - mixed_device_execution = cudaErrorMixedDeviceExecution, - cuda_runtime_unloading = cudaErrorCudartUnloading, - unknown = cudaErrorUnknown, - not_yet_implemented = cudaErrorNotYetImplemented, - memory_value_too_large = cudaErrorMemoryValueTooLarge, - invalid_resource_handle = cudaErrorInvalidResourceHandle, - not_ready = cudaErrorNotReady, - insufficient_driver = cudaErrorInsufficientDriver, - set_on_active_process_error = cudaErrorSetOnActiveProcess, - no_device = cudaErrorNoDevice, - ecc_uncorrectable = cudaErrorECCUncorrectable, - -#if CUDART_VERSION >= 4020 - shared_object_symbol_not_found = cudaErrorSharedObjectSymbolNotFound, - shared_object_init_failed = cudaErrorSharedObjectInitFailed, - unsupported_limit = cudaErrorUnsupportedLimit, - duplicate_variable_name = cudaErrorDuplicateVariableName, - duplicate_texture_name = cudaErrorDuplicateTextureName, - duplicate_surface_name = cudaErrorDuplicateSurfaceName, - devices_unavailable = cudaErrorDevicesUnavailable, - invalid_kernel_image = cudaErrorInvalidKernelImage, - no_kernel_image_for_device = cudaErrorNoKernelImageForDevice, - incompatible_driver_context = cudaErrorIncompatibleDriverContext, - peer_access_already_enabled = cudaErrorPeerAccessAlreadyEnabled, - peer_access_not_enabled = cudaErrorPeerAccessNotEnabled, - device_already_in_use = cudaErrorDeviceAlreadyInUse, - profiler_disabled = cudaErrorProfilerDisabled, - assert_triggered = cudaErrorAssert, - too_many_peers = cudaErrorTooManyPeers, - host_memory_already_registered = cudaErrorHostMemoryAlreadyRegistered, - host_memory_not_registered = cudaErrorHostMemoryNotRegistered, - operating_system_error = cudaErrorOperatingSystem, -#endif - -#if CUDART_VERSION >= 5000 - peer_access_unsupported = cudaErrorPeerAccessUnsupported, - launch_max_depth_exceeded = cudaErrorLaunchMaxDepthExceeded, - launch_file_scoped_texture_used = cudaErrorLaunchFileScopedTex, - launch_file_scoped_surface_used = cudaErrorLaunchFileScopedSurf, - sync_depth_exceeded = 
cudaErrorSyncDepthExceeded, - attempted_operation_not_permitted = cudaErrorNotPermitted, - attempted_operation_not_supported = cudaErrorNotSupported, -#endif - - startup_failure = cudaErrorStartupFailure -}; // end errc_t - - -} // end namespace errc - -} // end namespace cuda - -/*! \return A reference to an object of a type derived from class \p thrust::error_category. - * \note The object's \p equivalent virtual functions shall behave as specified - * for the class \p thrust::error_category. The object's \p name virtual function shall - * return a pointer to the string "cuda". The object's - * \p default_error_condition virtual function shall behave as follows: - * - * If the argument ev corresponds to a CUDA error value, the function - * shall return error_condition(ev,cuda_category()). - * Otherwise, the function shall return system_category.default_error_condition(ev). - */ -inline const error_category &cuda_category(void); - - -// XXX N3000 prefers is_error_code_enum - -/*! Specialization of \p is_error_code_enum for \p cuda::errc::errc_t - */ -template<> struct is_error_code_enum : thrust::detail::true_type {}; - - -// XXX replace cuda::errc::errc_t with cuda::errc upon c++0x -/*! \return error_code(static_cast(e), cuda::error_category()) - */ -inline error_code make_error_code(cuda::errc::errc_t e); - - -// XXX replace cuda::errc::errc_t with cuda::errc upon c++0x -/*! \return error_condition(static_cast(e), cuda::error_category()). - */ -inline error_condition make_error_condition(cuda::errc::errc_t e); - -/*! \} // end system - */ - - -} // end system - -namespace cuda -{ - -// XXX replace with using system::cuda_errc upon c++0x -namespace errc = system::cuda::errc; - -} // end cuda - -using system::cuda_category; - -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/cuda/execution_policy.h b/compat/thrust/system/cuda/execution_policy.h deleted file mode 100644 index bbd33defd4..0000000000 --- a/compat/thrust/system/cuda/execution_policy.h +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -/*! \file thrust/system/cuda/execution_policy.h - * \brief Execution policies for Thrust's CUDA system. 
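As the header's own comment notes, a CUDA Runtime status can be wrapped as error_code(::cudaGetLastError(), cuda_category()) and then compared against the errc values above. A minimal sketch of that use (error inspection only, no kernel launch shown):

#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include <cuda_runtime_api.h>
#include <iostream>

int main()
{
  // wrap the most recent CUDA Runtime status in a generic error_code
  thrust::system::error_code ec(cudaGetLastError(), thrust::cuda_category());

  if(ec == thrust::cuda::errc::memory_allocation)
    std::cerr << "device allocation failed: " << ec.message() << std::endl;
  else if(ec)
    std::cerr << "CUDA error: " << ec.message() << std::endl;

  return 0;
}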
- */ - -#include - -// get the execution policies definitions first -#include - -// get the definition of par -#include - -// now get all the algorithm defintitions - -// the order of the following #includes seems to matter, unfortunately - -// primitives come first, in order of increasing sophistication -#include -#include -#include - -#include -#include -#include -#include -#include - -// these are alphabetical -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -// define these entities here for the purpose of Doxygenating them -// they are actually defined elsewhere -#if 0 -namespace thrust -{ -namespace system -{ -namespace cuda -{ - - -/*! \addtogroup execution_policies - * \{ - */ - - -/*! \p thrust::cuda::execution_policy is the base class for all Thrust parallel execution - * policies which are derived from Thrust's CUDA backend system. - */ -template -struct execution_policy : thrust::execution_policy -{}; - - -/*! \p cuda::tag is a type representing Thrust's CUDA backend system in C++'s type system. - * Iterators "tagged" with a type which is convertible to \p cuda::tag assert that they may be - * "dispatched" to algorithm implementations in the \p cuda system. - */ -struct tag : thrust::system::cuda::execution_policy { unspecified }; - - -/*! \p thrust::cuda::par is the parallel execution policy associated with Thrust's CUDA - * backend system. - * - * Instead of relying on implicit algorithm dispatch through iterator system tags, users may - * directly target Thrust's CUDA backend system by providing \p thrust::cuda::par as an algorithm - * parameter. - * - * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such - * as \p thrust::cuda::vector. - * - * The type of \p thrust::cuda::par is implementation-defined. - * - * The following code snippet demonstrates how to use \p thrust::cuda::par to explicitly dispatch an - * invocation of \p thrust::for_each to the CUDA backend system: - * - * \code - * #include - * #include - * #include - * - * struct printf_functor - * { - * __host__ __device__ - * void operator()(int x) - * { - * printf("%d\n"); - * } - * }; - * ... - * int vec[3]; - * vec[0] = 0; vec[1] = 1; vec[2] = 2; - * - * thrust::for_each(thrust::cuda::par, vec.begin(), vec.end(), printf_functor()); - * - * // 0 1 2 is printed to standard output in some unspecified order - * \endcode - */ -static const unspecified par; - - -/*! \} - */ - - -} // end cuda -} // end system -} // end thrust -#endif - - diff --git a/compat/thrust/system/cuda/experimental/pinned_allocator.h b/compat/thrust/system/cuda/experimental/pinned_allocator.h deleted file mode 100644 index 5294659e62..0000000000 --- a/compat/thrust/system/cuda/experimental/pinned_allocator.h +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/system/cuda/experimental/pinned_allocator.h - * \brief An allocator which creates new elements in "pinned" memory with \p cudaMallocHost - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ - -namespace system -{ - -namespace cuda -{ - -namespace experimental -{ - -/*! \addtogroup memory_management Memory Management - * \addtogroup memory_management_classes - * \ingroup memory_management - * \{ - */ - -/*! \p pinned_allocator is a CUDA-specific host memory allocator - * that employs \c cudaMallocHost for allocation. - * - * \see http://www.sgi.com/tech/stl/Allocators.html - */ -template class pinned_allocator; - -template<> - class pinned_allocator -{ - public: - typedef void value_type; - typedef void * pointer; - typedef const void * const_pointer; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - // convert a pinned_allocator to pinned_allocator - template - struct rebind - { - typedef pinned_allocator other; - }; // end rebind -}; // end pinned_allocator - - -template - class pinned_allocator -{ - public: - typedef T value_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - // convert a pinned_allocator to pinned_allocator - template - struct rebind - { - typedef pinned_allocator other; - }; // end rebind - - /*! \p pinned_allocator's null constructor does nothing. - */ - __host__ __device__ - inline pinned_allocator() {} - - /*! \p pinned_allocator's null destructor does nothing. - */ - __host__ __device__ - inline ~pinned_allocator() {} - - /*! \p pinned_allocator's copy constructor does nothing. - */ - __host__ __device__ - inline pinned_allocator(pinned_allocator const &) {} - - /*! This version of \p pinned_allocator's copy constructor - * is templated on the \c value_type of the \p pinned_allocator - * to copy from. It is provided merely for convenience; it - * does nothing. - */ - template - __host__ __device__ - inline pinned_allocator(pinned_allocator const &) {} - - /*! This method returns the address of a \c reference of - * interest. - * - * \p r The \c reference of interest. - * \return \c r's address. - */ - __host__ __device__ - inline pointer address(reference r) { return &r; } - - /*! This method returns the address of a \c const_reference - * of interest. - * - * \p r The \c const_reference of interest. - * \return \c r's address. - */ - __host__ __device__ - inline const_pointer address(const_reference r) { return &r; } - - /*! This method allocates storage for objects in pinned host - * memory. - * - * \p cnt The number of objects to allocate. - * \return a \c pointer to the newly allocated objects. - * \note This method does not invoke \p value_type's constructor. - * It is the responsibility of the caller to initialize the - * objects at the returned \c pointer. 
- */ - __host__ - inline pointer allocate(size_type cnt, - const_pointer = 0) - { - if(cnt > this->max_size()) - { - throw std::bad_alloc(); - } // end if - - pointer result(0); - cudaError_t error = cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type)); - - if(error) - { - throw std::bad_alloc(); - } // end if - - return result; - } // end allocate() - - /*! This method deallocates pinned host memory previously allocated - * with this \c pinned_allocator. - * - * \p p A \c pointer to the previously allocated memory. - * \p cnt The number of objects previously allocated at - * \p p. - * \note This method does not invoke \p value_type's destructor. - * It is the responsibility of the caller to destroy - * the objects stored at \p p. - */ - __host__ - inline void deallocate(pointer p, size_type cnt) - { - cudaError_t error = cudaFreeHost(p); - - if(error) - { - throw thrust::system_error(error, thrust::cuda_category()); - } // end if - } // end deallocate() - - /*! This method returns the maximum size of the \c cnt parameter - * accepted by the \p allocate() method. - * - * \return The maximum number of objects that may be allocated - * by a single call to \p allocate(). - */ - inline size_type max_size() const - { - return (std::numeric_limits::max)() / sizeof(T); - } // end max_size() - - /*! This method tests this \p pinned_allocator for equality to - * another. - * - * \param x The other \p pinned_allocator of interest. - * \return This method always returns \c true. - */ - __host__ __device__ - inline bool operator==(pinned_allocator const& x) { return true; } - - /*! This method tests this \p pinned_allocator for inequality - * to another. - * - * \param x The other \p pinned_allocator of interest. - * \return This method always returns \c false. - */ - __host__ __device__ - inline bool operator!=(pinned_allocator const &x) { return !operator==(x); } -}; // end pinned_allocator - -/*! \} - */ - -} // end experimental - -} // end cuda - -} // end system - -// alias cuda's members at top-level -namespace cuda -{ - -namespace experimental -{ - -using thrust::system::cuda::experimental::pinned_allocator; - -} // end experimental - -} // end cuda - -} // end thrust - diff --git a/compat/thrust/system/cuda/memory.h b/compat/thrust/system/cuda/memory.h deleted file mode 100644 index 368eea265a..0000000000 --- a/compat/thrust/system/cuda/memory.h +++ /dev/null @@ -1,421 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ccudaliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/system/cuda/memory.h - * \brief Managing memory associated with Thrust's CUDA system. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace cuda -{ - -template class pointer; - -} // end cuda -} // end system -} // end thrust - - -/*! 
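In practice pinned_allocator is mostly plugged into host_vector so that host staging buffers live in page-locked memory allocated with cudaMallocHost; a brief usage sketch:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

int main()
{
  // host storage backed by cudaMallocHost, so host<->device transfers can use
  // the faster pinned-memory path
  typedef thrust::cuda::experimental::pinned_allocator<float> pinned_alloc;
  thrust::host_vector<float, pinned_alloc> h(1 << 20, 1.0f);

  thrust::device_vector<float> d = h;   // ordinary copy, now sourced from pinned memory

  return 0;
}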
\cond - */ - -// specialize std::iterator_traits to avoid problems with the name of -// pointer's constructor shadowing its nested pointer type -// do this before pointer is defined so the specialization is correctly -// used inside the definition -namespace std -{ - -template - struct iterator_traits > -{ - private: - typedef thrust::system::cuda::pointer ptr; - - public: - typedef typename ptr::iterator_category iterator_category; - typedef typename ptr::value_type value_type; - typedef typename ptr::difference_type difference_type; - typedef ptr pointer; - typedef typename ptr::reference reference; -}; // end iterator_traits - -} // end std - -/*! \endcond - */ - - -namespace thrust -{ -namespace system -{ - -/*! \addtogroup system_backends Systems - * \ingroup system - * \{ - */ - -/*! \namespace thrust::system::cuda - * \brief \p thrust::system::cuda is the namespace containing functionality for allocating, manipulating, - * and deallocating memory available to Thrust's CUDA backend system. - * The identifiers are provided in a separate namespace underneath thrust::system - * for import convenience but are also aliased in the top-level thrust::tbb - * namespace for easy access. - * - */ -namespace cuda -{ - -// forward declaration of reference for pointer -template class reference; - -/*! \cond - */ - -// XXX nvcc + msvc have trouble instantiating reference below -// this is a workaround -namespace detail -{ - -template - struct reference_msvc_workaround -{ - typedef thrust::system::cuda::reference type; -}; // end reference_msvc_workaround - -} // end detail - -/*! \endcond - */ - -#if 0 -/*! \p cuda::tag is type representing Thrust's CUDA backend system in C++'s type system. - * Iterators "tagged" with a type which is convertible to \p cuda::tag assert that they may be - * "dispatched" to algorithm implementations in the \p cuda system. - */ -struct tag { unspecified }; -#endif - -/*! \p pointer stores a pointer to an object allocated in memory available to the cuda system. - * This type provides type safety when dispatching standard algorithms on ranges resident - * in cuda memory. - * - * \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic. - * - * \p pointer can be created with the function \p cuda::malloc, or by explicitly calling its constructor - * with a raw pointer. - * - * The raw pointer encapsulated by a \p pointer may be obtained by eiter its get member function - * or the \p raw_pointer_cast function. - * - * \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory - * pointed to by \p pointer. - * - * \tparam T specifies the type of the pointee. - * - * \see cuda::malloc - * \see cuda::free - * \see raw_pointer_cast - */ -template - class pointer - : public thrust::pointer< - T, - thrust::system::cuda::tag, - thrust::system::cuda::reference, - thrust::system::cuda::pointer - > -{ - /*! \cond - */ - - private: - typedef thrust::pointer< - T, - thrust::system::cuda::tag, - //thrust::system::cuda::reference, - typename detail::reference_msvc_workaround::type, - thrust::system::cuda::pointer - > super_t; - - /*! \endcond - */ - - public: - - /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0. - */ - __host__ __device__ - pointer() : super_t() {} - - /*! This constructor allows construction of a pointer from a T*. - * - * \param ptr A raw pointer to copy from, presumed to point to a location in memory - * accessible by the \p tbb system. 
- * \tparam OtherT \p OtherT shall be convertible to \p T. - */ - template - __host__ __device__ - explicit pointer(OtherT *ptr) : super_t(ptr) {} - - /*! This constructor allows construction from another pointer-like object with related type. - * - * \param other The \p OtherPointer to copy. - * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible - * to \p thrust::system::cuda::tag and its element type shall be convertible to \p T. - */ - template - __host__ __device__ - pointer(const OtherPointer &other, - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer - >::type * = 0) : super_t(other) {} - - /*! Assignment operator allows assigning from another pointer-like object with related type. - * - * \param other The other pointer-like object to assign from. - * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible - * to \p thrust::system::cuda::tag and its element type shall be convertible to \p T. - */ - template - __host__ __device__ - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer, - pointer & - >::type - operator=(const OtherPointer &other) - { - return super_t::operator=(other); - } -}; // end pointer - - -/*! \p reference is a wrapped reference to an object stored in memory available to the \p cuda system. - * \p reference is the type of the result of dereferencing a \p cuda::pointer. - * - * \tparam T Specifies the type of the referenced object. - */ -template - class reference - : public thrust::reference< - T, - thrust::system::cuda::pointer, - thrust::system::cuda::reference - > -{ - /*! \cond - */ - - private: - typedef thrust::reference< - T, - thrust::system::cuda::pointer, - thrust::system::cuda::reference - > super_t; - - /*! \endcond - */ - - public: - /*! \cond - */ - - typedef typename super_t::value_type value_type; - typedef typename super_t::pointer pointer; - - /*! \endcond - */ - - /*! This constructor initializes this \p reference to refer to an object - * pointed to by the given \p pointer. After this \p reference is constructed, - * it shall refer to the object pointed to by \p ptr. - * - * \param ptr A \p pointer to copy from. - */ - __host__ __device__ - explicit reference(const pointer &ptr) - : super_t(ptr) - {} - - /*! This constructor accepts a const reference to another \p reference of related type. - * After this \p reference is constructed, it shall refer to the same object as \p other. - * - * \param other A \p reference to copy from. - * \tparam OtherT The element type of the other \p reference. - * - * \note This constructor is templated primarily to allow initialization of reference - * from reference. - */ - template - __host__ __device__ - reference(const reference &other, - typename thrust::detail::enable_if_convertible< - typename reference::pointer, - pointer - >::type * = 0) - : super_t(other) - {} - - /*! Copy assignment operator copy assigns from another \p reference of related type. - * - * \param other The other \p reference to assign from. - * \return *this - * \tparam OtherT The element type of the other \p reference. - */ - template - __host__ __device__ - reference &operator=(const reference &other); - - /*! Assignment operator assigns from a \p value_type. - * - * \param x The \p value_type to assign from. - * \return *this - */ - __host__ __device__ - reference &operator=(const value_type &x); -}; // end reference - -/*! Exchanges the values of two objects referred to by \p reference. 
- * \p x The first \p reference of interest. - * \p y The second \p reference ot interest. - */ -template -__host__ __device__ -void swap(reference x, reference y); - -/*! Allocates an area of memory available to Thrust's cuda system. - * \param n Number of bytes to allocate. - * \return A cuda::pointer pointing to the beginning of the newly - * allocated memory. A null cuda::pointer is returned if - * an error occurs. - * \note The cuda::pointer returned by this function must be - * deallocated with \p cuda::free. - * \see cuda::free - * \see std::malloc - */ -inline pointer malloc(std::size_t n); - -/*! Allocates a typed area of memory available to Thrust's cuda system. - * \param n Number of elements to allocate. - * \return A cuda::pointer pointing to the beginning of the newly - * allocated memory. A null cuda::pointer is returned if - * an error occurs. - * \note The cuda::pointer returned by this function must be - * deallocated with \p cuda::free. - * \see cuda::free - * \see std::malloc - */ -template -inline pointer malloc(std::size_t n); - -/*! Deallocates an area of memory previously allocated by cuda::malloc. - * \param ptr A cuda::pointer pointing to the beginning of an area - * of memory previously allocated with cuda::malloc. - * \see cuda::malloc - * \see std::free - */ -inline void free(pointer ptr); - -// XXX upon c++11 -// template using allocator = thrust::detail::malloc_allocator >; - -/*! \p cuda::allocator is the default allocator used by the \p cuda system's containers such as - * cuda::vector if no user-specified allocator is provided. \p cuda::allocator allocates - * (deallocates) storage with \p cuda::malloc (\p cuda::free). - */ -template - struct allocator - : thrust::detail::malloc_allocator< - T, - tag, - pointer - > -{ - /*! The \p rebind metafunction provides the type of an \p allocator - * instantiated with another type. - * - * \tparam U The other type to use for instantiation. - */ - template - struct rebind - { - /*! The typedef \p other gives the type of the rebound \p allocator. - */ - typedef allocator other; - }; - - /*! No-argument constructor has no effect. - */ - __host__ __device__ - inline allocator() {} - - /*! Copy constructor has no effect. - */ - __host__ __device__ - inline allocator(const allocator &) {} - - /*! Constructor from other \p allocator has no effect. - */ - template - __host__ __device__ - inline allocator(const allocator &) {} - - /*! Destructor has no effect. - */ - __host__ __device__ - inline ~allocator() {} -}; // end allocator - -} // end cuda - -/*! \} - */ - -} // end system - -/*! \namespace thrust::cuda - * \brief \p thrust::cuda is a top-level alias for thrust::system::cuda. - */ -namespace cuda -{ - -using thrust::system::cuda::pointer; -using thrust::system::cuda::reference; -using thrust::system::cuda::malloc; -using thrust::system::cuda::free; -using thrust::system::cuda::allocator; - -} // end cuda - -} // end thrust - -#include - diff --git a/compat/thrust/system/cuda/vector.h b/compat/thrust/system/cuda/vector.h deleted file mode 100644 index ac47a84e1d..0000000000 --- a/compat/thrust/system/cuda/vector.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ccudaliance with the License. 
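For orientation, the CUDA memory interface deleted just above (cuda::pointer, cuda::malloc, cuda::free and cuda::allocator, re-exported as thrust::cuda::*) is used along these lines. This is a minimal sketch, not taken from this diff: it assumes the vendored compat/thrust directory is visible on the include path as <thrust/...> and that the translation unit is built with nvcc so algorithms can dispatch to the CUDA backend.

// Hypothetical usage sketch of the removed thrust::cuda memory interface.
#include <thrust/system/cuda/memory.h>
#include <thrust/memory.h>   // thrust::raw_pointer_cast
#include <thrust/fill.h>
#include <cstddef>

int main()
{
    const std::size_t N = 13;

    // typed allocation in memory owned by the CUDA system
    thrust::cuda::pointer<int> p = thrust::cuda::malloc<int>(N);

    // the pointer is tagged with cuda::tag, so the algorithm dispatches
    // to the CUDA backend rather than to the host backend
    thrust::fill(p, p + N, 7);

    // recover the raw device pointer for use with plain CUDA APIs or kernels
    int *raw = thrust::raw_pointer_cast(p);
    (void) raw;

    // cuda::pointer is not a smart pointer: deallocation is explicit
    thrust::cuda::free(p);
    return 0;
}

The std::iterator_traits specialization at the top of the removed header is what lets a cuda::pointer be treated as an ordinary iterator by such algorithms, and cuda::allocator wraps the same malloc/free pair for use by the containers that follow.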
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/system/cuda/vector.h - * \brief A dynamically-sizable array of elements which reside in memory available to - * Thrust's CUDA system. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -// forward declaration of host_vector -template class host_vector; - -namespace system -{ -namespace cuda -{ - -// XXX upon c++11 -// template > using vector = thrust::detail::vector_base; - -/*! \p cuda::vector is a container that supports random access to elements, - * constant time removal of elements at the end, and linear time insertion - * and removal of elements at the beginning or in the middle. The number of - * elements in a \p cuda::vector may vary dynamically; memory management is - * automatic. The elements contained in a \p cuda::vector reside in memory - * available to the \p cuda system. - * - * \tparam T The element type of the \p cuda::vector. - * \tparam Allocator The allocator type of the \p cuda::vector. Defaults to \p cuda::allocator. - * - * \see http://www.sgi.com/tech/stl/Vector.html - * \see host_vector For the documentation of the complete interface which is - * shared by \p cuda::vector - * \see device_vector - */ -template > - class vector - : public thrust::detail::vector_base -{ - /*! \cond - */ - private: - typedef thrust::detail::vector_base super_t; - /*! \endcond - */ - - public: - - /*! \cond - */ - typedef typename super_t::size_type size_type; - typedef typename super_t::value_type value_type; - /*! \endcond - */ - - /*! This constructor creates an empty \p cuda::vector. - */ - vector(); - - /*! This constructor creates a \p cuda::vector with \p n default-constructed elements. - * \param n The size of the \p cuda::vector to create. - */ - explicit vector(size_type n); - - /*! This constructor creates a \p cuda::vector with \p n copies of \p value. - * \param n The size of the \p cuda::vector to create. - * \param value An element to copy. - */ - explicit vector(size_type n, const value_type &value); - - /*! Copy constructor copies from another \p cuda::vector. - * \param x The other \p cuda::vector to copy. - */ - vector(const vector &x); - - /*! This constructor copies from another Thrust vector-like object. - * \param x The other object to copy from. - */ - template - vector(const thrust::detail::vector_base &x); - - /*! This constructor copies from a \c std::vector. - * \param x The \c std::vector to copy from. - */ - template - vector(const std::vector &x); - - /*! This constructor creates a \p cuda::vector by copying from a range. - * \param first The beginning of the range. - * \param last The end of the range. - */ - template - vector(InputIterator first, InputIterator last); - - // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns - // - /*! Assignment operator assigns from a \c std::vector. - * \param x The \c std::vector to assign from. - * \return *this - */ - template - vector &operator=(const std::vector &x); - - /*! Assignment operator assigns from another Thrust vector-like object. - * \param x The other object to assign from. 
- * \return *this - */ - template - vector &operator=(const thrust::detail::vector_base &x); -}; // end vector - -} // end cuda -} // end system - -// alias system::cuda names at top-level -namespace cuda -{ - -using thrust::system::cuda::vector; - -} // end cuda - -} // end thrust - -#include - diff --git a/compat/thrust/system/detail/adl/adjacent_difference.h b/compat/thrust/system/detail/adl/adjacent_difference.h deleted file mode 100644 index 246c1163bb..0000000000 --- a/compat/thrust/system/detail/adl/adjacent_difference.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the adjacent_difference.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch adjacent_difference - -#define __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/adjacent_difference.h> -#include __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER -#undef __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER - -#define __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/adjacent_difference.h> -#include __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER -#undef __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER - diff --git a/compat/thrust/system/detail/adl/assign_value.h b/compat/thrust/system/detail/adl/assign_value.h deleted file mode 100644 index b5c588ace7..0000000000 --- a/compat/thrust/system/detail/adl/assign_value.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the assign_value.h header -// of the host and device systems. 
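The cuda::vector interface whose removal ends above is the container counterpart of that allocator. A minimal sketch under the same assumptions (vendored headers on the include path, compiled with nvcc); the names d_vec and h_vec are illustrative only.

// Hypothetical usage sketch of the removed thrust::cuda::vector container.
#include <thrust/system/cuda/vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>

int main()
{
    // elements live in memory reachable by the CUDA system,
    // obtained through cuda::allocator by default
    thrust::cuda::vector<int> d_vec(10);
    thrust::sequence(d_vec.begin(), d_vec.end());   // 0, 1, ..., 9

    // cross-system copy back into host memory, as with device_vector
    thrust::host_vector<int> h_vec(d_vec.size());
    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());

    return 0;
}

The documented constructors from std::vector and from other Thrust vector-like objects cover the remaining ways of moving data in and out of the container.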
It should be #included in any -// code which uses adl to dispatch assign_value - -#define __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/assign_value.h> -#include __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER -#undef __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER - -#define __THRUST_DEVICE_SYSTEM_ASSIGN_VALUE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/assign_value.h> -#include __THRUST_DEVICE_SYSTEM_ASSIGN_VALUE_HEADER -#undef __THRUST_DEVICE_SYSTEM_ASSIGN_VALUE_HEADER - diff --git a/compat/thrust/system/detail/adl/binary_search.h b/compat/thrust/system/detail/adl/binary_search.h deleted file mode 100644 index 7accfbc381..0000000000 --- a/compat/thrust/system/detail/adl/binary_search.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the binary_search.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch binary_search - -#define __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/binary_search.h> -#include __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER -#undef __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER - -#define __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/binary_search.h> -#include __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER -#undef __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER - diff --git a/compat/thrust/system/detail/adl/copy.h b/compat/thrust/system/detail/adl/copy.h deleted file mode 100644 index 91a32cd345..0000000000 --- a/compat/thrust/system/detail/adl/copy.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the copy.h header -// of the host and device systems. 
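Every header under system/detail/adl/ removed from here on repeats the same three-line define/include/undef pattern already seen for adjacent_difference, assign_value and binary_search; only the algorithm name changes. The following sketch shows what that pattern amounts to, assuming the stock Thrust configuration in which the host system is CPP and the device system is CUDA, so that __THRUST_HOST_SYSTEM_ROOT and __THRUST_DEVICE_SYSTEM_ROOT expand to thrust/system/cpp and thrust/system/cuda respectively.

// Effective content of adl/adjacent_difference.h after macro expansion
// (assumed default host/device systems; resulting paths shown in comments).
#include <thrust/detail/config.h>  // defines __THRUST_HOST_SYSTEM_ROOT / __THRUST_DEVICE_SYSTEM_ROOT

#define __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/adjacent_difference.h>
#include __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER    // i.e. <thrust/system/cpp/detail/adjacent_difference.h>
#undef __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER

#define __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/adjacent_difference.h>
#include __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER  // i.e. <thrust/system/cuda/detail/adjacent_difference.h>
#undef __THRUST_DEVICE_SYSTEM_ADJACENT_DIFFERENCE_HEADER

// With both backend overloads now visible, an unqualified call such as
// adjacent_difference(exec, first, last, result) made from generic Thrust code
// is resolved by argument-dependent lookup against the system of `exec`.

The same expansion applies, with only the algorithm name substituted, to each of the remaining adl/*.h headers deleted below.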
It should be #included in any -// code which uses adl to dispatch copy - -#define __THRUST_HOST_SYSTEM_COPY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/copy.h> -#include __THRUST_HOST_SYSTEM_COPY_HEADER -#undef __THRUST_HOST_SYSTEM_COPY_HEADER - -#define __THRUST_DEVICE_SYSTEM_COPY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/copy.h> -#include __THRUST_DEVICE_SYSTEM_COPY_HEADER -#undef __THRUST_DEVICE_SYSTEM_COPY_HEADER - diff --git a/compat/thrust/system/detail/adl/copy_if.h b/compat/thrust/system/detail/adl/copy_if.h deleted file mode 100644 index fd1df977ab..0000000000 --- a/compat/thrust/system/detail/adl/copy_if.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy_if.h of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the copy_if.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch copy_if - -#define __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/copy_if.h> -#include __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER -#undef __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER - -#define __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/copy_if.h> -#include __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER -#undef __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER - diff --git a/compat/thrust/system/detail/adl/count.h b/compat/thrust/system/detail/adl/count.h deleted file mode 100644 index 0dd9591a23..0000000000 --- a/compat/thrust/system/detail/adl/count.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a count of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the count.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch count - -#define __THRUST_HOST_SYSTEM_COUNT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/count.h> -#include __THRUST_HOST_SYSTEM_COUNT_HEADER -#undef __THRUST_HOST_SYSTEM_COUNT_HEADER - -#define __THRUST_DEVICE_SYSTEM_COUNT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/count.h> -#include __THRUST_DEVICE_SYSTEM_COUNT_HEADER -#undef __THRUST_DEVICE_SYSTEM_COUNT_HEADER - diff --git a/compat/thrust/system/detail/adl/equal.h b/compat/thrust/system/detail/adl/equal.h deleted file mode 100644 index f933d4f93a..0000000000 --- a/compat/thrust/system/detail/adl/equal.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a equal of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the equal.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch equal - -#define __THRUST_HOST_SYSTEM_EQUAL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/equal.h> -#include __THRUST_HOST_SYSTEM_EQUAL_HEADER -#undef __THRUST_HOST_SYSTEM_EQUAL_HEADER - -#define __THRUST_DEVICE_SYSTEM_EQUAL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/equal.h> -#include __THRUST_DEVICE_SYSTEM_EQUAL_HEADER -#undef __THRUST_DEVICE_SYSTEM_EQUAL_HEADER - diff --git a/compat/thrust/system/detail/adl/extrema.h b/compat/thrust/system/detail/adl/extrema.h deleted file mode 100644 index c766570fc3..0000000000 --- a/compat/thrust/system/detail/adl/extrema.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a extrema of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the extrema.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch extrema - -#define __THRUST_HOST_SYSTEM_EXTREMA_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/extrema.h> -#include __THRUST_HOST_SYSTEM_EXTREMA_HEADER -#undef __THRUST_HOST_SYSTEM_EXTREMA_HEADER - -#define __THRUST_DEVICE_SYSTEM_EXTREMA_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/extrema.h> -#include __THRUST_DEVICE_SYSTEM_EXTREMA_HEADER -#undef __THRUST_DEVICE_SYSTEM_EXTREMA_HEADER - diff --git a/compat/thrust/system/detail/adl/fill.h b/compat/thrust/system/detail/adl/fill.h deleted file mode 100644 index b241b8a3dc..0000000000 --- a/compat/thrust/system/detail/adl/fill.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the fill.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch fill - -#define __THRUST_HOST_SYSTEM_FILL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/fill.h> -#include __THRUST_HOST_SYSTEM_FILL_HEADER -#undef __THRUST_HOST_SYSTEM_FILL_HEADER - -#define __THRUST_DEVICE_SYSTEM_FILL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/fill.h> -#include __THRUST_DEVICE_SYSTEM_FILL_HEADER -#undef __THRUST_DEVICE_SYSTEM_FILL_HEADER - diff --git a/compat/thrust/system/detail/adl/find.h b/compat/thrust/system/detail/adl/find.h deleted file mode 100644 index 7c99f3e7b1..0000000000 --- a/compat/thrust/system/detail/adl/find.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the find.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch find - -#define __THRUST_HOST_SYSTEM_FIND_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/find.h> -#include __THRUST_HOST_SYSTEM_FIND_HEADER -#undef __THRUST_HOST_SYSTEM_FIND_HEADER - -#define __THRUST_DEVICE_SYSTEM_FIND_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/find.h> -#include __THRUST_DEVICE_SYSTEM_FIND_HEADER -#undef __THRUST_DEVICE_SYSTEM_FIND_HEADER - diff --git a/compat/thrust/system/detail/adl/for_each.h b/compat/thrust/system/detail/adl/for_each.h deleted file mode 100644 index 0b2717f34c..0000000000 --- a/compat/thrust/system/detail/adl/for_each.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the for_each.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch for_each - -#define __THRUST_HOST_SYSTEM_FOR_EACH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/for_each.h> -#include __THRUST_HOST_SYSTEM_FOR_EACH_HEADER -#undef __THRUST_HOST_SYSTEM_FOR_EACH_HEADER - -#define __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/for_each.h> -#include __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER -#undef __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER - diff --git a/compat/thrust/system/detail/adl/gather.h b/compat/thrust/system/detail/adl/gather.h deleted file mode 100644 index da4c1d13d5..0000000000 --- a/compat/thrust/system/detail/adl/gather.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the gather.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch gather - -#define __THRUST_HOST_SYSTEM_FOR_EACH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/gather.h> -#include __THRUST_HOST_SYSTEM_FOR_EACH_HEADER -#undef __THRUST_HOST_SYSTEM_FOR_EACH_HEADER - -#define __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/gather.h> -#include __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER -#undef __THRUST_DEVICE_SYSTEM_FOR_EACH_HEADER - diff --git a/compat/thrust/system/detail/adl/generate.h b/compat/thrust/system/detail/adl/generate.h deleted file mode 100644 index 3a988478f0..0000000000 --- a/compat/thrust/system/detail/adl/generate.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the generate.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch generate - -#define __THRUST_HOST_SYSTEM_GENERATE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/generate.h> -#include __THRUST_HOST_SYSTEM_GENERATE_HEADER -#undef __THRUST_HOST_SYSTEM_GENERATE_HEADER - -#define __THRUST_DEVICE_SYSTEM_GENERATE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/generate.h> -#include __THRUST_DEVICE_SYSTEM_GENERATE_HEADER -#undef __THRUST_DEVICE_SYSTEM_GENERATE_HEADER - diff --git a/compat/thrust/system/detail/adl/get_value.h b/compat/thrust/system/detail/adl/get_value.h deleted file mode 100644 index ed4ef2cfef..0000000000 --- a/compat/thrust/system/detail/adl/get_value.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the get_value.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch get_value - -#define __THRUST_HOST_SYSTEM_GET_VALUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/get_value.h> -#include __THRUST_HOST_SYSTEM_GET_VALUE_HEADER -#undef __THRUST_HOST_SYSTEM_GET_VALUE_HEADER - -#define __THRUST_DEVICE_SYSTEM_GET_VALUE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/get_value.h> -#include __THRUST_DEVICE_SYSTEM_GET_VALUE_HEADER -#undef __THRUST_DEVICE_SYSTEM_GET_VALUE_HEADER - diff --git a/compat/thrust/system/detail/adl/inner_product.h b/compat/thrust/system/detail/adl/inner_product.h deleted file mode 100644 index 18cc65b9c1..0000000000 --- a/compat/thrust/system/detail/adl/inner_product.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the inner_product.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch inner_product - -#define __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/inner_product.h> -#include __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER -#undef __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER - -#define __THRUST_DEVICE_SYSTEM_INNER_PRODUCT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/inner_product.h> -#include __THRUST_DEVICE_SYSTEM_INNER_PRODUCT_HEADER -#undef __THRUST_DEVICE_SYSTEM_INNER_PRODUCT_HEADER - diff --git a/compat/thrust/system/detail/adl/iter_swap.h b/compat/thrust/system/detail/adl/iter_swap.h deleted file mode 100644 index b302c25f7f..0000000000 --- a/compat/thrust/system/detail/adl/iter_swap.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the iter_swap.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch iter_swap - -#define __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/iter_swap.h> -#include __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER -#undef __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER - -#define __THRUST_DEVICE_SYSTEM_ITER_SWAP_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/iter_swap.h> -#include __THRUST_DEVICE_SYSTEM_ITER_SWAP_HEADER -#undef __THRUST_DEVICE_SYSTEM_ITER_SWAP_HEADER - diff --git a/compat/thrust/system/detail/adl/logical.h b/compat/thrust/system/detail/adl/logical.h deleted file mode 100644 index 585f71af36..0000000000 --- a/compat/thrust/system/detail/adl/logical.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the logical.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch logical - -#define __THRUST_HOST_SYSTEM_LOGICAL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/logical.h> -#include __THRUST_HOST_SYSTEM_LOGICAL_HEADER -#undef __THRUST_HOST_SYSTEM_LOGICAL_HEADER - -#define __THRUST_DEVICE_SYSTEM_LOGICAL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/logical.h> -#include __THRUST_DEVICE_SYSTEM_LOGICAL_HEADER -#undef __THRUST_DEVICE_SYSTEM_LOGICAL_HEADER - diff --git a/compat/thrust/system/detail/adl/malloc_and_free.h b/compat/thrust/system/detail/adl/malloc_and_free.h deleted file mode 100644 index 7d99a260df..0000000000 --- a/compat/thrust/system/detail/adl/malloc_and_free.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the malloc_and_free.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch malloc_and_free - -#define __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/malloc_and_free.h> -#include __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER -#undef __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER - -#define __THRUST_DEVICE_SYSTEM_MALLOC_AND_FREE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/malloc_and_free.h> -#include __THRUST_DEVICE_SYSTEM_MALLOC_AND_FREE_HEADER -#undef __THRUST_DEVICE_SYSTEM_MALLOC_AND_FREE_HEADER - diff --git a/compat/thrust/system/detail/adl/merge.h b/compat/thrust/system/detail/adl/merge.h deleted file mode 100644 index 59d8aceb49..0000000000 --- a/compat/thrust/system/detail/adl/merge.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the merge.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch merge - -#define __THRUST_HOST_SYSTEM_MERGE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/merge.h> -#include __THRUST_HOST_SYSTEM_MERGE_HEADER -#undef __THRUST_HOST_SYSTEM_MERGE_HEADER - -#define __THRUST_DEVICE_SYSTEM_MERGE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/merge.h> -#include __THRUST_DEVICE_SYSTEM_MERGE_HEADER -#undef __THRUST_DEVICE_SYSTEM_MERGE_HEADER - diff --git a/compat/thrust/system/detail/adl/mismatch.h b/compat/thrust/system/detail/adl/mismatch.h deleted file mode 100644 index d2d1831374..0000000000 --- a/compat/thrust/system/detail/adl/mismatch.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the mismatch.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch mismatch - -#define __THRUST_HOST_SYSTEM_MISMATCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/mismatch.h> -#include __THRUST_HOST_SYSTEM_MISMATCH_HEADER -#undef __THRUST_HOST_SYSTEM_MISMATCH_HEADER - -#define __THRUST_DEVICE_SYSTEM_MISMATCH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/mismatch.h> -#include __THRUST_DEVICE_SYSTEM_MISMATCH_HEADER -#undef __THRUST_DEVICE_SYSTEM_MISMATCH_HEADER - diff --git a/compat/thrust/system/detail/adl/partition.h b/compat/thrust/system/detail/adl/partition.h deleted file mode 100644 index efdc60555b..0000000000 --- a/compat/thrust/system/detail/adl/partition.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the partition.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch partition - -#define __THRUST_HOST_SYSTEM_PARTITION_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/partition.h> -#include __THRUST_HOST_SYSTEM_PARTITION_HEADER -#undef __THRUST_HOST_SYSTEM_PARTITION_HEADER - -#define __THRUST_DEVICE_SYSTEM_PARTITION_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/partition.h> -#include __THRUST_DEVICE_SYSTEM_PARTITION_HEADER -#undef __THRUST_DEVICE_SYSTEM_PARTITION_HEADER - diff --git a/compat/thrust/system/detail/adl/reduce.h b/compat/thrust/system/detail/adl/reduce.h deleted file mode 100644 index afa00f9c60..0000000000 --- a/compat/thrust/system/detail/adl/reduce.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the reduce.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch reduce - -#define __THRUST_HOST_SYSTEM_REDUCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reduce.h> -#include __THRUST_HOST_SYSTEM_REDUCE_HEADER -#undef __THRUST_HOST_SYSTEM_REDUCE_HEADER - -#define __THRUST_DEVICE_SYSTEM_REDUCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/reduce.h> -#include __THRUST_DEVICE_SYSTEM_REDUCE_HEADER -#undef __THRUST_DEVICE_SYSTEM_REDUCE_HEADER - diff --git a/compat/thrust/system/detail/adl/reduce_by_key.h b/compat/thrust/system/detail/adl/reduce_by_key.h deleted file mode 100644 index eac65b72d2..0000000000 --- a/compat/thrust/system/detail/adl/reduce_by_key.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the reduce_by_key.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch reduce_by_key - -#define __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reduce_by_key.h> -#include __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER -#undef __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER - -#define __THRUST_DEVICE_SYSTEM_REDUCE_BY_KEY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/reduce_by_key.h> -#include __THRUST_DEVICE_SYSTEM_REDUCE_BY_KEY_HEADER -#undef __THRUST_DEVICE_SYSTEM_REDUCE_BY_KEY_HEADER - diff --git a/compat/thrust/system/detail/adl/remove.h b/compat/thrust/system/detail/adl/remove.h deleted file mode 100644 index 9d64be8da3..0000000000 --- a/compat/thrust/system/detail/adl/remove.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the remove.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch remove - -#define __THRUST_HOST_SYSTEM_REMOVE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/remove.h> -#include __THRUST_HOST_SYSTEM_REMOVE_HEADER -#undef __THRUST_HOST_SYSTEM_REMOVE_HEADER - -#define __THRUST_DEVICE_SYSTEM_REMOVE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/remove.h> -#include __THRUST_DEVICE_SYSTEM_REMOVE_HEADER -#undef __THRUST_DEVICE_SYSTEM_REMOVE_HEADER - diff --git a/compat/thrust/system/detail/adl/replace.h b/compat/thrust/system/detail/adl/replace.h deleted file mode 100644 index e4d8bd22ac..0000000000 --- a/compat/thrust/system/detail/adl/replace.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the replace.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch replace - -#define __THRUST_HOST_SYSTEM_REPLACE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/replace.h> -#include __THRUST_HOST_SYSTEM_REPLACE_HEADER -#undef __THRUST_HOST_SYSTEM_REPLACE_HEADER - -#define __THRUST_DEVICE_SYSTEM_REPLACE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/replace.h> -#include __THRUST_DEVICE_SYSTEM_REPLACE_HEADER -#undef __THRUST_DEVICE_SYSTEM_REPLACE_HEADER - diff --git a/compat/thrust/system/detail/adl/reverse.h b/compat/thrust/system/detail/adl/reverse.h deleted file mode 100644 index 8cbcfd833c..0000000000 --- a/compat/thrust/system/detail/adl/reverse.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the reverse.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch reverse - -#define __THRUST_HOST_SYSTEM_REVERSE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reverse.h> -#include __THRUST_HOST_SYSTEM_REVERSE_HEADER -#undef __THRUST_HOST_SYSTEM_REVERSE_HEADER - -#define __THRUST_DEVICE_SYSTEM_REVERSE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/reverse.h> -#include __THRUST_DEVICE_SYSTEM_REVERSE_HEADER -#undef __THRUST_DEVICE_SYSTEM_REVERSE_HEADER - diff --git a/compat/thrust/system/detail/adl/scan.h b/compat/thrust/system/detail/adl/scan.h deleted file mode 100644 index e70cd9fdda..0000000000 --- a/compat/thrust/system/detail/adl/scan.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the scan.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch scan - -#define __THRUST_HOST_SYSTEM_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scan.h> -#include __THRUST_HOST_SYSTEM_SCAN_HEADER -#undef __THRUST_HOST_SYSTEM_SCAN_HEADER - -#define __THRUST_DEVICE_SYSTEM_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/scan.h> -#include __THRUST_DEVICE_SYSTEM_SCAN_HEADER -#undef __THRUST_DEVICE_SYSTEM_SCAN_HEADER - diff --git a/compat/thrust/system/detail/adl/scan_by_key.h b/compat/thrust/system/detail/adl/scan_by_key.h deleted file mode 100644 index 02c4b84751..0000000000 --- a/compat/thrust/system/detail/adl/scan_by_key.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the scan_by_key.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch scan_by_key - -#define __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scan_by_key.h> -#include __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER -#undef __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER - -#define __THRUST_DEVICE_SYSTEM_SCAN_BY_KEY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/scan_by_key.h> -#include __THRUST_DEVICE_SYSTEM_SCAN_BY_KEY_HEADER -#undef __THRUST_DEVICE_SYSTEM_SCAN_BY_KEY_HEADER - diff --git a/compat/thrust/system/detail/adl/scatter.h b/compat/thrust/system/detail/adl/scatter.h deleted file mode 100644 index b94b0d9892..0000000000 --- a/compat/thrust/system/detail/adl/scatter.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the scatter.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch scatter - -#define __THRUST_HOST_SYSTEM_SCATTER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scatter.h> -#include __THRUST_HOST_SYSTEM_SCATTER_HEADER -#undef __THRUST_HOST_SYSTEM_SCATTER_HEADER - -#define __THRUST_DEVICE_SYSTEM_SCATTER_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/scatter.h> -#include __THRUST_DEVICE_SYSTEM_SCATTER_HEADER -#undef __THRUST_DEVICE_SYSTEM_SCATTER_HEADER - diff --git a/compat/thrust/system/detail/adl/sequence.h b/compat/thrust/system/detail/adl/sequence.h deleted file mode 100644 index 07dcc7b7c1..0000000000 --- a/compat/thrust/system/detail/adl/sequence.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the sequence.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch sequence - -#define __THRUST_HOST_SYSTEM_SEQUENCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/sequence.h> -#include __THRUST_HOST_SYSTEM_SEQUENCE_HEADER -#undef __THRUST_HOST_SYSTEM_SEQUENCE_HEADER - -#define __THRUST_DEVICE_SYSTEM_SEQUENCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/sequence.h> -#include __THRUST_DEVICE_SYSTEM_SEQUENCE_HEADER -#undef __THRUST_DEVICE_SYSTEM_SEQUENCE_HEADER - diff --git a/compat/thrust/system/detail/adl/set_operations.h b/compat/thrust/system/detail/adl/set_operations.h deleted file mode 100644 index 9901b46851..0000000000 --- a/compat/thrust/system/detail/adl/set_operations.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the set_operations.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch set_operations - -#define __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/set_operations.h> -#include __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER -#undef __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER - -#define __THRUST_DEVICE_SYSTEM_SET_OPERATIONS_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/set_operations.h> -#include __THRUST_DEVICE_SYSTEM_SET_OPERATIONS_HEADER -#undef __THRUST_DEVICE_SYSTEM_SET_OPERATIONS_HEADER - diff --git a/compat/thrust/system/detail/adl/sort.h b/compat/thrust/system/detail/adl/sort.h deleted file mode 100644 index afcb903f87..0000000000 --- a/compat/thrust/system/detail/adl/sort.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the sort.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch sort - -#define __THRUST_HOST_SYSTEM_SORT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/sort.h> -#include __THRUST_HOST_SYSTEM_SORT_HEADER -#undef __THRUST_HOST_SYSTEM_SORT_HEADER - -#define __THRUST_DEVICE_SYSTEM_SORT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/sort.h> -#include __THRUST_DEVICE_SYSTEM_SORT_HEADER -#undef __THRUST_DEVICE_SYSTEM_SORT_HEADER - diff --git a/compat/thrust/system/detail/adl/swap_ranges.h b/compat/thrust/system/detail/adl/swap_ranges.h deleted file mode 100644 index c0069369e0..0000000000 --- a/compat/thrust/system/detail/adl/swap_ranges.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the swap_ranges.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch swap_ranges - -#define __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/swap_ranges.h> -#include __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER -#undef __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER - -#define __THRUST_DEVICE_SYSTEM_SWAP_RANGES_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/swap_ranges.h> -#include __THRUST_DEVICE_SYSTEM_SWAP_RANGES_HEADER -#undef __THRUST_DEVICE_SYSTEM_SWAP_RANGES_HEADER - diff --git a/compat/thrust/system/detail/adl/tabulate.h b/compat/thrust/system/detail/adl/tabulate.h deleted file mode 100644 index cb1fdebd11..0000000000 --- a/compat/thrust/system/detail/adl/tabulate.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the tabulate.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch tabulate - -#define __THRUST_HOST_SYSTEM_TABULATE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/tabulate.h> -#include __THRUST_HOST_SYSTEM_TABULATE_HEADER -#undef __THRUST_HOST_SYSTEM_TABULATE_HEADER - -#define __THRUST_DEVICE_SYSTEM_TABULATE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/tabulate.h> -#include __THRUST_DEVICE_SYSTEM_TABULATE_HEADER -#undef __THRUST_DEVICE_SYSTEM_TABULATE_HEADER - diff --git a/compat/thrust/system/detail/adl/temporary_buffer.h b/compat/thrust/system/detail/adl/temporary_buffer.h deleted file mode 100644 index 66df0ea85b..0000000000 --- a/compat/thrust/system/detail/adl/temporary_buffer.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the temporary_buffer.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch get_temporary_buffer or return_temporary_buffer - -#define __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/temporary_buffer.h> -#include __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER -#undef __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER - -#define __THRUST_DEVICE_SYSTEM_TEMPORARY_BUFFER_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/temporary_buffer.h> -#include __THRUST_DEVICE_SYSTEM_TEMPORARY_BUFFER_HEADER -#undef __THRUST_DEVICE_SYSTEM_TEMPORARY_BUFFER_HEADER - diff --git a/compat/thrust/system/detail/adl/transform.h b/compat/thrust/system/detail/adl/transform.h deleted file mode 100644 index c9e6a01ea1..0000000000 --- a/compat/thrust/system/detail/adl/transform.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the transform.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch transform - -#define __THRUST_HOST_SYSTEM_TRANSFORM_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform.h> -#include __THRUST_HOST_SYSTEM_TRANSFORM_HEADER -#undef __THRUST_HOST_SYSTEM_TRANSFORM_HEADER - -#define __THRUST_DEVICE_SYSTEM_TRANSFORM_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/transform.h> -#include __THRUST_DEVICE_SYSTEM_TRANSFORM_HEADER -#undef __THRUST_DEVICE_SYSTEM_TRANSFORM_HEADER - diff --git a/compat/thrust/system/detail/adl/transform_reduce.h b/compat/thrust/system/detail/adl/transform_reduce.h deleted file mode 100644 index 0a5d97749f..0000000000 --- a/compat/thrust/system/detail/adl/transform_reduce.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the transform_reduce.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch transform_reduce - -#define __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform_reduce.h> -#include __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER -#undef __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER - -#define __THRUST_DEVICE_SYSTEM_TRANSFORM_REDUCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/transform_reduce.h> -#include __THRUST_DEVICE_SYSTEM_TRANSFORM_REDUCE_HEADER -#undef __THRUST_DEVICE_SYSTEM_TRANSFORM_REDUCE_HEADER - diff --git a/compat/thrust/system/detail/adl/transform_scan.h b/compat/thrust/system/detail/adl/transform_scan.h deleted file mode 100644 index 47c1dc3ae9..0000000000 --- a/compat/thrust/system/detail/adl/transform_scan.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the transform_scan.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch transform_scan - -#define __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform_scan.h> -#include __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER -#undef __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER - -#define __THRUST_DEVICE_SYSTEM_TRANSFORM_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/transform_scan.h> -#include __THRUST_DEVICE_SYSTEM_TRANSFORM_SCAN_HEADER -#undef __THRUST_DEVICE_SYSTEM_TRANSFORM_SCAN_HEADER - diff --git a/compat/thrust/system/detail/adl/uninitialized_copy.h b/compat/thrust/system/detail/adl/uninitialized_copy.h deleted file mode 100644 index 7cb0b8e401..0000000000 --- a/compat/thrust/system/detail/adl/uninitialized_copy.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the uninitialized_copy.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch uninitialized_copy - -#define __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/uninitialized_copy.h> -#include __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER -#undef __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER - -#define __THRUST_DEVICE_SYSTEM_UNINITIALIZED_COPY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/uninitialized_copy.h> -#include __THRUST_DEVICE_SYSTEM_UNINITIALIZED_COPY_HEADER -#undef __THRUST_DEVICE_SYSTEM_UNINITIALIZED_COPY_HEADER - diff --git a/compat/thrust/system/detail/adl/uninitialized_fill.h b/compat/thrust/system/detail/adl/uninitialized_fill.h deleted file mode 100644 index 9f00b51622..0000000000 --- a/compat/thrust/system/detail/adl/uninitialized_fill.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the uninitialized_fill.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch uninitialized_fill - -#define __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/uninitialized_fill.h> -#include __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER -#undef __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER - -#define __THRUST_DEVICE_SYSTEM_UNINITIALIZED_FILL_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/uninitialized_fill.h> -#include __THRUST_DEVICE_SYSTEM_UNINITIALIZED_FILL_HEADER -#undef __THRUST_DEVICE_SYSTEM_UNINITIALIZED_FILL_HEADER - diff --git a/compat/thrust/system/detail/adl/unique.h b/compat/thrust/system/detail/adl/unique.h deleted file mode 100644 index 932ff58e07..0000000000 --- a/compat/thrust/system/detail/adl/unique.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the unique.h header -// of the host and device systems. It should be #included in any -// code which uses adl to dispatch unique - -#define __THRUST_HOST_SYSTEM_UNIQUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/unique.h> -#include __THRUST_HOST_SYSTEM_UNIQUE_HEADER -#undef __THRUST_HOST_SYSTEM_UNIQUE_HEADER - -#define __THRUST_DEVICE_SYSTEM_UNIQUE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/unique.h> -#include __THRUST_DEVICE_SYSTEM_UNIQUE_HEADER -#undef __THRUST_DEVICE_SYSTEM_UNIQUE_HEADER - diff --git a/compat/thrust/system/detail/adl/unique_by_key.h b/compat/thrust/system/detail/adl/unique_by_key.h deleted file mode 100644 index 30e6f2f2d1..0000000000 --- a/compat/thrust/system/detail/adl/unique_by_key.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a fill of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// the purpose of this header is to #include the unique_by_key.h header -// of the host and device systems. 
It should be #included in any -// code which uses adl to dispatch unique_by_key - -#define __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/unique_by_key.h> -#include __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER -#undef __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER - -#define __THRUST_DEVICE_SYSTEM_UNIQUE_BY_KEY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/unique_by_key.h> -#include __THRUST_DEVICE_SYSTEM_UNIQUE_BY_KEY_HEADER -#undef __THRUST_DEVICE_SYSTEM_UNIQUE_BY_KEY_HEADER - diff --git a/compat/thrust/system/detail/bad_alloc.h b/compat/thrust/system/detail/bad_alloc.h deleted file mode 100644 index bb73d1f006..0000000000 --- a/compat/thrust/system/detail/bad_alloc.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ - -// define our own bad_alloc so we can set its .what() -class bad_alloc - : public std::bad_alloc -{ - public: - inline bad_alloc(const std::string &w) - : std::bad_alloc(), m_what() - { - m_what = std::bad_alloc::what(); - m_what += ": "; - m_what += w; - } // end bad_alloc() - - inline virtual ~bad_alloc(void) throw () {}; - - inline virtual const char *what(void) const throw() - { - return m_what.c_str(); - } // end what() - - private: - std::string m_what; -}; // end bad_alloc - -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/errno.h b/compat/thrust/system/detail/errno.h deleted file mode 100644 index 34bc8cc568..0000000000 --- a/compat/thrust/system/detail/errno.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include - -// The rationale for the existence of these apparently redundant definitions is -// to provide them portably and to avoid bringing in system headers which might -// pollute the global namespace. These identifiers are in lowercase to avoid -// colliding with the real macros in errno.h. 
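[Editor's sketch] The adl dispatch headers deleted above all rely on the same computed-include idiom: define a macro that names the host- or device-system header, let #include consume the macro, then #undef it again. A minimal standalone sketch of that preprocessor technique follows; the macro name is hypothetical and <algorithm> merely stands in for a system-specific detail header.

// Sketch of the computed-include idiom used by the deleted adl headers.
// MY_SYSTEM_SORT_HEADER is a made-up name; Thrust composes the real path
// from __THRUST_HOST_SYSTEM_ROOT / __THRUST_DEVICE_SYSTEM_ROOT instead.
#define MY_SYSTEM_SORT_HEADER <algorithm>
#include MY_SYSTEM_SORT_HEADER
#undef MY_SYSTEM_SORT_HEADER

#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> v{3, 1, 2};
    std::sort(v.begin(), v.end());           // declared by the computed include
    std::printf("%d %d %d\n", v[0], v[1], v[2]);
    return 0;
}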
- -namespace thrust -{ - -namespace system -{ - -namespace detail -{ - -static const int eafnosupport = 9901; -static const int eaddrinuse = 9902; -static const int eaddrnotavail = 9903; -static const int eisconn = 9904; -static const int ebadmsg = 9905; -static const int econnaborted = 9906; -static const int ealready = 9907; -static const int econnrefused = 9908; -static const int econnreset = 9909; -static const int edestaddrreq = 9910; -static const int ehostunreach = 9911; -static const int eidrm = 9912; -static const int emsgsize = 9913; -static const int enetdown = 9914; -static const int enetreset = 9915; -static const int enetunreach = 9916; -static const int enobufs = 9917; -static const int enolink = 9918; -static const int enodata = 9919; -static const int enomsg = 9920; -static const int enoprotoopt = 9921; -static const int enosr = 9922; -static const int enotsock = 9923; -static const int enostr = 9924; -static const int enotconn = 9925; -static const int enotsup = 9926; -static const int ecanceled = 9927; -static const int einprogress = 9928; -static const int eopnotsupp = 9929; -static const int ewouldblock = 9930; -static const int eownerdead = 9931; -static const int eproto = 9932; -static const int eprotonosupport = 9933; -static const int enotrecoverable = 9934; -static const int etime = 9935; -static const int etxtbsy = 9936; -static const int etimedout = 9938; -static const int eloop = 9939; -static const int eoverflow = 9940; -static const int eprototype = 9941; -static const int enosys = 9942; -static const int einval = 9943; -static const int erange = 9944; -static const int eilseq = 9945; -static const int e2big = 9946; -static const int edom = 9947; -static const int efault = 9948; -static const int ebadf = 9949; -static const int epipe = 9950; -static const int exdev = 9951; -static const int ebusy = 9952; -static const int enotempty = 9953; -static const int enoexec = 9954; -static const int eexist = 9955; -static const int efbig = 9956; -static const int enametoolong = 9957; -static const int enotty = 9958; -static const int eintr = 9959; -static const int espipe = 9960; -static const int eio = 9961; -static const int eisdir = 9962; -static const int echild = 9963; -static const int enolck = 9964; -static const int enospc = 9965; -static const int enxio = 9966; -static const int enodev = 9967; -static const int enoent = 9968; -static const int esrch = 9969; -static const int enotdir = 9970; -static const int enomem = 9971; -static const int eperm = 9972; -static const int eacces = 9973; -static const int erofs = 9974; -static const int edeadlk = 9975; -static const int eagain = 9976; -static const int enfile = 9977; -static const int emfile = 9978; -static const int emlink = 9979; - -} // end detail - -} // end system - -} // end thrust - diff --git a/compat/thrust/system/detail/error_category.inl b/compat/thrust/system/detail/error_category.inl deleted file mode 100644 index 8e19c89db5..0000000000 --- a/compat/thrust/system/detail/error_category.inl +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -namespace system -{ - -error_category - ::~error_category(void) -{ - ; -} // end error_category::~error_category() - - -error_condition error_category - ::default_error_condition(int ev) const -{ - return error_condition(ev, *this); -} // end error_category::default_error_condition() - - -bool error_category - ::equivalent(int code, const error_condition &condition) const -{ - return default_error_condition(code) == condition; -} // end error_condition::equivalent() - - -bool error_category - ::equivalent(const error_code &code, int condition) const -{ - bool result = (this->operator==(code.category())) && (code.value() == condition); - return result; -} // end error_code::equivalent() - - -bool error_category - ::operator==(const error_category &rhs) const -{ - return this == &rhs; -} // end error_category::operator==() - - -bool error_category - ::operator!=(const error_category &rhs) const -{ - return !this->operator==(rhs); -} // end error_category::operator!=() - - -bool error_category - ::operator<(const error_category &rhs) const -{ - return thrust::less()(this,&rhs); -} // end error_category::operator<() - - -namespace detail -{ - - -class generic_error_category - : public error_category -{ - public: - inline generic_error_category(void) {} - - inline virtual const char *name(void) const - { - return "generic"; - } - - inline virtual std::string message(int ev) const - { - static const std::string unknown_err("Unknown error"); - - // XXX strerror is not thread-safe: - // prefer strerror_r (which is not provided on windows) - const char *c_str = std::strerror(ev); - return c_str ? 
std::string(c_str) : unknown_err; - } -}; // end generic_category_result - - -class system_error_category - : public error_category -{ - public: - inline system_error_category(void) {} - - inline virtual const char *name(void) const - { - return "system"; - } - - inline virtual std::string message(int ev) const - { - return generic_category().message(ev); - } - - inline virtual error_condition default_error_condition(int ev) const - { - using namespace errc; - - switch(ev) - { - case eafnosupport: return make_error_condition(address_family_not_supported); - case eaddrinuse: return make_error_condition(address_in_use); - case eaddrnotavail: return make_error_condition(address_not_available); - case eisconn: return make_error_condition(already_connected); - case e2big: return make_error_condition(argument_list_too_long); - case edom: return make_error_condition(argument_out_of_domain); - case efault: return make_error_condition(bad_address); - case ebadf: return make_error_condition(bad_file_descriptor); - case ebadmsg: return make_error_condition(bad_message); - case epipe: return make_error_condition(broken_pipe); - case econnaborted: return make_error_condition(connection_aborted); - case ealready: return make_error_condition(connection_already_in_progress); - case econnrefused: return make_error_condition(connection_refused); - case econnreset: return make_error_condition(connection_reset); - case exdev: return make_error_condition(cross_device_link); - case edestaddrreq: return make_error_condition(destination_address_required); - case ebusy: return make_error_condition(device_or_resource_busy); - case enotempty: return make_error_condition(directory_not_empty); - case enoexec: return make_error_condition(executable_format_error); - case eexist: return make_error_condition(file_exists); - case efbig: return make_error_condition(file_too_large); - case enametoolong: return make_error_condition(filename_too_long); - case enosys: return make_error_condition(function_not_supported); - case ehostunreach: return make_error_condition(host_unreachable); - case eidrm: return make_error_condition(identifier_removed); - case eilseq: return make_error_condition(illegal_byte_sequence); - case enotty: return make_error_condition(inappropriate_io_control_operation); - case eintr: return make_error_condition(interrupted); - case einval: return make_error_condition(invalid_argument); - case espipe: return make_error_condition(invalid_seek); - case eio: return make_error_condition(io_error); - case eisdir: return make_error_condition(is_a_directory); - case emsgsize: return make_error_condition(message_size); - case enetdown: return make_error_condition(network_down); - case enetreset: return make_error_condition(network_reset); - case enetunreach: return make_error_condition(network_unreachable); - case enobufs: return make_error_condition(no_buffer_space); - case echild: return make_error_condition(no_child_process); - case enolink: return make_error_condition(no_link); - case enolck: return make_error_condition(no_lock_available); - case enodata: return make_error_condition(no_message_available); - case enomsg: return make_error_condition(no_message); - case enoprotoopt: return make_error_condition(no_protocol_option); - case enospc: return make_error_condition(no_space_on_device); - case enosr: return make_error_condition(no_stream_resources); - case enxio: return make_error_condition(no_such_device_or_address); - case enodev: return make_error_condition(no_such_device); - case enoent: return 
make_error_condition(no_such_file_or_directory); - case esrch: return make_error_condition(no_such_process); - case enotdir: return make_error_condition(not_a_directory); - case enotsock: return make_error_condition(not_a_socket); - case enostr: return make_error_condition(not_a_stream); - case enotconn: return make_error_condition(not_connected); - case enomem: return make_error_condition(not_enough_memory); - case enotsup: return make_error_condition(not_supported); - case ecanceled: return make_error_condition(operation_canceled); - case einprogress: return make_error_condition(operation_in_progress); - case eperm: return make_error_condition(operation_not_permitted); - case eopnotsupp: return make_error_condition(operation_not_supported); - case ewouldblock: return make_error_condition(operation_would_block); - case eownerdead: return make_error_condition(owner_dead); - case eacces: return make_error_condition(permission_denied); - case eproto: return make_error_condition(protocol_error); - case eprotonosupport: return make_error_condition(protocol_not_supported); - case erofs: return make_error_condition(read_only_file_system); - case edeadlk: return make_error_condition(resource_deadlock_would_occur); - case eagain: return make_error_condition(resource_unavailable_try_again); - case erange: return make_error_condition(result_out_of_range); - case enotrecoverable: return make_error_condition(state_not_recoverable); - case etime: return make_error_condition(stream_timeout); - case etxtbsy: return make_error_condition(text_file_busy); - case etimedout: return make_error_condition(timed_out); - case enfile: return make_error_condition(too_many_files_open_in_system); - case emfile: return make_error_condition(too_many_files_open); - case emlink: return make_error_condition(too_many_links); - case eloop: return make_error_condition(too_many_symbolic_link_levels); - case eoverflow: return make_error_condition(value_too_large); - case eprototype: return make_error_condition(wrong_protocol_type); - default: return error_condition(ev,system_category()); - } - } -}; // end system_category_result - - -} // end detail - - -const error_category &generic_category(void) -{ - static const detail::generic_error_category result; - return result; -} - - -const error_category &system_category(void) -{ - static const detail::system_error_category result; - return result; -} - - -} // end system - -} // end thrust - diff --git a/compat/thrust/system/detail/error_code.inl b/compat/thrust/system/detail/error_code.inl deleted file mode 100644 index 0cf86b4821..0000000000 --- a/compat/thrust/system/detail/error_code.inl +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
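[Editor's sketch] The error_category.inl removed above mirrors the C++11 <system_error> design: a category object translates raw integer codes into portable error_condition values and message strings, with the large switch playing the role of default_error_condition. A minimal sketch of the same behaviour using the standard facility rather than Thrust's; ENOENT and the expected condition are illustrative.

#include <cerrno>
#include <iostream>
#include <system_error>

int main()
{
    // A raw platform code paired with a category, as in the deleted classes.
    std::error_code ec(ENOENT, std::system_category());

    // default_error_condition() maps the raw value onto a portable condition,
    // which is the job of the deleted switch over the e* constants.
    std::error_condition cond = ec.default_error_condition();

    std::cout << ec.category().name() << ':' << ec.value() << '\n';
    std::cout << "message: " << cond.message() << '\n';
    std::cout << "no_such_file_or_directory? "
              << (cond == std::errc::no_such_file_or_directory) << '\n';
    return 0;
}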
- */ - - -#pragma once - -#include - -namespace thrust -{ - -namespace system -{ - -error_code - ::error_code(void) - :m_val(0),m_cat(&system_category()) -{ - ; -} // end error_code::error_code() - - -error_code - ::error_code(int val, const error_category &cat) - :m_val(val),m_cat(&cat) -{ - ; -} // end error_code::error_code() - - -template - error_code - ::error_code(ErrorCodeEnum e -// XXX WAR msvc's problem with enable_if -#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - , typename thrust::detail::enable_if::value>::type * -#endif // THRUST_HOST_COMPILER_MSVC - ) -{ - *this = make_error_code(e); -} // end error_code::error_code() - - -void error_code - ::assign(int val, const error_category &cat) -{ - m_val = val; - m_cat = &cat; -} // end error_code::assign() - - -template -// XXX WAR msvc's problem with enable_if -#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - typename thrust::detail::enable_if::value, error_code>::type & -#else - error_code & -#endif // THRUST_HOST_COMPILER_MSVC - error_code - ::operator=(ErrorCodeEnum e) -{ - *this = make_error_code(e); - return *this; -} // end error_code::operator=() - - -void error_code - ::clear(void) -{ - m_val = 0; - m_cat = &system_category(); -} // end error_code::clear() - - -int error_code - ::value(void) const -{ - return m_val; -} // end error_code::value() - - -const error_category &error_code - ::category(void) const -{ - return *m_cat; -} // end error_code::category() - - -error_condition error_code - ::default_error_condition(void) const -{ - return category().default_error_condition(value()); -} // end error_code::default_error_condition() - - -std::string error_code - ::message(void) const -{ - return category().message(value()); -} // end error_code::message() - - -error_code - ::operator bool (void) const -{ - return value() != 0; -} // end error_code::operator bool () - - -error_code make_error_code(errc::errc_t e) -{ - return error_code(static_cast(e), generic_category()); -} // end make_error_code() - - -bool operator<(const error_code &lhs, const error_code &rhs) -{ - bool result = lhs.category().operator<(rhs.category()); - result = result || lhs.category().operator==(rhs.category()); - result = result || lhs.value() < rhs.value(); - return result; -} // end operator==() - - -template - std::basic_ostream& - operator<<(std::basic_ostream &os, const error_code &ec) -{ - return os << ec.category().name() << ':' << ec.value(); -} // end operator<<() - - -bool operator==(const error_code &lhs, const error_code &rhs) -{ - return lhs.category().operator==(rhs.category()) && lhs.value() == rhs.value(); -} // end operator==() - - -bool operator==(const error_code &lhs, const error_condition &rhs) -{ - return lhs.category().equivalent(lhs.value(), rhs) || rhs.category().equivalent(lhs,rhs.value()); -} // end operator==() - - -bool operator==(const error_condition &lhs, const error_code &rhs) -{ - return rhs.category().equivalent(lhs.value(), lhs) || lhs.category().equivalent(rhs, lhs.value()); -} // end operator==() - - -bool operator==(const error_condition &lhs, const error_condition &rhs) -{ - return lhs.category().operator==(rhs.category()) && lhs.value() == rhs.value(); -} // end operator==() - - -bool operator!=(const error_code &lhs, const error_code &rhs) -{ - return !(lhs == rhs); -} // end operator!=() - - -bool operator!=(const error_code &lhs, const error_condition &rhs) -{ - return !(lhs == rhs); -} // end operator!=() - - -bool operator!=(const error_condition &lhs, const error_code &rhs) -{ - return 
!(lhs == rhs); -} // end operator!=() - - -bool operator!=(const error_condition &lhs, const error_condition &rhs) -{ - return !(lhs == rhs); -} // end operator!=() - - -} // end system - -} // end thrust - diff --git a/compat/thrust/system/detail/error_condition.inl b/compat/thrust/system/detail/error_condition.inl deleted file mode 100644 index 00fbaf091d..0000000000 --- a/compat/thrust/system/detail/error_condition.inl +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ - -namespace system -{ - -error_condition - ::error_condition(void) - :m_val(0),m_cat(&generic_category()) -{ - ; -} // end error_condition::error_condition() - - -error_condition - ::error_condition(int val, const error_category &cat) - :m_val(val),m_cat(&cat) -{ - ; -} // end error_condition::error_condition() - - -template - error_condition - ::error_condition(ErrorConditionEnum e -// XXX WAR msvc's problem with enable_if -#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - , typename thrust::detail::enable_if::value>::type * -#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - ) -{ - *this = make_error_condition(e); -} // end error_condition::error_condition() - - -void error_condition - ::assign(int val, const error_category &cat) -{ - m_val = val; - m_cat = &cat; -} // end error_category::assign() - - -template -// XXX WAR msvc's problem with enable_if -#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - typename thrust::detail::enable_if::value, error_condition>::type & -#else - error_condition & -#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - error_condition - ::operator=(ErrorConditionEnum e) -{ - *this = make_error_condition(e); - return *this; -} // end error_condition::operator=() - - -void error_condition - ::clear(void) -{ - m_val = 0; - m_cat = &generic_category(); -} // end error_condition::clear() - - -int error_condition - ::value(void) const -{ - return m_val; -} // end error_condition::value() - - -const error_category &error_condition - ::category(void) const -{ - return *m_cat; -} // end error_condition::category() - - -std::string error_condition - ::message(void) const -{ - return category().message(value()); -} // end error_condition::message() - - -error_condition - ::operator bool (void) const -{ - return value() != 0; -} // end error_condition::operator bool () - - -error_condition make_error_condition(errc::errc_t e) -{ - return error_condition(static_cast(e), generic_category()); -} // end make_error_condition() - - -bool operator<(const error_condition &lhs, - const error_condition &rhs) -{ - return lhs.category().operator<(rhs.category()) || (lhs.category().operator==(rhs.category()) && (lhs.value() < rhs.value())); -} // end operator<() - - -} // end system - -} // end thrust - diff --git a/compat/thrust/system/detail/generic/adjacent_difference.h b/compat/thrust/system/detail/generic/adjacent_difference.h 
deleted file mode 100644 index bb340df490..0000000000 --- a/compat/thrust/system/detail/generic/adjacent_difference.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file adjacent_difference.h - * \brief Generic implementation of adjacent_difference. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template -OutputIterator adjacent_difference(thrust::execution_policy &exec, - InputIterator first, InputIterator last, - OutputIterator result); - -template -OutputIterator adjacent_difference(thrust::execution_policy &exec, - InputIterator first, InputIterator last, - OutputIterator result, - BinaryFunction binary_op); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/adjacent_difference.inl b/compat/thrust/system/detail/generic/adjacent_difference.inl deleted file mode 100644 index 619b29f860..0000000000 --- a/compat/thrust/system/detail/generic/adjacent_difference.inl +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
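[Editor's sketch] The adjacent_difference declarations above (and the generic implementation that follows) compute result[0] = first[0] and result[i] = op(first[i], first[i-1]), with minus as the default op. A small sequential illustration of the same contract, using the standard algorithm as a stand-in for the Thrust entry point:

#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    std::vector<int> in{1, 4, 9, 16, 25};
    std::vector<int> out(in.size());

    // Same contract as the deleted generic adjacent_difference:
    // out[0] = in[0], out[i] = in[i] - in[i-1] (or a user-supplied binary op).
    std::adjacent_difference(in.begin(), in.end(), out.begin());

    for (int v : out) std::cout << v << ' ';   // prints: 1 3 5 7 9
    std::cout << '\n';
    return 0;
}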
- */ - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template -OutputIterator adjacent_difference(thrust::execution_policy &exec, - InputIterator first, InputIterator last, - OutputIterator result) -{ - typedef typename thrust::iterator_traits::value_type InputType; - thrust::minus binary_op; - - return thrust::adjacent_difference(exec, first, last, result, binary_op); -} // end adjacent_difference() - -template -OutputIterator adjacent_difference(thrust::execution_policy &exec, - InputIterator first, InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - typedef typename thrust::iterator_traits::value_type InputType; - - if(first == last) - { - // empty range, nothing to do - return result; - } - else - { - // an in-place operation is requested, copy the input and call the entry point - // XXX a special-purpose kernel would be faster here since - // only block boundaries need to be copied - thrust::detail::temporary_array input_copy(exec, first, last); - - *result = *first; - thrust::transform(exec, input_copy.begin() + 1, input_copy.end(), input_copy.begin(), result + 1, binary_op); - } - - return result + (last - first); -} - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/advance.h b/compat/thrust/system/detail/generic/advance.h deleted file mode 100644 index 249aac7e54..0000000000 --- a/compat/thrust/system/detail/generic/advance.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template -void advance(InputIterator& i, Distance n); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/advance.inl b/compat/thrust/system/detail/generic/advance.inl deleted file mode 100644 index b95737ad9e..0000000000 --- a/compat/thrust/system/detail/generic/advance.inl +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
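[Editor's sketch] The generic advance implementation that follows dispatches on the iterator's traversal tag: a step-by-step loop for merely incrementable iterators, a single += for random-access ones. A minimal sketch of the same tag-dispatch technique built on the standard iterator categories; the namespace and function names here are illustrative, not Thrust's.

#include <forward_list>
#include <iostream>
#include <iterator>
#include <vector>

namespace sketch {

template <typename It, typename Distance>
void advance_impl(It& i, Distance n, std::forward_iterator_tag)
{
    while (n--) ++i;                     // step one element at a time
}

template <typename It, typename Distance>
void advance_impl(It& i, Distance n, std::random_access_iterator_tag)
{
    i += n;                              // constant-time jump
}

template <typename It, typename Distance>
void advance(It& i, Distance n)
{
    // Pick the overload from the iterator's category, as the deleted
    // generic::advance does with thrust::iterator_traversal.
    advance_impl(i, n, typename std::iterator_traits<It>::iterator_category());
}

} // namespace sketch

int main()
{
    std::vector<int> v{0, 1, 2, 3, 4};
    auto vi = v.begin();
    sketch::advance(vi, 3);              // random-access path
    std::cout << *vi << '\n';            // 3

    std::forward_list<int> fl{0, 1, 2, 3, 4};
    auto fi = fl.begin();
    sketch::advance(fi, 2);              // looped path
    std::cout << *fi << '\n';            // 2
    return 0;
}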
- */ - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - -template -void advance(InputIterator& i, Distance n, thrust::incrementable_traversal_tag) -{ - while(n) - { - ++i; - --n; - } // end while -} // end advance() - -template -void advance(InputIterator& i, Distance n, thrust::random_access_traversal_tag) -{ - i += n; -} // end advance() - -} // end detail - -template -void advance(InputIterator& i, Distance n) -{ - // dispatch on iterator traversal - thrust::system::detail::generic::detail::advance(i, n, - typename thrust::iterator_traversal::type()); -} // end advance() - -} // end namespace detail -} // end namespace generic -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/binary_search.h b/compat/thrust/system/detail/generic/binary_search.h deleted file mode 100644 index 7fd6c506ee..0000000000 --- a/compat/thrust/system/detail/generic/binary_search.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file binary_search.h - * \brief Generic implementations of binary search functions. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template -ForwardIterator lower_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value); - -template -ForwardIterator lower_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value, - StrictWeakOrdering comp); - - -template -ForwardIterator upper_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value); - -template -ForwardIterator upper_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value, - StrictWeakOrdering comp); - - -template -bool binary_search(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value); - -template -bool binary_search(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value, - StrictWeakOrdering comp); - - -template -OutputIterator lower_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output); - -template -OutputIterator lower_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output, - StrictWeakOrdering comp); - - -template -OutputIterator upper_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output); - -template -OutputIterator upper_bound(thrust::execution_policy &exec, - ForwardIterator begin, - 
ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output, - StrictWeakOrdering comp); - - -template -OutputIterator binary_search(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output); - -template -OutputIterator binary_search(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output, - StrictWeakOrdering comp); - - -template -thrust::pair -equal_range(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable &value); - -template -thrust::pair -equal_range(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable &value, - StrictWeakOrdering comp); - - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/binary_search.inl b/compat/thrust/system/detail/generic/binary_search.inl deleted file mode 100644 index 151ac0ea35..0000000000 --- a/compat/thrust/system/detail/generic/binary_search.inl +++ /dev/null @@ -1,342 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! 
\file binary_search.inl - * \brief Inline file for binary_search.h - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -namespace thrust -{ -namespace detail -{ - -// XXX WAR circular #inclusion with this forward declaration -template class temporary_array; - -} // end detail -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - - -// short names to avoid nvcc bug -struct lbf -{ - template - __host__ __device__ - typename thrust::iterator_traits::difference_type - operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp) - { - return thrust::system::detail::generic::scalar::lower_bound(begin, end, value, comp) - begin; - } -}; - -struct ubf -{ - template - __host__ __device__ - typename thrust::iterator_traits::difference_type - operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp){ - return thrust::system::detail::generic::scalar::upper_bound(begin, end, value, comp) - begin; - } -}; - -struct bsf -{ - template - __host__ __device__ - bool operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp){ - RandomAccessIterator iter = thrust::system::detail::generic::scalar::lower_bound(begin, end, value, comp); - - thrust::detail::host_device_function wrapped_comp(comp); - - return iter != end && !wrapped_comp(value, *iter); - } -}; - - -template -struct binary_search_functor -{ - ForwardIterator begin; - ForwardIterator end; - StrictWeakOrdering comp; - BinarySearchFunction func; - - binary_search_functor(ForwardIterator begin, ForwardIterator end, StrictWeakOrdering comp, BinarySearchFunction func) - : begin(begin), end(end), comp(comp), func(func) {} - - template - __host__ __device__ - void operator()(Tuple t) - { - thrust::get<1>(t) = func(begin, end, thrust::get<0>(t), comp); - } -}; // binary_search_functor - - -// Vector Implementation -template -OutputIterator binary_search(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output, - StrictWeakOrdering comp, - BinarySearchFunction func) -{ - thrust::for_each(exec, - thrust::make_zip_iterator(thrust::make_tuple(values_begin, output)), - thrust::make_zip_iterator(thrust::make_tuple(values_end, output + thrust::distance(values_begin, values_end))), - detail::binary_search_functor(begin, end, comp, func)); - - return output + thrust::distance(values_begin, values_end); -} - - - -// Scalar Implementation -template -OutputType binary_search(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value, - StrictWeakOrdering comp, - BinarySearchFunction func) -{ - // use the vectorized path to implement the scalar version - - // allocate device buffers for value and output - thrust::detail::temporary_array d_value(exec,1); - thrust::detail::temporary_array d_output(exec,1); - - // copy value to device - d_value[0] = value; - - // perform the query - thrust::system::detail::generic::detail::binary_search(exec, begin, end, d_value.begin(), d_value.end(), d_output.begin(), comp, func); - - // copy result to host and return - return d_output[0]; -} - -} // end namespace detail - - -////////////////////// -// Scalar Functions // -////////////////////// - -template -ForwardIterator lower_bound(thrust::execution_policy &exec, - 
ForwardIterator begin, - ForwardIterator end, - const T& value) -{ - return thrust::lower_bound(exec, begin, end, value, thrust::less()); -} - -template -ForwardIterator lower_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_traits::difference_type difference_type; - - return begin + detail::binary_search(exec, begin, end, value, comp, detail::lbf()); -} - - -template -ForwardIterator upper_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value) -{ - return thrust::upper_bound(exec, begin, end, value, thrust::less()); -} - -template -ForwardIterator upper_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_traits::difference_type difference_type; - - return begin + detail::binary_search(exec, begin, end, value, comp, detail::ubf()); -} - - -template -bool binary_search(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value) -{ - return thrust::binary_search(exec, begin, end, value, thrust::less()); -} - -template -bool binary_search(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value, - StrictWeakOrdering comp) -{ - return detail::binary_search(exec, begin, end, value, comp, detail::bsf()); -} - - -////////////////////// -// Vector Functions // -////////////////////// - -template -OutputIterator lower_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output) -{ - typedef typename thrust::iterator_value::type ValueType; - - return thrust::lower_bound(exec, begin, end, values_begin, values_end, output, thrust::less()); -} - -template -OutputIterator lower_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output, - StrictWeakOrdering comp) -{ - return detail::binary_search(exec, begin, end, values_begin, values_end, output, comp, detail::lbf()); -} - - -template -OutputIterator upper_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output) -{ - typedef typename thrust::iterator_value::type ValueType; - - return thrust::upper_bound(exec, begin, end, values_begin, values_end, output, thrust::less()); -} - -template -OutputIterator upper_bound(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output, - StrictWeakOrdering comp) -{ - return detail::binary_search(exec, begin, end, values_begin, values_end, output, comp, detail::ubf()); -} - - -template -OutputIterator binary_search(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator output) -{ - typedef typename thrust::iterator_value::type ValueType; - - return thrust::binary_search(exec, begin, end, values_begin, values_end, output, thrust::less()); -} - -template -OutputIterator binary_search(thrust::execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - InputIterator values_begin, - InputIterator values_end, - OutputIterator 
output, - StrictWeakOrdering comp) -{ - return detail::binary_search(exec, begin, end, values_begin, values_end, output, comp, detail::bsf()); -} - - -template -thrust::pair -equal_range(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const LessThanComparable &value) -{ - return thrust::equal_range(exec, first, last, value, thrust::less()); -} - - -template -thrust::pair -equal_range(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &value, - StrictWeakOrdering comp) -{ - ForwardIterator lb = thrust::lower_bound(exec, first, last, value, comp); - ForwardIterator ub = thrust::upper_bound(exec, first, last, value, comp); - return thrust::make_pair(lb, ub); -} - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/copy.h b/compat/thrust/system/detail/generic/copy.h deleted file mode 100644 index 8df98fe67f..0000000000 --- a/compat/thrust/system/detail/generic/copy.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -template - OutputIterator copy_n(thrust::execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result); - - -} // end generic -} // end detail -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/detail/generic/copy.inl b/compat/thrust/system/detail/generic/copy.inl deleted file mode 100644 index e081015f8e..0000000000 --- a/compat/thrust/system/detail/generic/copy.inl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
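[Editor's sketch] The deleted binary_search.inl answers many queries at once by running a scalar lower_bound/upper_bound per query element (a small functor applied with for_each over a zip of the query and output ranges). A sequential sketch of that vectorized-search idea using the standard scalar search; function and variable names are illustrative.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// For each query, store the index returned by a scalar lower_bound over the
// sorted data - the same per-element job the deleted lbf functor performs.
std::vector<std::size_t> vectorized_lower_bound(const std::vector<int>& sorted,
                                                const std::vector<int>& queries)
{
    std::vector<std::size_t> out(queries.size());
    for (std::size_t i = 0; i < queries.size(); ++i) {
        out[i] = static_cast<std::size_t>(
            std::lower_bound(sorted.begin(), sorted.end(), queries[i]) - sorted.begin());
    }
    return out;
}

int main()
{
    std::vector<int> data{1, 3, 3, 7, 9};
    std::vector<int> queries{0, 3, 8, 10};

    for (std::size_t idx : vectorized_lower_bound(data, queries))
        std::cout << idx << ' ';         // prints: 0 1 4 5
    std::cout << '\n';
    return 0;
}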
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - typedef typename thrust::iterator_value::type T; - return thrust::transform(exec, first, last, result, thrust::identity()); -} // end copy() - - -template - OutputIterator copy_n(thrust::execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result) -{ - typedef typename thrust::iterator_value::type value_type; - typedef thrust::identity xfrm_type; - - // XXX why do we need to do this? figure out why, and then see if we can do without - typedef typename thrust::detail::unary_transform_functor::type functor_type; - - typedef thrust::tuple iterator_tuple; - typedef thrust::zip_iterator zip_iter; - - zip_iter zipped = thrust::make_zip_iterator(thrust::make_tuple(first,result)); - - return thrust::get<1>(thrust::for_each_n(exec, zipped, n, functor_type(xfrm_type())).get_iterator_tuple()); -} // end copy_n() - - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/copy_if.h b/compat/thrust/system/detail/generic/copy_if.h deleted file mode 100644 index 183f012a03..0000000000 --- a/compat/thrust/system/detail/generic/copy_if.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator copy_if(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred); - - -template - OutputIterator copy_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/copy_if.inl b/compat/thrust/system/detail/generic/copy_if.inl deleted file mode 100644 index 145561c9bc..0000000000 --- a/compat/thrust/system/detail/generic/copy_if.inl +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - -template -OutputIterator copy_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING(IndexType n = thrust::distance(first, last)); - - // compute {0,1} predicates - thrust::detail::temporary_array predicates(exec, n); - thrust::transform(exec, - stencil, - stencil + n, - predicates.begin(), - thrust::detail::predicate_to_integral(pred)); - - // scan {0,1} predicates - thrust::detail::temporary_array scatter_indices(exec, n); - thrust::exclusive_scan(exec, - predicates.begin(), - predicates.end(), - scatter_indices.begin(), - static_cast(0), - thrust::plus()); - - // scatter the true elements - thrust::scatter_if(exec, - first, - last, - scatter_indices.begin(), - predicates.begin(), - result, - thrust::identity()); - - // find the end of the new sequence - IndexType output_size = scatter_indices[n - 1] + predicates[n - 1]; - - return result + output_size; -} - -} // end namespace detail - - -template - OutputIterator copy_if(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred) -{ - // XXX it's potentially expensive to send [first,last) twice - // we should probably specialize this case for POD - // since we can safely keep the input in a temporary instead - // of doing two loads - return thrust::copy_if(exec, first, last, first, result, pred); -} // end copy_if() - - -template - OutputIterator copy_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - typedef typename thrust::iterator_traits::difference_type difference_type; - - // empty sequence - if(first == last) - return result; - - difference_type n = thrust::distance(first, last); - - // create an unsigned version of n (we know n is positive from the comparison above) - // to avoid a warning in the compare below - typename thrust::detail::make_unsigned::type unsigned_n(n); - - // use 32-bit indices when possible (almost always) - if(sizeof(difference_type) > sizeof(unsigned int) && unsigned_n > (std::numeric_limits::max)()) - { - result = detail::copy_if(exec, first, last, stencil, result, pred); - } // end if - else - { - result = detail::copy_if(exec, first, last, stencil, result, pred); - } // end else - - return result; -} // end copy_if() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/count.h b/compat/thrust/system/detail/generic/count.h deleted file mode 100644 index bc4899e6d0..0000000000 --- a/compat/thrust/system/detail/generic/count.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template -typename thrust::iterator_traits::difference_type -count(thrust::execution_policy &exec, InputIterator first, InputIterator last, const EqualityComparable& value); - -template -typename thrust::iterator_traits::difference_type -count_if(thrust::execution_policy &exec, InputIterator first, InputIterator last, Predicate pred); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/count.inl b/compat/thrust/system/detail/generic/count.inl deleted file mode 100644 index e3ab8714b7..0000000000 --- a/compat/thrust/system/detail/generic/count.inl +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template -struct count_if_transform -{ - __host__ __device__ - count_if_transform(Predicate _pred) : pred(_pred){} - - __host__ __device__ - CountType operator()(const InputType& val) - { - if(pred(val)) - return 1; - else - return 0; - } // end operator() - - Predicate pred; -}; // end count_if_transform - -template -typename thrust::iterator_traits::difference_type -count(thrust::execution_policy &exec, InputIterator first, InputIterator last, const EqualityComparable& value) -{ - typedef typename thrust::iterator_traits::value_type InputType; - - // XXX use placeholder expression here - return thrust::count_if(exec, first, last, thrust::detail::equal_to_value(value)); -} // end count() - -template -typename thrust::iterator_traits::difference_type -count_if(thrust::execution_policy &exec, InputIterator first, InputIterator last, Predicate pred) -{ - typedef typename thrust::iterator_traits::value_type InputType; - typedef typename thrust::iterator_traits::difference_type CountType; - - thrust::system::detail::generic::count_if_transform unary_op(pred); - thrust::plus binary_op; - return thrust::transform_reduce(exec, first, last, unary_op, CountType(0), binary_op); -} // end count_if() - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/distance.h b/compat/thrust/system/detail/generic/distance.h deleted file mode 100644 index 80f051ca53..0000000000 --- a/compat/thrust/system/detail/generic/distance.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - inline typename thrust::iterator_traits::difference_type - distance(InputIterator first, InputIterator last); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/distance.inl b/compat/thrust/system/detail/generic/distance.inl deleted file mode 100644 index a1fdf1458b..0000000000 --- a/compat/thrust/system/detail/generic/distance.inl +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
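// The count.inl removed above expresses count_if as a transform_reduce: each
// element is mapped to 0 or 1 by the predicate and the results are summed.
// Below is a minimal host-side sketch of the same idea with C++17 <numeric>;
// count_if_via_reduce is an illustrative name, not part of any library.
#include <cstddef>
#include <functional>
#include <numeric>   // std::transform_reduce
#include <vector>

template <typename T, typename Predicate>
std::size_t count_if_via_reduce(const std::vector<T>& v, Predicate pred)
{
    return std::transform_reduce(
        v.begin(), v.end(),
        std::size_t{0},                                    // initial value of the sum
        std::plus<>{},                                     // binary reduction op
        [&](const T& x) { return pred(x) ? std::size_t{1} : std::size_t{0}; });  // unary map
}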
- */ - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - -template - inline typename thrust::iterator_traits::difference_type - distance(InputIterator first, InputIterator last, thrust::incrementable_traversal_tag) -{ - typename thrust::iterator_traits::difference_type result(0); - - while(first != last) - { - ++first; - ++result; - } // end while - - return result; -} // end advance() - -template - inline typename thrust::iterator_traits::difference_type - distance(InputIterator first, InputIterator last, thrust::random_access_traversal_tag) -{ - return last - first; -} // end distance() - -} // end detail - -template - inline typename thrust::iterator_traits::difference_type - distance(InputIterator first, InputIterator last) -{ - // dispatch on iterator traversal - return thrust::system::detail::generic::detail::distance(first, last, - typename thrust::iterator_traversal::type()); -} // end advance() - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/equal.h b/compat/thrust/system/detail/generic/equal.h deleted file mode 100644 index da7d105825..0000000000 --- a/compat/thrust/system/detail/generic/equal.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template -bool equal(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2); - -template -bool equal(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/equal.inl b/compat/thrust/system/detail/generic/equal.inl deleted file mode 100644 index 12b8005a2a..0000000000 --- a/compat/thrust/system/detail/generic/equal.inl +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
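// The distance.inl removed above dispatches on the iterator's traversal tag:
// random-access iterators subtract, everything else counts increments. The
// sketch below shows the same tag-dispatch pattern with the standard iterator
// categories; my_distance is an illustrative name, not a library function.
#include <iterator>

namespace detail_sketch
{
template <typename It>
typename std::iterator_traits<It>::difference_type
my_distance(It first, It last, std::input_iterator_tag)
{
    typename std::iterator_traits<It>::difference_type n = 0;
    for (; first != last; ++first) ++n;   // O(n) fallback: count the steps
    return n;
}

template <typename It>
typename std::iterator_traits<It>::difference_type
my_distance(It first, It last, std::random_access_iterator_tag)
{
    return last - first;                  // O(1) for random-access iterators
}
} // namespace detail_sketch

template <typename It>
typename std::iterator_traits<It>::difference_type
my_distance(It first, It last)
{
    // pick the overload from the iterator category at compile time
    return detail_sketch::my_distance(
        first, last, typename std::iterator_traits<It>::iterator_category{});
}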
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template -bool equal(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2) -{ - typedef typename thrust::iterator_traits::value_type InputType1; - - return thrust::equal(exec, first1, last1, first2, thrust::detail::equal_to()); -} - -template -bool equal(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate binary_pred) -{ - return thrust::mismatch(exec, first1, last1, first2, binary_pred).first == last1; -} - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/extrema.h b/compat/thrust/system/detail/generic/extrema.h deleted file mode 100644 index abb4ddc210..0000000000 --- a/compat/thrust/system/detail/generic/extrema.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file extrema.h - * \brief Generic device implementations of extrema functions. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template -ForwardIterator max_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last); - -template -ForwardIterator max_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp); - -template -ForwardIterator min_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last); - -template -ForwardIterator min_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp); - -template -thrust::pair minmax_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last); - -template -thrust::pair minmax_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/extrema.inl b/compat/thrust/system/detail/generic/extrema.inl deleted file mode 100644 index b5f92c3935..0000000000 --- a/compat/thrust/system/detail/generic/extrema.inl +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
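// The equal.inl removed above reduces equal() to a single mismatch() call:
// two ranges are equal exactly when the first mismatch lies at the end of the
// first range. A host-side sketch of that reduction; equal_via_mismatch is an
// illustrative name only.
#include <algorithm>   // std::mismatch

template <typename It1, typename It2, typename BinaryPred>
bool equal_via_mismatch(It1 first1, It1 last1, It2 first2, BinaryPred pred)
{
    // if no element pair fails the predicate, the mismatch point is last1
    return std::mismatch(first1, last1, first2, pred).first == last1;
}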
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file distance.h - * \brief Device implementations for distance. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - -////////////// -// Functors // -////////////// - -// return the smaller/larger element making sure to prefer the -// first occurance of the minimum/maximum element -template -struct min_element_reduction -{ - BinaryPredicate comp; - - __host__ __device__ - min_element_reduction(BinaryPredicate comp) : comp(comp){} - - __host__ __device__ - thrust::tuple - operator()(const thrust::tuple& lhs, - const thrust::tuple& rhs ) - { - if(comp(thrust::get<0>(lhs), thrust::get<0>(rhs))) - return lhs; - if(comp(thrust::get<0>(rhs), thrust::get<0>(lhs))) - return rhs; - - // values are equivalent, prefer value with smaller index - if(thrust::get<1>(lhs) < thrust::get<1>(rhs)) - return lhs; - else - return rhs; - } // end operator()() - -}; // end min_element_reduction - - -template -struct max_element_reduction -{ - BinaryPredicate comp; - - __host__ __device__ - max_element_reduction(BinaryPredicate comp) : comp(comp){} - - __host__ __device__ - thrust::tuple - operator()(const thrust::tuple& lhs, - const thrust::tuple& rhs ) - { - if(comp(thrust::get<0>(lhs), thrust::get<0>(rhs))) - return rhs; - if(comp(thrust::get<0>(rhs), thrust::get<0>(lhs))) - return lhs; - - // values are equivalent, prefer value with smaller index - if(thrust::get<1>(lhs) < thrust::get<1>(rhs)) - return lhs; - else - return rhs; - } // end operator()() - -}; // end max_element_reduction - -// return the smaller & larger element making sure to prefer the -// first occurance of the minimum/maximum element -template -struct minmax_element_reduction -{ - BinaryPredicate comp; - - minmax_element_reduction(BinaryPredicate comp) : comp(comp){} - - __host__ __device__ - thrust::tuple< thrust::tuple, thrust::tuple > - operator()(const thrust::tuple< thrust::tuple, thrust::tuple >& lhs, - const thrust::tuple< thrust::tuple, thrust::tuple >& rhs ) - { - - return thrust::make_tuple(min_element_reduction(comp)(thrust::get<0>(lhs), thrust::get<0>(rhs)), - max_element_reduction(comp)(thrust::get<1>(lhs), thrust::get<1>(rhs))); - } // end operator()() -}; // end minmax_element_reduction - -template -struct duplicate_tuple -{ - __host__ __device__ - thrust::tuple< thrust::tuple, thrust::tuple > - operator()(const thrust::tuple& t) - { - return thrust::make_tuple(t, t); - } -}; // end duplicate_tuple - -} // end namespace detail - -template -ForwardIterator min_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last) -{ - typedef typename thrust::iterator_value::type value_type; - - return thrust::min_element(exec, first, last, thrust::less()); -} // end min_element() - -template -ForwardIterator min_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - if (first == last) - return last; - - typedef typename thrust::iterator_traits::value_type InputType; - typedef typename thrust::iterator_traits::difference_type IndexType; - - thrust::tuple result = - thrust::reduce - (exec, - thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))), - thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))) + 
(last - first), - thrust::tuple(*first, 0), - detail::min_element_reduction(comp)); - - return first + thrust::get<1>(result); -} // end min_element() - -template -ForwardIterator max_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last) -{ - typedef typename thrust::iterator_value::type value_type; - - return thrust::max_element(exec, first, last, thrust::less()); -} // end max_element() - -template -ForwardIterator max_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - if (first == last) - return last; - - typedef typename thrust::iterator_traits::value_type InputType; - typedef typename thrust::iterator_traits::difference_type IndexType; - - thrust::tuple result = - thrust::reduce - (exec, - thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))), - thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))) + (last - first), - thrust::tuple(*first, 0), - detail::max_element_reduction(comp)); - - return first + thrust::get<1>(result); -} // end max_element() - -template -thrust::pair minmax_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last) -{ - typedef typename thrust::iterator_value::type value_type; - - return thrust::minmax_element(exec, first, last, thrust::less()); -} // end minmax_element() - -template -thrust::pair minmax_element(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - if (first == last) - return thrust::make_pair(last, last); - - typedef typename thrust::iterator_traits::value_type InputType; - typedef typename thrust::iterator_traits::difference_type IndexType; - - thrust::tuple< thrust::tuple, thrust::tuple > result = - thrust::transform_reduce - (exec, - thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))), - thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator(0))) + (last - first), - detail::duplicate_tuple(), - detail::duplicate_tuple()(thrust::tuple(*first, 0)), - detail::minmax_element_reduction(comp)); - - return thrust::make_pair(first + thrust::get<1>(thrust::get<0>(result)), first + thrust::get<1>(thrust::get<1>(result))); -} // end minmax_element() - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/fill.h b/compat/thrust/system/detail/generic/fill.h deleted file mode 100644 index 9745b1cf57..0000000000 --- a/compat/thrust/system/detail/generic/fill.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file fill.h - * \brief Device implementation of fill. 
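// The extrema.inl removed above finds min_element by reducing (value, index)
// pairs with an operator that prefers the smaller index when values compare
// equivalent, so the first occurrence wins even when a parallel backend
// associates the reduction in an arbitrary order. Below is a sequential
// host-side sketch of that pair reduction; min_element_by_reduce is an
// illustrative name, not the deleted Thrust code.
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

template <typename T, typename Compare = std::less<T>>
std::size_t min_element_by_reduce(const std::vector<T>& v, Compare comp = Compare{})
{
    // (value, index) pair reduction; assumes v is non-empty
    std::pair<T, std::size_t> best{v[0], 0};
    for (std::size_t i = 1; i < v.size(); ++i)
    {
        std::pair<T, std::size_t> cur{v[i], i};
        if (comp(cur.first, best.first))
            best = cur;   // strictly smaller value wins
        else if (!comp(best.first, cur.first) && cur.second < best.second)
            best = cur;   // equivalent values: keep the smaller index
        // (in this left-to-right loop the tie branch never fires, but it is
        // what makes the operator safe under any association order)
    }
    return best.second;   // index of the first minimum
}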
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator fill_n(thrust::execution_policy &exec, - OutputIterator first, - Size n, - const T &value) -{ - // XXX consider using the placeholder expression _1 = value - return thrust::generate_n(exec, first, n, thrust::detail::fill_functor(value)); -} - -template - void fill(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &value) -{ - // XXX consider using the placeholder expression _1 = value - thrust::generate(exec, first, last, thrust::detail::fill_functor(value)); -} - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/find.h b/compat/thrust/system/detail/generic/find.h deleted file mode 100644 index 08888c5a7c..0000000000 --- a/compat/thrust/system/detail/generic/find.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template -InputIterator find(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - const T& value); - -template -InputIterator find_if(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - Predicate pred); - -template -InputIterator find_if_not(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - Predicate pred); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/find.inl b/compat/thrust/system/detail/generic/find.inl deleted file mode 100644 index a3414e1c28..0000000000 --- a/compat/thrust/system/detail/generic/find.inl +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - - -// Contributed by Erich Elsen - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template -InputIterator find(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - const T& value) -{ - // XXX consider a placeholder expression here - return thrust::find_if(exec, first, last, thrust::detail::equal_to_value(value)); -} // end find() - - -template -struct find_if_functor -{ - __host__ __device__ - TupleType operator()(const TupleType& lhs, const TupleType& rhs) const - { - // select the smallest index among true results - if (thrust::get<0>(lhs) && thrust::get<0>(rhs)) - return TupleType(true, (thrust::min)(thrust::get<1>(lhs), thrust::get<1>(rhs))); - else if (thrust::get<0>(lhs)) - return lhs; - else - return rhs; - } -}; - - -template -InputIterator find_if(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - Predicate pred) -{ - typedef typename thrust::iterator_traits::difference_type difference_type; - typedef typename thrust::tuple result_type; - - // empty sequence - if (first == last) - return last; - - const difference_type n = thrust::distance(first, last); - - // this implementation breaks up the sequence into separate intervals - // in an attempt to early-out as soon as a value is found - - // TODO incorporate sizeof(InputType) into interval_threshold and round to multiple of 32 - const difference_type interval_threshold = 1 << 20; - const difference_type interval_size = (std::min)(interval_threshold, n); - - // force transform_iterator output to bool - typedef thrust::transform_iterator XfrmIterator; - typedef thrust::tuple > IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, pred), - thrust::counting_iterator(0)); - - ZipIterator begin = thrust::make_zip_iterator(iter_tuple); - ZipIterator end = begin + n; - - for(ZipIterator interval_begin = begin; interval_begin < end; interval_begin += interval_size) - { - ZipIterator interval_end = interval_begin + interval_size; - if(end < interval_end) - { - interval_end = end; - } // end if - - result_type result = thrust::reduce(exec, - interval_begin, interval_end, - result_type(false,interval_end - begin), - find_if_functor()); - - // see if we found something - if (thrust::get<0>(result)) - { - return first + thrust::get<1>(result); - } - } - - //nothing was found if we reach here... - return first + n; -} - - -template -InputIterator find_if_not(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - Predicate pred) -{ - return thrust::find_if(exec, first, last, thrust::detail::not1(pred)); -} // end find() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/for_each.h b/compat/thrust/system/detail/generic/for_each.h deleted file mode 100644 index 61abe20b6f..0000000000 --- a/compat/thrust/system/detail/generic/for_each.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
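// The find.inl removed above searches in fixed-size intervals so that the
// reduction over each interval can stop scanning the rest of the range as
// soon as one interval reports a hit. The sketch below keeps that interval
// structure but uses a plain sequential scan inside each interval;
// chunked_find_if and kIntervalSize are illustrative names.
#include <algorithm>   // std::find_if, std::min
#include <cstddef>
#include <vector>

template <typename T, typename Predicate>
std::size_t chunked_find_if(const std::vector<T>& v, Predicate pred)
{
    const std::size_t n = v.size();
    const std::size_t kIntervalSize = std::size_t{1} << 20;   // same threshold as the deleted code

    for (std::size_t begin = 0; begin < n; begin += kIntervalSize)
    {
        const std::size_t end = std::min(begin + kIntervalSize, n);

        // search just this interval; in the deleted code this step is a
        // reduction over (found, index) tuples run by the backend
        auto it = std::find_if(v.begin() + begin, v.begin() + end, pred);
        if (it != v.begin() + end)
            return static_cast<std::size_t>(it - v.begin());   // early out
    }
    return n;   // not found: one past the end, like returning last
}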
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file for_each.h - * \brief Generic implementation of for_each & for_each_n. - * It is an error to call these functions; they have no implementation. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template -InputIterator for_each(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - UnaryFunction f) -{ - // unimplemented - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - return first; -} // end for_each() - - -template -InputIterator for_each_n(thrust::execution_policy &exec, - InputIterator first, - Size n, - UnaryFunction f) -{ - // unimplemented - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - return first; -} // end for_each_n() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/gather.h b/compat/thrust/system/detail/generic/gather.h deleted file mode 100644 index cfb6f85ca5..0000000000 --- a/compat/thrust/system/detail/generic/gather.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator gather(thrust::execution_policy &exec, - InputIterator map_first, - InputIterator map_last, - RandomAccessIterator input_first, - OutputIterator result); - - -template - OutputIterator gather_if(thrust::execution_policy &exec, - InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result); - - -template - OutputIterator gather_if(thrust::execution_policy &exec, - InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result, - Predicate pred); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/gather.inl b/compat/thrust/system/detail/generic/gather.inl deleted file mode 100644 index ab2cdd8116..0000000000 --- a/compat/thrust/system/detail/generic/gather.inl +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - OutputIterator gather(thrust::execution_policy &exec, - InputIterator map_first, - InputIterator map_last, - RandomAccessIterator input_first, - OutputIterator result) -{ - return thrust::transform(exec, - thrust::make_permutation_iterator(input_first, map_first), - thrust::make_permutation_iterator(input_first, map_last), - result, - thrust::identity::type>()); -} // end gather() - - -template - OutputIterator gather_if(thrust::execution_policy &exec, - InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result) -{ - typedef typename thrust::iterator_value::type StencilType; - return thrust::gather_if(exec, - map_first, - map_last, - stencil, - input_first, - result, - thrust::identity()); -} // end gather_if() - - -template - OutputIterator gather_if(thrust::execution_policy &exec, - InputIterator1 map_first, - InputIterator1 map_last, - InputIterator2 stencil, - RandomAccessIterator input_first, - OutputIterator result, - Predicate pred) -{ - typedef typename thrust::iterator_value::type InputType; - return thrust::transform_if(exec, - thrust::make_permutation_iterator(input_first, map_first), - thrust::make_permutation_iterator(input_first, map_last), - stencil, - result, - thrust::identity(), - pred); -} // end gather_if() - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/generate.h b/compat/thrust/system/detail/generic/generate.h deleted file mode 100644 index e7a8e00726..0000000000 --- 
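// The gather.inl removed above implements gather as a transform over a
// permutation view, i.e. result[i] = input[map[i]], and gather_if additionally
// masks each copy with a stencil predicate. A host-side sketch of both;
// gather_sketch and gather_if_sketch are illustrative names.
#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> gather_sketch(const std::vector<std::size_t>& map,
                             const std::vector<T>& input)
{
    std::vector<T> result(map.size());
    for (std::size_t i = 0; i < map.size(); ++i)
        result[i] = input[map[i]];            // permutation read
    return result;
}

template <typename T, typename S, typename Predicate>
void gather_if_sketch(const std::vector<std::size_t>& map,
                      const std::vector<S>& stencil,
                      const std::vector<T>& input,
                      std::vector<T>& result,
                      Predicate pred)
{
    // only positions whose stencil value satisfies the predicate are written
    for (std::size_t i = 0; i < map.size(); ++i)
        if (pred(stencil[i]))
            result[i] = input[map[i]];
}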
a/compat/thrust/system/detail/generic/generate.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - void generate(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Generator gen); - -template - OutputIterator generate_n(thrust::execution_policy &exec, - OutputIterator first, - Size n, - Generator gen); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/generate.inl b/compat/thrust/system/detail/generic/generate.inl deleted file mode 100644 index 4da5763f9c..0000000000 --- a/compat/thrust/system/detail/generic/generate.inl +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - void generate(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Generator gen) -{ - thrust::for_each(exec, first, last, typename thrust::detail::generate_functor::type(gen)); -} // end generate() - -template - OutputIterator generate_n(thrust::execution_policy &exec, - OutputIterator first, - Size n, - Generator gen) -{ - return thrust::for_each_n(exec, first, n, typename thrust::detail::generate_functor::type(gen)); -} // end generate() - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/inner_product.h b/compat/thrust/system/detail/generic/inner_product.h deleted file mode 100644 index 9ac5c69636..0000000000 --- a/compat/thrust/system/detail/generic/inner_product.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - OutputType inner_product(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputType init); - -template -OutputType inner_product(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputType init, - BinaryFunction1 binary_op1, - BinaryFunction2 binary_op2); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/inner_product.inl b/compat/thrust/system/detail/generic/inner_product.inl deleted file mode 100644 index b6a339ea15..0000000000 --- a/compat/thrust/system/detail/generic/inner_product.inl +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template -OutputType inner_product(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputType init) -{ - thrust::plus binary_op1; - thrust::multiplies binary_op2; - return thrust::inner_product(exec, first1, last1, first2, init, binary_op1, binary_op2); -} // end inner_product() - - -template -OutputType inner_product(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputType init, - BinaryFunction1 binary_op1, - BinaryFunction2 binary_op2) -{ - typedef thrust::zip_iterator > ZipIter; - - ZipIter first = thrust::make_zip_iterator(thrust::make_tuple(first1,first2)); - - // only the first iterator in the tuple is relevant for the purposes of last - ZipIter last = thrust::make_zip_iterator(thrust::make_tuple(last1, first2)); - - return thrust::transform_reduce(exec, first, last, thrust::detail::zipped_binary_op(binary_op2), init, binary_op1); -} // end inner_product() - - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/logical.h b/compat/thrust/system/detail/generic/logical.h deleted file mode 100644 index e0d01e30a5..0000000000 --- a/compat/thrust/system/detail/generic/logical.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template -bool all_of(thrust::execution_policy &exec, InputIterator first, InputIterator last, Predicate pred) -{ - return thrust::find_if(exec, first, last, thrust::detail::not1(pred)) == last; -} - -template -bool any_of(thrust::execution_policy &exec, InputIterator first, InputIterator last, Predicate pred) -{ - return thrust::find_if(exec, first, last, pred) != last; -} - -template -bool none_of(thrust::execution_policy &exec, InputIterator first, InputIterator last, Predicate pred) -{ - return !thrust::any_of(exec, first, last, pred); -} - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/memory.h b/compat/thrust/system/detail/generic/memory.h deleted file mode 100644 index c0fe623ac4..0000000000 --- a/compat/thrust/system/detail/generic/memory.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
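// The inner_product.inl removed above zips the two input sequences and runs a
// transform_reduce: binary_op2 combines each element pair and binary_op1 folds
// the results into init. C++17 exposes exactly this shape, shown below; the
// wrapper inner_product_sketch is illustrative, not library code. (The deleted
// logical.h similarly reduces all_of/any_of/none_of to a single find_if.)
#include <functional>
#include <numeric>     // std::transform_reduce
#include <vector>

template <typename T, typename Reduce, typename Combine>
T inner_product_sketch(const std::vector<T>& a, const std::vector<T>& b,
                       T init, Reduce binary_op1, Combine binary_op2)
{
    // element-wise combine with binary_op2, then fold with binary_op1
    return std::transform_reduce(a.begin(), a.end(), b.begin(),
                                 init, binary_op1, binary_op2);
}

// e.g. the default form of the deleted code:
//   inner_product_sketch(a, b, 0.0, std::plus<>{}, std::multiplies<>{});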
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file generic/memory.h - * \brief Generic implementation of memory functions. - * Calling some of these is an error. They have no implementation. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template void malloc(thrust::execution_policy &, Size); - -template -thrust::pointer malloc(thrust::execution_policy &s, std::size_t n); - -template void free(thrust::execution_policy &, Pointer); - -template -__host__ __device__ -void assign_value(tag, Pointer1, Pointer2); - -template -__host__ __device__ -void get_value(thrust::execution_policy &, Pointer); - -template -__host__ __device__ -void iter_swap(tag, Pointer1, Pointer2); - -} // end generic -} // end detail -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/detail/generic/memory.inl b/compat/thrust/system/detail/generic/memory.inl deleted file mode 100644 index f89a763a62..0000000000 --- a/compat/thrust/system/detail/generic/memory.inl +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - void malloc(thrust::execution_policy &, Size) -{ - // unimplemented - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); -} - - -template - thrust::pointer - malloc(thrust::execution_policy &exec, std::size_t n) -{ - thrust::pointer void_ptr = thrust::malloc(exec, sizeof(T) * n); - - return pointer(static_cast(void_ptr.get())); -} // end malloc() - - -template - void free(thrust::execution_policy &, Pointer) -{ - // unimplemented - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); -} - - -template -__host__ __device__ -void assign_value(thrust::execution_policy &, Pointer1, Pointer2) -{ - // unimplemented - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); -} - - -template -__host__ __device__ -void get_value(thrust::execution_policy &, Pointer) -{ - // unimplemented - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); -} - - -template -__host__ __device__ -void iter_swap(tag, Pointer1, Pointer2) -{ - // unimplemented - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); -} - - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/merge.h b/compat/thrust/system/detail/generic/merge.h deleted file mode 100644 index 5f0b99640d..0000000000 --- a/compat/thrust/system/detail/generic/merge.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
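// In the memory.inl removed above, only the typed malloc has a real generic
// implementation: it requests sizeof(T) * n bytes from the untyped malloc and
// casts the resulting void pointer, while the remaining operations are
// "must be overridden by the backend" static-assert stubs. A plain-C++ sketch
// of that typed wrapper; typed_malloc is an illustrative name.
#include <cstddef>
#include <cstdlib>   // std::malloc

template <typename T>
T* typed_malloc(std::size_t n)
{
    // untyped allocation sized for n objects, then a cast to the element type;
    // the deleted code does the same through the backend's void-pointer malloc
    void* raw = std::malloc(sizeof(T) * n);
    return static_cast<T*>(raw);
}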
- */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -// XXX calling this function is an error; there is no implementation -template - OutputIterator merge(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp); - - -template - OutputIterator merge(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -template - thrust::pair - merge_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - Compare comp); - - -template - thrust::pair - merge_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/merge.inl b/compat/thrust/system/detail/generic/merge.inl deleted file mode 100644 index b913611168..0000000000 --- a/compat/thrust/system/detail/generic/merge.inl +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator merge(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - // unimplemented - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - return result; -} // end merge() - - -template - OutputIterator merge(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::merge(exec,first1,last1,first2,last2,result,thrust::less()); -} // end merge() - - -template - thrust::pair - merge_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - Compare comp) -{ - typedef thrust::tuple iterator_tuple1; - typedef thrust::tuple iterator_tuple2; - typedef thrust::tuple iterator_tuple3; - - typedef thrust::zip_iterator zip_iterator1; - typedef thrust::zip_iterator zip_iterator2; - typedef thrust::zip_iterator zip_iterator3; - - zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1)); - zip_iterator1 zipped_last1 = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1)); - - zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2)); - zip_iterator2 zipped_last2 = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2)); - - zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)); - - thrust::detail::compare_first comp_first(comp); - - iterator_tuple3 result = thrust::merge(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple(); - - return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result)); -} // end merge_by_key() - - -template - thrust::pair - merge_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::merge_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less()); -} // end merge_by_key() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/mismatch.h b/compat/thrust/system/detail/generic/mismatch.h deleted file mode 100644 index dc581ffbee..0000000000 --- a/compat/thrust/system/detail/generic/mismatch.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
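// The merge.inl removed above builds merge_by_key out of the plain merge: keys
// and values are zipped into pairs and merged with a comparator that looks
// only at the key component. A host-side sketch using std::pair and
// std::merge; merge_by_key_sketch is an illustrative name.
#include <algorithm>   // std::merge
#include <utility>
#include <vector>

template <typename K, typename V, typename Compare>
std::vector<std::pair<K, V>>
merge_by_key_sketch(const std::vector<std::pair<K, V>>& a,
                    const std::vector<std::pair<K, V>>& b,
                    Compare comp)
{
    std::vector<std::pair<K, V>> result(a.size() + b.size());
    // "compare_first": order by key only, the values ride along
    std::merge(a.begin(), a.end(), b.begin(), b.end(), result.begin(),
               [&](const std::pair<K, V>& x, const std::pair<K, V>& y)
               { return comp(x.first, y.first); });
    return result;
}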
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - thrust::pair - mismatch(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2); - - -template - thrust::pair - mismatch(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - BinaryPredicate pred); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/mismatch.inl b/compat/thrust/system/detail/generic/mismatch.inl deleted file mode 100644 index 923c27f71b..0000000000 --- a/compat/thrust/system/detail/generic/mismatch.inl +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - thrust::pair - mismatch(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2) -{ - typedef typename thrust::iterator_value::type InputType1; - - // XXX use a placeholder expression here - return thrust::mismatch(exec, first1, last1, first2, thrust::detail::equal_to()); -} // end mismatch() - -template - thrust::pair - mismatch(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - BinaryPredicate pred) -{ - // Contributed by Erich Elsen - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first1,first2)); - ZipIterator zipped_last = thrust::make_zip_iterator(thrust::make_tuple(last1, first2)); - - ZipIterator result = thrust::find_if_not(exec, zipped_first, zipped_last, thrust::detail::tuple_binary_predicate(pred)); - - return thrust::make_pair(thrust::get<0>(result.get_iterator_tuple()), - thrust::get<1>(result.get_iterator_tuple())); -} // end mismatch() - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/partition.h b/compat/thrust/system/detail/generic/partition.h deleted file mode 100644 index 63daa1d1c1..0000000000 --- a/compat/thrust/system/detail/generic/partition.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file partition.h - * \brief Generic implementations of partition functions. 
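// The mismatch.inl removed above walks the two ranges as one zipped range and
// returns the position of the first element pair that fails the predicate,
// i.e. a find_if_not over pairs. The sketch below does the same with a
// lock-step loop; mismatch_sketch is an illustrative name.
#include <utility>

template <typename It1, typename It2, typename BinaryPred>
std::pair<It1, It2> mismatch_sketch(It1 first1, It1 last1, It2 first2, BinaryPred pred)
{
    // advance both iterators in lock step until the predicate fails
    while (first1 != last1 && pred(*first1, *first2))
    {
        ++first1;
        ++first2;
    }
    return {first1, first2};   // first position where the ranges disagree
}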
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - ForwardIterator stable_partition(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - -template - ForwardIterator stable_partition(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - -template - thrust::pair - stable_partition_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - -template - thrust::pair - stable_partition_copy(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - -template - ForwardIterator partition(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - -template - ForwardIterator partition(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - -template - thrust::pair - partition_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - -template - thrust::pair - partition_copy(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - -template - ForwardIterator partition_point(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - -template - bool is_partitioned(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - Predicate pred); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/partition.inl b/compat/thrust/system/detail/generic/partition.inl deleted file mode 100644 index 3298afc6f5..0000000000 --- a/compat/thrust/system/detail/generic/partition.inl +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - ForwardIterator stable_partition(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - typedef typename thrust::iterator_traits::value_type InputType; - - // copy input to temp buffer - thrust::detail::temporary_array temp(exec, first, last); - - // count the size of the true partition - typename thrust::iterator_difference::type num_true = thrust::count_if(exec, first,last,pred); - - // point to the beginning of the false partition - ForwardIterator out_false = first; - thrust::advance(out_false, num_true); - - return thrust::stable_partition_copy(exec, temp.begin(), temp.end(), first, out_false, pred).first; -} // end stable_partition() - - -template - ForwardIterator stable_partition(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - typedef typename thrust::iterator_traits::value_type InputType; - - // copy input to temp buffer - thrust::detail::temporary_array temp(exec, first, last); - - // count the size of the true partition - InputIterator stencil_last = stencil; - thrust::advance(stencil_last, temp.size()); - typename thrust::iterator_difference::type num_true = thrust::count_if(exec, stencil, stencil_last, pred); - - // point to the beginning of the false partition - ForwardIterator out_false = first; - thrust::advance(out_false, num_true); - - return thrust::stable_partition_copy(exec, temp.begin(), temp.end(), stencil, first, out_false, pred).first; -} // end stable_partition() - - -template - thrust::pair - stable_partition_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - thrust::detail::unary_negate not_pred(pred); - - // remove_copy_if the true partition to out_true - OutputIterator1 end_of_true_partition = thrust::remove_copy_if(exec, first, last, out_true, not_pred); - - // remove_copy_if the false partition to out_false - OutputIterator2 end_of_false_partition = thrust::remove_copy_if(exec, first, last, out_false, pred); - - return thrust::make_pair(end_of_true_partition, end_of_false_partition); -} // end stable_partition_copy() - - -template - thrust::pair - stable_partition_copy(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - thrust::detail::unary_negate not_pred(pred); - - // remove_copy_if the true partition to out_true - OutputIterator1 end_of_true_partition = thrust::remove_copy_if(exec, first, last, stencil, out_true, not_pred); - - // remove_copy_if the false partition to out_false - OutputIterator2 end_of_false_partition = thrust::remove_copy_if(exec, first, last, stencil, out_false, pred); - - return thrust::make_pair(end_of_true_partition, end_of_false_partition); -} // end stable_partition_copy() - - -template - ForwardIterator partition(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - return thrust::stable_partition(exec, first, last, pred); -} // end partition() - - -template - ForwardIterator partition(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator 
stencil, - Predicate pred) -{ - return thrust::stable_partition(exec, first, last, stencil, pred); -} // end partition() - - -template - thrust::pair - partition_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - return thrust::stable_partition_copy(exec,first,last,out_true,out_false,pred); -} // end partition_copy() - - -template - thrust::pair - partition_copy(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - return thrust::stable_partition_copy(exec,first,last,stencil,out_true,out_false,pred); -} // end partition_copy() - - -template - ForwardIterator partition_point(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - return thrust::find_if_not(exec, first, last, pred); -} // end partition_point() - - -template - bool is_partitioned(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - Predicate pred) -{ - return thrust::is_sorted(exec, - thrust::make_transform_iterator(first, thrust::detail::not1(pred)), - thrust::make_transform_iterator(last, thrust::detail::not1(pred))); -} // end is_partitioned() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/reduce.h b/compat/thrust/system/detail/generic/reduce.h deleted file mode 100644 index 2811df164f..0000000000 --- a/compat/thrust/system/detail/generic/reduce.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - typename thrust::iterator_traits::value_type - reduce(thrust::execution_policy &exec, InputIterator first, InputIterator last); - -template - T reduce(thrust::execution_policy &exec, InputIterator first, InputIterator last, T init); - -template - T reduce(thrust::execution_policy &exec, InputIterator first, InputIterator last, T init, BinaryFunction binary_op); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/reduce.inl b/compat/thrust/system/detail/generic/reduce.inl deleted file mode 100644 index 8f52385163..0000000000 --- a/compat/thrust/system/detail/generic/reduce.inl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
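The generic stable_partition deleted above copies the input into a temporary buffer, counts the true elements with count_if to locate the split point, and then stable_partition_copy's back into the original range. A minimal host-side usage sketch, assuming the public Thrust API (thrust::stable_partition over thrust::host_vector); the functor name is_even and the data are illustrative, not taken from the diff:

#include <thrust/host_vector.h>
#include <thrust/partition.h>

struct is_even
{
  __host__ __device__ bool operator()(int x) const { return (x % 2) == 0; }
};

int main()
{
  thrust::host_vector<int> v(8);
  for (int i = 0; i < 8; i++) v[i] = i;          // 0 1 2 3 4 5 6 7

  // evens move to the front; relative order is preserved on both sides
  thrust::host_vector<int>::iterator split =
      thrust::stable_partition(v.begin(), v.end(), is_even());

  // [v.begin(), split) holds 0 2 4 6, [split, v.end()) holds 1 3 5 7
  return 0;
}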
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - typename thrust::iterator_traits::value_type - reduce(thrust::execution_policy &exec, InputIterator first, InputIterator last) -{ - typedef typename thrust::iterator_value::type InputType; - - // use InputType(0) as init by default - return thrust::reduce(exec, first, last, InputType(0)); -} // end reduce() - - -template - T reduce(thrust::execution_policy &exec, InputIterator first, InputIterator last, T init) -{ - // use plus by default - return thrust::reduce(exec, first, last, init, thrust::plus()); -} // end reduce() - - -template - OutputType reduce(thrust::execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - OutputType init, - BinaryFunction binary_op) -{ - // unimplemented - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - return OutputType(); -} // end reduce() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/reduce_by_key.h b/compat/thrust/system/detail/generic/reduce_by_key.h deleted file mode 100644 index c6064ab53e..0000000000 --- a/compat/thrust/system/detail/generic/reduce_by_key.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
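The reduce overloads deleted above simply layer defaults: reduce(first, last) supplies InputType(0) as the initial value, reduce(first, last, init) supplies thrust::plus, and the fully general overload is intentionally unimplemented here (it static-asserts and is provided by the backends). A hedged usage sketch of those defaults, assuming the public thrust::reduce API; the vector contents are invented:

#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>

int main()
{
  thrust::host_vector<int> v(4);
  v[0] = 3; v[1] = 1; v[2] = 4; v[3] = 1;

  int sum  = thrust::reduce(v.begin(), v.end());                            // init 0, plus  -> 9
  int sum5 = thrust::reduce(v.begin(), v.end(), 5);                          // explicit init -> 14
  int mx   = thrust::reduce(v.begin(), v.end(), 0, thrust::maximum<int>());  // custom op     -> 4
  return 0;
}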
- */ - - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - thrust::pair - reduce_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output); - -template - thrust::pair - reduce_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred); - -template - thrust::pair - reduce_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/reduce_by_key.inl b/compat/thrust/system/detail/generic/reduce_by_key.inl deleted file mode 100644 index 2ca21a5aab..0000000000 --- a/compat/thrust/system/detail/generic/reduce_by_key.inl +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce_by_key.inl - * \brief Inline file for reduce_by_key.h. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - -template -struct reduce_by_key_functor -{ - AssociativeOperator binary_op; - - typedef typename thrust::tuple result_type; - - __host__ __device__ - reduce_by_key_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {} - - __host__ __device__ - result_type operator()(result_type a, result_type b) - { - return result_type(thrust::get<1>(b) ? 
thrust::get<0>(b) : binary_op(thrust::get<0>(a), thrust::get<0>(b)), - thrust::get<1>(a) | thrust::get<1>(b)); - } -}; - -} // end namespace detail - - -template - thrust::pair - reduce_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - typedef typename thrust::iterator_traits::difference_type difference_type; - typedef typename thrust::iterator_traits::value_type KeyType; - - typedef unsigned int FlagType; // TODO use difference_type - - // the pseudocode for deducing the type of the temporary used below: - // - // if BinaryFunction is AdaptableBinaryFunction - // TemporaryType = AdaptableBinaryFunction::result_type - // else if OutputIterator2 is a "pure" output iterator - // TemporaryType = InputIterator2::value_type - // else - // TemporaryType = OutputIterator2::value_type - // - // XXX upon c++0x, TemporaryType needs to be: - // result_of::type - - typedef typename thrust::detail::eval_if< - thrust::detail::has_result_type::value, - thrust::detail::result_type, - thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - >::type ValueType; - - if (keys_first == keys_last) - return thrust::make_pair(keys_output, values_output); - - // input size - difference_type n = keys_last - keys_first; - - InputIterator2 values_last = values_first + n; - - // compute head flags - thrust::detail::temporary_array head_flags(exec, n); - thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, head_flags.begin() + 1, thrust::detail::not2(binary_pred)); - head_flags[0] = 1; - - // compute tail flags - thrust::detail::temporary_array tail_flags(exec, n); //COPY INSTEAD OF TRANSFORM - thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, tail_flags.begin(), thrust::detail::not2(binary_pred)); - tail_flags[n-1] = 1; - - // scan the values by flag - thrust::detail::temporary_array scanned_values(exec, n); - thrust::detail::temporary_array scanned_tail_flags(exec, n); - - thrust::inclusive_scan - (exec, - thrust::make_zip_iterator(thrust::make_tuple(values_first, head_flags.begin())), - thrust::make_zip_iterator(thrust::make_tuple(values_last, head_flags.end())), - thrust::make_zip_iterator(thrust::make_tuple(scanned_values.begin(), scanned_tail_flags.begin())), - detail::reduce_by_key_functor(binary_op)); - - thrust::exclusive_scan(exec, tail_flags.begin(), tail_flags.end(), scanned_tail_flags.begin(), FlagType(0), thrust::plus()); - - // number of unique keys - FlagType N = scanned_tail_flags[n - 1] + 1; - - // scatter the keys and accumulated values - thrust::scatter_if(exec, keys_first, keys_last, scanned_tail_flags.begin(), head_flags.begin(), keys_output); - thrust::scatter_if(exec, scanned_values.begin(), scanned_values.end(), scanned_tail_flags.begin(), tail_flags.begin(), values_output); - - return thrust::make_pair(keys_output + N, values_output + N); -} // end reduce_by_key() - - -template - thrust::pair - reduce_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output) -{ - typedef typename thrust::iterator_value::type KeyType; - - // use equal_to as default BinaryPredicate - return thrust::reduce_by_key(exec, keys_first, keys_last, values_first, keys_output, values_output, 
thrust::equal_to()); -} // end reduce_by_key() - - -template - thrust::pair - reduce_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred) -{ - typedef typename thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - >::type T; - - // use plus as default BinaryFunction - return thrust::reduce_by_key(exec, - keys_first, keys_last, - values_first, - keys_output, - values_output, - binary_pred, - thrust::plus()); -} // end reduce_by_key() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/remove.h b/compat/thrust/system/detail/generic/remove.h deleted file mode 100644 index e23673574e..0000000000 --- a/compat/thrust/system/detail/generic/remove.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file remove.h - * \brief Generic implementations of remove functions. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - ForwardIterator remove(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &value); - -template - OutputIterator remove_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - const T &value); - -template - ForwardIterator remove_if(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - -template - ForwardIterator remove_if(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - -template - OutputIterator remove_copy_if(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred); - -template - OutputIterator remove_copy_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/remove.inl b/compat/thrust/system/detail/generic/remove.inl deleted file mode 100644 index 8a533e029a..0000000000 --- a/compat/thrust/system/detail/generic/remove.inl +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
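The reduce_by_key implementation deleted above reduces each run of equal keys by building head/tail flags, scanning (value, flag) tuples with reduce_by_key_functor, and scattering one result per segment. A minimal host-side usage sketch, assuming the public thrust::reduce_by_key API with its equal_to/plus defaults; the concrete keys and values are invented for illustration:

#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/pair.h>

int main()
{
  // keys: 1 1 2 2 2 3    values: 10 20 30 40 50 60
  thrust::host_vector<int> keys(6), vals(6);
  int k[] = {1,1,2,2,2,3}, x[] = {10,20,30,40,50,60};
  for (int i = 0; i < 6; i++) { keys[i] = k[i]; vals[i] = x[i]; }

  thrust::host_vector<int> keys_out(6), vals_out(6);

  // default predicate equal_to, default reduction plus
  thrust::pair<thrust::host_vector<int>::iterator,
               thrust::host_vector<int>::iterator> ends =
      thrust::reduce_by_key(keys.begin(), keys.end(), vals.begin(),
                            keys_out.begin(), vals_out.begin());

  // keys_out: 1 2 3   vals_out: 30 120 60
  int num_segments = (int)(ends.first - keys_out.begin());   // == 3
  return 0;
}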
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file remove.inl - * \brief Inline file for remove.h - */ - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - ForwardIterator remove(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &value) -{ - thrust::detail::equal_to_value pred(value); - - // XXX consider using a placeholder here - return thrust::remove_if(exec, first, last, pred); -} // end remove() - - -template - OutputIterator remove_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - const T &value) -{ - thrust::detail::equal_to_value pred(value); - - // XXX consider using a placeholder here - return thrust::remove_copy_if(exec, first, last, result, pred); -} // end remove_copy() - - -template - ForwardIterator remove_if(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - typedef typename thrust::iterator_traits::value_type InputType; - - // create temporary storage for an intermediate result - thrust::detail::temporary_array temp(exec, first, last); - - // remove into temp - return thrust::remove_copy_if(exec, temp.begin(), temp.end(), temp.begin(), first, pred); -} // end remove_if() - - -template - ForwardIterator remove_if(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - typedef typename thrust::iterator_traits::value_type InputType; - - // create temporary storage for an intermediate result - thrust::detail::temporary_array temp(exec, first, last); - - // remove into temp - return thrust::remove_copy_if(exec, temp.begin(), temp.end(), stencil, first, pred); -} // end remove_if() - - -template - OutputIterator remove_copy_if(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred) -{ - return thrust::remove_copy_if(exec, first, last, first, result, pred); -} // end remove_copy_if() - - -template - OutputIterator remove_copy_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - return thrust::copy_if(exec, first, last, stencil, result, thrust::detail::not1(pred)); -} // end remove_copy_if() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/replace.h b/compat/thrust/system/detail/generic/replace.h deleted file mode 100644 index deb2e55bc8..0000000000 --- a/compat/thrust/system/detail/generic/replace.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
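Note that remove_if above is out-of-place under the hood: it copies the range into a temporary_array and remove_copy_if's it back, which is also why the stencil overload can read the stencil while overwriting the input. Call sites still use the usual remove/erase idiom; a hedged host-side sketch, assuming the public thrust::remove_if API (the predicate and data are illustrative):

#include <thrust/host_vector.h>
#include <thrust/remove.h>

struct is_negative
{
  __host__ __device__ bool operator()(int x) const { return x < 0; }
};

int main()
{
  thrust::host_vector<int> v(6);
  int init[] = {1, -2, 3, -4, 5, -6};
  for (int i = 0; i < 6; i++) v[i] = init[i];

  // kept elements are compacted to the front; the new logical end is returned
  thrust::host_vector<int>::iterator new_end =
      thrust::remove_if(v.begin(), v.end(), is_negative());

  v.erase(new_end, v.end());   // v now holds 1 3 5
  return 0;
}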
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator replace_copy_if(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred, - const T &new_value); - - -template - OutputIterator replace_copy_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred, - const T &new_value); - - -template - OutputIterator replace_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - const T &old_value, - const T &new_value); - - -template - void replace_if(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred, - const T &new_value); - - -template - void replace_if(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred, - const T &new_value); - - -template - void replace(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &old_value, - const T &new_value); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/replace.inl b/compat/thrust/system/detail/generic/replace.inl deleted file mode 100644 index 52e7118ecc..0000000000 --- a/compat/thrust/system/detail/generic/replace.inl +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - -// this functor receives x, and returns a new_value if predicate(x) is true; otherwise, -// it returns x -template - struct new_value_if -{ - new_value_if(Predicate p, NewType nv):pred(p),new_value(nv){} - - template - __host__ __device__ - OutputType operator()(const InputType x) const - { - return pred(x) ? new_value : x; - } // end operator()() - - // this version of operator()() works like the previous but - // feeds its second argument to pred - template - __host__ __device__ - OutputType operator()(const InputType x, const PredicateArgumentType y) - { - return pred(y) ? 
new_value : x; - } // end operator()() - - Predicate pred; - NewType new_value; -}; // end new_value_if - -// this unary functor ignores its argument and returns a constant -template - struct constant_unary -{ - constant_unary(T _c):c(_c){} - - template - __host__ __device__ - T operator()(U &x) - { - return c; - } // end operator()() - - T c; -}; // end constant_unary - -} // end detail - -template - OutputIterator replace_copy_if(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred, - const T &new_value) -{ - typedef typename thrust::iterator_traits::value_type InputType; - typedef typename thrust::iterator_traits::value_type OutputType; - - detail::new_value_if op(pred,new_value); - return thrust::transform(exec, first, last, result, op); -} // end replace_copy_if() - -template - OutputIterator replace_copy_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred, - const T &new_value) -{ - typedef typename thrust::iterator_traits::value_type OutputType; - - detail::new_value_if op(pred,new_value); - return thrust::transform(exec, first, last, stencil, result, op); -} // end replace_copy_if() - - -template - OutputIterator replace_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - const T &old_value, - const T &new_value) -{ - thrust::detail::equal_to_value pred(old_value); - return thrust::replace_copy_if(exec, first, last, result, pred, new_value); -} // end replace_copy() - -template - void replace_if(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred, - const T &new_value) -{ - detail::constant_unary f(new_value); - - // XXX replace this with generate_if: - // constant_nullary f(new_value); - // generate_if(first, last, first, f, pred); - thrust::transform_if(exec, first, last, first, first, f, pred); -} // end replace_if() - -template - void replace_if(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred, - const T &new_value) -{ - detail::constant_unary f(new_value); - - // XXX replace this with generate_if: - // constant_nullary f(new_value); - // generate_if(stencil, stencil + n, first, f, pred); - thrust::transform_if(exec, first, last, stencil, first, f, pred); -} // end replace_if() - -template - void replace(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &old_value, - const T &new_value) -{ - thrust::detail::equal_to_value pred(old_value); - return thrust::replace_if(exec, first, last, pred, new_value); -} // end replace() - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/reverse.h b/compat/thrust/system/detail/generic/reverse.h deleted file mode 100644 index 327bf221b1..0000000000 --- a/compat/thrust/system/detail/generic/reverse.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
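The replace family deleted above is built entirely from transform: replace_copy_if is a transform with the new_value_if functor, and replace_if is a transform_if writing back into the input (the comments even note it could be generate_if). Typical call sites look like this hedged sketch, assuming the public thrust::replace/replace_if API; the predicate and values are illustrative:

#include <thrust/host_vector.h>
#include <thrust/replace.h>

struct is_zero
{
  __host__ __device__ bool operator()(int x) const { return x == 0; }
};

int main()
{
  thrust::host_vector<int> v(5);
  int init[] = {0, 7, 0, 9, 0};
  for (int i = 0; i < 5; i++) v[i] = init[i];

  thrust::replace_if(v.begin(), v.end(), is_zero(), -1);  // zeros become -1 in place
  thrust::replace(v.begin(), v.end(), 9, 42);             // old_value 9 -> new_value 42

  // v now holds -1 7 -1 42 -1
  return 0;
}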
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - void reverse(thrust::execution_policy &exec, - BidirectionalIterator first, - BidirectionalIterator last); - -template - OutputIterator reverse_copy(thrust::execution_policy &exec, - BidirectionalIterator first, - BidirectionalIterator last, - OutputIterator result); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/reverse.inl b/compat/thrust/system/detail/generic/reverse.inl deleted file mode 100644 index 27c1bbf2fa..0000000000 --- a/compat/thrust/system/detail/generic/reverse.inl +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - void reverse(thrust::execution_policy &exec, - BidirectionalIterator first, - BidirectionalIterator last) -{ - typedef typename thrust::iterator_difference::type difference_type; - - // find the midpoint of [first,last) - difference_type N = thrust::distance(first, last); - BidirectionalIterator mid(first); - thrust::advance(mid, N / 2); - - // swap elements of [first,mid) with [last - 1, mid) - thrust::swap_ranges(exec, first, mid, thrust::make_reverse_iterator(last)); -} // end reverse() - -template - OutputIterator reverse_copy(thrust::execution_policy &exec, - BidirectionalIterator first, - BidirectionalIterator last, - OutputIterator result) -{ - return thrust::copy(exec, - thrust::make_reverse_iterator(last), - thrust::make_reverse_iterator(first), - result); -} // end reverse_copy() - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - - diff --git a/compat/thrust/system/detail/generic/scalar/binary_search.h b/compat/thrust/system/detail/generic/scalar/binary_search.h deleted file mode 100644 index 6ed9e8d9ee..0000000000 --- a/compat/thrust/system/detail/generic/scalar/binary_search.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ - -namespace system -{ - -namespace detail -{ - -namespace generic -{ - -namespace scalar -{ - -template -__host__ __device__ -RandomAccessIterator lower_bound_n(RandomAccessIterator first, - Size n, - const T &val, - BinaryPredicate comp); - -template -__host__ __device__ -RandomAccessIterator lower_bound(RandomAccessIterator first, RandomAccessIterator last, - const T &val, - BinaryPredicate comp); - -template -__host__ __device__ -RandomAccessIterator upper_bound_n(RandomAccessIterator first, - Size n, - const T &val, - BinaryPredicate comp); - -template -__host__ __device__ -RandomAccessIterator upper_bound(RandomAccessIterator first, RandomAccessIterator last, - const T &val, - BinaryPredicate comp); - -template -__host__ __device__ - pair - equal_range(RandomAccessIterator first, RandomAccessIterator last, - const T &val, - BinaryPredicate comp); - -template -__host__ __device__ -bool binary_search(RandomAccessIterator first, RandomAccessIterator last, const T &value, Compare comp); - -} // end scalar - -} // end generic - -} // end detail - -} // end system - -} // end thrust - -#include - diff --git a/compat/thrust/system/detail/generic/scalar/binary_search.inl b/compat/thrust/system/detail/generic/scalar/binary_search.inl deleted file mode 100644 index 5a9d379612..0000000000 --- a/compat/thrust/system/detail/generic/scalar/binary_search.inl +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -namespace system -{ - -namespace detail -{ - -namespace generic -{ - -namespace scalar -{ - -template -__host__ __device__ -RandomAccessIterator lower_bound_n(RandomAccessIterator first, - Size n, - const T &val, - BinaryPredicate comp) -{ - // wrap comp - thrust::detail::host_device_function< - BinaryPredicate, - bool - > wrapped_comp(comp); - - Size start = 0, i; - while(start < n) - { - i = (start + n) / 2; - if(wrapped_comp(first[i], val)) - { - start = i + 1; - } - else - { - n = i; - } - } // end while - - return first + start; -} - -// XXX generalize these upon implementation of scalar::distance & scalar::advance - -template -__host__ __device__ -RandomAccessIterator lower_bound(RandomAccessIterator first, RandomAccessIterator last, - const T &val, - BinaryPredicate comp) -{ - typename thrust::iterator_difference::type n = last - first; - return lower_bound_n(first, n, val, comp); -} - -template -__host__ __device__ -RandomAccessIterator upper_bound_n(RandomAccessIterator first, - Size n, - const T &val, - BinaryPredicate comp) -{ - // wrap comp - thrust::detail::host_device_function< - BinaryPredicate, - bool - > wrapped_comp(comp); - - Size start = 0, i; - while(start < n) - { - i = (start + n) / 2; - if(wrapped_comp(val, first[i])) - { - n = i; - } - else - { - start = i + 1; - } - } // end while - - return first + start; -} - -template -__host__ __device__ -RandomAccessIterator upper_bound(RandomAccessIterator first, RandomAccessIterator last, - const T &val, - BinaryPredicate comp) -{ - typename thrust::iterator_difference::type n = last - first; - return upper_bound_n(first, n, val, comp); -} - -template -__host__ __device__ - pair - equal_range(RandomAccessIterator first, RandomAccessIterator last, - const T &val, - BinaryPredicate comp) -{ - RandomAccessIterator lb = thrust::system::detail::generic::scalar::lower_bound(first, last, val, comp); - return thrust::make_pair(lb, thrust::system::detail::generic::scalar::upper_bound(lb, last, val, comp)); -} - - -template -__host__ __device__ -bool binary_search(RandomAccessIterator first, RandomAccessIterator last, const T &value, Compare comp) -{ - RandomAccessIterator iter = thrust::system::detail::generic::scalar::lower_bound(first, last, value, comp); - - // wrap comp - thrust::detail::host_device_function< - Compare, - bool - > wrapped_comp(comp); - - return iter != last && !wrapped_comp(value,*iter); -} - -} // end scalar - -} // end generic - -} // end detail - -} // end system - -} // end thrust - -#include - diff --git a/compat/thrust/system/detail/generic/scan.h b/compat/thrust/system/detail/generic/scan.h deleted file mode 100644 index 205f87ff9f..0000000000 --- a/compat/thrust/system/detail/generic/scan.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
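lower_bound_n above maintains the classic half-open invariant: everything before start compares less than val, everything at index n or beyond does not, and each iteration halves the window. A standalone restatement of the same loop in plain C++ (the function and variable names are mine, not from the deleted header):

#include <cstddef>

// Index of the first element of data[0..n) that is NOT less than val,
// assuming data is sorted by operator<  (mirrors scalar::lower_bound_n above).
template <typename T>
std::size_t lower_bound_index(const T *data, std::size_t n, const T &val)
{
  std::size_t start = 0;
  while (start < n)
  {
    std::size_t mid = (start + n) / 2;   // same midpoint choice as the deleted code
    if (data[mid] < val)
      start = mid + 1;                   // data[mid] < val: answer lies to the right
    else
      n = mid;                           // data[mid] >= val: mid is still a candidate
  }
  return start;
}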
- */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator inclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -// XXX it is an error to call this function; it has no implementation -template - OutputIterator inclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op); - - -template - OutputIterator exclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -template - OutputIterator exclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init); - - -// XXX it is an error to call this function; it has no implementation -template - OutputIterator exclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init, - BinaryFunction binary_op); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/scan.inl b/compat/thrust/system/detail/generic/scan.inl deleted file mode 100644 index 33e0803c70..0000000000 --- a/compat/thrust/system/detail/generic/scan.inl +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator inclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - // the pseudocode for deducing the type of the temporary used below: - // - // if OutputIterator is a "pure" output iterator - // TemporaryType = InputIterator::value_type - // else - // TemporaryType = OutputIterator::value_type - - typedef typename thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - >::type ValueType; - - // assume plus as the associative operator - return thrust::inclusive_scan(exec, first, last, result, thrust::plus()); -} // end inclusive_scan() - - -template - OutputIterator exclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - // the pseudocode for deducing the type of the temporary used below: - // - // if OutputIterator is a "pure" output iterator - // TemporaryType = InputIterator::value_type - // else - // TemporaryType = OutputIterator::value_type - - typedef typename thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - >::type ValueType; - - // assume 0 as the initialization value - return thrust::exclusive_scan(exec, first, last, result, ValueType(0)); -} // end exclusive_scan() - - -template - OutputIterator exclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init) -{ - // assume plus as the associative operator - return thrust::exclusive_scan(exec, first, last, result, init, thrust::plus()); -} // end exclusive_scan() - - -template - OutputIterator inclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - // unimplemented primitive - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - return result; -} // end inclusive_scan - - -template - OutputIterator exclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init, - BinaryFunction binary_op) -{ - // unimplemented primitive - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - return result; -} // end exclusive_scan() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/scan_by_key.h b/compat/thrust/system/detail/generic/scan_by_key.h deleted file mode 100644 index 160121b58b..0000000000 --- a/compat/thrust/system/detail/generic/scan_by_key.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
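The scan overloads above only fill in defaults: inclusive_scan assumes thrust::plus, exclusive_scan assumes ValueType(0) as the initial value, and the operator-taking overloads static-assert because the real work lives in the backends. A hedged usage sketch of those defaults, assuming the public thrust::inclusive_scan/exclusive_scan API; the data are invented:

#include <thrust/host_vector.h>
#include <thrust/scan.h>

int main()
{
  thrust::host_vector<int> v(4), incl(4), excl(4);
  v[0] = 1; v[1] = 2; v[2] = 3; v[3] = 4;

  thrust::inclusive_scan(v.begin(), v.end(), incl.begin());    // 1 3 6 10
  thrust::exclusive_scan(v.begin(), v.end(), excl.begin());    // 0 1 3 6   (init defaults to 0)
  thrust::exclusive_scan(v.begin(), v.end(), excl.begin(), 5); // 5 6 8 11  (explicit init)
  return 0;
}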
- */ - - -/*! \file scan_by_key.h - * \brief Generic implementations of key-value scans. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result); - - -template - OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred); - - -template - OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred, - AssociativeOperator binary_op); - - -template - OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result); - - -template - OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init); - - -template - OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred); - - -template - OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred, - AssociativeOperator binary_op); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/scan_by_key.inl b/compat/thrust/system/detail/generic/scan_by_key.inl deleted file mode 100644 index d866ddec15..0000000000 --- a/compat/thrust/system/detail/generic/scan_by_key.inl +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - -template -struct segmented_scan_functor -{ - AssociativeOperator binary_op; - - typedef typename thrust::tuple result_type; - - __host__ __device__ - segmented_scan_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {} - - __host__ __device__ - result_type operator()(result_type a, result_type b) - { - return result_type(thrust::get<1>(b) ? 
thrust::get<0>(b) : binary_op(thrust::get<0>(a), thrust::get<0>(b)), - thrust::get<1>(a) | thrust::get<1>(b)); - } -}; - -} // end namespace detail - - -template - OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result) -{ - typedef typename thrust::iterator_traits::value_type InputType1; - return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, thrust::equal_to()); -} - - -template - OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred) -{ - typedef typename thrust::iterator_traits::value_type OutputType; - return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, binary_pred, thrust::plus()); -} - - -template - OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred, - AssociativeOperator binary_op) -{ - typedef typename thrust::iterator_traits::value_type OutputType; - typedef unsigned int HeadFlagType; - - const size_t n = last1 - first1; - - if(n != 0) - { - // compute head flags - thrust::detail::temporary_array flags(exec, n); - flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred)); - - // scan key-flag tuples, - // For additional details refer to Section 2 of the following paper - // S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs" - // NVIDIA Technical Report NVR-2008-003, December 2008 - // http://mgarland.org/files/papers/nvr-2008-003.pdf - thrust::inclusive_scan - (exec, - thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())), - thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())) + n, - thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())), - detail::segmented_scan_functor(binary_op)); - } - - return result + n; -} - - -template - OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result) -{ - typedef typename thrust::iterator_traits::value_type OutputType; - return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, OutputType(0)); -} - - -template - OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init) -{ - typedef typename thrust::iterator_traits::value_type InputType1; - return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, thrust::equal_to()); -} - - -template - OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred) -{ - typedef typename thrust::iterator_traits::value_type OutputType; - return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, binary_pred, thrust::plus()); -} - - -template - OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred, - AssociativeOperator binary_op) -{ - typedef typename 
thrust::iterator_traits::value_type OutputType; - typedef unsigned int HeadFlagType; - - const size_t n = last1 - first1; - - if(n != 0) - { - InputIterator2 last2 = first2 + n; - - // compute head flags - thrust::detail::temporary_array flags(exec, n); - flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred)); - - // shift input one to the right and initialize segments with init - thrust::detail::temporary_array temp(exec, n); - thrust::replace_copy_if(exec, first2, last2 - 1, flags.begin() + 1, temp.begin() + 1, thrust::negate(), init); - temp[0] = init; - - // scan key-flag tuples, - // For additional details refer to Section 2 of the following paper - // S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs" - // NVIDIA Technical Report NVR-2008-003, December 2008 - // http://mgarland.org/files/papers/nvr-2008-003.pdf - thrust::inclusive_scan(exec, - thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())), - thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())) + n, - thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())), - detail::segmented_scan_functor(binary_op)); - } - - return result + n; -} - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/scatter.h b/compat/thrust/system/detail/generic/scatter.h deleted file mode 100644 index 858d11adc1..0000000000 --- a/compat/thrust/system/detail/generic/scatter.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - void scatter(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - RandomAccessIterator output); - - -template - void scatter_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output); - - -template - void scatter_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output, - Predicate pred); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/scatter.inl b/compat/thrust/system/detail/generic/scatter.inl deleted file mode 100644 index 8c40359844..0000000000 --- a/compat/thrust/system/detail/generic/scatter.inl +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
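inclusive_scan_by_key above turns key changes into head flags via not2(binary_pred) and then performs a single inclusive_scan over (value, flag) tuples with segmented_scan_functor, following the Sengupta/Harris/Garland formulation it cites; the exclusive variant additionally shifts the values right and seeds each segment with init. A hedged usage sketch, assuming the public thrust::*_scan_by_key API; keys and values are invented:

#include <thrust/host_vector.h>
#include <thrust/scan.h>

int main()
{
  // keys: 0 0 0 1 1 2    vals: 1 2 3 4 5 6
  thrust::host_vector<int> keys(6), vals(6), out(6);
  int k[] = {0,0,0,1,1,2}, x[] = {1,2,3,4,5,6};
  for (int i = 0; i < 6; i++) { keys[i] = k[i]; vals[i] = x[i]; }

  // the scan restarts at every key change: out = 1 3 6 4 9 6
  thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), out.begin());

  // exclusive variant, init defaults to 0: out = 0 1 3 0 4 0
  thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), out.begin());
  return 0;
}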
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - void scatter(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - RandomAccessIterator output) -{ - thrust::transform(exec, - first, - last, - thrust::make_permutation_iterator(output, map), - thrust::identity::type>()); -} // end scatter() - - -template - void scatter_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output) -{ - // default predicate is identity - typedef typename thrust::iterator_value::type StencilType; - thrust::scatter_if(exec, first, last, map, stencil, output, thrust::identity()); -} // end scatter_if() - - -template - void scatter_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 map, - InputIterator3 stencil, - RandomAccessIterator output, - Predicate pred) -{ - typedef typename thrust::iterator_value::type InputType; - thrust::transform_if(exec, first, last, stencil, thrust::make_permutation_iterator(output, map), thrust::identity(), pred); -} // end scatter_if() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/select_system.h b/compat/thrust/system/detail/generic/select_system.h deleted file mode 100644 index 250a0bce44..0000000000 --- a/compat/thrust/system/detail/generic/select_system.h +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
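scatter above is a transform through a permutation_iterator built over (output, map), and scatter_if simply adds a stencil plus predicate (identity by default). A hedged host-side sketch, assuming the public thrust::scatter/scatter_if API; the map and stencil values are illustrative:

#include <thrust/host_vector.h>
#include <thrust/scatter.h>
#include <thrust/fill.h>

int main()
{
  // values 10 20 30 40 go to output positions 3 0 2 1
  thrust::host_vector<int> vals(4), map(4), out(4, 0);
  int v[] = {10,20,30,40}, m[] = {3,0,2,1};
  for (int i = 0; i < 4; i++) { vals[i] = v[i]; map[i] = m[i]; }

  thrust::scatter(vals.begin(), vals.end(), map.begin(), out.begin());
  // out: 20 40 30 10

  // scatter_if writes only where the stencil is non-zero (default predicate is identity)
  thrust::host_vector<int> stencil(4);
  int s[] = {1,0,1,0};
  for (int i = 0; i < 4; i++) stencil[i] = s[i];

  thrust::fill(out.begin(), out.end(), 0);
  thrust::scatter_if(vals.begin(), vals.end(), map.begin(), stencil.begin(), out.begin());
  // out: 0 0 30 10   (only vals[0] and vals[2] are written, to positions 3 and 2)
  return 0;
}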
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace select_system_detail -{ - - -// min_system case 1: both systems have the same type, just return the first one -template -__host__ __device__ -System &min_system(thrust::execution_policy &system1, - thrust::execution_policy &) -{ - return thrust::detail::derived_cast(system1); -} // end min_system() - - -// min_system case 2: systems have differing type and the first type is considered the minimum -template -__host__ __device__ - typename thrust::detail::enable_if< - thrust::detail::is_same< - System1, - typename thrust::detail::minimum_system::type - >::value, - System1 & - >::type - min_system(thrust::execution_policy &system1, thrust::execution_policy &) -{ - return thrust::detail::derived_cast(system1); -} // end min_system() - - -// min_system case 3: systems have differing type and the second type is considered the minimum -template -__host__ __device__ - typename thrust::detail::enable_if< - thrust::detail::is_same< - System2, - typename thrust::detail::minimum_system::type - >::value, - System2 & - >::type - min_system(thrust::execution_policy &, thrust::execution_policy &system2) -{ - return thrust::detail::derived_cast(system2); -} // end min_system() - - -} // end select_system_detail - - -template -__host__ __device__ - typename thrust::detail::disable_if< - select_system1_exists::value, - System & - >::type - select_system(thrust::execution_policy &system) -{ - return thrust::detail::derived_cast(system); -} // end select_system() - - -template -__host__ __device__ - typename thrust::detail::enable_if_defined< - thrust::detail::minimum_system - >::type - &select_system(thrust::execution_policy &system1, - thrust::execution_policy &system2) -{ - return select_system_detail::min_system(system1,system2); -} // end select_system() - - -template -__host__ __device__ - typename thrust::detail::lazy_disable_if< - select_system3_exists::value, - thrust::detail::minimum_system - >::type - &select_system(thrust::execution_policy &system1, - thrust::execution_policy &system2, - thrust::execution_policy &system3) -{ - return select_system(select_system(system1,system2), system3); -} // end select_system() - - -template -__host__ __device__ - typename thrust::detail::lazy_disable_if< - select_system4_exists::value, - thrust::detail::minimum_system - >::type - &select_system(thrust::execution_policy &system1, - thrust::execution_policy &system2, - thrust::execution_policy &system3, - thrust::execution_policy &system4) -{ - return select_system(select_system(system1,system2,system3), system4); -} // end select_system() - - -template -__host__ __device__ - typename thrust::detail::lazy_disable_if< - select_system5_exists::value, - thrust::detail::minimum_system - >::type - &select_system(thrust::execution_policy &system1, - thrust::execution_policy &system2, - thrust::execution_policy &system3, - thrust::execution_policy &system4, - thrust::execution_policy &system5) -{ - return select_system(select_system(system1,system2,system3,system4), system5); -} // end select_system() - - -template -__host__ __device__ - typename thrust::detail::lazy_disable_if< - select_system6_exists::value, - thrust::detail::minimum_system - >::type - &select_system(thrust::execution_policy &system1, - thrust::execution_policy &system2, - thrust::execution_policy &system3, - thrust::execution_policy &system4, - 
thrust::execution_policy &system5, - thrust::execution_policy &system6) -{ - return select_system(select_system(system1,system2,system3,system4,system5), system6); -} // end select_system() - - -// map a single any_system_tag to device_system_tag -inline __host__ __device__ -thrust::device_system_tag select_system(thrust::any_system_tag) -{ - return thrust::device_system_tag(); -} // end select_system() - - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/sequence.h b/compat/thrust/system/detail/generic/sequence.h deleted file mode 100644 index b23a7b5d22..0000000000 --- a/compat/thrust/system/detail/generic/sequence.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - void sequence(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last); - - -template - void sequence(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - T init); - - -template - void sequence(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - T init, - T step); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/sequence.inl b/compat/thrust/system/detail/generic/sequence.inl deleted file mode 100644 index 45aec69829..0000000000 --- a/compat/thrust/system/detail/generic/sequence.inl +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
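The select_system overloads removed above reduce any number of execution policies pairwise: select_system(s1, ..., sN) is a left fold of the two-argument version, which defers to the minimum_system metafunction, and a lone any_system_tag falls back to device_system_tag. The stand-alone toy below only mirrors the fold; its "device wins" rule is an assumption of the sketch, not Thrust's real system lattice, where unrelated systems are resolved by dedicated overloads.

#include <cstdio>

struct host_tag   {};
struct device_tag {};

const char *name(host_tag)   { return "host";   }
const char *name(device_tag) { return "device"; }

// toy binary rule standing in for the minimum_system metafunction
host_tag   min_system(host_tag,   host_tag)   { return host_tag();   }
device_tag min_system(device_tag, device_tag) { return device_tag(); }
device_tag min_system(host_tag,   device_tag) { return device_tag(); }
device_tag min_system(device_tag, host_tag)   { return device_tag(); }

int main()
{
  // the 3/4/5/6-argument select_system overloads reduce exactly like this left fold
  std::printf("%s\n",
              name(min_system(min_system(host_tag(), device_tag()), host_tag())));
  return 0;   // prints "device"
}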
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - void sequence(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last) -{ - typedef typename thrust::iterator_traits::value_type T; - - thrust::sequence(exec, first, last, T(0)); -} // end sequence() - - -template - void sequence(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - T init) -{ - thrust::sequence(exec, first, last, init, T(1)); -} // end sequence() - - -template - void sequence(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - T init, - T step) -{ - thrust::tabulate(exec, first, last, init + step * thrust::placeholders::_1); -} // end sequence() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/set_operations.h b/compat/thrust/system/detail/generic/set_operations.h deleted file mode 100644 index 1ca8d391de..0000000000 --- a/compat/thrust/system/detail/generic/set_operations.h +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
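As the removed sequence.inl shows, the generic sequence(first, last, init, step) is tabulate with the placeholder expression init + step * _1, and the shorter forms default init to 0 and step to 1. A short equivalence check, assuming only public Thrust headers and illustrative data.

#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/tabulate.h>
#include <thrust/functional.h>
#include <cstdio>

int main()
{
  using namespace thrust::placeholders;

  thrust::host_vector<int> a(5), b(5);

  thrust::sequence(a.begin(), a.end(), 10, 3);          // 10 13 16 19 22
  thrust::tabulate(b.begin(), b.end(), 10 + 3 * _1);    // same values, same mechanism

  for (int i = 0; i < 5; i++) std::printf("%d %d\n", (int)a[i], (int)b[i]);
  return 0;
}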
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator set_difference(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -// XXX it is an error to call this function; it has no implementation -template - OutputIterator set_difference(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp); - - -template - thrust::pair - set_difference_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -template - thrust::pair - set_difference_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp); - - -template - OutputIterator set_intersection(thrust::execution_policy &system, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -// XXX it is an error to call this function; it has no implementation -template - OutputIterator set_intersection(thrust::execution_policy &system, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp); - - -template - thrust::pair - set_intersection_by_key(thrust::execution_policy &system, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -template - thrust::pair - set_intersection_by_key(thrust::execution_policy &system, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp); - - -template - OutputIterator set_symmetric_difference(thrust::execution_policy &system, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -// XXX it is an error to call this function; it has no implementation -template - OutputIterator set_symmetric_difference(thrust::execution_policy &system, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp); - - -template - thrust::pair - set_symmetric_difference_by_key(thrust::execution_policy &system, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -template - thrust::pair - set_symmetric_difference_by_key(thrust::execution_policy &system, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 
keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp); - - -template - OutputIterator set_union(thrust::execution_policy &system, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result); - - -// XXX it is an error to call this function; it has no implementation -template - OutputIterator set_union(thrust::execution_policy &system, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp); - - -template - thrust::pair - set_union_by_key(thrust::execution_policy &system, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -template - thrust::pair - set_union_by_key(thrust::execution_policy &system, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/set_operations.inl b/compat/thrust/system/detail/generic/set_operations.inl deleted file mode 100644 index bac9ccd671..0000000000 --- a/compat/thrust/system/detail/generic/set_operations.inl +++ /dev/null @@ -1,449 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator set_difference(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::set_difference(exec, first1, last1, first2, last2, result, thrust::less()); -} // end set_difference() - - -template - thrust::pair - set_difference_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::set_difference_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less()); -} // end set_difference_by_key() - - -template - thrust::pair - set_difference_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - typedef thrust::tuple iterator_tuple1; - typedef thrust::tuple iterator_tuple2; - typedef thrust::tuple iterator_tuple3; - - typedef thrust::zip_iterator zip_iterator1; - typedef thrust::zip_iterator zip_iterator2; - typedef thrust::zip_iterator zip_iterator3; - - zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1)); - zip_iterator1 zipped_last1 = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1)); - - zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2)); - zip_iterator2 zipped_last2 = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2)); - - zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)); - - thrust::detail::compare_first comp_first(comp); - - iterator_tuple3 result = thrust::set_difference(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple(); - - return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result)); -} // end set_difference_by_key() - - -template - OutputIterator set_intersection(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::set_intersection(exec, first1, last1, first2, last2, result, thrust::less()); -} // end set_intersection() - - -template - thrust::pair - set_intersection_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::set_intersection_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, keys_result, values_result, 
thrust::less()); -} // end set_intersection_by_key() - - -template - thrust::pair - set_intersection_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - typedef thrust::tuple iterator_tuple1; - typedef thrust::tuple iterator_tuple2; - typedef thrust::tuple iterator_tuple3; - - typedef thrust::zip_iterator zip_iterator1; - typedef thrust::zip_iterator zip_iterator2; - typedef thrust::zip_iterator zip_iterator3; - - // fabricate a values_first2 by "sending" keys twice - // it should never be dereferenced by set_intersection - InputIterator2 values_first2 = keys_first2; - - zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1)); - zip_iterator1 zipped_last1 = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1)); - - zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2)); - zip_iterator2 zipped_last2 = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2)); - - zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)); - - thrust::detail::compare_first comp_first(comp); - - iterator_tuple3 result = thrust::set_intersection(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple(); - - return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result)); -} // end set_intersection_by_key() - - -template - OutputIterator set_symmetric_difference(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::set_symmetric_difference(exec, first1, last1, first2, last2, result, thrust::less()); -} // end set_symmetric_difference() - - -template - thrust::pair - set_symmetric_difference_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::set_symmetric_difference_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less()); -} // end set_symmetric_difference_by_key() - - -template - thrust::pair - set_symmetric_difference_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - typedef thrust::tuple iterator_tuple1; - typedef thrust::tuple iterator_tuple2; - typedef thrust::tuple iterator_tuple3; - - typedef thrust::zip_iterator zip_iterator1; - typedef thrust::zip_iterator zip_iterator2; - typedef thrust::zip_iterator zip_iterator3; - - zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1)); - zip_iterator1 zipped_last1 = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, 
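The *_by_key set operations above all reuse the keys-only primitives by zipping keys with values and comparing on the first tuple element (compare_first); set_intersection_by_key additionally zips the second key range with itself, since intersection never emits values from that range. A usage sketch of the resulting public API, with illustrative data.

#include <thrust/host_vector.h>
#include <thrust/set_operations.h>
#include <thrust/pair.h>
#include <cstdio>

int main()
{
  // sorted keys with attached values; only values from the first range are kept
  int k1[] = {1, 3, 5, 7};   int v1[] = {10, 30, 50, 70};
  int k2[] = {3, 4, 5};

  thrust::host_vector<int> keys1(k1, k1 + 4), vals1(v1, v1 + 4), keys2(k2, k2 + 3);
  thrust::host_vector<int> keys_out(3), vals_out(3);

  thrust::pair<thrust::host_vector<int>::iterator,
               thrust::host_vector<int>::iterator> ends =
    thrust::set_intersection_by_key(keys1.begin(), keys1.end(),
                                    keys2.begin(), keys2.end(),
                                    vals1.begin(),
                                    keys_out.begin(), vals_out.begin());

  int n = (int)(ends.first - keys_out.begin());
  for (int i = 0; i < n; i++)
    std::printf("%d:%d ", (int)keys_out[i], (int)vals_out[i]);   // 3:30 5:50
  std::printf("\n");
  return 0;
}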
values_first1)); - - zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2)); - zip_iterator2 zipped_last2 = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2)); - - zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)); - - thrust::detail::compare_first comp_first(comp); - - iterator_tuple3 result = thrust::set_symmetric_difference(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple(); - - return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result)); -} // end set_symmetric_difference_by_key() - - -template - OutputIterator set_union(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::set_union(exec, first1, last1, first2, last2, result, thrust::less()); -} // end set_union() - - -template - thrust::pair - set_union_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::set_union_by_key(exec, keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result, thrust::less()); -} // end set_union_by_key() - - -template - thrust::pair - set_union_by_key(thrust::execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - typedef thrust::tuple iterator_tuple1; - typedef thrust::tuple iterator_tuple2; - typedef thrust::tuple iterator_tuple3; - - typedef thrust::zip_iterator zip_iterator1; - typedef thrust::zip_iterator zip_iterator2; - typedef thrust::zip_iterator zip_iterator3; - - zip_iterator1 zipped_first1 = thrust::make_zip_iterator(thrust::make_tuple(keys_first1, values_first1)); - zip_iterator1 zipped_last1 = thrust::make_zip_iterator(thrust::make_tuple(keys_last1, values_first1)); - - zip_iterator2 zipped_first2 = thrust::make_zip_iterator(thrust::make_tuple(keys_first2, values_first2)); - zip_iterator2 zipped_last2 = thrust::make_zip_iterator(thrust::make_tuple(keys_last2, values_first2)); - - zip_iterator3 zipped_result = thrust::make_zip_iterator(thrust::make_tuple(keys_result, values_result)); - - thrust::detail::compare_first comp_first(comp); - - iterator_tuple3 result = thrust::set_union(exec, zipped_first1, zipped_last1, zipped_first2, zipped_last2, zipped_result, comp_first).get_iterator_tuple(); - - return thrust::make_pair(thrust::get<0>(result), thrust::get<1>(result)); -} // end set_union_by_key() - - -template - OutputIterator set_difference(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - // unimplemented primitive - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - return result; -} // end set_difference() - - -template - OutputIterator 
set_intersection(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - // unimplemented primitive - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - return result; -} // end set_intersection() - - -template - OutputIterator set_symmetric_difference(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - // unimplemented primitive - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - return result; -} // end set_symmetric_difference() - - -template - OutputIterator set_union(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - // unimplemented primitive - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - return result; -} // end set_union() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/sort.h b/compat/thrust/system/detail/generic/sort.h deleted file mode 100644 index 5498708a3b..0000000000 --- a/compat/thrust/system/detail/generic/sort.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - void sort(thrust::execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last); - - -template - void sort(thrust::execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - - -template - void sort_by_key(thrust::execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first); - - -template - void sort_by_key(thrust::execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp); - - -template - void stable_sort(thrust::execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last); - - -// XXX it is an error to call this function; it has no implementation -template - void stable_sort(thrust::execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - - -template - void stable_sort_by_key(thrust::execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first); - - -// XXX it is an error to call this function; it has no implementation -template - void stable_sort_by_key(thrust::execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp); - - -template - bool is_sorted(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last); - - -template - bool is_sorted(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Compare comp); - - -template - ForwardIterator is_sorted_until(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last); - - -template - ForwardIterator is_sorted_until(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Compare comp); - - -} // end generic -} // end detail -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/detail/generic/sort.inl b/compat/thrust/system/detail/generic/sort.inl deleted file mode 100644 index aabb2eed81..0000000000 --- a/compat/thrust/system/detail/generic/sort.inl +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - void sort(thrust::execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last) -{ - typedef typename thrust::iterator_value::type value_type; - thrust::sort(exec, first, last, thrust::less()); -} // end sort() - - -template - void sort(thrust::execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - // implement with stable_sort - thrust::stable_sort(exec, first, last, comp); -} // end sort() - - -template - void sort_by_key(thrust::execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - typedef typename thrust::iterator_value::type value_type; - thrust::sort_by_key(exec, keys_first, keys_last, values_first, thrust::less()); -} // end sort_by_key() - - -template - void sort_by_key(thrust::execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - // implement with stable_sort_by_key - thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first, comp); -} // end sort_by_key() - - -template - void stable_sort(thrust::execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last) -{ - typedef typename thrust::iterator_value::type value_type; - thrust::stable_sort(exec, first, last, thrust::less()); -} // end stable_sort() - - -template - void stable_sort_by_key(thrust::execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - typedef typename iterator_value::type value_type; - thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first, thrust::less()); -} // end stable_sort_by_key() - - -template - bool is_sorted(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last) -{ - return thrust::is_sorted_until(exec, first, last) == last; -} // end is_sorted() - - -template - bool is_sorted(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Compare comp) -{ - return thrust::is_sorted_until(exec, first, last, comp) == last; -} // end is_sorted() - - -template - ForwardIterator is_sorted_until(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last) -{ - typedef typename thrust::iterator_value::type InputType; - - return thrust::is_sorted_until(exec, first, last, thrust::less()); -} // end is_sorted_until() - - -template - ForwardIterator is_sorted_until(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Compare comp) -{ - if(thrust::distance(first,last) < 2) return last; - - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ForwardIterator first_plus_one = first; - thrust::advance(first_plus_one, 1); - - ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first_plus_one, first)); - ZipIterator zipped_last = thrust::make_zip_iterator(thrust::make_tuple(last, first)); - - return thrust::get<0>(thrust::find_if(exec, zipped_first, zipped_last, thrust::detail::tuple_binary_predicate(comp)).get_iterator_tuple()); -} // end is_sorted_until() - - -template - void stable_sort(tag, - RandomAccessIterator first, - RandomAccessIterator last, - 
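is_sorted_until above zips the range with itself shifted by one, uses find_if to locate the first adjacent pair that violates the ordering, and returns the iterator from the shifted range, i.e. the first out-of-order element; is_sorted then simply compares that iterator against last. A small usage sketch with illustrative data.

#include <thrust/host_vector.h>
#include <thrust/sort.h>          // is_sorted / is_sorted_until
#include <cstdio>

int main()
{
  int data[] = {1, 2, 4, 3, 5};
  thrust::host_vector<int> v(data, data + 5);

  // internally: zip (v.begin()+1, v.begin()), find_if the first descending pair,
  // and return the iterator from the shifted range
  thrust::host_vector<int>::iterator it = thrust::is_sorted_until(v.begin(), v.end());

  std::printf("sorted prefix length = %d\n", (int)(it - v.begin()));   // 3
  return 0;
}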
StrictWeakOrdering comp) -{ - // unimplemented primitive - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); -} // end stable_sort() - - -template - void stable_sort_by_key(tag, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - // unimplemented primitive - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); -} // end stable_sort_by_key() - - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/swap_ranges.h b/compat/thrust/system/detail/generic/swap_ranges.h deleted file mode 100644 index 5d640d3feb..0000000000 --- a/compat/thrust/system/detail/generic/swap_ranges.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - ForwardIterator2 swap_ranges(thrust::execution_policy &exec, - ForwardIterator1 first1, - ForwardIterator1 last1, - ForwardIterator2 first2); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/swap_ranges.inl b/compat/thrust/system/detail/generic/swap_ranges.inl deleted file mode 100644 index 0e12d07627..0000000000 --- a/compat/thrust/system/detail/generic/swap_ranges.inl +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - -// XXX define this here rather than in internal_functional.h -// to avoid circular dependence between swap.h & internal_functional.h -struct swap_pair_elements -{ - template - __host__ __device__ - void operator()(Tuple t) - { - // use unqualified swap to allow ADL to catch any user-defined swap - using thrust::swap; - swap(thrust::get<0>(t), thrust::get<1>(t)); - } -}; // end swap_pair_elements - -} // end detail - -template - ForwardIterator2 swap_ranges(thrust::execution_policy &exec, - ForwardIterator1 first1, - ForwardIterator1 last1, - ForwardIterator2 first2) -{ - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator result = thrust::for_each(exec, - thrust::make_zip_iterator(thrust::make_tuple(first1, first2)), - thrust::make_zip_iterator(thrust::make_tuple(last1, first2)), - detail::swap_pair_elements()); - return thrust::get<1>(result.get_iterator_tuple()); -} // end swap_ranges() - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/tabulate.h b/compat/thrust/system/detail/generic/tabulate.h deleted file mode 100644 index e5911b14ec..0000000000 --- a/compat/thrust/system/detail/generic/tabulate.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - void tabulate(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - UnaryOperation unary_op); - -template - OutputIterator tabulate_n(thrust::execution_policy &exec, - OutputIterator first, - Size n, - UnaryOperation unary_op); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/tabulate.inl b/compat/thrust/system/detail/generic/tabulate.inl deleted file mode 100644 index d2ffc26df6..0000000000 --- a/compat/thrust/system/detail/generic/tabulate.inl +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
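The swap_ranges implementation above is a single for_each over a zip of the two ranges, calling unqualified swap so that user-defined swaps are still found through ADL. A minimal usage sketch with illustrative data.

#include <thrust/host_vector.h>
#include <thrust/swap.h>          // thrust::swap_ranges
#include <cstdio>

int main()
{
  thrust::host_vector<int> a(4, 1), b(4, 2);

  // one for_each over a zip of (a, b), swapping each pair element-wise
  thrust::swap_ranges(a.begin(), a.end(), b.begin());

  std::printf("%d %d\n", (int)a[0], (int)b[0]);   // 2 1
  return 0;
}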
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - void tabulate(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - UnaryOperation unary_op) -{ - typedef typename iterator_difference::type difference_type; - - // by default, counting_iterator uses a 64b difference_type on 32b platforms to avoid overflowing its counter. - // this causes problems when a zip_iterator is created in transform's implementation -- ForwardIterator is - // incremented by a 64b difference_type and some compilers warn - // to avoid this, specify the counting_iterator's difference_type to be the same as ForwardIterator's. - thrust::counting_iterator iter(0); - - thrust::transform(exec, iter, iter + thrust::distance(first, last), first, unary_op); -} // end tabulate() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - - diff --git a/compat/thrust/system/detail/generic/tag.h b/compat/thrust/system/detail/generic/tag.h deleted file mode 100644 index 577d6a37a7..0000000000 --- a/compat/thrust/system/detail/generic/tag.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file generic/tag.h - * \brief Implementation of the generic backend's tag. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -// tag exists only to make the generic entry points the least priority match -// during ADL. tag should not be derived from and is constructible from anything -struct tag -{ - template - __host__ __device__ inline - tag(const T &) {} -}; - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/temporary_buffer.h b/compat/thrust/system/detail/generic/temporary_buffer.h deleted file mode 100644 index 8cb08b06ae..0000000000 --- a/compat/thrust/system/detail/generic/temporary_buffer.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
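tabulate above is a transform over a counting_iterator of indices, with the counter's difference_type pinned to the output iterator's to avoid the 64-bit-counter issue mentioned in the comment. A small equivalence sketch, assuming public headers; square is an illustrative functor and the counter is simplified to int.

#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/tabulate.h>
#include <cstdio>

struct square { __host__ __device__ int operator()(int i) const { return i * i; } };

int main()
{
  thrust::host_vector<int> a(5), b(5);

  thrust::tabulate(a.begin(), a.end(), square());       // 0 1 4 9 16

  // the same thing spelled out the way the removed implementation does it
  thrust::counting_iterator<int> idx(0);
  thrust::transform(idx, idx + 5, b.begin(), square());

  for (int i = 0; i < 5; i++) std::printf("%d %d\n", (int)a[i], (int)b[i]);
  return 0;
}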
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - thrust::pair, typename thrust::pointer::difference_type> - get_temporary_buffer(thrust::execution_policy &exec, typename thrust::pointer::difference_type n); - - -template - void return_temporary_buffer(thrust::execution_policy &exec, Pointer p); - - -} // end generic -} // end detail -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/detail/generic/temporary_buffer.inl b/compat/thrust/system/detail/generic/temporary_buffer.inl deleted file mode 100644 index 0a6be7ee08..0000000000 --- a/compat/thrust/system/detail/generic/temporary_buffer.inl +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - thrust::pair, typename thrust::pointer::difference_type> - get_temporary_buffer(thrust::execution_policy &exec, typename thrust::pointer::difference_type n) -{ - thrust::pointer ptr = thrust::malloc(exec, n); - - // check for a failed malloc - if(!ptr.get()) - { - n = 0; - } // end if - - return thrust::make_pair(ptr, n); -} // end get_temporary_buffer() - - -template - void return_temporary_buffer(thrust::execution_policy &exec, Pointer p) -{ - thrust::free(exec, p); -} // end return_temporary_buffer() - - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/transform.h b/compat/thrust/system/detail/generic/transform.h deleted file mode 100644 index e98d40291e..0000000000 --- a/compat/thrust/system/detail/generic/transform.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
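The generic get_temporary_buffer above forwards to thrust::malloc and reports a size of zero when allocation fails; return_temporary_buffer forwards to thrust::free. The sketch below follows the usage pattern shown in the thrust/memory.h documentation; buf and the 1024-element size are illustrative, and running the device variant assumes a CUDA-capable build.

#include <thrust/memory.h>
#include <thrust/pair.h>
#include <cstddef>
#include <cstdio>

int main()
{
  thrust::device_system_tag device_sys;

  // scratch space for 1024 ints; n comes back as 0 if the allocation failed
  thrust::pair<thrust::pointer<int, thrust::device_system_tag>, std::ptrdiff_t> buf =
    thrust::get_temporary_buffer<int>(device_sys, 1024);

  if (buf.second > 0)
  {
    std::printf("got %d temporary ints\n", (int)buf.second);
    thrust::return_temporary_buffer(device_sys, buf.first);
  }
  return 0;
}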
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - OutputIterator transform(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction op); - -template - OutputIterator transform(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryFunction op); - -template - ForwardIterator transform_if(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - ForwardIterator result, - UnaryFunction unary_op, - Predicate pred); - -template - ForwardIterator transform_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - ForwardIterator result, - UnaryFunction unary_op, - Predicate pred); - -template - ForwardIterator transform_if(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator3 stencil, - ForwardIterator result, - BinaryFunction binary_op, - Predicate pred); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/transform.inl b/compat/thrust/system/detail/generic/transform.inl deleted file mode 100644 index 8f0995328c..0000000000 --- a/compat/thrust/system/detail/generic/transform.inl +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - OutputIterator transform(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction op) -{ - // XXX WAR the problem of a generic __host__ __device__ functor's inability to invoke - // a function which is only __host__ or __device__ by selecting a generic functor - // which is one or the other - // when nvcc is able to deal with this, remove this WAR - - // given the minimal system, determine the unary transform functor we need - typedef typename thrust::detail::unary_transform_functor::type UnaryTransformFunctor; - - // make an iterator tuple - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator zipped_result = - thrust::for_each(exec, - thrust::make_zip_iterator(thrust::make_tuple(first,result)), - thrust::make_zip_iterator(thrust::make_tuple(last,result)), - UnaryTransformFunctor(op)); - - return thrust::get<1>(zipped_result.get_iterator_tuple()); -} // end transform() - - -template - OutputIterator transform(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryFunction op) -{ - // XXX WAR the problem of a generic __host__ __device__ functor's inability to invoke - // a function which is only __host__ or __device__ by selecting a generic functor - // which is one or the other - // when nvcc is able to deal with this, remove this WAR - - // given the minimal system, determine the binary transform functor we need - typedef typename thrust::detail::binary_transform_functor::type BinaryTransformFunctor; - - // make an iterator tuple - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator zipped_result = - thrust::for_each(exec, - thrust::make_zip_iterator(thrust::make_tuple(first1,first2,result)), - thrust::make_zip_iterator(thrust::make_tuple(last1,first2,result)), - BinaryTransformFunctor(op)); - - return thrust::get<2>(zipped_result.get_iterator_tuple()); -} // end transform() - - -template - ForwardIterator transform_if(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - ForwardIterator result, - UnaryFunction unary_op, - Predicate pred) -{ - // XXX WAR the problem of a generic __host__ __device__ functor's inability to invoke - // a function which is only __host__ or __device__ by selecting a generic functor - // which is one or the other - // when nvcc is able to deal with this, remove this WAR - - // given the minimal system, determine the unary transform_if functor we need - typedef typename thrust::detail::unary_transform_if_functor::type UnaryTransformIfFunctor; - - // make an iterator tuple - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator zipped_result = - thrust::for_each(exec, - thrust::make_zip_iterator(thrust::make_tuple(first,result)), - thrust::make_zip_iterator(thrust::make_tuple(last,result)), - UnaryTransformIfFunctor(unary_op,pred)); - - return thrust::get<1>(zipped_result.get_iterator_tuple()); -} // end transform_if() - - -template - ForwardIterator transform_if(thrust::execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - ForwardIterator result, - UnaryFunction unary_op, - Predicate pred) -{ - // XXX WAR the problem of a generic 
__host__ __device__ functor's inability to invoke - // a function which is only __host__ or __device__ by selecting a generic functor - // which is one or the other - // when nvcc is able to deal with this, remove this WAR - - // given the minimal system, determine the unary transform_if functor we need - typedef typename thrust::detail::unary_transform_if_with_stencil_functor::type UnaryTransformIfFunctor; - - // make an iterator tuple - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator zipped_result = - thrust::for_each(exec, - thrust::make_zip_iterator(thrust::make_tuple(first,stencil,result)), - thrust::make_zip_iterator(thrust::make_tuple(last,stencil,result)), - UnaryTransformIfFunctor(unary_op,pred)); - - return thrust::get<2>(zipped_result.get_iterator_tuple()); -} // end transform_if() - - -template - ForwardIterator transform_if(thrust::execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator3 stencil, - ForwardIterator result, - BinaryFunction binary_op, - Predicate pred) -{ - // XXX WAR the problem of a generic __host__ __device__ functor's inability to invoke - // a function which is only __host__ or __device__ by selecting a generic functor - // which is one or the other - // when nvcc is able to deal with this, remove this WAR - - // given the minimal system, determine the binary transform_if functor we need - typedef typename thrust::detail::binary_transform_if_functor::type BinaryTransformIfFunctor; - - // make an iterator tuple - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator zipped_result = - thrust::for_each(exec, - thrust::make_zip_iterator(thrust::make_tuple(first1,first2,stencil,result)), - thrust::make_zip_iterator(thrust::make_tuple(last1,first2,stencil,result)), - BinaryTransformIfFunctor(binary_op,pred)); - - return thrust::get<3>(zipped_result.get_iterator_tuple()); -} // end transform_if() - - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/transform_reduce.h b/compat/thrust/system/detail/generic/transform_reduce.h deleted file mode 100644 index c1f098f50f..0000000000 --- a/compat/thrust/system/detail/generic/transform_reduce.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
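All of the transform variants above are phrased as a single for_each over a zip_iterator bundling inputs, optional stencil and output, which is the workaround described in the comments for functors that are host-only or device-only. From the caller's side the stencil form looks like the sketch below, with illustrative data; negate plays the role of the unary op and identity acts as the predicate.

#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <cstdio>

int main()
{
  int d[] = {1, 2, 3, 4};
  int s[] = {0, 1, 0, 1};
  thrust::host_vector<int> data(d, d + 4), stencil(s, s + 4), result(4, -1);

  // only positions whose stencil value satisfies the predicate are written;
  // internally this is one for_each over a zip of (data, stencil, result)
  thrust::transform_if(data.begin(), data.end(), stencil.begin(), result.begin(),
                       thrust::negate<int>(), thrust::identity<int>());

  for (int i = 0; i < 4; i++) std::printf("%d ", (int)result[i]);   // -1 -2 -1 -4
  std::printf("\n");
  return 0;
}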
- */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - OutputType transform_reduce(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - UnaryFunction unary_op, - OutputType init, - BinaryFunction binary_op); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/transform_reduce.inl b/compat/thrust/system/detail/generic/transform_reduce.inl deleted file mode 100644 index ce8b6a1213..0000000000 --- a/compat/thrust/system/detail/generic/transform_reduce.inl +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - OutputType transform_reduce(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - UnaryFunction unary_op, - OutputType init, - BinaryFunction binary_op) -{ - thrust::transform_iterator xfrm_first(first, unary_op); - thrust::transform_iterator xfrm_last(last, unary_op); - - return thrust::reduce(exec, xfrm_first, xfrm_last, init, binary_op); -} // end transform_reduce() - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/transform_scan.h b/compat/thrust/system/detail/generic/transform_scan.h deleted file mode 100644 index 99db86e4d9..0000000000 --- a/compat/thrust/system/detail/generic/transform_scan.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
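transform_reduce above is literally a reduce over a transform_iterator, so the fused call and the spelled-out form give the same result. A short check, assuming public headers; square is an illustrative functor.

#include <thrust/host_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/reduce.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/functional.h>
#include <cstdio>

struct square { __host__ __device__ int operator()(int x) const { return x * x; } };

int main()
{
  int d[] = {1, 2, 3, 4};
  thrust::host_vector<int> v(d, d + 4);

  int a = thrust::transform_reduce(v.begin(), v.end(), square(), 0, thrust::plus<int>());

  // the removed generic version is exactly this: reduce over a transform_iterator
  int b = thrust::reduce(thrust::make_transform_iterator(v.begin(), square()),
                         thrust::make_transform_iterator(v.end(),   square()),
                         0, thrust::plus<int>());

  std::printf("%d %d\n", a, b);   // 30 30
  return 0;
}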
- */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - OutputIterator transform_inclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - BinaryFunction binary_op); - -template - OutputIterator transform_exclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - T init, - AssociativeOperator binary_op); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/transform_scan.inl b/compat/thrust/system/detail/generic/transform_scan.inl deleted file mode 100644 index a95ec20e67..0000000000 --- a/compat/thrust/system/detail/generic/transform_scan.inl +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - OutputIterator transform_inclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - BinaryFunction binary_op) -{ - // the pseudocode for deducing the type of the temporary used below: - // - // if UnaryFunction is AdaptableUnaryFunction - // TemporaryType = AdaptableUnaryFunction::result_type - // else if OutputIterator is a "pure" output iterator - // TemporaryType = InputIterator::value_type - // else - // TemporaryType = OutputIterator::value_type - // - // XXX upon c++0x, TemporaryType needs to be: - // result_of::type - - typedef typename thrust::detail::eval_if< - thrust::detail::has_result_type::value, - thrust::detail::result_type, - thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - >::type ValueType; - - thrust::transform_iterator _first(first, unary_op); - thrust::transform_iterator _last(last, unary_op); - - return thrust::inclusive_scan(exec, _first, _last, result, binary_op); -} // end transform_inclusive_scan() - -template - OutputIterator transform_exclusive_scan(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - T init, - AssociativeOperator binary_op) -{ - // the pseudocode for deducing the type of the temporary used below: - // - // if UnaryFunction is AdaptableUnaryFunction - // TemporaryType = AdaptableUnaryFunction::result_type - // else if OutputIterator is a "pure" output iterator - // TemporaryType = InputIterator::value_type - // else - // TemporaryType = OutputIterator::value_type - // - // XXX upon c++0x, TemporaryType needs to be: - // result_of::type - - typedef typename 
thrust::detail::eval_if< - thrust::detail::has_result_type::value, - thrust::detail::result_type, - thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - >::type ValueType; - - thrust::transform_iterator _first(first, unary_op); - thrust::transform_iterator _last(last, unary_op); - - return thrust::exclusive_scan(exec, _first, _last, result, init, binary_op); -} // end transform_exclusive_scan() - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - - diff --git a/compat/thrust/system/detail/generic/type_traits.h b/compat/thrust/system/detail/generic/type_traits.h deleted file mode 100644 index 40113525f3..0000000000 --- a/compat/thrust/system/detail/generic/type_traits.h +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file generic/type_traits.h - * \brief Introspection for free functions defined in generic. - */ - -#pragma once - -#include - -namespace thrust -{ - -// forward declaration of any_system_tag for any_conversion below -struct any_system_tag; - -namespace system -{ -namespace detail -{ - -// we must define these traits outside of generic's namespace -namespace generic_type_traits_ns -{ - -typedef char yes; -typedef char (&no)[2]; - -struct any_conversion -{ - template any_conversion(const T &); - - // add this extra constructor to disambiguate conversion from any_system_tag - any_conversion(const any_system_tag &); -}; - -namespace select_system_exists_ns -{ - no select_system(const any_conversion &); - no select_system(const any_conversion &, const any_conversion &); - no select_system(const any_conversion &, const any_conversion &, const any_conversion &); - no select_system(const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &); - no select_system(const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &); - no select_system(const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &, const any_conversion &); - - template yes check(const T &); - - no check(no); - - template - struct select_system1_exists - { - static Tag &tag; - - static const bool value = sizeof(check(select_system(tag))) == sizeof(yes); - }; - - template - struct select_system2_exists - { - static Tag1 &tag1; - static Tag2 &tag2; - - static const bool value = sizeof(check(select_system(tag1,tag2))) == sizeof(yes); - }; - - template - struct select_system3_exists - { - static Tag1 &tag1; - static Tag2 &tag2; - static Tag3 &tag3; - - static const bool value = sizeof(check(select_system(tag1,tag2,tag3))) == sizeof(yes); - }; - - template - struct select_system4_exists - { - static Tag1 &tag1; - static Tag2 &tag2; - static Tag3 &tag3; - static Tag4 &tag4; - - static const bool value = 
sizeof(check(select_system(tag1,tag2,tag3,tag4))) == sizeof(yes); - }; - - template - struct select_system5_exists - { - static Tag1 &tag1; - static Tag2 &tag2; - static Tag3 &tag3; - static Tag4 &tag4; - static Tag5 &tag5; - - static const bool value = sizeof(check(select_system(tag1,tag2,tag3,tag4,tag5))) == sizeof(yes); - }; - - template - struct select_system6_exists - { - static Tag1 &tag1; - static Tag2 &tag2; - static Tag3 &tag3; - static Tag4 &tag4; - static Tag5 &tag5; - static Tag6 &tag6; - - static const bool value = sizeof(check(select_system(tag1,tag2,tag3,tag4,tag5,tag6))) == sizeof(yes); - }; -} // end select_system_exists_ns - -} // end generic_type_traits_ns - -namespace generic -{ - -template - struct select_system1_exists - : generic_type_traits_ns::select_system_exists_ns::select_system1_exists -{}; - -template - struct select_system2_exists - : generic_type_traits_ns::select_system_exists_ns::select_system2_exists -{}; - -template - struct select_system3_exists - : generic_type_traits_ns::select_system_exists_ns::select_system3_exists -{}; - -template - struct select_system4_exists - : generic_type_traits_ns::select_system_exists_ns::select_system4_exists -{}; - -template - struct select_system5_exists - : generic_type_traits_ns::select_system_exists_ns::select_system5_exists -{}; - -template - struct select_system6_exists - : generic_type_traits_ns::select_system_exists_ns::select_system6_exists -{}; - -} // end generic -} // end detail -} // end system -} // end thrust - diff --git a/compat/thrust/system/detail/generic/uninitialized_copy.h b/compat/thrust/system/detail/generic/uninitialized_copy.h deleted file mode 100644 index 67e3e68328..0000000000 --- a/compat/thrust/system/detail/generic/uninitialized_copy.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - ForwardIterator uninitialized_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - ForwardIterator result); - -template - ForwardIterator uninitialized_copy_n(thrust::execution_policy &exec, - InputIterator first, - Size n, - ForwardIterator result); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/uninitialized_copy.inl b/compat/thrust/system/detail/generic/uninitialized_copy.inl deleted file mode 100644 index 414e6e48fe..0000000000 --- a/compat/thrust/system/detail/generic/uninitialized_copy.inl +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
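// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the diff above: the generic
// transform_inclusive_scan removed a few hunks earlier is just an
// inclusive_scan over a transform_iterator. A hedged, host-only
// equivalence; 'twice' is a hypothetical helper, and out must be at least
// as long as in.
#include <thrust/transform_scan.h>
#include <thrust/scan.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/functional.h>
#include <vector>

struct twice { int operator()(int x) const { return 2 * x; } };

void running_sum_of_doubles(const std::vector<int>& in, std::vector<int>& out)
{
  // library entry point
  thrust::transform_inclusive_scan(in.begin(), in.end(), out.begin(),
                                   twice(), thrust::plus<int>());

  // what the deleted generic implementation turns that call into
  thrust::inclusive_scan(thrust::make_transform_iterator(in.begin(), twice()),
                         thrust::make_transform_iterator(in.end(),   twice()),
                         out.begin(), thrust::plus<int>());
}
// ---------------------------------------------------------------------------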
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - -template - struct uninitialized_copy_functor -{ - template - __host__ __device__ - void operator()(Tuple t) - { - const InputType &in = thrust::get<0>(t); - OutputType &out = thrust::get<1>(t); - - ::new(static_cast(&out)) OutputType(in); - } // end operator()() -}; // end uninitialized_copy_functor - - -// non-trivial copy constructor path -template - ForwardIterator uninitialized_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - ForwardIterator result, - thrust::detail::false_type) // has_trivial_copy_constructor -{ - // zip up the iterators - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(first,result)); - ZipIterator end = begin; - - // get a zip_iterator pointing to the end - const typename thrust::iterator_difference::type n = thrust::distance(first,last); - thrust::advance(end, n); - - // create a functor - typedef typename iterator_traits::value_type InputType; - typedef typename iterator_traits::value_type OutputType; - - detail::uninitialized_copy_functor f; - - // do the for_each - thrust::for_each(exec, begin, end, f); - - // return the end of the output range - return thrust::get<1>(end.get_iterator_tuple()); -} // end uninitialized_copy() - - -// trivial copy constructor path -template - ForwardIterator uninitialized_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - ForwardIterator result, - thrust::detail::true_type) // has_trivial_copy_constructor -{ - return thrust::copy(exec, first, last, result); -} // end uninitialized_copy() - - -// non-trivial copy constructor path -template - ForwardIterator uninitialized_copy_n(thrust::execution_policy &exec, - InputIterator first, - Size n, - ForwardIterator result, - thrust::detail::false_type) // has_trivial_copy_constructor -{ - // zip up the iterators - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first,result)); - - // create a functor - typedef typename iterator_traits::value_type InputType; - typedef typename iterator_traits::value_type OutputType; - - detail::uninitialized_copy_functor f; - - // do the for_each_n - ZipIterator zipped_last = thrust::for_each_n(exec, zipped_first, n, f); - - // return the end of the output range - return thrust::get<1>(zipped_last.get_iterator_tuple()); -} // end uninitialized_copy_n() - - -// trivial copy constructor path -template - ForwardIterator uninitialized_copy_n(thrust::execution_policy &exec, - InputIterator first, - Size n, - ForwardIterator result, - thrust::detail::true_type) // has_trivial_copy_constructor -{ - return thrust::copy_n(exec, first, n, result); -} // end uninitialized_copy_n() - - -} // end detail - - -template - ForwardIterator uninitialized_copy(thrust::execution_policy &exec, - 
InputIterator first, - InputIterator last, - ForwardIterator result) -{ - typedef typename iterator_traits::value_type ResultType; - - typedef typename thrust::detail::has_trivial_copy_constructor::type ResultTypeHasTrivialCopyConstructor; - - return thrust::system::detail::generic::detail::uninitialized_copy(exec, first, last, result, ResultTypeHasTrivialCopyConstructor()); -} // end uninitialized_copy() - - -template - ForwardIterator uninitialized_copy_n(thrust::execution_policy &exec, - InputIterator first, - Size n, - ForwardIterator result) -{ - typedef typename iterator_traits::value_type ResultType; - - typedef typename thrust::detail::has_trivial_copy_constructor::type ResultTypeHasTrivialCopyConstructor; - - return thrust::system::detail::generic::detail::uninitialized_copy_n(exec, first, n, result, ResultTypeHasTrivialCopyConstructor()); -} // end uninitialized_copy_n() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/uninitialized_fill.h b/compat/thrust/system/detail/generic/uninitialized_fill.h deleted file mode 100644 index c1df694c02..0000000000 --- a/compat/thrust/system/detail/generic/uninitialized_fill.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - -template - void uninitialized_fill(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &x); - -template - ForwardIterator uninitialized_fill_n(thrust::execution_policy &exec, - ForwardIterator first, - Size n, - const T &x); - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/uninitialized_fill.inl b/compat/thrust/system/detail/generic/uninitialized_fill.inl deleted file mode 100644 index bb30b24fd7..0000000000 --- a/compat/thrust/system/detail/generic/uninitialized_fill.inl +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
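// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the diff above: the compile-time
// dispatch performed by the removed uninitialized_copy.inl. Real Thrust
// keys on its own has_trivial_copy_constructor trait; this hedged stand-in
// uses the C++11 std::is_trivially_copyable trait and std algorithms.
#include <algorithm>
#include <iterator>
#include <new>
#include <type_traits>

template <typename InputIt, typename ForwardIt>
ForwardIt uninit_copy_impl(InputIt first, InputIt last, ForwardIt result, std::true_type)
{
  return std::copy(first, last, result);              // trivial path: plain element copy
}

template <typename InputIt, typename ForwardIt>
ForwardIt uninit_copy_impl(InputIt first, InputIt last, ForwardIt result, std::false_type)
{
  typedef typename std::iterator_traits<ForwardIt>::value_type T;
  for (; first != last; ++first, ++result)
    ::new (static_cast<void*>(&*result)) T(*first);   // construct into raw storage
  return result;
}

template <typename InputIt, typename ForwardIt>
ForwardIt uninitialized_copy_sketch(InputIt first, InputIt last, ForwardIt result)
{
  typedef typename std::iterator_traits<ForwardIt>::value_type T;
  return uninit_copy_impl(first, last, result,
                          typename std::is_trivially_copyable<T>::type());
}
// ---------------------------------------------------------------------------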
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ -namespace detail -{ - -template - void uninitialized_fill(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &x, - thrust::detail::true_type) // has_trivial_copy_constructor -{ - thrust::fill(exec, first, last, x); -} // end uninitialized_fill() - -template - void uninitialized_fill(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &x, - thrust::detail::false_type) // has_trivial_copy_constructor -{ - typedef typename iterator_traits::value_type ValueType; - - thrust::for_each(exec, first, last, thrust::detail::uninitialized_fill_functor(x)); -} // end uninitialized_fill() - -template - ForwardIterator uninitialized_fill_n(thrust::execution_policy &exec, - ForwardIterator first, - Size n, - const T &x, - thrust::detail::true_type) // has_trivial_copy_constructor -{ - return thrust::fill_n(exec, first, n, x); -} // end uninitialized_fill() - -template - ForwardIterator uninitialized_fill_n(thrust::execution_policy &exec, - ForwardIterator first, - Size n, - const T &x, - thrust::detail::false_type) // has_trivial_copy_constructor -{ - typedef typename iterator_traits::value_type ValueType; - - return thrust::for_each_n(exec, first, n, thrust::detail::uninitialized_fill_functor(x)); -} // end uninitialized_fill() - -} // end detail - -template - void uninitialized_fill(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - const T &x) -{ - typedef typename iterator_traits::value_type ValueType; - - typedef thrust::detail::has_trivial_copy_constructor ValueTypeHasTrivialCopyConstructor; - - thrust::system::detail::generic::detail::uninitialized_fill(exec, first, last, x, - ValueTypeHasTrivialCopyConstructor()); -} // end uninitialized_fill() - -template - ForwardIterator uninitialized_fill_n(thrust::execution_policy &exec, - ForwardIterator first, - Size n, - const T &x) -{ - typedef typename iterator_traits::value_type ValueType; - - typedef thrust::detail::has_trivial_copy_constructor ValueTypeHasTrivialCopyConstructor; - - return thrust::system::detail::generic::detail::uninitialized_fill_n(exec, first, n, x, - ValueTypeHasTrivialCopyConstructor()); -} // end uninitialized_fill() - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/unique.h b/compat/thrust/system/detail/generic/unique.h deleted file mode 100644 index 57e17cafa4..0000000000 --- a/compat/thrust/system/detail/generic/unique.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
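// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the diff above: the same
// trivial/non-trivial split, this time for the removed
// uninitialized_fill.inl -- plain fill for trivially copyable value types,
// placement-new construction otherwise. Hedged std-based stand-in; the
// real code dispatches at compile time via tag types rather than this
// runtime if.
#include <algorithm>
#include <iterator>
#include <new>
#include <type_traits>

template <typename ForwardIt, typename T>
void uninitialized_fill_sketch(ForwardIt first, ForwardIt last, const T& x)
{
  typedef typename std::iterator_traits<ForwardIt>::value_type V;
  if (std::is_trivially_copyable<V>::value)
    std::fill(first, last, x);                        // trivial path: ordinary fill
  else
    for (; first != last; ++first)
      ::new (static_cast<void*>(&*first)) V(x);       // construct each element in place
}
// ---------------------------------------------------------------------------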
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template -ForwardIterator unique(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last); - - -template -ForwardIterator unique(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred); - - -template -OutputIterator unique_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output); - - -template -OutputIterator unique_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate binary_pred); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/unique.inl b/compat/thrust/system/detail/generic/unique.inl deleted file mode 100644 index 42d6b15e82..0000000000 --- a/compat/thrust/system/detail/generic/unique.inl +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file unique.inl - * \brief Inline file for unique.h. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - ForwardIterator unique(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last) -{ - typedef typename thrust::iterator_traits::value_type InputType; - - return thrust::unique(exec, first, last, thrust::equal_to()); -} // end unique() - - -template - ForwardIterator unique(thrust::execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred) -{ - typedef typename thrust::iterator_traits::value_type InputType; - - thrust::detail::temporary_array input(exec, first, last); - - return thrust::unique_copy(exec, input.begin(), input.end(), first, binary_pred); -} // end unique() - - -template - OutputIterator unique_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output) -{ - typedef typename thrust::iterator_value::type value_type; - return thrust::unique_copy(exec, first,last,output,thrust::equal_to()); -} // end unique_copy() - - -template - OutputIterator unique_copy(thrust::execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate binary_pred) -{ - // empty sequence - if(first == last) - return output; - - thrust::detail::temporary_array stencil(exec, thrust::distance(first, last)); - - // mark first element in each group - stencil[0] = 1; - thrust::transform(exec, first, last - 1, first + 1, stencil.begin() + 1, thrust::detail::not2(binary_pred)); - - return thrust::copy_if(exec, first, last, stencil.begin(), output, thrust::identity()); -} // end 
unique_copy() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/generic/unique_by_key.h b/compat/thrust/system/detail/generic/unique_by_key.h deleted file mode 100644 index aa62f73e51..0000000000 --- a/compat/thrust/system/detail/generic/unique_by_key.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - thrust::pair - unique_by_key(thrust::execution_policy &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first); - - -template - thrust::pair - unique_by_key(thrust::execution_policy &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred); - - -template - thrust::pair - unique_by_key_copy(thrust::execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output); - - -template - thrust::pair - unique_by_key_copy(thrust::execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred); - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/generic/unique_by_key.inl b/compat/thrust/system/detail/generic/unique_by_key.inl deleted file mode 100644 index c780fa71d1..0000000000 --- a/compat/thrust/system/detail/generic/unique_by_key.inl +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
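// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the diff above: the "mark the head of
// each run, then copy_if on that stencil" trick behind the removed generic
// unique_copy. Host-only, std::vector based; unique_copy_sketch is a
// hypothetical name.
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <vector>

template <typename T>
std::vector<T> unique_copy_sketch(const std::vector<T>& in)
{
  std::vector<T> out;
  if (in.empty()) return out;

  std::vector<int> stencil(in.size());
  stencil[0] = 1;  // the first element always starts a run
  // stencil[i] = (in[i-1] != in[i]) for i >= 1
  thrust::transform(in.begin(), in.end() - 1, in.begin() + 1,
                    stencil.begin() + 1, thrust::not_equal_to<T>());

  out.resize(in.size());
  typename std::vector<T>::iterator end =
    thrust::copy_if(in.begin(), in.end(), stencil.begin(), out.begin(),
                    thrust::identity<int>());
  out.erase(end, out.end());
  return out;
}
// ---------------------------------------------------------------------------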
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace generic -{ - - -template - thrust::pair - unique_by_key(thrust::execution_policy &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first) -{ - typedef typename thrust::iterator_traits::value_type KeyType; - return thrust::unique_by_key(exec, keys_first, keys_last, values_first, thrust::equal_to()); -} // end unique_by_key() - - -template - thrust::pair - unique_by_key(thrust::execution_policy &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred) -{ - typedef typename thrust::iterator_traits::value_type InputType1; - typedef typename thrust::iterator_traits::value_type InputType2; - - ForwardIterator2 values_last = values_first + (keys_last - keys_first); - - thrust::detail::temporary_array keys(exec, keys_first, keys_last); - thrust::detail::temporary_array vals(exec, values_first, values_last); - - return thrust::unique_by_key_copy(exec, keys.begin(), keys.end(), vals.begin(), keys_first, values_first, binary_pred); -} // end unique_by_key() - - -template - thrust::pair - unique_by_key_copy(thrust::execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output) -{ - typedef typename thrust::iterator_traits::value_type KeyType; - return thrust::unique_by_key_copy(exec, keys_first, keys_last, values_first, keys_output, values_output, thrust::equal_to()); -} // end unique_by_key_copy() - - -template - thrust::pair - unique_by_key_copy(thrust::execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred) -{ - typedef typename thrust::iterator_traits::difference_type difference_type; - - // empty sequence - if(keys_first == keys_last) - return thrust::make_pair(keys_output, values_output); - - difference_type n = thrust::distance(keys_first, keys_last); - - thrust::detail::temporary_array stencil(exec,n); - - // mark first element in each group - stencil[0] = 1; - thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, stencil.begin() + 1, thrust::detail::not2(binary_pred)); - - thrust::zip_iterator< thrust::tuple > result = - thrust::copy_if(exec, - thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)), - thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)) + n, - stencil.begin(), - thrust::make_zip_iterator(thrust::make_tuple(keys_output, values_output)), - thrust::identity()); - - difference_type output_size = result - thrust::make_zip_iterator(thrust::make_tuple(keys_output, values_output)); - - return thrust::make_pair(keys_output + output_size, values_output + output_size); -} // end unique_by_key_copy() - - -} // end namespace generic -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/decompose.h b/compat/thrust/system/detail/internal/decompose.h deleted file mode 100644 index dea806d69c..0000000000 --- a/compat/thrust/system/detail/internal/decompose.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the 
"License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ - - template - class index_range - { - public: - typedef IndexType index_type; - - __host__ __device__ - index_range(index_type begin, index_type end) : m_begin(begin), m_end(end) {} - - __host__ __device__ - index_type begin(void) const { return m_begin; } - - __host__ __device__ - index_type end(void) const { return m_end; } - - __host__ __device__ - index_type size(void) const { return m_end - m_begin; } - - private: - index_type m_begin; - index_type m_end; - }; - - template - class uniform_decomposition - { - public: - typedef IndexType index_type; - typedef index_range range_type; - - uniform_decomposition(index_type N, index_type granularity, index_type max_intervals) - : m_N(N), - m_intervals((N + granularity - 1) / granularity), - m_threshold(0), - m_small_interval(granularity), - m_large_interval(0) - { - if(m_intervals > max_intervals) - { - m_small_interval = granularity * (m_intervals / max_intervals); - m_large_interval = m_small_interval + granularity; - m_threshold = m_intervals % max_intervals; - m_intervals = max_intervals; - } - } - - __host__ __device__ - index_range operator[](const index_type& i) const - { - if (i < m_threshold) - { - index_type begin = m_large_interval * i; - index_type end = begin + m_large_interval; - return range_type(begin, end); - } - else - { - index_type begin = m_large_interval * m_threshold + m_small_interval * (i - m_threshold); - index_type end = (begin + m_small_interval < m_N) ? begin + m_small_interval : m_N; - return range_type(begin, end); - } - } - - __host__ __device__ - index_type size(void) const - { - return m_intervals; - } - - private: - - index_type m_N; - index_type m_intervals; - index_type m_threshold; - index_type m_small_interval; - index_type m_large_interval; - }; - - -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/adjacent_difference.h b/compat/thrust/system/detail/internal/scalar/adjacent_difference.h deleted file mode 100644 index d1a95aeec4..0000000000 --- a/compat/thrust/system/detail/internal/scalar/adjacent_difference.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file adjacent_difference.h - * \brief Sequential implementation of adjacent_difference. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -OutputIterator adjacent_difference(InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - typedef typename thrust::iterator_traits::value_type InputType; - - if (first == last) - return result; - - InputType curr = *first; - - *result = curr; - - while (++first != last) - { - InputType next = *first; - *(++result) = binary_op(next, curr); - curr = next; - } - - return ++result; -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/binary_search.h b/compat/thrust/system/detail/internal/scalar/binary_search.h deleted file mode 100644 index c3ac49fbd3..0000000000 --- a/compat/thrust/system/detail/internal/scalar/binary_search.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file binary_search.h - * \brief Sequential implementation of binary search algorithms. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -ForwardIterator lower_bound(ForwardIterator first, - ForwardIterator last, - const T& val, - StrictWeakOrdering comp) -{ - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - typedef typename thrust::iterator_difference::type difference_type; - - difference_type len = thrust::distance(first, last); - - while(len > 0) - { - difference_type half = len >> 1; - ForwardIterator middle = first; - - thrust::advance(middle, half); - - if(wrapped_comp(*middle, val)) - { - first = middle; - ++first; - len = len - half - 1; - } - else - { - len = half; - } - } - - return first; -} - - -template -ForwardIterator upper_bound(ForwardIterator first, - ForwardIterator last, - const T& val, - StrictWeakOrdering comp) -{ - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - typedef typename thrust::iterator_difference::type difference_type; - - difference_type len = thrust::distance(first, last); - - while(len > 0) - { - difference_type half = len >> 1; - ForwardIterator middle = first; - - thrust::advance(middle, half); - - if(wrapped_comp(val, *middle)) - { - len = half; - } - else - { - first = middle; - ++first; - len = len - half - 1; - } - } - - return first; -} - -template -bool binary_search(ForwardIterator first, - ForwardIterator last, - const T& val, - StrictWeakOrdering comp) -{ - ForwardIterator iter = thrust::system::detail::internal::scalar::lower_bound(first, last, val, comp); - - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - return iter != last && 
!wrapped_comp(val,*iter); -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/copy.h b/compat/thrust/system/detail/internal/scalar/copy.h deleted file mode 100644 index 42cb385402..0000000000 --- a/compat/thrust/system/detail/internal/scalar/copy.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file copy.h - * \brief Sequential implementations of copy algorithms. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - OutputIterator copy(InputIterator first, - InputIterator last, - OutputIterator result); - -template - OutputIterator copy_n(InputIterator first, - Size n, - OutputIterator result); - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/internal/scalar/copy.inl b/compat/thrust/system/detail/internal/scalar/copy.inl deleted file mode 100644 index 8c9f5c2825..0000000000 --- a/compat/thrust/system/detail/internal/scalar/copy.inl +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
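// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the diff above: the relation the removed
// scalar binary_search relies on -- the search succeeds exactly when
// lower_bound lands on an element that is not strictly greater than the key.
// Hedged std-based restatement; binary_search_sketch is a hypothetical name.
#include <algorithm>
#include <vector>

template <typename T, typename Compare>
bool binary_search_sketch(const std::vector<T>& v, const T& val, Compare comp)
{
  typename std::vector<T>::const_iterator it =
    std::lower_bound(v.begin(), v.end(), val, comp);  // first element with !comp(*it, val)
  return it != v.end() && !comp(val, *it);            // equal under comp from both sides
}
// ---------------------------------------------------------------------------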
- */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ -namespace copy_detail -{ - - -// returns the raw pointer associated with a Pointer-like thing -template - typename thrust::detail::pointer_traits::raw_pointer - get(Pointer ptr) -{ - return thrust::detail::pointer_traits::get(ptr); -} - - -template - OutputIterator copy(InputIterator first, - InputIterator last, - OutputIterator result, - thrust::detail::true_type) // is_trivial_copy -{ - typedef typename thrust::iterator_difference::type Size; - - const Size n = last - first; - thrust::system::detail::internal::scalar::trivial_copy_n(get(&*first), n, get(&*result)); - return result + n; -} // end copy() - - -template - OutputIterator copy(InputIterator first, - InputIterator last, - OutputIterator result, - thrust::detail::false_type) // is_trivial_copy -{ - return thrust::system::detail::internal::scalar::general_copy(first,last,result); -} // end copy() - - -template - OutputIterator copy_n(InputIterator first, - Size n, - OutputIterator result, - thrust::detail::true_type) // is_trivial_copy -{ - thrust::system::detail::internal::scalar::trivial_copy_n(get(&*first), n, get(&*result)); - return result + n; -} // end copy_n() - - -template - OutputIterator copy_n(InputIterator first, - Size n, - OutputIterator result, - thrust::detail::false_type) // is_trivial_copy -{ - return thrust::system::detail::internal::scalar::general_copy_n(first,n,result); -} // end copy_n() - -} // end namespace copy_detail - - -template - OutputIterator copy(InputIterator first, - InputIterator last, - OutputIterator result) -{ - return thrust::system::detail::internal::scalar::copy_detail::copy(first, last, result, - typename thrust::detail::dispatch::is_trivial_copy::type()); -} // end copy() - - -template - OutputIterator copy_n(InputIterator first, - Size n, - OutputIterator result) -{ - return thrust::system::detail::internal::scalar::copy_detail::copy_n(first, n, result, - typename thrust::detail::dispatch::is_trivial_copy::type()); -} // end copy_n() - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/copy_backward.h b/compat/thrust/system/detail/internal/scalar/copy_backward.h deleted file mode 100644 index 36f8f66e44..0000000000 --- a/compat/thrust/system/detail/internal/scalar/copy_backward.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
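// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the diff above: the shape of the
// dispatch in the removed scalar copy.inl. Thrust's trivial path goes
// through its own trivial_copy_n; this hedged stand-in uses std::memcpy
// for contiguous, trivially copyable, non-overlapping ranges.
#include <cstddef>
#include <cstring>
#include <type_traits>

template <typename T>
T* copy_n_sketch(const T* first, std::size_t n, T* result)
{
  if (std::is_trivially_copyable<T>::value)
    std::memcpy(result, first, n * sizeof(T));                 // "trivial copy" fast path
  else
    for (std::size_t i = 0; i != n; ++i) result[i] = first[i]; // general element loop
  return result + n;
}
// ---------------------------------------------------------------------------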
- */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -BidirectionalIterator2 copy_backward(BidirectionalIterator1 first, - BidirectionalIterator1 last, - BidirectionalIterator2 result) -{ - while (first != last) - { - --last; - --result; - *result = *last; - } - - return result; -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/copy_if.h b/compat/thrust/system/detail/internal/scalar/copy_if.h deleted file mode 100644 index 67f9402335..0000000000 --- a/compat/thrust/system/detail/internal/scalar/copy_if.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file copy_if.h - * \brief Sequential implementation of copy_if. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - OutputIterator copy_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - thrust::detail::host_function wrapped_pred(pred); - - while(first != last) - { - if(wrapped_pred(*stencil)) - { - *result = *first; - ++result; - } // end if - - ++first; - ++stencil; - } // end while - - return result; -} // end copy_if() - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/extrema.h b/compat/thrust/system/detail/internal/scalar/extrema.h deleted file mode 100644 index ebea756d8c..0000000000 --- a/compat/thrust/system/detail/internal/scalar/extrema.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file extrema.h - * \brief Sequential implementations of extrema functions. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -ForwardIterator min_element(ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - // wrap comp - thrust::detail::host_function< - BinaryPredicate, - bool - > wrapped_comp(comp); - - ForwardIterator imin = first; - - for (; first != last; first++) - { - if (wrapped_comp(*first, *imin)) - { - imin = first; - } - } - - return imin; -} - - -template -ForwardIterator max_element(ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - // wrap comp - thrust::detail::host_function< - BinaryPredicate, - bool - > wrapped_comp(comp); - - ForwardIterator imax = first; - - for (; first != last; first++) - { - if (wrapped_comp(*imax, *first)) - { - imax = first; - } - } - - return imax; -} - - -template -thrust::pair minmax_element(ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - // wrap comp - thrust::detail::host_function< - BinaryPredicate, - bool - > wrapped_comp(comp); - - ForwardIterator imin = first; - ForwardIterator imax = first; - - for (; first != last; first++) - { - if (wrapped_comp(*first, *imin)) - { - imin = first; - } - - if (wrapped_comp(*imax, *first)) - { - imax = first; - } - } - - return thrust::make_pair(imin, imax); -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/find.h b/compat/thrust/system/detail/internal/scalar/find.h deleted file mode 100644 index 6b2502199e..0000000000 --- a/compat/thrust/system/detail/internal/scalar/find.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file find.h - * \brief Sequential implementation of find_if. 
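// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the diff above: what the removed scalar
// minmax_element computes in one pass, exercised through the public Thrust
// entry point. Assumes v is non-empty; minmax_demo is a hypothetical name.
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <vector>

void minmax_demo(const std::vector<int>& v, int& lo, int& hi)
{
  thrust::pair<std::vector<int>::const_iterator,
               std::vector<int>::const_iterator> mm =
    thrust::minmax_element(v.begin(), v.end());   // single pass over the data
  lo = *mm.first;
  hi = *mm.second;
}
// ---------------------------------------------------------------------------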
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -InputIterator find_if(InputIterator first, - InputIterator last, - Predicate pred) -{ - // wrap pred - thrust::detail::host_function< - Predicate, - bool - > wrapped_pred(pred); - - while(first != last) - { - if (wrapped_pred(*first)) - return first; - - ++first; - } - - // return first so zip_iterator works correctly - return first; -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/for_each.h b/compat/thrust/system/detail/internal/scalar/for_each.h deleted file mode 100644 index 4e31d9183b..0000000000 --- a/compat/thrust/system/detail/internal/scalar/for_each.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file for_each.h - * \brief Sequential implementations of for_each functions. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -InputIterator for_each(InputIterator first, - InputIterator last, - UnaryFunction f) -{ - // wrap f - thrust::detail::host_function< - UnaryFunction, - void - > wrapped_f(f); - - for(; first != last; ++first) - { - wrapped_f(*first); - } - - return first; -} // end for_each() - -template -InputIterator for_each_n(InputIterator first, - Size n, - UnaryFunction f) -{ - // wrap f - thrust::detail::host_function< - UnaryFunction, - void - > wrapped_f(f); - - for(Size i = 0; i != n; i++) - { - // we can dereference an OutputIterator if f does not - // try to use the reference for anything besides assignment - wrapped_f(*first); - ++first; - } - - return first; -} // end for_each_n() - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/general_copy.h b/compat/thrust/system/detail/internal/scalar/general_copy.h deleted file mode 100644 index aae061d31d..0000000000 --- a/compat/thrust/system/detail/internal/scalar/general_copy.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! 
\file general_copy.h - * \brief Sequential copy algorithms for general iterators. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - OutputIterator general_copy(InputIterator first, - InputIterator last, - OutputIterator result) -{ - for(; first != last; ++first, ++result) - *result = *first; - return result; -} // end general_copy() - - -template - OutputIterator general_copy_n(InputIterator first, - Size n, - OutputIterator result) -{ - for(; n > Size(0); ++first, ++result, --n) - *result = *first; - return result; -} // end general_copy_n() - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/insertion_sort.h b/compat/thrust/system/detail/internal/scalar/insertion_sort.h deleted file mode 100644 index 5949ce7a65..0000000000 --- a/compat/thrust/system/detail/internal/scalar/insertion_sort.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -void insertion_sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_value::type value_type; - - if (first == last) return; - - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - for(RandomAccessIterator i = first + 1; i != last; ++i) - { - value_type tmp = *i; - - if (wrapped_comp(tmp, *first)) - { - // tmp is the smallest value encountered so far - thrust::system::detail::internal::scalar::copy_backward(first, i, i + 1); - - *first = tmp; - } - else - { - // tmp is not the smallest value, can avoid checking for j == first - RandomAccessIterator j = i; - RandomAccessIterator k = i - 1; - - while(wrapped_comp(tmp, *k)) - { - *j = *k; - j = k; - --k; - } - - *j = tmp; - } - } -} - -template -void insertion_sort_by_key(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_value::type value_type1; - typedef typename thrust::iterator_value::type value_type2; - - if (first1 == last1) return; - - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - RandomAccessIterator1 i1 = first1 + 1; - RandomAccessIterator2 i2 = first2 + 1; - - for(; i1 != last1; ++i1, ++i2) - { - value_type1 tmp1 = *i1; - value_type2 tmp2 = *i2; - - if (wrapped_comp(tmp1, *first1)) - { - // tmp is the smallest value encountered so far - thrust::system::detail::internal::scalar::copy_backward(first1, i1, i1 + 1); - thrust::system::detail::internal::scalar::copy_backward(first2, i2, i2 + 1); - - 
*first1 = tmp1; - *first2 = tmp2; - } - else - { - // tmp is not the smallest value, can avoid checking for j == first - RandomAccessIterator1 j1 = i1; - RandomAccessIterator1 k1 = i1 - 1; - - RandomAccessIterator2 j2 = i2; - RandomAccessIterator2 k2 = i2 - 1; - - while(wrapped_comp(tmp1, *k1)) - { - *j1 = *k1; - *j2 = *k2; - - j1 = k1; - j2 = k2; - - --k1; - --k2; - } - - *j1 = tmp1; - *j2 = tmp2; - } - } -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/merge.h b/compat/thrust/system/detail/internal/scalar/merge.h deleted file mode 100644 index c02fca44b6..0000000000 --- a/compat/thrust/system/detail/internal/scalar/merge.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file merge.h - * \brief Sequential implementation of merge algorithms. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -OutputIterator merge(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp); - -template -thrust::pair - merge_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp); - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/internal/scalar/merge.inl b/compat/thrust/system/detail/internal/scalar/merge.inl deleted file mode 100644 index a7c2a393c4..0000000000 --- a/compat/thrust/system/detail/internal/scalar/merge.inl +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
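// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the diff above: insertion_sort_by_key
// above reorders keys[] and vals[] in lockstep and is stable. The same
// lockstep, stable reordering expressed with a different technique -- a
// stable sort of an index permutation followed by a gather -- shown only
// for comparison; all names here are hypothetical.
#include <algorithm>
#include <cstddef>
#include <vector>

struct order_by_key
{
  const std::vector<int>* keys;
  bool operator()(std::size_t a, std::size_t b) const { return (*keys)[a] < (*keys)[b]; }
};

void sort_by_key_sketch(std::vector<int>& keys, std::vector<float>& vals)
{
  std::vector<std::size_t> order(keys.size());
  for (std::size_t i = 0; i < order.size(); ++i) order[i] = i;

  order_by_key cmp = { &keys };
  std::stable_sort(order.begin(), order.end(), cmp);   // sort the permutation by key

  std::vector<int>   k2(keys.size());
  std::vector<float> v2(vals.size());
  for (std::size_t i = 0; i < order.size(); ++i)
  {
    k2[i] = keys[order[i]];
    v2[i] = vals[order[i]];
  }
  keys.swap(k2);
  vals.swap(v2);                                       // keys and values end up co-sorted
}
// ---------------------------------------------------------------------------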
- */ - -#include - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -OutputIterator merge(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - while(first1 != last1 && first2 != last2) - { - if(wrapped_comp(*first2, *first1)) - { - *result = *first2; - ++first2; - } // end if - else - { - *result = *first1; - ++first1; - } // end else - - ++result; - } // end while - - return thrust::system::detail::internal::scalar::copy(first2, last2, thrust::system::detail::internal::scalar::copy(first1, last1, result)); -} // end merge() - - -template -thrust::pair - merge_by_key(InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - while(keys_first1 != keys_last1 && keys_first2 != keys_last2) - { - if(!wrapped_comp(*keys_first2, *keys_first1)) - { - // *keys_first1 <= *keys_first2 - *keys_result = *keys_first1; - *values_result = *values_first1; - ++keys_first1; - ++values_first1; - } - else - { - // *keys_first1 > keys_first2 - *keys_result = *keys_first2; - *values_result = *values_first2; - ++keys_first2; - ++values_first2; - } - - ++keys_result; - ++values_result; - } - - while(keys_first1 != keys_last1) - { - *keys_result = *keys_first1; - *values_result = *values_first1; - ++keys_first1; - ++values_first1; - ++keys_result; - ++values_result; - } - - while(keys_first2 != keys_last2) - { - *keys_result = *keys_first2; - *values_result = *values_first2; - ++keys_first2; - ++values_first2; - ++keys_result; - ++values_result; - } - - return thrust::make_pair(keys_result, values_result); -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/partition.h b/compat/thrust/system/detail/internal/scalar/partition.h deleted file mode 100644 index 7ba677ef22..0000000000 --- a/compat/thrust/system/detail/internal/scalar/partition.h +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file partition.h - * \brief Sequential implementations of partition functions. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -void iter_swap(ForwardIterator1 iter1, ForwardIterator2 iter2) -{ - // XXX this isn't correct because it doesn't use thrust::swap - using namespace thrust::detail; - - typedef typename thrust::iterator_value::type T; - - T temp = *iter1; - *iter1 = *iter2; - *iter2 = temp; -} - -template - ForwardIterator partition(ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - if (first == last) - return first; - - // wrap pred - thrust::detail::host_function< - Predicate, - bool - > wrapped_pred(pred); - - while (wrapped_pred(*first)) - { - if (++first == last) - return first; - } - - ForwardIterator next = first; - - while (++next != last) - { - if (wrapped_pred(*next)) - { - iter_swap(first, next); - ++first; - } - } - - return first; -} - -template - ForwardIterator stable_partition(ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - // wrap pred - thrust::detail::host_function< - Predicate, - bool - > wrapped_pred(pred); - - // XXX the type of exec should be: - // typedef decltype(select_system(first, last)) system; - typedef typename thrust::iterator_system::type ExecutionPolicy; - typedef typename thrust::iterator_value::type T; - - typedef thrust::detail::temporary_array TempRange; - typedef typename TempRange::iterator TempIterator; - - // XXX presumes ExecutionPolicy is default constructible - ExecutionPolicy exec; - TempRange temp(exec, first, last); - - for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter) - { - if (wrapped_pred(*iter)) - { - *first = *iter; - ++first; - } - } - - ForwardIterator middle = first; - - for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter) - { - if (!wrapped_pred(*iter)) - { - *first = *iter; - ++first; - } - } - - return middle; -} - -template - ForwardIterator stable_partition(ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - // wrap pred - thrust::detail::host_function< - Predicate, - bool - > wrapped_pred(pred); - - // XXX the type of exec should be: - // typedef decltype(select_system(first, stencil)) system; - typedef typename thrust::iterator_system::type ExecutionPolicy; - typedef typename thrust::iterator_value::type T; - - typedef thrust::detail::temporary_array TempRange; - typedef typename TempRange::iterator TempIterator; - - // XXX presumes ExecutionPolicy is default constructible - ExecutionPolicy exec; - TempRange temp(exec, first, last); - - InputIterator stencil_iter = stencil; - for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter, ++stencil_iter) - { - if (wrapped_pred(*stencil_iter)) - { - *first = *iter; - ++first; - } - } - - ForwardIterator middle = first; - stencil_iter = stencil; - - for(TempIterator iter = temp.begin(); iter != temp.end(); ++iter, ++stencil_iter) - { - if (!wrapped_pred(*stencil_iter)) - { - *first = *iter; - ++first; - } - } - - return middle; -} - -template - thrust::pair - stable_partition_copy(InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - // wrap pred - thrust::detail::host_function< - Predicate, - bool - > wrapped_pred(pred); - - for(; first != last; ++first) - { - if(wrapped_pred(*first)) - { - *out_true = *first; - ++out_true; - } // end if - else - { - *out_false = *first; - ++out_false; - } // end else - } - - return 
thrust::make_pair(out_true, out_false); -} - -template - thrust::pair - stable_partition_copy(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - // wrap pred - thrust::detail::host_function< - Predicate, - bool - > wrapped_pred(pred); - - for(; first != last; ++first, ++stencil) - { - if(wrapped_pred(*stencil)) - { - *out_true = *first; - ++out_true; - } // end if - else - { - *out_false = *first; - ++out_false; - } // end else - } - - return thrust::make_pair(out_true, out_false); -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/reduce.h b/compat/thrust/system/detail/internal/scalar/reduce.h deleted file mode 100644 index 7ad430ea50..0000000000 --- a/compat/thrust/system/detail/internal/scalar/reduce.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce.h - * \brief Sequential implementation of reduce algorithm. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - OutputType reduce(InputIterator begin, - InputIterator end, - OutputType init, - BinaryFunction binary_op) -{ - // wrap binary_op - thrust::detail::host_function< - BinaryFunction, - OutputType - > wrapped_binary_op(binary_op); - - // initialize the result - OutputType result = init; - - while(begin != end) - { - result = wrapped_binary_op(result, *begin); - ++begin; - } // end while - - return result; -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/reduce_by_key.h b/compat/thrust/system/detail/internal/scalar/reduce_by_key.h deleted file mode 100644 index eeacb9dd49..0000000000 --- a/compat/thrust/system/detail/internal/scalar/reduce_by_key.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - thrust::pair - reduce_by_key(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - typedef typename thrust::iterator_traits::value_type InputKeyType; - typedef typename thrust::iterator_traits::value_type InputValueType; - - typedef typename thrust::detail::intermediate_type_from_function_and_iterators< - InputIterator2, - OutputIterator2, - BinaryFunction - >::type TemporaryType; - - if(keys_first != keys_last) - { - InputKeyType temp_key = *keys_first; - TemporaryType temp_value = *values_first; - - for(++keys_first, ++values_first; - keys_first != keys_last; - ++keys_first, ++values_first) - { - InputKeyType key = *keys_first; - InputValueType value = *values_first; - - if (binary_pred(temp_key, key)) - { - temp_value = binary_op(temp_value, value); - } - else - { - *keys_output = temp_key; - *values_output = temp_value; - - ++keys_output; - ++values_output; - - temp_key = key; - temp_value = value; - } - } - - *keys_output = temp_key; - *values_output = temp_value; - - ++keys_output; - ++values_output; - } - - return thrust::make_pair(keys_output, values_output); -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/remove.h b/compat/thrust/system/detail/internal/scalar/remove.h deleted file mode 100644 index 2360019f11..0000000000 --- a/compat/thrust/system/detail/internal/scalar/remove.h +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file remove.h - * \brief Sequential implementations of remove functions. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - ForwardIterator remove_if(ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - // wrap pred - thrust::detail::host_function< - Predicate, - bool - > wrapped_pred(pred); - - // advance iterators until wrapped_pred(*first) is true or we reach the end of input - while(first != last && !wrapped_pred(*first)) - ++first; - - if(first == last) - return first; - - // result always trails first - ForwardIterator result = first; - - ++first; - - while(first != last) - { - if(!wrapped_pred(*first)) - { - *result = *first; - ++result; - } - ++first; - } - - return result; -} - - -template - ForwardIterator remove_if(ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - // wrap pred - thrust::detail::host_function< - Predicate, - bool - > wrapped_pred(pred); - - // advance iterators until wrapped_pred(*stencil) is true or we reach the end of input - while(first != last && !wrapped_pred(*stencil)) - { - ++first; - ++stencil; - } - - if(first == last) - return first; - - // result always trails first - ForwardIterator result = first; - - ++first; - ++stencil; - - while(first != last) - { - if(!wrapped_pred(*stencil)) - { - *result = *first; - ++result; - } - ++first; - ++stencil; - } - - return result; -} - - -template - OutputIterator remove_copy_if(InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred) -{ - // wrap pred - thrust::detail::host_function< - Predicate, - bool - > wrapped_pred(pred); - - while (first != last) - { - if (!wrapped_pred(*first)) - { - *result = *first; - ++result; - } - - ++first; - } - - return result; -} - -template - OutputIterator remove_copy_if(InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - // wrap pred - thrust::detail::host_function< - Predicate, - bool - > wrapped_pred(pred); - - while (first != last) - { - if (!wrapped_pred(*stencil)) - { - *result = *first; - ++result; - } - - ++first; - ++stencil; - } - - return result; -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/scan.h b/compat/thrust/system/detail/internal/scalar/scan.h deleted file mode 100644 index 8f41150247..0000000000 --- a/compat/thrust/system/detail/internal/scalar/scan.h +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file scan.h - * \brief Sequential implementations of scan functions. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - OutputIterator inclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - // the pseudocode for deducing the type of the temporary used below: - // - // if BinaryFunction is AdaptableBinaryFunction - // TemporaryType = AdaptableBinaryFunction::result_type - // else if OutputIterator is a "pure" output iterator - // TemporaryType = InputIterator::value_type - // else - // TemporaryType = OutputIterator::value_type - // - // XXX upon c++0x, TemporaryType needs to be: - // result_of::type - - using namespace thrust::detail; - - typedef typename eval_if< - has_result_type::value, - result_type, - eval_if< - is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - >::type ValueType; - - // wrap binary_op - thrust::detail::host_function< - BinaryFunction, - ValueType - > wrapped_binary_op(binary_op); - - if(first != last) - { - ValueType sum = *first; - - *result = sum; - - for(++first, ++result; first != last; ++first, ++result) - *result = sum = wrapped_binary_op(sum,*first); - } - - return result; -} - - -template - OutputIterator exclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - T init, - BinaryFunction binary_op) -{ - // the pseudocode for deducing the type of the temporary used below: - // - // if BinaryFunction is AdaptableBinaryFunction - // TemporaryType = AdaptableBinaryFunction::result_type - // else if OutputIterator is a "pure" output iterator - // TemporaryType = InputIterator::value_type - // else - // TemporaryType = OutputIterator::value_type - // - // XXX upon c++0x, TemporaryType needs to be: - // result_of::type - - using namespace thrust::detail; - - typedef typename eval_if< - has_result_type::value, - result_type, - eval_if< - is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - >::type ValueType; - - if(first != last) - { - ValueType tmp = *first; // temporary value allows in-situ scan - ValueType sum = init; - - *result = sum; - sum = binary_op(sum, tmp); - - for(++first, ++result; first != last; ++first, ++result) - { - tmp = *first; - *result = sum; - sum = binary_op(sum, tmp); - } - } - - return result; -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/scan_by_key.h b/compat/thrust/system/detail/internal/scalar/scan_by_key.h deleted file mode 100644 index a31fc60bab..0000000000 --- a/compat/thrust/system/detail/internal/scalar/scan_by_key.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! 
\file scan_by_key.h - * \brief Sequential implementation of scan_by_key functions. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - OutputIterator inclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - using namespace thrust::detail; - - typedef typename thrust::iterator_traits::value_type KeyType; - typedef typename thrust::iterator_traits::value_type ValueType; - - // wrap binary_op - thrust::detail::host_function< - BinaryFunction, - ValueType - > wrapped_binary_op(binary_op); - - if(first1 != last1) - { - KeyType prev_key = *first1; - ValueType prev_value = *first2; - - *result = prev_value; - - for(++first1, ++first2, ++result; - first1 != last1; - ++first1, ++first2, ++result) - { - KeyType key = *first1; - - if (binary_pred(prev_key, key)) - *result = prev_value = wrapped_binary_op(prev_value,*first2); - else - *result = prev_value = *first2; - - prev_key = key; - } - } - - return result; -} - - -template - OutputIterator exclusive_scan_by_key(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - T init, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - using namespace thrust::detail; - - typedef typename thrust::iterator_traits::value_type KeyType; - typedef typename thrust::iterator_traits::value_type ValueType; - - if(first1 != last1) - { - KeyType temp_key = *first1; - ValueType temp_value = *first2; - - ValueType next = init; - - // first one is init - *result = next; - - next = binary_op(next, temp_value); - - for(++first1, ++first2, ++result; - first1 != last1; - ++first1, ++first2, ++result) - { - KeyType key = *first1; - - // use temp to permit in-place scans - temp_value = *first2; - - if (!binary_pred(temp_key, key)) - next = init; // reset sum - - *result = next; - next = binary_op(next, temp_value); - - temp_key = key; - } - } - - return result; -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/set_operations.h b/compat/thrust/system/detail/internal/scalar/set_operations.h deleted file mode 100644 index f85b5108af..0000000000 --- a/compat/thrust/system/detail/internal/scalar/set_operations.h +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file set_operations.h - * \brief Sequential implementation of set operation functions. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - OutputIterator set_difference(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - while(first1 != last1 && first2 != last2) - { - if(wrapped_comp(*first1,*first2)) - { - *result = *first1; - ++first1; - ++result; - } // end if - else if(wrapped_comp(*first2,*first1)) - { - ++first2; - } // end else if - else - { - ++first1; - ++first2; - } // end else - } // end while - - return scalar::copy(first1, last1, result); -} // end set_difference() - - -template - OutputIterator set_intersection(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - while(first1 != last1 && first2 != last2) - { - if(wrapped_comp(*first1,*first2)) - { - ++first1; - } // end if - else if(wrapped_comp(*first2,*first1)) - { - ++first2; - } // end else if - else - { - *result = *first1; - ++first1; - ++first2; - ++result; - } // end else - } // end while - - return result; -} // end set_intersection() - - -template - OutputIterator set_symmetric_difference(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - while(first1 != last1 && first2 != last2) - { - if(wrapped_comp(*first1,*first2)) - { - *result = *first1; - ++first1; - ++result; - } // end if - else if(wrapped_comp(*first2,*first1)) - { - *result = *first2; - ++first2; - ++result; - } // end else if - else - { - ++first1; - ++first2; - } // end else - } // end while - - return scalar::copy(first2, last2, scalar::copy(first1, last1, result)); -} // end set_symmetric_difference() - - -template - OutputIterator set_union(InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - // wrap comp - thrust::detail::host_function< - StrictWeakOrdering, - bool - > wrapped_comp(comp); - - while(first1 != last1 && first2 != last2) - { - if(wrapped_comp(*first1,*first2)) - { - *result = *first1; - ++first1; - } // end if - else if(wrapped_comp(*first2,*first1)) - { - *result = *first2; - ++first2; - } // end else if - else - { - *result = *first1; - ++first1; - ++first2; - } // end else - - ++result; - } // end while - - return scalar::copy(first2, last2, scalar::copy(first1, last1, result)); -} // end set_union() - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/sort.h b/compat/thrust/system/detail/internal/scalar/sort.h deleted file mode 100644 index 9e465c8ca0..0000000000 --- a/compat/thrust/system/detail/internal/scalar/sort.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file sort.h - * \brief Sequential implementations of sort algorithms. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -void stable_sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - -template -void stable_sort_by_key(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp); - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/internal/scalar/sort.inl b/compat/thrust/system/detail/internal/scalar/sort.inl deleted file mode 100644 index c6ed27324c..0000000000 --- a/compat/thrust/system/detail/internal/scalar/sort.inl +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -#include - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ -namespace sort_detail -{ - -//////////////////// -// Primitive Sort // -//////////////////// - -template<typename RandomAccessIterator, - typename StrictWeakOrdering> -void stable_sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp, - thrust::detail::true_type) -{ - thrust::system::detail::internal::scalar::stable_primitive_sort(first, last); - - // if comp is greater then reverse the keys - typedef typename thrust::iterator_traits<RandomAccessIterator>::value_type KeyType; - const static bool reverse = thrust::detail::is_same<StrictWeakOrdering, thrust::greater<KeyType> >::value; - - if (reverse) - thrust::reverse(first, last); -} - -template<typename RandomAccessIterator1, - typename RandomAccessIterator2, - typename StrictWeakOrdering> -void stable_sort_by_key(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp, - thrust::detail::true_type) -{ - // if comp is greater then reverse the keys and values - typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType; - const static bool reverse = thrust::detail::is_same<StrictWeakOrdering, thrust::greater<KeyType> >::value; - - // note, we also have to reverse the (unordered) input to preserve stability - if (reverse) - { - thrust::reverse(first1, last1); - thrust::reverse(first2, first2 + (last1 - first1)); - } - - thrust::system::detail::internal::scalar::stable_primitive_sort_by_key(first1, last1, first2); - - if (reverse) - { - thrust::reverse(first1, last1); - thrust::reverse(first2, first2 + (last1 - first1)); - } -} - -//////////////// -// Merge Sort // -//////////////// - -template<typename RandomAccessIterator, - typename StrictWeakOrdering> -void stable_sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp, - thrust::detail::false_type) -{ - thrust::system::detail::internal::scalar::stable_merge_sort(first, last, comp); -} - -template<typename RandomAccessIterator1, - typename RandomAccessIterator2, - typename StrictWeakOrdering> -void stable_sort_by_key(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp, - thrust::detail::false_type) -{ - thrust::system::detail::internal::scalar::stable_merge_sort_by_key(first1, last1, first2, comp); -} - - -} // end namespace sort_detail - -template<typename RandomAccessIterator, - typename StrictWeakOrdering> -void stable_sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_traits<RandomAccessIterator>::value_type KeyType; - static const bool use_primitive_sort = thrust::detail::is_arithmetic<KeyType>::value && - (thrust::detail::is_same<StrictWeakOrdering, thrust::less<KeyType> >::value || - thrust::detail::is_same<StrictWeakOrdering, thrust::greater<KeyType> >::value); - - // suppress unused variable warning - (void) use_primitive_sort; - - thrust::system::detail::internal::scalar::sort_detail::stable_sort - (first, last, comp, - thrust::detail::integral_constant<bool, use_primitive_sort>()); -} - -template<typename RandomAccessIterator1, - typename RandomAccessIterator2, - typename StrictWeakOrdering> -void stable_sort_by_key(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType; - static const bool use_primitive_sort = thrust::detail::is_arithmetic<KeyType>::value && - (thrust::detail::is_same<StrictWeakOrdering, thrust::less<KeyType> >::value || - thrust::detail::is_same<StrictWeakOrdering, thrust::greater<KeyType> >::value); - - // suppress unused variable warning - (void) use_primitive_sort; - - thrust::system::detail::internal::scalar::sort_detail::stable_sort_by_key - (first1, last1, first2, comp, - thrust::detail::integral_constant<bool, use_primitive_sort>()); -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/stable_merge_sort.h b/compat/thrust/system/detail/internal/scalar/stable_merge_sort.h deleted file 
mode 100644 index f68242cc5a..0000000000 --- a/compat/thrust/system/detail/internal/scalar/stable_merge_sort.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file stable_merge_sort.h - * \brief Sequential implementation of merge sort. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace cpp -{ -namespace detail -{ - -template -void stable_merge_sort(RandomAccessIterator begin, - RandomAccessIterator end, - StrictWeakOrdering comp); - -template -void stable_merge_sort_by_key(RandomAccessIterator1 keys_begin, - RandomAccessIterator1 keys_end, - RandomAccessIterator2 values_begin, - StrictWeakOrdering comp); - -} // end namespace detail -} // end namespace cpp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/internal/scalar/stable_merge_sort.inl b/compat/thrust/system/detail/internal/scalar/stable_merge_sort.inl deleted file mode 100644 index 41d320cb08..0000000000 --- a/compat/thrust/system/detail/internal/scalar/stable_merge_sort.inl +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ -namespace detail -{ - -template -void inplace_merge(RandomAccessIterator first, - RandomAccessIterator middle, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - // XXX the type of exec should be: - // typedef decltype(select_system(first, middle, last)) DerivedPolicy; - typedef typename thrust::iterator_system::type DerivedPolicy; - typedef typename thrust::iterator_value::type value_type; - - // XXX assumes DerivedPolicy is default constructible - // XXX find a way to get a stateful execution policy into this function - // or simply pass scratch space - DerivedPolicy exec; - thrust::detail::temporary_array a(exec, first, middle); - thrust::detail::temporary_array b(exec, middle, last); - - thrust::system::detail::internal::scalar::merge(a.begin(), a.end(), b.begin(), b.end(), first, comp); -} - -template -void inplace_merge_by_key(RandomAccessIterator1 first1, - RandomAccessIterator1 middle1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp) -{ - // XXX the type of exec should be: - // typedef decltype(select_system(first1, middle1, last1, first2)) DerivedPolicy; - typedef typename thrust::iterator_system::type DerivedPolicy; - typedef typename thrust::iterator_value::type value_type1; - typedef typename thrust::iterator_value::type value_type2; - - RandomAccessIterator2 middle2 = first2 + (middle1 - first1); - RandomAccessIterator2 last2 = first2 + (last1 - first1); - - // XXX assumes DerivedPolicy is default constructible - // XXX find a way to get a stateful exec into this function - // or simply pass scratch space - DerivedPolicy exec; - thrust::detail::temporary_array lhs1(exec, first1, middle1); - thrust::detail::temporary_array rhs1(exec, middle1, last1); - thrust::detail::temporary_array lhs2(exec, first2, middle2); - thrust::detail::temporary_array rhs2(exec, middle2, last2); - - thrust::system::detail::internal::scalar::merge_by_key - (lhs1.begin(), lhs1.end(), rhs1.begin(), rhs1.end(), - lhs2.begin(), rhs2.begin(), - first1, first2, comp); -} - -} // end namespace detail - -////////////// -// Key Sort // -////////////// - -template -void stable_merge_sort(RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - if (last - first < 32) - { - thrust::system::detail::internal::scalar::insertion_sort(first, last, comp); - } - else - { - RandomAccessIterator middle = first + (last - first) / 2; - - thrust::system::detail::internal::scalar::stable_merge_sort(first, middle, comp); - thrust::system::detail::internal::scalar::stable_merge_sort(middle, last, comp); - detail::inplace_merge(first, middle, last, comp); - } -} - - -//////////////////// -// Key-Value Sort // -//////////////////// - -template -void stable_merge_sort_by_key(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp) -{ - if (last1 - first1 <= 32) - { - thrust::system::detail::internal::scalar::insertion_sort_by_key(first1, last1, first2, comp); - } - else - { - RandomAccessIterator1 middle1 = first1 + (last1 - first1) / 2; - RandomAccessIterator2 middle2 = first2 + (last1 - first1) / 2; - - thrust::system::detail::internal::scalar::stable_merge_sort_by_key(first1, middle1, first2, comp); - thrust::system::detail::internal::scalar::stable_merge_sort_by_key(middle1, last1, middle2, comp); - 
detail::inplace_merge_by_key(first1, middle1, last1, first2, comp); - } -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.h b/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.h deleted file mode 100644 index f37bf27cd1..0000000000 --- a/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template -void stable_primitive_sort(RandomAccessIterator first, - RandomAccessIterator last); - -template -void stable_primitive_sort_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first); - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.inl b/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.inl deleted file mode 100644 index c22b15c0e4..0000000000 --- a/compat/thrust/system/detail/internal/scalar/stable_primitive_sort.inl +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ -namespace stable_primitive_sort_detail -{ - - -template - struct enable_if_bool_sort - : thrust::detail::enable_if< - thrust::detail::is_same< - bool, - typename thrust::iterator_value::type - >::value - > -{}; - - -template - struct disable_if_bool_sort - : thrust::detail::disable_if< - thrust::detail::is_same< - bool, - typename thrust::iterator_value::type - >::value - > -{}; - - - -template - typename enable_if_bool_sort::type - stable_primitive_sort(RandomAccessIterator first, RandomAccessIterator last) -{ - // use stable_partition if we're sorting bool - // stable_partition puts true values first, so we need to logical_not - scalar::stable_partition(first, last, thrust::logical_not()); -} - - -template - typename disable_if_bool_sort::type - stable_primitive_sort(RandomAccessIterator first, RandomAccessIterator last) -{ - // call stable_radix_sort - scalar::stable_radix_sort(first,last); -} - - -struct logical_not_first -{ - template - __host__ __device__ - bool operator()(Tuple t) - { - return !thrust::get<0>(t); - } -}; - - -template - typename enable_if_bool_sort::type - stable_primitive_sort_by_key(RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - // use stable_partition if we're sorting bool - // stable_partition puts true values first, so we need to logical_not - scalar::stable_partition(thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)), - thrust::make_zip_iterator(thrust::make_tuple(keys_last, values_first)), - logical_not_first()); -} - - -template - typename disable_if_bool_sort::type - stable_primitive_sort_by_key(RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - // call stable_radix_sort_by_key - scalar::stable_radix_sort_by_key(keys_first, keys_last, values_first); -} - - -} - -template -void stable_primitive_sort(RandomAccessIterator first, - RandomAccessIterator last) -{ - scalar::stable_primitive_sort_detail::stable_primitive_sort(first,last); -} - -template -void stable_primitive_sort_by_key(RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first) -{ - scalar::stable_primitive_sort_detail::stable_primitive_sort_by_key(keys_first, keys_last, values_first); -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/detail/internal/scalar/stable_radix_sort.h b/compat/thrust/system/detail/internal/scalar/stable_radix_sort.h deleted file mode 100644 index f2af22263a..0000000000 --- a/compat/thrust/system/detail/internal/scalar/stable_radix_sort.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file stable_radix_sort.h - * \brief Sequential implementation of radix sort. - */ - -#pragma once - -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template<typename RandomAccessIterator> -void stable_radix_sort(RandomAccessIterator begin, - RandomAccessIterator end); - -template<typename RandomAccessIterator1, - typename RandomAccessIterator2> -void stable_radix_sort_by_key(RandomAccessIterator1 keys_begin, - RandomAccessIterator1 keys_end, - RandomAccessIterator2 values_begin); - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - -#include - - diff --git a/compat/thrust/system/detail/internal/scalar/stable_radix_sort.inl b/compat/thrust/system/detail/internal/scalar/stable_radix_sort.inl deleted file mode 100644 index 98846ab101..0000000000 --- a/compat/thrust/system/detail/internal/scalar/stable_radix_sort.inl +++ /dev/null @@ -1,434 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#include - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ -namespace detail -{ - -template <typename T> -struct RadixEncoder : public thrust::identity<T> -{}; - -template <> -struct RadixEncoder<char> : public thrust::unary_function<char, unsigned char> -{ - unsigned char operator()(char x) const - { - if(std::numeric_limits<char>::is_signed) - return x ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1); - else - return x; - } -}; - -template <> -struct RadixEncoder<signed char> : public thrust::unary_function<signed char, unsigned char> -{ - unsigned char operator()(signed char x) const - { - return x ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1); - } -}; - -template <> -struct RadixEncoder<short> : public thrust::unary_function<short, unsigned short> -{ - unsigned short operator()(short x) const - { - return x ^ static_cast<unsigned short>(1) << (8 * sizeof(unsigned short) - 1); - } -}; - -template <> -struct RadixEncoder<int> : public thrust::unary_function<int, unsigned int> -{ - unsigned int operator()(int x) const - { - return x ^ static_cast<unsigned int>(1) << (8 * sizeof(unsigned int) - 1); - } -}; - -template <> -struct RadixEncoder<long> : public thrust::unary_function<long, unsigned long> -{ - unsigned long operator()(long x) const - { - return x ^ static_cast<unsigned long>(1) << (8 * sizeof(unsigned long) - 1); - } -}; - -template <> -struct RadixEncoder<long long> : public thrust::unary_function<long long, unsigned long long> -{ - unsigned long long operator()(long long x) const - { - return x ^ static_cast<unsigned long long>(1) << (8 * sizeof(unsigned long long) - 1); - } -}; - -// ideally we'd use uint32 here and uint64 below -template <> -struct RadixEncoder<float> : public thrust::unary_function<float, thrust::detail::uint32_t> -{ - thrust::detail::uint32_t operator()(float x) const - { - union { float f; thrust::detail::uint32_t i; } u; - u.f = x; - thrust::detail::uint32_t mask = -static_cast<thrust::detail::int32_t>(u.i >> 31) | (static_cast<thrust::detail::uint32_t>(1) << 31); - return u.i ^ mask; - } -}; - -template <> -struct RadixEncoder<double> : public thrust::unary_function<double, thrust::detail::uint64_t> -{ - 
thrust::detail::uint64_t operator()(double x) const - { - union { double f; thrust::detail::uint64_t i; } u; - u.f = x; - thrust::detail::uint64_t mask = -static_cast(u.i >> 63) | (static_cast(1) << 63); - return u.i ^ mask; - } -}; - - -template -void radix_sort(RandomAccessIterator1 keys1, - RandomAccessIterator2 keys2, - RandomAccessIterator3 vals1, - RandomAccessIterator4 vals2, - const size_t N) -{ - typedef typename thrust::iterator_value::type KeyType; - - typedef RadixEncoder Encoder; - typedef typename Encoder::result_type EncodedType; - - static const unsigned int NumHistograms = (8 * sizeof(EncodedType) + (RadixBits - 1)) / RadixBits; - static const unsigned int HistogramSize = 1 << RadixBits; - - static const EncodedType BitMask = static_cast((1 << RadixBits) - 1); - - Encoder encode; - - // storage for histograms - size_t histograms[NumHistograms][HistogramSize] = {{0}}; - - // see which passes can be eliminated - bool skip_shuffle[NumHistograms] = {false}; - - // false if most recent data is stored in (keys1,vals1) - bool flip = false; - - // compute histograms - for (size_t i = 0; i < N; i++) - { - const EncodedType x = encode(keys1[i]); - - for (unsigned int j = 0; j < NumHistograms; j++) - { - const EncodedType BitShift = RadixBits * j; - histograms[j][(x >> BitShift) & BitMask]++; - } - } - - // scan histograms - for (unsigned int i = 0; i < NumHistograms; i++) - { - size_t sum = 0; - - for (unsigned int j = 0; j < HistogramSize; j++) - { - size_t bin = histograms[i][j]; - - if (bin == N) - skip_shuffle[i] = true; - - histograms[i][j] = sum; - - sum = sum + bin; - } - } - - // shuffle keys and (optionally) values - for (unsigned int i = 0; i < NumHistograms; i++) - { - const EncodedType BitShift = static_cast(RadixBits * i); - - if (!skip_shuffle[i]) - { - if (flip) - { - for (size_t j = 0; j < N; j++) - { - const EncodedType x = encode(keys2[j]); - size_t position = histograms[i][(x >> BitShift) & BitMask]++; - - RandomAccessIterator1 temp_keys1 = keys1; - temp_keys1 += position; - - RandomAccessIterator2 temp_keys2 = keys2; - temp_keys2 += j; - - // keys1[position] = keys2[j] - *temp_keys1 = *temp_keys2; - - if (HasValues) - { - RandomAccessIterator3 temp_vals1 = vals1; - temp_vals1 += position; - - RandomAccessIterator4 temp_vals2 = vals2; - temp_vals2 += j; - - // vals1[position] = vals2[j] - *temp_vals1 = *temp_vals2; - } - } - } - else - { - for (size_t j = 0; j < N; j++) - { - const EncodedType x = encode(keys1[j]); - size_t position = histograms[i][(x >> BitShift) & BitMask]++; - - RandomAccessIterator1 temp_keys1 = keys1; - temp_keys1 += j; - - RandomAccessIterator2 temp_keys2 = keys2; - temp_keys2 += position; - - // keys2[position] = keys1[j]; - *temp_keys2 = *temp_keys1; - - if (HasValues) - { - RandomAccessIterator3 temp_vals1 = vals1; - temp_vals1 += j; - - RandomAccessIterator4 temp_vals2 = vals2; - temp_vals2 += position; - - // vals2[position] = vals1[j] - *temp_vals2 = *temp_vals1; - } - } - } - - flip = (flip) ? 
false : true; - } - } - - // ensure final values are in (keys1,vals1) - if (flip) - { - thrust::copy(keys2, keys2 + N, keys1); - if (HasValues) - thrust::copy(vals2, vals2 + N, vals1); - } -} - - -// Select best radix sort parameters based on sizeof(T) and input size -// These particular values were determined through empirical testing on a Core i7 950 CPU -template -struct radix_sort_dispatcher -{ -}; - -template <> -struct radix_sort_dispatcher<1> -{ - template - void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) - { - detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); - } - template - void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) - { - detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); - } -}; - -template <> -struct radix_sort_dispatcher<2> -{ - template - void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) - { - if (N < (1 << 16)) - detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); - else - detail::radix_sort<16,false>(keys1, keys2, static_cast(0), static_cast(0), N); - } - template - void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) - { - if (N < (1 << 15)) - detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); - else - detail::radix_sort<16,true>(keys1, keys2, vals1, vals2, N); - } -}; - -template <> -struct radix_sort_dispatcher<4> -{ - template - void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) - { - if (N < (1 << 22)) - detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); - else - detail::radix_sort<4,false>(keys1, keys2, static_cast(0), static_cast(0), N); - } - template - void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) - { - if (N < (1 << 22)) - detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); - else - detail::radix_sort<3,true>(keys1, keys2, vals1, vals2, N); - } -}; - -template <> -struct radix_sort_dispatcher<8> -{ - template - void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) - { - if (N < (1 << 21)) - detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); - else - detail::radix_sort<4,false>(keys1, keys2, static_cast(0), static_cast(0), N); - } - template - void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) - { - if (N < (1 << 21)) - detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); - else - detail::radix_sort<3,true>(keys1, keys2, vals1, vals2, N); - } -}; - -template -void radix_sort(RandomAccessIterator1 keys1, - RandomAccessIterator2 keys2, - const size_t N) -{ - typedef typename thrust::iterator_value::type KeyType; - radix_sort_dispatcher()(keys1, keys2, N); -} - -template -void radix_sort(RandomAccessIterator1 keys1, - RandomAccessIterator2 keys2, - RandomAccessIterator3 vals1, - RandomAccessIterator4 vals2, - const size_t N) -{ - typedef typename thrust::iterator_value::type KeyType; - radix_sort_dispatcher()(keys1, keys2, vals1, vals2, N); -} - -} // namespace detail - -////////////// -// Key Sort // -////////////// - -template -void stable_radix_sort(RandomAccessIterator first, - 
RandomAccessIterator last) -{ - typedef typename thrust::iterator_system::type ExecutionPolicy; - typedef typename thrust::iterator_value::type KeyType; - - size_t N = last - first; - - // XXX assumes ExecutionPolicy is default constructible - // XXX consider how to get stateful systems into this function - ExecutionPolicy exec; - thrust::detail::temporary_array temp(exec, N); - - detail::radix_sort(first, temp.begin(), N); -} - - -//////////////////// -// Key-Value Sort // -//////////////////// - -template -void stable_radix_sort_by_key(RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2) -{ - // XXX the type of exec should be - // typedef decltype(select_system(first1,last1,first2)) system; - typedef typename thrust::iterator_system::type ExecutionPolicy; - typedef typename thrust::iterator_value::type KeyType; - typedef typename thrust::iterator_value::type ValueType; - - size_t N = last1 - first1; - - // XXX assumes ExecutionPolicy is default constructible - // XXX consider how to get stateful systems into this function - ExecutionPolicy exec; - thrust::detail::temporary_array temp1(exec, N); - thrust::detail::temporary_array temp2(exec, N); - - detail::radix_sort(first1, temp1.begin(), first2, temp2.begin(), N); -} - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/trivial_copy.h b/compat/thrust/system/detail/internal/scalar/trivial_copy.h deleted file mode 100644 index 8f008b54e4..0000000000 --- a/compat/thrust/system/detail/internal/scalar/trivial_copy.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file trivial_copy.h - * \brief Sequential copy algorithms for plain-old-data. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - T *trivial_copy_n(const T *first, - std::ptrdiff_t n, - T *result) -{ - std::memmove(result, first, n * sizeof(T)); - return result + n; -} // end trivial_copy_n() - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/unique.h b/compat/thrust/system/detail/internal/scalar/unique.h deleted file mode 100644 index cfc60c948a..0000000000 --- a/compat/thrust/system/detail/internal/scalar/unique.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file unique.h - * \brief Sequential implementations of unique algorithms. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - OutputIterator unique_copy(InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate binary_pred) -{ - typedef typename thrust::iterator_traits::value_type T; - - if(first != last) - { - T prev = *first; - - for(++first; first != last; ++first) - { - T temp = *first; - - if (!binary_pred(prev, temp)) - { - *output = prev; - - ++output; - - prev = temp; - } - } - - *output = prev; - ++output; - } - - return output; -} // end unique_copy() - - -template - ForwardIterator unique(ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred) -{ - // unique_copy() permits in-situ operation - return thrust::system::detail::internal::scalar::unique_copy(first, last, first, binary_pred); -} // end unique() - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/internal/scalar/unique_by_key.h b/compat/thrust/system/detail/internal/scalar/unique_by_key.h deleted file mode 100644 index b0be2663e7..0000000000 --- a/compat/thrust/system/detail/internal/scalar/unique_by_key.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file unique_by_key.h - * \brief Sequential implementations of unique_by_key algorithms. 
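// Illustrative sketch (not from the Thrust sources above): the scalar
// unique_copy() writes only the first element of each run of consecutive
// elements the predicate reports equal, and the final run's leader is flushed
// after the loop. A minimal host-side use through the public
// thrust::unique_copy entry point; the function and variable names here are
// invented for the example.
#include <thrust/unique.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>

thrust::host_vector<int> collapse_runs(const thrust::host_vector<int>& keys)
{
    thrust::host_vector<int> out(keys.size());
    thrust::host_vector<int>::iterator end =
        thrust::unique_copy(keys.begin(), keys.end(), out.begin(),
                            thrust::equal_to<int>());
    out.resize(end - out.begin());   // e.g. {1,1,2,2,3,1} -> {1,2,3,1}
    return out;
}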
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace detail -{ -namespace internal -{ -namespace scalar -{ - -template - thrust::pair - unique_by_key_copy(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred) -{ - typedef typename thrust::iterator_traits::value_type InputKeyType; - typedef typename thrust::iterator_traits::value_type OutputValueType; - - if(keys_first != keys_last) - { - InputKeyType temp_key = *keys_first; - OutputValueType temp_value = *values_first; - - for(++keys_first, ++values_first; - keys_first != keys_last; - ++keys_first, ++values_first) - { - InputKeyType key = *keys_first; - OutputValueType value = *values_first; - - if(!binary_pred(temp_key, key)) - { - *keys_output = temp_key; - *values_output = temp_value; - - ++keys_output; - ++values_output; - - temp_key = key; - temp_value = value; - } - } - - *keys_output = temp_key; - *values_output = temp_value; - - ++keys_output; - ++values_output; - } - - return thrust::make_pair(keys_output, values_output); -} // end unique_by_key_copy() - - -template - thrust::pair - unique_by_key(ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred) -{ - // unique_by_key_copy() permits in-situ operation - return thrust::system::detail::internal::scalar::unique_by_key_copy(keys_first, keys_last, values_first, keys_first, values_first, binary_pred); -} // end unique_by_key() - -} // end namespace scalar -} // end namespace internal -} // end namespace detail -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/detail/system_error.inl b/compat/thrust/system/detail/system_error.inl deleted file mode 100644 index 74909bee8a..0000000000 --- a/compat/thrust/system/detail/system_error.inl +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
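// Illustrative sketch (not from the Thrust sources above): unique_by_key_copy()
// walks keys and values in lockstep and emits one (key, value) pair per run of
// equal consecutive keys, taken from the first element of the run. A
// hypothetical host-side call through the public API; names are invented.
#include <thrust/unique.h>
#include <thrust/host_vector.h>
#include <thrust/pair.h>

void keep_first_per_key(const thrust::host_vector<int>&   keys,   // e.g. {1,1,2,2,3}
                        const thrust::host_vector<float>& vals,   // e.g. {10,11,20,21,30}
                        thrust::host_vector<int>&         out_keys,
                        thrust::host_vector<float>&       out_vals)
{
    out_keys.resize(keys.size());
    out_vals.resize(vals.size());

    // the predicate-less public overload compares keys with operator==
    thrust::pair<thrust::host_vector<int>::iterator,
                 thrust::host_vector<float>::iterator> ends =
        thrust::unique_by_key_copy(keys.begin(), keys.end(), vals.begin(),
                                   out_keys.begin(), out_vals.begin());

    out_keys.resize(ends.first  - out_keys.begin());   // -> {1,2,3}
    out_vals.resize(ends.second - out_vals.begin());   // -> {10,20,30}
}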
- */ - - -#pragma once - -#include - -namespace thrust -{ - -namespace system -{ - - -system_error - ::system_error(error_code ec, const std::string &what_arg) - : std::runtime_error(what_arg), m_error_code(ec) -{ - -} // end system_error::system_error() - - -system_error - ::system_error(error_code ec, const char *what_arg) - : std::runtime_error(what_arg), m_error_code(ec) -{ - ; -} // end system_error::system_error() - - -system_error - ::system_error(error_code ec) - : std::runtime_error(""), m_error_code(ec) -{ - ; -} // end system_error::system_error() - - -system_error - ::system_error(int ev, const error_category &ecat, const std::string &what_arg) - : std::runtime_error(what_arg), m_error_code(ev,ecat) -{ - ; -} // end system_error::system_error() - - -system_error - ::system_error(int ev, const error_category &ecat, const char *what_arg) - : std::runtime_error(what_arg), m_error_code(ev,ecat) -{ - ; -} // end system_error::system_error() - - -system_error - ::system_error(int ev, const error_category &ecat) - : std::runtime_error(""), m_error_code(ev,ecat) -{ - ; -} // end system_error::system_error() - - -const error_code &system_error - ::code(void) const throw() -{ - return m_error_code; -} // end system_error::code() - - -const char *system_error - ::what(void) const throw() -{ - if(m_what.empty()) - { - try - { - m_what = this->std::runtime_error::what(); - if(m_error_code) - { - if(!m_what.empty()) m_what += ": "; - m_what += m_error_code.message(); - } - } - catch(...) - { - return std::runtime_error::what(); - } - } - - return m_what.c_str(); -} // end system_error::what() - - -} // end system - -} // end thrust - diff --git a/compat/thrust/system/error_code.h b/compat/thrust/system/error_code.h deleted file mode 100644 index 2b6582c937..0000000000 --- a/compat/thrust/system/error_code.h +++ /dev/null @@ -1,521 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file error_code.h - * \brief An object used to hold error values, such as those originating from the - * operating system or other low-level application program interfaces. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -namespace system -{ - - -/*! \addtogroup system_diagnostics - * \{ - */ - -class error_condition; -class error_code; - -/*! A metafunction returning whether or not the parameter is an \p error_code enum. - */ -template struct is_error_code_enum : public thrust::detail::false_type {}; - -/*! A metafunction returning whether or not the parameter is an \p error_condition enum. - */ -template struct is_error_condition_enum : public thrust::detail::false_type {}; - - -// XXX N3092 prefers enum class errc { ... 
} -namespace errc -{ - -enum errc_t -{ - address_family_not_supported = detail::eafnosupport, - address_in_use = detail::eaddrinuse, - address_not_available = detail::eaddrnotavail, - already_connected = detail::eisconn, - argument_list_too_long = detail::e2big, - argument_out_of_domain = detail::edom, - bad_address = detail::efault, - bad_file_descriptor = detail::ebadf, - bad_message = detail::ebadmsg, - broken_pipe = detail::epipe, - connection_aborted = detail::econnaborted, - connection_already_in_progress = detail::ealready, - connection_refused = detail::econnrefused, - connection_reset = detail::econnreset, - cross_device_link = detail::exdev, - destination_address_required = detail::edestaddrreq, - device_or_resource_busy = detail::ebusy, - directory_not_empty = detail::enotempty, - executable_format_error = detail::enoexec, - file_exists = detail::eexist, - file_too_large = detail::efbig, - filename_too_long = detail::enametoolong, - function_not_supported = detail::enosys, - host_unreachable = detail::ehostunreach, - identifier_removed = detail::eidrm, - illegal_byte_sequence = detail::eilseq, - inappropriate_io_control_operation = detail::enotty, - interrupted = detail::eintr, - invalid_argument = detail::einval, - invalid_seek = detail::espipe, - io_error = detail::eio, - is_a_directory = detail::eisdir, - message_size = detail::emsgsize, - network_down = detail::enetdown, - network_reset = detail::enetreset, - network_unreachable = detail::enetunreach, - no_buffer_space = detail::enobufs, - no_child_process = detail::echild, - no_link = detail::enolink, - no_lock_available = detail::enolck, - no_message_available = detail::enodata, - no_message = detail::enomsg, - no_protocol_option = detail::enoprotoopt, - no_space_on_device = detail::enospc, - no_stream_resources = detail::enosr, - no_such_device_or_address = detail::enxio, - no_such_device = detail::enodev, - no_such_file_or_directory = detail::enoent, - no_such_process = detail::esrch, - not_a_directory = detail::enotdir, - not_a_socket = detail::enotsock, - not_a_stream = detail::enostr, - not_connected = detail::enotconn, - not_enough_memory = detail::enomem, - not_supported = detail::enotsup, - operation_canceled = detail::ecanceled, - operation_in_progress = detail::einprogress, - operation_not_permitted = detail::eperm, - operation_not_supported = detail::eopnotsupp, - operation_would_block = detail::ewouldblock, - owner_dead = detail::eownerdead, - permission_denied = detail::eacces, - protocol_error = detail::eproto, - protocol_not_supported = detail::eprotonosupport, - read_only_file_system = detail::erofs, - resource_deadlock_would_occur = detail::edeadlk, - resource_unavailable_try_again = detail::eagain, - result_out_of_range = detail::erange, - state_not_recoverable = detail::enotrecoverable, - stream_timeout = detail::etime, - text_file_busy = detail::etxtbsy, - timed_out = detail::etimedout, - too_many_files_open_in_system = detail::enfile, - too_many_files_open = detail::emfile, - too_many_links = detail::emlink, - too_many_symbolic_link_levels = detail::eloop, - value_too_large = detail::eoverflow, - wrong_protocol_type = detail::eprototype -}; // end errc_t - -} // end namespace errc - - -/*! Specialization of \p is_error_condition_enum for \p errc::errc_t - */ -template<> struct is_error_condition_enum : public thrust::detail::true_type {}; - - -// [19.5.1.1] class error_category - -/*! 
\brief The class \p error_category serves as a base class for types used to identify the - * source and encoding of a particular category of error code. Classes may be derived - * from \p error_category to support categories of errors in addition to those defined - * in the C++ International Standard. - */ -class error_category -{ - public: - /*! Destructor does nothing. - */ - inline virtual ~error_category(void); - - // XXX enable upon c++0x - // error_category(const error_category &) = delete; - // error_category &operator=(const error_category &) = delete; - - /*! \return A string naming the error category. - */ - inline virtual const char *name(void) const = 0; - - /*! \return \p error_condition(ev, *this). - */ - inline virtual error_condition default_error_condition(int ev) const; - - /*! \return default_error_condition(code) == condition - */ - inline virtual bool equivalent(int code, const error_condition &condition) const; - - /*! \return *this == code.category() && code.value() == condition - */ - inline virtual bool equivalent(const error_code &code, int condition) const; - - /*! \return A string that describes the error condition denoted by \p ev. - */ - virtual std::string message(int ev) const = 0; - - /*! \return *this == &rhs - */ - inline bool operator==(const error_category &rhs) const; - - /*! \return !(*this == rhs) - */ - inline bool operator!=(const error_category &rhs) const; - - /*! \return less()(this, &rhs) - * \note \c less provides a total ordering for pointers. - */ - inline bool operator<(const error_category &rhs) const; -}; // end error_category - - -// [19.5.1.5] error_category objects - - -/*! \return A reference to an object of a type derived from class \p error_category. - * \note The object's \p default_error_condition and \p equivalent virtual functions - * shall behave as specified for the class \p error_category. The object's - * \p name virtual function shall return a pointer to the string "generic". - */ -inline const error_category &generic_category(void); - - -/*! \return A reference to an object of a type derived from class \p error_category. - * \note The object's \p equivalent virtual functions shall behave as specified for - * class \p error_category. The object's \p name virtual function shall return - * a pointer to the string "system". The object's \p default_error_condition - * virtual function shall behave as follows: - * - * If the argument ev corresponds to a POSIX errno value - * \c posv, the function shall return error_condition(ev,generic_category()). - * Otherwise, the function shall return error_condition(ev,system_category()). - * What constitutes correspondence for any given operating system is unspecified. - */ -inline const error_category &system_category(void); - - -// [19.5.2] Class error_code - - -/*! \brief The class \p error_code describes an object used to hold error code values, such as - * those originating from the operating system or other low-level application program - * interfaces. - */ -class error_code -{ - public: - // [19.5.2.2] constructors: - - /*! Effects: Constructs an object of type \p error_code. - * \post value() == 0 and category() == &system_category(). - */ - inline error_code(void); - - /*! Effects: Constructs an object of type \p error_code. - * \post value() == val and category() == &cat. - */ - inline error_code(int val, const error_category &cat); - - /*! Effects: Constructs an object of type \p error_code. - * \post *this == make_error_code(e). 
- */ - template - error_code(ErrorCodeEnum e -// XXX WAR msvc's problem with enable_if -#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - , typename thrust::detail::enable_if::value>::type * = 0 -#endif // THRUST_HOST_COMPILER_MSVC - ); - - // [19.5.2.3] modifiers: - - /*! \post value() == val and category() == &cat. - */ - inline void assign(int val, const error_category &cat); - - /*! \post *this == make_error_code(e). - */ - template -// XXX WAR msvc's problem with enable_if -#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - typename thrust::detail::enable_if::value, error_code>::type & -#else - error_code & -#endif // THRUST_HOST_COMPILER_MSVC - operator=(ErrorCodeEnum e); - - /*! \post value() == 0 and category() == system_category(). - */ - inline void clear(void); - - // [19.5.2.4] observers: - - /*! \return An integral value of this \p error_code object. - */ - inline int value(void) const; - - /*! \return An \p error_category describing the category of this \p error_code object. - */ - inline const error_category &category(void) const; - - /*! \return category().default_error_condition(). - */ - inline error_condition default_error_condition(void) const; - - /*! \return category().message(value()). - */ - inline std::string message(void) const; - - // XXX replace the below upon c++0x - // inline explicit operator bool (void) const; - - /*! \return value() != 0. - */ - inline operator bool (void) const; - - /*! \cond - */ - private: - int m_val; - const error_category *m_cat; - /*! \endcond - */ -}; // end error_code - - -// [19.5.2.5] Class error_code non-member functions - - -// XXX replace errc::errc_t with errc upon c++0x -/*! \return error_code(static_cast(e), generic_category()) - */ -inline error_code make_error_code(errc::errc_t e); - - -/*! \return lhs.category() < rhs.category() || lhs.category() == rhs.category() && lhs.value() < rhs.value(). - */ -inline bool operator<(const error_code &lhs, const error_code &rhs); - - -/*! Effects: os << ec.category().name() << ':' << ec.value(). - */ -template - std::basic_ostream& - operator<<(std::basic_ostream& os, const error_code &ec); - - -// [19.5.3] class error_condition - - -/*! \brief The class \p error_condition describes an object used to hold values identifying - * error conditions. - * - * \note \p error_condition values are portable abstractions, while \p error_code values - * are implementation specific. - */ -class error_condition -{ - public: - // [19.5.3.2] constructors - - /*! Constructs an object of type \p error_condition. - * \post value() == 0. - * \post category() == generic_category(). - */ - inline error_condition(void); - - /*! Constructs an object of type \p error_condition. - * \post value() == val. - * \post category() == cat. - */ - inline error_condition(int val, const error_category &cat); - - /*! Constructs an object of type \p error_condition. - * \post *this == make_error_condition(e). - * \note This constructor shall not participate in overload resolution unless - * is_error_condition_enum::value is true. - */ - template - error_condition(ErrorConditionEnum e -// XXX WAR msvc's problem with enable_if -#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - , typename thrust::detail::enable_if::value>::type * = 0 -#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - ); - - // [19.5.3.3] modifiers - - /*! Assigns to this \p error_code object from an error value and an \p error_category. - * \param val The new value to return from value(). 
- * \param cat The new \p error_category to return from category(). - * \post value() == val. - * \post category() == cat. - */ - inline void assign(int val, const error_category &cat); - - /*! Assigns to this \p error_code object from an error condition enumeration. - * \return *this - * \post *this == make_error_condition(e). - * \note This operator shall not participate in overload resolution unless - * is_error_condition_enum::value is true. - */ - template -// XXX WAR msvc's problem with enable_if -#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - typename thrust::detail::enable_if::value, error_condition>::type & -#else - error_condition & -#endif // THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC - operator=(ErrorConditionEnum e); - - /*! Clears this \p error_code object. - * \post value == 0 - * \post category() == generic_category(). - */ - inline void clear(void); - - // [19.5.3.4] observers - - /*! \return The value encoded by this \p error_condition. - */ - inline int value(void) const; - - /*! \return A const reference to the \p error_category encoded by this \p error_condition. - */ - inline const error_category &category(void) const; - - /*! \return category().message(value()). - */ - inline std::string message(void) const; - - // XXX replace below with this upon c++0x - //explicit operator bool (void) const; - - /*! \return value() != 0. - */ - inline operator bool (void) const; - - /*! \cond - */ - - private: - int m_val; - const error_category *m_cat; - - /*! \endcond - */ -}; // end error_condition - - - -// [19.5.3.5] Class error_condition non-member functions - -// XXX replace errc::errc_t with errc upon c++0x -/*! \return error_condition(static_cast(e), generic_category()). - */ -inline error_condition make_error_condition(errc::errc_t e); - - -/*! \return lhs.category() < rhs.category() || lhs.category() == rhs.category() && lhs.value() < rhs.value(). - */ -inline bool operator<(const error_condition &lhs, const error_condition &rhs); - - -// [19.5.4] Comparison operators - - -/*! \return lhs.category() == rhs.category() && lhs.value() == rhs.value(). - */ -inline bool operator==(const error_code &lhs, const error_code &rhs); - - -/*! \return lhs.category().equivalent(lhs.value(), rhs) || rhs.category().equivalent(lhs,rhs.value()). - */ -inline bool operator==(const error_code &lhs, const error_condition &rhs); - - -/*! \return rhs.category().equivalent(lhs.value(), lhs) || lhs.category().equivalent(rhs, lhs.value()). - */ -inline bool operator==(const error_condition &lhs, const error_code &rhs); - - -/*! \return lhs.category() == rhs.category() && lhs.value() == rhs.value() - */ -inline bool operator==(const error_condition &lhs, const error_condition &rhs); - - -/*! \return !(lhs == rhs) - */ -inline bool operator!=(const error_code &lhs, const error_code &rhs); - - -/*! \return !(lhs == rhs) - */ -inline bool operator!=(const error_code &lhs, const error_condition &rhs); - - -/*! \return !(lhs == rhs) - */ -inline bool operator!=(const error_condition &lhs, const error_code &rhs); - - -/*! \return !(lhs == rhs) - */ -inline bool operator!=(const error_condition &lhs, const error_condition &rhs); - -/*! 
\} // end system_diagnostics - */ - - -} // end system - - -// import names into thrust:: -using system::error_category; -using system::error_code; -using system::error_condition; -using system::is_error_code_enum; -using system::is_error_condition_enum; -using system::make_error_code; -using system::make_error_condition; - -// XXX replace with using system::errc upon c++0x -namespace errc = system::errc; - -using system::generic_category; -using system::system_category; - -} // end thrust - -#include -#include -#include - diff --git a/compat/thrust/system/omp/detail/adjacent_difference.h b/compat/thrust/system/omp/detail/adjacent_difference.h deleted file mode 100644 index 0bbc1884cc..0000000000 --- a/compat/thrust/system/omp/detail/adjacent_difference.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template - OutputIterator adjacent_difference(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - // omp prefers generic::adjacent_difference to cpp::adjacent_difference - return thrust::system::detail::generic::adjacent_difference(exec, first, last, result, binary_op); -} // end adjacent_difference() - -} // end detail -} // end omp -} // end system -} // end thrust - diff --git a/compat/thrust/system/omp/detail/assign_value.h b/compat/thrust/system/omp/detail/assign_value.h deleted file mode 100644 index eda3b977b8..0000000000 --- a/compat/thrust/system/omp/detail/assign_value.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits assign_value -#include - diff --git a/compat/thrust/system/omp/detail/binary_search.h b/compat/thrust/system/omp/detail/binary_search.h deleted file mode 100644 index 254e6fd445..0000000000 --- a/compat/thrust/system/omp/detail/binary_search.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
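// Illustrative sketch (not from the Thrust sources above), relating to the
// error_code / error_condition interfaces removed just above: they mirror the
// C++11 <system_error> design, where an error_code is implementation-specific
// and an error_condition is the portable value it is compared against. A
// hypothetical use of the public header; describe() is an invented name.
#include <thrust/system/error_code.h>
#include <string>

std::string describe(int raw_value)
{
    thrust::system::error_code ec(raw_value, thrust::system::system_category());

    if (!ec)                                    // operator bool: value() != 0
        return "no error";

    // compare the implementation-specific code against a portable condition
    if (ec == thrust::system::make_error_condition(
                  thrust::system::errc::not_enough_memory))
        return "allocation failure: " + ec.message();

    return ec.message();                        // category().message(value())
}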
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template -ForwardIterator lower_bound(execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value, - StrictWeakOrdering comp) -{ - // omp prefers generic::lower_bound to cpp::lower_bound - return thrust::system::detail::generic::lower_bound(exec, begin, end, value, comp); -} - - -template -ForwardIterator upper_bound(execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value, - StrictWeakOrdering comp) -{ - // omp prefers generic::upper_bound to cpp::upper_bound - return thrust::system::detail::generic::upper_bound(exec, begin, end, value, comp); -} - - -template -bool binary_search(execution_policy &exec, - ForwardIterator begin, - ForwardIterator end, - const T& value, - StrictWeakOrdering comp) -{ - // omp prefers generic::binary_search to cpp::binary_search - return thrust::system::detail::generic::binary_search(exec, begin, end, value, comp); -} - - -} // end detail -} // end omp -} // end system -} // end thrust - diff --git a/compat/thrust/system/omp/detail/copy.h b/compat/thrust/system/omp/detail/copy.h deleted file mode 100644 index b23ac18801..0000000000 --- a/compat/thrust/system/omp/detail/copy.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template -OutputIterator copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -template -OutputIterator copy_n(execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result); - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/copy.inl b/compat/thrust/system/omp/detail/copy.inl deleted file mode 100644 index 915ff92d70..0000000000 --- a/compat/thrust/system/omp/detail/copy.inl +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ -namespace dispatch -{ - -template - OutputIterator copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - thrust::incrementable_traversal_tag) -{ - return thrust::system::cpp::detail::copy(exec, first, last, result); -} // end copy() - - -template - OutputIterator copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - thrust::random_access_traversal_tag) -{ - // XXX WAR problems reconciling unrelated types such as omp & tbb - // reinterpret iterators as the policy we were passed - // this ensures that generic::copy's implementation, which eventually results in - // zip_iterator works correctly - thrust::detail::tagged_iterator retagged_result(result); - - return thrust::system::detail::generic::copy(exec, thrust::reinterpret_tag(first), thrust::reinterpret_tag(last), retagged_result).base(); -} // end copy() - - -template - OutputIterator copy_n(execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result, - thrust::incrementable_traversal_tag) -{ - return thrust::system::cpp::detail::copy_n(exec, first, n, result); -} // end copy_n() - - -template - OutputIterator copy_n(execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result, - thrust::random_access_traversal_tag) -{ - // XXX WAR problems reconciling unrelated types such as omp & tbb - // reinterpret iterators as the policy we were passed - // this ensures that generic::copy's implementation, which eventually results in - // zip_iterator works correctly - thrust::detail::tagged_iterator retagged_result(result); - - return thrust::system::detail::generic::copy_n(exec, thrust::reinterpret_tag(first), n, retagged_result).base(); -} // end copy_n() - -} // end dispatch - - -template -OutputIterator copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - typedef typename thrust::iterator_traversal::type traversal1; - typedef typename thrust::iterator_traversal::type traversal2; - - typedef typename thrust::detail::minimum_type::type traversal; - - // dispatch on minimum traversal - return thrust::system::omp::detail::dispatch::copy(exec, first,last,result,traversal()); -} // end copy() - - - -template -OutputIterator copy_n(execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result) -{ - typedef typename thrust::iterator_traversal::type traversal1; - typedef typename thrust::iterator_traversal::type traversal2; - - typedef typename thrust::detail::minimum_type::type traversal; - - // dispatch on minimum traversal - return thrust::system::omp::detail::dispatch::copy_n(exec,first,n,result,traversal()); -} // end copy_n() - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/omp/detail/copy_if.h b/compat/thrust/system/omp/detail/copy_if.h deleted file mode 100644 index 
46754a9f21..0000000000 --- a/compat/thrust/system/omp/detail/copy_if.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - OutputIterator copy_if(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - - -} // end detail -} // end omp -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/omp/detail/copy_if.inl b/compat/thrust/system/omp/detail/copy_if.inl deleted file mode 100644 index 1af6a215a6..0000000000 --- a/compat/thrust/system/omp/detail/copy_if.inl +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - OutputIterator copy_if(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - // omp prefers generic::copy_if to cpp::copy_if - return thrust::system::detail::generic::copy_if(exec, first, last, stencil, result, pred); -} // end copy_if() - - -} // end detail -} // end omp -} // end system -} // end thrust - diff --git a/compat/thrust/system/omp/detail/count.h b/compat/thrust/system/omp/detail/count.h deleted file mode 100644 index da31ee8700..0000000000 --- a/compat/thrust/system/omp/detail/count.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
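// Illustrative sketch (not from the Thrust sources above): the copy_if()
// forwarded in copy_if.inl is the stencil variant -- input element i is copied
// exactly when pred(stencil[i]) is true. A hypothetical host-side call; the
// functor and names are invented for the example.
#include <thrust/copy.h>
#include <thrust/host_vector.h>
#include <cstddef>

struct is_nonzero
{
    bool operator()(int x) const { return x != 0; }
};

std::size_t compact_by_stencil(const thrust::host_vector<int>& data,
                               const thrust::host_vector<int>& stencil,
                               thrust::host_vector<int>&       out)
{
    out.resize(data.size());
    thrust::host_vector<int>::iterator end =
        thrust::copy_if(data.begin(), data.end(), stencil.begin(),
                        out.begin(), is_nonzero());
    out.resize(end - out.begin());
    return out.size();     // count of elements whose stencil entry was non-zero
}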
- */ - -#pragma once - -#include - -// this system inherits count -#include - diff --git a/compat/thrust/system/omp/detail/default_decomposition.h b/compat/thrust/system/omp/detail/default_decomposition.h deleted file mode 100644 index f1904c29c6..0000000000 --- a/compat/thrust/system/omp/detail/default_decomposition.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file default_decomposition.h - * \brief Return a decomposition that is appropriate for the OpenMP backend. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template -thrust::system::detail::internal::uniform_decomposition default_decomposition(IndexType n); - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/default_decomposition.inl b/compat/thrust/system/omp/detail/default_decomposition.inl deleted file mode 100644 index 366b4f56c2..0000000000 --- a/compat/thrust/system/omp/detail/default_decomposition.inl +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -// don't attempt to #include this file without omp support -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) -#include -#endif // omp support - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template -thrust::system::detail::internal::uniform_decomposition default_decomposition(IndexType n) -{ - // we're attempting to launch an omp kernel, assert we're compiling with omp support - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to OpenMP support in your compiler. 
X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) - return thrust::system::detail::internal::uniform_decomposition(n, 1, omp_get_num_procs()); -#else - return thrust::system::detail::internal::uniform_decomposition(n, 1, 1); -#endif -} - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/omp/detail/equal.h b/compat/thrust/system/omp/detail/equal.h deleted file mode 100644 index 74e55183d9..0000000000 --- a/compat/thrust/system/omp/detail/equal.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits equal -#include - diff --git a/compat/thrust/system/omp/detail/execution_policy.h b/compat/thrust/system/omp/detail/execution_policy.h deleted file mode 100644 index 1b06224217..0000000000 --- a/compat/thrust/system/omp/detail/execution_policy.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -// put the canonical tag in the same ns as the backend's entry points -namespace omp -{ -namespace detail -{ - -// this awkward sequence of definitions arise -// from the desire both for tag to derive -// from execution_policy and for execution_policy -// to convert to tag (when execution_policy is not -// an ancestor of tag) - -// forward declaration of tag -struct tag; - -// forward declaration of execution_policy -template struct execution_policy; - -// specialize execution_policy for tag -template<> - struct execution_policy - : thrust::system::cpp::detail::execution_policy -{}; - -// tag's definition comes before the -// generic definition of execution_policy -struct tag : execution_policy {}; - -// allow conversion to tag when it is not a successor -template - struct execution_policy - : thrust::system::cpp::detail::execution_policy -{ - // allow conversion to tag - inline operator tag () const - { - return tag(); - } -}; - - -// overloads of select_system - -// XXX select_system(tbb, omp) & select_system(omp, tbb) are ambiguous -// because both convert to cpp without these overloads, which we -// arbitrarily define in the omp backend - -template -inline __host__ __device__ - System1 select_system(execution_policy s, thrust::system::tbb::detail::execution_policy) -{ - return thrust::detail::derived_cast(s); -} // end select_system() - - -template -inline __host__ __device__ - System2 select_system(thrust::system::tbb::detail::execution_policy, execution_policy s) -{ - return thrust::detail::derived_cast(s); -} // end select_system() - - -} // end detail - -// alias execution_policy and tag here -using thrust::system::omp::detail::execution_policy; -using thrust::system::omp::detail::tag; - -} // end omp -} // end system - -// alias items at top-level -namespace omp -{ - -using thrust::system::omp::execution_policy; -using thrust::system::omp::tag; - -} // end omp -} // end thrust - diff --git a/compat/thrust/system/omp/detail/extrema.h b/compat/thrust/system/omp/detail/extrema.h deleted file mode 100644 index fb96770b97..0000000000 --- a/compat/thrust/system/omp/detail/extrema.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template -ForwardIterator max_element(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - // omp prefers generic::max_element to cpp::max_element - return thrust::system::detail::generic::max_element(exec, first, last, comp); -} // end max_element() - -template -ForwardIterator min_element(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - // omp prefers generic::min_element to cpp::min_element - return thrust::system::detail::generic::min_element(exec, first, last, comp); -} // end min_element() - -template -thrust::pair minmax_element(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - // omp prefers generic::minmax_element to cpp::minmax_element - return thrust::system::detail::generic::minmax_element(exec, first, last, comp); -} // end minmax_element() - -} // end detail -} // end omp -} // end system -} // end thrust - - diff --git a/compat/thrust/system/omp/detail/fill.h b/compat/thrust/system/omp/detail/fill.h deleted file mode 100644 index 5219e1c7c5..0000000000 --- a/compat/thrust/system/omp/detail/fill.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits fill -#include - diff --git a/compat/thrust/system/omp/detail/find.h b/compat/thrust/system/omp/detail/find.h deleted file mode 100644 index a8dca5ad4d..0000000000 --- a/compat/thrust/system/omp/detail/find.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file find.h - * \brief OpenMP implementation of find_if. 
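// Illustrative sketch (not from the Thrust sources above): the extrema, fill
// and find headers are thin forwarders -- the omp backend re-exposes the
// generic implementations under its execution policy. From user code that
// backend is selected by passing thrust::omp::par; a sketch for min_element,
// assuming a non-empty vector and an OpenMP-enabled build:
#include <thrust/extrema.h>
#include <thrust/host_vector.h>
#include <thrust/system/omp/execution_policy.h>

int smallest(const thrust::host_vector<int>& v)
{
    // routes through the omp backend's min_element, which forwards to generic
    return *thrust::min_element(thrust::omp::par, v.begin(), v.end());
}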
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template -InputIterator find_if(execution_policy &exec, - InputIterator first, - InputIterator last, - Predicate pred) -{ - // omp prefers generic::find_if to cpp::find_if - return thrust::system::detail::generic::find_if(exec, first, last, pred); -} - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/omp/detail/for_each.h b/compat/thrust/system/omp/detail/for_each.h deleted file mode 100644 index 1030623ec0..0000000000 --- a/compat/thrust/system/omp/detail/for_each.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file for_each.h - * \brief Defines the interface for a function that executes a - * function or functional for each value in a given range. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template - RandomAccessIterator for_each(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - UnaryFunction f); - -template - RandomAccessIterator for_each_n(execution_policy &exec, - RandomAccessIterator first, - Size n, - UnaryFunction f); - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/for_each.inl b/compat/thrust/system/omp/detail/for_each.inl deleted file mode 100644 index c6ab8277ba..0000000000 --- a/compat/thrust/system/omp/detail/for_each.inl +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file for_each.inl - * \brief Inline file for for_each.h. - */ - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template -RandomAccessIterator for_each_n(execution_policy &, - RandomAccessIterator first, - Size n, - UnaryFunction f) -{ - // we're attempting to launch an omp kernel, assert we're compiling with omp support - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to enable OpenMP support in your compiler. 
X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - - if (n <= 0) return first; //empty range - - // create a wrapped function for f - typedef typename thrust::iterator_reference::type reference; - thrust::detail::host_function wrapped_f(f); - -// do not attempt to compile the body of this function, which depends on #pragma omp, -// without support from the compiler -// XXX implement the body of this function in another file to eliminate this ugliness -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) - // use a signed type for the iteration variable or suffer the consequences of warnings - typedef typename thrust::iterator_difference::type DifferenceType; - DifferenceType signed_n = n; -#pragma omp parallel for - for(DifferenceType i = 0; - i < signed_n; - ++i) - { - RandomAccessIterator temp = first + i; - wrapped_f(*temp); - } -#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE - - return first + n; -} // end for_each_n() - -template - RandomAccessIterator for_each(execution_policy &s, - RandomAccessIterator first, - RandomAccessIterator last, - UnaryFunction f) -{ - return omp::detail::for_each_n(s, first, thrust::distance(first,last), f); -} // end for_each() - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/omp/detail/gather.h b/compat/thrust/system/omp/detail/gather.h deleted file mode 100644 index dfb7d7fc2d..0000000000 --- a/compat/thrust/system/omp/detail/gather.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits gather -#include - diff --git a/compat/thrust/system/omp/detail/generate.h b/compat/thrust/system/omp/detail/generate.h deleted file mode 100644 index 0cb33b9336..0000000000 --- a/compat/thrust/system/omp/detail/generate.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -// this system inherits generate -#include - diff --git a/compat/thrust/system/omp/detail/get_value.h b/compat/thrust/system/omp/detail/get_value.h deleted file mode 100644 index e376e65749..0000000000 --- a/compat/thrust/system/omp/detail/get_value.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits get_value -#include - diff --git a/compat/thrust/system/omp/detail/inner_product.h b/compat/thrust/system/omp/detail/inner_product.h deleted file mode 100644 index 351421a577..0000000000 --- a/compat/thrust/system/omp/detail/inner_product.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits inner_product -#include - diff --git a/compat/thrust/system/omp/detail/iter_swap.h b/compat/thrust/system/omp/detail/iter_swap.h deleted file mode 100644 index 16176ec69b..0000000000 --- a/compat/thrust/system/omp/detail/iter_swap.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits iter_swap -#include - diff --git a/compat/thrust/system/omp/detail/logical.h b/compat/thrust/system/omp/detail/logical.h deleted file mode 100644 index b2a80de70f..0000000000 --- a/compat/thrust/system/omp/detail/logical.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits logical -#include - diff --git a/compat/thrust/system/omp/detail/malloc_and_free.h b/compat/thrust/system/omp/detail/malloc_and_free.h deleted file mode 100644 index 811a552a4f..0000000000 --- a/compat/thrust/system/omp/detail/malloc_and_free.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits malloc and free -#include - diff --git a/compat/thrust/system/omp/detail/memory.inl b/compat/thrust/system/omp/detail/memory.inl deleted file mode 100644 index 7d53de60a1..0000000000 --- a/compat/thrust/system/omp/detail/memory.inl +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ - - -template - template - reference & - reference - ::operator=(const reference &other) -{ - return super_t::operator=(other); -} // end reference::operator=() - -template - reference & - reference - ::operator=(const value_type &x) -{ - return super_t::operator=(x); -} // end reference::operator=() - -template -__host__ __device__ -void swap(reference a, reference b) -{ - a.swap(b); -} // end swap() - -namespace detail -{ - -// XXX circular #inclusion problems cause the compiler to believe that cpp::malloc -// is not defined -// WAR the problem by using adl to call cpp::malloc, which requires it to depend -// on a template parameter -template - pointer malloc_workaround(Tag t, std::size_t n) -{ - return pointer(malloc(t, n)); -} // end malloc_workaround() - -// XXX circular #inclusion problems cause the compiler to believe that cpp::free -// is not defined -// WAR the problem by using adl to call cpp::free, which requires it to depend -// on a template parameter -template - void free_workaround(Tag t, pointer ptr) -{ - free(t, ptr.get()); -} // end free_workaround() - -} // end detail - -inline pointer malloc(std::size_t n) -{ - // XXX this is how we'd like to implement this function, - // if not for circular #inclusion problems: - // - // return pointer(thrust::system::cpp::malloc(n)) - // - return detail::malloc_workaround(cpp::tag(), n); -} // end malloc() - -template -pointer malloc(std::size_t n) -{ - pointer raw_ptr = thrust::system::omp::malloc(sizeof(T) * n); - return pointer(reinterpret_cast(raw_ptr.get())); -} // end malloc() - -inline void free(pointer ptr) -{ - // XXX this is how we'd like to implement this function, - // if not for circular #inclusion problems: - // - // thrust::system::cpp::free(ptr) - // - detail::free_workaround(cpp::tag(), ptr); -} // end free() - -} // end omp -} // end system -} // end thrust - diff --git a/compat/thrust/system/omp/detail/merge.h b/compat/thrust/system/omp/detail/merge.h deleted file mode 100644 index a7047aa028..0000000000 --- a/compat/thrust/system/omp/detail/merge.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits merge -#include - diff --git a/compat/thrust/system/omp/detail/mismatch.h b/compat/thrust/system/omp/detail/mismatch.h deleted file mode 100644 index 03980cfcd5..0000000000 --- a/compat/thrust/system/omp/detail/mismatch.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
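The malloc_workaround/free_workaround helpers in the deleted memory.inl above lean on a general C++ idiom: a call whose arguments depend on a template parameter is only looked up when the template is instantiated, so a function that has not been declared yet at the point of definition can still be found later through argument-dependent lookup. Below is a minimal standalone sketch of that idiom; the names (sys::tag, do_alloc, alloc_workaround) are invented for illustration and nothing here touches Thrust itself.

#include <cstddef>
#include <cstdio>
#include <cstdlib>

namespace sys
{

struct tag {};

// Deliberately defined before do_alloc is declared.  Because the call depends
// on the template parameter Tag, name lookup is deferred to instantiation time
// and resolved via ADL on 't' -- the same trick memory.inl uses for cpp::malloc.
template <typename Tag>
void *alloc_workaround(Tag t, std::size_t n)
{
  return do_alloc(t, n);
}

// Declared afterwards; ADL still finds it because tag lives in namespace sys.
inline void *do_alloc(tag, std::size_t n)
{
  return std::malloc(n);
}

} // namespace sys

int main()
{
  void *p = sys::alloc_workaround(sys::tag(), 64); // instantiation happens here
  std::printf("got %p\n", p);
  std::free(p);
  return 0;
}
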
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits mismatch -#include - diff --git a/compat/thrust/system/omp/detail/par.h b/compat/thrust/system/omp/detail/par.h deleted file mode 100644 index fa6d18e64c..0000000000 --- a/compat/thrust/system/omp/detail/par.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -struct par_t : thrust::system::omp::detail::execution_policy -{ - par_t() : thrust::system::omp::detail::execution_policy() {} - - template - thrust::detail::execute_with_allocator - operator()(Allocator &alloc) const - { - return thrust::detail::execute_with_allocator(alloc); - } -}; - - -} // end detail - - -static const detail::par_t par; - - -} // end omp -} // end system - - -// alias par here -namespace omp -{ - - -using thrust::system::omp::par; - - -} // end omp -} // end thrust - diff --git a/compat/thrust/system/omp/detail/partition.h b/compat/thrust/system/omp/detail/partition.h deleted file mode 100644 index edcbc30834..0000000000 --- a/compat/thrust/system/omp/detail/partition.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce.h - * \brief OpenMP implementation of reduce algorithms. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - ForwardIterator stable_partition(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - -template - ForwardIterator stable_partition(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - -template - thrust::pair - stable_partition_copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - -template - thrust::pair - stable_partition_copy(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/partition.inl b/compat/thrust/system/omp/detail/partition.inl deleted file mode 100644 index da629e5c69..0000000000 --- a/compat/thrust/system/omp/detail/partition.inl +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce.h - * \brief OpenMP implementation of reduce algorithms. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - ForwardIterator stable_partition(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - // omp prefers generic::stable_partition to cpp::stable_partition - return thrust::system::detail::generic::stable_partition(exec, first, last, pred); -} // end stable_partition() - - -template - ForwardIterator stable_partition(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - // omp prefers generic::stable_partition to cpp::stable_partition - return thrust::system::detail::generic::stable_partition(exec, first, last, stencil, pred); -} // end stable_partition() - - -template - thrust::pair - stable_partition_copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - // omp prefers generic::stable_partition_copy to cpp::stable_partition_copy - return thrust::system::detail::generic::stable_partition_copy(exec, first, last, out_true, out_false, pred); -} // end stable_partition_copy() - - -template - thrust::pair - stable_partition_copy(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - // omp prefers generic::stable_partition_copy to cpp::stable_partition_copy - return thrust::system::detail::generic::stable_partition_copy(exec, first, last, stencil, out_true, out_false, pred); -} // end stable_partition_copy() - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/omp/detail/reduce.h b/compat/thrust/system/omp/detail/reduce.h deleted file mode 100644 index 0cc5cebc2f..0000000000 --- a/compat/thrust/system/omp/detail/reduce.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce.h - * \brief OpenMP implementation of reduce algorithms. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - OutputType reduce(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputType init, - BinaryFunction binary_op); - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/reduce.inl b/compat/thrust/system/omp/detail/reduce.inl deleted file mode 100644 index 1347bfd004..0000000000 --- a/compat/thrust/system/omp/detail/reduce.inl +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - OutputType reduce(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputType init, - BinaryFunction binary_op) -{ - typedef typename thrust::iterator_difference::type difference_type; - - const difference_type n = thrust::distance(first,last); - - // determine first and second level decomposition - thrust::system::detail::internal::uniform_decomposition decomp1 = thrust::system::omp::detail::default_decomposition(n); - thrust::system::detail::internal::uniform_decomposition decomp2(decomp1.size() + 1, 1, 1); - - // allocate storage for the initializer and partial sums - // XXX use select_system for Tag - thrust::detail::temporary_array partial_sums(exec, decomp1.size() + 1); - - // set first element of temp array to init - partial_sums[0] = init; - - // accumulate partial sums (first level reduction) - thrust::system::omp::detail::reduce_intervals(exec, first, partial_sums.begin() + 1, binary_op, decomp1); - - // reduce partial sums (second level reduction) - thrust::system::omp::detail::reduce_intervals(exec, partial_sums.begin(), partial_sums.begin(), binary_op, decomp2); - - return partial_sums[0]; -} // end reduce() - - -} // end detail -} // end omp -} // end system -} // end thrust - diff --git a/compat/thrust/system/omp/detail/reduce_by_key.h b/compat/thrust/system/omp/detail/reduce_by_key.h deleted file mode 100644 index d7243ee0ca..0000000000 --- a/compat/thrust/system/omp/detail/reduce_by_key.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! 
\file reduce.h - * \brief OpenMP implementation of reduce algorithms. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - thrust::pair - reduce_by_key(execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op); - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/reduce_by_key.inl b/compat/thrust/system/omp/detail/reduce_by_key.inl deleted file mode 100644 index 91402d8280..0000000000 --- a/compat/thrust/system/omp/detail/reduce_by_key.inl +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template - thrust::pair - reduce_by_key(execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - // omp prefers generic::reduce_by_key to cpp::reduce_by_key - return thrust::system::detail::generic::reduce_by_key(exec, keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); -} // end reduce_by_key() - - -} // end detail -} // end omp -} // end system -} // end thrust - diff --git a/compat/thrust/system/omp/detail/reduce_intervals.h b/compat/thrust/system/omp/detail/reduce_intervals.h deleted file mode 100644 index 7bce2074a3..0000000000 --- a/compat/thrust/system/omp/detail/reduce_intervals.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce_intervals.h - * \brief OpenMP implementations of reduce_intervals algorithms. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template -void reduce_intervals(execution_policy &exec, - InputIterator input, - OutputIterator output, - BinaryFunction binary_op, - Decomposition decomp); - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/reduce_intervals.inl b/compat/thrust/system/omp/detail/reduce_intervals.inl deleted file mode 100644 index 0752b8aab9..0000000000 --- a/compat/thrust/system/omp/detail/reduce_intervals.inl +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template -void reduce_intervals(execution_policy &, - InputIterator input, - OutputIterator output, - BinaryFunction binary_op, - Decomposition decomp) -{ - // we're attempting to launch an omp kernel, assert we're compiling with omp support - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to enable OpenMP support in your compiler. X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) - typedef typename thrust::iterator_value::type OutputType; - - // wrap binary_op - thrust::detail::host_function wrapped_binary_op(binary_op); - - typedef thrust::detail::intptr_t index_type; - - index_type n = static_cast(decomp.size()); - -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) -# pragma omp parallel for -#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE - for(index_type i = 0; i < n; i++) - { - InputIterator begin = input + decomp[i].begin(); - InputIterator end = input + decomp[i].end(); - - if (begin != end) - { - OutputType sum = thrust::raw_reference_cast(*begin); - - ++begin; - - while (begin != end) - { - sum = wrapped_binary_op(sum, *begin); - ++begin; - } - - OutputIterator tmp = output + i; - *tmp = sum; - } - } -#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE -} - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/omp/detail/remove.h b/compat/thrust/system/omp/detail/remove.h deleted file mode 100644 index ebcb49613c..0000000000 --- a/compat/thrust/system/omp/detail/remove.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
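reduce.inl above assembles its result in two passes: a uniform decomposition splits the input into one interval per thread, reduce_intervals.inl accumulates each interval into a partial-sums array (with the caller's init value parked in slot 0), and a second, single-interval pass folds the partial sums back into one value. The following is a rough standalone illustration of that shape using raw OpenMP and plain int addition; the interval arithmetic and the names sum_two_level/partials are mine, not Thrust's, and an OpenMP-enabled compiler (e.g. -fopenmp) is assumed.

#include <cstddef>
#include <cstdio>
#include <vector>
#include <omp.h>

// Two-level reduction in the spirit of reduce.inl / reduce_intervals.inl:
// level 1 reduces one contiguous interval per thread in parallel, level 2
// folds the partial sums (plus the init value) serially.
int sum_two_level(const std::vector<int> &data, int init)
{
  const int intervals = omp_get_max_threads();
  const std::size_t n = data.size();

  // partials[0] holds init, partials[i + 1] the sum of interval i.
  std::vector<int> partials(intervals + 1, 0);
  partials[0] = init;

  // First level: every interval is reduced independently.
  #pragma omp parallel for
  for (int i = 0; i < intervals; ++i)
  {
    const std::size_t begin = n * i / intervals;
    const std::size_t end   = n * (i + 1) / intervals;
    int sum = 0;
    for (std::size_t j = begin; j < end; ++j)
      sum += data[j];
    partials[i + 1] = sum;
  }

  // Second level: fold the partials into partials[0].
  for (int i = 1; i <= intervals; ++i)
    partials[0] += partials[i];

  return partials[0];
}

int main()
{
  std::vector<int> ones(1000, 1);
  std::printf("%d\n", sum_two_level(ones, 5)); // prints 1005
  return 0;
}
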
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template - ForwardIterator remove_if(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - - -template - ForwardIterator remove_if(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - - -template - OutputIterator remove_copy_if(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred); - - -template - OutputIterator remove_copy_if(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/remove.inl b/compat/thrust/system/omp/detail/remove.inl deleted file mode 100644 index c056f967e4..0000000000 --- a/compat/thrust/system/omp/detail/remove.inl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template - ForwardIterator remove_if(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - // omp prefers generic::remove_if to cpp::remove_if - return thrust::system::detail::generic::remove_if(exec, first, last, pred); -} - - -template - ForwardIterator remove_if(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - // omp prefers generic::remove_if to cpp::remove_if - return thrust::system::detail::generic::remove_if(exec, first, last, stencil, pred); -} - - -template - OutputIterator remove_copy_if(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred) -{ - // omp prefers generic::remove_copy_if to cpp::remove_copy_if - return thrust::system::detail::generic::remove_copy_if(exec, first, last, result, pred); -} - -template - OutputIterator remove_copy_if(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - // omp prefers generic::remove_copy_if to cpp::remove_copy_if - return thrust::system::detail::generic::remove_copy_if(exec, first, last, stencil, result, pred); -} - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/omp/detail/replace.h b/compat/thrust/system/omp/detail/replace.h deleted file mode 100644 index c48555d0e1..0000000000 --- a/compat/thrust/system/omp/detail/replace.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits this algorithm -#include - diff --git a/compat/thrust/system/omp/detail/reverse.h b/compat/thrust/system/omp/detail/reverse.h deleted file mode 100644 index 04923d1f6a..0000000000 --- a/compat/thrust/system/omp/detail/reverse.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -// this system inherits reverse -#include - diff --git a/compat/thrust/system/omp/detail/scan.h b/compat/thrust/system/omp/detail/scan.h deleted file mode 100644 index c105951fff..0000000000 --- a/compat/thrust/system/omp/detail/scan.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits scan -#include - diff --git a/compat/thrust/system/omp/detail/scan_by_key.h b/compat/thrust/system/omp/detail/scan_by_key.h deleted file mode 100644 index bfbd5d69bd..0000000000 --- a/compat/thrust/system/omp/detail/scan_by_key.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits this algorithm -#include - diff --git a/compat/thrust/system/omp/detail/scatter.h b/compat/thrust/system/omp/detail/scatter.h deleted file mode 100644 index c48555d0e1..0000000000 --- a/compat/thrust/system/omp/detail/scatter.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits this algorithm -#include - diff --git a/compat/thrust/system/omp/detail/sequence.h b/compat/thrust/system/omp/detail/sequence.h deleted file mode 100644 index 811d8f5fbb..0000000000 --- a/compat/thrust/system/omp/detail/sequence.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits sequence -#include - diff --git a/compat/thrust/system/omp/detail/set_operations.h b/compat/thrust/system/omp/detail/set_operations.h deleted file mode 100644 index 687edb2e7d..0000000000 --- a/compat/thrust/system/omp/detail/set_operations.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits set_operations -#include - diff --git a/compat/thrust/system/omp/detail/sort.h b/compat/thrust/system/omp/detail/sort.h deleted file mode 100644 index 9a480f2799..0000000000 --- a/compat/thrust/system/omp/detail/sort.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template -void stable_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - -template -void stable_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp); - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/sort.inl b/compat/thrust/system/omp/detail/sort.inl deleted file mode 100644 index ab4f4a1bbe..0000000000 --- a/compat/thrust/system/omp/detail/sort.inl +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#include - -// don't attempt to #include this file without omp support -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) -#include -#endif // omp support - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ -namespace sort_detail -{ - - -template -void inplace_merge(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator middle, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_value::type value_type; - - thrust::detail::temporary_array a(exec, first, middle); - thrust::detail::temporary_array b(exec, middle, last); - - thrust::system::cpp::detail::merge(exec, a.begin(), a.end(), b.begin(), b.end(), first, comp); -} - - -template -void inplace_merge_by_key(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 middle1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_value::type value_type1; - typedef typename thrust::iterator_value::type value_type2; - - RandomAccessIterator2 middle2 = first2 + (middle1 - first1); - RandomAccessIterator2 last2 = first2 + (last1 - first1); - - thrust::detail::temporary_array lhs1(exec, first1, middle1); - thrust::detail::temporary_array rhs1(exec, middle1, last1); - thrust::detail::temporary_array lhs2(exec, first2, middle2); - thrust::detail::temporary_array rhs2(exec, middle2, last2); - - thrust::system::cpp::detail::merge_by_key - (exec, - lhs1.begin(), lhs1.end(), rhs1.begin(), rhs1.end(), - lhs2.begin(), rhs2.begin(), - first1, first2, comp); -} - - -} // end sort_detail - - -template -void stable_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - // we're attempting to launch an omp kernel, assert we're compiling with omp support - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to enable OpenMP support in your compiler. 
X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) - typedef typename thrust::iterator_difference::type IndexType; - - if (first == last) - return; - - #pragma omp parallel - { - thrust::system::detail::internal::uniform_decomposition decomp(last - first, 1, omp_get_num_threads()); - - // process id - IndexType p_i = omp_get_thread_num(); - - // every thread sorts its own tile - if (p_i < decomp.size()) - { - thrust::system::cpp::detail::stable_sort(exec, - first + decomp[p_i].begin(), - first + decomp[p_i].end(), - comp); - } - - #pragma omp barrier - - IndexType nseg = decomp.size(); - IndexType h = 2; - - // keep track of which sub-range we're processing - IndexType a=p_i, b=p_i, c=p_i+1; - - while( nseg>1 ) - { - if(c >= decomp.size()) - c = decomp.size() - 1; - - if((p_i % h) == 0 && c > b) - { - thrust::system::omp::detail::sort_detail::inplace_merge - (exec, - first + decomp[a].begin(), - first + decomp[b].end(), - first + decomp[c].end(), - comp); - b = c; - c += h; - } - - nseg = (nseg + 1) / 2; - h *= 2; - - #pragma omp barrier - } - } -#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE -} - - -template -void stable_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp) -{ - // we're attempting to launch an omp kernel, assert we're compiling with omp support - // ======================================================================== - // X Note to the user: If you've found this line due to a compiler error, X - // X you need to enable OpenMP support in your compiler. 
X - // ======================================================================== - THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); - -#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE) - typedef typename thrust::iterator_difference::type IndexType; - - if (keys_first == keys_last) - return; - - #pragma omp parallel - { - thrust::system::detail::internal::uniform_decomposition decomp(keys_last - keys_first, 1, omp_get_num_threads()); - - // process id - IndexType p_i = omp_get_thread_num(); - - // every thread sorts its own tile - if (p_i < decomp.size()) - { - thrust::system::cpp::detail::stable_sort_by_key(exec, - keys_first + decomp[p_i].begin(), - keys_first + decomp[p_i].end(), - values_first + decomp[p_i].begin(), - comp); - } - - #pragma omp barrier - - IndexType nseg = decomp.size(); - IndexType h = 2; - - // keep track of which sub-range we're processing - IndexType a=p_i, b=p_i, c=p_i+1; - - while( nseg>1 ) - { - if(c >= decomp.size()) - c = decomp.size() - 1; - - if((p_i % h) == 0 && c > b) - { - thrust::system::omp::detail::sort_detail::inplace_merge_by_key - (exec, - keys_first + decomp[a].begin(), - keys_first + decomp[b].end(), - keys_first + decomp[c].end(), - values_first + decomp[a].begin(), - comp); - b = c; - c += h; - } - - nseg = (nseg + 1) / 2; - h *= 2; - - #pragma omp barrier - } - } -#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE -} - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/omp/detail/swap_ranges.h b/compat/thrust/system/omp/detail/swap_ranges.h deleted file mode 100644 index e683aaaa6e..0000000000 --- a/compat/thrust/system/omp/detail/swap_ranges.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// omp inherits swap_ranges -#include - diff --git a/compat/thrust/system/omp/detail/tabulate.h b/compat/thrust/system/omp/detail/tabulate.h deleted file mode 100644 index da65d8e44d..0000000000 --- a/compat/thrust/system/omp/detail/tabulate.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -// this system inherits tabulate -#include - diff --git a/compat/thrust/system/omp/detail/temporary_buffer.h b/compat/thrust/system/omp/detail/temporary_buffer.h deleted file mode 100644 index 628bd75719..0000000000 --- a/compat/thrust/system/omp/detail/temporary_buffer.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special temporary buffer functions - diff --git a/compat/thrust/system/omp/detail/transform.h b/compat/thrust/system/omp/detail/transform.h deleted file mode 100644 index 70ce1f41b6..0000000000 --- a/compat/thrust/system/omp/detail/transform.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// omp inherits transform -#include - diff --git a/compat/thrust/system/omp/detail/transform_reduce.h b/compat/thrust/system/omp/detail/transform_reduce.h deleted file mode 100644 index 23ed07054a..0000000000 --- a/compat/thrust/system/omp/detail/transform_reduce.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits transform_reduce -#include - diff --git a/compat/thrust/system/omp/detail/transform_scan.h b/compat/thrust/system/omp/detail/transform_scan.h deleted file mode 100644 index fc2e55d0c0..0000000000 --- a/compat/thrust/system/omp/detail/transform_scan.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits transform_scan -#include - diff --git a/compat/thrust/system/omp/detail/uninitialized_copy.h b/compat/thrust/system/omp/detail/uninitialized_copy.h deleted file mode 100644 index 944f4baf0e..0000000000 --- a/compat/thrust/system/omp/detail/uninitialized_copy.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits uninitialized_copy -#include - diff --git a/compat/thrust/system/omp/detail/uninitialized_fill.h b/compat/thrust/system/omp/detail/uninitialized_fill.h deleted file mode 100644 index b9d6de20fa..0000000000 --- a/compat/thrust/system/omp/detail/uninitialized_fill.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits uninitialized_fill -#include - diff --git a/compat/thrust/system/omp/detail/unique.h b/compat/thrust/system/omp/detail/unique.h deleted file mode 100644 index 60c617bee9..0000000000 --- a/compat/thrust/system/omp/detail/unique.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - ForwardIterator unique(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred); - - -template - OutputIterator unique_copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate binary_pred); - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/unique.inl b/compat/thrust/system/omp/detail/unique.inl deleted file mode 100644 index d66ac3bf4f..0000000000 --- a/compat/thrust/system/omp/detail/unique.inl +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - ForwardIterator unique(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred) -{ - // omp prefers generic::unique to cpp::unique - return thrust::system::detail::generic::unique(exec,first,last,binary_pred); -} // end unique() - - -template - OutputIterator unique_copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate binary_pred) -{ - // omp prefers generic::unique_copy to cpp::unique_copy - return thrust::system::detail::generic::unique_copy(exec,first,last,output,binary_pred); -} // end unique_copy() - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/omp/detail/unique_by_key.h b/compat/thrust/system/omp/detail/unique_by_key.h deleted file mode 100644 index 8fdde66edf..0000000000 --- a/compat/thrust/system/omp/detail/unique_by_key.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - thrust::pair - unique_by_key(execution_policy &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred); - - -template - thrust::pair - unique_by_key_copy(execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred); - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/omp/detail/unique_by_key.inl b/compat/thrust/system/omp/detail/unique_by_key.inl deleted file mode 100644 index 644b5ed6aa..0000000000 --- a/compat/thrust/system/omp/detail/unique_by_key.inl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - - -template - thrust::pair - unique_by_key(execution_policy &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred) -{ - // omp prefers generic::unique_by_key to cpp::unique_by_key - return thrust::system::detail::generic::unique_by_key(exec,keys_first,keys_last,values_first,binary_pred); -} // end unique_by_key() - - -template - thrust::pair - unique_by_key_copy(execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred) -{ - // omp prefers generic::unique_by_key_copy to cpp::unique_by_key_copy - return thrust::system::detail::generic::unique_by_key_copy(exec,keys_first,keys_last,values_first,keys_output,values_output,binary_pred); -} // end unique_by_key_copy() - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/omp/detail/vector.inl b/compat/thrust/system/omp/detail/vector.inl deleted file mode 100644 index 32c845c4ad..0000000000 --- a/compat/thrust/system/omp/detail/vector.inl +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ - -template - vector - ::vector() - : super_t() -{} - -template - vector - ::vector(size_type n) - : super_t(n) -{} - -template - vector - ::vector(size_type n, const value_type &value) - : super_t(n,value) -{} - -template - vector - ::vector(const vector &x) - : super_t(x) -{} - -template - template - vector - ::vector(const thrust::detail::vector_base &x) - : super_t(x) -{} - -template - template - vector - ::vector(const std::vector &x) - : super_t(x) -{} - -template - template - vector - ::vector(InputIterator first, InputIterator last) - : super_t(first,last) -{} - -template - template - vector & - vector - ::operator=(const std::vector &x) -{ - super_t::operator=(x); - return *this; -} - -template - template - vector & - vector - ::operator=(const thrust::detail::vector_base &x) -{ - super_t::operator=(x); - return *this; -} - -} // end omp -} // end system -} // end thrust - diff --git a/compat/thrust/system/omp/execution_policy.h b/compat/thrust/system/omp/execution_policy.h deleted file mode 100644 index 7d5d1d80d3..0000000000 --- a/compat/thrust/system/omp/execution_policy.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -/*! \file thrust/system/omp/execution_policy.h - * \brief Execution policies for Thrust's OpenMP system. - */ - -#include - -// get the execution policies definitions first -#include - -// get the definition of par -#include - -// now get all the algorithm definitions - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -// define these entities here for the purpose of Doxygenating them -// they are actually defined elsewhere -#if 0 -namespace thrust -{ -namespace system -{ -namespace omp -{ - - -/*! \addtogroup execution_policies - * \{ - */ - - -/*! \p thrust::omp::execution_policy is the base class for all Thrust parallel execution - * policies which are derived from Thrust's OpenMP backend system. - */ -template -struct execution_policy : thrust::execution_policy -{}; - - -/*! \p omp::tag is a type representing Thrust's standard C++ backend system in C++'s type system. - * Iterators "tagged" with a type which is convertible to \p omp::tag assert that they may be - * "dispatched" to algorithm implementations in the \p omp system. - */ -struct tag : thrust::system::omp::execution_policy { unspecified }; - - -/*! 
\p thrust::omp::par is the parallel execution policy associated with Thrust's OpenMP - * backend system. - * - * Instead of relying on implicit algorithm dispatch through iterator system tags, users may - * directly target Thrust's OpenMP backend system by providing \p thrust::omp::par as an algorithm - * parameter. - * - * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such - * as \p thrust::omp::vector. - * - * The type of \p thrust::omp::par is implementation-defined. - * - * The following code snippet demonstrates how to use \p thrust::omp::par to explicitly dispatch an - * invocation of \p thrust::for_each to the OpenMP backend system: - * - * \code - * #include - * #include - * #include - * - * struct printf_functor - * { - * __host__ __device__ - * void operator()(int x) - * { - * printf("%d\n"); - * } - * }; - * ... - * int vec[3]; - * vec[0] = 0; vec[1] = 1; vec[2] = 2; - * - * thrust::for_each(thrust::omp::par, vec.begin(), vec.end(), printf_functor()); - * - * // 0 1 2 is printed to standard output in some unspecified order - * \endcode - */ -static const unspecified par; - - -/*! \} - */ - - -} // end cpp -} // end system -} // end thrust -#endif - - diff --git a/compat/thrust/system/omp/memory.h b/compat/thrust/system/omp/memory.h deleted file mode 100644 index 0a23434833..0000000000 --- a/compat/thrust/system/omp/memory.h +++ /dev/null @@ -1,414 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/system/omp/memory.h - * \brief Managing memory associated with Thrust's OpenMP system. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ - -template class pointer; - -} // end omp -} // end system -} // end thrust - - -/*! \cond - */ - -// specialize std::iterator_traits to avoid problems with the name of -// pointer's constructor shadowing its nested pointer type -// do this before pointer is defined so the specialization is correctly -// used inside the definition -namespace std -{ - -template - struct iterator_traits > -{ - private: - typedef thrust::system::omp::pointer ptr; - - public: - typedef typename ptr::iterator_category iterator_category; - typedef typename ptr::value_type value_type; - typedef typename ptr::difference_type difference_type; - typedef ptr pointer; - typedef typename ptr::reference reference; -}; // end iterator_traits - -} // end std - -/*! \endcond - */ - - -namespace thrust -{ -namespace system -{ - -/*! \addtogroup system_backends Systems - * \ingroup system - * \{ - */ - -/*! \namespace thrust::system::omp - * \brief \p thrust::system::omp is the namespace containing functionality for allocating, manipulating, - * and deallocating memory available to Thrust's OpenMP backend system. 
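// Compilable restatement of the thrust::omp::par snippet documented above
// (illustrative, requires a Thrust installation with its OpenMP backend).
// The #include arguments were stripped during extraction, so the headers are
// our reconstruction; we also pass `x` to printf (the snippet omits it) and
// iterate with raw pointers, since `vec` is a plain C array.
#include <thrust/system/omp/execution_policy.h>
#include <thrust/for_each.h>
#include <cstdio>

struct printf_functor
{
  __host__ __device__
  void operator()(int x)
  {
    printf("%d\n", x);
  }
};

int main()
{
  int vec[3] = {0, 1, 2};

  // Explicit dispatch: thrust::omp::par, not the iterator tag, selects the
  // OpenMP backend, so plain pointers are acceptable iterators here.
  thrust::for_each(thrust::omp::par, vec, vec + 3, printf_functor());

  // 0, 1 and 2 are printed to standard output in some unspecified order.
  return 0;
}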
- * The identifiers are provided in a separate namespace underneath thrust::system - * for import convenience but are also aliased in the top-level thrust::omp - * namespace for easy access. - * - */ -namespace omp -{ - -// forward declaration of reference for pointer -template class reference; - -/*! \cond - */ - -// XXX nvcc + msvc have trouble instantiating reference below -// this is a workaround -namespace detail -{ - -template - struct reference_msvc_workaround -{ - typedef thrust::system::omp::reference type; -}; // end reference_msvc_workaround - -} // end detail - -/*! \endcond - */ - - -/*! \p pointer stores a pointer to an object allocated in memory available to the omp system. - * This type provides type safety when dispatching standard algorithms on ranges resident - * in omp memory. - * - * \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic. - * - * \p pointer can be created with the function \p omp::malloc, or by explicitly calling its constructor - * with a raw pointer. - * - * The raw pointer encapsulated by a \p pointer may be obtained by eiter its get member function - * or the \p raw_pointer_cast function. - * - * \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory - * pointed to by \p pointer. - * - * \tparam T specifies the type of the pointee. - * - * \see omp::malloc - * \see omp::free - * \see raw_pointer_cast - */ -template - class pointer - : public thrust::pointer< - T, - thrust::system::omp::tag, - thrust::system::omp::reference, - thrust::system::omp::pointer - > -{ - /*! \cond - */ - - private: - typedef thrust::pointer< - T, - thrust::system::omp::tag, - //thrust::system::omp::reference, - typename detail::reference_msvc_workaround::type, - thrust::system::omp::pointer - > super_t; - - /*! \endcond - */ - - public: - // note that omp::pointer's member functions need __host__ __device__ - // to interoperate with nvcc + iterators' dereference member function - - /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0. - */ - __host__ __device__ - pointer() : super_t() {} - - /*! This constructor allows construction of a pointer from a T*. - * - * \param ptr A raw pointer to copy from, presumed to point to a location in memory - * accessible by the \p omp system. - * \tparam OtherT \p OtherT shall be convertible to \p T. - */ - template - __host__ __device__ - explicit pointer(OtherT *ptr) : super_t(ptr) {} - - /*! This constructor allows construction from another pointer-like object with related type. - * - * \param other The \p OtherPointer to copy. - * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible - * to \p thrust::system::omp::tag and its element type shall be convertible to \p T. - */ - template - __host__ __device__ - pointer(const OtherPointer &other, - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer - >::type * = 0) : super_t(other) {} - - /*! Assignment operator allows assigning from another pointer-like object with related type. - * - * \param other The other pointer-like object to assign from. - * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible - * to \p thrust::system::omp::tag and its element type shall be convertible to \p T. 
- */ - template - __host__ __device__ - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer, - pointer & - >::type - operator=(const OtherPointer &other) - { - return super_t::operator=(other); - } -}; // end pointer - - -/*! \p reference is a wrapped reference to an object stored in memory available to the \p omp system. - * \p reference is the type of the result of dereferencing a \p omp::pointer. - * - * \tparam T Specifies the type of the referenced object. - */ -template - class reference - : public thrust::reference< - T, - thrust::system::omp::pointer, - thrust::system::omp::reference - > -{ - /*! \cond - */ - - private: - typedef thrust::reference< - T, - thrust::system::omp::pointer, - thrust::system::omp::reference - > super_t; - - /*! \endcond - */ - - public: - /*! \cond - */ - - typedef typename super_t::value_type value_type; - typedef typename super_t::pointer pointer; - - /*! \endcond - */ - - /*! This constructor initializes this \p reference to refer to an object - * pointed to by the given \p pointer. After this \p reference is constructed, - * it shall refer to the object pointed to by \p ptr. - * - * \param ptr A \p pointer to copy from. - */ - __host__ __device__ - explicit reference(const pointer &ptr) - : super_t(ptr) - {} - - /*! This constructor accepts a const reference to another \p reference of related type. - * After this \p reference is constructed, it shall refer to the same object as \p other. - * - * \param other A \p reference to copy from. - * \tparam OtherT The element type of the other \p reference. - * - * \note This constructor is templated primarily to allow initialization of reference - * from reference. - */ - template - __host__ __device__ - reference(const reference &other, - typename thrust::detail::enable_if_convertible< - typename reference::pointer, - pointer - >::type * = 0) - : super_t(other) - {} - - /*! Copy assignment operator copy assigns from another \p reference of related type. - * - * \param other The other \p reference to assign from. - * \return *this - * \tparam OtherT The element type of the other \p reference. - */ - template - reference &operator=(const reference &other); - - /*! Assignment operator assigns from a \p value_type. - * - * \param x The \p value_type to assign from. - * \return *this - */ - reference &operator=(const value_type &x); -}; // end reference - -/*! Exchanges the values of two objects referred to by \p reference. - * \p x The first \p reference of interest. - * \p y The second \p reference ot interest. - */ -template -__host__ __device__ -void swap(reference x, reference y); - -/*! Allocates an area of memory available to Thrust's omp system. - * \param n Number of bytes to allocate. - * \return A omp::pointer pointing to the beginning of the newly - * allocated memory. A null omp::pointer is returned if - * an error occurs. - * \note The omp::pointer returned by this function must be - * deallocated with \p omp::free. - * \see omp::free - * \see std::malloc - */ -inline pointer malloc(std::size_t n); - -/*! Allocates a typed area of memory available to Thrust's omp system. - * \param n Number of elements to allocate. - * \return A omp::pointer pointing to the beginning of the newly - * allocated memory. A null omp::pointer is returned if - * an error occurs. - * \note The omp::pointer returned by this function must be - * deallocated with \p omp::free. - * \see omp::free - * \see std::malloc - */ -template -inline pointer malloc(std::size_t n); - -/*! 
Deallocates an area of memory previously allocated by omp::malloc. - * \param ptr A omp::pointer pointing to the beginning of an area - * of memory previously allocated with omp::malloc. - * \see omp::malloc - * \see std::free - */ -inline void free(pointer ptr); - -// XXX upon c++11 -// template using allocator = thrust::detail::malloc_allocator >; - -/*! \p omp::allocator is the default allocator used by the \p omp system's containers such as - * omp::vector if no user-specified allocator is provided. \p omp::allocator allocates - * (deallocates) storage with \p omp::malloc (\p omp::free). - */ -template - struct allocator - : thrust::detail::malloc_allocator< - T, - tag, - pointer - > -{ - /*! The \p rebind metafunction provides the type of an \p allocator - * instantiated with another type. - * - * \tparam U The other type to use for instantiation. - */ - template - struct rebind - { - /*! The typedef \p other gives the type of the rebound \p allocator. - */ - typedef allocator other; - }; - - /*! No-argument constructor has no effect. - */ - __host__ __device__ - inline allocator() {} - - /*! Copy constructor has no effect. - */ - __host__ __device__ - inline allocator(const allocator &) {} - - /*! Constructor from other \p allocator has no effect. - */ - template - __host__ __device__ - inline allocator(const allocator &) {} - - /*! Destructor has no effect. - */ - __host__ __device__ - inline ~allocator() {} -}; // end allocator - -} // end omp - -/*! \} - */ - -} // end system - -/*! \namespace thrust::omp - * \brief \p thrust::omp is a top-level alias for thrust::system::omp. - */ -namespace omp -{ - -using thrust::system::omp::pointer; -using thrust::system::omp::reference; -using thrust::system::omp::malloc; -using thrust::system::omp::free; -using thrust::system::omp::allocator; - -} // end omp - -} // end thrust - -#include - diff --git a/compat/thrust/system/omp/vector.h b/compat/thrust/system/omp/vector.h deleted file mode 100644 index 5f45a9169b..0000000000 --- a/compat/thrust/system/omp/vector.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/system/omp/vector.h - * \brief A dynamically-sizable array of elements which reside in memory available to - * Thrust's OpenMP system. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ - -// forward declaration of host_vector -// XXX why is this here? it doesn't seem necessary for anything below -template class host_vector; - -namespace system -{ -namespace omp -{ - -// XXX upon c++11 -// template > using vector = thrust::detail::vector_base; - -/*! \p omp::vector is a container that supports random access to elements, - * constant time removal of elements at the end, and linear time insertion - * and removal of elements at the beginning or in the middle. The number of - * elements in a \p omp::vector may vary dynamically; memory management is - * automatic. 
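// Minimal usage sketch for the omp::malloc / omp::free / omp::pointer
// interface documented above, assuming Thrust's OpenMP backend headers are
// available; the element count and fill value are ours.
#include <thrust/system/omp/memory.h>
#include <thrust/memory.h>
#include <thrust/fill.h>
#include <cassert>
#include <cstddef>

int main()
{
  const std::size_t n = 16;

  // Typed allocation in memory reachable by the omp system.
  thrust::omp::pointer<int> p = thrust::omp::malloc<int>(n);

  // The tagged pointer dispatches algorithms to the OpenMP backend.
  thrust::fill(p, p + n, 7);
  assert(*p == 7);

  // Recover the raw pointer when plain C code needs it.
  int *raw = thrust::raw_pointer_cast(p);
  assert(raw[n - 1] == 7);

  // omp::pointer is not a smart pointer: deallocate explicitly.
  thrust::omp::free(p);
  return 0;
}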
The elements contained in an \p omp::vector reside in memory - * available to the \p omp system. - * - * \tparam T The element type of the \p omp::vector. - * \tparam Allocator The allocator type of the \p omp::vector. Defaults to \p omp::allocator. - * - * \see http://www.sgi.com/tech/stl/Vector.html - * \see host_vector For the documentation of the complete interface which is - * shared by \p omp::vector - * \see device_vector - */ -template > - class vector - : public thrust::detail::vector_base -{ - /*! \cond - */ - private: - typedef thrust::detail::vector_base super_t; - /*! \endcond - */ - - public: - - /*! \cond - */ - typedef typename super_t::size_type size_type; - typedef typename super_t::value_type value_type; - /*! \endcond - */ - - /*! This constructor creates an empty \p omp::vector. - */ - vector(); - - /*! This constructor creates a \p omp::vector with \p n default-constructed elements. - * \param n The size of the \p omp::vector to create. - */ - explicit vector(size_type n); - - /*! This constructor creates a \p omp::vector with \p n copies of \p value. - * \param n The size of the \p omp::vector to create. - * \param value An element to copy. - */ - explicit vector(size_type n, const value_type &value); - - /*! Copy constructor copies from another \p omp::vector. - * \param x The other \p omp::vector to copy. - */ - vector(const vector &x); - - /*! This constructor copies from another Thrust vector-like object. - * \param x The other object to copy from. - */ - template - vector(const thrust::detail::vector_base &x); - - /*! This constructor copies from a \c std::vector. - * \param x The \c std::vector to copy from. - */ - template - vector(const std::vector &x); - - /*! This constructor creates an \p omp::vector by copying from a range. - * \param first The beginning of the range. - * \param last The end of the range. - */ - template - vector(InputIterator first, InputIterator last); - - // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns - - /*! Assignment operator assigns from a \c std::vector. - * \param x The \c std::vector to assign from. - * \return *this - */ - template - vector &operator=(const std::vector &x); - - /*! Assignment operator assigns from another Thrust vector-like object. - * \param x The other object to assign from. - * \return *this - */ - template - vector &operator=(const thrust::detail::vector_base &x); -}; // end vector - -} // end omp -} // end system - -// alias system::omp names at top-level -namespace omp -{ - -using thrust::system::omp::vector; - -} // end omp - -} // end thrust - -#include - diff --git a/compat/thrust/system/system_error.h b/compat/thrust/system/system_error.h deleted file mode 100644 index 6f94b61e0a..0000000000 --- a/compat/thrust/system/system_error.h +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! 
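// Minimal sketch of the omp::vector container documented above, assuming
// Thrust's OpenMP backend headers are available; the data and the call to
// thrust::sort are our own illustration.
#include <thrust/system/omp/vector.h>
#include <thrust/sort.h>
#include <cassert>
#include <vector>

int main()
{
  // Construct from a std::vector; storage comes from omp::allocator.
  std::vector<int> h;
  h.push_back(3); h.push_back(1); h.push_back(2);
  thrust::omp::vector<int> v(h);

  // Algorithms on omp::vector iterators dispatch to the OpenMP backend.
  thrust::sort(v.begin(), v.end());

  assert(v[0] == 1 && v[2] == 3);
  return 0;
}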
\file system/system_error.h - * \brief An exception object used to report error conditions that have an - * associated error code - */ - -#pragma once - -#include -#include -#include - -#include - -namespace thrust -{ - -namespace system -{ - -// [19.5.5] Class system_error - -// [19.5.5.1] Class system_error overview - -/*! \addtogroup system_diagnostics System Diagnostics - * \ingroup system - * \{ - */ - -/*! \brief The class \p system_error describes an exception object used to report error - * conditions that have an associated \p error_code. Such error conditions typically - * originate from the operating system or other low-level application program interfaces. - * - * Thrust uses \p system_error to report the error codes returned from device backends - * such as the CUDA runtime. - * - * The following code listing demonstrates how to catch a \p system_error to recover - * from an error. - * - * \code - * - * #include - * #include - * #include - * - * void terminate_gracefully(void) - * { - * // application-specific termination code here - * ... - * } - * - * int main(void) - * { - * try - * { - * thrust::device_vector vec; - * thrust::sort(vec.begin(), vec.end()); - * } - * catch(thrust::system_error e) - * { - * std::cerr << "Error inside sort: " << e.what() << std::endl; - * terminate_gracefully(); - * } - * - * return 0; - * } - * - * \endcode - * - * \note If an error represents an out-of-memory condition, implementations are encouraged - * to throw an exception object of type \p std::bad_alloc rather than \p system_error. - */ -class system_error - : public std::runtime_error -{ - public: - // [19.5.5.2] Class system_error members - - /*! Constructs an object of class \p system_error. - * \param ec The value returned by \p code(). - * \param what_arg A string to include in the result returned by \p what(). - * \post code() == ec. - * \post std::string(what()).find(what_arg) != string::npos. - */ - inline system_error(error_code ec, const std::string &what_arg); - - /*! Constructs an object of class \p system_error. - * \param ec The value returned by \p code(). - * \param what_arg A string to include in the result returned by \p what(). - * \post code() == ec. - * \post std::string(what()).find(what_arg) != string::npos. - */ - inline system_error(error_code ec, const char *what_arg); - - /*! Constructs an object of class \p system_error. - * \param ec The value returned by \p code(). - * \post code() == ec. - */ - inline system_error(error_code ec); - - /*! Constructs an object of class \p system_error. - * \param ev The error value used to create an \p error_code. - * \param ecat The \p error_category used to create an \p error_code. - * \param what_arg A string to include in the result returned by \p what(). - * \post code() == error_code(ev, ecat). - * \post std::string(what()).find(what_arg) != string::npos. - */ - inline system_error(int ev, const error_category &ecat, const std::string &what_arg); - - /*! Constructs an object of class \p system_error. - * \param ev The error value used to create an \p error_code. - * \param ecat The \p error_category used to create an \p error_code. - * \param what_arg A string to include in the result returned by \p what(). - * \post code() == error_code(ev, ecat). - * \post std::string(what()).find(what_arg) != string::npos. - */ - inline system_error(int ev, const error_category &ecat, const char *what_arg); - - /*! Constructs an object of class \p system_error. - * \param ev The error value used to create an \p error_code. 
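// Compilable restatement of the system_error example documented above.  The
// template argument on device_vector and the #include arguments were stripped
// during extraction; `int`, the headers named here and the body of
// terminate_gracefully are our reconstruction (build with nvcc, as the
// original snippet assumes a device backend).
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/system_error.h>
#include <cstdlib>
#include <iostream>

void terminate_gracefully(void)
{
  // application-specific termination code here
  std::exit(EXIT_FAILURE);
}

int main(void)
{
  try
  {
    thrust::device_vector<int> vec;
    thrust::sort(vec.begin(), vec.end());
  }
  catch (thrust::system_error &e)
  {
    std::cerr << "Error inside sort: " << e.what() << std::endl;
    terminate_gracefully();
  }

  return 0;
}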
- * \param ecat The \p error_category used to create an \p error_code. - * \post code() == error_code(ev, ecat). - */ - inline system_error(int ev, const error_category &ecat); - - /*! Destructor does not throw. - */ - inline virtual ~system_error(void) throw () {}; - - /*! Returns an object encoding the error. - * \return ec or error_code(ev, ecat), from the - * constructor, as appropriate. - */ - inline const error_code &code(void) const throw(); - - /*! Returns a human-readable string indicating the nature of the error. - * \return a string incorporating code().message() and the - * arguments supplied in the constructor. - */ - inline const char *what(void) const throw(); - - /*! \cond - */ - private: - error_code m_error_code; - mutable std::string m_what; - - /*! \endcond - */ -}; // end system_error - -} // end system - -/*! \} // end system_diagnostics - */ - -// import names into thrust:: -using system::system_error; - -} // end thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/adjacent_difference.h b/compat/thrust/system/tbb/detail/adjacent_difference.h deleted file mode 100644 index 37c9adc2c9..0000000000 --- a/compat/thrust/system/tbb/detail/adjacent_difference.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - -template - OutputIterator adjacent_difference(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - // tbb prefers generic::adjacent_difference to cpp::adjacent_difference - return thrust::system::detail::generic::adjacent_difference(exec, first, last, result, binary_op); -} // end adjacent_difference() - -} // end detail -} // end tbb -} // end system -} // end thrust - diff --git a/compat/thrust/system/tbb/detail/assign_value.h b/compat/thrust/system/tbb/detail/assign_value.h deleted file mode 100644 index eda3b977b8..0000000000 --- a/compat/thrust/system/tbb/detail/assign_value.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -// this system inherits assign_value -#include - diff --git a/compat/thrust/system/tbb/detail/binary_search.h b/compat/thrust/system/tbb/detail/binary_search.h deleted file mode 100644 index 8dec989771..0000000000 --- a/compat/thrust/system/tbb/detail/binary_search.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits binary_search -#include - diff --git a/compat/thrust/system/tbb/detail/copy.h b/compat/thrust/system/tbb/detail/copy.h deleted file mode 100644 index 7604e6f4ad..0000000000 --- a/compat/thrust/system/tbb/detail/copy.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -template -OutputIterator copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -template -OutputIterator copy_n(execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result); - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/copy.inl b/compat/thrust/system/tbb/detail/copy.inl deleted file mode 100644 index 6d354d0b18..0000000000 --- a/compat/thrust/system/tbb/detail/copy.inl +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ -namespace dispatch -{ - -template - OutputIterator copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - thrust::incrementable_traversal_tag) -{ - return thrust::system::cpp::detail::copy(exec, first, last, result); -} // end copy() - - -template - OutputIterator copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - thrust::random_access_traversal_tag) -{ - return thrust::system::detail::generic::copy(exec, first, last, result); -} // end copy() - - -template - OutputIterator copy_n(execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result, - thrust::incrementable_traversal_tag) -{ - return thrust::system::cpp::detail::copy_n(exec, first, n, result); -} // end copy_n() - - -template - OutputIterator copy_n(execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result, - thrust::random_access_traversal_tag) -{ - return thrust::system::detail::generic::copy_n(exec, first, n, result); -} // end copy_n() - -} // end dispatch - - -template -OutputIterator copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result) -{ - typedef typename thrust::iterator_traversal::type traversal1; - typedef typename thrust::iterator_traversal::type traversal2; - - typedef typename thrust::detail::minimum_type::type traversal; - - // dispatch on minimum traversal - return thrust::system::tbb::detail::dispatch::copy(exec,first,last,result,traversal()); -} // end copy() - - - -template -OutputIterator copy_n(execution_policy &exec, - InputIterator first, - Size n, - OutputIterator result) -{ - typedef typename thrust::iterator_traversal::type traversal1; - typedef typename thrust::iterator_traversal::type traversal2; - - typedef typename thrust::detail::minimum_type::type traversal; - - // dispatch on minimum traversal - return thrust::system::tbb::detail::dispatch::copy_n(exec,first,n,result,traversal()); -} // end copy_n() - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/copy_if.h b/compat/thrust/system/tbb/detail/copy_if.h deleted file mode 100644 index ffbd4f8f46..0000000000 --- a/compat/thrust/system/tbb/detail/copy_if.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -template - OutputIterator copy_if(tag, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - - -} // end detail -} // end tbb -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/copy_if.inl b/compat/thrust/system/tbb/detail/copy_if.inl deleted file mode 100644 index 4353b3b2f7..0000000000 --- a/compat/thrust/system/tbb/detail/copy_if.inl +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ -namespace copy_if_detail -{ - -template -struct body -{ - - InputIterator1 first; - InputIterator2 stencil; - OutputIterator result; - thrust::detail::host_function pred; - Size sum; - - body(InputIterator1 first, InputIterator2 stencil, OutputIterator result, Predicate pred) - : first(first), stencil(stencil), result(result), pred(pred), sum(0) - {} - - body(body& b, ::tbb::split) - : first(b.first), stencil(b.stencil), result(b.result), pred(b.pred), sum(0) - {} - - void operator()(const ::tbb::blocked_range& r, ::tbb::pre_scan_tag) - { - InputIterator2 iter = stencil + r.begin(); - - for (Size i = r.begin(); i != r.end(); ++i, ++iter) - { - if (pred(*iter)) - ++sum; - } - } - - void operator()(const ::tbb::blocked_range& r, ::tbb::final_scan_tag) - { - InputIterator1 iter1 = first + r.begin(); - InputIterator2 iter2 = stencil + r.begin(); - OutputIterator iter3 = result + sum; - - for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2) - { - if (pred(*iter2)) - { - *iter3 = *iter1; - ++sum; - ++iter3; - } - } - } - - void reverse_join(body& b) - { - sum = b.sum + sum; - } - - void assign(body& b) - { - sum = b.sum; - } -}; // end body - -} // end copy_if_detail - -template - OutputIterator copy_if(tag, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - typedef typename thrust::iterator_difference::type Size; - typedef typename copy_if_detail::body Body; - - Size n = thrust::distance(first, last); - - if (n != 0) - { - Body body(first, stencil, result, pred); - ::tbb::parallel_scan(::tbb::blocked_range(0,n), body); - thrust::advance(result, body.sum); - } - - return result; -} // end copy_if() - -} // end detail -} // end tbb -} // end system -} // end thrust - diff --git a/compat/thrust/system/tbb/detail/count.h b/compat/thrust/system/tbb/detail/count.h deleted file mode 100644 index da31ee8700..0000000000 --- a/compat/thrust/system/tbb/detail/count.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in 
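// Sketch of the decomposition used by the copy_if body above, written as
// plain serial C++ so it runs without TBB: pass 1 counts matches per chunk
// (what the pre_scan_tag pass does), an exclusive prefix sum turns the counts
// into per-chunk output offsets (what reverse_join/assign accumulate in
// `sum`), and pass 2 writes each chunk at its offset (the final_scan_tag
// pass).  Chunk size and data are ours.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
  const std::vector<int> in   = {5, 2, 8, 1, 9, 4, 7, 3};
  const std::vector<int> sten = {1, 0, 1, 0, 1, 0, 1, 0};  // keep where stencil != 0
  const std::size_t chunk = 3;
  const std::size_t n     = in.size();

  // Pass 1: number of kept elements in each chunk.
  std::vector<std::size_t> count((n + chunk - 1) / chunk, 0);
  for (std::size_t i = 0; i < n; ++i)
    if (sten[i]) ++count[i / chunk];

  // Exclusive prefix sum: where each chunk starts writing in the output.
  std::vector<std::size_t> offset(count.size(), 0);
  for (std::size_t c = 1; c < count.size(); ++c)
    offset[c] = offset[c - 1] + count[c - 1];

  // Pass 2: each chunk copies its kept elements independently of the others.
  std::vector<int> out(offset.back() + count.back());
  for (std::size_t c = 0; c < count.size(); ++c) {
    std::size_t o = offset[c];
    for (std::size_t i = c * chunk; i < std::min(n, (c + 1) * chunk); ++i)
      if (sten[i]) out[o++] = in[i];
  }

  assert((out == std::vector<int>{5, 8, 9, 7}));
  return 0;
}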
compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits count -#include - diff --git a/compat/thrust/system/tbb/detail/equal.h b/compat/thrust/system/tbb/detail/equal.h deleted file mode 100644 index 74e55183d9..0000000000 --- a/compat/thrust/system/tbb/detail/equal.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits equal -#include - diff --git a/compat/thrust/system/tbb/detail/execution_policy.h b/compat/thrust/system/tbb/detail/execution_policy.h deleted file mode 100644 index 167d1dc4ce..0000000000 --- a/compat/thrust/system/tbb/detail/execution_policy.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -// put the canonical tag in the same ns as the backend's entry points -namespace tbb -{ -namespace detail -{ - -// this awkward sequence of definitions arise -// from the desire both for tag to derive -// from execution_policy and for execution_policy -// to convert to tag (when execution_policy is not -// an ancestor of tag) - -// forward declaration of tag -struct tag; - -// forward declaration of execution_policy -template struct execution_policy; - -// specialize execution_policy for tag -template<> - struct execution_policy - : thrust::system::cpp::detail::execution_policy -{}; - -// tag's definition comes before the -// generic definition of execution_policy -struct tag : execution_policy {}; - -// allow conversion to tag when it is not a successor -template - struct execution_policy - : thrust::system::cpp::detail::execution_policy -{ - // allow conversion to tag - inline operator tag () const - { - return tag(); - } -}; - -} // end detail - -// alias execution_policy and tag here -using thrust::system::tbb::detail::execution_policy; -using thrust::system::tbb::detail::tag; - -} // end tbb -} // end system - -// alias items at top-level -namespace tbb -{ - -using thrust::system::tbb::execution_policy; -using thrust::system::tbb::tag; - -} // end tbb -} // end thrust - diff --git a/compat/thrust/system/tbb/detail/extrema.h b/compat/thrust/system/tbb/detail/extrema.h deleted file mode 100644 index 4715a8948b..0000000000 --- a/compat/thrust/system/tbb/detail/extrema.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - -template -ForwardIterator max_element(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - // tbb prefers generic::max_element to cpp::max_element - return thrust::system::detail::generic::max_element(exec, first, last, comp); -} // end max_element() - -template -ForwardIterator min_element(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - // tbb prefers generic::min_element to cpp::min_element - return thrust::system::detail::generic::min_element(exec, first, last, comp); -} // end min_element() - -template -thrust::pair minmax_element(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate comp) -{ - // tbb prefers generic::minmax_element to cpp::minmax_element - return thrust::system::detail::generic::minmax_element(exec, first, last, comp); -} // end minmax_element() - -} // end detail -} // end tbb -} // end system -} // end thrust - - diff --git a/compat/thrust/system/tbb/detail/fill.h b/compat/thrust/system/tbb/detail/fill.h deleted file mode 100644 index 5219e1c7c5..0000000000 --- a/compat/thrust/system/tbb/detail/fill.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits fill -#include - diff --git a/compat/thrust/system/tbb/detail/find.h b/compat/thrust/system/tbb/detail/find.h deleted file mode 100644 index d351454c16..0000000000 --- a/compat/thrust/system/tbb/detail/find.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - -template -InputIterator find_if(execution_policy &exec, - InputIterator first, - InputIterator last, - Predicate pred) -{ - // tbb prefers generic::find_if to cpp::find_if - return thrust::system::detail::generic::find_if(exec, first, last, pred); -} - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/for_each.h b/compat/thrust/system/tbb/detail/for_each.h deleted file mode 100644 index 573bb819a2..0000000000 --- a/compat/thrust/system/tbb/detail/for_each.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - -template - RandomAccessIterator for_each(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - UnaryFunction f); - -template - RandomAccessIterator for_each_n(execution_policy &exec, - RandomAccessIterator first, - Size n, - UnaryFunction f); - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/for_each.inl b/compat/thrust/system/tbb/detail/for_each.inl deleted file mode 100644 index b09c7be881..0000000000 --- a/compat/thrust/system/tbb/detail/for_each.inl +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ -namespace for_each_detail -{ - -template - struct body -{ - RandomAccessIterator m_first; - UnaryFunction m_f; - - body(RandomAccessIterator first, UnaryFunction f) - : m_first(first), m_f(f) - {} - - void operator()(const ::tbb::blocked_range &r) const - { - // we assume that blocked_range specifies a contiguous range of integers - thrust::system::detail::internal::scalar::for_each_n(m_first + r.begin(), r.size(), m_f); - } // end operator()() -}; // end body - - -template - body - make_body(RandomAccessIterator first, UnaryFunction f) -{ - return body(first, f); -} // end make_body() - - -} // end for_each_detail - - -template -RandomAccessIterator for_each_n(execution_policy &, - RandomAccessIterator first, - Size n, - UnaryFunction f) -{ - ::tbb::parallel_for(::tbb::blocked_range(0,n), for_each_detail::make_body(first,f)); - - // return the end of the range - return first + n; -} // end for_each_n - - -template - RandomAccessIterator for_each(execution_policy &s, - RandomAccessIterator first, - RandomAccessIterator last, - UnaryFunction f) -{ - return tbb::detail::for_each_n(s, first, thrust::distance(first,last), f); -} // end for_each() - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/gather.h b/compat/thrust/system/tbb/detail/gather.h deleted file mode 100644 index dfb7d7fc2d..0000000000 --- a/compat/thrust/system/tbb/detail/gather.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits gather -#include - diff --git a/compat/thrust/system/tbb/detail/generate.h b/compat/thrust/system/tbb/detail/generate.h deleted file mode 100644 index 0cb33b9336..0000000000 --- a/compat/thrust/system/tbb/detail/generate.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
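// Sketch of the pattern used by tbb::detail::for_each_n above: split [0, n)
// into blocked_range chunks and apply the function to each chunk in parallel.
// Assumes a TBB installation; the lambda and the data are our own stand-ins
// for the body/functor in the deleted code.
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
  std::vector<int> data(1000, 1);
  std::atomic<int> touched(0);

  tbb::parallel_for(tbb::blocked_range<std::size_t>(0, data.size()),
    [&](const tbb::blocked_range<std::size_t> &r)
    {
      // Each task receives a contiguous sub-range, like body::operator().
      for (std::size_t i = r.begin(); i != r.end(); ++i) {
        data[i] += 1;
        ++touched;
      }
    });

  assert(touched == static_cast<int>(data.size()));
  assert(data.front() == 2 && data.back() == 2);
  return 0;
}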
- */ - -#pragma once - -#include - -// this system inherits generate -#include - diff --git a/compat/thrust/system/tbb/detail/get_value.h b/compat/thrust/system/tbb/detail/get_value.h deleted file mode 100644 index e376e65749..0000000000 --- a/compat/thrust/system/tbb/detail/get_value.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits get_value -#include - diff --git a/compat/thrust/system/tbb/detail/inner_product.h b/compat/thrust/system/tbb/detail/inner_product.h deleted file mode 100644 index 351421a577..0000000000 --- a/compat/thrust/system/tbb/detail/inner_product.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits inner_product -#include - diff --git a/compat/thrust/system/tbb/detail/iter_swap.h b/compat/thrust/system/tbb/detail/iter_swap.h deleted file mode 100644 index 16176ec69b..0000000000 --- a/compat/thrust/system/tbb/detail/iter_swap.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits iter_swap -#include - diff --git a/compat/thrust/system/tbb/detail/logical.h b/compat/thrust/system/tbb/detail/logical.h deleted file mode 100644 index b2a80de70f..0000000000 --- a/compat/thrust/system/tbb/detail/logical.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits logical -#include - diff --git a/compat/thrust/system/tbb/detail/malloc_and_free.h b/compat/thrust/system/tbb/detail/malloc_and_free.h deleted file mode 100644 index 811a552a4f..0000000000 --- a/compat/thrust/system/tbb/detail/malloc_and_free.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits malloc and free -#include - diff --git a/compat/thrust/system/tbb/detail/memory.inl b/compat/thrust/system/tbb/detail/memory.inl deleted file mode 100644 index 420a8a14b4..0000000000 --- a/compat/thrust/system/tbb/detail/memory.inl +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ - - -template - template - reference & - reference - ::operator=(const reference &other) -{ - return super_t::operator=(other); -} // end reference::operator=() - -template - reference & - reference - ::operator=(const value_type &x) -{ - return super_t::operator=(x); -} // end reference::operator=() - -template -__host__ __device__ -void swap(reference a, reference b) -{ - a.swap(b); -} // end swap() - -namespace detail -{ - -// XXX circular #inclusion problems cause the compiler to believe that cpp::malloc -// is not defined -// WAR the problem by using adl to call cpp::malloc, which requires it to depend -// on a template parameter -template - pointer malloc_workaround(Tag t, std::size_t n) -{ - return pointer(malloc(t, n)); -} // end malloc_workaround() - -// XXX circular #inclusion problems cause the compiler to believe that cpp::free -// is not defined -// WAR the problem by using adl to call cpp::free, which requires it to depend -// on a template parameter -template - void free_workaround(Tag t, pointer ptr) -{ - free(t, ptr.get()); -} // end free_workaround() - -} // end detail - -inline pointer malloc(std::size_t n) -{ - // XXX this is how we'd like to implement this function, - // if not for circular #inclusion problems: - // - // return pointer(thrust::system::cpp::malloc(n)) - // - return detail::malloc_workaround(cpp::tag(), n); -} // end malloc() - -template -pointer malloc(std::size_t n) -{ - pointer raw_ptr = thrust::system::tbb::malloc(sizeof(T) * n); - return pointer(reinterpret_cast(raw_ptr.get())); -} // end malloc() - -inline void free(pointer ptr) -{ - // XXX this is how we'd like to implement this function, - // if not for circular #inclusion problems: - // - // thrust::system::cpp::free(ptr) - // - detail::free_workaround(cpp::tag(), ptr); -} // end free() - -} // end tbb -} // end system -} // end thrust - diff --git a/compat/thrust/system/tbb/detail/merge.h b/compat/thrust/system/tbb/detail/merge.h deleted file mode 100644 index 7b203ec327..0000000000 --- a/compat/thrust/system/tbb/detail/merge.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
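// Plain-C++ sketch of the ADL workaround used by malloc_workaround and
// free_workaround above: making the call depend on a template parameter (the
// tag) defers name lookup to instantiation time, so argument-dependent lookup
// finds the backend's function even though it is not visible where the
// template is written.  The namespaces and names here are ours, not Thrust's.
#include <cassert>

namespace frontend
{
  // Unqualified, dependent call: resolved by ADL on Tag at instantiation.
  template <typename Tag>
  int allocate_bytes(Tag tag, int n)
  {
    return backend_malloc(tag, n);   // not declared yet at this point
  }
}

namespace backend
{
  struct tag {};

  // Found via ADL because `tag` lives in this namespace.
  inline int backend_malloc(tag, int n) { return n; }
}

int main()
{
  assert(frontend::allocate_bytes(backend::tag(), 42) == 42);
  return 0;
}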
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - -template -OutputIterator merge(execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp); - -template -thrust::pair - merge_by_key(execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first3, - InputIterator4 values_first4, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp); - -} // end detail -} // end tbb -} // end system -} // end thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/merge.inl b/compat/thrust/system/tbb/detail/merge.inl deleted file mode 100644 index cc902af85b..0000000000 --- a/compat/thrust/system/tbb/detail/merge.inl +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ -namespace merge_detail -{ - -template -struct range -{ - InputIterator1 first1, last1; - InputIterator2 first2, last2; - OutputIterator result; - StrictWeakOrdering comp; - size_t grain_size; - - range(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp, - size_t grain_size = 1024) - : first1(first1), last1(last1), - first2(first2), last2(last2), - result(result), comp(comp), grain_size(grain_size) - {} - - range(range& r, ::tbb::split) - : first1(r.first1), last1(r.last1), - first2(r.first2), last2(r.last2), - result(r.result), comp(r.comp), grain_size(r.grain_size) - { - // we can assume n1 and n2 are not both 0 - size_t n1 = thrust::distance(first1, last1); - size_t n2 = thrust::distance(first2, last2); - - InputIterator1 mid1 = first1; - InputIterator2 mid2 = first2; - - if (n1 > n2) - { - mid1 += n1 / 2; - mid2 = thrust::system::detail::internal::scalar::lower_bound(first2, last2, raw_reference_cast(*mid1), comp); - } - else - { - mid2 += n2 / 2; - mid1 = thrust::system::detail::internal::scalar::upper_bound(first1, last1, raw_reference_cast(*mid2), comp); - } - - // set first range to [first1, mid1), [first2, mid2), result - r.last1 = mid1; - r.last2 = mid2; - - // set second range to [mid1, last1), [mid2, last2), result + (mid1 - first1) + (mid2 - first2) - first1 = mid1; - first2 = mid2; - result += thrust::distance(r.first1, mid1) + thrust::distance(r.first2, mid2); - } - - bool empty(void) const - { - return (first1 == last1) && (first2 == last2); - } - - bool is_divisible(void) const - { - return static_cast(thrust::distance(first1, last1) + thrust::distance(first2, last2)) > grain_size; - } -}; - -struct body -{ - template - void operator()(Range& r) const - { - 
thrust::system::detail::internal::scalar::merge - (r.first1, r.last1, - r.first2, r.last2, - r.result, - r.comp); - } -}; - -} // end namespace merge_detail - -namespace merge_by_key_detail -{ - -template -struct range -{ - InputIterator1 keys_first1, keys_last1; - InputIterator2 keys_first2, keys_last2; - InputIterator3 values_first1; - InputIterator4 values_first2; - OutputIterator1 keys_result; - OutputIterator2 values_result; - StrictWeakOrdering comp; - size_t grain_size; - - range(InputIterator1 keys_first1, InputIterator1 keys_last1, - InputIterator2 keys_first2, InputIterator2 keys_last2, - InputIterator3 values_first1, - InputIterator4 values_first2, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp, - size_t grain_size = 1024) - : keys_first1(keys_first1), keys_last1(keys_last1), - keys_first2(keys_first2), keys_last2(keys_last2), - values_first1(values_first1), - values_first2(values_first2), - keys_result(keys_result), values_result(values_result), - comp(comp), grain_size(grain_size) - {} - - range(range& r, ::tbb::split) - : keys_first1(r.keys_first1), keys_last1(r.keys_last1), - keys_first2(r.keys_first2), keys_last2(r.keys_last2), - values_first1(r.values_first1), - values_first2(r.values_first2), - keys_result(r.keys_result), values_result(r.values_result), - comp(r.comp), grain_size(r.grain_size) - { - // we can assume n1 and n2 are not both 0 - size_t n1 = thrust::distance(keys_first1, keys_last1); - size_t n2 = thrust::distance(keys_first2, keys_last2); - - InputIterator1 mid1 = keys_first1; - InputIterator2 mid2 = keys_first2; - - if (n1 > n2) - { - mid1 += n1 / 2; - mid2 = thrust::system::detail::internal::scalar::lower_bound(keys_first2, keys_last2, raw_reference_cast(*mid1), comp); - } - else - { - mid2 += n2 / 2; - mid1 = thrust::system::detail::internal::scalar::upper_bound(keys_first1, keys_last1, raw_reference_cast(*mid2), comp); - } - - // set first range to [keys_first1, mid1), [keys_first2, mid2), keys_result, values_result - r.keys_last1 = mid1; - r.keys_last2 = mid2; - - // set second range to [mid1, keys_last1), [mid2, keys_last2), keys_result + (mid1 - keys_first1) + (mid2 - keys_first2), values_result + (mid1 - keys_first1) + (mid2 - keys_first2) - keys_first1 = mid1; - keys_first2 = mid2; - values_first1 += thrust::distance(r.keys_first1, mid1); - values_first2 += thrust::distance(r.keys_first2, mid2); - keys_result += thrust::distance(r.keys_first1, mid1) + thrust::distance(r.keys_first2, mid2); - values_result += thrust::distance(r.keys_first1, mid1) + thrust::distance(r.keys_first2, mid2); - } - - bool empty(void) const - { - return (keys_first1 == keys_last1) && (keys_first2 == keys_last2); - } - - bool is_divisible(void) const - { - return static_cast(thrust::distance(keys_first1, keys_last1) + thrust::distance(keys_first2, keys_last2)) > grain_size; - } -}; - -struct body -{ - template - void operator()(Range& r) const - { - thrust::system::detail::internal::scalar::merge_by_key - (r.keys_first1, r.keys_last1, - r.keys_first2, r.keys_last2, - r.values_first1, - r.values_first2, - r.keys_result, - r.values_result, - r.comp); - } -}; - -} // end namespace merge_by_key_detail - - -template -OutputIterator merge(execution_policy &exec, - InputIterator1 first1, - InputIterator1 last1, - InputIterator2 first2, - InputIterator2 last2, - OutputIterator result, - StrictWeakOrdering comp) -{ - typedef typename merge_detail::range Range; - typedef merge_detail::body Body; - Range range(first1, last1, first2, last2, 
result, comp); - Body body; - - ::tbb::parallel_for(range, body); - - thrust::advance(result, thrust::distance(first1, last1) + thrust::distance(first2, last2)); - - return result; -} // end merge() - -template -thrust::pair - merge_by_key(execution_policy &exec, - InputIterator1 keys_first1, - InputIterator1 keys_last1, - InputIterator2 keys_first2, - InputIterator2 keys_last2, - InputIterator3 values_first3, - InputIterator4 values_first4, - OutputIterator1 keys_result, - OutputIterator2 values_result, - StrictWeakOrdering comp) -{ - typedef typename merge_by_key_detail::range Range; - typedef merge_by_key_detail::body Body; - - Range range(keys_first1, keys_last1, keys_first2, keys_last2, values_first3, values_first4, keys_result, values_result, comp); - Body body; - - ::tbb::parallel_for(range, body); - - thrust::advance(keys_result, thrust::distance(keys_first1, keys_last1) + thrust::distance(keys_first2, keys_last2)); - thrust::advance(values_result, thrust::distance(keys_first1, keys_last1) + thrust::distance(keys_first2, keys_last2)); - - return thrust::make_pair(keys_result,values_result); -} - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/mismatch.h b/compat/thrust/system/tbb/detail/mismatch.h deleted file mode 100644 index 03980cfcd5..0000000000 --- a/compat/thrust/system/tbb/detail/mismatch.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits mismatch -#include - diff --git a/compat/thrust/system/tbb/detail/par.h b/compat/thrust/system/tbb/detail/par.h deleted file mode 100644 index 74801ab914..0000000000 --- a/compat/thrust/system/tbb/detail/par.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
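The removed merge.inl above drives tbb::parallel_for with a user-defined splittable range: the splitting constructor bisects the larger sorted input and binary-searches the other, so each piece can be merged independently and the pieces concatenate into a sorted whole. A minimal standalone sketch of that pattern follows (plain int pointers, std::merge per piece, hypothetical names merge_range and merge_body, TBB assumed available); it illustrates the technique, not the deleted Thrust code, which works over general iterators and a comparator.

// standalone illustration (not the deleted Thrust code); assumes TBB is installed
#include <algorithm>
#include <cstddef>
#include <vector>
#include <tbb/blocked_range.h>   // also provides tbb::split
#include <tbb/parallel_for.h>

struct merge_range
{
  const int *first1, *last1, *first2, *last2;
  int* result;
  std::size_t grain;

  merge_range(const int* f1, const int* l1, const int* f2, const int* l2,
              int* out, std::size_t g)
    : first1(f1), last1(l1), first2(f2), last2(l2), result(out), grain(g) {}

  bool empty() const        { return first1 == last1 && first2 == last2; }
  bool is_divisible() const { return std::size_t((last1 - first1) + (last2 - first2)) > grain; }

  // splitting constructor: r keeps the lower halves, *this takes the upper halves
  merge_range(merge_range& r, tbb::split)
    : first1(r.first1), last1(r.last1), first2(r.first2), last2(r.last2),
      result(r.result), grain(r.grain)
  {
    const int* mid1 = first1;
    const int* mid2 = first2;
    if (last1 - first1 > last2 - first2) {
      mid1 += (last1 - first1) / 2;                   // bisect the larger input
      mid2  = std::lower_bound(first2, last2, *mid1); // split the other to match
    } else {
      mid2 += (last2 - first2) / 2;
      mid1  = std::upper_bound(first1, last1, *mid2);
    }
    r.last1 = mid1;
    r.last2 = mid2;
    first1  = mid1;
    first2  = mid2;
    result += (mid1 - r.first1) + (mid2 - r.first2);  // upper output starts after the lower piece
  }
};

struct merge_body
{
  void operator()(merge_range& r) const
  {
    std::merge(r.first1, r.last1, r.first2, r.last2, r.result);
  }
};

int main()
{
  std::vector<int> a{1, 3, 5, 7}, b{2, 4, 6, 8}, out(a.size() + b.size());
  tbb::parallel_for(merge_range(a.data(), a.data() + a.size(),
                                b.data(), b.data() + b.size(),
                                out.data(), /*grain=*/2),
                    merge_body());
  return std::is_sorted(out.begin(), out.end()) ? 0 : 1;
}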
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -struct par_t : thrust::system::tbb::detail::execution_policy -{ - par_t() : thrust::system::tbb::detail::execution_policy() {} - - template - thrust::detail::execute_with_allocator - operator()(Allocator &alloc) const - { - return thrust::detail::execute_with_allocator(alloc); - } -}; - - -} // end detail - - -static const detail::par_t par; - - -} // end tbb -} // end system - - -// alias par here -namespace tbb -{ - - -using thrust::system::tbb::par; - - -} // end tbb -} // end thrust - diff --git a/compat/thrust/system/tbb/detail/partition.h b/compat/thrust/system/tbb/detail/partition.h deleted file mode 100644 index af37121888..0000000000 --- a/compat/thrust/system/tbb/detail/partition.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -template - ForwardIterator stable_partition(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - -template - ForwardIterator stable_partition(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - -template - thrust::pair - stable_partition_copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -template - thrust::pair - stable_partition_copy(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred); - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/partition.inl b/compat/thrust/system/tbb/detail/partition.inl deleted file mode 100644 index 1e421e10f3..0000000000 --- a/compat/thrust/system/tbb/detail/partition.inl +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -template - ForwardIterator stable_partition(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - // tbb prefers generic::stable_partition to cpp::stable_partition - return thrust::system::detail::generic::stable_partition(exec, first, last, pred); -} // end stable_partition() - - -template - ForwardIterator stable_partition(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - // tbb prefers generic::stable_partition to cpp::stable_partition - return thrust::system::detail::generic::stable_partition(exec, first, last, stencil, pred); -} // end stable_partition() - -template - thrust::pair - stable_partition_copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - // tbb prefers generic::stable_partition_copy to cpp::stable_partition_copy - return thrust::system::detail::generic::stable_partition_copy(exec, first, last, out_true, out_false, pred); -} // end stable_partition_copy() - - -template - thrust::pair - stable_partition_copy(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator1 out_true, - OutputIterator2 out_false, - Predicate pred) -{ - // tbb prefers generic::stable_partition_copy to cpp::stable_partition_copy - return thrust::system::detail::generic::stable_partition_copy(exec, first, last, stencil, out_true, out_false, pred); -} // end stable_partition_copy() - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/reduce.h b/compat/thrust/system/tbb/detail/reduce.h deleted file mode 100644 index 83a7cc3214..0000000000 --- a/compat/thrust/system/tbb/detail/reduce.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file reduce.h - * \brief TBB implementation of reduce. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -template - OutputType reduce(execution_policy &exec, - InputIterator begin, - InputIterator end, - OutputType init, - BinaryFunction binary_op); - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/reduce.inl b/compat/thrust/system/tbb/detail/reduce.inl deleted file mode 100644 index c249852769..0000000000 --- a/compat/thrust/system/tbb/detail/reduce.inl +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ -namespace reduce_detail -{ - -template -struct body -{ - RandomAccessIterator first; - OutputType sum; - bool first_call; // TBB can invoke operator() multiple times on the same body - thrust::detail::host_function binary_op; - - // note: we only initalize sum with init to avoid calling OutputType's default constructor - body(RandomAccessIterator first, OutputType init, BinaryFunction binary_op) - : first(first), sum(init), first_call(true), binary_op(binary_op) - {} - - // note: we only initalize sum with b.sum to avoid calling OutputType's default constructor - body(body& b, ::tbb::split) - : first(b.first), sum(b.sum), first_call(true), binary_op(b.binary_op) - {} - - template - void operator()(const ::tbb::blocked_range &r) - { - // we assume that blocked_range specifies a contiguous range of integers - - if (r.empty()) return; // nothing to do - - RandomAccessIterator iter = first + r.begin(); - - OutputType temp = thrust::raw_reference_cast(*iter); - - ++iter; - - for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter) - temp = binary_op(temp, *iter); - - - if (first_call) - { - // first time body has been invoked - first_call = false; - sum = temp; - } - else - { - // body has been previously invoked, accumulate temp into sum - sum = binary_op(sum, temp); - } - } // end operator()() - - void join(body& b) - { - sum = binary_op(sum, b.sum); - } -}; // end body - -} // end reduce_detail - - -template - OutputType reduce(execution_policy &exec, - InputIterator begin, - InputIterator end, - OutputType init, - BinaryFunction binary_op) -{ - typedef typename thrust::iterator_difference::type Size; - - Size n = thrust::distance(begin, end); - - if (n == 0) - { - return init; - } - else - { - typedef typename reduce_detail::body Body; - Body reduce_body(begin, init, binary_op); - ::tbb::parallel_reduce(::tbb::blocked_range(0,n), reduce_body); - return binary_op(init, reduce_body.sum); - } -} - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/reduce_by_key.h b/compat/thrust/system/tbb/detail/reduce_by_key.h deleted file mode 100644 index 0149a763f7..0000000000 --- a/compat/thrust/system/tbb/detail/reduce_by_key.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
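The removed reduce.inl above is the classic imperative tbb::parallel_reduce pattern: a body that accumulates a subrange into a local temporary (TBB may hand the same body several subranges), a splitting constructor that starts a fresh accumulator, and join() to combine partial results; its first_call flag exists only so the value type never needs a default constructor or identity element. A simplified standalone sketch with plain integer addition, where an identity of zero makes the flag unnecessary, assuming TBB is available:

// standalone illustration (not the deleted Thrust code); assumes TBB is installed
#include <cstddef>
#include <vector>
#include <tbb/blocked_range.h>
#include <tbb/parallel_reduce.h>

struct sum_body
{
  const int* data;
  long sum;

  explicit sum_body(const int* d) : data(d), sum(0) {}

  // splitting constructor: a stolen subrange starts from a fresh accumulator
  sum_body(sum_body& b, tbb::split) : data(b.data), sum(0) {}

  void operator()(const tbb::blocked_range<std::size_t>& r)
  {
    long tmp = sum;   // accumulate locally; the same body object may process several subranges
    for (std::size_t i = r.begin(); i != r.end(); ++i)
      tmp += data[i];
    sum = tmp;
  }

  void join(sum_body& rhs) { sum += rhs.sum; }   // combine two partial sums
};

int main()
{
  std::vector<int> v(1000, 1);
  sum_body body(v.data());
  tbb::parallel_reduce(tbb::blocked_range<std::size_t>(0, v.size()), body);
  return body.sum == 1000 ? 0 : 1;
}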
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -template - thrust::pair - reduce_by_key(execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op); - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/reduce_by_key.inl b/compat/thrust/system/tbb/detail/reduce_by_key.inl deleted file mode 100644 index 10d2d8b4a8..0000000000 --- a/compat/thrust/system/tbb/detail/reduce_by_key.inl +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ -namespace reduce_by_key_detail -{ - - -template - inline L divide_ri(const L x, const R y) -{ - return (x + (y - 1)) / y; -} - - -template - struct partial_sum_type - : thrust::detail::eval_if< - thrust::detail::has_result_type::value, - thrust::detail::result_type, - thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - > -{}; - - -template - struct partial_sum_type - : thrust::detail::eval_if< - thrust::detail::has_result_type::value, - thrust::detail::result_type, - thrust::iterator_value - > -{}; - - -template - thrust::pair< - InputIterator1, - thrust::pair< - typename InputIterator1::value_type, - typename partial_sum_type::type - > - > - reduce_last_segment_backward(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - typename thrust::iterator_difference::type n = keys_last - keys_first; - - // reverse the ranges and consume from the end - thrust::reverse_iterator keys_first_r(keys_last); - thrust::reverse_iterator keys_last_r(keys_first); - thrust::reverse_iterator values_first_r(values_first + n); - - typename InputIterator1::value_type result_key = *keys_first_r; - typename partial_sum_type::type result_value = *values_first_r; - - // consume the entirety of the first key's sequence - for(++keys_first_r, ++values_first_r; - (keys_first_r != keys_last_r) && binary_pred(*keys_first_r, result_key); - ++keys_first_r, ++values_first_r) - { - result_value = binary_op(result_value, *values_first_r); - } - - return thrust::make_pair(keys_first_r.base(), thrust::make_pair(result_key, result_value)); -} - - -template - thrust::tuple< - OutputIterator1, - OutputIterator2, - typename InputIterator1::value_type, - typename partial_sum_type::type - > - reduce_by_key_with_carry(InputIterator1 keys_first, - 
InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - // first, consume the last sequence to produce the carry - // XXX is there an elegant way to pose this such that we don't need to default construct carry? - thrust::pair< - typename InputIterator1::value_type, - typename partial_sum_type::type - > carry; - - thrust::tie(keys_last, carry) = reduce_last_segment_backward(keys_first, keys_last, values_first, binary_pred, binary_op); - - // finish with sequential reduce_by_key - thrust::cpp::tag seq; - thrust::tie(keys_output, values_output) = - thrust::reduce_by_key(seq, keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); - - return thrust::make_tuple(keys_output, values_output, carry.first, carry.second); -} - - -template - bool interval_has_carry(size_t interval_idx, size_t interval_size, size_t num_intervals, Iterator tail_flags) -{ - // to discover whether the interval has a carry, look at the tail_flag corresponding to its last element - // the final interval never has a carry by definition - return (interval_idx + 1 < num_intervals) ? !tail_flags[(interval_idx + 1) * interval_size - 1] : false; -} - - -template - struct serial_reduce_by_key_body -{ - typedef typename thrust::iterator_difference::type size_type; - - Iterator1 keys_first; - Iterator2 values_first; - Iterator3 result_offset; - Iterator4 keys_result; - Iterator5 values_result; - Iterator6 carry_result; - - size_type n; - size_type interval_size; - size_type num_intervals; - - BinaryPredicate binary_pred; - BinaryFunction binary_op; - - serial_reduce_by_key_body(Iterator1 keys_first, Iterator2 values_first, Iterator3 result_offset, Iterator4 keys_result, Iterator5 values_result, Iterator6 carry_result, size_type n, size_type interval_size, size_type num_intervals, BinaryPredicate binary_pred, BinaryFunction binary_op) - : keys_first(keys_first), values_first(values_first), - result_offset(result_offset), - keys_result(keys_result), - values_result(values_result), - carry_result(carry_result), - n(n), - interval_size(interval_size), - num_intervals(num_intervals), - binary_pred(binary_pred), - binary_op(binary_op) - {} - - void operator()(const ::tbb::blocked_range &r) const - { - assert(r.size() == 1); - - const size_type interval_idx = r.begin(); - - const size_type offset_to_first = interval_size * interval_idx; - const size_type offset_to_last = thrust::min(n, offset_to_first + interval_size); - - Iterator1 my_keys_first = keys_first + offset_to_first; - Iterator1 my_keys_last = keys_first + offset_to_last; - Iterator2 my_values_first = values_first + offset_to_first; - Iterator3 my_result_offset = result_offset + interval_idx; - Iterator4 my_keys_result = keys_result + *my_result_offset; - Iterator5 my_values_result = values_result + *my_result_offset; - Iterator6 my_carry_result = carry_result + interval_idx; - - // consume the rest of the interval with reduce_by_key - typedef typename thrust::iterator_value::type key_type; - typedef typename partial_sum_type::type value_type; - - // XXX is there a way to pose this so that we don't require default construction of carry? 
- thrust::pair carry; - - thrust::tie(my_keys_result, my_values_result, carry.first, carry.second) = - reduce_by_key_with_carry(my_keys_first, - my_keys_last, - my_values_first, - my_keys_result, - my_values_result, - binary_pred, - binary_op); - - // store to carry only when we actually have a carry - // store to my_keys_result & my_values_result otherwise - - // create tail_flags so we can check for a carry - thrust::detail::tail_flags flags = thrust::detail::make_tail_flags(keys_first, keys_first + n, binary_pred); - - if(interval_has_carry(interval_idx, interval_size, num_intervals, flags.begin())) - { - // we can ignore the carry's key - // XXX because the carry result is uninitialized, we should copy construct - *my_carry_result = carry.second; - } - else - { - *my_keys_result = carry.first; - *my_values_result = carry.second; - } - } -}; - - -template - serial_reduce_by_key_body - make_serial_reduce_by_key_body(Iterator1 keys_first, Iterator2 values_first, Iterator3 result_offset, Iterator4 keys_result, Iterator5 values_result, Iterator6 carry_result, typename thrust::iterator_difference::type n, size_t interval_size, size_t num_intervals, BinaryPredicate binary_pred, BinaryFunction binary_op) -{ - return serial_reduce_by_key_body(keys_first, values_first, result_offset, keys_result, values_result, carry_result, n, interval_size, num_intervals, binary_pred, binary_op); -} - - -} // end reduce_by_key_detail - - -template - thrust::pair - reduce_by_key(thrust::tbb::execution_policy &exec, - Iterator1 keys_first, Iterator1 keys_last, - Iterator2 values_first, - Iterator3 keys_result, - Iterator4 values_result, - BinaryPredicate binary_pred, - BinaryFunction binary_op) -{ - - typedef typename thrust::iterator_difference::type difference_type; - difference_type n = keys_last - keys_first; - if(n == 0) return thrust::make_pair(keys_result, values_result); - - // XXX this value is a tuning opportunity - const difference_type parallelism_threshold = 10000; - - if(n < parallelism_threshold) - { - // don't bother parallelizing for small n - thrust::cpp::tag seq; - return thrust::reduce_by_key(seq, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op); - } - - // count the number of processors - const unsigned int p = thrust::max(1u, ::tbb::tbb_thread::hardware_concurrency()); - - // generate O(P) intervals of sequential work - // XXX oversubscribing is a tuning opportunity - const unsigned int subscription_rate = 1; - difference_type interval_size = thrust::min(parallelism_threshold, thrust::max(n, n / (subscription_rate * p))); - difference_type num_intervals = reduce_by_key_detail::divide_ri(n, interval_size); - - // decompose the input into intervals of size N / num_intervals - // add one extra element to this vector to store the size of the entire result - thrust::detail::temporary_array interval_output_offsets(0, exec, num_intervals + 1); - - // first count the number of tail flags in each interval - thrust::detail::tail_flags tail_flags = thrust::detail::make_tail_flags(keys_first, keys_last, binary_pred); - thrust::system::tbb::detail::reduce_intervals(exec, tail_flags.begin(), tail_flags.end(), interval_size, interval_output_offsets.begin() + 1, thrust::plus()); - interval_output_offsets[0] = 0; - - // scan the counts to get each body's output offset - thrust::cpp::tag seq; - thrust::inclusive_scan(seq, - interval_output_offsets.begin() + 1, interval_output_offsets.end(), - interval_output_offsets.begin() + 1); - - // do a reduce_by_key serially in 
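The carry logic in the removed reduce_by_key.inl hinges on one small predicate: an interval of the input "carries" into its successor exactly when its last element is not the tail of its key run, and the final interval never carries. A tiny standalone illustration of that check with plain arrays (hypothetical helper names, not the deleted Thrust code):

// standalone illustration (not the deleted Thrust code)
#include <cstddef>

// element i is the tail of its key run when the next key differs (or i is the last element)
static bool tail_flag(const int* keys, std::size_t n, std::size_t i)
{
  return i + 1 == n || keys[i] != keys[i + 1];
}

// an interval carries into its successor when a key run continues across the boundary
static bool interval_has_carry(const int* keys, std::size_t n,
                               std::size_t interval_idx, std::size_t interval_size,
                               std::size_t num_intervals)
{
  if (interval_idx + 1 >= num_intervals)
    return false;                                      // the final interval never carries
  const std::size_t last = (interval_idx + 1) * interval_size - 1;
  return !tail_flag(keys, n, last);
}

int main()
{
  const int keys[] = {1, 1, 2, 2, 2, 3};               // intervals of size 2: {1,1} {2,2} {2,3}
  const std::size_t n = 6, size = 2, num = 3;
  // expected: interval 0 no carry, interval 1 carries (the run of 2s crosses), interval 2 never
  return (!interval_has_carry(keys, n, 0, size, num)
          &&  interval_has_carry(keys, n, 1, size, num)
          && !interval_has_carry(keys, n, 2, size, num)) ? 0 : 1;
}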
each thread - // the final interval never has a carry by definition, so don't reserve space for it - typedef typename reduce_by_key_detail::partial_sum_type::type carry_type; - thrust::detail::temporary_array carries(0, exec, num_intervals - 1); - - // force grainsize == 1 with simple_partioner() - ::tbb::parallel_for(::tbb::blocked_range(0, num_intervals, 1), - reduce_by_key_detail::make_serial_reduce_by_key_body(keys_first, values_first, interval_output_offsets.begin(), keys_result, values_result, carries.begin(), n, interval_size, num_intervals, binary_pred, binary_op), - ::tbb::simple_partitioner()); - - difference_type size_of_result = interval_output_offsets[num_intervals]; - - // sequentially accumulate the carries - // note that the last interval does not have a carry - // XXX find a way to express this loop via a sequential algorithm, perhaps reduce_by_key - for(typename thrust::detail::temporary_array::size_type i = 0; i < carries.size(); ++i) - { - // if our interval has a carry, then we need to sum the carry to the next interval's output offset - // if it does not have a carry, then we need to ignore carry_value[i] - if(reduce_by_key_detail::interval_has_carry(i, interval_size, num_intervals, tail_flags.begin())) - { - difference_type output_idx = interval_output_offsets[i+1]; - - values_result[output_idx] = binary_op(values_result[output_idx], carries[i]); - } - } - - return thrust::make_pair(keys_result + size_of_result, values_result + size_of_result); -} - - -} // end detail -} // end tbb -} // end system -} // end thrust - diff --git a/compat/thrust/system/tbb/detail/reduce_intervals.h b/compat/thrust/system/tbb/detail/reduce_intervals.h deleted file mode 100644 index 0647ffd464..0000000000 --- a/compat/thrust/system/tbb/detail/reduce_intervals.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ -namespace reduce_intervals_detail -{ - - -template - inline L divide_ri(const L x, const R y) -{ - return (x + (y - 1)) / y; -} - - -template - struct body -{ - RandomAccessIterator1 first; - RandomAccessIterator2 result; - Size n, interval_size; - BinaryFunction binary_op; - - body(RandomAccessIterator1 first, RandomAccessIterator2 result, Size n, Size interval_size, BinaryFunction binary_op) - : first(first), result(result), n(n), interval_size(interval_size), binary_op(binary_op) - {} - - void operator()(const ::tbb::blocked_range &r) const - { - assert(r.size() == 1); - - Size interval_idx = r.begin(); - - Size offset_to_first = interval_size * interval_idx; - Size offset_to_last = thrust::min(n, offset_to_first + interval_size); - - RandomAccessIterator1 my_first = first + offset_to_first; - RandomAccessIterator1 my_last = first + offset_to_last; - - thrust::cpp::tag seq; - - // carefully pass the init value for the interval with raw_reference_cast - typedef typename BinaryFunction::result_type sum_type; - result[interval_idx] = - thrust::reduce(seq, my_first + 1, my_last, sum_type(thrust::raw_reference_cast(*my_first)), binary_op); - } -}; - - -template - body - make_body(RandomAccessIterator1 first, RandomAccessIterator2 result, Size n, Size interval_size, BinaryFunction binary_op) -{ - return body(first, result, n, interval_size, binary_op); -} - - -} // end reduce_intervals_detail - - -template - void reduce_intervals(thrust::tbb::execution_policy &, - RandomAccessIterator1 first, - RandomAccessIterator1 last, - Size interval_size, - RandomAccessIterator2 result, - BinaryFunction binary_op) -{ - typename thrust::iterator_difference::type n = last - first; - - Size num_intervals = reduce_intervals_detail::divide_ri(n, interval_size); - - ::tbb::parallel_for(::tbb::blocked_range(0, num_intervals, 1), reduce_intervals_detail::make_body(first, result, Size(n), interval_size, binary_op), ::tbb::simple_partitioner()); -} - - -template - void reduce_intervals(thrust::tbb::execution_policy &exec, - RandomAccessIterator1 first, - RandomAccessIterator1 last, - Size interval_size, - RandomAccessIterator2 result) -{ - typedef typename thrust::iterator_value::type value_type; - - return thrust::system::tbb::detail::reduce_intervals(exec, first, last, interval_size, result, thrust::plus()); -} - - -} // end detail -} // end tbb -} // end system -} // end thrust - diff --git a/compat/thrust/system/tbb/detail/remove.h b/compat/thrust/system/tbb/detail/remove.h deleted file mode 100644 index 48cbb5c322..0000000000 --- a/compat/thrust/system/tbb/detail/remove.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
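The removed reduce_intervals.h above maps one fixed-size interval of the input to one output slot by running parallel_for over a blocked_range of interval indices with grainsize 1 and a simple_partitioner, which yields exactly one task per interval. A standalone sketch of the same decomposition using a lambda and std::accumulate (illustration only, hypothetical signature, TBB assumed available):

// standalone illustration (not the deleted Thrust code); assumes TBB is installed
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <tbb/partitioner.h>

// reduce each run of `interval_size` consecutive elements into one slot of `result`
static void reduce_intervals(const int* first, std::size_t n,
                             std::size_t interval_size, long* result)
{
  const std::size_t num_intervals = (n + interval_size - 1) / interval_size;  // round up

  tbb::parallel_for(
    tbb::blocked_range<std::size_t>(0, num_intervals, /*grainsize=*/1),
    [=](const tbb::blocked_range<std::size_t>& r)
    {
      for (std::size_t i = r.begin(); i != r.end(); ++i)
      {
        const std::size_t lo = i * interval_size;
        const std::size_t hi = std::min(n, lo + interval_size);   // last interval may be short
        result[i] = std::accumulate(first + lo, first + hi, 0L);
      }
    },
    tbb::simple_partitioner());   // grainsize 1 + simple_partitioner: one interval per task
}

int main()
{
  std::vector<int> v(10, 1);
  std::vector<long> out(4);
  reduce_intervals(v.data(), v.size(), 3, out.data());
  return (out[0] == 3 && out[3] == 1) ? 0 : 1;          // intervals of 3, 3, 3, 1 ones
}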
- */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace omp -{ -namespace detail -{ - -template - ForwardIterator remove_if(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred); - - -template - ForwardIterator remove_if(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred); - - -template - OutputIterator remove_copy_if(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred); - - -template - OutputIterator remove_copy_if(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred); - - -} // end namespace detail -} // end namespace omp -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/remove.inl b/compat/thrust/system/tbb/detail/remove.inl deleted file mode 100644 index 01916c52c8..0000000000 --- a/compat/thrust/system/tbb/detail/remove.inl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - -template - ForwardIterator remove_if(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - Predicate pred) -{ - // tbb prefers generic::remove_if to cpp::remove_if - return thrust::system::detail::generic::remove_if(exec, first, last, pred); -} - - -template - ForwardIterator remove_if(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - InputIterator stencil, - Predicate pred) -{ - // tbb prefers generic::remove_if to cpp::remove_if - return thrust::system::detail::generic::remove_if(exec, first, last, stencil, pred); -} - - -template - OutputIterator remove_copy_if(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - Predicate pred) -{ - // tbb prefers generic::remove_copy_if to cpp::remove_copy_if - return thrust::system::detail::generic::remove_copy_if(exec, first, last, result, pred); -} - -template - OutputIterator remove_copy_if(execution_policy &exec, - InputIterator1 first, - InputIterator1 last, - InputIterator2 stencil, - OutputIterator result, - Predicate pred) -{ - // tbb prefers generic::remove_copy_if to cpp::remove_copy_if - return thrust::system::detail::generic::remove_copy_if(exec, first, last, stencil, result, pred); -} - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/replace.h b/compat/thrust/system/tbb/detail/replace.h deleted file mode 100644 index c48555d0e1..0000000000 --- a/compat/thrust/system/tbb/detail/replace.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, 
Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits this algorithm -#include - diff --git a/compat/thrust/system/tbb/detail/reverse.h b/compat/thrust/system/tbb/detail/reverse.h deleted file mode 100644 index 04923d1f6a..0000000000 --- a/compat/thrust/system/tbb/detail/reverse.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits reverse -#include - diff --git a/compat/thrust/system/tbb/detail/scan.h b/compat/thrust/system/tbb/detail/scan.h deleted file mode 100644 index ed5cacd7e9..0000000000 --- a/compat/thrust/system/tbb/detail/scan.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file scan.h - * \brief TBB implementations of scan functions. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - -template - OutputIterator inclusive_scan(tag, - InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op); - - -template - OutputIterator exclusive_scan(tag, - InputIterator first, - InputIterator last, - OutputIterator result, - T init, - BinaryFunction binary_op); - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/scan.inl b/compat/thrust/system/tbb/detail/scan.inl deleted file mode 100644 index 48878241c9..0000000000 --- a/compat/thrust/system/tbb/detail/scan.inl +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ -namespace scan_detail -{ - -template -struct inclusive_body -{ - InputIterator input; - OutputIterator output; - thrust::detail::host_function binary_op; - ValueType sum; - bool first_call; - - inclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType dummy) - : input(input), output(output), binary_op(binary_op), sum(dummy), first_call(true) - {} - - inclusive_body(inclusive_body& b, ::tbb::split) - : input(b.input), output(b.output), binary_op(b.binary_op), sum(b.sum), first_call(true) - {} - - template - void operator()(const ::tbb::blocked_range& r, ::tbb::pre_scan_tag) - { - InputIterator iter = input + r.begin(); - - ValueType temp = *iter; - - ++iter; - - for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter) - temp = binary_op(temp, *iter); - - if (first_call) - sum = temp; - else - sum = binary_op(sum, temp); - - first_call = false; - } - - template - void operator()(const ::tbb::blocked_range& r, ::tbb::final_scan_tag) - { - InputIterator iter1 = input + r.begin(); - OutputIterator iter2 = output + r.begin(); - - if (first_call) - { - *iter2 = sum = *iter1; - ++iter1; - ++iter2; - for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter1, ++iter2) - *iter2 = sum = binary_op(sum, *iter1); - } - else - { - for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2) - *iter2 = sum = binary_op(sum, *iter1); - } - - first_call = false; - } - - void reverse_join(inclusive_body& b) - { - sum = binary_op(b.sum, sum); - } - - void assign(inclusive_body& b) - { - sum = b.sum; - } -}; - - -template -struct exclusive_body -{ - InputIterator input; - OutputIterator output; - thrust::detail::host_function binary_op; - ValueType sum; - bool first_call; - - exclusive_body(InputIterator input, OutputIterator output, BinaryFunction binary_op, ValueType init) - : input(input), output(output), binary_op(binary_op), sum(init), first_call(true) - {} - - exclusive_body(exclusive_body& b, ::tbb::split) - : input(b.input), output(b.output), binary_op(b.binary_op), sum(b.sum), first_call(true) - {} - - template - void operator()(const ::tbb::blocked_range& r, ::tbb::pre_scan_tag) - { - InputIterator iter = input + r.begin(); - - ValueType temp = *iter; - - ++iter; - - for (Size i = r.begin() + 1; i != r.end(); ++i, ++iter) - temp = binary_op(temp, *iter); - - if (first_call && r.begin() > 0) - sum = temp; - else - sum = binary_op(sum, temp); - - first_call = false; - } - - template - void operator()(const ::tbb::blocked_range& r, ::tbb::final_scan_tag) - { - InputIterator iter1 = input + r.begin(); - OutputIterator iter2 = output + r.begin(); - - for (Size i = r.begin(); i != r.end(); ++i, ++iter1, ++iter2) - { - ValueType temp = binary_op(sum, *iter1); - *iter2 = sum; - sum = temp; - } - - first_call = false; - } - - void reverse_join(exclusive_body& b) - { - sum = binary_op(b.sum, sum); - } - - void assign(exclusive_body& b) - { - sum = 
b.sum; - } -}; - -} // end scan_detail - - - -template - OutputIterator inclusive_scan(tag, - InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction binary_op) -{ - // the pseudocode for deducing the type of the temporary used below: - // - // if BinaryFunction is AdaptableBinaryFunction - // TemporaryType = AdaptableBinaryFunction::result_type - // else if OutputIterator is a "pure" output iterator - // TemporaryType = InputIterator::value_type - // else - // TemporaryType = OutputIterator::value_type - // - // XXX upon c++0x, TemporaryType needs to be: - // result_of::type - - using namespace thrust::detail; - - typedef typename eval_if< - has_result_type::value, - result_type, - eval_if< - is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - >::type ValueType; - - typedef typename thrust::iterator_difference::type Size; - - Size n = thrust::distance(first, last); - - if (n != 0) - { - typedef typename scan_detail::inclusive_body Body; - Body scan_body(first, result, binary_op, *first); - ::tbb::parallel_scan(::tbb::blocked_range(0,n), scan_body); - } - - thrust::advance(result, n); - - return result; -} - - -template - OutputIterator exclusive_scan(tag, - InputIterator first, - InputIterator last, - OutputIterator result, - T init, - BinaryFunction binary_op) -{ - // the pseudocode for deducing the type of the temporary used below: - // - // if BinaryFunction is AdaptableBinaryFunction - // TemporaryType = AdaptableBinaryFunction::result_type - // else if OutputIterator is a "pure" output iterator - // TemporaryType = InputIterator::value_type - // else - // TemporaryType = OutputIterator::value_type - // - // XXX upon c++0x, TemporaryType needs to be: - // result_of::type - - using namespace thrust::detail; - - typedef typename eval_if< - has_result_type::value, - result_type, - eval_if< - is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - > - >::type ValueType; - - typedef typename thrust::iterator_difference::type Size; - - Size n = thrust::distance(first, last); - - if (n != 0) - { - typedef typename scan_detail::exclusive_body Body; - Body scan_body(first, result, binary_op, init); - ::tbb::parallel_scan(::tbb::blocked_range(0,n), scan_body); - } - - thrust::advance(result, n); - - return result; -} - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/scan_by_key.h b/compat/thrust/system/tbb/detail/scan_by_key.h deleted file mode 100644 index cad4fc1454..0000000000 --- a/compat/thrust/system/tbb/detail/scan_by_key.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
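The removed scan.inl above uses the two-pass tbb::parallel_scan protocol: a pre_scan_tag pass that only accumulates subrange totals, a final_scan_tag pass that writes outputs once the running prefix is known, reverse_join to prepend the left neighbour's partial sum, and assign to propagate state. A minimal standalone inclusive prefix-sum body in that style (integer addition, hypothetical names, TBB assumed available), rather than the deleted Thrust code with its deduced value types:

// standalone illustration (not the deleted Thrust code); assumes TBB is installed
#include <cstddef>
#include <vector>
#include <tbb/blocked_range.h>
#include <tbb/parallel_scan.h>

struct prefix_sum_body
{
  const int* in;
  int* out;
  int sum;

  prefix_sum_body(const int* in_, int* out_) : in(in_), out(out_), sum(0) {}
  prefix_sum_body(prefix_sum_body& b, tbb::split) : in(b.in), out(b.out), sum(0) {}

  // first pass over a stolen subrange: just total it up
  void operator()(const tbb::blocked_range<std::size_t>& r, tbb::pre_scan_tag)
  {
    for (std::size_t i = r.begin(); i != r.end(); ++i)
      sum += in[i];
  }

  // final pass: sum already holds the prefix of everything before r, so outputs can be written
  void operator()(const tbb::blocked_range<std::size_t>& r, tbb::final_scan_tag)
  {
    for (std::size_t i = r.begin(); i != r.end(); ++i)
    {
      sum += in[i];
      out[i] = sum;
    }
  }

  void reverse_join(prefix_sum_body& left) { sum = left.sum + sum; }  // prepend the left partial sum
  void assign(prefix_sum_body& b) { sum = b.sum; }
};

int main()
{
  std::vector<int> in(8, 1), out(8);
  prefix_sum_body body(in.data(), out.data());
  tbb::parallel_scan(tbb::blocked_range<std::size_t>(0, in.size()), body);
  return out.back() == 8 ? 0 : 1;   // inclusive scan of eight ones ends at 8
}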
- */ - -#pragma once - -#include - -// this system inherits scan_by_key -#include - diff --git a/compat/thrust/system/tbb/detail/scatter.h b/compat/thrust/system/tbb/detail/scatter.h deleted file mode 100644 index c48555d0e1..0000000000 --- a/compat/thrust/system/tbb/detail/scatter.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits this algorithm -#include - diff --git a/compat/thrust/system/tbb/detail/sequence.h b/compat/thrust/system/tbb/detail/sequence.h deleted file mode 100644 index 811d8f5fbb..0000000000 --- a/compat/thrust/system/tbb/detail/sequence.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits sequence -#include - diff --git a/compat/thrust/system/tbb/detail/set_operations.h b/compat/thrust/system/tbb/detail/set_operations.h deleted file mode 100644 index 687edb2e7d..0000000000 --- a/compat/thrust/system/tbb/detail/set_operations.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits set_operations -#include - diff --git a/compat/thrust/system/tbb/detail/sort.h b/compat/thrust/system/tbb/detail/sort.h deleted file mode 100644 index 3b6f63075e..0000000000 --- a/compat/thrust/system/tbb/detail/sort.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - -template - void stable_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp); - -template - void stable_sort_by_key(execution_policy &exec, - RandomAccessIterator1 keys_first, - RandomAccessIterator1 keys_last, - RandomAccessIterator2 values_first, - StrictWeakOrdering comp); - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/sort.inl b/compat/thrust/system/tbb/detail/sort.inl deleted file mode 100644 index f292789067..0000000000 --- a/compat/thrust/system/tbb/detail/sort.inl +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ -namespace sort_detail -{ - -// TODO tune this based on data type and comp -const static int threshold = 128 * 1024; - -template -void merge_sort(execution_policy &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace); - -template -struct merge_sort_closure -{ - execution_policy &exec; - Iterator1 first1, last1; - Iterator2 first2; - StrictWeakOrdering comp; - bool inplace; - - merge_sort_closure(execution_policy &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace) - : exec(exec), first1(first1), last1(last1), first2(first2), comp(comp), inplace(inplace) - {} - - void operator()(void) const - { - merge_sort(exec, first1, last1, first2, comp, inplace); - } -}; - - -template -void merge_sort(execution_policy &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace) -{ - typedef typename thrust::iterator_difference::type difference_type; - - difference_type n = thrust::distance(first1, last1); - - if (n < threshold) - { - thrust::system::detail::internal::scalar::stable_sort(first1, last1, comp); - - if (!inplace) - thrust::system::detail::internal::scalar::copy(first1, last1, first2); - - return; - } - - Iterator1 mid1 = first1 + (n / 2); - Iterator2 mid2 = first2 + (n / 2); - Iterator2 last2 = first2 + n; - - typedef merge_sort_closure Closure; - - Closure left (exec, first1, mid1, first2, comp, !inplace); - Closure right(exec, mid1, last1, mid2, comp, !inplace); - - ::tbb::parallel_invoke(left, right); - - if (inplace) thrust::merge(exec, first2, mid2, mid2, last2, first1, comp); - else thrust::merge(exec, first1, mid1, mid1, last1, first2, comp); -} - -} // end namespace sort_detail - - -namespace sort_by_key_detail -{ - -// TODO tune this based on data type and comp -const static int threshold = 128 * 1024; - -template -void merge_sort_by_key(execution_policy &exec, - Iterator1 first1, - Iterator1 last1, - Iterator2 first2, - Iterator3 first3, - Iterator4 first4, - StrictWeakOrdering comp, - bool inplace); - -template -struct merge_sort_by_key_closure -{ - execution_policy &exec; - Iterator1 first1, last1; - Iterator2 first2; - Iterator3 first3; - Iterator4 first4; - StrictWeakOrdering comp; - bool inplace; - - merge_sort_by_key_closure(execution_policy &exec, - Iterator1 first1, - Iterator1 last1, - Iterator2 first2, - Iterator3 first3, - Iterator4 first4, - StrictWeakOrdering comp, - bool inplace) - : exec(exec), first1(first1), last1(last1), first2(first2), first3(first3), first4(first4), comp(comp), inplace(inplace) - {} - - void operator()(void) const - { - merge_sort_by_key(exec, first1, last1, first2, first3, first4, comp, inplace); - } -}; - - -template -void merge_sort_by_key(execution_policy &exec, - Iterator1 first1, - Iterator1 last1, - Iterator2 first2, - Iterator3 first3, - Iterator4 first4, - StrictWeakOrdering comp, - bool inplace) -{ - typedef typename thrust::iterator_difference::type difference_type; - - difference_type n = thrust::distance(first1, last1); - - Iterator1 mid1 = first1 + (n / 2); - Iterator2 mid2 = first2 + (n / 2); - Iterator3 mid3 = first3 + (n / 2); - Iterator4 mid4 = first4 + (n / 2); - Iterator2 last2 = first2 + n; - Iterator3 last3 = first3 + n; - - if (n < threshold) - { - thrust::system::detail::internal::scalar::stable_sort_by_key(first1, last1, 
first2, comp); - - if (!inplace) - { - thrust::system::detail::internal::scalar::copy(first1, last1, first3); - thrust::system::detail::internal::scalar::copy(first2, last2, first4); - } - - return; - } - - typedef merge_sort_by_key_closure Closure; - - Closure left (exec, first1, mid1, first2, first3, first4, comp, !inplace); - Closure right(exec, mid1, last1, mid2, mid3, mid4, comp, !inplace); - - ::tbb::parallel_invoke(left, right); - - if(inplace) - { - thrust::merge_by_key(exec, first3, mid3, mid3, last3, first4, mid4, first1, first2, comp); - } - else - { - thrust::merge_by_key(exec, first1, mid1, mid1, last1, first2, mid2, first3, first4, comp); - } -} - -} // end namespace sort_detail - -template -void stable_sort(execution_policy &exec, - RandomAccessIterator first, - RandomAccessIterator last, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_value::type key_type; - - thrust::detail::temporary_array temp(exec, first, last); - - sort_detail::merge_sort(exec, first, last, temp.begin(), comp, true); -} - -template - void stable_sort_by_key(execution_policy &exec, - RandomAccessIterator1 first1, - RandomAccessIterator1 last1, - RandomAccessIterator2 first2, - StrictWeakOrdering comp) -{ - typedef typename thrust::iterator_value::type key_type; - typedef typename thrust::iterator_value::type val_type; - - RandomAccessIterator2 last2 = first2 + thrust::distance(first1, last1); - - thrust::detail::temporary_array temp1(exec, first1, last1); - thrust::detail::temporary_array temp2(exec, first2, last2); - - sort_by_key_detail::merge_sort_by_key(exec, first1, last1, first2, temp1.begin(), temp2.begin(), comp, true); -} - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/swap_ranges.h b/compat/thrust/system/tbb/detail/swap_ranges.h deleted file mode 100644 index 15f8f55310..0000000000 --- a/compat/thrust/system/tbb/detail/swap_ranges.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// tbb inherits swap_ranges -#include - diff --git a/compat/thrust/system/tbb/detail/tabulate.h b/compat/thrust/system/tbb/detail/tabulate.h deleted file mode 100644 index da65d8e44d..0000000000 --- a/compat/thrust/system/tbb/detail/tabulate.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
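The removed sort.inl above is a recursive merge sort: below a size threshold it falls back to a serial sort, otherwise it sorts the two halves in parallel with tbb::parallel_invoke and merges them, ping-ponging between the input and a temporary buffer via the inplace flag so no extra copies are needed. A standalone sketch of that shape for plain ints follows (std::stable_sort and std::merge at the leaves, a deliberately tiny threshold so the example actually recurses; the deleted code tunes the cutoff to 128 * 1024 and also provides a key/value variant):

// standalone illustration (not the deleted Thrust code); assumes TBB is installed
#include <algorithm>
#include <cstddef>
#include <vector>
#include <tbb/parallel_invoke.h>

// sorts [first, first+n); the sorted data lands in `first` when inplace is true,
// in `buffer` otherwise, and the flag flips at every level of the recursion
static void merge_sort(int* first, int* buffer, std::size_t n, bool inplace)
{
  const std::size_t threshold = 4;   // tiny cutoff for demonstration only

  if (n <= threshold)
  {
    std::stable_sort(first, first + n);
    if (!inplace)
      std::copy(first, first + n, buffer);
    return;
  }

  const std::size_t half = n / 2;

  tbb::parallel_invoke(
    [=] { merge_sort(first,        buffer,        half,     !inplace); },
    [=] { merge_sort(first + half, buffer + half, n - half, !inplace); });

  if (inplace)
    std::merge(buffer, buffer + half, buffer + half, buffer + n, first);
  else
    std::merge(first, first + half, first + half, first + n, buffer);
}

int main()
{
  std::vector<int> v{5, 3, 8, 1, 9, 2, 7, 4, 6, 0};
  std::vector<int> tmp(v.size());
  merge_sort(v.data(), tmp.data(), v.size(), /*inplace=*/true);
  return std::is_sorted(v.begin(), v.end()) ? 0 : 1;
}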
- */ - -#pragma once - -#include - -// this system inherits tabulate -#include - diff --git a/compat/thrust/system/tbb/detail/temporary_buffer.h b/compat/thrust/system/tbb/detail/temporary_buffer.h deleted file mode 100644 index 628bd75719..0000000000 --- a/compat/thrust/system/tbb/detail/temporary_buffer.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system has no special temporary buffer functions - diff --git a/compat/thrust/system/tbb/detail/transform.h b/compat/thrust/system/tbb/detail/transform.h deleted file mode 100644 index 70ce1f41b6..0000000000 --- a/compat/thrust/system/tbb/detail/transform.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// omp inherits transform -#include - diff --git a/compat/thrust/system/tbb/detail/transform_reduce.h b/compat/thrust/system/tbb/detail/transform_reduce.h deleted file mode 100644 index 23ed07054a..0000000000 --- a/compat/thrust/system/tbb/detail/transform_reduce.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits transform_reduce -#include - diff --git a/compat/thrust/system/tbb/detail/transform_scan.h b/compat/thrust/system/tbb/detail/transform_scan.h deleted file mode 100644 index fc2e55d0c0..0000000000 --- a/compat/thrust/system/tbb/detail/transform_scan.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits transform_scan -#include - diff --git a/compat/thrust/system/tbb/detail/uninitialized_copy.h b/compat/thrust/system/tbb/detail/uninitialized_copy.h deleted file mode 100644 index 944f4baf0e..0000000000 --- a/compat/thrust/system/tbb/detail/uninitialized_copy.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits uninitialized_copy -#include - diff --git a/compat/thrust/system/tbb/detail/uninitialized_fill.h b/compat/thrust/system/tbb/detail/uninitialized_fill.h deleted file mode 100644 index b9d6de20fa..0000000000 --- a/compat/thrust/system/tbb/detail/uninitialized_fill.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// this system inherits uninitialized_fill -#include - diff --git a/compat/thrust/system/tbb/detail/unique.h b/compat/thrust/system/tbb/detail/unique.h deleted file mode 100644 index 34538cac7b..0000000000 --- a/compat/thrust/system/tbb/detail/unique.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -template - ForwardIterator unique(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred); - - -template - OutputIterator unique_copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate binary_pred); - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/unique.inl b/compat/thrust/system/tbb/detail/unique.inl deleted file mode 100644 index 06e6a30bb1..0000000000 --- a/compat/thrust/system/tbb/detail/unique.inl +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -template - ForwardIterator unique(execution_policy &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred) -{ - // tbb prefers generic::unique to cpp::unique - return thrust::system::detail::generic::unique(exec,first,last,binary_pred); -} // end unique() - - -template - OutputIterator unique_copy(execution_policy &exec, - InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate binary_pred) -{ - // tbb prefers generic::unique_copy to cpp::unique_copy - return thrust::system::detail::generic::unique_copy(exec,first,last,output,binary_pred); -} // end unique_copy() - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/unique_by_key.h b/compat/thrust/system/tbb/detail/unique_by_key.h deleted file mode 100644 index c6d053243f..0000000000 --- a/compat/thrust/system/tbb/detail/unique_by_key.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
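As the comments in the deleted unique.inl note, the TBB backend adds nothing of its own here: thrust::unique and thrust::unique_copy simply forward to the generic implementations. A small usage sketch of the algorithm those forwards resolve to, shown with the thrust::host policy purely to keep the example free of TBB; a TBB-tagged range or thrust::tbb::par would be called the same way.

#include <thrust/unique.h>
#include <thrust/execution_policy.h>

int main()
{
    int keys[7] = {1, 1, 2, 3, 3, 3, 4};

    // unique() compacts runs of consecutive equal elements and
    // returns the new logical end of the range.
    int *new_end = thrust::unique(thrust::host, keys, keys + 7);

    // keys now begins with {1, 2, 3, 4}
    return (new_end - keys) == 4 ? 0 : 1;
}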
- */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -template - thrust::pair - unique_by_key(execution_policy &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred); - - -template - thrust::pair - unique_by_key_copy(execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred); - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - -#include - diff --git a/compat/thrust/system/tbb/detail/unique_by_key.inl b/compat/thrust/system/tbb/detail/unique_by_key.inl deleted file mode 100644 index 7747ca4c2e..0000000000 --- a/compat/thrust/system/tbb/detail/unique_by_key.inl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ -namespace detail -{ - - -template - thrust::pair - unique_by_key(execution_policy &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred) -{ - // tbb prefers generic::unique_by_key to cpp::unique_by_key - return thrust::system::detail::generic::unique_by_key(exec,keys_first,keys_last,values_first,binary_pred); -} // end unique_by_key() - - -template - thrust::pair - unique_by_key_copy(execution_policy &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_output, - OutputIterator2 values_output, - BinaryPredicate binary_pred) -{ - // tbb prefers generic::unique_by_key_copy to cpp::unique_by_key_copy - return thrust::system::detail::generic::unique_by_key_copy(exec,keys_first,keys_last,values_first,keys_output,values_output,binary_pred); -} // end unique_by_key_copy() - - -} // end namespace detail -} // end namespace tbb -} // end namespace system -} // end namespace thrust - diff --git a/compat/thrust/system/tbb/detail/vector.inl b/compat/thrust/system/tbb/detail/vector.inl deleted file mode 100644 index d87e670fd6..0000000000 --- a/compat/thrust/system/tbb/detail/vector.inl +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
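unique_by_key and unique_by_key_copy follow the same forwarding pattern; the keys determine which element of each run survives and the values are compacted alongside them. A brief sketch, again with thrust::host so it stays dependency-free:

#include <thrust/unique.h>
#include <thrust/pair.h>
#include <thrust/execution_policy.h>

int main()
{
    int keys[7] = {1, 1, 2, 3, 3, 3, 4};
    int vals[7] = {9, 8, 7, 6, 5, 4, 3};

    // For each run of equal keys only the first key/value pair is kept.
    thrust::pair<int*, int*> ends =
        thrust::unique_by_key(thrust::host, keys, keys + 7, vals);

    // keys now begins with {1, 2, 3, 4}, vals with {9, 7, 6, 3}
    return (ends.first - keys) == 4 ? 0 : 1;
}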
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ - -template - vector - ::vector() - : super_t() -{} - -template - vector - ::vector(size_type n) - : super_t(n) -{} - -template - vector - ::vector(size_type n, const value_type &value) - : super_t(n,value) -{} - -template - vector - ::vector(const vector &x) - : super_t(x) -{} - -template - template - vector - ::vector(const thrust::detail::vector_base &x) - : super_t(x) -{} - -template - template - vector - ::vector(const std::vector &x) - : super_t(x) -{} - -template - template - vector - ::vector(InputIterator first, InputIterator last) - : super_t(first,last) -{} - -template - template - vector & - vector - ::operator=(const std::vector &x) -{ - super_t::operator=(x); - return *this; -} - -template - template - vector & - vector - ::operator=(const thrust::detail::vector_base &x) -{ - super_t::operator=(x); - return *this; -} - -} // end tbb -} // end system -} // end thrust - diff --git a/compat/thrust/system/tbb/execution_policy.h b/compat/thrust/system/tbb/execution_policy.h deleted file mode 100644 index c462586f82..0000000000 --- a/compat/thrust/system/tbb/execution_policy.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -/*! \file thrust/system/tbb/execution_policy.h - * \brief Execution policies for Thrust's TBB system. - */ - -#include - -// get the execution policies definitions first -#include - -// get the definition of par -#include - -// now get all the algorithm definitions - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -// define these entities here for the purpose of Doxygenating them -// they are actually defined elsewhere -#if 0 -namespace thrust -{ -namespace system -{ -namespace tbb -{ - - -/*! \addtogroup execution_policies - * \{ - */ - - -/*! \p thrust::tbb::execution_policy is the base class for all Thrust parallel execution - * policies which are derived from Thrust's TBB backend system. - */ -template -struct execution_policy : thrust::execution_policy -{}; - - -/*! \p tbb::tag is a type representing Thrust's TBB backend system in C++'s type system. - * Iterators "tagged" with a type which is convertible to \p tbb::tag assert that they may be - * "dispatched" to algorithm implementations in the \p tbb system. - */ -struct tag : thrust::system::tbb::execution_policy { unspecified }; - - -/*! 
\p thrust::tbb::par is the parallel execution policy associated with Thrust's TBB - * backend system. - * - * Instead of relying on implicit algorithm dispatch through iterator system tags, users may - * directly target Thrust's TBB backend system by providing \p thrust::tbb::par as an algorithm - * parameter. - * - * Explicit dispatch can be useful in avoiding the introduction of data copies into containers such - * as \p thrust::tbb::vector. - * - * The type of \p thrust::tbb::par is implementation-defined. - * - * The following code snippet demonstrates how to use \p thrust::tbb::par to explicitly dispatch an - * invocation of \p thrust::for_each to the TBB backend system: - * - * \code - * #include - * #include - * #include - * - * struct printf_functor - * { - * __host__ __device__ - * void operator()(int x) - * { - * printf("%d\n"); - * } - * }; - * ... - * int vec[3]; - * vec[0] = 0; vec[1] = 1; vec[2] = 2; - * - * thrust::for_each(thrust::tbb::par, vec.begin(), vec.end(), printf_functor()); - * - * // 0 1 2 is printed to standard output in some unspecified order - * \endcode - */ -static const unspecified par; - - -/*! \} - */ - - -} // end tbb -} // end system -} // end thrust -#endif - - diff --git a/compat/thrust/system/tbb/memory.h b/compat/thrust/system/tbb/memory.h deleted file mode 100644 index deea7eed7d..0000000000 --- a/compat/thrust/system/tbb/memory.h +++ /dev/null @@ -1,414 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/system/tbb/memory.h - * \brief Managing memory associated with Thrust's TBB system. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ - -template class pointer; - -} // end tbb -} // end system -} // end thrust - - -/*! \cond - */ - -// specialize std::iterator_traits to avoid problems with the name of -// pointer's constructor shadowing its nested pointer type -// do this before pointer is defined so the specialization is correctly -// used inside the definition -namespace std -{ - -template - struct iterator_traits > -{ - private: - typedef thrust::system::tbb::pointer ptr; - - public: - typedef typename ptr::iterator_category iterator_category; - typedef typename ptr::value_type value_type; - typedef typename ptr::difference_type difference_type; - typedef ptr pointer; - typedef typename ptr::reference reference; -}; // end iterator_traits - -} // end std - -/*! \endcond - */ - - -namespace thrust -{ -namespace system -{ - -/*! \addtogroup system_backends Systems - * \ingroup system - * \{ - */ - -/*! \namespace thrust::system::tbb - * \brief \p thrust::system::tbb is the namespace containing functionality for allocating, manipulating, - * and deallocating memory available to Thrust's TBB backend system. 
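The for_each snippet embedded in the execution_policy.h documentation above has two small defects: printf("%d\n") is called without passing x, and .begin()/.end() are called on a plain int array. A corrected, self-contained version is sketched below; it assumes the compat Thrust headers and TBB are available and that the translation unit is built with nvcc (or another compiler that defines the __host__/__device__ macros).

#include <thrust/system/tbb/execution_policy.h>
#include <thrust/for_each.h>
#include <cstdio>

struct printf_functor
{
    __host__ __device__
    void operator()(int x)
    {
        printf("%d\n", x);   // the documentation snippet omitted the x argument
    }
};

int main()
{
    int vec[3] = {0, 1, 2};

    // Explicit dispatch to the TBB backend, bypassing iterator-tag dispatch.
    thrust::for_each(thrust::tbb::par, vec, vec + 3, printf_functor());

    // 0, 1 and 2 are printed in some unspecified order
    return 0;
}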
- * The identifiers are provided in a separate namespace underneath thrust::system - * for import convenience but are also aliased in the top-level thrust::tbb - * namespace for easy access. - * - */ -namespace tbb -{ - -// forward declaration of reference for pointer -template class reference; - -/*! \cond - */ - -// XXX nvcc + msvc have trouble instantiating reference below -// this is a workaround -namespace detail -{ - -template - struct reference_msvc_workaround -{ - typedef thrust::system::tbb::reference type; -}; // end reference_msvc_workaround - -} // end detail - -/*! \endcond - */ - - -/*! \p pointer stores a pointer to an object allocated in memory available to the tbb system. - * This type provides type safety when dispatching standard algorithms on ranges resident - * in tbb memory. - * - * \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic. - * - * \p pointer can be created with the function \p tbb::malloc, or by explicitly calling its constructor - * with a raw pointer. - * - * The raw pointer encapsulated by a \p pointer may be obtained by eiter its get member function - * or the \p raw_pointer_cast function. - * - * \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory - * pointed to by \p pointer. - * - * \tparam T specifies the type of the pointee. - * - * \see tbb::malloc - * \see tbb::free - * \see raw_pointer_cast - */ -template - class pointer - : public thrust::pointer< - T, - thrust::system::tbb::tag, - thrust::system::tbb::reference, - thrust::system::tbb::pointer - > -{ - /*! \cond - */ - - private: - typedef thrust::pointer< - T, - thrust::system::tbb::tag, - //thrust::system::tbb::reference, - typename detail::reference_msvc_workaround::type, - thrust::system::tbb::pointer - > super_t; - - /*! \endcond - */ - - public: - // note that tbb::pointer's member functions need __host__ __device__ - // to interoperate with nvcc + iterators' dereference member function - - /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0. - */ - __host__ __device__ - pointer() : super_t() {} - - /*! This constructor allows construction of a pointer from a T*. - * - * \param ptr A raw pointer to copy from, presumed to point to a location in memory - * accessible by the \p tbb system. - * \tparam OtherT \p OtherT shall be convertible to \p T. - */ - template - __host__ __device__ - explicit pointer(OtherT *ptr) : super_t(ptr) {} - - /*! This constructor allows construction from another pointer-like object with related type. - * - * \param other The \p OtherPointer to copy. - * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible - * to \p thrust::system::tbb::tag and its element type shall be convertible to \p T. - */ - template - __host__ __device__ - pointer(const OtherPointer &other, - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer - >::type * = 0) : super_t(other) {} - - /*! Assignment operator allows assigning from another pointer-like object with related type. - * - * \param other The other pointer-like object to assign from. - * \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible - * to \p thrust::system::tbb::tag and its element type shall be convertible to \p T. 
- */ - template - __host__ __device__ - typename thrust::detail::enable_if_pointer_is_convertible< - OtherPointer, - pointer, - pointer & - >::type - operator=(const OtherPointer &other) - { - return super_t::operator=(other); - } -}; // end pointer - - -/*! \p reference is a wrapped reference to an object stored in memory available to the \p tbb system. - * \p reference is the type of the result of dereferencing a \p tbb::pointer. - * - * \tparam T Specifies the type of the referenced object. - */ -template - class reference - : public thrust::reference< - T, - thrust::system::tbb::pointer, - thrust::system::tbb::reference - > -{ - /*! \cond - */ - - private: - typedef thrust::reference< - T, - thrust::system::tbb::pointer, - thrust::system::tbb::reference - > super_t; - - /*! \endcond - */ - - public: - /*! \cond - */ - - typedef typename super_t::value_type value_type; - typedef typename super_t::pointer pointer; - - /*! \endcond - */ - - /*! This constructor initializes this \p reference to refer to an object - * pointed to by the given \p pointer. After this \p reference is constructed, - * it shall refer to the object pointed to by \p ptr. - * - * \param ptr A \p pointer to copy from. - */ - __host__ __device__ - explicit reference(const pointer &ptr) - : super_t(ptr) - {} - - /*! This constructor accepts a const reference to another \p reference of related type. - * After this \p reference is constructed, it shall refer to the same object as \p other. - * - * \param other A \p reference to copy from. - * \tparam OtherT The element type of the other \p reference. - * - * \note This constructor is templated primarily to allow initialization of reference - * from reference. - */ - template - __host__ __device__ - reference(const reference &other, - typename thrust::detail::enable_if_convertible< - typename reference::pointer, - pointer - >::type * = 0) - : super_t(other) - {} - - /*! Copy assignment operator copy assigns from another \p reference of related type. - * - * \param other The other \p reference to assign from. - * \return *this - * \tparam OtherT The element type of the other \p reference. - */ - template - reference &operator=(const reference &other); - - /*! Assignment operator assigns from a \p value_type. - * - * \param x The \p value_type to assign from. - * \return *this - */ - reference &operator=(const value_type &x); -}; // end reference - -/*! Exchanges the values of two objects referred to by \p reference. - * \p x The first \p reference of interest. - * \p y The second \p reference ot interest. - */ -template -__host__ __device__ -void swap(reference x, reference y); - -/*! Allocates an area of memory available to Thrust's tbb system. - * \param n Number of bytes to allocate. - * \return A tbb::pointer pointing to the beginning of the newly - * allocated memory. A null tbb::pointer is returned if - * an error occurs. - * \note The tbb::pointer returned by this function must be - * deallocated with \p tbb::free. - * \see tbb::free - * \see std::malloc - */ -inline pointer malloc(std::size_t n); - -/*! Allocates a typed area of memory available to Thrust's tbb system. - * \param n Number of elements to allocate. - * \return A tbb::pointer pointing to the beginning of the newly - * allocated memory. A null tbb::pointer is returned if - * an error occurs. - * \note The tbb::pointer returned by this function must be - * deallocated with \p tbb::free. - * \see tbb::free - * \see std::malloc - */ -template -inline pointer malloc(std::size_t n); - -/*! 
Deallocates an area of memory previously allocated by tbb::malloc. - * \param ptr A tbb::pointer pointing to the beginning of an area - * of memory previously allocated with tbb::malloc. - * \see tbb::malloc - * \see std::free - */ -inline void free(pointer ptr); - -// XXX upon c++11 -// template using allocator = thrust::detail::malloc_allocator >; - -/*! \p tbb::allocator is the default allocator used by the \p tbb system's containers such as - * tbb::vector if no user-specified allocator is provided. \p tbb::allocator allocates - * (deallocates) storage with \p tbb::malloc (\p tbb::free). - */ -template - struct allocator - : thrust::detail::malloc_allocator< - T, - tag, - pointer - > -{ - /*! The \p rebind metafunction provides the type of an \p allocator - * instantiated with another type. - * - * \tparam U The other type to use for instantiation. - */ - template - struct rebind - { - /*! The typedef \p other gives the type of the rebound \p allocator. - */ - typedef allocator other; - }; - - /*! No-argument constructor has no effect. - */ - __host__ __device__ - inline allocator() {} - - /*! Copy constructor has no effect. - */ - __host__ __device__ - inline allocator(const allocator &) {} - - /*! Constructor from other \p allocator has no effect. - */ - template - __host__ __device__ - inline allocator(const allocator &) {} - - /*! Destructor has no effect. - */ - __host__ __device__ - inline ~allocator() {} -}; // end allocator - -} // end tbb - -/*! \} - */ - -} // end system - -/*! \namespace thrust::tbb - * \brief \p thrust::tbb is a top-level alias for thrust::system::tbb. - */ -namespace tbb -{ - -using thrust::system::tbb::pointer; -using thrust::system::tbb::reference; -using thrust::system::tbb::malloc; -using thrust::system::tbb::free; -using thrust::system::tbb::allocator; - -} // end tbb - -} // end thrust - -#include - diff --git a/compat/thrust/system/tbb/vector.h b/compat/thrust/system/tbb/vector.h deleted file mode 100644 index 1c49c3f9e0..0000000000 --- a/compat/thrust/system/tbb/vector.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in ctbbliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/system/tbb/vector.h - * \brief A dynamically-sizable array of elements which reside in memory available to - * Thrust's TBB system. - */ - -#pragma once - -#include -#include -#include -#include - -namespace thrust -{ -namespace system -{ -namespace tbb -{ - -// XXX upon c++11 -// template > using vector = thrust::detail::vector_base; - -/*! \p tbb::vector is a container that supports random access to elements, - * constant time removal of elements at the end, and linear time insertion - * and removal of elements at the beginning or in the middle. The number of - * elements in a \p tbb::vector may vary dynamically; memory management is - * automatic. The elements contained in a \p tbb::vector reside in memory - * available to the \p tbb system. - * - * \tparam T The element type of the \p tbb::vector. 
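Taken together, the memory.h interfaces above provide a tagged pointer type plus a malloc/free pair for the TBB system. A short usage sketch, assuming the typed tbb::malloc<T> overload and tbb::free behave as documented in the deleted header (allocation counted in elements, release through tbb::free):

#include <thrust/system/tbb/memory.h>
#include <thrust/fill.h>
#include <thrust/memory.h>   // raw_pointer_cast

int main()
{
    // Allocate 10 ints in memory managed by the TBB system.
    thrust::tbb::pointer<int> p = thrust::tbb::malloc<int>(10);

    // The tagged pointer dispatches algorithms to the TBB backend.
    thrust::fill(p, p + 10, 7);

    // Recover the raw pointer when plain C++ code needs it.
    int *raw = thrust::raw_pointer_cast(p);
    (void) raw;

    // Memory obtained from tbb::malloc must be released with tbb::free.
    thrust::tbb::free(p);
    return 0;
}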
- * \tparam Allocator The allocator type of the \p tbb::vector. Defaults to \p tbb::allocator. - * - * \see http://www.sgi.com/tech/stl/Vector.html - * \see host_vector For the documentation of the complete interface which is - * shared by \p tbb::vector - * \see device_vector - */ -template > - class vector - : public thrust::detail::vector_base -{ - /*! \cond - */ - private: - typedef thrust::detail::vector_base super_t; - /*! \endcond - */ - - public: - - /*! \cond - */ - typedef typename super_t::size_type size_type; - typedef typename super_t::value_type value_type; - /*! \endcond - */ - - /*! This constructor creates an empty \p tbb::vector. - */ - vector(); - - /*! This constructor creates a \p tbb::vector with \p n default-constructed elements. - * \param n The size of the \p tbb::vector to create. - */ - explicit vector(size_type n); - - /*! This constructor creates a \p tbb::vector with \p n copies of \p value. - * \param n The size of the \p tbb::vector to create. - * \param value An element to copy. - */ - explicit vector(size_type n, const value_type &value); - - /*! Copy constructor copies from another \p tbb::vector. - * \param x The other \p tbb::vector to copy. - */ - vector(const vector &x); - - /*! This constructor copies from another Thrust vector-like object. - * \param x The other object to copy from. - */ - template - vector(const thrust::detail::vector_base &x); - - /*! This constructor copies from a \c std::vector. - * \param x The \c std::vector to copy from. - */ - template - vector(const std::vector &x); - - /*! This constructor creates a \p tbb::vector by copying from a range. - * \param first The beginning of the range. - * \param last The end of the range. - */ - template - vector(InputIterator first, InputIterator last); - - // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns - - /*! Assignment operator assigns from a \c std::vector. - * \param x The \c std::vector to assign from. - * \return *this - */ - template - vector &operator=(const std::vector &x); - - /*! Assignment operator assigns from another Thrust vector-like object. - * \param x The other object to assign from. - * \return *this - */ - template - vector &operator=(const thrust::detail::vector_base &x); -}; // end vector - -} // end tbb -} // end system - -// alias system::tbb names at top-level -namespace tbb -{ - -using thrust::system::tbb::vector; - -} // end tbb - -} // end thrust - -#include - diff --git a/compat/thrust/system_error.h b/compat/thrust/system_error.h deleted file mode 100644 index ce88fe6bd8..0000000000 --- a/compat/thrust/system_error.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file thrust/system_error.h - * \brief System diagnostics - */ - -#pragma once - -#include - -namespace thrust -{ - -/*! \addtogroup system System Access - * \{ - */ - -/*! 
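A brief sketch of the tbb::vector container described above, relying on the std::vector copy constructor declared in the deleted header; because the container's iterators are tagged with the TBB system, algorithms called on them dispatch to the TBB backend, including the parallel merge sort shown earlier.

#include <thrust/system/tbb/vector.h>
#include <thrust/sort.h>
#include <vector>

int main()
{
    std::vector<int> host_data(3);
    host_data[0] = 3; host_data[1] = 1; host_data[2] = 2;

    // Copy-construct from std::vector; storage lives in TBB-system memory.
    thrust::tbb::vector<int> v(host_data);

    // Dispatches to the TBB backend's stable merge sort.
    thrust::sort(v.begin(), v.end());

    // v is now {1, 2, 3}
    return 0;
}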
\namespace thrust::system - * \brief \p thrust::system is the namespace which contains functionality for manipulating - * memory specific to one of Thrust's backend systems. It also contains functionality - * for reporting error conditions originating from the operating system or other - * low-level application program interfaces such as the CUDA runtime. - * They are provided in a separate namespace for import convenience but are - * also aliased in the top-level \p thrust namespace for easy access. - */ -namespace system -{ -} // end system - -/*! \} // end system - */ - -} // end thrust - -#include -#include - diff --git a/compat/thrust/tabulate.h b/compat/thrust/tabulate.h deleted file mode 100644 index c87edf01ab..0000000000 --- a/compat/thrust/tabulate.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file tabulate.h - * \brief Fills a range with the tabulation of a function - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup transformations - * \{ - */ - - -/*! \p tabulate fills the range [first, last) with the value of a function applied to each - * element's index. - * - * For each iterator \c i in the range [first, last), \p tabulate performs the assignment - * *i = unary_op(i - first). - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the range. - * \param last The end of the range. - * \param unary_op The unary operation to apply. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, - * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. - * \tparam UnaryOperation is a model of Unary Function - * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p tabulate to generate the first \c n non-positive integers - * using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * const int N = 10; - * int A[N]; - * thrust::tabulate(thrust::host, A, A + 10, thrust::negate()); - * // A is now {0, -1, -2, -3, -4, -5, -6, -7, -8, -9} - * \endcode - * - * \see thrust::fill - * \see thrust::generate - * \see thrust::sequence - */ -template - void tabulate(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - UnaryOperation unary_op); - - -/*! \p tabulate fills the range [first, last) with the value of a function applied to each - * element's index. - * - * For each iterator \c i in the range [first, last), \p tabulate performs the assignment - * *i = unary_op(i - first). 
- * - * \param first The beginning of the range. - * \param last The end of the range. - * \param unary_op The unary operation to apply. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then x + y is defined, - * and if \c T is \p ForwardIterator's \c value_type, then T(0) is defined. - * \tparam UnaryOperation is a model of Unary Function - * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * - * The following code snippet demonstrates how to use \p tabulate to generate the first \c n non-positive integers: - * - * \code - * #include - * #include - * ... - * const int N = 10; - * int A[N]; - * thrust::tabulate(A, A + 10, thrust::negate()); - * // A is now {0, -1, -2, -3, -4, -5, -6, -7, -8, -9} - * \endcode - * - * \see thrust::fill - * \see thrust::generate - * \see thrust::sequence - */ -template - void tabulate(ForwardIterator first, - ForwardIterator last, - UnaryOperation unary_op); - - -/*! \} // end transformations - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/transform.h b/compat/thrust/transform.h deleted file mode 100644 index 1ada105828..0000000000 --- a/compat/thrust/transform.h +++ /dev/null @@ -1,720 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file transform.h - * \brief Transforms input ranges using a function object - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup algorithms - */ - -/*! \addtogroup transformations - * \ingroup algorithms - * \{ - */ - - -/*! This version of \p transform applies a unary function to each element - * of an input sequence and stores the result in the corresponding - * position in an output sequence. Specifically, for each iterator - * i in the range [\p first, \p last) the operation - * op(*i) is performed and the result is assigned to *o, - * where o is the corresponding output iterator in the range - * [\p result, \p result + (\p last - \p first) ). The input and - * output sequences may coincide, resulting in an in-place transformation. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param op The tranformation operation. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. 
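A compilable version of the tabulate example documented above; the functor needs its template argument spelled out (thrust::negate<int>()), and thrust::host makes the host dispatch explicit.

#include <thrust/tabulate.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    const int N = 10;
    int A[N];

    // Each element receives unary_op(index): 0, -1, -2, ...
    thrust::tabulate(thrust::host, A, A + N, thrust::negate<int>());

    // A is now {0, -1, -2, -3, -4, -5, -6, -7, -8, -9}
    return 0;
}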
- * \tparam UnaryFunction is a model of Unary Function - * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. - * - * The following code snippet demonstrates how to use \p transform to negate a range in-place - * using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * - * thrust::negate op; - * - * thrust::transform(thrust::host, data, data + 10, data, op); // in-place transformation - * - * // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8}; - * \endcode - * - * \see http://www.sgi.com/tech/stl/transform.html - */ -template - OutputIterator transform(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - OutputIterator result, - UnaryFunction op); - - -/*! This version of \p transform applies a unary function to each element - * of an input sequence and stores the result in the corresponding - * position in an output sequence. Specifically, for each iterator - * i in the range [\p first, \p last) the operation - * op(*i) is performed and the result is assigned to *o, - * where o is the corresponding output iterator in the range - * [\p result, \p result + (\p last - \p first) ). The input and - * output sequences may coincide, resulting in an in-place transformation. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param op The tranformation operation. - * \return The end of the output sequence. - * - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam UnaryFunction is a model of Unary Function - * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. - * - * The following code snippet demonstrates how to use \p transform - * - * \code - * #include - * #include - * - * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * - * thrust::negate op; - * - * thrust::transform(data, data + 10, data, op); // in-place transformation - * - * // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8}; - * \endcode - * - * \see http://www.sgi.com/tech/stl/transform.html - */ -template - OutputIterator transform(InputIterator first, InputIterator last, - OutputIterator result, - UnaryFunction op); - - -/*! This version of \p transform applies a binary function to each pair - * of elements from two input sequences and stores the result in the - * corresponding position in an output sequence. Specifically, for - * each iterator i in the range [\p first1, \p last1) and - * j = first + (i - first1) in the range [\p first2, \p last2) - * the operation op(*i,*j) is performed and the result is - * assigned to *o, where o is the corresponding - * output iterator in the range [\p result, \p result + (\p last - \p first) ). - * The input and output sequences may coincide, resulting in an - * in-place transformation. - * - * The algorithm's execution is parallelized as determined by \p exec. 
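A runnable form of the in-place negation example from the unary transform documentation above, assuming thrust::negate<int> from thrust/functional.h and the thrust::host policy.

#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};

    // In-place unary transform: every element is replaced by its negation.
    thrust::transform(thrust::host, data, data + 10, data, thrust::negate<int>());

    // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8}
    return 0;
}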
- * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input sequence. - * \param last1 The end of the first input sequence. - * \param first2 The beginning of the second input sequence. - * \param result The beginning of the output sequence. - * \param op The tranformation operation. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator - * and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type. - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam BinaryFunction is a model of Binary Function - * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * - * \pre \p first1 may equal \p result, but the range [first1, last1) shall not overlap the range [result, result + (last1 - first1)) otherwise. - * \pre \p first2 may equal \p result, but the range [first2, first2 + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. - * - * The following code snippet demonstrates how to use \p transform to compute the sum of two - * ranges using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * int input1[6] = {-5, 0, 2, 3, 2, 4}; - * int input2[6] = { 3, 6, -2, 1, 2, 3}; - * int output[6]; - * - * thrust::plus op; - * - * thrust::transform(thrust::host, input1, input1 + 6, input2, output, op); - * - * // output is now {-2, 6, 0, 4, 4, 7}; - * \endcode - * - * \see http://www.sgi.com/tech/stl/transform.html - */ -template - OutputIterator transform(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryFunction op); - - -/*! This version of \p transform applies a binary function to each pair - * of elements from two input sequences and stores the result in the - * corresponding position in an output sequence. Specifically, for - * each iterator i in the range [\p first1, \p last1) and - * j = first + (i - first1) in the range [\p first2, \p last2) - * the operation op(*i,*j) is performed and the result is - * assigned to *o, where o is the corresponding - * output iterator in the range [\p result, \p result + (\p last - \p first) ). - * The input and output sequences may coincide, resulting in an - * in-place transformation. - * - * \param first1 The beginning of the first input sequence. - * \param last1 The end of the first input sequence. - * \param first2 The beginning of the second input sequence. - * \param result The beginning of the output sequence. - * \param op The tranformation operation. - * \return The end of the output sequence. - * - * \tparam InputIterator1 is a model of Input Iterator - * and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type. - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type. - * \tparam OutputIterator is a model of Output Iterator. 
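The binary overload documented above combines two input ranges element-wise. A compilable version of its sum example:

#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

int main()
{
    int input1[6] = {-5, 0,  2, 3, 2, 4};
    int input2[6] = { 3, 6, -2, 1, 2, 3};
    int output[6];

    // output[i] = input1[i] + input2[i]
    thrust::transform(thrust::host, input1, input1 + 6, input2, output,
                      thrust::plus<int>());

    // output is now {-2, 6, 0, 4, 4, 7}
    return 0;
}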
- * \tparam BinaryFunction is a model of Binary Function - * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * - * \pre \p first1 may equal \p result, but the range [first1, last1) shall not overlap the range [result, result + (last1 - first1)) otherwise. - * \pre \p first2 may equal \p result, but the range [first2, first2 + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. - * - * The following code snippet demonstrates how to use \p transform - * - * \code - * #include - * #include - * - * int input1[6] = {-5, 0, 2, 3, 2, 4}; - * int input2[6] = { 3, 6, -2, 1, 2, 3}; - * int output[6]; - * - * thrust::plus op; - * - * thrust::transform(input1, input1 + 6, input2, output, op); - * - * // output is now {-2, 6, 0, 4, 4, 7}; - * \endcode - * - * \see http://www.sgi.com/tech/stl/transform.html - */ -template - OutputIterator transform(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, - OutputIterator result, - BinaryFunction op); - - -/*! This version of \p transform_if conditionally applies a unary function - * to each element of an input sequence and stores the result in the corresponding - * position in an output sequence if the corresponding position in the input sequence - * satifies a predicate. Otherwise, the corresponding position in the - * output sequence is not modified. - * - * Specifically, for each iterator i in the range [first, last) the - * predicate pred(*i) is evaluated. If this predicate - * evaluates to \c true, the result of op(*i) is assigned to *o, - * where o is the corresponding output iterator in the range - * [result, result + (last - first) ). Otherwise, op(*i) is - * not evaluated and no assignment occurs. The input and output sequences may coincide, - * resulting in an in-place transformation. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param op The tranformation operation. - * \param pred The predicate operation. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \c InputIterator's \c value_type is convertible to \c Predicate's \c argument_type, - * and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam UnaryFunction is a model of Unary Function - * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * \tparam Predicate is a model of Predicate. - * - * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. - * - * The following code snippet demonstrates how to use \p transform_if to negate the odd-valued - * elements of a range using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... 
- * - * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * - * struct is_odd - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x % 2; - * } - * }; - * - * thrust::negate op; - * thrust::identity identity; - * - * // negate odd elements - * thrust::transform_if(thrust::host, data, data + 10, data, op, is_odd()); // in-place transformation - * - * // data is now {5, 0, 2, 3, 2, 4, 0, 1, 2, 8}; - * \endcode - * - * \see thrust::transform - */ -template - ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, - InputIterator first, InputIterator last, - ForwardIterator result, - UnaryFunction op, - Predicate pred); - - -/*! This version of \p transform_if conditionally applies a unary function - * to each element of an input sequence and stores the result in the corresponding - * position in an output sequence if the corresponding position in the input sequence - * satifies a predicate. Otherwise, the corresponding position in the - * output sequence is not modified. - * - * Specifically, for each iterator i in the range [first, last) the - * predicate pred(*i) is evaluated. If this predicate - * evaluates to \c true, the result of op(*i) is assigned to *o, - * where o is the corresponding output iterator in the range - * [result, result + (last - first) ). Otherwise, op(*i) is - * not evaluated and no assignment occurs. The input and output sequences may coincide, - * resulting in an in-place transformation. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param op The tranformation operation. - * \param pred The predicate operation. - * \return The end of the output sequence. - * - * \tparam InputIterator is a model of Input Iterator, - * and \c InputIterator's \c value_type is convertible to \c Predicate's \c argument_type, - * and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam UnaryFunction is a model of Unary Function - * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * \tparam Predicate is a model of Predicate. - * - * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. - * - * The following code snippet demonstrates how to use \p transform_if: - * - * \code - * #include - * #include - * - * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * - * struct is_odd - * { - * __host__ __device__ - * bool operator()(int x) - * { - * return x % 2; - * } - * }; - * - * thrust::negate op; - * thrust::identity identity; - * - * // negate odd elements - * thrust::transform_if(data, data + 10, data, op, is_odd()); // in-place transformation - * - * // data is now {5, 0, 2, 3, 2, 4, 0, 1, 2, 8}; - * \endcode - * - * \see thrust::transform - */ -template - ForwardIterator transform_if(InputIterator first, InputIterator last, - ForwardIterator result, - UnaryFunction op, - Predicate pred); - - -/*! This version of \p transform_if conditionally applies a unary function - * to each element of an input sequence and stores the result in the corresponding - * position in an output sequence if the corresponding position in a stencil sequence - * satisfies a predicate. Otherwise, the corresponding position in the - * output sequence is not modified. 
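A self-contained version of the transform_if example above, which negates only the odd-valued elements. The predicate is defined at namespace scope (the local struct in the documentation snippet would not be a valid template argument under C++03), and no __device__ annotation is needed when the call runs under thrust::host.

#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>

struct is_odd
{
    bool operator()(int x) const { return x % 2 != 0; }
};

int main()
{
    int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8};

    // Negate an element only when the predicate holds for it.
    thrust::transform_if(thrust::host, data, data + 10, data,
                         thrust::negate<int>(), is_odd());

    // data is now {5, 0, 2, 3, 2, 4, 0, 1, 2, 8}
    return 0;
}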
- * - * Specifically, for each iterator i in the range [first, last) the - * predicate pred(*s) is evaluated, where s is the corresponding input - * iterator in the range [stencil, stencil + (last - first) ). If this predicate - * evaluates to \c true, the result of op(*i) is assigned to *o, - * where o is the corresponding output iterator in the range - * [result, result + (last - first) ). Otherwise, op(*i) is - * not evaluated and no assignment occurs. The input and output sequences may coincide, - * resulting in an in-place transformation. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param stencil The beginning of the stencil sequence. - * \param result The beginning of the output sequence. - * \param op The tranformation operation. - * \param pred The predicate operation. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator - * and \c InputIterator1's \c value_type is convertible to \c UnaryFunction's \c argument_type. - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c Predicate's \c argument_type. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam UnaryFunction is a model of Unary Function - * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * \tparam Predicate is a model of Predicate. - * - * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. - * \pre \p stencil may equal \p result, but the range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)) otherwise. - * - * The following code snippet demonstrates how to use \p transform_if using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * int stencil[10] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - * - * thrust::negate op; - * thrust::identity identity; - * - * thrust::transform_if(thrust::host, data, data + 10, stencil, data, op, identity); // in-place transformation - * - * // data is now {5, 0, -2, -3, -2, 4, 0, -1, -2, 8}; - * \endcode - * - * \see thrust::transform - */ -template - ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first, InputIterator1 last, - InputIterator2 stencil, - ForwardIterator result, - UnaryFunction op, - Predicate pred); - - -/*! This version of \p transform_if conditionally applies a unary function - * to each element of an input sequence and stores the result in the corresponding - * position in an output sequence if the corresponding position in a stencil sequence - * satisfies a predicate. Otherwise, the corresponding position in the - * output sequence is not modified. - * - * Specifically, for each iterator i in the range [first, last) the - * predicate pred(*s) is evaluated, where s is the corresponding input - * iterator in the range [stencil, stencil + (last - first) ). 
If this predicate - * evaluates to \c true, the result of op(*i) is assigned to *o, - * where o is the corresponding output iterator in the range - * [result, result + (last - first) ). Otherwise, op(*i) is - * not evaluated and no assignment occurs. The input and output sequences may coincide, - * resulting in an in-place transformation. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param stencil The beginning of the stencil sequence. - * \param result The beginning of the output sequence. - * \param op The tranformation operation. - * \param pred The predicate operation. - * \return The end of the output sequence. - * - * \tparam InputIterator1 is a model of Input Iterator - * and \c InputIterator1's \c value_type is convertible to \c UnaryFunction's \c argument_type. - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c Predicate's \c argument_type. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam UnaryFunction is a model of Unary Function - * and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * \tparam Predicate is a model of Predicate. - * - * \pre \p first may equal \p result, but the range [first, last) shall not overlap the range [result, result + (last - first)) otherwise. - * \pre \p stencil may equal \p result, but the range [stencil, stencil + (last - first)) shall not overlap the range [result, result + (last - first)) otherwise. - * - * The following code snippet demonstrates how to use \p transform_if: - * - * \code - * #include - * #include - * - * int data[10] = {-5, 0, 2, -3, 2, 4, 0, -1, 2, 8}; - * int stencil[10] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; - * - * thrust::negate op; - * thrust::identity identity; - * - * thrust::transform_if(data, data + 10, stencil, data, op, identity); // in-place transformation - * - * // data is now {5, 0, -2, -3, -2, 4, 0, -1, -2, 8}; - * \endcode - * - * \see thrust::transform - */ -template - ForwardIterator transform_if(InputIterator1 first, InputIterator1 last, - InputIterator2 stencil, - ForwardIterator result, - UnaryFunction op, - Predicate pred); - - -/*! This version of \p transform_if conditionally applies a binary function - * to each pair of elements from two input sequences and stores the result in the corresponding - * position in an output sequence if the corresponding position in a stencil sequence - * satifies a predicate. Otherwise, the corresponding position in the - * output sequence is not modified. - * - * Specifically, for each iterator i in the range [first1, last1) and - * j = first2 + (i - first1) in the range [first2, first2 + (last1 - first1) ), - * the predicate pred(*s) is evaluated, where s is the corresponding input - * iterator in the range [stencil, stencil + (last1 - first1) ). If this predicate - * evaluates to \c true, the result of binary_op(*i,*j) is assigned to *o, - * where o is the corresponding output iterator in the range - * [result, result + (last1 - first1) ). Otherwise, binary_op(*i,*j) is - * not evaluated and no assignment occurs. The input and output sequences may coincide, - * resulting in an in-place transformation. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first1 The beginning of the first input sequence. - * \param last1 The end of the first input sequence. 
- * \param first2 The beginning of the second input sequence. - * \param stencil The beginning of the stencil sequence. - * \param result The beginning of the output sequence. - * \param binary_op The transformation operation. - * \param pred The predicate operation. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator - * and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type. - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam BinaryFunction is a model of Binary Function - * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * \tparam Predicate is a model of Predicate. - * - * \pre \p first1 may equal \p result, but the range [first1, last1) shall not overlap the range [result, result + (last1 - first1)) otherwise. - * \pre \p first2 may equal \p result, but the range [first2, first2 + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. - * \pre \p stencil may equal \p result, but the range [stencil, stencil + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. - * - * The following code snippet demonstrates how to use \p transform_if using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * ... - * - * int input1[6] = {-5, 0, 2, 3, 2, 4}; - * int input2[6] = { 3, 6, -2, 1, 2, 3}; - * int stencil[8] = { 1, 0, 1, 0, 1, 0}; - * int output[6]; - * - * thrust::plus op; - * thrust::identity identity; - * - * thrust::transform_if(thrust::host, input1, input1 + 6, input2, stencil, output, op, identity); - * - * // output is now {-2, 0, 0, 3, 4, 4}; - * \endcode - * - * \see thrust::transform - */ -template - ForwardIterator transform_if(const thrust::detail::execution_policy_base &exec, - InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, - InputIterator3 stencil, - ForwardIterator result, - BinaryFunction binary_op, - Predicate pred); - - -/*! This version of \p transform_if conditionally applies a binary function - * to each pair of elements from two input sequences and stores the result in the corresponding - * position in an output sequence if the corresponding position in a stencil sequence - * satifies a predicate. Otherwise, the corresponding position in the - * output sequence is not modified. - * - * Specifically, for each iterator i in the range [first1, last1) and - * j = first2 + (i - first1) in the range [first2, first2 + (last1 - first1) ), - * the predicate pred(*s) is evaluated, where s is the corresponding input - * iterator in the range [stencil, stencil + (last1 - first1) ). If this predicate - * evaluates to \c true, the result of binary_op(*i,*j) is assigned to *o, - * where o is the corresponding output iterator in the range - * [result, result + (last1 - first1) ). Otherwise, binary_op(*i,*j) is - * not evaluated and no assignment occurs. The input and output sequences may coincide, - * resulting in an in-place transformation. - * - * \param first1 The beginning of the first input sequence. - * \param last1 The end of the first input sequence. - * \param first2 The beginning of the second input sequence. 
- * \param stencil The beginning of the stencil sequence. - * \param result The beginning of the output sequence. - * \param binary_op The transformation operation. - * \param pred The predicate operation. - * \return The end of the output sequence. - * - * \tparam InputIterator1 is a model of Input Iterator - * and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type. - * \tparam InputIterator2 is a model of Input Iterator - * and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type. - * \tparam ForwardIterator is a model of Forward Iterator. - * \tparam BinaryFunction is a model of Binary Function - * and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type. - * \tparam Predicate is a model of Predicate. - * - * \pre \p first1 may equal \p result, but the range [first1, last1) shall not overlap the range [result, result + (last1 - first1)) otherwise. - * \pre \p first2 may equal \p result, but the range [first2, first2 + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. - * \pre \p stencil may equal \p result, but the range [stencil, stencil + (last1 - first1)) shall not overlap the range [result, result + (last1 - first1)) otherwise. - * - * The following code snippet demonstrates how to use \p transform_if: - * - * \code - * #include - * #include - * - * int input1[6] = {-5, 0, 2, 3, 2, 4}; - * int input2[6] = { 3, 6, -2, 1, 2, 3}; - * int stencil[8] = { 1, 0, 1, 0, 1, 0}; - * int output[6]; - * - * thrust::plus op; - * thrust::identity identity; - * - * thrust::transform_if(input1, input1 + 6, input2, stencil, output, op, identity); - * - * // output is now {-2, 0, 0, 3, 4, 4}; - * \endcode - * - * \see thrust::transform - */ -template - ForwardIterator transform_if(InputIterator1 first1, InputIterator1 last1, - InputIterator2 first2, - InputIterator3 stencil, - ForwardIterator result, - BinaryFunction binary_op, - Predicate pred); - - -/*! \} // end transformations - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/transform_reduce.h b/compat/thrust/transform_reduce.h deleted file mode 100644 index 3ef5efd6aa..0000000000 --- a/compat/thrust/transform_reduce.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file transform_reduce.h - * \brief Fused transform / reduction - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup reductions - * \{ - * \addtogroup transformed_reductions Transformed Reductions - * \ingroup reductions - * \{ - */ - - -/*! \p transform_reduce fuses the \p transform and \p reduce operations. - * \p transform_reduce is equivalent to performing a transformation defined by - * \p unary_op into a temporary sequence and then performing \p reduce on the - * transformed sequence. 
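Before the transform_reduce header below, a short sketch of the binary transform_if overload removed just above: it combines two input ranges with a binary functor wherever the stencil is true. The zero-initialized output array is an assumption added here so the untouched positions have a well-defined value:

#include <thrust/transform.h>
#include <thrust/functional.h>

int main()
{
    int input1[6]  = {-5, 0,  2, 3, 2, 4};
    int input2[6]  = { 3, 6, -2, 1, 2, 3};
    int stencil[6] = { 1, 0,  1, 0, 1, 0};
    int output[6]  = { 0, 0,  0, 0, 0, 0};   // untouched slots keep their 0

    thrust::plus<int>     op;
    thrust::identity<int> identity;

    // output[i] = input1[i] + input2[i] only where stencil[i] is nonzero
    thrust::transform_if(input1, input1 + 6, input2, stencil, output, op, identity);
    // output is now {-2, 0, 0, 0, 4, 0}
    return 0;
}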
In most cases, fusing these two operations together is - * more efficient, since fewer memory reads and writes are required. - * - * \p transform_reduce performs a reduction on the transformation of the - * sequence [first, last) according to \p unary_op. Specifically, - * \p unary_op is applied to each element of the sequence and then the result - * is reduced to a single value with \p binary_op using the initial value - * \p init. Note that the transformation \p unary_op is not applied to - * the initial value \p init. The order of reduction is not specified, - * so \p binary_op must be both commutative and associative. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param unary_op The function to apply to each element of the input sequence. - * \param init The result is initialized to this value. - * \param binary_op The reduction operation. - * \return The result of the transformed reduction. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type. - * \tparam UnaryFunction is a model of Unary Function, - * and \p UnaryFunction's \c result_type is convertible to \c OutputType. - * \tparam OutputType is a model of Assignable, - * and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type. - * \tparam BinaryFunction is a model of Binary Function, - * and \p BinaryFunction's \c result_type is convertible to \p OutputType. - * - * The following code snippet demonstrates how to use \p transform_reduce - * to compute the maximum value of the absolute value of the elements - * of a range using the \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * - * template - * struct absolute_value : public unary_function - * { - * __host__ __device__ T operator()(const T &x) const - * { - * return x < T(0) ? -x : x; - * } - * }; - * - * ... - * - * int data[6] = {-1, 0, -2, -2, 1, -3}; - * int result = thrust::transform_reduce(thrust::host, - * data, data + 6, - * absolute_value(), - * 0, - * thrust::maximum()); - * // result == 3 - * \endcode - * - * \see \c transform - * \see \c reduce - */ -template - OutputType transform_reduce(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - UnaryFunction unary_op, - OutputType init, - BinaryFunction binary_op); - - -/*! \p transform_reduce fuses the \p transform and \p reduce operations. - * \p transform_reduce is equivalent to performing a transformation defined by - * \p unary_op into a temporary sequence and then performing \p reduce on the - * transformed sequence. In most cases, fusing these two operations together is - * more efficient, since fewer memory reads and writes are required. - * - * \p transform_reduce performs a reduction on the transformation of the - * sequence [first, last) according to \p unary_op. Specifically, - * \p unary_op is applied to each element of the sequence and then the result - * is reduced to a single value with \p binary_op using the initial value - * \p init. Note that the transformation \p unary_op is not applied to - * the initial value \p init. 
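A minimal sketch of the fused transform-plus-reduce call documented above, restoring the template arguments and include paths that the removed snippet shows without angle brackets (the absolute_value functor mirrors the docs' own example and is illustrative):

#include <thrust/transform_reduce.h>
#include <thrust/functional.h>

// illustrative unary functor: maps x to |x|
template <typename T>
struct absolute_value
{
    __host__ __device__
    T operator()(const T &x) const { return x < T(0) ? -x : x; }
};

int main()
{
    int data[6] = {-1, 0, -2, -2, 1, -3};

    // reduce the absolute values with max(); the init value 0 is not transformed
    int result = thrust::transform_reduce(data, data + 6,
                                          absolute_value<int>(),
                                          0,
                                          thrust::maximum<int>());
    // result == 3
    return 0;
}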
The order of reduction is not specified, - * so \p binary_op must be both commutative and associative. - * - * \param first The beginning of the sequence. - * \param last The end of the sequence. - * \param unary_op The function to apply to each element of the input sequence. - * \param init The result is initialized to this value. - * \param binary_op The reduction operation. - * \return The result of the transformed reduction. - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type. - * \tparam UnaryFunction is a model of Unary Function, - * and \p UnaryFunction's \c result_type is convertible to \c OutputType. - * \tparam OutputType is a model of Assignable, - * and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type. - * \tparam BinaryFunction is a model of Binary Function, - * and \p BinaryFunction's \c result_type is convertible to \p OutputType. - * - * The following code snippet demonstrates how to use \p transform_reduce - * to compute the maximum value of the absolute value of the elements - * of a range. - * - * \code - * #include - * #include - * - * template - * struct absolute_value : public unary_function - * { - * __host__ __device__ T operator()(const T &x) const - * { - * return x < T(0) ? -x : x; - * } - * }; - * - * ... - * - * int data[6] = {-1, 0, -2, -2, 1, -3}; - * int result = thrust::transform_reduce(data, data + 6, - * absolute_value(), - * 0, - * thrust::maximum()); - * // result == 3 - * \endcode - * - * \see \c transform - * \see \c reduce - */ -template - OutputType transform_reduce(InputIterator first, - InputIterator last, - UnaryFunction unary_op, - OutputType init, - BinaryFunction binary_op); - - -/*! \} // end transformed_reductions - * \} // end reductions - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/transform_scan.h b/compat/thrust/transform_scan.h deleted file mode 100644 index e9943e401f..0000000000 --- a/compat/thrust/transform_scan.h +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file transform_scan.h - * \brief Fused transform / prefix-sum - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup algorithms - */ - -/*! \addtogroup prefixsums Prefix Sums - * \ingroup algorithms - * \{ - */ - -/*! \addtogroup transformed_prefixsums Transformed Prefix Sums - * \ingroup prefixsums - * \{ - */ - - -/*! \p transform_inclusive_scan fuses the \p transform and \p inclusive_scan - * operations. \p transform_inclusive_scan is equivalent to performing a - * tranformation defined by \p unary_op into a temporary sequence and then - * performing an \p inclusive_scan on the tranformed sequence. In most - * cases, fusing these two operations together is more efficient, since - * fewer memory reads and writes are required. 
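The transform_scan.h header being removed here declares the two fused-scan entry points described below, transform_inclusive_scan and transform_exclusive_scan. A hedged sketch of both, assuming the upstream thrust/transform_scan.h path and the same data as the docs' examples:

#include <thrust/transform_scan.h>
#include <thrust/functional.h>

int main()
{
    thrust::negate<int> unary_op;
    thrust::plus<int>   binary_op;

    // inclusive scan of the negated values, in place
    int data[6] = {1, 0, 2, 2, 1, 3};
    thrust::transform_inclusive_scan(data, data + 6, data, unary_op, binary_op);
    // data is now {-1, -1, -3, -5, -6, -9}

    // the exclusive form also takes an init value (4 here), which becomes the
    // first output element before any transformed input is accumulated
    int more[6] = {1, 0, 2, 2, 1, 3};
    thrust::transform_exclusive_scan(more, more + 6, more, unary_op, 4, binary_op);
    // more is now {4, 3, 3, 1, -1, -2}
    return 0;
}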
In \p transform_inclusive_scan, - * unary_op(\*first) is assigned to \*result and the result - * of binary_op(unary_op(\*first), unary_op(\*(first + 1))) is - * assigned to \*(result + 1), and so on. The transform scan - * operation is permitted to be in-place. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param unary_op The function used to tranform the input sequence. - * \param binary_op The associatve operator used to 'sum' transformed values. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to \c unary_op's input type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam UnaryFunction is a model of Unary Function - * and accepts inputs of \c InputIterator's \c value_type. \c UnaryFunction's result_type - * is convertable to \c OutputIterator's \c value_type. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is - * convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p transform_inclusive_scan using the - * \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * - * int data[6] = {1, 0, 2, 2, 1, 3}; - * - * thrust::negate unary_op; - * thrust::plus binary_op; - * - * thrust::transform_inclusive_scan(thrust::host, data, data + 6, data, unary_op, binary_op); // in-place scan - * - * // data is now {-1, -1, -3, -5, -6, -9} - * \endcode - * - * \see \p transform - * \see \p inclusive_scan - * - */ -template - OutputIterator transform_inclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - AssociativeOperator binary_op); - - -/*! \p transform_inclusive_scan fuses the \p transform and \p inclusive_scan - * operations. \p transform_inclusive_scan is equivalent to performing a - * tranformation defined by \p unary_op into a temporary sequence and then - * performing an \p inclusive_scan on the tranformed sequence. In most - * cases, fusing these two operations together is more efficient, since - * fewer memory reads and writes are required. In \p transform_inclusive_scan, - * unary_op(\*first) is assigned to \*result and the result - * of binary_op(unary_op(\*first), unary_op(\*(first + 1))) is - * assigned to \*(result + 1), and so on. The transform scan - * operation is permitted to be in-place. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param unary_op The function used to tranform the input sequence. - * \param binary_op The associatve operator used to 'sum' transformed values. - * \return The end of the output sequence. - * - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to \c unary_op's input type. 
- * \tparam OutputIterator is a model of Output Iterator. - * \tparam UnaryFunction is a model of Unary Function - * and accepts inputs of \c InputIterator's \c value_type. \c UnaryFunction's result_type - * is convertable to \c OutputIterator's \c value_type. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is - * convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p transform_inclusive_scan - * - * \code - * #include - * - * int data[6] = {1, 0, 2, 2, 1, 3}; - * - * thrust::negate unary_op; - * thrust::plus binary_op; - * - * thrust::transform_inclusive_scan(data, data + 6, data, unary_op, binary_op); // in-place scan - * - * // data is now {-1, -1, -3, -5, -6, -9} - * \endcode - * - * \see \p transform - * \see \p inclusive_scan - * - */ -template - OutputIterator transform_inclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - AssociativeOperator binary_op); - - -/*! \p transform_exclusive_scan fuses the \p transform and \p exclusive_scan - * operations. \p transform_exclusive_scan is equivalent to performing a - * tranformation defined by \p unary_op into a temporary sequence and then - * performing an \p exclusive_scan on the tranformed sequence. In most - * cases, fusing these two operations together is more efficient, since - * fewer memory reads and writes are required. In - * \p transform_exclusive_scan, \p init is assigned to \*result - * and the result of binary_op(init, unary_op(\*first)) is assigned - * to \*(result + 1), and so on. The transform scan operation is - * permitted to be in-place. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param unary_op The function used to tranform the input sequence. - * \param init The initial value of the \p exclusive_scan - * \param binary_op The associatve operator used to 'sum' transformed values. - * \return The end of the output sequence. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to \c unary_op's input type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam UnaryFunction is a model of Unary Function - * and accepts inputs of \c InputIterator's \c value_type. \c UnaryFunction's result_type - * is convertable to \c OutputIterator's \c value_type. - * \tparam T is convertible to \c OutputIterator's \c value_type. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is - * convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p transform_exclusive_scan using the - * \p thrust::host execution policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * - * int data[6] = {1, 0, 2, 2, 1, 3}; - * - * thrust::negate unary_op; - * thrust::plus binary_op; - * - * thrust::transform_exclusive_scan(thrust::host, data, data + 6, data, unary_op, 4, binary_op); // in-place scan - * - * // data is now {4, 3, 3, 1, -1, -2} - * \endcode - * - * \see \p transform - * \see \p exclusive_scan - * - */ -template - OutputIterator transform_exclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - T init, - AssociativeOperator binary_op); - - -/*! \p transform_exclusive_scan fuses the \p transform and \p exclusive_scan - * operations. \p transform_exclusive_scan is equivalent to performing a - * tranformation defined by \p unary_op into a temporary sequence and then - * performing an \p exclusive_scan on the tranformed sequence. In most - * cases, fusing these two operations together is more efficient, since - * fewer memory reads and writes are required. In - * \p transform_exclusive_scan, \p init is assigned to \*result - * and the result of binary_op(init, unary_op(\*first)) is assigned - * to \*(result + 1), and so on. The transform scan operation is - * permitted to be in-place. - * - * \param first The beginning of the input sequence. - * \param last The end of the input sequence. - * \param result The beginning of the output sequence. - * \param unary_op The function used to tranform the input sequence. - * \param init The initial value of the \p exclusive_scan - * \param binary_op The associatve operator used to 'sum' transformed values. - * \return The end of the output sequence. - * - * \tparam InputIterator is a model of Input Iterator - * and \c InputIterator's \c value_type is convertible to \c unary_op's input type. - * \tparam OutputIterator is a model of Output Iterator. - * \tparam UnaryFunction is a model of Unary Function - * and accepts inputs of \c InputIterator's \c value_type. \c UnaryFunction's result_type - * is convertable to \c OutputIterator's \c value_type. - * \tparam T is convertible to \c OutputIterator's \c value_type. - * \tparam AssociativeOperator is a model of Binary Function - * and \c AssociativeOperator's \c result_type is - * convertible to \c OutputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p transform_exclusive_scan - * - * \code - * #include - * - * int data[6] = {1, 0, 2, 2, 1, 3}; - * - * thrust::negate unary_op; - * thrust::plus binary_op; - * - * thrust::transform_exclusive_scan(data, data + 6, data, unary_op, 4, binary_op); // in-place scan - * - * // data is now {4, 3, 3, 1, -1, -2} - * \endcode - * - * \see \p transform - * \see \p exclusive_scan - * - */ -template - OutputIterator transform_exclusive_scan(InputIterator first, - InputIterator last, - OutputIterator result, - UnaryFunction unary_op, - T init, - AssociativeOperator binary_op); - - -/*! \} // end transformed_prefixsums - */ - - -/*! 
\} // end prefixsums - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/tuple.h b/compat/thrust/tuple.h deleted file mode 100644 index 3961d982fa..0000000000 --- a/compat/thrust/tuple.h +++ /dev/null @@ -1,583 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file tuple.h - * \brief A type encapsulating a heterogeneous collection of elements - */ - -/* - * Copyright (C) 1999, 2000 Jaakko Järvi (jaakko.jarvi@cs.utu.fi) - * - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying NOTICE file for the complete license) - * - * For more information, see http://www.boost.org - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - -/*! \addtogroup utility - * \{ - */ - -/*! \addtogroup tuple - * \{ - */ - -/*! \cond - */ - -struct null_type; - -/*! \endcond - */ - -/*! This metafunction returns the type of a - * \p tuple's Nth element. - * - * \tparam N This parameter selects the element of interest. - * \tparam T A \c tuple type of interest. - * - * \see pair - * \see tuple - */ -template - struct tuple_element -{ - private: - typedef typename T::tail_type Next; - - public: - /*! The result of this metafunction is returned in \c type. - */ - typedef typename tuple_element::type type; -}; // end tuple_element - -/*! This metafunction returns the number of elements - * of a \p tuple type of interest. - * - * \tparam T A \c tuple type of interest. - * - * \see pair - * \see tuple - */ -template - struct tuple_size -{ - /*! The result of this metafunction is returned in \c value. - */ - static const int value = 1 + tuple_size::value; -}; // end tuple_size - -// get function for non-const cons-lists, returns a reference to the element - -/*! The \p get function returns a reference to a \p tuple element of - * interest. - * - * \param t A reference to a \p tuple of interest. - * \return A reference to \p t's Nth element. - * - * \tparam N The index of the element of interest. - * - * The following code snippet demonstrates how to use \p get to print - * the value of a \p tuple element. - * - * \code - * #include - * #include - * ... - * thrust::tuple t(13, "thrust"); - * - * std::cout << "The 1st value of t is " << thrust::get<1>(t) << std::endl; - * \endcode - * - * \see pair - * \see tuple - */ -template -__host__ __device__ -inline typename access_traits< - typename tuple_element >::type - >::non_const_type -get(detail::cons& t); - - -/*! The \p get function returns a \c const reference to a \p tuple element of - * interest. - * - * \param t A reference to a \p tuple of interest. - * \return A \c const reference to \p t's Nth element. - * - * \tparam N The index of the element of interest. - * - * The following code snippet demonstrates how to use \p get to print - * the value of a \p tuple element. - * - * \code - * #include - * #include - * ... 
- * thrust::tuple t(13, "thrust"); - * - * std::cout << "The 1st value of t is " << thrust::get<1>(t) << std::endl; - * \endcode - * - * \see pair - * \see tuple - */ -template -__host__ __device__ -inline typename access_traits< - typename tuple_element >::type - >::const_type -get(const detail::cons& t); - - - -/*! \p tuple is a class template that can be instantiated with up to ten arguments. - * Each template argument specifies the type of element in the \p tuple. - * Consequently, tuples are heterogeneous, fixed-size collections of values. An - * instantiation of \p tuple with two arguments is similar to an instantiation - * of \p pair with the same two arguments. Individual elements of a \p tuple may - * be accessed with the \p get function. - * - * \tparam TN The type of the N \c tuple element. Thrust's \p tuple - * type currently supports up to ten elements. - * - * The following code snippet demonstrates how to create a new \p tuple object - * and inspect and modify the value of its elements. - * - * \code - * #include - * #include - * ... - * // create a tuple containing an int, a float, and a string - * thrust::tuple t(13, 0.1f, "thrust"); - * - * // individual members are accessed with the free function get - * std::cout << "The first element's value is " << thrust::get<0>(t) << std::endl; - * - * // or the member function get - * std::cout << "The second element's value is " << t.get<1>() << std::endl; - * - * // we can also modify elements with the same function - * thrust::get<0>(t) += 10; - * \endcode - * - * \see pair - * \see get - * \see make_tuple - * \see tuple_element - * \see tuple_size - * \see tie - */ -template - class tuple : - public detail::map_tuple_to_cons::type -{ - /*! \cond - */ - - private: - typedef typename detail::map_tuple_to_cons::type inherited; - - /*! \endcond - */ - - public: - /*! \p tuple's no-argument constructor initializes each element. - */ - inline __host__ __device__ - tuple(void) {} - - /*! \p tuple's one-argument constructor copy constructs the first element from the given parameter - * and intializes all other elements. - * \param t0 The value to assign to this \p tuple's first element. - */ - inline __host__ __device__ - tuple(typename access_traits::parameter_type t0) - : inherited(t0, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) {} - - /*! \p tuple's one-argument constructor copy constructs the first two elements from the given parameters - * and intializes all other elements. - * \param t0 The value to assign to this \p tuple's first element. - * \param t1 The value to assign to this \p tuple's second element. - * \note \p tuple's constructor has ten variants of this form, the rest of which are ommitted here for brevity. - */ - inline __host__ __device__ - tuple(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1) - : inherited(t0, t1, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) {} - - /*! 
\cond - */ - - inline __host__ __device__ - tuple(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2) - : inherited(t0, t1, t2, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) {} - - inline __host__ __device__ - tuple(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3) - : inherited(t0, t1, t2, t3, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) {} - - inline __host__ __device__ - tuple(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4) - : inherited(t0, t1, t2, t3, t4, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) {} - - inline __host__ __device__ - tuple(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4, - typename access_traits::parameter_type t5) - : inherited(t0, t1, t2, t3, t4, t5, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) {} - - inline __host__ __device__ - tuple(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4, - typename access_traits::parameter_type t5, - typename access_traits::parameter_type t6) - : inherited(t0, t1, t2, t3, t4, t5, t6, - static_cast(null_type()), - static_cast(null_type()), - static_cast(null_type())) {} - - inline __host__ __device__ - tuple(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4, - typename access_traits::parameter_type t5, - typename access_traits::parameter_type t6, - typename access_traits::parameter_type t7) - : inherited(t0, t1, t2, t3, t4, t5, t6, t7, - static_cast(null_type()), - static_cast(null_type())) {} - - inline __host__ __device__ - tuple(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4, - typename access_traits::parameter_type t5, - typename access_traits::parameter_type t6, - typename access_traits::parameter_type t7, - typename access_traits::parameter_type t8) - : inherited(t0, t1, t2, t3, t4, t5, t6, t7, t8, - static_cast(null_type())) {} - - inline __host__ __device__ - tuple(typename access_traits::parameter_type t0, - typename access_traits::parameter_type t1, - typename access_traits::parameter_type t2, - typename access_traits::parameter_type t3, - typename access_traits::parameter_type t4, - typename access_traits::parameter_type t5, - typename 
access_traits::parameter_type t6, - typename access_traits::parameter_type t7, - typename access_traits::parameter_type t8, - typename access_traits::parameter_type t9) - : inherited(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) {} - - - template - inline __host__ __device__ - tuple(const detail::cons& p) : inherited(p) {} - - template - inline __host__ __device__ - tuple& operator=(const detail::cons& k) - { - inherited::operator=(k); - return *this; - } - - /*! \endcond - */ - - /*! This assignment operator allows assigning the first two elements of this \p tuple from a \p pair. - * \param k A \p pair to assign from. - */ - template - __host__ __device__ inline - tuple& operator=(const thrust::pair& k) { - //BOOST_STATIC_ASSERT(length::value == 2);// check_length = 2 - this->head = k.first; - this->tail.head = k.second; - return *this; - } - - /*! \p swap swaps the elements of two tuples. - * - * \param t The other tuple with which to swap. - */ - inline __host__ __device__ - void swap(tuple &t) - { - inherited::swap(t); - } -}; - -/*! \cond - */ - -template <> -class tuple : - public null_type -{ -public: - typedef null_type inherited; -}; - -/*! \endcond - */ - - -/*! This version of \p make_tuple creates a new \c tuple object from a - * single object. - * - * \param t0 The object to copy from. - * \return A \p tuple object with a single member which is a copy of \p t0. - */ -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0); - -/*! This version of \p make_tuple creates a new \c tuple object from two - * objects. - * - * \param t0 The first object to copy from. - * \param t1 The second object to copy from. - * \return A \p tuple object with two members which are copies of \p t0 - * and \p t1. - * - * \note \p make_tuple has ten variants, the rest of which are omitted here - * for brevity. - */ -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1); - -/*! This version of \p tie creates a new \c tuple whose single element is - * a reference which refers to this function's argument. - * - * \param t0 The object to reference. - * \return A \p tuple object with one member which is a reference to \p t0. - */ -template -__host__ __device__ inline -tuple tie(T0& t0); - -/*! This version of \p tie creates a new \c tuple of references object which - * refers to this function's arguments. - * - * \param t0 The first object to reference. - * \param t1 The second object to reference. - * \return A \p tuple object with two members which are references to \p t0 - * and \p t1. - * - * \note \p tie has ten variants, the rest of which are omitted here for - * brevity. - */ -template -__host__ __device__ inline -tuple tie(T0& t0, T1& t1); - -/*! \p swap swaps the contents of two tuples. - * - * \param x The first \p tuple to swap. - * \param y The second \p tuple to swap. - */ -template< - typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9, - typename U0, typename U1, typename U2, typename U3, typename U4, typename U5, typename U6, typename U7, typename U8, typename U9 -> -inline __host__ __device__ -void swap(tuple &x, - tuple &y); - - - -/*! 
\cond - */ - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2); - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3); - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4); - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5); - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6); - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7); - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8); - -template -__host__ __device__ inline - typename detail::make_tuple_mapper::type - make_tuple(const T0& t0, const T1& t1, const T2& t2, const T3& t3, const T4& t4, const T5& t5, const T6& t6, const T7& t7, const T8& t8, const T9& t9); - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2); - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3); - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4); - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5); - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6); - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7); - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8); - -template -__host__ __device__ inline -tuple tie(T0 &t0, T1 &t1, T2 &t2, T3 &t3, T4 &t4, T5 &t5, T6 &t6, T7 &t7, T8 &t8, T9 &t9); - - -__host__ __device__ inline -bool operator==(const null_type&, const null_type&); - -__host__ __device__ inline -bool operator>=(const null_type&, const null_type&); - -__host__ __device__ inline -bool operator<=(const null_type&, const null_type&); - -__host__ __device__ inline -bool operator!=(const null_type&, const null_type&); - -__host__ __device__ inline -bool operator<(const null_type&, const null_type&); - -__host__ __device__ inline -bool operator>(const null_type&, const null_type&); - -/*! \endcond - */ - -/*! \} // tuple - */ - -/*! \} // utility - */ - -} // end thrust - diff --git a/compat/thrust/uninitialized_copy.h b/compat/thrust/uninitialized_copy.h deleted file mode 100644 index 77b673c425..0000000000 --- a/compat/thrust/uninitialized_copy.h +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
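The tuple.h header removed above documents thrust::tuple together with get, make_tuple and tie; its examples lose their template arguments in this view. A small sketch restoring them, with the element types chosen here as an assumption consistent with the docs' values:

#include <thrust/tuple.h>
#include <iostream>

int main()
{
    // a fixed-size, heterogeneous collection of an int, a float and a C string
    thrust::tuple<int, float, const char *> t(13, 0.1f, "thrust");

    // elements are read and written through the free function get<N>
    std::cout << thrust::get<2>(t) << std::endl;   // prints "thrust"
    thrust::get<0>(t) += 10;                       // first element is now 23

    // make_tuple deduces element types; tie builds a tuple of references
    int a = 0; float b = 0.0f;
    thrust::tie(a, b) = thrust::make_tuple(thrust::get<0>(t), thrust::get<1>(t));
    // a == 23, b == 0.1f
    return 0;
}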
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file uninitialized_copy.h - * \brief Copy construction into a range of uninitialized elements from a source range - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup copying - * \{ - */ - - -/*! In \c thrust, the function \c thrust::device_new allocates memory for - * an object and then creates an object at that location by calling a constructor. - * Occasionally, however, it is useful to separate those two operations. - * If each iterator in the range [result, result + (last - first)) points - * to uninitialized memory, then \p uninitialized_copy creates a copy of - * [first, last) in that range. That is, for each iterator \c i in - * the input, \p uninitialized_copy creates a copy of \c *i in the location pointed - * to by the corresponding iterator in the output range by \p ForwardIterator's - * \c value_type's copy constructor with *i as its argument. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The first element of the input range to copy from. - * \param last The last element of the input range to copy from. - * \param result The first element of the output range to copy to. - * \return An iterator pointing to the last element of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator. - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes - * a single argument whose type is \p InputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p uninitialized_copy to initialize - * a range of uninitialized memory using the \p thrust::device execution policy for - * parallelization: - * - * \code - * #include - * #include - * #include - * #include - * - * struct Int - * { - * __host__ __device__ - * Int(int x) : val(x) {} - * int val; - * }; - * ... - * const int N = 137; - * - * Int val(46); - * thrust::device_vector input(N, val); - * thrust::device_ptr array = thrust::device_malloc(N); - * thrust::uninitialized_copy(thrust::device, input.begin(), input.end(), array); - * - * // Int x = array[i]; - * // x.val == 46 for all 0 <= i < N - * \endcode - * - * \see http://www.sgi.com/tech/stl/uninitialized_copy.html - * \see \c copy - * \see \c uninitialized_fill - * \see \c device_new - * \see \c device_malloc - */ -template - ForwardIterator uninitialized_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - ForwardIterator result); - - -/*! In \c thrust, the function \c thrust::device_new allocates memory for - * an object and then creates an object at that location by calling a constructor. - * Occasionally, however, it is useful to separate those two operations. 
- * If each iterator in the range [result, result + (last - first)) points - * to uninitialized memory, then \p uninitialized_copy creates a copy of - * [first, last) in that range. That is, for each iterator \c i in - * the input, \p uninitialized_copy creates a copy of \c *i in the location pointed - * to by the corresponding iterator in the output range by \p ForwardIterator's - * \c value_type's copy constructor with *i as its argument. - * - * \param first The first element of the input range to copy from. - * \param last The last element of the input range to copy from. - * \param result The first element of the output range to copy to. - * \return An iterator pointing to the last element of the output range. - * - * \tparam InputIterator is a model of Input Iterator. - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes - * a single argument whose type is \p InputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the range [first, last) and the range [result, result + (last - first)) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p uninitialized_copy to initialize - * a range of uninitialized memory. - * - * \code - * #include - * #include - * #include - * - * struct Int - * { - * __host__ __device__ - * Int(int x) : val(x) {} - * int val; - * }; - * ... - * const int N = 137; - * - * Int val(46); - * thrust::device_vector input(N, val); - * thrust::device_ptr array = thrust::device_malloc(N); - * thrust::uninitialized_copy(input.begin(), input.end(), array); - * - * // Int x = array[i]; - * // x.val == 46 for all 0 <= i < N - * \endcode - * - * \see http://www.sgi.com/tech/stl/uninitialized_copy.html - * \see \c copy - * \see \c uninitialized_fill - * \see \c device_new - * \see \c device_malloc - */ -template - ForwardIterator uninitialized_copy(InputIterator first, - InputIterator last, - ForwardIterator result); - - -/*! In \c thrust, the function \c thrust::device_new allocates memory for - * an object and then creates an object at that location by calling a constructor. - * Occasionally, however, it is useful to separate those two operations. - * If each iterator in the range [result, result + n) points - * to uninitialized memory, then \p uninitialized_copy_n creates a copy of - * [first, first + n) in that range. That is, for each iterator \c i in - * the input, \p uninitialized_copy_n creates a copy of \c *i in the location pointed - * to by the corresponding iterator in the output range by \p InputIterator's - * \c value_type's copy constructor with *i as its argument. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The first element of the input range to copy from. - * \param n The number of elements to copy. - * \param result The first element of the output range to copy to. - * \return An iterator pointing to the last element of the output range. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator. - * \tparam Size is an integral type. - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes - * a single argument whose type is \p InputIterator's \c value_type. 
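A hedged sketch of the uninitialized_copy call documented above, with the stripped template arguments and include paths filled back in from upstream Thrust (the Int struct mirrors the docs' example; device_free at the end is an addition here so the raw allocation is released):

#include <thrust/uninitialized_copy.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/device_vector.h>

// element type with no default constructor, as in the removed docs
struct Int
{
    __host__ __device__
    Int(int x) : val(x) {}
    int val;
};

int main()
{
    const int N = 137;

    // a source range of already-constructed objects
    thrust::device_vector<Int> input(N, Int(46));

    // raw, uninitialized device storage
    thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);

    // copy-construct N objects into the uninitialized storage;
    // uninitialized_copy_n(input.begin(), N, array) is the counted form
    thrust::uninitialized_copy(input.begin(), input.end(), array);

    thrust::device_free(array);
    return 0;
}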
- * - * \pre \p first may equal \p result, but the range [first, first + n) and the range [result, result + n) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p uninitialized_copy to initialize - * a range of uninitialized memory using the \p thrust::device execution policy for - * parallelization: - * - * \code - * #include - * #include - * #include - * #include - * - * struct Int - * { - * __host__ __device__ - * Int(int x) : val(x) {} - * int val; - * }; - * ... - * const int N = 137; - * - * Int val(46); - * thrust::device_vector input(N, val); - * thrust::device_ptr array = thrust::device_malloc(N); - * thrust::uninitialized_copy_n(thrust::device, input.begin(), N, array); - * - * // Int x = array[i]; - * // x.val == 46 for all 0 <= i < N - * \endcode - * - * \see http://www.sgi.com/tech/stl/uninitialized_copy.html - * \see \c uninitialized_copy - * \see \c copy - * \see \c uninitialized_fill - * \see \c device_new - * \see \c device_malloc - */ -template - ForwardIterator uninitialized_copy_n(const thrust::detail::execution_policy_base &exec, - InputIterator first, - Size n, - ForwardIterator result); - - -/*! In \c thrust, the function \c thrust::device_new allocates memory for - * an object and then creates an object at that location by calling a constructor. - * Occasionally, however, it is useful to separate those two operations. - * If each iterator in the range [result, result + n) points - * to uninitialized memory, then \p uninitialized_copy_n creates a copy of - * [first, first + n) in that range. That is, for each iterator \c i in - * the input, \p uninitialized_copy_n creates a copy of \c *i in the location pointed - * to by the corresponding iterator in the output range by \p InputIterator's - * \c value_type's copy constructor with *i as its argument. - * - * \param first The first element of the input range to copy from. - * \param n The number of elements to copy. - * \param result The first element of the output range to copy to. - * \return An iterator pointing to the last element of the output range. - * - * \tparam InputIterator is a model of Input Iterator. - * \tparam Size is an integral type. - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes - * a single argument whose type is \p InputIterator's \c value_type. - * - * \pre \p first may equal \p result, but the range [first, first + n) and the range [result, result + n) shall not overlap otherwise. - * - * The following code snippet demonstrates how to use \p uninitialized_copy to initialize - * a range of uninitialized memory. - * - * \code - * #include - * #include - * #include - * - * struct Int - * { - * __host__ __device__ - * Int(int x) : val(x) {} - * int val; - * }; - * ... - * const int N = 137; - * - * Int val(46); - * thrust::device_vector input(N, val); - * thrust::device_ptr array = thrust::device_malloc(N); - * thrust::uninitialized_copy_n(input.begin(), N, array); - * - * // Int x = array[i]; - * // x.val == 46 for all 0 <= i < N - * \endcode - * - * \see http://www.sgi.com/tech/stl/uninitialized_copy.html - * \see \c uninitialized_copy - * \see \c copy - * \see \c uninitialized_fill - * \see \c device_new - * \see \c device_malloc - */ -template - ForwardIterator uninitialized_copy_n(InputIterator first, - Size n, - ForwardIterator result); - - -/*! 
\} // copying - */ - - -} // end thrust - -#include - diff --git a/compat/thrust/uninitialized_fill.h b/compat/thrust/uninitialized_fill.h deleted file mode 100644 index c726241ddd..0000000000 --- a/compat/thrust/uninitialized_fill.h +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file uninitialized_fill.h - * \brief Copy construction into a range of uninitialized elements from a source value - */ - -#pragma once - -#include -#include - -namespace thrust -{ - - -/*! \addtogroup filling - * \ingroup transformations - * \{ - */ - - -/*! In \c thrust, the function \c thrust::device_new allocates memory for - * an object and then creates an object at that location by calling a - * constructor. Occasionally, however, it is useful to separate those two - * operations. If each iterator in the range [first, last) points - * to unitialized memory, then \p unitialized_fill creates copies of \c x - * in that range. That is, for each iterator \c i in the range [first, last), - * \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by - * calling \p ForwardIterator's \c value_type's copy constructor. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The first element of the range of interest. - * \param last The last element of the range of interest. - * \param x The value to use as the exemplar of the copy constructor. - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that - * takes a single argument of type \p T. - * - * The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of - * uninitialized memory using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * - * struct Int - * { - * __host__ __device__ - * Int(int x) : val(x) {} - * int val; - * }; - * ... - * const int N = 137; - * - * Int val(46); - * thrust::device_ptr array = thrust::device_malloc(N); - * thrust::uninitialized_fill(thrust::device, array, array + N, val); - * - * // Int x = array[i]; - * // x.val == 46 for all 0 <= i < N - * \endcode - * - * \see http://www.sgi.com/tech/stl/uninitialized_fill.html - * \see \c uninitialized_fill_n - * \see \c fill - * \see \c uninitialized_copy - * \see \c device_new - * \see \c device_malloc - */ -template - void uninitialized_fill(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - const T &x); - - -/*! In \c thrust, the function \c thrust::device_new allocates memory for - * an object and then creates an object at that location by calling a - * constructor. Occasionally, however, it is useful to separate those two - * operations. 
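The uninitialized_fill.h header removed here copy-constructs many copies of one exemplar value into raw storage. A minimal sketch of that use, assuming the same upstream header paths and illustrative Int type as above:

#include <thrust/uninitialized_fill.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>

struct Int
{
    __host__ __device__
    Int(int x) : val(x) {}
    int val;
};

int main()
{
    const int N = 137;
    Int val(46);

    // raw device memory with no constructed objects in it yet
    thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);

    // copy-construct N copies of val into that storage; the counted form
    // uninitialized_fill_n(array, N, val) is equivalent here
    thrust::uninitialized_fill(array, array + N, val);
    // array[i].val == 46 for all 0 <= i < N

    thrust::device_free(array);
    return 0;
}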
If each iterator in the range [first, last) points - * to unitialized memory, then \p unitialized_fill creates copies of \c x - * in that range. That is, for each iterator \c i in the range [first, last), - * \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by - * calling \p ForwardIterator's \c value_type's copy constructor. - * - * \param first The first element of the range of interest. - * \param last The last element of the range of interest. - * \param x The value to use as the exemplar of the copy constructor. - * - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that - * takes a single argument of type \p T. - * - * The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of - * uninitialized memory. - * - * \code - * #include - * #include - * - * struct Int - * { - * __host__ __device__ - * Int(int x) : val(x) {} - * int val; - * }; - * ... - * const int N = 137; - * - * Int val(46); - * thrust::device_ptr array = thrust::device_malloc(N); - * thrust::uninitialized_fill(array, array + N, val); - * - * // Int x = array[i]; - * // x.val == 46 for all 0 <= i < N - * \endcode - * - * \see http://www.sgi.com/tech/stl/uninitialized_fill.html - * \see \c uninitialized_fill_n - * \see \c fill - * \see \c uninitialized_copy - * \see \c device_new - * \see \c device_malloc - */ -template - void uninitialized_fill(ForwardIterator first, - ForwardIterator last, - const T &x); - - -/*! In \c thrust, the function \c thrust::device_new allocates memory for - * an object and then creates an object at that location by calling a - * constructor. Occasionally, however, it is useful to separate those two - * operations. If each iterator in the range [first, first+n) points - * to unitialized memory, then \p unitialized_fill creates copies of \c x - * in that range. That is, for each iterator \c i in the range [first, first+n), - * \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by - * calling \p ForwardIterator's \c value_type's copy constructor. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The first element of the range of interest. - * \param n The size of the range of interest. - * \param x The value to use as the exemplar of the copy constructor. - * \return first+n - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that - * takes a single argument of type \p T. - * - * The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of - * uninitialized memory using the \p thrust::device execution policy for parallelization: - * - * \code - * #include - * #include - * #include - * - * struct Int - * { - * __host__ __device__ - * Int(int x) : val(x) {} - * int val; - * }; - * ... 
- * const int N = 137; - * - * Int val(46); - * thrust::device_ptr array = thrust::device_malloc(N); - * thrust::uninitialized_fill_n(thrust::device, array, N, val); - * - * // Int x = array[i]; - * // x.val == 46 for all 0 <= i < N - * \endcode - * - * \see http://www.sgi.com/tech/stl/uninitialized_fill.html - * \see \c uninitialized_fill - * \see \c fill - * \see \c uninitialized_copy_n - * \see \c device_new - * \see \c device_malloc - */ -template - ForwardIterator uninitialized_fill_n(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - Size n, - const T &x); - - -/*! In \c thrust, the function \c thrust::device_new allocates memory for - * an object and then creates an object at that location by calling a - * constructor. Occasionally, however, it is useful to separate those two - * operations. If each iterator in the range [first, first+n) points - * to unitialized memory, then \p unitialized_fill creates copies of \c x - * in that range. That is, for each iterator \c i in the range [first, first+n), - * \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by - * calling \p ForwardIterator's \c value_type's copy constructor. - * - * \param first The first element of the range of interest. - * \param n The size of the range of interest. - * \param x The value to use as the exemplar of the copy constructor. - * \return first+n - * - * \tparam ForwardIterator is a model of Forward Iterator, - * \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that - * takes a single argument of type \p T. - * - * The following code snippet demonstrates how to use \p uninitialized_fill to initialize a range of - * uninitialized memory. - * - * \code - * #include - * #include - * - * struct Int - * { - * __host__ __device__ - * Int(int x) : val(x) {} - * int val; - * }; - * ... - * const int N = 137; - * - * Int val(46); - * thrust::device_ptr array = thrust::device_malloc(N); - * thrust::uninitialized_fill_n(array, N, val); - * - * // Int x = array[i]; - * // x.val == 46 for all 0 <= i < N - * \endcode - * - * \see http://www.sgi.com/tech/stl/uninitialized_fill.html - * \see \c uninitialized_fill - * \see \c fill - * \see \c uninitialized_copy_n - * \see \c device_new - * \see \c device_malloc - */ -template - ForwardIterator uninitialized_fill_n(ForwardIterator first, - Size n, - const T &x); - -/*! \} // end filling - * \} // transformations - */ - -} // end thrust - -#include - diff --git a/compat/thrust/unique.h b/compat/thrust/unique.h deleted file mode 100644 index 98150f36c9..0000000000 --- a/compat/thrust/unique.h +++ /dev/null @@ -1,960 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/*! \file unique.h - * \brief Move unique elements to the front of a range - */ - -#pragma once - -#include -#include -#include - -namespace thrust -{ - - -/*! \addtogroup stream_compaction - * \{ - */ - - -/*! 
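The uninitialized_fill / uninitialized_fill_n overloads removed by this hunk can be exercised the same way; below is a minimal sketch assuming a CUDA/Thrust toolchain, with the Thrust header names spelled out as assumptions.

// Sketch: copy-construct N copies of one exemplar into uninitialized device storage.
#include <thrust/uninitialized_fill.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>

struct Int
{
    __host__ __device__
    Int(int x) : val(x) {}
    int val;
};

int main()
{
    const int N = 137;

    // Raw storage; no Int objects exist here yet.
    thrust::device_ptr<Int> array = thrust::device_malloc<Int>(N);

    // uninitialized_fill_n copy-constructs Int(46) into each slot and returns array + N.
    thrust::uninitialized_fill_n(thrust::device, array, N, Int(46));

    // array[i].val == 46 for all 0 <= i < N
    thrust::device_free(array);
    return 0;
}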
For each group of consecutive elements in the range [first, last) - * with the same value, \p unique removes all but the first element of - * the group. The return value is an iterator \c new_last such that - * no two consecutive elements in the range [first, new_last) are - * equal. The iterators in the range [new_last, last) are all still - * dereferenceable, but the elements that they point to are unspecified. - * \p unique is stable, meaning that the relative order of elements that are - * not removed is unchanged. - * - * This version of \p unique uses \c operator== to test for equality. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input range. - * \param last The end of the input range. - * \return The end of the unique range [first, new_last). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and \p ForwardIterator's \c value_type is a model of Equality Comparable. - * - * The following code snippet demonstrates how to use \p unique to - * compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution policy - * for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; - * int *new_end = thrust::unique(thrust::host, A, A + N); - * // The first four values of A are now {1, 3, 2, 1} - * // Values beyond new_end are unspecified. - * \endcode - * - * \see http://www.sgi.com/tech/stl/unique.html - * \see unique_copy - */ -template -ForwardIterator unique(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last); - - -/*! For each group of consecutive elements in the range [first, last) - * with the same value, \p unique removes all but the first element of - * the group. The return value is an iterator \c new_last such that - * no two consecutive elements in the range [first, new_last) are - * equal. The iterators in the range [new_last, last) are all still - * dereferenceable, but the elements that they point to are unspecified. - * \p unique is stable, meaning that the relative order of elements that are - * not removed is unchanged. - * - * This version of \p unique uses \c operator== to test for equality. - * - * \param first The beginning of the input range. - * \param last The end of the input range. - * \return The end of the unique range [first, new_last). - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and \p ForwardIterator's \c value_type is a model of Equality Comparable. - * - * The following code snippet demonstrates how to use \p unique to - * compact a sequence of numbers to remove consecutive duplicates. - * - * \code - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; - * int *new_end = thrust::unique(A, A + N); - * // The first four values of A are now {1, 3, 2, 1} - * // Values beyond new_end are unspecified. - * \endcode - * - * \see http://www.sgi.com/tech/stl/unique.html - * \see unique_copy - */ -template -ForwardIterator unique(ForwardIterator first, - ForwardIterator last); - - -/*! For each group of consecutive elements in the range [first, last) - * with the same value, \p unique removes all but the first element of - * the group. 
The return value is an iterator \c new_last such that - * no two consecutive elements in the range [first, new_last) are - * equal. The iterators in the range [new_last, last) are all still - * dereferenceable, but the elements that they point to are unspecified. - * \p unique is stable, meaning that the relative order of elements that are - * not removed is unchanged. - * - * This version of \p unique uses the function object \p binary_pred to test - * for equality. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input range. - * \param last The end of the input range. - * \param binary_pred The binary predicate used to determine equality. - * \return The end of the unique range [first, new_last) - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * The following code snippet demonstrates how to use \p unique to - * compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution policy - * for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; - * int *new_end = thrust::unique(thrust::host, A, A + N, thrust::equal_to()); - * // The first four values of A are now {1, 3, 2, 1} - * // Values beyond new_end are unspecified. - * \endcode - * - * \see http://www.sgi.com/tech/stl/unique.html - * \see unique_copy - */ -template -ForwardIterator unique(const thrust::detail::execution_policy_base &exec, - ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred); - - -/*! For each group of consecutive elements in the range [first, last) - * with the same value, \p unique removes all but the first element of - * the group. The return value is an iterator \c new_last such that - * no two consecutive elements in the range [first, new_last) are - * equal. The iterators in the range [new_last, last) are all still - * dereferenceable, but the elements that they point to are unspecified. - * \p unique is stable, meaning that the relative order of elements that are - * not removed is unchanged. - * - * This version of \p unique uses the function object \p binary_pred to test - * for equality. - * - * \param first The beginning of the input range. - * \param last The end of the input range. - * \param binary_pred The binary predicate used to determine equality. - * \return The end of the unique range [first, new_last) - * - * \tparam ForwardIterator is a model of Forward Iterator, - * and \p ForwardIterator is mutable, - * and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * The following code snippet demonstrates how to use \p unique to - * compact a sequence of numbers to remove consecutive duplicates. - * - * \code - * #include - * ... 
- * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; - * int *new_end = thrust::unique(A, A + N, thrust::equal_to()); - * // The first four values of A are now {1, 3, 2, 1} - * // Values beyond new_end are unspecified. - * \endcode - * - * \see http://www.sgi.com/tech/stl/unique.html - * \see unique_copy - */ -template -ForwardIterator unique(ForwardIterator first, - ForwardIterator last, - BinaryPredicate binary_pred); - - -/*! \p unique_copy copies elements from the range [first, last) - * to a range beginning with \p result, except that in a consecutive group - * of duplicate elements only the first one is copied. The return value - * is the end of the range to which the elements are copied. - * - * The reason there are two different versions of unique_copy is that there - * are two different definitions of what it means for a consecutive group of - * elements to be duplicates. In the first version, the test is simple - * equality: the elements in a range [f, l) are duplicates if, - * for every iterator \p i in the range, either i == f or else - * *i == *(i-1). In the second, the test is an arbitrary - * \p BinaryPredicate \p binary_pred: the elements in [f, l) are - * duplicates if, for every iterator \p i in the range, either i == f - * or else binary_pred(*i, *(i-1)) is \p true. - * - * This version of \p unique_copy uses \c operator== to test for equality. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input range. - * \param last The end of the input range. - * \param result The beginning of the output range. - * \return The end of the unique range [result, result_end). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is a model of Equality Comparable. - * \tparam OutputIterator is a model of Output Iterator and - * and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. - * - * \pre The range [first,last) and the range [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p unique_copy to - * compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution - * policy for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; - * int B[N]; - * int *result_end = thrust::unique_copy(thrust::host, A, A + N, B); - * // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4 - * // Values beyond result_end are unspecified - * \endcode - * - * \see unique - * \see http://www.sgi.com/tech/stl/unique_copy.html - */ -template -OutputIterator unique_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result); - - -/*! \p unique_copy copies elements from the range [first, last) - * to a range beginning with \p result, except that in a consecutive group - * of duplicate elements only the first one is copied. The return value - * is the end of the range to which the elements are copied. - * - * The reason there are two different versions of unique_copy is that there - * are two different definitions of what it means for a consecutive group of - * elements to be duplicates. 
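thrust::unique only shortens the range logically, so container code usually pairs it with erase to actually drop the tail. A minimal sketch of that idiom, assuming a CUDA/Thrust toolchain; device_vector and its erase member are standard Thrust and are used here as assumptions.

// Sketch: collapse consecutive duplicates in place, then shrink the container.
#include <thrust/unique.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

int main()
{
    int data[7] = {1, 3, 3, 3, 2, 2, 1};
    thrust::device_vector<int> d(data, data + 7);

    // Keep the first element of each run; new_end points past {1, 3, 2, 1}.
    thrust::device_vector<int>::iterator new_end =
        thrust::unique(thrust::device, d.begin(), d.end());

    // Drop the unspecified tail so only the unique prefix remains.
    d.erase(new_end, d.end());   // d is now {1, 3, 2, 1}
    return 0;
}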
In the first version, the test is simple - * equality: the elements in a range [f, l) are duplicates if, - * for every iterator \p i in the range, either i == f or else - * *i == *(i-1). In the second, the test is an arbitrary - * \p BinaryPredicate \p binary_pred: the elements in [f, l) are - * duplicates if, for every iterator \p i in the range, either i == f - * or else binary_pred(*i, *(i-1)) is \p true. - * - * This version of \p unique_copy uses \c operator== to test for equality. - * - * \param first The beginning of the input range. - * \param last The end of the input range. - * \param result The beginning of the output range. - * \return The end of the unique range [result, result_end). - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is a model of Equality Comparable. - * \tparam OutputIterator is a model of Output Iterator and - * and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. - * - * \pre The range [first,last) and the range [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p unique_copy to - * compact a sequence of numbers to remove consecutive duplicates. - * - * \code - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; - * int B[N]; - * int *result_end = thrust::unique_copy(A, A + N, B); - * // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4 - * // Values beyond result_end are unspecified - * \endcode - * - * \see unique - * \see http://www.sgi.com/tech/stl/unique_copy.html - */ -template -OutputIterator unique_copy(InputIterator first, - InputIterator last, - OutputIterator result); - - -/*! \p unique_copy copies elements from the range [first, last) - * to a range beginning with \p result, except that in a consecutive group - * of duplicate elements only the first one is copied. The return value - * is the end of the range to which the elements are copied. - * - * This version of \p unique_copy uses the function object \c binary_pred - * to test for equality. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param first The beginning of the input range. - * \param last The end of the input range. - * \param result The beginning of the output range. - * \param binary_pred The binary predicate used to determine equality. - * \return The end of the unique range [result, result_end). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is a model of Equality Comparable. - * \tparam OutputIterator is a model of Output Iterator and - * and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * \pre The range [first,last) and the range [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p unique_copy to - * compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution - * policy for parallelization: - * - * \code - * #include - * #include - * ... 
- * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; - * int B[N]; - * int *result_end = thrust::unique_copy(thrust::host, A, A + N, B, thrust::equal_to()); - * // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4 - * // Values beyond result_end are unspecified. - * \endcode - * - * \see unique - * \see http://www.sgi.com/tech/stl/unique_copy.html - */ -template -OutputIterator unique_copy(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - BinaryPredicate binary_pred); - - -/*! \p unique_copy copies elements from the range [first, last) - * to a range beginning with \p result, except that in a consecutive group - * of duplicate elements only the first one is copied. The return value - * is the end of the range to which the elements are copied. - * - * This version of \p unique_copy uses the function object \c binary_pred - * to test for equality. - * - * \param first The beginning of the input range. - * \param last The end of the input range. - * \param result The beginning of the output range. - * \param binary_pred The binary predicate used to determine equality. - * \return The end of the unique range [result, result_end). - * - * \tparam InputIterator is a model of Input Iterator, - * and \p InputIterator's \c value_type is a model of Equality Comparable. - * \tparam OutputIterator is a model of Output Iterator and - * and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * \pre The range [first,last) and the range [result, result + (last - first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p unique_copy to - * compact a sequence of numbers to remove consecutive duplicates. - * - * \code - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; - * int B[N]; - * int *result_end = thrust::unique_copy(A, A + N, B, thrust::equal_to()); - * // The first four values of B are now {1, 3, 2, 1} and (result_end - B) is 4 - * // Values beyond result_end are unspecified. - * \endcode - * - * \see unique - * \see http://www.sgi.com/tech/stl/unique_copy.html - */ -template -OutputIterator unique_copy(InputIterator first, - InputIterator last, - OutputIterator result, - BinaryPredicate binary_pred); - - -/*! \p unique_by_key is a generalization of \p unique to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p unique_by_key removes all but the first element of - * the group. Similarly, the corresponding values in the range - * [values_first, values_first + (keys_last - keys_first)) - * are also removed. - * - * The return value is a \p pair of iterators (new_keys_last,new_values_last) - * such that no two consecutive elements in the range [keys_first, new_keys_last) - * are equal. - * - * This version of \p unique_by_key uses \c operator== to test for equality and - * \c project1st to reduce values with equal keys. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first The beginning of the key range. - * \param keys_last The end of the key range. - * \param values_first The beginning of the value range. - * \return A pair of iterators at end of the ranges [key_first, keys_new_last) and [values_first, values_new_last). 
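The predicate overload of unique_copy accepts any binary functor. The sketch below uses a hypothetical same_last_digit functor (not part of the library, introduced only for illustration) to show runs being collapsed under a non-default notion of equality, assuming a CUDA/Thrust toolchain.

// Sketch: unique_copy with a custom equality test ("equal modulo 10").
#include <thrust/unique.h>
#include <thrust/execution_policy.h>

struct same_last_digit
{
    __host__ __device__
    bool operator()(int a, int b) const { return (a % 10) == (b % 10); }
};

int main()
{
    const int N = 7;
    int A[N] = {1, 11, 21, 3, 3, 2, 12};
    int B[N];

    // {1, 11, 21} collapse to 1, {3, 3} to 3, {2, 12} to 2.
    // B starts with {1, 3, 2}; result_end - B == 3; A is untouched.
    int *result_end = thrust::unique_copy(thrust::host, A, A + N, B, same_last_digit());
    (void)result_end;
    return 0;
}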
- * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator1 is a model of Forward Iterator, - * and \p ForwardIterator1 is mutable, - * and \p ForwardIterator's \c value_type is a model of Equality Comparable. - * \tparam ForwardIterator2 is a model of Forward Iterator, - * and \p ForwardIterator2 is mutable. - * - * \pre The range [keys_first, keys_last) and the range [values_first, values_first + (keys_last - keys_first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p unique_by_key to - * compact a sequence of key/value pairs to remove consecutive duplicates using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values - * - * thrust::pair new_end; - * new_end = thrust::unique_by_key(thrust::host, A, A + N, B); - * - * // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4. - * // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4. - * \endcode - * - * \see unique - * \see unique_by_key_copy - * \see reduce_by_key - */ -template - thrust::pair - unique_by_key(const thrust::detail::execution_policy_base &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first); - - -/*! \p unique_by_key is a generalization of \p unique to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p unique_by_key removes all but the first element of - * the group. Similarly, the corresponding values in the range - * [values_first, values_first + (keys_last - keys_first)) - * are also removed. - * - * The return value is a \p pair of iterators (new_keys_last,new_values_last) - * such that no two consecutive elements in the range [keys_first, new_keys_last) - * are equal. - * - * This version of \p unique_by_key uses \c operator== to test for equality and - * \c project1st to reduce values with equal keys. - * - * \param keys_first The beginning of the key range. - * \param keys_last The end of the key range. - * \param values_first The beginning of the value range. - * \return A pair of iterators at end of the ranges [key_first, keys_new_last) and [values_first, values_new_last). - * - * \tparam ForwardIterator1 is a model of Forward Iterator, - * and \p ForwardIterator1 is mutable, - * and \p ForwardIterator's \c value_type is a model of Equality Comparable. - * \tparam ForwardIterator2 is a model of Forward Iterator, - * and \p ForwardIterator2 is mutable. - * - * \pre The range [keys_first, keys_last) and the range [values_first, values_first + (keys_last - keys_first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p unique_by_key to - * compact a sequence of key/value pairs to remove consecutive duplicates. - * - * \code - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values - * - * thrust::pair new_end; - * new_end = thrust::unique_by_key(A, A + N, B); - * - * // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4. - * // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4. 
- * \endcode - * - * \see unique - * \see unique_by_key_copy - * \see reduce_by_key - */ -template - thrust::pair - unique_by_key(ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first); - - -/*! \p unique_by_key is a generalization of \p unique to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p unique_by_key removes all but the first element of - * the group. Similarly, the corresponding values in the range - * [values_first, values_first + (keys_last - keys_first)) - * are also removed. - * - * This version of \p unique_by_key uses the function object \c binary_pred - * to test for equality and \c project1st to reduce values with equal keys. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first The beginning of the key range. - * \param keys_last The end of the key range. - * \param values_first The beginning of the value range. - * \param binary_pred The binary predicate used to determine equality. - * \return The end of the unique range [first, new_last). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam ForwardIterator1 is a model of Forward Iterator, - * and \p ForwardIterator1 is mutable, - * and \p ForwardIterator's \c value_type is a model of Equality Comparable. - * \tparam ForwardIterator2 is a model of Forward Iterator, - * and \p ForwardIterator2 is mutable. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * \pre The range [keys_first, keys_last) and the range [values_first, values_first + (keys_last - keys_first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p unique_by_key to - * compact a sequence of key/value pairs to remove consecutive duplicates using the \p thrust::host - * execution policy for parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values - * - * thrust::pair new_end; - * thrust::equal_to binary_pred; - * new_end = thrust::unique_by_key(thrust::host, keys, keys + N, values, binary_pred); - * - * // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4. - * // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4. - * \endcode - * - * \see unique - * \see unique_by_key_copy - * \see reduce_by_key - */ -template - thrust::pair - unique_by_key(const thrust::detail::execution_policy_base &exec, - ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred); - - -/*! \p unique_by_key is a generalization of \p unique to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p unique_by_key removes all but the first element of - * the group. Similarly, the corresponding values in the range - * [values_first, values_first + (keys_last - keys_first)) - * are also removed. - * - * This version of \p unique_by_key uses the function object \c binary_pred - * to test for equality and \c project1st to reduce values with equal keys. - * - * \param keys_first The beginning of the key range. - * \param keys_last The end of the key range. - * \param values_first The beginning of the value range. - * \param binary_pred The binary predicate used to determine equality. 
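A compilable rendering of the unique_by_key example from the documentation above (raw host arrays, thrust::host policy); only the Thrust header names are filled in as assumptions.

// Sketch: compact key/value pairs by consecutive equal keys.
#include <thrust/unique.h>
#include <thrust/pair.h>
#include <thrust/execution_policy.h>

int main()
{
    const int N = 7;
    int keys[N]   = {1, 3, 3, 3, 2, 2, 1};
    int values[N] = {9, 8, 7, 6, 5, 4, 3};

    thrust::pair<int*, int*> new_end =
        thrust::unique_by_key(thrust::host, keys, keys + N, values);

    // keys   now start with {1, 3, 2, 1}, new_end.first  - keys   == 4
    // values now start with {9, 8, 5, 3}, new_end.second - values == 4
    (void)new_end;
    return 0;
}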
- * \return The end of the unique range [first, new_last). - * - * \tparam ForwardIterator1 is a model of Forward Iterator, - * and \p ForwardIterator1 is mutable, - * and \p ForwardIterator's \c value_type is a model of Equality Comparable. - * \tparam ForwardIterator2 is a model of Forward Iterator, - * and \p ForwardIterator2 is mutable. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * \pre The range [keys_first, keys_last) and the range [values_first, values_first + (keys_last - keys_first)) shall not overlap. - * - * The following code snippet demonstrates how to use \p unique_by_key to - * compact a sequence of key/value pairs to remove consecutive duplicates. - * - * \code - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // values - * - * thrust::pair new_end; - * thrust::equal_to binary_pred; - * new_end = thrust::unique_by_key(keys, keys + N, values, binary_pred); - * - * // The first four keys in A are now {1, 3, 2, 1} and new_end.first - A is 4. - * // The first four values in B are now {9, 8, 5, 3} and new_end.second - B is 4. - * \endcode - * - * \see unique - * \see unique_by_key_copy - * \see reduce_by_key - */ -template - thrust::pair - unique_by_key(ForwardIterator1 keys_first, - ForwardIterator1 keys_last, - ForwardIterator2 values_first, - BinaryPredicate binary_pred); - - -/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p unique_by_key_copy copies the first element of the group to - * a range beginning with \c keys_result and the corresponding values from the range - * [values_first, values_first + (keys_last - keys_first)) are copied to a range - * beginning with \c values_result. - * - * This version of \p unique_by_key_copy uses \c operator== to test for equality and - * \c project1st to reduce values with equal keys. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first The beginning of the input key range. - * \param keys_last The end of the input key range. - * \param values_first The beginning of the input value range. - * \param keys_result The beginning of the output key range. - * \param values_result The beginning of the output value range. - * \return A pair of iterators at end of the ranges [keys_result, keys_result_last) and [values_result, values_result_last). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputIterator1 is a model of Output Iterator and - * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. - * \tparam OutputIterator2 is a model of Output Iterator and - * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. - * - * \pre The input ranges shall not overlap either output range. - * - * The following code snippet demonstrates how to use \p unique_by_key_copy to - * compact a sequence of key/value pairs and with equal keys using the \p thrust::host execution policy - * for parallelization: - * - * \code - * #include - * #include - * ... 
- * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values - * int C[N]; // output keys - * int D[N]; // output values - * - * thrust::pair new_end; - * new_end = thrust::unique_by_key_copy(thrust::host, A, A + N, B, C, D); - * - * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. - * // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4. - * \endcode - * - * \see unique_copy - * \see unique_by_key - * \see reduce_by_key - */ -template - thrust::pair - unique_by_key_copy(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p unique_by_key_copy copies the first element of the group to - * a range beginning with \c keys_result and the corresponding values from the range - * [values_first, values_first + (keys_last - keys_first)) are copied to a range - * beginning with \c values_result. - * - * This version of \p unique_by_key_copy uses \c operator== to test for equality and - * \c project1st to reduce values with equal keys. - * - * \param keys_first The beginning of the input key range. - * \param keys_last The end of the input key range. - * \param values_first The beginning of the input value range. - * \param keys_result The beginning of the output key range. - * \param values_result The beginning of the output value range. - * \return A pair of iterators at end of the ranges [keys_result, keys_result_last) and [values_result, values_result_last). - * - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputIterator1 is a model of Output Iterator and - * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. - * \tparam OutputIterator2 is a model of Output Iterator and - * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. - * - * \pre The input ranges shall not overlap either output range. - * - * The following code snippet demonstrates how to use \p unique_by_key_copy to - * compact a sequence of key/value pairs and with equal keys. - * - * \code - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values - * int C[N]; // output keys - * int D[N]; // output values - * - * thrust::pair new_end; - * new_end = thrust::unique_by_key_copy(A, A + N, B, C, D); - * - * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. - * // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4. - * \endcode - * - * \see unique_copy - * \see unique_by_key - * \see reduce_by_key - */ -template - thrust::pair - unique_by_key_copy(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_result, - OutputIterator2 values_result); - - -/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs. 
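The copying variant leaves the inputs intact; a minimal sketch of unique_by_key_copy matching the documented example, again assuming a CUDA/Thrust toolchain with the header names filled in as assumptions.

// Sketch: compact key/value pairs into separate output arrays.
#include <thrust/unique.h>
#include <thrust/pair.h>
#include <thrust/execution_policy.h>

int main()
{
    const int N = 7;
    int A[N] = {1, 3, 3, 3, 2, 2, 1};   // input keys
    int B[N] = {9, 8, 7, 6, 5, 4, 3};   // input values
    int C[N];                           // output keys
    int D[N];                           // output values

    thrust::pair<int*, int*> new_end =
        thrust::unique_by_key_copy(thrust::host, A, A + N, B, C, D);

    // C starts with {1, 3, 2, 1}, D starts with {9, 8, 5, 3},
    // and both new_end.first - C and new_end.second - D equal 4.
    (void)new_end;
    return 0;
}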
- * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p unique_by_key_copy copies the first element of the group to - * a range beginning with \c keys_result and the corresponding values from the range - * [values_first, values_first + (keys_last - keys_first)) are copied to a range - * beginning with \c values_result. - * - * This version of \p unique_by_key_copy uses the function object \c binary_pred - * to test for equality and \c project1st to reduce values with equal keys. - * - * The algorithm's execution is parallelized as determined by \p exec. - * - * \param exec The execution policy to use for parallelization. - * \param keys_first The beginning of the input key range. - * \param keys_last The end of the input key range. - * \param values_first The beginning of the input value range. - * \param keys_result The beginning of the output key range. - * \param values_result The beginning of the output value range. - * \param binary_pred The binary predicate used to determine equality. - * \return A pair of iterators at end of the ranges [keys_result, keys_result_last) and [values_result, values_result_last). - * - * \tparam DerivedPolicy The name of the derived execution policy. - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputIterator1 is a model of Output Iterator and - * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. - * \tparam OutputIterator2 is a model of Output Iterator and - * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * \pre The input ranges shall not overlap either output range. - * - * The following code snippet demonstrates how to use \p unique_by_key_copy to - * compact a sequence of key/value pairs and with equal keys using the \p thrust::host execution policy for - * parallelization: - * - * \code - * #include - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values - * int C[N]; // output keys - * int D[N]; // output values - * - * thrust::pair new_end; - * thrust::equal_to binary_pred; - * new_end = thrust::unique_by_key_copy(thrust::host, A, A + N, B, C, D, binary_pred); - * - * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. - * // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4. - * \endcode - * - * \see unique_copy - * \see unique_by_key - * \see reduce_by_key - */ -template - thrust::pair - unique_by_key_copy(const thrust::detail::execution_policy_base &exec, - InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_result, - OutputIterator2 values_result, - BinaryPredicate binary_pred); - - -/*! \p unique_by_key_copy is a generalization of \p unique_copy to key-value pairs. - * For each group of consecutive keys in the range [keys_first, keys_last) - * that are equal, \p unique_by_key_copy copies the first element of the group to - * a range beginning with \c keys_result and the corresponding values from the range - * [values_first, values_first + (keys_last - keys_first)) are copied to a range - * beginning with \c values_result. 
- * - * This version of \p unique_by_key_copy uses the function object \c binary_pred - * to test for equality and \c project1st to reduce values with equal keys. - * - * \param keys_first The beginning of the input key range. - * \param keys_last The end of the input key range. - * \param values_first The beginning of the input value range. - * \param keys_result The beginning of the output key range. - * \param values_result The beginning of the output value range. - * \param binary_pred The binary predicate used to determine equality. - * \return A pair of iterators at end of the ranges [keys_result, keys_result_last) and [values_result, values_result_last). - * - * \tparam InputIterator1 is a model of Input Iterator, - * \tparam InputIterator2 is a model of Input Iterator, - * \tparam OutputIterator1 is a model of Output Iterator and - * and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type. - * \tparam OutputIterator2 is a model of Output Iterator and - * and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type. - * \tparam BinaryPredicate is a model of Binary Predicate. - * - * \pre The input ranges shall not overlap either output range. - * - * The following code snippet demonstrates how to use \p unique_by_key_copy to - * compact a sequence of key/value pairs and with equal keys. - * - * \code - * #include - * ... - * const int N = 7; - * int A[N] = {1, 3, 3, 3, 2, 2, 1}; // input keys - * int B[N] = {9, 8, 7, 6, 5, 4, 3}; // input values - * int C[N]; // output keys - * int D[N]; // output values - * - * thrust::pair new_end; - * thrust::equal_to binary_pred; - * new_end = thrust::unique_by_key_copy(A, A + N, B, C, D, binary_pred); - * - * // The first four keys in C are now {1, 3, 2, 1} and new_end.first - C is 4. - * // The first four values in D are now {9, 8, 5, 3} and new_end.second - D is 4. - * \endcode - * - * \see unique_copy - * \see unique_by_key - * \see reduce_by_key - */ -template - thrust::pair - unique_by_key_copy(InputIterator1 keys_first, - InputIterator1 keys_last, - InputIterator2 values_first, - OutputIterator1 keys_result, - OutputIterator2 values_result, - BinaryPredicate binary_pred); - - -/*! \} // end stream_compaction - */ - - -} // end namespace thrust - -#include - diff --git a/compat/thrust/version.h b/compat/thrust/version.h deleted file mode 100644 index 730997eecc..0000000000 --- a/compat/thrust/version.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright 2008-2012 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file version.h - * \brief Compile-time macros encoding Thrust release version - * - * is the only Thrust header that is guaranteed to - * change with every thrust release. - */ - -#pragma once - -#include - -// This is the only thrust header that is guaranteed to -// change with every thrust release. 
-// -// THRUST_VERSION % 100 is the sub-minor version -// THRUST_VERSION / 100 % 1000 is the minor version -// THRUST_VERSION / 100000 is the major version - -/*! \def THRUST_VERSION - * \brief The preprocessor macro \p THRUST_VERSION encodes the version - * number of the Thrust library. - * - * THRUST_VERSION % 100 is the sub-minor version. - * THRUST_VERSION / 100 % 1000 is the minor version. - * THRUST_VERSION / 100000 is the major version. - */ -#define THRUST_VERSION 100700 - -/*! \def THRUST_MAJOR_VERSION - * \brief The preprocessor macro \p THRUST_MAJOR_VERSION encodes the - * major version number of the Thrust library. - */ -#define THRUST_MAJOR_VERSION (THRUST_VERSION / 100000) - -/*! \def THRUST_MINOR_VERSION - * \brief The preprocessor macro \p THRUST_MINOR_VERSION encodes the - * minor version number of the Thrust library. - */ -#define THRUST_MINOR_VERSION (THRUST_VERSION / 100 % 1000) - -/*! \def THRUST_SUBMINOR_VERSION - * \brief The preprocessor macro \p THRUST_SUBMINOR_VERSION encodes the - * sub-minor version number of the Thrust library. - */ -#define THRUST_SUBMINOR_VERSION (THRUST_VERSION % 100) - -// Declare these namespaces here for the purpose of Doxygenating them - -/*! \namespace thrust - * \brief \p thrust is the top-level namespace which contains all Thrust - * functions and types. - */ -namespace thrust -{ - -} - diff --git a/compat/winansi.c b/compat/winansi.c new file mode 100644 index 0000000000..50e8388ac1 --- /dev/null +++ b/compat/winansi.c @@ -0,0 +1,392 @@ +/** + * Old Git implementation of windows terminal colors (2009) + * before use of a threaded wrapper. + */ + +#undef NOGDI +#include +#include +#include +#include +#include +#include + +#include "compat/winansi.h" +/* +* Copyright 2008 Peter Harris +*/ + +/* +Functions to be wrapped: +*/ +#undef printf +#undef fprintf +#undef fputs +#undef vfprintf +/* TODO: write */ + +/* +ANSI codes used by git: m, K + +This file is git-specific. Therefore, this file does not attempt +to implement any codes that are not used by git. 
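The version macros removed just above encode major/minor/sub-minor in a single integer. A short sketch of the decoding arithmetic (with THRUST_VERSION == 100700 this prints "Thrust 1.7.0"), assuming <thrust/version.h> is still reachable from the CUDA toolkit's own Thrust once the bundled copy is gone.

// Sketch: decode THRUST_VERSION as documented in version.h.
#include <cstdio>
#include <thrust/version.h>

int main()
{
    std::printf("Thrust %d.%d.%d\n",
                THRUST_MAJOR_VERSION,     // 100700 / 100000      == 1
                THRUST_MINOR_VERSION,     // 100700 / 100 % 1000  == 7
                THRUST_SUBMINOR_VERSION); // 100700 % 100         == 0
    return 0;
}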
+*/ + +static HANDLE console; +static WORD plain_attr; +static WORD attr; +static int negative; + +static void init(void) +{ + CONSOLE_SCREEN_BUFFER_INFO sbi; + + static int initialized = 0; + if (initialized) + return; + + console = GetStdHandle(STD_OUTPUT_HANDLE); + if (console == INVALID_HANDLE_VALUE) + console = NULL; + + if (!console) + return; + + GetConsoleScreenBufferInfo(console, &sbi); + attr = plain_attr = sbi.wAttributes; + negative = 0; + + initialized = 1; +} + +static int write_console(const char *str, int len) +{ + /* convert utf-8 to utf-16, write directly to console */ + int wlen = MultiByteToWideChar(CP_UTF8, 0, str, len, NULL, 0); + wchar_t *wbuf = (wchar_t *)alloca(wlen * sizeof(wchar_t)); + MultiByteToWideChar(CP_UTF8, 0, str, len, wbuf, wlen); + + WriteConsoleW(console, wbuf, wlen, NULL, NULL); + + /* return original (utf-8 encoded) length */ + return len; +} + +#define FOREGROUND_ALL (FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE) +#define BACKGROUND_ALL (BACKGROUND_RED | BACKGROUND_GREEN | BACKGROUND_BLUE) + +static void set_console_attr(void) +{ + WORD attributes = attr; + if (negative) { + attributes &= ~FOREGROUND_ALL; + attributes &= ~BACKGROUND_ALL; + + /* This could probably use a bitmask + instead of a series of ifs */ + if (attr & FOREGROUND_RED) + attributes |= BACKGROUND_RED; + if (attr & FOREGROUND_GREEN) + attributes |= BACKGROUND_GREEN; + if (attr & FOREGROUND_BLUE) + attributes |= BACKGROUND_BLUE; + + if (attr & BACKGROUND_RED) + attributes |= FOREGROUND_RED; + if (attr & BACKGROUND_GREEN) + attributes |= FOREGROUND_GREEN; + if (attr & BACKGROUND_BLUE) + attributes |= FOREGROUND_BLUE; + } + SetConsoleTextAttribute(console, attributes); +} + +static void erase_in_line(void) +{ + CONSOLE_SCREEN_BUFFER_INFO sbi; + DWORD dummy; /* Needed for Windows 7 (or Vista) regression */ + + if (!console) + return; + + GetConsoleScreenBufferInfo(console, &sbi); + FillConsoleOutputCharacterA(console, ' ', + sbi.dwSize.X - sbi.dwCursorPosition.X, sbi.dwCursorPosition, + &dummy); +} + + +static const char *set_attr(const char *str) +{ + const char *func; + size_t len = strspn(str, "0123456789;"); + func = str + len; + + switch (*func) { + case 'm': + do { + long val = strtol(str, (char **)&str, 10); + switch (val) { + case 0: /* reset */ + attr = plain_attr; + negative = 0; + break; + case 1: /* bold */ + attr |= FOREGROUND_INTENSITY; + break; + case 2: /* faint */ + case 22: /* normal */ + attr &= ~FOREGROUND_INTENSITY; + break; + case 3: /* italic */ + /* Unsupported */ + break; + case 4: /* underline */ + case 21: /* double underline */ + /* Wikipedia says this flag does nothing */ + /* Furthermore, mingw doesn't define this flag + attr |= COMMON_LVB_UNDERSCORE; */ + break; + case 24: /* no underline */ + /* attr &= ~COMMON_LVB_UNDERSCORE; */ + break; + case 5: /* slow blink */ + case 6: /* fast blink */ + /* We don't have blink, but we do have + background intensity */ + attr |= BACKGROUND_INTENSITY; + break; + case 25: /* no blink */ + attr &= ~BACKGROUND_INTENSITY; + break; + case 7: /* negative */ + negative = 1; + break; + case 27: /* positive */ + negative = 0; + break; + case 8: /* conceal */ + case 28: /* reveal */ + /* Unsupported */ + break; + case 30: /* Black */ + attr &= ~FOREGROUND_ALL; + break; + case 31: /* Red */ + attr &= ~FOREGROUND_ALL; + attr |= FOREGROUND_RED; + break; + case 32: /* Green */ + attr &= ~FOREGROUND_ALL; + attr |= FOREGROUND_GREEN; + break; + case 33: /* Yellow */ + attr &= ~FOREGROUND_ALL; + attr |= FOREGROUND_RED | 
FOREGROUND_GREEN; + break; + case 34: /* Blue */ + attr &= ~FOREGROUND_ALL; + attr |= FOREGROUND_BLUE; + break; + case 35: /* Magenta */ + attr &= ~FOREGROUND_ALL; + attr |= FOREGROUND_RED | FOREGROUND_BLUE; + break; + case 36: /* Cyan */ + attr &= ~FOREGROUND_ALL; + attr |= FOREGROUND_GREEN | FOREGROUND_BLUE; + break; + case 37: /* White */ + attr |= FOREGROUND_RED | + FOREGROUND_GREEN | + FOREGROUND_BLUE; + break; + case 38: /* Unknown */ + break; + case 39: /* reset */ + attr &= ~FOREGROUND_ALL; + attr |= (plain_attr & FOREGROUND_ALL); + break; + case 40: /* Black */ + attr &= ~BACKGROUND_ALL; + break; + case 41: /* Red */ + attr &= ~BACKGROUND_ALL; + attr |= BACKGROUND_RED; + break; + case 42: /* Green */ + attr &= ~BACKGROUND_ALL; + attr |= BACKGROUND_GREEN; + break; + case 43: /* Yellow */ + attr &= ~BACKGROUND_ALL; + attr |= BACKGROUND_RED | BACKGROUND_GREEN; + break; + case 44: /* Blue */ + attr &= ~BACKGROUND_ALL; + attr |= BACKGROUND_BLUE; + break; + case 45: /* Magenta */ + attr &= ~BACKGROUND_ALL; + attr |= BACKGROUND_RED | BACKGROUND_BLUE; + break; + case 46: /* Cyan */ + attr &= ~BACKGROUND_ALL; + attr |= BACKGROUND_GREEN | BACKGROUND_BLUE; + break; + case 47: /* White */ + attr |= BACKGROUND_RED | + BACKGROUND_GREEN | + BACKGROUND_BLUE; + break; + case 48: /* Unknown */ + break; + case 49: /* reset */ + attr &= ~BACKGROUND_ALL; + attr |= (plain_attr & BACKGROUND_ALL); + break; + default: + /* Unsupported code */ + break; + } + str++; + } while (*(str - 1) == ';'); + + set_console_attr(); + break; + case 'K': + erase_in_line(); + break; + default: + /* Unsupported code */ + break; + } + + return func + 1; +} + +static int ansi_emulate(const char *str, FILE *stream) +{ + int rv = 0; + const char *pos = str; + + fflush(stream); + + while (*pos) { + pos = strstr(str, "\033["); + if (pos) { + int len = (int) (pos - str); + + if (len) { + int out_len = write_console(str, len); + rv += out_len; + if (out_len < len) + return rv; + } + + str = pos + 2; + rv += 2; + + pos = set_attr(str); + rv += (int) (pos - str); + str = pos; + } + else { + int len = (int) strlen(str); + rv += write_console(str, len); + return rv; + } + } + return rv; +} + +int winansi_fputs(const char *str, FILE *stream) +{ + int rv; + + if (!isatty(fileno(stream))) + return fputs(str, stream); + + init(); + + if (!console) + return fputs(str, stream); + + rv = ansi_emulate(str, stream); + + if (rv >= 0) + return 0; + else + return EOF; +} + +int winansi_vfprintf(FILE *stream, const char *format, va_list list) +{ + int len, rv; + char small_buf[256] = { 0 }; + char *buf = small_buf; + va_list cp; + + if (!isatty(fileno(stream))) + goto abort; + + init(); + + if (!console) + goto abort; + + va_copy(cp, list); + len = vsnprintf(small_buf, sizeof(small_buf), format, cp); +#ifdef WIN32 + /* bug on long strings without that */ + if (len == -1) + len = _vscprintf(format, cp); +#endif + va_end(cp); + + if (len > sizeof(small_buf) - 1) { + buf = malloc(len + 1); + if (!buf) + goto abort; + + len = vsnprintf(buf, len + 1, format, list); +#ifdef WIN32 + if (len == -1) + len = _vscprintf(format, list); +#endif + } + + rv = ansi_emulate(buf, stream); + + if (buf != small_buf) + free(buf); + return rv; + +abort: + rv = vfprintf(stream, format, list); + return rv; +} + +int winansi_fprintf(FILE *stream, const char *format, ...) +{ + va_list list; + int rv; + + va_start(list, format); + rv = winansi_vfprintf(stream, format, list); + va_end(list); + + return rv; +} + +int winansi_printf(const char *format, ...) 
+{ + va_list list; + int rv; + + va_start(list, format); + rv = winansi_vfprintf(stdout, format, list); + va_end(list); + + return rv; +} \ No newline at end of file diff --git a/compat/winansi.h b/compat/winansi.h new file mode 100644 index 0000000000..c28dd15cd7 --- /dev/null +++ b/compat/winansi.h @@ -0,0 +1,32 @@ +/* + * ANSI emulation wrappers + */ +#ifdef WIN32 +#include +#include +#include + +#define isatty(fd) _isatty(fd) +#define fileno(fd) _fileno(fd) + +#ifdef __cplusplus +extern "C" { +#endif + int winansi_fputs(const char *str, FILE *stream); + int winansi_printf(const char *format, ...); + int winansi_fprintf(FILE *stream, const char *format, ...); + int winansi_vfprintf(FILE *stream, const char *format, va_list list); +#ifdef __cplusplus +} +#endif + +#undef fputs +#undef fprintf +#undef vfprintf + +#define fputs winansi_fputs +#define printf winansi_printf +#define fprintf winansi_fprintf +#define vfprintf winansi_vfprintf + +#endif \ No newline at end of file diff --git a/compile b/compile old mode 100644 new mode 100755 index a49b6d0905..531136b068 --- a/compile +++ b/compile @@ -1,310 +1,347 @@ -#! /bin/sh -# Wrapper for compilers which do not understand '-c -o'. - -scriptversion=2012-01-04.17; # UTC - -# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2009, 2010, 2012 Free -# Software Foundation, Inc. -# Written by Tom Tromey . -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# This file is maintained in Automake, please report -# bugs to or send patches to -# . - -nl=' -' - -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent tools from complaining about whitespace usage. -IFS=" "" $nl" - -file_conv= - -# func_file_conv build_file lazy -# Convert a $build file to $host form and store it in $file -# Currently only supports Windows hosts. If the determined conversion -# type is listed in (the comma separated) LAZY, no conversion will -# take place. -func_file_conv () -{ - file=$1 - case $file in - / | /[!/]*) # absolute file, and not a UNC file - if test -z "$file_conv"; then - # lazily determine how to convert abs files - case `uname -s` in - MINGW*) - file_conv=mingw - ;; - CYGWIN*) - file_conv=cygwin - ;; - *) - file_conv=wine - ;; - esac - fi - case $file_conv/,$2, in - *,$file_conv,*) - ;; - mingw/*) - file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` - ;; - cygwin/*) - file=`cygpath -m "$file" || echo "$file"` - ;; - wine/*) - file=`winepath -w "$file" || echo "$file"` - ;; - esac - ;; - esac -} - -# func_cl_wrapper cl arg... 
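A small usage sketch for the winansi wrappers added above. On WIN32 the macros in compat/winansi.h reroute printf to winansi_printf, which translates the 'm' (attribute) and 'K' (erase-in-line) sequences into SetConsoleTextAttribute / FillConsoleOutputCharacterA calls and passes everything else through; when the stream is not a console, or on non-Windows builds, the text falls through to stock stdio unchanged. The escape strings and message text below are ordinary ANSI sequences chosen purely for illustration.

/* Sketch: a colored status line through the winansi wrappers. */
#include <stdio.h>
#include "compat/winansi.h"   /* on WIN32: #define printf winansi_printf, etc. */

int main(void)
{
    /* "1;32" -> bold green attribute, "0" -> reset to the original attributes */
    printf("\033[1;32mexample:\033[0m colored text on the Windows console\n");

    /* "K" -> erase from the cursor to the end of the line before rewriting it */
    printf("updating status line\033[K\r");
    return 0;
}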
-# Adjust compile command to suit cl -func_cl_wrapper () -{ - # Assume a capable shell - lib_path= - shared=: - linker_opts= - for arg - do - if test -n "$eat"; then - eat= - else - case $1 in - -o) - # configure might choose to run compile as 'compile cc -o foo foo.c'. - eat=1 - case $2 in - *.o | *.[oO][bB][jJ]) - func_file_conv "$2" - set x "$@" -Fo"$file" - shift - ;; - *) - func_file_conv "$2" - set x "$@" -Fe"$file" - shift - ;; - esac - ;; - -I*) - func_file_conv "${1#-I}" mingw - set x "$@" -I"$file" - shift - ;; - -l*) - lib=${1#-l} - found=no - save_IFS=$IFS - IFS=';' - for dir in $lib_path $LIB - do - IFS=$save_IFS - if $shared && test -f "$dir/$lib.dll.lib"; then - found=yes - set x "$@" "$dir/$lib.dll.lib" - break - fi - if test -f "$dir/$lib.lib"; then - found=yes - set x "$@" "$dir/$lib.lib" - break - fi - done - IFS=$save_IFS - - test "$found" != yes && set x "$@" "$lib.lib" - shift - ;; - -L*) - func_file_conv "${1#-L}" - if test -z "$lib_path"; then - lib_path=$file - else - lib_path="$lib_path;$file" - fi - linker_opts="$linker_opts -LIBPATH:$file" - ;; - -static) - shared=false - ;; - -Wl,*) - arg=${1#-Wl,} - save_ifs="$IFS"; IFS=',' - for flag in $arg; do - IFS="$save_ifs" - linker_opts="$linker_opts $flag" - done - IFS="$save_ifs" - ;; - -Xlinker) - eat=1 - linker_opts="$linker_opts $2" - ;; - -*) - set x "$@" "$1" - shift - ;; - *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) - func_file_conv "$1" - set x "$@" -Tp"$file" - shift - ;; - *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) - func_file_conv "$1" mingw - set x "$@" "$file" - shift - ;; - *) - set x "$@" "$1" - shift - ;; - esac - fi - shift - done - if test -n "$linker_opts"; then - linker_opts="-link$linker_opts" - fi - exec "$@" $linker_opts - exit 1 -} - -eat= - -case $1 in - '') - echo "$0: No command. Try '$0 --help' for more information." 1>&2 - exit 1; - ;; - -h | --h*) - cat <<\EOF -Usage: compile [--help] [--version] PROGRAM [ARGS] - -Wrapper for compilers which do not understand '-c -o'. -Remove '-o dest.o' from ARGS, run PROGRAM with the remaining -arguments, and rename the output as expected. - -If you are trying to build a whole package this is not the -right script to run: please start by reading the file 'INSTALL'. - -Report bugs to . -EOF - exit $? - ;; - -v | --v*) - echo "compile $scriptversion" - exit $? - ;; - cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) - func_cl_wrapper "$@" # Doesn't return... - ;; -esac - -ofile= -cfile= - -for arg -do - if test -n "$eat"; then - eat= - else - case $1 in - -o) - # configure might choose to run compile as 'compile cc -o foo foo.c'. - # So we strip '-o arg' only if arg is an object. - eat=1 - case $2 in - *.o | *.obj) - ofile=$2 - ;; - *) - set x "$@" -o "$2" - shift - ;; - esac - ;; - *.c) - cfile=$1 - set x "$@" "$1" - shift - ;; - *) - set x "$@" "$1" - shift - ;; - esac - fi - shift -done - -if test -z "$ofile" || test -z "$cfile"; then - # If no '-o' option was seen then we might have been invoked from a - # pattern rule where we don't need one. That is ok -- this is a - # normal compilation that the losing compiler can handle. If no - # '.c' file was seen then we are probably linking. That is also - # ok. - exec "$@" -fi - -# Name of file we expect compiler to create. -cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` - -# Create the lock directory. -# Note: use '[/\\:.-]' here to ensure that we don't use the same name -# that we are using for the .o file. 
Also, base the name on the expected -# object file name, since that is what matters with a parallel build. -lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d -while true; do - if mkdir "$lockdir" >/dev/null 2>&1; then - break - fi - sleep 1 -done -# FIXME: race condition here if user kills between mkdir and trap. -trap "rmdir '$lockdir'; exit 1" 1 2 15 - -# Run the compile. -"$@" -ret=$? - -if test -f "$cofile"; then - test "$cofile" = "$ofile" || mv "$cofile" "$ofile" -elif test -f "${cofile}bj"; then - test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" -fi - -rmdir "$lockdir" -exit $ret - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" -# time-stamp-end: "; # UTC" -# End: +#! /bin/sh +# Wrapper for compilers which do not understand '-c -o'. + +scriptversion=2012-10-14.11; # UTC + +# Copyright (C) 1999-2013 Free Software Foundation, Inc. +# Written by Tom Tromey . +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# This file is maintained in Automake, please report +# bugs to or send patches to +# . + +nl=' +' + +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent tools from complaining about whitespace usage. +IFS=" "" $nl" + +file_conv= + +# func_file_conv build_file lazy +# Convert a $build file to $host form and store it in $file +# Currently only supports Windows hosts. If the determined conversion +# type is listed in (the comma separated) LAZY, no conversion will +# take place. 
+func_file_conv () +{ + file=$1 + case $file in + / | /[!/]*) # absolute file, and not a UNC file + if test -z "$file_conv"; then + # lazily determine how to convert abs files + case `uname -s` in + MINGW*) + file_conv=mingw + ;; + CYGWIN*) + file_conv=cygwin + ;; + *) + file_conv=wine + ;; + esac + fi + case $file_conv/,$2, in + *,$file_conv,*) + ;; + mingw/*) + file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` + ;; + cygwin/*) + file=`cygpath -m "$file" || echo "$file"` + ;; + wine/*) + file=`winepath -w "$file" || echo "$file"` + ;; + esac + ;; + esac +} + +# func_cl_dashL linkdir +# Make cl look for libraries in LINKDIR +func_cl_dashL () +{ + func_file_conv "$1" + if test -z "$lib_path"; then + lib_path=$file + else + lib_path="$lib_path;$file" + fi + linker_opts="$linker_opts -LIBPATH:$file" +} + +# func_cl_dashl library +# Do a library search-path lookup for cl +func_cl_dashl () +{ + lib=$1 + found=no + save_IFS=$IFS + IFS=';' + for dir in $lib_path $LIB + do + IFS=$save_IFS + if $shared && test -f "$dir/$lib.dll.lib"; then + found=yes + lib=$dir/$lib.dll.lib + break + fi + if test -f "$dir/$lib.lib"; then + found=yes + lib=$dir/$lib.lib + break + fi + if test -f "$dir/lib$lib.a"; then + found=yes + lib=$dir/lib$lib.a + break + fi + done + IFS=$save_IFS + + if test "$found" != yes; then + lib=$lib.lib + fi +} + +# func_cl_wrapper cl arg... +# Adjust compile command to suit cl +func_cl_wrapper () +{ + # Assume a capable shell + lib_path= + shared=: + linker_opts= + for arg + do + if test -n "$eat"; then + eat= + else + case $1 in + -o) + # configure might choose to run compile as 'compile cc -o foo foo.c'. + eat=1 + case $2 in + *.o | *.[oO][bB][jJ]) + func_file_conv "$2" + set x "$@" -Fo"$file" + shift + ;; + *) + func_file_conv "$2" + set x "$@" -Fe"$file" + shift + ;; + esac + ;; + -I) + eat=1 + func_file_conv "$2" mingw + set x "$@" -I"$file" + shift + ;; + -I*) + func_file_conv "${1#-I}" mingw + set x "$@" -I"$file" + shift + ;; + -l) + eat=1 + func_cl_dashl "$2" + set x "$@" "$lib" + shift + ;; + -l*) + func_cl_dashl "${1#-l}" + set x "$@" "$lib" + shift + ;; + -L) + eat=1 + func_cl_dashL "$2" + ;; + -L*) + func_cl_dashL "${1#-L}" + ;; + -static) + shared=false + ;; + -Wl,*) + arg=${1#-Wl,} + save_ifs="$IFS"; IFS=',' + for flag in $arg; do + IFS="$save_ifs" + linker_opts="$linker_opts $flag" + done + IFS="$save_ifs" + ;; + -Xlinker) + eat=1 + linker_opts="$linker_opts $2" + ;; + -*) + set x "$@" "$1" + shift + ;; + *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) + func_file_conv "$1" + set x "$@" -Tp"$file" + shift + ;; + *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) + func_file_conv "$1" mingw + set x "$@" "$file" + shift + ;; + *) + set x "$@" "$1" + shift + ;; + esac + fi + shift + done + if test -n "$linker_opts"; then + linker_opts="-link$linker_opts" + fi + exec "$@" $linker_opts + exit 1 +} + +eat= + +case $1 in + '') + echo "$0: No command. Try '$0 --help' for more information." 1>&2 + exit 1; + ;; + -h | --h*) + cat <<\EOF +Usage: compile [--help] [--version] PROGRAM [ARGS] + +Wrapper for compilers which do not understand '-c -o'. +Remove '-o dest.o' from ARGS, run PROGRAM with the remaining +arguments, and rename the output as expected. + +If you are trying to build a whole package this is not the +right script to run: please start by reading the file 'INSTALL'. + +Report bugs to . +EOF + exit $? + ;; + -v | --v*) + echo "compile $scriptversion" + exit $? + ;; + cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) + func_cl_wrapper "$@" # Doesn't return... 
+ ;; +esac + +ofile= +cfile= + +for arg +do + if test -n "$eat"; then + eat= + else + case $1 in + -o) + # configure might choose to run compile as 'compile cc -o foo foo.c'. + # So we strip '-o arg' only if arg is an object. + eat=1 + case $2 in + *.o | *.obj) + ofile=$2 + ;; + *) + set x "$@" -o "$2" + shift + ;; + esac + ;; + *.c) + cfile=$1 + set x "$@" "$1" + shift + ;; + *) + set x "$@" "$1" + shift + ;; + esac + fi + shift +done + +if test -z "$ofile" || test -z "$cfile"; then + # If no '-o' option was seen then we might have been invoked from a + # pattern rule where we don't need one. That is ok -- this is a + # normal compilation that the losing compiler can handle. If no + # '.c' file was seen then we are probably linking. That is also + # ok. + exec "$@" +fi + +# Name of file we expect compiler to create. +cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` + +# Create the lock directory. +# Note: use '[/\\:.-]' here to ensure that we don't use the same name +# that we are using for the .o file. Also, base the name on the expected +# object file name, since that is what matters with a parallel build. +lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d +while true; do + if mkdir "$lockdir" >/dev/null 2>&1; then + break + fi + sleep 1 +done +# FIXME: race condition here if user kills between mkdir and trap. +trap "rmdir '$lockdir'; exit 1" 1 2 15 + +# Run the compile. +"$@" +ret=$? + +if test -f "$cofile"; then + test "$cofile" = "$ofile" || mv "$cofile" "$ofile" +elif test -f "${cofile}bj"; then + test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" +fi + +rmdir "$lockdir" +exit $ret + +# Local Variables: +# mode: shell-script +# sh-indentation: 2 +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/config.guess b/config.guess old mode 100644 new mode 100755 index f32079abda..b79252d6b1 --- a/config.guess +++ b/config.guess @@ -1,14 +1,12 @@ #! /bin/sh # Attempt to guess a canonical system name. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 -# Free Software Foundation, Inc. +# Copyright 1992-2013 Free Software Foundation, Inc. -timestamp='2008-01-23' +timestamp='2013-06-10' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or +# the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but @@ -17,26 +15,22 @@ timestamp='2008-01-23' # General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA -# 02110-1301, USA. +# along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - - -# Originally written by Per Bothner . -# Please send patches to . Submit a context -# diff and a properly formatted ChangeLog entry. 
+# the same distribution terms that you use for the rest of that +# program. This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). +# +# Originally written by Per Bothner. # -# This script attempts to guess a canonical system name similar to -# config.sub. If it succeeds, it prints the system name on stdout, and -# exits with 0. Otherwise, it exits with 1. +# You can get the latest version of this script from: +# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD # -# The plan is that this can be called by configure scripts if you -# don't specify an explicit build system type. +# Please send patches with a ChangeLog entry to config-patches@gnu.org. + me=`echo "$0" | sed -e 's,.*/,,'` @@ -56,8 +50,7 @@ version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, -2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. +Copyright 1992-2013 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -139,12 +132,33 @@ UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown +case "${UNAME_SYSTEM}" in +Linux|GNU|GNU/*) + # If the system lacks a compiler, then just pick glibc. + # We could probably try harder. + LIBC=gnu + + eval $set_cc_for_build + cat <<-EOF > $dummy.c + #include + #if defined(__UCLIBC__) + LIBC=uclibc + #elif defined(__dietlibc__) + LIBC=dietlibc + #else + LIBC=gnu + #endif + EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` + ;; +esac + # Note: order is significant - the case branches are not exclusive. case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in *:NetBSD:*:*) # NetBSD (nbsd) targets should (where applicable) match one or - # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, + # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently # switched to ELF, *-*-netbsd* would select the old # object file format. This provides both forward @@ -170,7 +184,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in arm*|i386|m68k|ns32k|sh3*|sparc|vax) eval $set_cc_for_build if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep __ELF__ >/dev/null + | grep -q __ELF__ then # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). # Return netbsd for either. FIX? @@ -180,7 +194,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in fi ;; *) - os=netbsd + os=netbsd ;; esac # The OS release @@ -201,6 +215,10 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. 
echo "${machine}-${os}${release}" exit ;; + *:Bitrig:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` + echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE} + exit ;; *:OpenBSD:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} @@ -223,7 +241,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` ;; *5.*) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` ;; esac # According to Compaq, /usr/sbin/psrinfo has been available on @@ -269,7 +287,10 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in # A Xn.n version is an unreleased experimental baselevel. # 1.2 uses "1.2" for uname -r. echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - exit ;; + # Reset EXIT trap before exiting to avoid spurious non-zero exit code. + exitcode=$? + trap '' 0 + exit $exitcode ;; Alpha\ *:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # Should we change UNAME_MACHINE based on the output of uname instead @@ -295,12 +316,12 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in echo s390-ibm-zvmoe exit ;; *:OS400:*:*) - echo powerpc-ibm-os400 + echo powerpc-ibm-os400 exit ;; arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) echo arm-acorn-riscix${UNAME_RELEASE} exit ;; - arm:riscos:*:*|arm:RISCOS:*:*) + arm*:riscos:*:*|arm*:RISCOS:*:*) echo arm-unknown-riscos exit ;; SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) @@ -324,14 +345,33 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in case `/usr/bin/uname -p` in sparc) echo sparc-icl-nx7; exit ;; esac ;; + s390x:SunOS:*:*) + echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; sun4H:SunOS:5.*:*) echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; + i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) + echo i386-pc-auroraux${UNAME_RELEASE} + exit ;; i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) - echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + eval $set_cc_for_build + SUN_ARCH="i386" + # If there is a compiler, see if it is configured for 64-bit objects. + # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. + # This test works for both compilers. + if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + SUN_ARCH="x86_64" + fi + fi + echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:6*:*) # According to config.sub, this is the proper way to canonicalize @@ -375,23 +415,23 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in # MiNT. But MiNT is downward compatible to TOS, so this should # be no problem. 
atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} + echo m68k-atari-mint${UNAME_RELEASE} exit ;; atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} - exit ;; + exit ;; *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} + echo m68k-atari-mint${UNAME_RELEASE} exit ;; milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) - echo m68k-milan-mint${UNAME_RELEASE} - exit ;; + echo m68k-milan-mint${UNAME_RELEASE} + exit ;; hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) - echo m68k-hades-mint${UNAME_RELEASE} - exit ;; + echo m68k-hades-mint${UNAME_RELEASE} + exit ;; *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) - echo m68k-unknown-mint${UNAME_RELEASE} - exit ;; + echo m68k-unknown-mint${UNAME_RELEASE} + exit ;; m68k:machten:*:*) echo m68k-apple-machten${UNAME_RELEASE} exit ;; @@ -461,8 +501,8 @@ EOF echo m88k-motorola-sysv3 exit ;; AViiON:dgux:*:*) - # DG/UX returns AViiON for all architectures - UNAME_PROCESSOR=`/usr/bin/uname -p` + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] then if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ @@ -475,7 +515,7 @@ EOF else echo i586-dg-dgux${UNAME_RELEASE} fi - exit ;; + exit ;; M88*:DolphinOS:*:*) # DolphinOS (SVR3) echo m88k-dolphin-sysv3 exit ;; @@ -532,7 +572,7 @@ EOF echo rs6000-ibm-aix3.2 fi exit ;; - *:AIX:*:[456]) + *:AIX:*:[4567]) IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then IBM_ARCH=rs6000 @@ -575,52 +615,52 @@ EOF 9000/[678][0-9][0-9]) if [ -x /usr/bin/getconf ]; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` - sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` - case "${sc_cpu_version}" in - 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 - 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 - 532) # CPU_PA_RISC2_0 - case "${sc_kernel_bits}" in - 32) HP_ARCH="hppa2.0n" ;; - 64) HP_ARCH="hppa2.0w" ;; + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case "${sc_cpu_version}" in + 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 + 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case "${sc_kernel_bits}" in + 32) HP_ARCH="hppa2.0n" ;; + 64) HP_ARCH="hppa2.0w" ;; '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 - esac ;; - esac + esac ;; + esac fi if [ "${HP_ARCH}" = "" ]; then eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + sed 's/^ //' << EOF >$dummy.c - #define _HPUX_SOURCE - #include - #include + #define _HPUX_SOURCE + #include + #include - int main () - { - #if defined(_SC_KERNEL_BITS) - long bits = sysconf(_SC_KERNEL_BITS); - #endif - long cpu = sysconf (_SC_CPU_VERSION); + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1"); break; - case CPU_PA_RISC2_0: - #if defined(_SC_KERNEL_BITS) - switch (bits) - { - case 64: puts ("hppa2.0w"); break; - case 32: puts ("hppa2.0n"); break; - default: puts ("hppa2.0"); break; - } break; - #else /* !defined(_SC_KERNEL_BITS) */ - puts ("hppa2.0"); break; - #endif - default: puts ("hppa1.0"); break; - } - exit (0); - } + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: 
puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } EOF (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` test -z "$HP_ARCH" && HP_ARCH=hppa @@ -640,7 +680,7 @@ EOF # => hppa64-hp-hpux11.23 if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | - grep __LP64__ >/dev/null + grep -q __LP64__ then HP_ARCH="hppa2.0w" else @@ -711,22 +751,22 @@ EOF exit ;; C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) echo c1-convex-bsd - exit ;; + exit ;; C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) if getsysinfo -f scalar_acc then echo c32-convex-bsd else echo c2-convex-bsd fi - exit ;; + exit ;; C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) echo c34-convex-bsd - exit ;; + exit ;; C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) echo c38-convex-bsd - exit ;; + exit ;; C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) echo c4-convex-bsd - exit ;; + exit ;; CRAY*Y-MP:*:*:*) echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; @@ -750,14 +790,14 @@ EOF exit ;; F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` - echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; 5000:UNIX_System_V:4.*:*) - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` - echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} @@ -769,34 +809,39 @@ EOF echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} exit ;; *:FreeBSD:*:*) - case ${UNAME_MACHINE} in - pc98) - echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + UNAME_PROCESSOR=`/usr/bin/uname -p` + case ${UNAME_PROCESSOR} in amd64) echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; *) - echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; esac exit ;; i*:CYGWIN*:*) echo ${UNAME_MACHINE}-pc-cygwin exit ;; + *:MINGW64*:*) + echo ${UNAME_MACHINE}-pc-mingw64 + exit ;; *:MINGW*:*) echo ${UNAME_MACHINE}-pc-mingw32 exit ;; + i*:MSYS*:*) + echo ${UNAME_MACHINE}-pc-msys + exit ;; i*:windows32*:*) - # uname -m includes "-pc" on this system. - echo ${UNAME_MACHINE}-mingw32 + # uname -m includes "-pc" on this system. 
+ echo ${UNAME_MACHINE}-mingw32 exit ;; i*:PW*:*) echo ${UNAME_MACHINE}-pc-pw32 exit ;; - *:Interix*:[3456]*) - case ${UNAME_MACHINE} in + *:Interix*:*) + case ${UNAME_MACHINE} in x86) echo i586-pc-interix${UNAME_RELEASE} exit ;; - EM64T | authenticamd) + authenticamd | genuineintel | EM64T) echo x86_64-unknown-interix${UNAME_RELEASE} exit ;; IA64) @@ -806,6 +851,9 @@ EOF [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) echo i${UNAME_MACHINE}-pc-mks exit ;; + 8664:Windows_NT:*) + echo x86_64-pc-mks + exit ;; i*:Windows_NT*:* | Pentium*:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we @@ -826,210 +874,157 @@ EOF exit ;; *:GNU:*:*) # the GNU system - echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland - echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC} exit ;; i*86:Minix:*:*) echo ${UNAME_MACHINE}-pc-minix exit ;; + aarch64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + aarch64_be:Linux:*:*) + UNAME_MACHINE=aarch64_be + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep -q ld.so.1 + if test "$?" 
= 0 ; then LIBC="gnulibc1" ; fi + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + arc:Linux:*:* | arceb:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; arm*:Linux:*:*) eval $set_cc_for_build if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} else - echo ${UNAME_MACHINE}-unknown-linux-gnueabi + if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_PCS_VFP + then + echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi + else + echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf + fi fi exit ;; avr32*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; cris:Linux:*:*) - echo cris-axis-linux-gnu + echo ${UNAME_MACHINE}-axis-linux-${LIBC} exit ;; crisv32:Linux:*:*) - echo crisv32-axis-linux-gnu + echo ${UNAME_MACHINE}-axis-linux-${LIBC} exit ;; frv:Linux:*:*) - echo frv-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + hexagon:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + i*86:Linux:*:*) + echo ${UNAME_MACHINE}-pc-linux-${LIBC} exit ;; ia64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; m32r*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; m68*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; - mips:Linux:*:*) + mips:Linux:*:* | mips64:Linux:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #undef CPU - #undef mips - #undef mipsel + #undef ${UNAME_MACHINE} + #undef ${UNAME_MACHINE}el #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=mipsel + CPU=${UNAME_MACHINE}el #else #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=mips + CPU=${UNAME_MACHINE} #else CPU= #endif #endif EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^CPU/{ - s: ::g - p - }'`" - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; } ;; - mips64:Linux:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #undef CPU - #undef mips64 - #undef mips64el - #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=mips64el - #else - #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=mips64 - #else - CPU= - #endif - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^CPU/{ - s: ::g - p - }'`" - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } - ;; - or32:Linux:*:*) - echo or32-unknown-linux-gnu + or1k:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; - ppc:Linux:*:*) - echo powerpc-unknown-linux-gnu + or32:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; - ppc64:Linux:*:*) - echo powerpc64-unknown-linux-gnu + padre:Linux:*:*) + echo sparc-unknown-linux-${LIBC} exit ;; - alpha:Linux:*:*) - case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in - EV5) UNAME_MACHINE=alphaev5 ;; - EV56) UNAME_MACHINE=alphaev56 ;; - PCA56) UNAME_MACHINE=alphapca56 ;; - PCA57) UNAME_MACHINE=alphapca56 ;; - EV6) UNAME_MACHINE=alphaev6 ;; - EV67) UNAME_MACHINE=alphaev67 ;; - EV68*) UNAME_MACHINE=alphaev68 ;; - esac - 
objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null - if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi - echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + parisc64:Linux:*:* | hppa64:Linux:*:*) + echo hppa64-unknown-linux-${LIBC} exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-unknown-linux-gnu ;; - PA8*) echo hppa2.0-unknown-linux-gnu ;; - *) echo hppa-unknown-linux-gnu ;; + PA7*) echo hppa1.1-unknown-linux-${LIBC} ;; + PA8*) echo hppa2.0-unknown-linux-${LIBC} ;; + *) echo hppa-unknown-linux-${LIBC} ;; esac exit ;; - parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-gnu + ppc64:Linux:*:*) + echo powerpc64-unknown-linux-${LIBC} + exit ;; + ppc:Linux:*:*) + echo powerpc-unknown-linux-${LIBC} + exit ;; + ppc64le:Linux:*:*) + echo powerpc64le-unknown-linux-${LIBC} + exit ;; + ppcle:Linux:*:*) + echo powerpcle-unknown-linux-${LIBC} exit ;; s390:Linux:*:* | s390x:Linux:*:*) - echo ${UNAME_MACHINE}-ibm-linux + echo ${UNAME_MACHINE}-ibm-linux-${LIBC} exit ;; sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; sh*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + tile*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; vax:Linux:*:*) - echo ${UNAME_MACHINE}-dec-linux-gnu + echo ${UNAME_MACHINE}-dec-linux-${LIBC} exit ;; x86_64:Linux:*:*) - echo x86_64-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; xtensa*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; - i*86:Linux:*:*) - # The BFD linker knows what the default object file format is, so - # first see if it will tell us. cd to the root directory to prevent - # problems with other programs or directories called `ld' in the path. - # Set LC_ALL=C to ensure ld outputs messages in English. - ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ - | sed -ne '/supported targets:/!d - s/[ ][ ]*/ /g - s/.*supported targets: *// - s/ .*// - p'` - case "$ld_supported_targets" in - elf32-i386) - TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" - ;; - a.out-i386-linux) - echo "${UNAME_MACHINE}-pc-linux-gnuaout" - exit ;; - coff-i386) - echo "${UNAME_MACHINE}-pc-linux-gnucoff" - exit ;; - "") - # Either a pre-BFD a.out linker (linux-gnuoldld) or - # one that does not give us useful --help. - echo "${UNAME_MACHINE}-pc-linux-gnuoldld" - exit ;; - esac - # Determine whether the default compiler is a.out or elf - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - #ifdef __ELF__ - # ifdef __GLIBC__ - # if __GLIBC__ >= 2 - LIBC=gnu - # else - LIBC=gnulibc1 - # endif - # else - LIBC=gnulibc1 - # endif - #else - #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) - LIBC=gnu - #else - LIBC=gnuaout - #endif - #endif - #ifdef __dietlibc__ - LIBC=dietlibc - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^LIBC/{ - s: ::g - p - }'`" - test x"${LIBC}" != x && { - echo "${UNAME_MACHINE}-pc-linux-${LIBC}" - exit - } - test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; } - ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. 
# earlier versions are messed up and put the nodename in both @@ -1037,11 +1032,11 @@ EOF echo i386-sequent-sysv4 exit ;; i*86:UNIX_SV:4.2MP:2.*) - # Unixware is an offshoot of SVR4, but it has its own version - # number series starting with 2... - # I am not positive that other SVR4 systems won't match this, + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, # I just have to hope. -- rms. - # Use sysv4.2uw... so that sysv4* matches it. + # Use sysv4.2uw... so that sysv4* matches it. echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} exit ;; i*86:OS/2:*:*) @@ -1058,7 +1053,7 @@ EOF i*86:syllable:*:*) echo ${UNAME_MACHINE}-pc-syllable exit ;; - i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) echo i386-unknown-lynxos${UNAME_RELEASE} exit ;; i*86:*DOS:*:*) @@ -1073,7 +1068,7 @@ EOF fi exit ;; i*86:*:5:[678]*) - # UnixWare 7.x, OpenUNIX and OpenServer 6. + # UnixWare 7.x, OpenUNIX and OpenServer 6. case `/bin/uname -X | grep "^Machine"` in *486*) UNAME_MACHINE=i486 ;; *Pentium) UNAME_MACHINE=i586 ;; @@ -1101,10 +1096,13 @@ EOF exit ;; pc:*:*:*) # Left here for compatibility: - # uname -m prints for DJGPP always 'pc', but it prints nothing about - # the processor, so we play safe by assuming i386. - echo i386-pc-msdosdjgpp - exit ;; + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i586. + # Note: whatever this is, it MUST be the same as what config.sub + # prints for the "djgpp" host, or else GDB configury will decide that + # this is a cross-build. + echo i586-pc-msdosdjgpp + exit ;; Intel:Mach:3*:*) echo i386-pc-mach3 exit ;; @@ -1139,8 +1137,18 @@ EOF /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4; exit; } ;; + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4; exit; } ;; + NCR*:*:4.2:* | MPRAS*:*:4.2:*) + OS_REL='.3' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) echo m68k-unknown-lynxos${UNAME_RELEASE} exit ;; @@ -1153,7 +1161,7 @@ EOF rs6000:LynxOS:2.*:*) echo rs6000-unknown-lynxos${UNAME_RELEASE} exit ;; - PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) echo powerpc-unknown-lynxos${UNAME_RELEASE} exit ;; SM[BE]S:UNIX_SV:*:*) @@ -1173,10 +1181,10 @@ EOF echo ns32k-sni-sysv fi exit ;; - PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort - # says - echo i586-unisys-sysv4 - exit ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says + echo i586-unisys-sysv4 + exit ;; *:UNIX_System_V:4*:FTX*) # From Gerald Hewes . # How about differentiating between stratus architectures? 
-djm @@ -1202,11 +1210,11 @@ EOF exit ;; R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) if [ -d /usr/nec ]; then - echo mips-nec-sysv${UNAME_RELEASE} + echo mips-nec-sysv${UNAME_RELEASE} else - echo mips-unknown-sysv${UNAME_RELEASE} + echo mips-unknown-sysv${UNAME_RELEASE} fi - exit ;; + exit ;; BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. echo powerpc-be-beos exit ;; @@ -1216,6 +1224,12 @@ EOF BePC:BeOS:*:*) # BeOS running on Intel PC compatible. echo i586-pc-beos exit ;; + BePC:Haiku:*:*) # Haiku running on Intel PC compatible. + echo i586-pc-haiku + exit ;; + x86_64:Haiku:*:*) + echo x86_64-unknown-haiku + exit ;; SX-4:SUPER-UX:*:*) echo sx4-nec-superux${UNAME_RELEASE} exit ;; @@ -1242,9 +1256,21 @@ EOF exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - case $UNAME_PROCESSOR in - unknown) UNAME_PROCESSOR=powerpc ;; - esac + eval $set_cc_for_build + if test "$UNAME_PROCESSOR" = unknown ; then + UNAME_PROCESSOR=powerpc + fi + if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + case $UNAME_PROCESSOR in + i386) UNAME_PROCESSOR=x86_64 ;; + powerpc) UNAME_PROCESSOR=powerpc64 ;; + esac + fi + fi echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) @@ -1258,7 +1284,10 @@ EOF *:QNX:*:4*) echo i386-pc-qnx exit ;; - NSE-?:NONSTOP_KERNEL:*:*) + NEO-?:NONSTOP_KERNEL:*:*) + echo neo-tandem-nsk${UNAME_RELEASE} + exit ;; + NSE-*:NONSTOP_KERNEL:*:*) echo nse-tandem-nsk${UNAME_RELEASE} exit ;; NSR-?:NONSTOP_KERNEL:*:*) @@ -1303,13 +1332,13 @@ EOF echo pdp10-unknown-its exit ;; SEI:*:*:SEIUX) - echo mips-sei-seiux${UNAME_RELEASE} + echo mips-sei-seiux${UNAME_RELEASE} exit ;; *:DragonFly:*:*) echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` exit ;; *:*VMS:*:*) - UNAME_MACHINE=`(uname -p) 2>/dev/null` + UNAME_MACHINE=`(uname -p) 2>/dev/null` case "${UNAME_MACHINE}" in A*) echo alpha-dec-vms ; exit ;; I*) echo ia64-dec-vms ; exit ;; @@ -1324,11 +1353,14 @@ EOF i*86:rdos:*:*) echo ${UNAME_MACHINE}-pc-rdos exit ;; + i*86:AROS:*:*) + echo ${UNAME_MACHINE}-pc-aros + exit ;; + x86_64:VMkernel:*:*) + echo ${UNAME_MACHINE}-unknown-esx + exit ;; esac -#echo '(No uname command or uname output not recognized.)' 1>&2 -#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 - eval $set_cc_for_build cat >$dummy.c < printf ("m68k-sony-newsos%s\n", #ifdef NEWSOS4 - "4" + "4" #else - "" + "" #endif - ); exit (0); + ); exit (0); #endif #endif diff --git a/config.sub b/config.sub deleted file mode 100644 index 6759825a5b..0000000000 --- a/config.sub +++ /dev/null @@ -1,1658 +0,0 @@ -#! /bin/sh -# Configuration validation subroutine script. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 -# Free Software Foundation, Inc. - -timestamp='2008-01-16' - -# This file is (in principle) common to ALL GNU software. -# The presence of a machine in this file suggests that SOME GNU software -# can handle that machine. It does not imply ALL GNU software can. -# -# This file is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA -# 02110-1301, USA. -# -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - - -# Please send patches to . Submit a context -# diff and a properly formatted ChangeLog entry. -# -# Configuration subroutine to validate and canonicalize a configuration type. -# Supply the specified configuration type as an argument. -# If it is invalid, we print an error message on stderr and exit with code 1. -# Otherwise, we print the canonical config type on stdout and succeed. - -# This file is supposed to be the same for all GNU packages -# and recognize all the CPU types, system types and aliases -# that are meaningful with *any* GNU software. -# Each package is responsible for reporting which valid configurations -# it does not support. The user should be able to distinguish -# a failure to support a valid configuration from a meaningless -# configuration. - -# The goal of this file is to map all the various variations of a given -# machine specification into a single specification in the form: -# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM -# or in some cases, the newer four-part form: -# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM -# It is wrong to echo any other type of specification. - -me=`echo "$0" | sed -e 's,.*/,,'` - -usage="\ -Usage: $0 [OPTION] CPU-MFR-OPSYS - $0 [OPTION] ALIAS - -Canonicalize a configuration name. - -Operation modes: - -h, --help print this help, then exit - -t, --time-stamp print date of last modification, then exit - -v, --version print version number, then exit - -Report bugs and patches to ." - -version="\ -GNU config.sub ($timestamp) - -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, -2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. - -This is free software; see the source for copying conditions. There is NO -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." - -help=" -Try \`$me --help' for more information." - -# Parse command line -while test $# -gt 0 ; do - case $1 in - --time-stamp | --time* | -t ) - echo "$timestamp" ; exit ;; - --version | -v ) - echo "$version" ; exit ;; - --help | --h* | -h ) - echo "$usage"; exit ;; - -- ) # Stop option processing - shift; break ;; - - ) # Use stdin as input. - break ;; - -* ) - echo "$me: invalid option $1$help" - exit 1 ;; - - *local*) - # First pass through any local machine types. - echo $1 - exit ;; - - * ) - break ;; - esac -done - -case $# in - 0) echo "$me: missing argument$help" >&2 - exit 1;; - 1) ;; - *) echo "$me: too many arguments$help" >&2 - exit 1;; -esac - -# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). -# Here we must recognize all the valid KERNEL-OS combinations. 
-maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` -case $maybe_os in - nto-qnx* | linux-gnu* | linux-dietlibc | linux-newlib* | linux-uclibc* | \ - uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | \ - storm-chaos* | os2-emx* | rtmk-nova*) - os=-$maybe_os - basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` - ;; - *) - basic_machine=`echo $1 | sed 's/-[^-]*$//'` - if [ $basic_machine != $1 ] - then os=`echo $1 | sed 's/.*-/-/'` - else os=; fi - ;; -esac - -### Let's recognize common machines as not being operating systems so -### that things like config.sub decstation-3100 work. We also -### recognize some manufacturers as not being operating systems, so we -### can provide default operating systems below. -case $os in - -sun*os*) - # Prevent following clause from handling this invalid input. - ;; - -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ - -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ - -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ - -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ - -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ - -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ - -apple | -axis | -knuth | -cray) - os= - basic_machine=$1 - ;; - -sim | -cisco | -oki | -wec | -winbond) - os= - basic_machine=$1 - ;; - -scout) - ;; - -wrs) - os=-vxworks - basic_machine=$1 - ;; - -chorusos*) - os=-chorusos - basic_machine=$1 - ;; - -chorusrdb) - os=-chorusrdb - basic_machine=$1 - ;; - -hiux*) - os=-hiuxwe2 - ;; - -sco6) - os=-sco5v6 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco5) - os=-sco3.2v5 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco4) - os=-sco3.2v4 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco3.2.[4-9]*) - os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco3.2v[4-9]*) - # Don't forget version if it is 3.2v4 or newer. - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco5v6*) - # Don't forget version if it is 3.2v4 or newer. - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco*) - os=-sco3.2v2 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -udk*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -isc) - os=-isc2.2 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -clix*) - basic_machine=clipper-intergraph - ;; - -isc*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -lynx*) - os=-lynxos - ;; - -ptx*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` - ;; - -windowsnt*) - os=`echo $os | sed -e 's/windowsnt/winnt/'` - ;; - -psos*) - os=-psos - ;; - -mint | -mint[0-9]*) - basic_machine=m68k-atari - os=-mint - ;; -esac - -# Decode aliases for certain CPU-COMPANY combinations. -case $basic_machine in - # Recognize the basic CPU types without company name. - # Some are omitted here because they have special meanings below. 
- 1750a | 580 \ - | a29k \ - | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ - | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ - | am33_2.0 \ - | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ - | bfin \ - | c4x | clipper \ - | d10v | d30v | dlx | dsp16xx \ - | fido | fr30 | frv \ - | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ - | i370 | i860 | i960 | ia64 \ - | ip2k | iq2000 \ - | m32c | m32r | m32rle | m68000 | m68k | m88k \ - | maxq | mb | microblaze | mcore | mep \ - | mips | mipsbe | mipseb | mipsel | mipsle \ - | mips16 \ - | mips64 | mips64el \ - | mips64vr | mips64vrel \ - | mips64orion | mips64orionel \ - | mips64vr4100 | mips64vr4100el \ - | mips64vr4300 | mips64vr4300el \ - | mips64vr5000 | mips64vr5000el \ - | mips64vr5900 | mips64vr5900el \ - | mipsisa32 | mipsisa32el \ - | mipsisa32r2 | mipsisa32r2el \ - | mipsisa64 | mipsisa64el \ - | mipsisa64r2 | mipsisa64r2el \ - | mipsisa64sb1 | mipsisa64sb1el \ - | mipsisa64sr71k | mipsisa64sr71kel \ - | mipstx39 | mipstx39el \ - | mn10200 | mn10300 \ - | mt \ - | msp430 \ - | nios | nios2 \ - | ns16k | ns32k \ - | or32 \ - | pdp10 | pdp11 | pj | pjl \ - | powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \ - | pyramid \ - | score \ - | sh | sh[1234] | sh[24]a | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ - | sh64 | sh64le \ - | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ - | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ - | spu | strongarm \ - | tahoe | thumb | tic4x | tic80 | tron \ - | v850 | v850e \ - | we32k \ - | x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \ - | z8k) - basic_machine=$basic_machine-unknown - ;; - m6811 | m68hc11 | m6812 | m68hc12) - # Motorola 68HC11/12. - basic_machine=$basic_machine-unknown - os=-none - ;; - m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) - ;; - ms1) - basic_machine=mt-unknown - ;; - - # We use `pc' rather than `unknown' - # because (1) that's what they normally are, and - # (2) the word "unknown" tends to confuse beginning users. - i*86 | x86_64) - basic_machine=$basic_machine-pc - ;; - # Object if more than one company name word. - *-*-*) - echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 - exit 1 - ;; - # Recognize the basic CPU types with company name. 
- 580-* \ - | a29k-* \ - | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ - | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ - | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ - | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ - | avr-* | avr32-* \ - | bfin-* | bs2000-* \ - | c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \ - | clipper-* | craynv-* | cydra-* \ - | d10v-* | d30v-* | dlx-* \ - | elxsi-* \ - | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ - | h8300-* | h8500-* \ - | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ - | i*86-* | i860-* | i960-* | ia64-* \ - | ip2k-* | iq2000-* \ - | m32c-* | m32r-* | m32rle-* \ - | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ - | m88110-* | m88k-* | maxq-* | mcore-* \ - | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ - | mips16-* \ - | mips64-* | mips64el-* \ - | mips64vr-* | mips64vrel-* \ - | mips64orion-* | mips64orionel-* \ - | mips64vr4100-* | mips64vr4100el-* \ - | mips64vr4300-* | mips64vr4300el-* \ - | mips64vr5000-* | mips64vr5000el-* \ - | mips64vr5900-* | mips64vr5900el-* \ - | mipsisa32-* | mipsisa32el-* \ - | mipsisa32r2-* | mipsisa32r2el-* \ - | mipsisa64-* | mipsisa64el-* \ - | mipsisa64r2-* | mipsisa64r2el-* \ - | mipsisa64sb1-* | mipsisa64sb1el-* \ - | mipsisa64sr71k-* | mipsisa64sr71kel-* \ - | mipstx39-* | mipstx39el-* \ - | mmix-* \ - | mt-* \ - | msp430-* \ - | nios-* | nios2-* \ - | none-* | np1-* | ns16k-* | ns32k-* \ - | orion-* \ - | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ - | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \ - | pyramid-* \ - | romp-* | rs6000-* \ - | sh-* | sh[1234]-* | sh[24]a-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ - | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ - | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ - | sparclite-* \ - | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \ - | tahoe-* | thumb-* \ - | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ - | tron-* \ - | v850-* | v850e-* | vax-* \ - | we32k-* \ - | x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \ - | xstormy16-* | xtensa*-* \ - | ymp-* \ - | z8k-*) - ;; - # Recognize the basic CPU types without company name, with glob match. - xtensa*) - basic_machine=$basic_machine-unknown - ;; - # Recognize the various machine names and aliases which stand - # for a CPU type and a company and sometimes even an OS. 
- 386bsd) - basic_machine=i386-unknown - os=-bsd - ;; - 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) - basic_machine=m68000-att - ;; - 3b*) - basic_machine=we32k-att - ;; - a29khif) - basic_machine=a29k-amd - os=-udi - ;; - abacus) - basic_machine=abacus-unknown - ;; - adobe68k) - basic_machine=m68010-adobe - os=-scout - ;; - alliant | fx80) - basic_machine=fx80-alliant - ;; - altos | altos3068) - basic_machine=m68k-altos - ;; - am29k) - basic_machine=a29k-none - os=-bsd - ;; - amd64) - basic_machine=x86_64-pc - ;; - amd64-*) - basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - amdahl) - basic_machine=580-amdahl - os=-sysv - ;; - amiga | amiga-*) - basic_machine=m68k-unknown - ;; - amigaos | amigados) - basic_machine=m68k-unknown - os=-amigaos - ;; - amigaunix | amix) - basic_machine=m68k-unknown - os=-sysv4 - ;; - apollo68) - basic_machine=m68k-apollo - os=-sysv - ;; - apollo68bsd) - basic_machine=m68k-apollo - os=-bsd - ;; - aux) - basic_machine=m68k-apple - os=-aux - ;; - balance) - basic_machine=ns32k-sequent - os=-dynix - ;; - blackfin) - basic_machine=bfin-unknown - os=-linux - ;; - blackfin-*) - basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` - os=-linux - ;; - c90) - basic_machine=c90-cray - os=-unicos - ;; - convex-c1) - basic_machine=c1-convex - os=-bsd - ;; - convex-c2) - basic_machine=c2-convex - os=-bsd - ;; - convex-c32) - basic_machine=c32-convex - os=-bsd - ;; - convex-c34) - basic_machine=c34-convex - os=-bsd - ;; - convex-c38) - basic_machine=c38-convex - os=-bsd - ;; - cray | j90) - basic_machine=j90-cray - os=-unicos - ;; - craynv) - basic_machine=craynv-cray - os=-unicosmp - ;; - cr16) - basic_machine=cr16-unknown - os=-elf - ;; - crds | unos) - basic_machine=m68k-crds - ;; - crisv32 | crisv32-* | etraxfs*) - basic_machine=crisv32-axis - ;; - cris | cris-* | etrax*) - basic_machine=cris-axis - ;; - crx) - basic_machine=crx-unknown - os=-elf - ;; - da30 | da30-*) - basic_machine=m68k-da30 - ;; - decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) - basic_machine=mips-dec - ;; - decsystem10* | dec10*) - basic_machine=pdp10-dec - os=-tops10 - ;; - decsystem20* | dec20*) - basic_machine=pdp10-dec - os=-tops20 - ;; - delta | 3300 | motorola-3300 | motorola-delta \ - | 3300-motorola | delta-motorola) - basic_machine=m68k-motorola - ;; - delta88) - basic_machine=m88k-motorola - os=-sysv3 - ;; - djgpp) - basic_machine=i586-pc - os=-msdosdjgpp - ;; - dpx20 | dpx20-*) - basic_machine=rs6000-bull - os=-bosx - ;; - dpx2* | dpx2*-bull) - basic_machine=m68k-bull - os=-sysv3 - ;; - ebmon29k) - basic_machine=a29k-amd - os=-ebmon - ;; - elxsi) - basic_machine=elxsi-elxsi - os=-bsd - ;; - encore | umax | mmax) - basic_machine=ns32k-encore - ;; - es1800 | OSE68k | ose68k | ose | OSE) - basic_machine=m68k-ericsson - os=-ose - ;; - fx2800) - basic_machine=i860-alliant - ;; - genix) - basic_machine=ns32k-ns - ;; - gmicro) - basic_machine=tron-gmicro - os=-sysv - ;; - go32) - basic_machine=i386-pc - os=-go32 - ;; - h3050r* | hiux*) - basic_machine=hppa1.1-hitachi - os=-hiuxwe2 - ;; - h8300hms) - basic_machine=h8300-hitachi - os=-hms - ;; - h8300xray) - basic_machine=h8300-hitachi - os=-xray - ;; - h8500hms) - basic_machine=h8500-hitachi - os=-hms - ;; - harris) - basic_machine=m88k-harris - os=-sysv3 - ;; - hp300-*) - basic_machine=m68k-hp - ;; - hp300bsd) - basic_machine=m68k-hp - os=-bsd - ;; - hp300hpux) - basic_machine=m68k-hp - os=-hpux - ;; - hp3k9[0-9][0-9] | hp9[0-9][0-9]) - basic_machine=hppa1.0-hp - ;; - 
hp9k2[0-9][0-9] | hp9k31[0-9]) - basic_machine=m68000-hp - ;; - hp9k3[2-9][0-9]) - basic_machine=m68k-hp - ;; - hp9k6[0-9][0-9] | hp6[0-9][0-9]) - basic_machine=hppa1.0-hp - ;; - hp9k7[0-79][0-9] | hp7[0-79][0-9]) - basic_machine=hppa1.1-hp - ;; - hp9k78[0-9] | hp78[0-9]) - # FIXME: really hppa2.0-hp - basic_machine=hppa1.1-hp - ;; - hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) - # FIXME: really hppa2.0-hp - basic_machine=hppa1.1-hp - ;; - hp9k8[0-9][13679] | hp8[0-9][13679]) - basic_machine=hppa1.1-hp - ;; - hp9k8[0-9][0-9] | hp8[0-9][0-9]) - basic_machine=hppa1.0-hp - ;; - hppa-next) - os=-nextstep3 - ;; - hppaosf) - basic_machine=hppa1.1-hp - os=-osf - ;; - hppro) - basic_machine=hppa1.1-hp - os=-proelf - ;; - i370-ibm* | ibm*) - basic_machine=i370-ibm - ;; -# I'm not sure what "Sysv32" means. Should this be sysv3.2? - i*86v32) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-sysv32 - ;; - i*86v4*) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-sysv4 - ;; - i*86v) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-sysv - ;; - i*86sol2) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-solaris2 - ;; - i386mach) - basic_machine=i386-mach - os=-mach - ;; - i386-vsta | vsta) - basic_machine=i386-unknown - os=-vsta - ;; - iris | iris4d) - basic_machine=mips-sgi - case $os in - -irix*) - ;; - *) - os=-irix4 - ;; - esac - ;; - isi68 | isi) - basic_machine=m68k-isi - os=-sysv - ;; - m68knommu) - basic_machine=m68k-unknown - os=-linux - ;; - m68knommu-*) - basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` - os=-linux - ;; - m88k-omron*) - basic_machine=m88k-omron - ;; - magnum | m3230) - basic_machine=mips-mips - os=-sysv - ;; - merlin) - basic_machine=ns32k-utek - os=-sysv - ;; - mingw32) - basic_machine=i386-pc - os=-mingw32 - ;; - mingw32ce) - basic_machine=arm-unknown - os=-mingw32ce - ;; - miniframe) - basic_machine=m68000-convergent - ;; - *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) - basic_machine=m68k-atari - os=-mint - ;; - mips3*-*) - basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` - ;; - mips3*) - basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown - ;; - monitor) - basic_machine=m68k-rom68k - os=-coff - ;; - morphos) - basic_machine=powerpc-unknown - os=-morphos - ;; - msdos) - basic_machine=i386-pc - os=-msdos - ;; - ms1-*) - basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` - ;; - mvs) - basic_machine=i370-ibm - os=-mvs - ;; - ncr3000) - basic_machine=i486-ncr - os=-sysv4 - ;; - netbsd386) - basic_machine=i386-unknown - os=-netbsd - ;; - netwinder) - basic_machine=armv4l-rebel - os=-linux - ;; - news | news700 | news800 | news900) - basic_machine=m68k-sony - os=-newsos - ;; - news1000) - basic_machine=m68030-sony - os=-newsos - ;; - news-3600 | risc-news) - basic_machine=mips-sony - os=-newsos - ;; - necv70) - basic_machine=v70-nec - os=-sysv - ;; - next | m*-next ) - basic_machine=m68k-next - case $os in - -nextstep* ) - ;; - -ns2*) - os=-nextstep2 - ;; - *) - os=-nextstep3 - ;; - esac - ;; - nh3000) - basic_machine=m68k-harris - os=-cxux - ;; - nh[45]000) - basic_machine=m88k-harris - os=-cxux - ;; - nindy960) - basic_machine=i960-intel - os=-nindy - ;; - mon960) - basic_machine=i960-intel - os=-mon960 - ;; - nonstopux) - basic_machine=mips-compaq - os=-nonstopux - ;; - np1) - basic_machine=np1-gould - ;; - nsr-tandem) - basic_machine=nsr-tandem - ;; - op50n-* | op60c-*) - basic_machine=hppa1.1-oki - os=-proelf - ;; - openrisc | openrisc-*) - 
basic_machine=or32-unknown - ;; - os400) - basic_machine=powerpc-ibm - os=-os400 - ;; - OSE68000 | ose68000) - basic_machine=m68000-ericsson - os=-ose - ;; - os68k) - basic_machine=m68k-none - os=-os68k - ;; - pa-hitachi) - basic_machine=hppa1.1-hitachi - os=-hiuxwe2 - ;; - paragon) - basic_machine=i860-intel - os=-osf - ;; - parisc) - basic_machine=hppa-unknown - os=-linux - ;; - parisc-*) - basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` - os=-linux - ;; - pbd) - basic_machine=sparc-tti - ;; - pbb) - basic_machine=m68k-tti - ;; - pc532 | pc532-*) - basic_machine=ns32k-pc532 - ;; - pc98) - basic_machine=i386-pc - ;; - pc98-*) - basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentium | p5 | k5 | k6 | nexgen | viac3) - basic_machine=i586-pc - ;; - pentiumpro | p6 | 6x86 | athlon | athlon_*) - basic_machine=i686-pc - ;; - pentiumii | pentium2 | pentiumiii | pentium3) - basic_machine=i686-pc - ;; - pentium4) - basic_machine=i786-pc - ;; - pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) - basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentiumpro-* | p6-* | 6x86-* | athlon-*) - basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) - basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentium4-*) - basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pn) - basic_machine=pn-gould - ;; - power) basic_machine=power-ibm - ;; - ppc) basic_machine=powerpc-unknown - ;; - ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ppcle | powerpclittle | ppc-le | powerpc-little) - basic_machine=powerpcle-unknown - ;; - ppcle-* | powerpclittle-*) - basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ppc64) basic_machine=powerpc64-unknown - ;; - ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ppc64le | powerpc64little | ppc64-le | powerpc64-little) - basic_machine=powerpc64le-unknown - ;; - ppc64le-* | powerpc64little-*) - basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ps2) - basic_machine=i386-ibm - ;; - pw32) - basic_machine=i586-unknown - os=-pw32 - ;; - rdos) - basic_machine=i386-pc - os=-rdos - ;; - rom68k) - basic_machine=m68k-rom68k - os=-coff - ;; - rm[46]00) - basic_machine=mips-siemens - ;; - rtpc | rtpc-*) - basic_machine=romp-ibm - ;; - s390 | s390-*) - basic_machine=s390-ibm - ;; - s390x | s390x-*) - basic_machine=s390x-ibm - ;; - sa29200) - basic_machine=a29k-amd - os=-udi - ;; - sb1) - basic_machine=mipsisa64sb1-unknown - ;; - sb1el) - basic_machine=mipsisa64sb1el-unknown - ;; - sde) - basic_machine=mipsisa32-sde - os=-elf - ;; - sei) - basic_machine=mips-sei - os=-seiux - ;; - sequent) - basic_machine=i386-sequent - ;; - sh) - basic_machine=sh-hitachi - os=-hms - ;; - sh5el) - basic_machine=sh5le-unknown - ;; - sh64) - basic_machine=sh64-unknown - ;; - sparclite-wrs | simso-wrs) - basic_machine=sparclite-wrs - os=-vxworks - ;; - sps7) - basic_machine=m68k-bull - os=-sysv2 - ;; - spur) - basic_machine=spur-unknown - ;; - st2000) - basic_machine=m68k-tandem - ;; - stratus) - basic_machine=i860-stratus - os=-sysv4 - ;; - sun2) - basic_machine=m68000-sun - ;; - sun2os3) - basic_machine=m68000-sun - os=-sunos3 - ;; - sun2os4) - basic_machine=m68000-sun - os=-sunos4 - ;; - sun3os3) - basic_machine=m68k-sun - os=-sunos3 - ;; - sun3os4) - basic_machine=m68k-sun - os=-sunos4 - ;; - sun4os3) - basic_machine=sparc-sun - os=-sunos3 - ;; - sun4os4) 
- basic_machine=sparc-sun - os=-sunos4 - ;; - sun4sol2) - basic_machine=sparc-sun - os=-solaris2 - ;; - sun3 | sun3-*) - basic_machine=m68k-sun - ;; - sun4) - basic_machine=sparc-sun - ;; - sun386 | sun386i | roadrunner) - basic_machine=i386-sun - ;; - sv1) - basic_machine=sv1-cray - os=-unicos - ;; - symmetry) - basic_machine=i386-sequent - os=-dynix - ;; - t3e) - basic_machine=alphaev5-cray - os=-unicos - ;; - t90) - basic_machine=t90-cray - os=-unicos - ;; - tic54x | c54x*) - basic_machine=tic54x-unknown - os=-coff - ;; - tic55x | c55x*) - basic_machine=tic55x-unknown - os=-coff - ;; - tic6x | c6x*) - basic_machine=tic6x-unknown - os=-coff - ;; - tile*) - basic_machine=tile-unknown - os=-linux-gnu - ;; - tx39) - basic_machine=mipstx39-unknown - ;; - tx39el) - basic_machine=mipstx39el-unknown - ;; - toad1) - basic_machine=pdp10-xkl - os=-tops20 - ;; - tower | tower-32) - basic_machine=m68k-ncr - ;; - tpf) - basic_machine=s390x-ibm - os=-tpf - ;; - udi29k) - basic_machine=a29k-amd - os=-udi - ;; - ultra3) - basic_machine=a29k-nyu - os=-sym1 - ;; - v810 | necv810) - basic_machine=v810-nec - os=-none - ;; - vaxv) - basic_machine=vax-dec - os=-sysv - ;; - vms) - basic_machine=vax-dec - os=-vms - ;; - vpp*|vx|vx-*) - basic_machine=f301-fujitsu - ;; - vxworks960) - basic_machine=i960-wrs - os=-vxworks - ;; - vxworks68) - basic_machine=m68k-wrs - os=-vxworks - ;; - vxworks29k) - basic_machine=a29k-wrs - os=-vxworks - ;; - w65*) - basic_machine=w65-wdc - os=-none - ;; - w89k-*) - basic_machine=hppa1.1-winbond - os=-proelf - ;; - xbox) - basic_machine=i686-pc - os=-mingw32 - ;; - xps | xps100) - basic_machine=xps100-honeywell - ;; - ymp) - basic_machine=ymp-cray - os=-unicos - ;; - z8k-*-coff) - basic_machine=z8k-unknown - os=-sim - ;; - none) - basic_machine=none-none - os=-none - ;; - -# Here we handle the default manufacturer of certain CPU types. It is in -# some cases the only manufacturer, in others, it is the most popular. - w89k) - basic_machine=hppa1.1-winbond - ;; - op50n) - basic_machine=hppa1.1-oki - ;; - op60c) - basic_machine=hppa1.1-oki - ;; - romp) - basic_machine=romp-ibm - ;; - mmix) - basic_machine=mmix-knuth - ;; - rs6000) - basic_machine=rs6000-ibm - ;; - vax) - basic_machine=vax-dec - ;; - pdp10) - # there are many clones, so DEC is not a safe bet - basic_machine=pdp10-unknown - ;; - pdp11) - basic_machine=pdp11-dec - ;; - we32k) - basic_machine=we32k-att - ;; - sh[1234] | sh[24]a | sh[34]eb | sh[1234]le | sh[23]ele) - basic_machine=sh-unknown - ;; - sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) - basic_machine=sparc-sun - ;; - cydra) - basic_machine=cydra-cydrome - ;; - orion) - basic_machine=orion-highlevel - ;; - orion105) - basic_machine=clipper-highlevel - ;; - mac | mpw | mac-mpw) - basic_machine=m68k-apple - ;; - pmac | pmac-mpw) - basic_machine=powerpc-apple - ;; - *-unknown) - # Make sure to match an already-canonicalized machine name. - ;; - *) - echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 - exit 1 - ;; -esac - -# Here we canonicalize certain aliases for manufacturers. -case $basic_machine in - *-digital*) - basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` - ;; - *-commodore*) - basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` - ;; - *) - ;; -esac - -# Decode manufacturer-specific aliases for certain operating systems. - -if [ x"$os" != x"" ] -then -case $os in - # First match some system type aliases - # that might get confused with valid system types. 
- # -solaris* is a basic system type, with this one exception. - -solaris1 | -solaris1.*) - os=`echo $os | sed -e 's|solaris1|sunos4|'` - ;; - -solaris) - os=-solaris2 - ;; - -svr4*) - os=-sysv4 - ;; - -unixware*) - os=-sysv4.2uw - ;; - -gnu/linux*) - os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` - ;; - # First accept the basic system types. - # The portable systems comes first. - # Each alternative MUST END IN A *, to match a version number. - # -sysv* is not here because it comes later, after sysvr4. - -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ - | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ - | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \ - | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ - | -aos* \ - | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ - | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ - | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ - | -openbsd* | -solidbsd* \ - | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ - | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ - | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ - | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ - | -chorusos* | -chorusrdb* \ - | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ - | -mingw32* | -linux-gnu* | -linux-newlib* | -linux-uclibc* \ - | -uxpv* | -beos* | -mpeix* | -udk* \ - | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ - | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ - | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ - | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ - | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ - | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ - | -skyos* | -haiku* | -rdos* | -toppers* | -drops*) - # Remember, each alternative MUST END IN *, to match a version number. - ;; - -qnx*) - case $basic_machine in - x86-* | i*86-*) - ;; - *) - os=-nto$os - ;; - esac - ;; - -nto-qnx*) - ;; - -nto*) - os=`echo $os | sed -e 's|nto|nto-qnx|'` - ;; - -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ - | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ - | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) - ;; - -mac*) - os=`echo $os | sed -e 's|mac|macos|'` - ;; - -linux-dietlibc) - os=-linux-dietlibc - ;; - -linux*) - os=`echo $os | sed -e 's|linux|linux-gnu|'` - ;; - -sunos5*) - os=`echo $os | sed -e 's|sunos5|solaris2|'` - ;; - -sunos6*) - os=`echo $os | sed -e 's|sunos6|solaris3|'` - ;; - -opened*) - os=-openedition - ;; - -os400*) - os=-os400 - ;; - -wince*) - os=-wince - ;; - -osfrose*) - os=-osfrose - ;; - -osf*) - os=-osf - ;; - -utek*) - os=-bsd - ;; - -dynix*) - os=-bsd - ;; - -acis*) - os=-aos - ;; - -atheos*) - os=-atheos - ;; - -syllable*) - os=-syllable - ;; - -386bsd) - os=-bsd - ;; - -ctix* | -uts*) - os=-sysv - ;; - -nova*) - os=-rtmk-nova - ;; - -ns2 ) - os=-nextstep2 - ;; - -nsk*) - os=-nsk - ;; - # Preserve the version number of sinix5. - -sinix5.*) - os=`echo $os | sed -e 's|sinix|sysv|'` - ;; - -sinix*) - os=-sysv4 - ;; - -tpf*) - os=-tpf - ;; - -triton*) - os=-sysv3 - ;; - -oss*) - os=-sysv3 - ;; - -svr4) - os=-sysv4 - ;; - -svr3) - os=-sysv3 - ;; - -sysvr4) - os=-sysv4 - ;; - # This must come after -sysvr4. 
- -sysv*) - ;; - -ose*) - os=-ose - ;; - -es1800*) - os=-ose - ;; - -xenix) - os=-xenix - ;; - -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) - os=-mint - ;; - -aros*) - os=-aros - ;; - -kaos*) - os=-kaos - ;; - -zvmoe) - os=-zvmoe - ;; - -none) - ;; - *) - # Get rid of the `-' at the beginning of $os. - os=`echo $os | sed 's/[^-]*-//'` - echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 - exit 1 - ;; -esac -else - -# Here we handle the default operating systems that come with various machines. -# The value should be what the vendor currently ships out the door with their -# machine or put another way, the most popular os provided with the machine. - -# Note that if you're going to try to match "-MANUFACTURER" here (say, -# "-sun"), then you have to tell the case statement up towards the top -# that MANUFACTURER isn't an operating system. Otherwise, code above -# will signal an error saying that MANUFACTURER isn't an operating -# system, and we'll never get to this point. - -case $basic_machine in - score-*) - os=-elf - ;; - spu-*) - os=-elf - ;; - *-acorn) - os=-riscix1.2 - ;; - arm*-rebel) - os=-linux - ;; - arm*-semi) - os=-aout - ;; - c4x-* | tic4x-*) - os=-coff - ;; - # This must come before the *-dec entry. - pdp10-*) - os=-tops20 - ;; - pdp11-*) - os=-none - ;; - *-dec | vax-*) - os=-ultrix4.2 - ;; - m68*-apollo) - os=-domain - ;; - i386-sun) - os=-sunos4.0.2 - ;; - m68000-sun) - os=-sunos3 - # This also exists in the configure program, but was not the - # default. - # os=-sunos4 - ;; - m68*-cisco) - os=-aout - ;; - mep-*) - os=-elf - ;; - mips*-cisco) - os=-elf - ;; - mips*-*) - os=-elf - ;; - or32-*) - os=-coff - ;; - *-tti) # must be before sparc entry or we get the wrong os. - os=-sysv3 - ;; - sparc-* | *-sun) - os=-sunos4.1.1 - ;; - *-be) - os=-beos - ;; - *-haiku) - os=-haiku - ;; - *-ibm) - os=-aix - ;; - *-knuth) - os=-mmixware - ;; - *-wec) - os=-proelf - ;; - *-winbond) - os=-proelf - ;; - *-oki) - os=-proelf - ;; - *-hp) - os=-hpux - ;; - *-hitachi) - os=-hiux - ;; - i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) - os=-sysv - ;; - *-cbm) - os=-amigaos - ;; - *-dg) - os=-dgux - ;; - *-dolphin) - os=-sysv3 - ;; - m68k-ccur) - os=-rtu - ;; - m88k-omron*) - os=-luna - ;; - *-next ) - os=-nextstep - ;; - *-sequent) - os=-ptx - ;; - *-crds) - os=-unos - ;; - *-ns) - os=-genix - ;; - i370-*) - os=-mvs - ;; - *-next) - os=-nextstep3 - ;; - *-gould) - os=-sysv - ;; - *-highlevel) - os=-bsd - ;; - *-encore) - os=-bsd - ;; - *-sgi) - os=-irix - ;; - *-siemens) - os=-sysv4 - ;; - *-masscomp) - os=-rtu - ;; - f30[01]-fujitsu | f700-fujitsu) - os=-uxpv - ;; - *-rom68k) - os=-coff - ;; - *-*bug) - os=-coff - ;; - *-apple) - os=-macos - ;; - *-atari*) - os=-mint - ;; - *) - os=-none - ;; -esac -fi - -# Here we handle the case where we know the os, and the CPU type, but not the -# manufacturer. We pick the logical manufacturer. 
-vendor=unknown -case $basic_machine in - *-unknown) - case $os in - -riscix*) - vendor=acorn - ;; - -sunos*) - vendor=sun - ;; - -aix*) - vendor=ibm - ;; - -beos*) - vendor=be - ;; - -hpux*) - vendor=hp - ;; - -mpeix*) - vendor=hp - ;; - -hiux*) - vendor=hitachi - ;; - -unos*) - vendor=crds - ;; - -dgux*) - vendor=dg - ;; - -luna*) - vendor=omron - ;; - -genix*) - vendor=ns - ;; - -mvs* | -opened*) - vendor=ibm - ;; - -os400*) - vendor=ibm - ;; - -ptx*) - vendor=sequent - ;; - -tpf*) - vendor=ibm - ;; - -vxsim* | -vxworks* | -windiss*) - vendor=wrs - ;; - -aux*) - vendor=apple - ;; - -hms*) - vendor=hitachi - ;; - -mpw* | -macos*) - vendor=apple - ;; - -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) - vendor=atari - ;; - -vos*) - vendor=stratus - ;; - esac - basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` - ;; -esac - -echo $basic_machine$os -exit - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "timestamp='" -# time-stamp-format: "%:y-%02m-%02d" -# time-stamp-end: "'" -# End: diff --git a/configure b/configure deleted file mode 100755 index f7b8197d86..0000000000 --- a/configure +++ /dev/null @@ -1,8048 +0,0 @@ -#! /bin/sh -# Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.68 for ccminer 2014.05.03. -# -# -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, -# 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software -# Foundation, Inc. -# -# -# This configure script is free software; the Free Software Foundation -# gives unlimited permission to copy, distribute and modify it. -## -------------------- ## -## M4sh Initialization. ## -## -------------------- ## - -# Be more Bourne compatible -DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : - emulate sh - NULLCMD=: - # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which - # is contrary to our usage. Disable this feature. - alias -g '${1+"$@"}'='"$@"' - setopt NO_GLOB_SUBST -else - case `(set -o) 2>/dev/null` in #( - *posix*) : - set -o posix ;; #( - *) : - ;; -esac -fi - - -as_nl=' -' -export as_nl -# Printing a long string crashes Solaris 7 /usr/bin/printf. -as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo -# Prefer a ksh shell builtin over an external printf program on Solaris, -# but without wasting forks for bash or zsh. -if test -z "$BASH_VERSION$ZSH_VERSION" \ - && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='print -r --' - as_echo_n='print -rn --' -elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='printf %s\n' - as_echo_n='printf %s' -else - if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then - as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' - as_echo_n='/usr/ucb/echo -n' - else - as_echo_body='eval expr "X$1" : "X\\(.*\\)"' - as_echo_n_body='eval - arg=$1; - case $arg in #( - *"$as_nl"*) - expr "X$arg" : "X\\(.*\\)$as_nl"; - arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; - esac; - expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" - ' - export as_echo_n_body - as_echo_n='sh -c $as_echo_n_body as_echo' - fi - export as_echo_body - as_echo='sh -c $as_echo_body as_echo' -fi - -# The user is always right. 
-if test "${PATH_SEPARATOR+set}" != set; then - PATH_SEPARATOR=: - (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { - (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || - PATH_SEPARATOR=';' - } -fi - - -# IFS -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent editors from complaining about space-tab. -# (If _AS_PATH_WALK were called with IFS unset, it would disable word -# splitting by setting IFS to empty value.) -IFS=" "" $as_nl" - -# Find who we are. Look in the path if we contain no directory separator. -as_myself= -case $0 in #(( - *[\\/]* ) as_myself=$0 ;; - *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break - done -IFS=$as_save_IFS - - ;; -esac -# We did not find ourselves, most probably we were run as `sh COMMAND' -# in which case we are not to be found in the path. -if test "x$as_myself" = x; then - as_myself=$0 -fi -if test ! -f "$as_myself"; then - $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 - exit 1 -fi - -# Unset variables that we do not need and which cause bugs (e.g. in -# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" -# suppresses any "Segmentation fault" message there. '((' could -# trigger a bug in pdksh 5.2.14. -for as_var in BASH_ENV ENV MAIL MAILPATH -do eval test x\${$as_var+set} = xset \ - && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : -done -PS1='$ ' -PS2='> ' -PS4='+ ' - -# NLS nuisances. -LC_ALL=C -export LC_ALL -LANGUAGE=C -export LANGUAGE - -# CDPATH. -(unset CDPATH) >/dev/null 2>&1 && unset CDPATH - -if test "x$CONFIG_SHELL" = x; then - as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : - emulate sh - NULLCMD=: - # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which - # is contrary to our usage. Disable this feature. - alias -g '\${1+\"\$@\"}'='\"\$@\"' - setopt NO_GLOB_SUBST -else - case \`(set -o) 2>/dev/null\` in #( - *posix*) : - set -o posix ;; #( - *) : - ;; -esac -fi -" - as_required="as_fn_return () { (exit \$1); } -as_fn_success () { as_fn_return 0; } -as_fn_failure () { as_fn_return 1; } -as_fn_ret_success () { return 0; } -as_fn_ret_failure () { return 1; } - -exitcode=0 -as_fn_success || { exitcode=1; echo as_fn_success failed.; } -as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } -as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } -as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } -if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : - -else - exitcode=1; echo positional parameters were not saved. 
-fi -test x\$exitcode = x0 || exit 1" - as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO - as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO - eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && - test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1 -test \$(( 1 + 1 )) = 2 || exit 1" - if (eval "$as_required") 2>/dev/null; then : - as_have_required=yes -else - as_have_required=no -fi - if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : - -else - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -as_found=false -for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - as_found=: - case $as_dir in #( - /*) - for as_base in sh bash ksh sh5; do - # Try only shells that exist, to save several forks. - as_shell=$as_dir/$as_base - if { test -f "$as_shell" || test -f "$as_shell.exe"; } && - { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : - CONFIG_SHELL=$as_shell as_have_required=yes - if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : - break 2 -fi -fi - done;; - esac - as_found=false -done -$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && - { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : - CONFIG_SHELL=$SHELL as_have_required=yes -fi; } -IFS=$as_save_IFS - - - if test "x$CONFIG_SHELL" != x; then : - # We cannot yet assume a decent shell, so we have to provide a - # neutralization value for shells without unset; and this also - # works around shells that cannot unset nonexistent variables. - # Preserve -v and -x to the replacement shell. - BASH_ENV=/dev/null - ENV=/dev/null - (unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV - export CONFIG_SHELL - case $- in # (((( - *v*x* | *x*v* ) as_opts=-vx ;; - *v* ) as_opts=-v ;; - *x* ) as_opts=-x ;; - * ) as_opts= ;; - esac - exec "$CONFIG_SHELL" $as_opts "$as_myself" ${1+"$@"} -fi - - if test x$as_have_required = xno; then : - $as_echo "$0: This script requires a shell more modern than all" - $as_echo "$0: the shells that I found on your system." - if test x${ZSH_VERSION+set} = xset ; then - $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" - $as_echo "$0: be upgraded to zsh 4.3.4 or later." - else - $as_echo "$0: Please tell bug-autoconf@gnu.org about your system, -$0: including any error possibly output before this -$0: message. Then install a modern shell, or manually run -$0: the script under such a shell if you do have one." - fi - exit 1 -fi -fi -fi -SHELL=${CONFIG_SHELL-/bin/sh} -export SHELL -# Unset more variables known to interfere with behavior of common tools. -CLICOLOR_FORCE= GREP_OPTIONS= -unset CLICOLOR_FORCE GREP_OPTIONS - -## --------------------- ## -## M4sh Shell Functions. ## -## --------------------- ## -# as_fn_unset VAR -# --------------- -# Portably unset VAR. -as_fn_unset () -{ - { eval $1=; unset $1;} -} -as_unset=as_fn_unset - -# as_fn_set_status STATUS -# ----------------------- -# Set $? to STATUS, without forking. -as_fn_set_status () -{ - return $1 -} # as_fn_set_status - -# as_fn_exit STATUS -# ----------------- -# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. 
-as_fn_exit () -{ - set +e - as_fn_set_status $1 - exit $1 -} # as_fn_exit - -# as_fn_mkdir_p -# ------------- -# Create "$as_dir" as a directory, including parents if necessary. -as_fn_mkdir_p () -{ - - case $as_dir in #( - -*) as_dir=./$as_dir;; - esac - test -d "$as_dir" || eval $as_mkdir_p || { - as_dirs= - while :; do - case $as_dir in #( - *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( - *) as_qdir=$as_dir;; - esac - as_dirs="'$as_qdir' $as_dirs" - as_dir=`$as_dirname -- "$as_dir" || -$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$as_dir" : 'X\(//\)[^/]' \| \ - X"$as_dir" : 'X\(//\)$' \| \ - X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_dir" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - test -d "$as_dir" && break - done - test -z "$as_dirs" || eval "mkdir $as_dirs" - } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" - - -} # as_fn_mkdir_p -# as_fn_append VAR VALUE -# ---------------------- -# Append the text in VALUE to the end of the definition contained in VAR. Take -# advantage of any shell optimizations that allow amortized linear growth over -# repeated appends, instead of the typical quadratic growth present in naive -# implementations. -if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : - eval 'as_fn_append () - { - eval $1+=\$2 - }' -else - as_fn_append () - { - eval $1=\$$1\$2 - } -fi # as_fn_append - -# as_fn_arith ARG... -# ------------------ -# Perform arithmetic evaluation on the ARGs, and store the result in the -# global $as_val. Take advantage of shells that can avoid forks. The arguments -# must be portable across $(()) and expr. -if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : - eval 'as_fn_arith () - { - as_val=$(( $* )) - }' -else - as_fn_arith () - { - as_val=`expr "$@" || test $? -eq 1` - } -fi # as_fn_arith - - -# as_fn_error STATUS ERROR [LINENO LOG_FD] -# ---------------------------------------- -# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are -# provided, also output the error to LOG_FD, referencing LINENO. Then exit the -# script with STATUS, using 1 if that was 0. -as_fn_error () -{ - as_status=$1; test $as_status -eq 0 && as_status=1 - if test "$4"; then - as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 - fi - $as_echo "$as_me: error: $2" >&2 - as_fn_exit $as_status -} # as_fn_error - -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then - as_basename=basename -else - as_basename=false -fi - -if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then - as_dirname=dirname -else - as_dirname=false -fi - -as_me=`$as_basename -- "$0" || -$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ - X"$0" : 'X\(//\)$' \| \ - X"$0" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X/"$0" | - sed '/^.*\/\([^/][^/]*\)\/*$/{ - s//\1/ - q - } - /^X\/\(\/\/\)$/{ - s//\1/ - q - } - /^X\/\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - -# Avoid depending upon Character Ranges. 
-as_cr_letters='abcdefghijklmnopqrstuvwxyz' -as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' -as_cr_Letters=$as_cr_letters$as_cr_LETTERS -as_cr_digits='0123456789' -as_cr_alnum=$as_cr_Letters$as_cr_digits - - - as_lineno_1=$LINENO as_lineno_1a=$LINENO - as_lineno_2=$LINENO as_lineno_2a=$LINENO - eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" && - test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || { - # Blame Lee E. McMahon (1931-1989) for sed's syntax. :-) - sed -n ' - p - /[$]LINENO/= - ' <$as_myself | - sed ' - s/[$]LINENO.*/&-/ - t lineno - b - :lineno - N - :loop - s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ - t loop - s/-\n.*// - ' >$as_me.lineno && - chmod +x "$as_me.lineno" || - { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } - - # Don't try to exec as it changes $[0], causing all sort of problems - # (the dirname of $[0] is not the place where we might find the - # original and so on. Autoconf is especially sensitive to this). - . "./$as_me.lineno" - # Exit status is that of the last command. - exit -} - -ECHO_C= ECHO_N= ECHO_T= -case `echo -n x` in #((((( --n*) - case `echo 'xy\c'` in - *c*) ECHO_T=' ';; # ECHO_T is single tab character. - xy) ECHO_C='\c';; - *) echo `echo ksh88 bug on AIX 6.1` > /dev/null - ECHO_T=' ';; - esac;; -*) - ECHO_N='-n';; -esac - -rm -f conf$$ conf$$.exe conf$$.file -if test -d conf$$.dir; then - rm -f conf$$.dir/conf$$.file -else - rm -f conf$$.dir - mkdir conf$$.dir 2>/dev/null -fi -if (echo >conf$$.file) 2>/dev/null; then - if ln -s conf$$.file conf$$ 2>/dev/null; then - as_ln_s='ln -s' - # ... but there are two gotchas: - # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. - # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. - # In both cases, we have to default to `cp -p'. - ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || - as_ln_s='cp -p' - elif ln conf$$.file conf$$ 2>/dev/null; then - as_ln_s=ln - else - as_ln_s='cp -p' - fi -else - as_ln_s='cp -p' -fi -rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file -rmdir conf$$.dir 2>/dev/null - -if mkdir -p . 2>/dev/null; then - as_mkdir_p='mkdir -p "$as_dir"' -else - test -d ./-p && rmdir ./-p - as_mkdir_p=false -fi - -if test -x / >/dev/null 2>&1; then - as_test_x='test -x' -else - if ls -dL / >/dev/null 2>&1; then - as_ls_L_option=L - else - as_ls_L_option= - fi - as_test_x=' - eval sh -c '\'' - if test -d "$1"; then - test -d "$1/."; - else - case $1 in #( - -*)set "./$1";; - esac; - case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #(( - ???[sx]*):;;*)false;;esac;fi - '\'' sh - ' -fi -as_executable_p=$as_test_x - -# Sed expression to map a string onto a valid CPP name. -as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" - -# Sed expression to map a string onto a valid variable name. -as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" - - -test -n "$DJDIR" || exec 7<&0 &1 - -# Name of the host. -# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status, -# so uname gets run too. -ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` - -# -# Initializations. -# -ac_default_prefix=/usr/local -ac_clean_files= -ac_config_libobj_dir=. -LIBOBJS= -cross_compiling=no -subdirs= -MFLAGS= -MAKEFLAGS= - -# Identity of this package. 
-PACKAGE_NAME='ccminer' -PACKAGE_TARNAME='ccminer' -PACKAGE_VERSION='2014.05.03' -PACKAGE_STRING='ccminer 2014.05.03' -PACKAGE_BUGREPORT='' -PACKAGE_URL='' - -ac_unique_file="cpu-miner.c" -# Factoring default headers for most tests. -ac_includes_default="\ -#include -#ifdef HAVE_SYS_TYPES_H -# include -#endif -#ifdef HAVE_SYS_STAT_H -# include -#endif -#ifdef STDC_HEADERS -# include -# include -#else -# ifdef HAVE_STDLIB_H -# include -# endif -#endif -#ifdef HAVE_STRING_H -# if !defined STDC_HEADERS && defined HAVE_MEMORY_H -# include -# endif -# include -#endif -#ifdef HAVE_STRINGS_H -# include -#endif -#ifdef HAVE_INTTYPES_H -# include -#endif -#ifdef HAVE_STDINT_H -# include -#endif -#ifdef HAVE_UNISTD_H -# include -#endif" - -ac_subst_vars='am__EXEEXT_FALSE -am__EXEEXT_TRUE -LTLIBOBJS -LIBOBJS -NVCC -CUDA_LDFLAGS -CUDA_LIBS -CUDA_CFLAGS -WS2_LIBS -PTHREAD_LIBS -PTHREAD_FLAGS -JANSSON_LIBS -LIBCURL -LIBCURL_CPPFLAGS -_libcurl_config -ARCH_x86_64_FALSE -ARCH_x86_64_TRUE -ARCH_x86_FALSE -ARCH_x86_TRUE -HAVE_WINDOWS_FALSE -HAVE_WINDOWS_TRUE -WANT_JANSSON_FALSE -WANT_JANSSON_TRUE -ALLOCA -OPENMP_CFLAGS -am__fastdepCXX_FALSE -am__fastdepCXX_TRUE -CXXDEPMODE -ac_ct_CXX -CXXFLAGS -CXX -RANLIB -am__fastdepCCAS_FALSE -am__fastdepCCAS_TRUE -CCASDEPMODE -CCASFLAGS -CCAS -EGREP -GREP -CPP -am__fastdepCC_FALSE -am__fastdepCC_TRUE -CCDEPMODE -am__nodep -AMDEPBACKSLASH -AMDEP_FALSE -AMDEP_TRUE -am__quote -am__include -DEPDIR -OBJEXT -EXEEXT -ac_ct_CC -CPPFLAGS -LDFLAGS -CFLAGS -CC -MAINT -MAINTAINER_MODE_FALSE -MAINTAINER_MODE_TRUE -am__untar -am__tar -AMTAR -am__leading_dot -SET_MAKE -AWK -mkdir_p -MKDIR_P -INSTALL_STRIP_PROGRAM -STRIP -install_sh -MAKEINFO -AUTOHEADER -AUTOMAKE -AUTOCONF -ACLOCAL -VERSION -PACKAGE -CYGPATH_W -am__isrc -INSTALL_DATA -INSTALL_SCRIPT -INSTALL_PROGRAM -target_os -target_vendor -target_cpu -target -host_os -host_vendor -host_cpu -host -build_os -build_vendor -build_cpu -build -target_alias -host_alias -build_alias -LIBS -ECHO_T -ECHO_N -ECHO_C -DEFS -mandir -localedir -libdir -psdir -pdfdir -dvidir -htmldir -infodir -docdir -oldincludedir -includedir -localstatedir -sharedstatedir -sysconfdir -datadir -datarootdir -libexecdir -sbindir -bindir -program_transform_name -prefix -exec_prefix -PACKAGE_URL -PACKAGE_BUGREPORT -PACKAGE_STRING -PACKAGE_VERSION -PACKAGE_TARNAME -PACKAGE_NAME -PATH_SEPARATOR -SHELL' -ac_subst_files='' -ac_user_opts=' -enable_option_checking -enable_maintainer_mode -enable_dependency_tracking -enable_openmp -with_libcurl -with_cuda -' - ac_precious_vars='build_alias -host_alias -target_alias -CC -CFLAGS -LDFLAGS -LIBS -CPPFLAGS -CPP -CCAS -CCASFLAGS -CXX -CXXFLAGS -CCC' - - -# Initialize some variables set by options. -ac_init_help= -ac_init_version=false -ac_unrecognized_opts= -ac_unrecognized_sep= -# The variables have the same names as the options, with -# dashes changed to underlines. -cache_file=/dev/null -exec_prefix=NONE -no_create= -no_recursion= -prefix=NONE -program_prefix=NONE -program_suffix=NONE -program_transform_name=s,x,x, -silent= -site= -srcdir= -verbose= -x_includes=NONE -x_libraries=NONE - -# Installation directory options. -# These are left unexpanded so users can "make install exec_prefix=/foo" -# and all the variables that are supposed to be based on exec_prefix -# by default will actually change. -# Use braces instead of parens because sh, perl, etc. also accept them. -# (The list follows the same order as the GNU Coding Standards.) 
-bindir='${exec_prefix}/bin' -sbindir='${exec_prefix}/sbin' -libexecdir='${exec_prefix}/libexec' -datarootdir='${prefix}/share' -datadir='${datarootdir}' -sysconfdir='${prefix}/etc' -sharedstatedir='${prefix}/com' -localstatedir='${prefix}/var' -includedir='${prefix}/include' -oldincludedir='/usr/include' -docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' -infodir='${datarootdir}/info' -htmldir='${docdir}' -dvidir='${docdir}' -pdfdir='${docdir}' -psdir='${docdir}' -libdir='${exec_prefix}/lib' -localedir='${datarootdir}/locale' -mandir='${datarootdir}/man' - -ac_prev= -ac_dashdash= -for ac_option -do - # If the previous option needs an argument, assign it. - if test -n "$ac_prev"; then - eval $ac_prev=\$ac_option - ac_prev= - continue - fi - - case $ac_option in - *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; - *=) ac_optarg= ;; - *) ac_optarg=yes ;; - esac - - # Accept the important Cygnus configure options, so we can diagnose typos. - - case $ac_dashdash$ac_option in - --) - ac_dashdash=yes ;; - - -bindir | --bindir | --bindi | --bind | --bin | --bi) - ac_prev=bindir ;; - -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) - bindir=$ac_optarg ;; - - -build | --build | --buil | --bui | --bu) - ac_prev=build_alias ;; - -build=* | --build=* | --buil=* | --bui=* | --bu=*) - build_alias=$ac_optarg ;; - - -cache-file | --cache-file | --cache-fil | --cache-fi \ - | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) - ac_prev=cache_file ;; - -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ - | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) - cache_file=$ac_optarg ;; - - --config-cache | -C) - cache_file=config.cache ;; - - -datadir | --datadir | --datadi | --datad) - ac_prev=datadir ;; - -datadir=* | --datadir=* | --datadi=* | --datad=*) - datadir=$ac_optarg ;; - - -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ - | --dataroo | --dataro | --datar) - ac_prev=datarootdir ;; - -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \ - | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) - datarootdir=$ac_optarg ;; - - -disable-* | --disable-*) - ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` - # Reject names that are not valid shell variable names. - expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid feature name: $ac_useropt" - ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` - case $ac_user_opts in - *" -"enable_$ac_useropt" -"*) ;; - *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig" - ac_unrecognized_sep=', ';; - esac - eval enable_$ac_useropt=no ;; - - -docdir | --docdir | --docdi | --doc | --do) - ac_prev=docdir ;; - -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) - docdir=$ac_optarg ;; - - -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) - ac_prev=dvidir ;; - -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) - dvidir=$ac_optarg ;; - - -enable-* | --enable-*) - ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` - # Reject names that are not valid shell variable names. - expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? 
"invalid feature name: $ac_useropt" - ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` - case $ac_user_opts in - *" -"enable_$ac_useropt" -"*) ;; - *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" - ac_unrecognized_sep=', ';; - esac - eval enable_$ac_useropt=\$ac_optarg ;; - - -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ - | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ - | --exec | --exe | --ex) - ac_prev=exec_prefix ;; - -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ - | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ - | --exec=* | --exe=* | --ex=*) - exec_prefix=$ac_optarg ;; - - -gas | --gas | --ga | --g) - # Obsolete; use --with-gas. - with_gas=yes ;; - - -help | --help | --hel | --he | -h) - ac_init_help=long ;; - -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) - ac_init_help=recursive ;; - -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) - ac_init_help=short ;; - - -host | --host | --hos | --ho) - ac_prev=host_alias ;; - -host=* | --host=* | --hos=* | --ho=*) - host_alias=$ac_optarg ;; - - -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) - ac_prev=htmldir ;; - -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ - | --ht=*) - htmldir=$ac_optarg ;; - - -includedir | --includedir | --includedi | --included | --include \ - | --includ | --inclu | --incl | --inc) - ac_prev=includedir ;; - -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ - | --includ=* | --inclu=* | --incl=* | --inc=*) - includedir=$ac_optarg ;; - - -infodir | --infodir | --infodi | --infod | --info | --inf) - ac_prev=infodir ;; - -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) - infodir=$ac_optarg ;; - - -libdir | --libdir | --libdi | --libd) - ac_prev=libdir ;; - -libdir=* | --libdir=* | --libdi=* | --libd=*) - libdir=$ac_optarg ;; - - -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ - | --libexe | --libex | --libe) - ac_prev=libexecdir ;; - -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ - | --libexe=* | --libex=* | --libe=*) - libexecdir=$ac_optarg ;; - - -localedir | --localedir | --localedi | --localed | --locale) - ac_prev=localedir ;; - -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) - localedir=$ac_optarg ;; - - -localstatedir | --localstatedir | --localstatedi | --localstated \ - | --localstate | --localstat | --localsta | --localst | --locals) - ac_prev=localstatedir ;; - -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ - | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) - localstatedir=$ac_optarg ;; - - -mandir | --mandir | --mandi | --mand | --man | --ma | --m) - ac_prev=mandir ;; - -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) - mandir=$ac_optarg ;; - - -nfp | --nfp | --nf) - # Obsolete; use --without-fp. 
- with_fp=no ;; - - -no-create | --no-create | --no-creat | --no-crea | --no-cre \ - | --no-cr | --no-c | -n) - no_create=yes ;; - - -no-recursion | --no-recursion | --no-recursio | --no-recursi \ - | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) - no_recursion=yes ;; - - -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ - | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ - | --oldin | --oldi | --old | --ol | --o) - ac_prev=oldincludedir ;; - -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ - | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ - | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) - oldincludedir=$ac_optarg ;; - - -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) - ac_prev=prefix ;; - -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) - prefix=$ac_optarg ;; - - -program-prefix | --program-prefix | --program-prefi | --program-pref \ - | --program-pre | --program-pr | --program-p) - ac_prev=program_prefix ;; - -program-prefix=* | --program-prefix=* | --program-prefi=* \ - | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) - program_prefix=$ac_optarg ;; - - -program-suffix | --program-suffix | --program-suffi | --program-suff \ - | --program-suf | --program-su | --program-s) - ac_prev=program_suffix ;; - -program-suffix=* | --program-suffix=* | --program-suffi=* \ - | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) - program_suffix=$ac_optarg ;; - - -program-transform-name | --program-transform-name \ - | --program-transform-nam | --program-transform-na \ - | --program-transform-n | --program-transform- \ - | --program-transform | --program-transfor \ - | --program-transfo | --program-transf \ - | --program-trans | --program-tran \ - | --progr-tra | --program-tr | --program-t) - ac_prev=program_transform_name ;; - -program-transform-name=* | --program-transform-name=* \ - | --program-transform-nam=* | --program-transform-na=* \ - | --program-transform-n=* | --program-transform-=* \ - | --program-transform=* | --program-transfor=* \ - | --program-transfo=* | --program-transf=* \ - | --program-trans=* | --program-tran=* \ - | --progr-tra=* | --program-tr=* | --program-t=*) - program_transform_name=$ac_optarg ;; - - -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) - ac_prev=pdfdir ;; - -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) - pdfdir=$ac_optarg ;; - - -psdir | --psdir | --psdi | --psd | --ps) - ac_prev=psdir ;; - -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) - psdir=$ac_optarg ;; - - -q | -quiet | --quiet | --quie | --qui | --qu | --q \ - | -silent | --silent | --silen | --sile | --sil) - silent=yes ;; - - -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) - ac_prev=sbindir ;; - -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ - | --sbi=* | --sb=*) - sbindir=$ac_optarg ;; - - -sharedstatedir | --sharedstatedir | --sharedstatedi \ - | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ - | --sharedst | --shareds | --shared | --share | --shar \ - | --sha | --sh) - ac_prev=sharedstatedir ;; - -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ - | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ - | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ - | --sha=* | --sh=*) - sharedstatedir=$ac_optarg ;; - - -site | --site | --sit) - ac_prev=site ;; - -site=* | --site=* | --sit=*) - 
site=$ac_optarg ;; - - -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) - ac_prev=srcdir ;; - -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) - srcdir=$ac_optarg ;; - - -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ - | --syscon | --sysco | --sysc | --sys | --sy) - ac_prev=sysconfdir ;; - -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ - | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) - sysconfdir=$ac_optarg ;; - - -target | --target | --targe | --targ | --tar | --ta | --t) - ac_prev=target_alias ;; - -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) - target_alias=$ac_optarg ;; - - -v | -verbose | --verbose | --verbos | --verbo | --verb) - verbose=yes ;; - - -version | --version | --versio | --versi | --vers | -V) - ac_init_version=: ;; - - -with-* | --with-*) - ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` - # Reject names that are not valid shell variable names. - expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid package name: $ac_useropt" - ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` - case $ac_user_opts in - *" -"with_$ac_useropt" -"*) ;; - *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" - ac_unrecognized_sep=', ';; - esac - eval with_$ac_useropt=\$ac_optarg ;; - - -without-* | --without-*) - ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` - # Reject names that are not valid shell variable names. - expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid package name: $ac_useropt" - ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` - case $ac_user_opts in - *" -"with_$ac_useropt" -"*) ;; - *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig" - ac_unrecognized_sep=', ';; - esac - eval with_$ac_useropt=no ;; - - --x) - # Obsolete; use --with-x. - with_x=yes ;; - - -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ - | --x-incl | --x-inc | --x-in | --x-i) - ac_prev=x_includes ;; - -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ - | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) - x_includes=$ac_optarg ;; - - -x-libraries | --x-libraries | --x-librarie | --x-librari \ - | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) - ac_prev=x_libraries ;; - -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ - | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) - x_libraries=$ac_optarg ;; - - -*) as_fn_error $? "unrecognized option: \`$ac_option' -Try \`$0 --help' for more information" - ;; - - *=*) - ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` - # Reject names that are not valid shell variable names. - case $ac_envvar in #( - '' | [0-9]* | *[!_$as_cr_alnum]* ) - as_fn_error $? "invalid variable name: \`$ac_envvar'" ;; - esac - eval $ac_envvar=\$ac_optarg - export $ac_envvar ;; - - *) - # FIXME: should be removed in autoconf 3.0. - $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 - expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && - $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 - : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" - ;; - - esac -done - -if test -n "$ac_prev"; then - ac_option=--`echo $ac_prev | sed 's/_/-/g'` - as_fn_error $? 
"missing argument to $ac_option" -fi - -if test -n "$ac_unrecognized_opts"; then - case $enable_option_checking in - no) ;; - fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; - *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; - esac -fi - -# Check all directory arguments for consistency. -for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ - datadir sysconfdir sharedstatedir localstatedir includedir \ - oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ - libdir localedir mandir -do - eval ac_val=\$$ac_var - # Remove trailing slashes. - case $ac_val in - */ ) - ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'` - eval $ac_var=\$ac_val;; - esac - # Be sure to have absolute directory names. - case $ac_val in - [\\/$]* | ?:[\\/]* ) continue;; - NONE | '' ) case $ac_var in *prefix ) continue;; esac;; - esac - as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val" -done - -# There might be people who depend on the old broken behavior: `$host' -# used to hold the argument of --host etc. -# FIXME: To remove some day. -build=$build_alias -host=$host_alias -target=$target_alias - -# FIXME: To remove some day. -if test "x$host_alias" != x; then - if test "x$build_alias" = x; then - cross_compiling=maybe - $as_echo "$as_me: WARNING: if you wanted to set the --build type, don't use --host. - If a cross compiler is detected then cross compile mode will be used" >&2 - elif test "x$build_alias" != "x$host_alias"; then - cross_compiling=yes - fi -fi - -ac_tool_prefix= -test -n "$host_alias" && ac_tool_prefix=$host_alias- - -test "$silent" = yes && exec 6>/dev/null - - -ac_pwd=`pwd` && test -n "$ac_pwd" && -ac_ls_di=`ls -di .` && -ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || - as_fn_error $? "working directory cannot be determined" -test "X$ac_ls_di" = "X$ac_pwd_ls_di" || - as_fn_error $? "pwd does not report name of working directory" - - -# Find the source files, if location was not specified. -if test -z "$srcdir"; then - ac_srcdir_defaulted=yes - # Try the directory containing this script, then the parent directory. - ac_confdir=`$as_dirname -- "$as_myself" || -$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$as_myself" : 'X\(//\)[^/]' \| \ - X"$as_myself" : 'X\(//\)$' \| \ - X"$as_myself" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_myself" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - srcdir=$ac_confdir - if test ! -r "$srcdir/$ac_unique_file"; then - srcdir=.. - fi -else - ac_srcdir_defaulted=no -fi -if test ! -r "$srcdir/$ac_unique_file"; then - test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." - as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir" -fi -ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" -ac_abs_confdir=`( - cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg" - pwd)` -# When building in place, set srcdir=. -if test "$ac_abs_confdir" = "$ac_pwd"; then - srcdir=. -fi -# Remove unnecessary trailing slashes from srcdir. -# Double slashes in file names in object file debugging info -# mess up M-x gdb in Emacs. 
-case $srcdir in -*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; -esac -for ac_var in $ac_precious_vars; do - eval ac_env_${ac_var}_set=\${${ac_var}+set} - eval ac_env_${ac_var}_value=\$${ac_var} - eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} - eval ac_cv_env_${ac_var}_value=\$${ac_var} -done - -# -# Report the --help message. -# -if test "$ac_init_help" = "long"; then - # Omit some internal or obsolete options to make the list less imposing. - # This message is too long to be a string in the A/UX 3.1 sh. - cat <<_ACEOF -\`configure' configures ccminer 2014.05.03 to adapt to many kinds of systems. - -Usage: $0 [OPTION]... [VAR=VALUE]... - -To assign environment variables (e.g., CC, CFLAGS...), specify them as -VAR=VALUE. See below for descriptions of some of the useful variables. - -Defaults for the options are specified in brackets. - -Configuration: - -h, --help display this help and exit - --help=short display options specific to this package - --help=recursive display the short help of all the included packages - -V, --version display version information and exit - -q, --quiet, --silent do not print \`checking ...' messages - --cache-file=FILE cache test results in FILE [disabled] - -C, --config-cache alias for \`--cache-file=config.cache' - -n, --no-create do not create output files - --srcdir=DIR find the sources in DIR [configure dir or \`..'] - -Installation directories: - --prefix=PREFIX install architecture-independent files in PREFIX - [$ac_default_prefix] - --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX - [PREFIX] - -By default, \`make install' will install all the files in -\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify -an installation prefix other than \`$ac_default_prefix' using \`--prefix', -for instance \`--prefix=\$HOME'. - -For better control, use the options below. 
- -Fine tuning of the installation directories: - --bindir=DIR user executables [EPREFIX/bin] - --sbindir=DIR system admin executables [EPREFIX/sbin] - --libexecdir=DIR program executables [EPREFIX/libexec] - --sysconfdir=DIR read-only single-machine data [PREFIX/etc] - --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] - --localstatedir=DIR modifiable single-machine data [PREFIX/var] - --libdir=DIR object code libraries [EPREFIX/lib] - --includedir=DIR C header files [PREFIX/include] - --oldincludedir=DIR C header files for non-gcc [/usr/include] - --datarootdir=DIR read-only arch.-independent data root [PREFIX/share] - --datadir=DIR read-only architecture-independent data [DATAROOTDIR] - --infodir=DIR info documentation [DATAROOTDIR/info] - --localedir=DIR locale-dependent data [DATAROOTDIR/locale] - --mandir=DIR man documentation [DATAROOTDIR/man] - --docdir=DIR documentation root [DATAROOTDIR/doc/ccminer] - --htmldir=DIR html documentation [DOCDIR] - --dvidir=DIR dvi documentation [DOCDIR] - --pdfdir=DIR pdf documentation [DOCDIR] - --psdir=DIR ps documentation [DOCDIR] -_ACEOF - - cat <<\_ACEOF - -Program names: - --program-prefix=PREFIX prepend PREFIX to installed program names - --program-suffix=SUFFIX append SUFFIX to installed program names - --program-transform-name=PROGRAM run sed PROGRAM on installed program names - -System types: - --build=BUILD configure for building on BUILD [guessed] - --host=HOST cross-compile to build programs to run on HOST [BUILD] - --target=TARGET configure for building compilers for TARGET [HOST] -_ACEOF -fi - -if test -n "$ac_init_help"; then - case $ac_init_help in - short | recursive ) echo "Configuration of ccminer 2014.05.03:";; - esac - cat <<\_ACEOF - -Optional Features: - --disable-option-checking ignore unrecognized --enable/--with options - --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) - --enable-FEATURE[=ARG] include FEATURE [ARG=yes] - --enable-maintainer-mode enable make rules and dependencies not useful - (and sometimes confusing) to the casual installer - --disable-dependency-tracking speeds up one-time build - --enable-dependency-tracking do not reject slow dependency extractors - --disable-openmp do not use OpenMP - -Optional Packages: - --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] - --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) - --with-libcurl=PREFIX look for the curl library in PREFIX/lib and headers - in PREFIX/include - --with-cuda=PATH prefix where cuda is installed default=/usr/local/cuda - -Some influential environment variables: - CC C compiler command - CFLAGS C compiler flags - LDFLAGS linker flags, e.g. -L if you have libraries in a - nonstandard directory - LIBS libraries to pass to the linker, e.g. -l - CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I if - you have headers in a nonstandard directory - CPP C preprocessor - CCAS assembler compiler command (defaults to CC) - CCASFLAGS assembler compiler flags (defaults to CFLAGS) - CXX C++ compiler command - CXXFLAGS C++ compiler flags - -Use these variables to override the choices made by `configure' or to help -it to find libraries and programs with nonstandard names/locations. - -Report bugs to the package provider. -_ACEOF -ac_status=$? -fi - -if test "$ac_init_help" = "recursive"; then - # If there are subdirs, report their specific --help. - for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue - test -d "$ac_dir" || - { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. 
&& test -d "$ac_dir"; } || - continue - ac_builddir=. - -case "$ac_dir" in -.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; -*) - ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` - # A ".." for each directory in $ac_dir_suffix. - ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` - case $ac_top_builddir_sub in - "") ac_top_builddir_sub=. ac_top_build_prefix= ;; - *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; - esac ;; -esac -ac_abs_top_builddir=$ac_pwd -ac_abs_builddir=$ac_pwd$ac_dir_suffix -# for backward compatibility: -ac_top_builddir=$ac_top_build_prefix - -case $srcdir in - .) # We are building in place. - ac_srcdir=. - ac_top_srcdir=$ac_top_builddir_sub - ac_abs_top_srcdir=$ac_pwd ;; - [\\/]* | ?:[\\/]* ) # Absolute name. - ac_srcdir=$srcdir$ac_dir_suffix; - ac_top_srcdir=$srcdir - ac_abs_top_srcdir=$srcdir ;; - *) # Relative name. - ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix - ac_top_srcdir=$ac_top_build_prefix$srcdir - ac_abs_top_srcdir=$ac_pwd/$srcdir ;; -esac -ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix - - cd "$ac_dir" || { ac_status=$?; continue; } - # Check for guested configure. - if test -f "$ac_srcdir/configure.gnu"; then - echo && - $SHELL "$ac_srcdir/configure.gnu" --help=recursive - elif test -f "$ac_srcdir/configure"; then - echo && - $SHELL "$ac_srcdir/configure" --help=recursive - else - $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 - fi || ac_status=$? - cd "$ac_pwd" || { ac_status=$?; break; } - done -fi - -test -n "$ac_init_help" && exit $ac_status -if $ac_init_version; then - cat <<\_ACEOF -ccminer configure 2014.05.03 -generated by GNU Autoconf 2.68 - -Copyright (C) 2010 Free Software Foundation, Inc. -This configure script is free software; the Free Software Foundation -gives unlimited permission to copy, distribute and modify it. -_ACEOF - exit -fi - -## ------------------------ ## -## Autoconf initialization. ## -## ------------------------ ## - -# ac_fn_c_try_compile LINENO -# -------------------------- -# Try to compile conftest.$ac_ext, and return whether this succeeded. -ac_fn_c_try_compile () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - rm -f conftest.$ac_objext - if { { ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_compile") 2>conftest.err - ac_status=$? - if test -s conftest.err; then - grep -v '^ *+' conftest.err >conftest.er1 - cat conftest.er1 >&5 - mv -f conftest.er1 conftest.err - fi - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && { - test -z "$ac_c_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then : - ac_retval=0 -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_retval=1 -fi - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - as_fn_set_status $ac_retval - -} # ac_fn_c_try_compile - -# ac_fn_c_try_cpp LINENO -# ---------------------- -# Try to preprocess conftest.$ac_ext, and return whether this succeeded. 
-ac_fn_c_try_cpp () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - if { { ac_try="$ac_cpp conftest.$ac_ext" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err - ac_status=$? - if test -s conftest.err; then - grep -v '^ *+' conftest.err >conftest.er1 - cat conftest.er1 >&5 - mv -f conftest.er1 conftest.err - fi - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } > conftest.i && { - test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" || - test ! -s conftest.err - }; then : - ac_retval=0 -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_retval=1 -fi - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - as_fn_set_status $ac_retval - -} # ac_fn_c_try_cpp - -# ac_fn_cxx_try_compile LINENO -# ---------------------------- -# Try to compile conftest.$ac_ext, and return whether this succeeded. -ac_fn_cxx_try_compile () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - rm -f conftest.$ac_objext - if { { ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_compile") 2>conftest.err - ac_status=$? - if test -s conftest.err; then - grep -v '^ *+' conftest.err >conftest.er1 - cat conftest.er1 >&5 - mv -f conftest.er1 conftest.err - fi - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && { - test -z "$ac_cxx_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then : - ac_retval=0 -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_retval=1 -fi - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - as_fn_set_status $ac_retval - -} # ac_fn_cxx_try_compile - -# ac_fn_c_try_link LINENO -# ----------------------- -# Try to link conftest.$ac_ext, and return whether this succeeded. -ac_fn_c_try_link () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - rm -f conftest.$ac_objext conftest$ac_exeext - if { { ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_link") 2>conftest.err - ac_status=$? - if test -s conftest.err; then - grep -v '^ *+' conftest.err >conftest.er1 - cat conftest.er1 >&5 - mv -f conftest.er1 conftest.err - fi - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && { - test -z "$ac_c_werror_flag" || - test ! -s conftest.err - } && test -s conftest$ac_exeext && { - test "$cross_compiling" = yes || - $as_test_x conftest$ac_exeext - }; then : - ac_retval=0 -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_retval=1 -fi - # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information - # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would - # interfere with the next link command; also delete a directory that is - # left behind by Apple's compiler. We do this before executing the actions. 
- rm -rf conftest.dSYM conftest_ipa8_conftest.oo - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - as_fn_set_status $ac_retval - -} # ac_fn_c_try_link - -# ac_fn_c_try_run LINENO -# ---------------------- -# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes -# that executables *can* be run. -ac_fn_c_try_run () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - if { { ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_link") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' - { { case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; }; then : - ac_retval=0 -else - $as_echo "$as_me: program exited with status $ac_status" >&5 - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_retval=$ac_status -fi - rm -rf conftest.dSYM conftest_ipa8_conftest.oo - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - as_fn_set_status $ac_retval - -} # ac_fn_c_try_run - -# ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES -# ------------------------------------------------------- -# Tests whether HEADER exists, giving a warning if it cannot be compiled using -# the include files in INCLUDES and setting the cache variable VAR -# accordingly. -ac_fn_c_check_header_mongrel () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - if eval \${$3+:} false; then : - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -fi -eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } -else - # Is the header compilable? -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5 -$as_echo_n "checking $2 usability... " >&6; } -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -$4 -#include <$2> -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - ac_header_compiler=yes -else - ac_header_compiler=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5 -$as_echo "$ac_header_compiler" >&6; } - -# Is the header present? -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5 -$as_echo_n "checking $2 presence... " >&6; } -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include <$2> -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - ac_header_preproc=yes -else - ac_header_preproc=no -fi -rm -f conftest.err conftest.i conftest.$ac_ext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5 -$as_echo "$ac_header_preproc" >&6; } - -# So? What about this header? -case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in #(( - yes:no: ) - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" 
>&5 -$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 -$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} - ;; - no:yes:* ) - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5 -$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: check for missing prerequisite headers?" >&5 -$as_echo "$as_me: WARNING: $2: check for missing prerequisite headers?" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5 -$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&5 -$as_echo "$as_me: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 -$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} - ;; -esac - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else - eval "$3=\$ac_header_compiler" -fi -eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } -fi - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - -} # ac_fn_c_check_header_mongrel - -# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES -# ------------------------------------------------------- -# Tests whether HEADER exists and can be compiled using the include files in -# INCLUDES, setting the cache variable VAR accordingly. -ac_fn_c_check_header_compile () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -$4 -#include <$2> -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - eval "$3=yes" -else - eval "$3=no" -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - -} # ac_fn_c_check_header_compile - -# ac_fn_c_check_decl LINENO SYMBOL VAR INCLUDES -# --------------------------------------------- -# Tests whether SYMBOL is declared in INCLUDES, setting cache variable VAR -# accordingly. -ac_fn_c_check_decl () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - as_decl_name=`echo $2|sed 's/ *(.*//'` - as_decl_use=`echo $2|sed -e 's/(/((/' -e 's/)/) 0&/' -e 's/,/) 0& (/g'` - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $as_decl_name is declared" >&5 -$as_echo_n "checking whether $as_decl_name is declared... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ -$4 -int -main () -{ -#ifndef $as_decl_name -#ifdef __cplusplus - (void) $as_decl_use; -#else - (void) $as_decl_name; -#endif -#endif - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - eval "$3=yes" -else - eval "$3=no" -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - -} # ac_fn_c_check_decl - -# ac_fn_c_check_type LINENO TYPE VAR INCLUDES -# ------------------------------------------- -# Tests whether TYPE exists after having included INCLUDES, setting cache -# variable VAR accordingly. -ac_fn_c_check_type () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else - eval "$3=no" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -$4 -int -main () -{ -if (sizeof ($2)) - return 0; - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -$4 -int -main () -{ -if (sizeof (($2))) - return 0; - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - -else - eval "$3=yes" -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - -} # ac_fn_c_check_type - -# ac_fn_c_check_func LINENO FUNC VAR -# ---------------------------------- -# Tests whether FUNC exists, setting the cache variable VAR accordingly -ac_fn_c_check_func () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 -$as_echo_n "checking for $2... " >&6; } -if eval \${$3+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -/* Define $2 to an innocuous variant, in case <limits.h> declares $2. - For example, HP-UX 11i <limits.h> declares gettimeofday. */ -#define $2 innocuous_$2 - -/* System header to define __stub macros and hopefully few prototypes, - which can conflict with char $2 (); below. - Prefer <limits.h> to <assert.h> if __STDC__ is defined, since - <limits.h> exists even on freestanding compilers. */ - -#ifdef __STDC__ -# include <limits.h> -#else -# include <assert.h> -#endif - -#undef $2 - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char $2 (); -/* The GNU C library defines this for functions which it implements - to always fail with ENOSYS. Some functions are actually named - something starting with __ and the normal name is an alias.
*/ -#if defined __stub_$2 || defined __stub___$2 -choke me -#endif - -int -main () -{ -return $2 (); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - eval "$3=yes" -else - eval "$3=no" -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -fi -eval ac_res=\$$3 - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 -$as_echo "$ac_res" >&6; } - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - -} # ac_fn_c_check_func -cat >config.log <<_ACEOF -This file contains any messages produced by compilers while -running configure, to aid debugging if configure makes a mistake. - -It was created by ccminer $as_me 2014.05.03, which was -generated by GNU Autoconf 2.68. Invocation command line was - - $ $0 $@ - -_ACEOF -exec 5>>config.log -{ -cat <<_ASUNAME -## --------- ## -## Platform. ## -## --------- ## - -hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` -uname -m = `(uname -m) 2>/dev/null || echo unknown` -uname -r = `(uname -r) 2>/dev/null || echo unknown` -uname -s = `(uname -s) 2>/dev/null || echo unknown` -uname -v = `(uname -v) 2>/dev/null || echo unknown` - -/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` -/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` - -/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` -/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` -/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` -/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` -/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` -/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` -/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` - -_ASUNAME - -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - $as_echo "PATH: $as_dir" - done -IFS=$as_save_IFS - -} >&5 - -cat >&5 <<_ACEOF - - -## ----------- ## -## Core tests. ## -## ----------- ## - -_ACEOF - - -# Keep a trace of the command line. -# Strip out --no-create and --no-recursion so they do not pile up. -# Strip out --silent because we don't want to record it for future runs. -# Also quote any args containing shell meta-characters. -# Make two passes to allow for proper duplicate-argument suppression. -ac_configure_args= -ac_configure_args0= -ac_configure_args1= -ac_must_keep_next=false -for ac_pass in 1 2 -do - for ac_arg - do - case $ac_arg in - -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; - -q | -quiet | --quiet | --quie | --qui | --qu | --q \ - | -silent | --silent | --silen | --sile | --sil) - continue ;; - *\'*) - ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; - esac - case $ac_pass in - 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; - 2) - as_fn_append ac_configure_args1 " '$ac_arg'" - if test $ac_must_keep_next = true; then - ac_must_keep_next=false # Got value, back to normal. 
- else - case $ac_arg in - *=* | --config-cache | -C | -disable-* | --disable-* \ - | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ - | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ - | -with-* | --with-* | -without-* | --without-* | --x) - case "$ac_configure_args0 " in - "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; - esac - ;; - -* ) ac_must_keep_next=true ;; - esac - fi - as_fn_append ac_configure_args " '$ac_arg'" - ;; - esac - done -done -{ ac_configure_args0=; unset ac_configure_args0;} -{ ac_configure_args1=; unset ac_configure_args1;} - -# When interrupted or exit'd, cleanup temporary files, and complete -# config.log. We remove comments because anyway the quotes in there -# would cause problems or look ugly. -# WARNING: Use '\'' to represent an apostrophe within the trap. -# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. -trap 'exit_status=$? - # Save into config.log some information that might help in debugging. - { - echo - - $as_echo "## ---------------- ## -## Cache variables. ## -## ---------------- ##" - echo - # The following way of writing the cache mishandles newlines in values, -( - for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do - eval ac_val=\$$ac_var - case $ac_val in #( - *${as_nl}*) - case $ac_var in #( - *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 -$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; - esac - case $ac_var in #( - _ | IFS | as_nl) ;; #( - BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( - *) { eval $ac_var=; unset $ac_var;} ;; - esac ;; - esac - done - (set) 2>&1 | - case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( - *${as_nl}ac_space=\ *) - sed -n \ - "s/'\''/'\''\\\\'\'''\''/g; - s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" - ;; #( - *) - sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" - ;; - esac | - sort -) - echo - - $as_echo "## ----------------- ## -## Output variables. ## -## ----------------- ##" - echo - for ac_var in $ac_subst_vars - do - eval ac_val=\$$ac_var - case $ac_val in - *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; - esac - $as_echo "$ac_var='\''$ac_val'\''" - done | sort - echo - - if test -n "$ac_subst_files"; then - $as_echo "## ------------------- ## -## File substitutions. ## -## ------------------- ##" - echo - for ac_var in $ac_subst_files - do - eval ac_val=\$$ac_var - case $ac_val in - *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; - esac - $as_echo "$ac_var='\''$ac_val'\''" - done | sort - echo - fi - - if test -s confdefs.h; then - $as_echo "## ----------- ## -## confdefs.h. ## -## ----------- ##" - echo - cat confdefs.h - echo - fi - test "$ac_signal" != 0 && - $as_echo "$as_me: caught signal $ac_signal" - $as_echo "$as_me: exit $exit_status" - } >&5 - rm -f core *.core core.conftest.* && - rm -f -r conftest* confdefs* conf$$* $ac_clean_files && - exit $exit_status -' 0 -for ac_signal in 1 2 13 15; do - trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal -done -ac_signal=0 - -# confdefs.h avoids OS command line length limits that DEFS can exceed. -rm -f -r conftest* confdefs.h - -$as_echo "/* confdefs.h */" > confdefs.h - -# Predefined preprocessor variables. 
- -cat >>confdefs.h <<_ACEOF -#define PACKAGE_NAME "$PACKAGE_NAME" -_ACEOF - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_TARNAME "$PACKAGE_TARNAME" -_ACEOF - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_VERSION "$PACKAGE_VERSION" -_ACEOF - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_STRING "$PACKAGE_STRING" -_ACEOF - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" -_ACEOF - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_URL "$PACKAGE_URL" -_ACEOF - - -# Let the site file select an alternate cache file if it wants to. -# Prefer an explicitly selected file to automatically selected ones. -ac_site_file1=NONE -ac_site_file2=NONE -if test -n "$CONFIG_SITE"; then - # We do not want a PATH search for config.site. - case $CONFIG_SITE in #(( - -*) ac_site_file1=./$CONFIG_SITE;; - */*) ac_site_file1=$CONFIG_SITE;; - *) ac_site_file1=./$CONFIG_SITE;; - esac -elif test "x$prefix" != xNONE; then - ac_site_file1=$prefix/share/config.site - ac_site_file2=$prefix/etc/config.site -else - ac_site_file1=$ac_default_prefix/share/config.site - ac_site_file2=$ac_default_prefix/etc/config.site -fi -for ac_site_file in "$ac_site_file1" "$ac_site_file2" -do - test "x$ac_site_file" = xNONE && continue - if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 -$as_echo "$as_me: loading site script $ac_site_file" >&6;} - sed 's/^/| /' "$ac_site_file" >&5 - . "$ac_site_file" \ - || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "failed to load site script $ac_site_file -See \`config.log' for more details" "$LINENO" 5; } - fi -done - -if test -r "$cache_file"; then - # Some versions of bash will fail to source /dev/null (special files - # actually), so we avoid doing that. DJGPP emulates it as a regular file. - if test /dev/null != "$cache_file" && test -f "$cache_file"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 -$as_echo "$as_me: loading cache $cache_file" >&6;} - case $cache_file in - [\\/]* | ?:[\\/]* ) . "$cache_file";; - *) . "./$cache_file";; - esac - fi -else - { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 -$as_echo "$as_me: creating cache $cache_file" >&6;} - >$cache_file -fi - -# Check that the precious variables saved in the cache have kept the same -# value. -ac_cache_corrupted=false -for ac_var in $ac_precious_vars; do - eval ac_old_set=\$ac_cv_env_${ac_var}_set - eval ac_new_set=\$ac_env_${ac_var}_set - eval ac_old_val=\$ac_cv_env_${ac_var}_value - eval ac_new_val=\$ac_env_${ac_var}_value - case $ac_old_set,$ac_new_set in - set,) - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 -$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} - ac_cache_corrupted=: ;; - ,set) - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 -$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} - ac_cache_corrupted=: ;; - ,);; - *) - if test "x$ac_old_val" != "x$ac_new_val"; then - # differences in whitespace do not lead to failure. 
- ac_old_val_w=`echo x $ac_old_val` - ac_new_val_w=`echo x $ac_new_val` - if test "$ac_old_val_w" != "$ac_new_val_w"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 -$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} - ac_cache_corrupted=: - else - { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 -$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} - eval $ac_var=\$ac_old_val - fi - { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 -$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 -$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} - fi;; - esac - # Pass precious variables to config.status. - if test "$ac_new_set" = set; then - case $ac_new_val in - *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; - *) ac_arg=$ac_var=$ac_new_val ;; - esac - case " $ac_configure_args " in - *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. - *) as_fn_append ac_configure_args " '$ac_arg'" ;; - esac - fi -done -if $ac_cache_corrupted; then - { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 -$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} - as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 -fi -## -------------------- ## -## Main body of script. ## -## -------------------- ## - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - - - - -ac_aux_dir= -for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do - if test -f "$ac_dir/install-sh"; then - ac_aux_dir=$ac_dir - ac_install_sh="$ac_aux_dir/install-sh -c" - break - elif test -f "$ac_dir/install.sh"; then - ac_aux_dir=$ac_dir - ac_install_sh="$ac_aux_dir/install.sh -c" - break - elif test -f "$ac_dir/shtool"; then - ac_aux_dir=$ac_dir - ac_install_sh="$ac_aux_dir/shtool install -c" - break - fi -done -if test -z "$ac_aux_dir"; then - as_fn_error $? "cannot find install-sh, install.sh, or shtool in \"$srcdir\" \"$srcdir/..\" \"$srcdir/../..\"" "$LINENO" 5 -fi - -# These three variables are undocumented and unsupported, -# and are intended to be withdrawn in a future Autoconf release. -# They can cause serious problems if a builder's source tree is in a directory -# whose full name contains unusual characters. -ac_config_guess="$SHELL $ac_aux_dir/config.guess" # Please don't use this var. -ac_config_sub="$SHELL $ac_aux_dir/config.sub" # Please don't use this var. -ac_configure="$SHELL $ac_aux_dir/configure" # Please don't use this var. - - -# Make sure we can run config.sub. -$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 || - as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5 - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5 -$as_echo_n "checking build system type... 
" >&6; } -if ${ac_cv_build+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_build_alias=$build_alias -test "x$ac_build_alias" = x && - ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"` -test "x$ac_build_alias" = x && - as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5 -ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` || - as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5 - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5 -$as_echo "$ac_cv_build" >&6; } -case $ac_cv_build in -*-*-*) ;; -*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;; -esac -build=$ac_cv_build -ac_save_IFS=$IFS; IFS='-' -set x $ac_cv_build -shift -build_cpu=$1 -build_vendor=$2 -shift; shift -# Remember, the first character of IFS is used to create $*, -# except with old shells: -build_os=$* -IFS=$ac_save_IFS -case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5 -$as_echo_n "checking host system type... " >&6; } -if ${ac_cv_host+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test "x$host_alias" = x; then - ac_cv_host=$ac_cv_build -else - ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` || - as_fn_error $? "$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5 -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5 -$as_echo "$ac_cv_host" >&6; } -case $ac_cv_host in -*-*-*) ;; -*) as_fn_error $? "invalid value of canonical host" "$LINENO" 5;; -esac -host=$ac_cv_host -ac_save_IFS=$IFS; IFS='-' -set x $ac_cv_host -shift -host_cpu=$1 -host_vendor=$2 -shift; shift -# Remember, the first character of IFS is used to create $*, -# except with old shells: -host_os=$* -IFS=$ac_save_IFS -case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking target system type" >&5 -$as_echo_n "checking target system type... " >&6; } -if ${ac_cv_target+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test "x$target_alias" = x; then - ac_cv_target=$ac_cv_host -else - ac_cv_target=`$SHELL "$ac_aux_dir/config.sub" $target_alias` || - as_fn_error $? "$SHELL $ac_aux_dir/config.sub $target_alias failed" "$LINENO" 5 -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_target" >&5 -$as_echo "$ac_cv_target" >&6; } -case $ac_cv_target in -*-*-*) ;; -*) as_fn_error $? "invalid value of canonical target" "$LINENO" 5;; -esac -target=$ac_cv_target -ac_save_IFS=$IFS; IFS='-' -set x $ac_cv_target -shift -target_cpu=$1 -target_vendor=$2 -shift; shift -# Remember, the first character of IFS is used to create $*, -# except with old shells: -target_os=$* -IFS=$ac_save_IFS -case $target_os in *\ *) target_os=`echo "$target_os" | sed 's/ /-/g'`;; esac - - -# The aliases save the names the user supplied, while $host etc. -# will get canonicalized. -test -n "$target_alias" && - test "$program_prefix$program_suffix$program_transform_name" = \ - NONENONEs,x,x, && - program_prefix=${target_alias}- - - -am__api_version='1.11' - -# Find a good install program. We prefer a C program (faster), -# so one script is as good as another. 
But avoid the broken or -# incompatible versions: -# SysV /etc/install, /usr/sbin/install -# SunOS /usr/etc/install -# IRIX /sbin/install -# AIX /bin/install -# AmigaOS /C/install, which installs bootblocks on floppy discs -# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag -# AFS /usr/afsws/bin/install, which mishandles nonexistent args -# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff" -# OS/2's system install, which has a completely different semantic -# ./install, which can be erroneously created by make from ./install.sh. -# Reject install programs that cannot install multiple files. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a BSD-compatible install" >&5 -$as_echo_n "checking for a BSD-compatible install... " >&6; } -if test -z "$INSTALL"; then -if ${ac_cv_path_install+:} false; then : - $as_echo_n "(cached) " >&6 -else - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - # Account for people who put trailing slashes in PATH elements. -case $as_dir/ in #(( - ./ | .// | /[cC]/* | \ - /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \ - ?:[\\/]os2[\\/]install[\\/]* | ?:[\\/]OS2[\\/]INSTALL[\\/]* | \ - /usr/ucb/* ) ;; - *) - # OSF1 and SCO ODT 3.0 have their own names for install. - # Don't use installbsd from OSF since it installs stuff as root - # by default. - for ac_prog in ginstall scoinst install; do - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_prog$ac_exec_ext" && $as_test_x "$as_dir/$ac_prog$ac_exec_ext"; }; then - if test $ac_prog = install && - grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then - # AIX install. It has an incompatible calling convention. - : - elif test $ac_prog = install && - grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then - # program-specific install script used by HP pwplus--don't use. - : - else - rm -rf conftest.one conftest.two conftest.dir - echo one > conftest.one - echo two > conftest.two - mkdir conftest.dir - if "$as_dir/$ac_prog$ac_exec_ext" -c conftest.one conftest.two "`pwd`/conftest.dir" && - test -s conftest.one && test -s conftest.two && - test -s conftest.dir/conftest.one && - test -s conftest.dir/conftest.two - then - ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c" - break 3 - fi - fi - fi - done - done - ;; -esac - - done -IFS=$as_save_IFS - -rm -rf conftest.one conftest.two conftest.dir - -fi - if test "${ac_cv_path_install+set}" = set; then - INSTALL=$ac_cv_path_install - else - # As a last resort, use the slow shell script. Don't cache a - # value for INSTALL within a source directory, because that will - # break other packages using the cache if that directory is - # removed, or if the value is a relative name. - INSTALL=$ac_install_sh - fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $INSTALL" >&5 -$as_echo "$INSTALL" >&6; } - -# Use test -z because SunOS4 sh mishandles braces in ${var-val}. -# It thinks the first close brace ends the variable substitution. -test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}' - -test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}' - -test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether build environment is sane" >&5 -$as_echo_n "checking whether build environment is sane... 
" >&6; } -# Just in case -sleep 1 -echo timestamp > conftest.file -# Reject unsafe characters in $srcdir or the absolute working directory -# name. Accept space and tab only in the latter. -am_lf=' -' -case `pwd` in - *[\\\"\#\$\&\'\`$am_lf]*) - as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;; -esac -case $srcdir in - *[\\\"\#\$\&\'\`$am_lf\ \ ]*) - as_fn_error $? "unsafe srcdir value: \`$srcdir'" "$LINENO" 5;; -esac - -# Do `set' in a subshell so we don't clobber the current shell's -# arguments. Must try -L first in case configure is actually a -# symlink; some systems play weird games with the mod time of symlinks -# (eg FreeBSD returns the mod time of the symlink's containing -# directory). -if ( - set X `ls -Lt "$srcdir/configure" conftest.file 2> /dev/null` - if test "$*" = "X"; then - # -L didn't work. - set X `ls -t "$srcdir/configure" conftest.file` - fi - rm -f conftest.file - if test "$*" != "X $srcdir/configure conftest.file" \ - && test "$*" != "X conftest.file $srcdir/configure"; then - - # If neither matched, then we have a broken ls. This can happen - # if, for instance, CONFIG_SHELL is bash and it inherits a - # broken ls alias from the environment. This has actually - # happened. Such a system could not be considered "sane". - as_fn_error $? "ls -t appears to fail. Make sure there is not a broken -alias in your environment" "$LINENO" 5 - fi - - test "$2" = conftest.file - ) -then - # Ok. - : -else - as_fn_error $? "newly created file is older than distributed files! -Check your system clock" "$LINENO" 5 -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } -test "$program_prefix" != NONE && - program_transform_name="s&^&$program_prefix&;$program_transform_name" -# Use a double $ so make ignores it. -test "$program_suffix" != NONE && - program_transform_name="s&\$&$program_suffix&;$program_transform_name" -# Double any \ or $. -# By default was `s,x,x', remove it if useless. -ac_script='s/[\\$]/&&/g;s/;s,x,x,$//' -program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"` - -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` - -if test x"${MISSING+set}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;; - *) - MISSING="\${SHELL} $am_aux_dir/missing" ;; - esac -fi -# Use eval to expand $SHELL -if eval "$MISSING --run true"; then - am_missing_run="$MISSING --run " -else - am_missing_run= - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: \`missing' script is too old or missing" >&5 -$as_echo "$as_me: WARNING: \`missing' script is too old or missing" >&2;} -fi - -if test x"${install_sh}" != xset; then - case $am_aux_dir in - *\ * | *\ *) - install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;; - *) - install_sh="\${SHELL} $am_aux_dir/install-sh" - esac -fi - -# Installed binaries are usually stripped using `strip' when the user -# run `make install-strip'. However `strip' might not be the right -# tool to use in cross-compilation environments, therefore Automake -# will honor the `STRIP' environment variable to overrule this program. -if test "$cross_compiling" != no; then - if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args. -set dummy ${ac_tool_prefix}strip; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... 
" >&6; } -if ${ac_cv_prog_STRIP+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$STRIP"; then - ac_cv_prog_STRIP="$STRIP" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_STRIP="${ac_tool_prefix}strip" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -STRIP=$ac_cv_prog_STRIP -if test -n "$STRIP"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $STRIP" >&5 -$as_echo "$STRIP" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - -fi -if test -z "$ac_cv_prog_STRIP"; then - ac_ct_STRIP=$STRIP - # Extract the first word of "strip", so it can be a program name with args. -set dummy strip; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_STRIP+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$ac_ct_STRIP"; then - ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_STRIP="strip" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP -if test -n "$ac_ct_STRIP"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_STRIP" >&5 -$as_echo "$ac_ct_STRIP" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - if test "x$ac_ct_STRIP" = x; then - STRIP=":" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - STRIP=$ac_ct_STRIP - fi -else - STRIP="$ac_cv_prog_STRIP" -fi - -fi -INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a thread-safe mkdir -p" >&5 -$as_echo_n "checking for a thread-safe mkdir -p... " >&6; } -if test -z "$MKDIR_P"; then - if ${ac_cv_path_mkdir+:} false; then : - $as_echo_n "(cached) " >&6 -else - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH$PATH_SEPARATOR/opt/sfw/bin -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_prog in mkdir gmkdir; do - for ac_exec_ext in '' $ac_executable_extensions; do - { test -f "$as_dir/$ac_prog$ac_exec_ext" && $as_test_x "$as_dir/$ac_prog$ac_exec_ext"; } || continue - case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #( - 'mkdir (GNU coreutils) '* | \ - 'mkdir (coreutils) '* | \ - 'mkdir (fileutils) '4.1*) - ac_cv_path_mkdir=$as_dir/$ac_prog$ac_exec_ext - break 3;; - esac - done - done - done -IFS=$as_save_IFS - -fi - - test -d ./--version && rmdir ./--version - if test "${ac_cv_path_mkdir+set}" = set; then - MKDIR_P="$ac_cv_path_mkdir -p" - else - # As a last resort, use the slow shell script. 
Don't cache a - # value for MKDIR_P within a source directory, because that will - # break other packages using the cache if that directory is - # removed, or if the value is a relative name. - MKDIR_P="$ac_install_sh -d" - fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $MKDIR_P" >&5 -$as_echo "$MKDIR_P" >&6; } - -mkdir_p="$MKDIR_P" -case $mkdir_p in - [\\/$]* | ?:[\\/]*) ;; - */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;; -esac - -for ac_prog in gawk mawk nawk awk -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_AWK+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$AWK"; then - ac_cv_prog_AWK="$AWK" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_AWK="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -AWK=$ac_cv_prog_AWK -if test -n "$AWK"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AWK" >&5 -$as_echo "$AWK" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - test -n "$AWK" && break -done - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5 -$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... " >&6; } -set x ${MAKE-make} -ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'` -if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat >conftest.make <<\_ACEOF -SHELL = /bin/sh -all: - @echo '@@@%%%=$(MAKE)=@@@%%%' -_ACEOF -# GNU make sometimes prints "make[1]: Entering ...", which would confuse us. -case `${MAKE-make} -f conftest.make 2>/dev/null` in - *@@@%%%=?*=@@@%%%*) - eval ac_cv_prog_make_${ac_make}_set=yes;; - *) - eval ac_cv_prog_make_${ac_make}_set=no;; -esac -rm -f conftest.make -fi -if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - SET_MAKE= -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - SET_MAKE="MAKE=${MAKE-make}" -fi - -rm -rf .tst 2>/dev/null -mkdir .tst 2>/dev/null -if test -d .tst; then - am__leading_dot=. -else - am__leading_dot=_ -fi -rmdir .tst 2>/dev/null - -if test "`cd $srcdir && pwd`" != "`pwd`"; then - # Use -I$(srcdir) only when $(srcdir) != ., so that make's output - # is not polluted with repeated "-I." - am__isrc=' -I$(srcdir)' - # test to see if srcdir already configured - if test -f $srcdir/config.status; then - as_fn_error $? "source directory already configured; run \"make distclean\" there first" "$LINENO" 5 - fi -fi - -# test whether we have cygpath -if test -z "$CYGPATH_W"; then - if (cygpath --version) >/dev/null 2>/dev/null; then - CYGPATH_W='cygpath -w' - else - CYGPATH_W=echo - fi -fi - - -# Define the identity of the package. - PACKAGE='ccminer' - VERSION='2014.05.03' - - -cat >>confdefs.h <<_ACEOF -#define PACKAGE "$PACKAGE" -_ACEOF - - -cat >>confdefs.h <<_ACEOF -#define VERSION "$VERSION" -_ACEOF - -# Some tools Automake needs. 
- -ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal-${am__api_version}"} - - -AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"} - - -AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake-${am__api_version}"} - - -AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"} - - -MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"} - -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. -# Always define AMTAR for backward compatibility. Yes, it's still used -# in the wild :-( We should find a proper way to deprecate it ... -AMTAR='$${TAR-tar}' - -am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -' - - - - - -ac_config_headers="$ac_config_headers cpuminer-config.h" - - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to enable maintainer-specific portions of Makefiles" >&5 -$as_echo_n "checking whether to enable maintainer-specific portions of Makefiles... " >&6; } - # Check whether --enable-maintainer-mode was given. -if test "${enable_maintainer_mode+set}" = set; then : - enableval=$enable_maintainer_mode; USE_MAINTAINER_MODE=$enableval -else - USE_MAINTAINER_MODE=no -fi - - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $USE_MAINTAINER_MODE" >&5 -$as_echo "$USE_MAINTAINER_MODE" >&6; } - if test $USE_MAINTAINER_MODE = yes; then - MAINTAINER_MODE_TRUE= - MAINTAINER_MODE_FALSE='#' -else - MAINTAINER_MODE_TRUE='#' - MAINTAINER_MODE_FALSE= -fi - - MAINT=$MAINTAINER_MODE_TRUE - - - -DEPDIR="${am__leading_dot}deps" - -ac_config_commands="$ac_config_commands depfiles" - - -am_make=${MAKE-make} -cat > confinc << 'END' -am__doit: - @echo this is the am__doit target -.PHONY: am__doit -END -# If we don't find an include directive, just comment out the code. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for style of include used by $am_make" >&5 -$as_echo_n "checking for style of include used by $am_make... " >&6; } -am__include="#" -am__quote= -_am_result=none -# First try GNU make style include. -echo "include confinc" > confmf -# Ignore all kinds of additional output from `make'. -case `$am_make -s -f confmf 2> /dev/null` in #( -*the\ am__doit\ target*) - am__include=include - am__quote= - _am_result=GNU - ;; -esac -# Now try BSD make style include. -if test "$am__include" = "#"; then - echo '.include "confinc"' > confmf - case `$am_make -s -f confmf 2> /dev/null` in #( - *the\ am__doit\ target*) - am__include=.include - am__quote="\"" - _am_result=BSD - ;; - esac -fi - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_am_result" >&5 -$as_echo "$_am_result" >&6; } -rm -f confinc confmf - -# Check whether --enable-dependency-tracking was given. -if test "${enable_dependency_tracking+set}" = set; then : - enableval=$enable_dependency_tracking; -fi - -if test "x$enable_dependency_tracking" != xno; then - am_depcomp="$ac_aux_dir/depcomp" - AMDEPBACKSLASH='\' - am__nodep='_no' -fi - if test "x$enable_dependency_tracking" != xno; then - AMDEP_TRUE= - AMDEP_FALSE='#' -else - AMDEP_TRUE='#' - AMDEP_FALSE= -fi - - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu -if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. -set dummy ${ac_tool_prefix}gcc; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... 
" >&6; } -if ${ac_cv_prog_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_CC="${ac_tool_prefix}gcc" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -CC=$ac_cv_prog_CC -if test -n "$CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 -$as_echo "$CC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - -fi -if test -z "$ac_cv_prog_CC"; then - ac_ct_CC=$CC - # Extract the first word of "gcc", so it can be a program name with args. -set dummy gcc; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$ac_ct_CC"; then - ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_CC="gcc" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -ac_ct_CC=$ac_cv_prog_ac_ct_CC -if test -n "$ac_ct_CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 -$as_echo "$ac_ct_CC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - if test "x$ac_ct_CC" = x; then - CC="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - CC=$ac_ct_CC - fi -else - CC="$ac_cv_prog_CC" -fi - -if test -z "$CC"; then - if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. -set dummy ${ac_tool_prefix}cc; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_CC="${ac_tool_prefix}cc" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -CC=$ac_cv_prog_CC -if test -n "$CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 -$as_echo "$CC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - fi -fi -if test -z "$CC"; then - # Extract the first word of "cc", so it can be a program name with args. -set dummy cc; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else - ac_prog_rejected=no -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then - ac_prog_rejected=yes - continue - fi - ac_cv_prog_CC="cc" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -if test $ac_prog_rejected = yes; then - # We found a bogon in the path, so make sure we never use it. - set dummy $ac_cv_prog_CC - shift - if test $# != 0; then - # We chose a different compiler from the bogus one. - # However, it has the same basename, so the bogon will be chosen - # first if we set CC to just the basename; use the full file name. - shift - ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" - fi -fi -fi -fi -CC=$ac_cv_prog_CC -if test -n "$CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 -$as_echo "$CC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - -fi -if test -z "$CC"; then - if test -n "$ac_tool_prefix"; then - for ac_prog in cl.exe - do - # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. -set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_CC="$ac_tool_prefix$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -CC=$ac_cv_prog_CC -if test -n "$CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 -$as_echo "$CC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - test -n "$CC" && break - done -fi -if test -z "$CC"; then - ac_ct_CC=$CC - for ac_prog in cl.exe -do - # Extract the first word of "$ac_prog", so it can be a program name with args. 
-set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_CC+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$ac_ct_CC"; then - ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_CC="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -ac_ct_CC=$ac_cv_prog_ac_ct_CC -if test -n "$ac_ct_CC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 -$as_echo "$ac_ct_CC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - test -n "$ac_ct_CC" && break -done - - if test "x$ac_ct_CC" = x; then - CC="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - CC=$ac_ct_CC - fi -fi - -fi - - -test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "no acceptable C compiler found in \$PATH -See \`config.log' for more details" "$LINENO" 5; } - -# Provide some information about the compiler. -$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5 -set X $ac_compile -ac_compiler=$2 -for ac_option in --version -v -V -qversion; do - { { ac_try="$ac_compiler $ac_option >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_compiler $ac_option >&5") 2>conftest.err - ac_status=$? - if test -s conftest.err; then - sed '10a\ -... rest of stderr output deleted ... - 10q' conftest.err >conftest.er1 - cat conftest.er1 >&5 - fi - rm -f conftest.er1 conftest.err - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } -done - -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -ac_clean_files_save=$ac_clean_files -ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" -# Try to create an executable without -o first, disregard a.out. -# It will help us diagnose broken compilers, and finding out an intuition -# of exeext. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5 -$as_echo_n "checking whether the C compiler works... 
" >&6; } -ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` - -# The possible output files: -ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" - -ac_rmfiles= -for ac_file in $ac_files -do - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; - * ) ac_rmfiles="$ac_rmfiles $ac_file";; - esac -done -rm -f $ac_rmfiles - -if { { ac_try="$ac_link_default" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_link_default") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then : - # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. -# So ignore a value of `no', otherwise this would lead to `EXEEXT = no' -# in a Makefile. We should not override ac_cv_exeext if it was cached, -# so that the user can short-circuit this test for compilers unknown to -# Autoconf. -for ac_file in $ac_files '' -do - test -f "$ac_file" || continue - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) - ;; - [ab].out ) - # We found the default executable, but exeext='' is most - # certainly right. - break;; - *.* ) - if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; - then :; else - ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` - fi - # We set ac_cv_exeext here because the later test for it is not - # safe: cross compilers may not add the suffix if given an `-o' - # argument, so we may need to know it at that point already. - # Even if this section looks crufty: it has the advantage of - # actually working. - break;; - * ) - break;; - esac -done -test "$ac_cv_exeext" = no && ac_cv_exeext= - -else - ac_file='' -fi -if test -z "$ac_file"; then : - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -$as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - -{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error 77 "C compiler cannot create executables -See \`config.log' for more details" "$LINENO" 5; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5 -$as_echo_n "checking for C compiler default output file name... " >&6; } -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 -$as_echo "$ac_file" >&6; } -ac_exeext=$ac_cv_exeext - -rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out -ac_clean_files=$ac_clean_files_save -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5 -$as_echo_n "checking for suffix of executables... " >&6; } -if { { ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_link") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then : - # If both `conftest.exe' and `conftest' are `present' (well, observable) -# catch `conftest.exe'. 
For instance with Cygwin, `ls conftest' will -# work properly (i.e., refer to `conftest.exe'), while it won't with -# `rm'. -for ac_file in conftest.exe conftest conftest.*; do - test -f "$ac_file" || continue - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; - *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` - break;; - * ) break;; - esac -done -else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "cannot compute suffix of executables: cannot compile and link -See \`config.log' for more details" "$LINENO" 5; } -fi -rm -f conftest conftest$ac_cv_exeext -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5 -$as_echo "$ac_cv_exeext" >&6; } - -rm -f conftest.$ac_ext -EXEEXT=$ac_cv_exeext -ac_exeext=$EXEEXT -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -int -main () -{ -FILE *f = fopen ("conftest.out", "w"); - return ferror (f) || fclose (f) != 0; - - ; - return 0; -} -_ACEOF -ac_clean_files="$ac_clean_files conftest.out" -# Check that the compiler produces executables we can run. If not, either -# the compiler is broken, or we cross compile. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 -$as_echo_n "checking whether we are cross compiling... " >&6; } -if test "$cross_compiling" != yes; then - { { ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_link") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - if { ac_try='./conftest$ac_cv_exeext' - { { case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; }; then - cross_compiling=no - else - if test "$cross_compiling" = maybe; then - cross_compiling=yes - else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "cannot run C compiled programs. -If you meant to cross compile, use \`--host'. -See \`config.log' for more details" "$LINENO" 5; } - fi - fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 -$as_echo "$cross_compiling" >&6; } - -rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out -ac_clean_files=$ac_clean_files_save -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5 -$as_echo_n "checking for suffix of object files... " >&6; } -if ${ac_cv_objext+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.o conftest.obj -if { { ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_compile") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 - test $ac_status = 0; }; then : - for ac_file in conftest.o conftest.obj conftest.*; do - test -f "$ac_file" || continue; - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;; - *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` - break;; - esac -done -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - -{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "cannot compute suffix of object files: cannot compile -See \`config.log' for more details" "$LINENO" 5; } -fi -rm -f conftest.$ac_cv_objext conftest.$ac_ext -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5 -$as_echo "$ac_cv_objext" >&6; } -OBJEXT=$ac_cv_objext -ac_objext=$OBJEXT -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5 -$as_echo_n "checking whether we are using the GNU C compiler... " >&6; } -if ${ac_cv_c_compiler_gnu+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ -#ifndef __GNUC__ - choke me -#endif - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - ac_compiler_gnu=yes -else - ac_compiler_gnu=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -ac_cv_c_compiler_gnu=$ac_compiler_gnu - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5 -$as_echo "$ac_cv_c_compiler_gnu" >&6; } -if test $ac_compiler_gnu = yes; then - GCC=yes -else - GCC= -fi -ac_test_CFLAGS=${CFLAGS+set} -ac_save_CFLAGS=$CFLAGS -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5 -$as_echo_n "checking whether $CC accepts -g... " >&6; } -if ${ac_cv_prog_cc_g+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_save_c_werror_flag=$ac_c_werror_flag - ac_c_werror_flag=yes - ac_cv_prog_cc_g=no - CFLAGS="-g" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_prog_cc_g=yes -else - CFLAGS="" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - -else - ac_c_werror_flag=$ac_save_c_werror_flag - CFLAGS="-g" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_prog_cc_g=yes -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_c_werror_flag=$ac_save_c_werror_flag -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5 -$as_echo "$ac_cv_prog_cc_g" >&6; } -if test "$ac_test_CFLAGS" = set; then - CFLAGS=$ac_save_CFLAGS -elif test $ac_cv_prog_cc_g = yes; then - if test "$GCC" = yes; then - CFLAGS="-g -O2" - else - CFLAGS="-g" - fi -else - if test "$GCC" = yes; then - CFLAGS="-O2" - else - CFLAGS= - fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5 -$as_echo_n "checking for $CC option to accept ISO C89... 
" >&6; } -if ${ac_cv_prog_cc_c89+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_cv_prog_cc_c89=no -ac_save_CC=$CC -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -#include -#include -#include -/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */ -struct buf { int x; }; -FILE * (*rcsopen) (struct buf *, struct stat *, int); -static char *e (p, i) - char **p; - int i; -{ - return p[i]; -} -static char *f (char * (*g) (char **, int), char **p, ...) -{ - char *s; - va_list v; - va_start (v,p); - s = g (p, va_arg (v,int)); - va_end (v); - return s; -} - -/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has - function prototypes and stuff, but not '\xHH' hex character constants. - These don't provoke an error unfortunately, instead are silently treated - as 'x'. The following induces an error, until -std is added to get - proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an - array size at least. It's necessary to write '\x00'==0 to get something - that's true only with -std. */ -int osf4_cc_array ['\x00' == 0 ? 1 : -1]; - -/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters - inside strings and character constants. */ -#define FOO(x) 'x' -int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1]; - -int test (int i, double x); -struct s1 {int (*f) (int a);}; -struct s2 {int (*f) (double a);}; -int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); -int argc; -char **argv; -int -main () -{ -return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; - ; - return 0; -} -_ACEOF -for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ - -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" -do - CC="$ac_save_CC $ac_arg" - if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_prog_cc_c89=$ac_arg -fi -rm -f core conftest.err conftest.$ac_objext - test "x$ac_cv_prog_cc_c89" != "xno" && break -done -rm -f conftest.$ac_ext -CC=$ac_save_CC - -fi -# AC_CACHE_VAL -case "x$ac_cv_prog_cc_c89" in - x) - { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 -$as_echo "none needed" >&6; } ;; - xno) - { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 -$as_echo "unsupported" >&6; } ;; - *) - CC="$CC $ac_cv_prog_cc_c89" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5 -$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; -esac -if test "x$ac_cv_prog_cc_c89" != xno; then : - -fi - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - -depcc="$CC" am_compiler_list= - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 -$as_echo_n "checking dependency style of $depcc... " >&6; } -if ${am_cv_CC_dependencies_compiler_type+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then - # We make a subdir and do the tests there. Otherwise we can end up - # making bogus files that we don't know about and never remove. For - # instance it was reported that on HP-UX the gcc test will end up - # making a dummy file named `D' -- because `-MD' means `put the output - # in D'. - rm -rf conftest.dir - mkdir conftest.dir - # Copy depcomp to subdir because otherwise we won't find it if we're - # using a relative directory. 
- cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. - mkdir sub - - am_cv_CC_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` - fi - am__universal=false - case " $depcc " in #( - *\ -arch\ *\ -arch\ *) am__universal=true ;; - esac - - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with - # Solaris 8's {/usr,}/bin/sh. - touch sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - - # We check with `-c' and `-o' for the sake of the "dashmstdout" - # mode. It turns out that the SunPro C++ compiler does not properly - # handle `-M -o', and we need to detect this. Also, some Intel - # versions had trouble with output in subdirs - am__obj=sub/conftest.${OBJEXT-o} - am__minus_obj="-o $am__obj" - case $depmode in - gcc) - # This depmode causes a compiler race in universal mode. - test "$am__universal" = false || continue - ;; - nosideeffect) - # after this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested - if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - msvc7 | msvc7msys | msvisualcpp | msvcmsys) - # This compiler won't grok `-c -o', but also, the minuso test has - # not run yet. These depmodes are late enough in the game, and - # so weak that their functioning should not be impacted. - am__obj=conftest.${OBJEXT-o} - am__minus_obj= - ;; - none) break ;; - esac - if depmode=$depmode \ - source=sub/conftest.c object=$am__obj \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep $am__obj sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_CC_dependencies_compiler_type=$depmode - break - fi - fi - done - - cd .. 
- rm -rf conftest.dir -else - am_cv_CC_dependencies_compiler_type=none -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CC_dependencies_compiler_type" >&5 -$as_echo "$am_cv_CC_dependencies_compiler_type" >&6; } -CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type - - if - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then - am__fastdepCC_TRUE= - am__fastdepCC_FALSE='#' -else - am__fastdepCC_TRUE='#' - am__fastdepCC_FALSE= -fi - - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C99" >&5 -$as_echo_n "checking for $CC option to accept ISO C99... " >&6; } -if ${ac_cv_prog_cc_c99+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_cv_prog_cc_c99=no -ac_save_CC=$CC -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -#include -#include -#include -#include - -// Check varargs macros. These examples are taken from C99 6.10.3.5. -#define debug(...) fprintf (stderr, __VA_ARGS__) -#define showlist(...) puts (#__VA_ARGS__) -#define report(test,...) ((test) ? puts (#test) : printf (__VA_ARGS__)) -static void -test_varargs_macros (void) -{ - int x = 1234; - int y = 5678; - debug ("Flag"); - debug ("X = %d\n", x); - showlist (The first, second, and third items.); - report (x>y, "x is %d but y is %d", x, y); -} - -// Check long long types. -#define BIG64 18446744073709551615ull -#define BIG32 4294967295ul -#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0) -#if !BIG_OK - your preprocessor is broken; -#endif -#if BIG_OK -#else - your preprocessor is broken; -#endif -static long long int bignum = -9223372036854775807LL; -static unsigned long long int ubignum = BIG64; - -struct incomplete_array -{ - int datasize; - double data[]; -}; - -struct named_init { - int number; - const wchar_t *name; - double average; -}; - -typedef const char *ccp; - -static inline int -test_restrict (ccp restrict text) -{ - // See if C++-style comments work. - // Iterate through items via the restricted pointer. - // Also check for declarations in for loops. - for (unsigned int i = 0; *(text+i) != '\0'; ++i) - continue; - return 0; -} - -// Check varargs and va_copy. -static void -test_varargs (const char *format, ...) -{ - va_list args; - va_start (args, format); - va_list args_copy; - va_copy (args_copy, args); - - const char *str; - int number; - float fnumber; - - while (*format) - { - switch (*format++) - { - case 's': // string - str = va_arg (args_copy, const char *); - break; - case 'd': // int - number = va_arg (args_copy, int); - break; - case 'f': // float - fnumber = va_arg (args_copy, double); - break; - default: - break; - } - } - va_end (args_copy); - va_end (args); -} - -int -main () -{ - - // Check bool. - _Bool success = false; - - // Check restrict. - if (test_restrict ("String literal") == 0) - success = true; - char *restrict newvar = "Another string"; - - // Check varargs. - test_varargs ("s, d' f .", "string", 65, 34.234); - test_varargs_macros (); - - // Check flexible array members. - struct incomplete_array *ia = - malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10)); - ia->datasize = 10; - for (int i = 0; i < ia->datasize; ++i) - ia->data[i] = i * 1.234; - - // Check named initializers. 
- struct named_init ni = { - .number = 34, - .name = L"Test wide string", - .average = 543.34343, - }; - - ni.number = 58; - - int dynamic_array[ni.number]; - dynamic_array[ni.number - 1] = 543; - - // work around unused variable warnings - return (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == 'x' - || dynamic_array[ni.number - 1] != 543); - - ; - return 0; -} -_ACEOF -for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -xc99=all -qlanglvl=extc99 -do - CC="$ac_save_CC $ac_arg" - if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_prog_cc_c99=$ac_arg -fi -rm -f core conftest.err conftest.$ac_objext - test "x$ac_cv_prog_cc_c99" != "xno" && break -done -rm -f conftest.$ac_ext -CC=$ac_save_CC - -fi -# AC_CACHE_VAL -case "x$ac_cv_prog_cc_c99" in - x) - { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 -$as_echo "none needed" >&6; } ;; - xno) - { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 -$as_echo "unsupported" >&6; } ;; - *) - CC="$CC $ac_cv_prog_cc_c99" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c99" >&5 -$as_echo "$ac_cv_prog_cc_c99" >&6; } ;; -esac -if test "x$ac_cv_prog_cc_c99" != xno; then : - -fi - - - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5 -$as_echo_n "checking how to run the C preprocessor... " >&6; } -# On Suns, sometimes $CPP names a directory. -if test -n "$CPP" && test -d "$CPP"; then - CPP= -fi -if test -z "$CPP"; then - if ${ac_cv_prog_CPP+:} false; then : - $as_echo_n "(cached) " >&6 -else - # Double quotes because CPP needs to be expanded - for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp" - do - ac_preproc_ok=false -for ac_c_preproc_warn_flag in '' yes -do - # Use a header file that comes with gcc, so configuring glibc - # with a fresh cross-compiler works. - # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since - # <limits.h> exists even on freestanding compilers. - # On the NeXT, cc -E runs the code through the compiler's parser, - # not just through cpp. "Syntax error" is here to catch this case. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#ifdef __STDC__ -# include <limits.h> -#else -# include <assert.h> -#endif - Syntax error -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - -else - # Broken: fails on valid input. -continue -fi -rm -f conftest.err conftest.i conftest.$ac_ext - - # OK, works on sane cases. Now check whether nonexistent headers - # can be detected and how. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include <ac_nonexistent.h> -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - # Broken: success on invalid input. -continue -else - # Passes both tests. -ac_preproc_ok=: -break -fi -rm -f conftest.err conftest.i conftest.$ac_ext - -done -# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. -rm -f conftest.i conftest.err conftest.$ac_ext -if $ac_preproc_ok; then : - break -fi - - done - ac_cv_prog_CPP=$CPP - -fi - CPP=$ac_cv_prog_CPP -else - ac_cv_prog_CPP=$CPP -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5 -$as_echo "$CPP" >&6; } -ac_preproc_ok=false -for ac_c_preproc_warn_flag in '' yes -do - # Use a header file that comes with gcc, so configuring glibc - # with a fresh cross-compiler works. - # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since - # <limits.h> exists even on freestanding compilers.
- # On the NeXT, cc -E runs the code through the compiler's parser, - # not just through cpp. "Syntax error" is here to catch this case. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#ifdef __STDC__ -# include <limits.h> -#else -# include <assert.h> -#endif - Syntax error -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - -else - # Broken: fails on valid input. -continue -fi -rm -f conftest.err conftest.i conftest.$ac_ext - - # OK, works on sane cases. Now check whether nonexistent headers - # can be detected and how. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include <ac_nonexistent.h> -_ACEOF -if ac_fn_c_try_cpp "$LINENO"; then : - # Broken: success on invalid input. -continue -else - # Passes both tests. -ac_preproc_ok=: -break -fi -rm -f conftest.err conftest.i conftest.$ac_ext - -done -# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. -rm -f conftest.i conftest.err conftest.$ac_ext -if $ac_preproc_ok; then : - -else - { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "C preprocessor \"$CPP\" fails sanity check -See \`config.log' for more details" "$LINENO" 5; } -fi - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 -$as_echo_n "checking for grep that handles long lines and -e... " >&6; } -if ${ac_cv_path_GREP+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -z "$GREP"; then - ac_path_GREP_found=false - # Loop through the user's path and test for each of PROGNAME-LIST - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_prog in grep ggrep; do - for ac_exec_ext in '' $ac_executable_extensions; do - ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" - { test -f "$ac_path_GREP" && $as_test_x "$ac_path_GREP"; } || continue -# Check for GNU ac_path_GREP and select it if it is found. - # Check for GNU $ac_path_GREP -case `"$ac_path_GREP" --version 2>&1` in -*GNU*) - ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; -*) - ac_count=0 - $as_echo_n 0123456789 >"conftest.in" - while : - do - cat "conftest.in" "conftest.in" >"conftest.tmp" - mv "conftest.tmp" "conftest.in" - cp "conftest.in" "conftest.nl" - $as_echo 'GREP' >> "conftest.nl" - "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break - diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break - as_fn_arith $ac_count + 1 && ac_count=$as_val - if test $ac_count -gt ${ac_path_GREP_max-0}; then - # Best one so far, save it but keep looking for a better one - ac_cv_path_GREP="$ac_path_GREP" - ac_path_GREP_max=$ac_count - fi - # 10*(2^10) chars as input seems more than enough - test $ac_count -gt 10 && break - done - rm -f conftest.in conftest.tmp conftest.nl conftest.out;; -esac - - $ac_path_GREP_found && break 3 - done - done - done -IFS=$as_save_IFS - if test -z "$ac_cv_path_GREP"; then - as_fn_error $?
"no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 - fi -else - ac_cv_path_GREP=$GREP -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 -$as_echo "$ac_cv_path_GREP" >&6; } - GREP="$ac_cv_path_GREP" - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 -$as_echo_n "checking for egrep... " >&6; } -if ${ac_cv_path_EGREP+:} false; then : - $as_echo_n "(cached) " >&6 -else - if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 - then ac_cv_path_EGREP="$GREP -E" - else - if test -z "$EGREP"; then - ac_path_EGREP_found=false - # Loop through the user's path and test for each of PROGNAME-LIST - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_prog in egrep; do - for ac_exec_ext in '' $ac_executable_extensions; do - ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" - { test -f "$ac_path_EGREP" && $as_test_x "$ac_path_EGREP"; } || continue -# Check for GNU ac_path_EGREP and select it if it is found. - # Check for GNU $ac_path_EGREP -case `"$ac_path_EGREP" --version 2>&1` in -*GNU*) - ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; -*) - ac_count=0 - $as_echo_n 0123456789 >"conftest.in" - while : - do - cat "conftest.in" "conftest.in" >"conftest.tmp" - mv "conftest.tmp" "conftest.in" - cp "conftest.in" "conftest.nl" - $as_echo 'EGREP' >> "conftest.nl" - "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break - diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break - as_fn_arith $ac_count + 1 && ac_count=$as_val - if test $ac_count -gt ${ac_path_EGREP_max-0}; then - # Best one so far, save it but keep looking for a better one - ac_cv_path_EGREP="$ac_path_EGREP" - ac_path_EGREP_max=$ac_count - fi - # 10*(2^10) chars as input seems more than enough - test $ac_count -gt 10 && break - done - rm -f conftest.in conftest.tmp conftest.nl conftest.out;; -esac - - $ac_path_EGREP_found && break 3 - done - done - done -IFS=$as_save_IFS - if test -z "$ac_cv_path_EGREP"; then - as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 - fi -else - ac_cv_path_EGREP=$EGREP -fi - - fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 -$as_echo "$ac_cv_path_EGREP" >&6; } - EGREP="$ac_cv_path_EGREP" - - -if test $ac_cv_c_compiler_gnu = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC needs -traditional" >&5 -$as_echo_n "checking whether $CC needs -traditional... " >&6; } -if ${ac_cv_prog_gcc_traditional+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_pattern="Autoconf.*'x'" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -Autoconf TIOCGETP -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "$ac_pattern" >/dev/null 2>&1; then : - ac_cv_prog_gcc_traditional=yes -else - ac_cv_prog_gcc_traditional=no -fi -rm -f conftest* - - - if test $ac_cv_prog_gcc_traditional = no; then - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ -#include -Autoconf TCGETA -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "$ac_pattern" >/dev/null 2>&1; then : - ac_cv_prog_gcc_traditional=yes -fi -rm -f conftest* - - fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_gcc_traditional" >&5 -$as_echo "$ac_cv_prog_gcc_traditional" >&6; } - if test $ac_cv_prog_gcc_traditional = yes; then - CC="$CC -traditional" - fi -fi - -if test "x$CC" != xcc; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC and cc understand -c and -o together" >&5 -$as_echo_n "checking whether $CC and cc understand -c and -o together... " >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether cc understands -c and -o together" >&5 -$as_echo_n "checking whether cc understands -c and -o together... " >&6; } -fi -set dummy $CC; ac_cc=`$as_echo "$2" | - sed 's/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/'` -if eval \${ac_cv_prog_cc_${ac_cc}_c_o+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -# Make sure it works both with $CC and with simple cc. -# We do the test twice because some compilers refuse to overwrite an -# existing .o file with -o, though they will create one. -ac_try='$CC -c conftest.$ac_ext -o conftest2.$ac_objext >&5' -rm -f conftest2.* -if { { case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && - test -f conftest2.$ac_objext && { { case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; -then - eval ac_cv_prog_cc_${ac_cc}_c_o=yes - if test "x$CC" != xcc; then - # Test first that cc exists at all. - if { ac_try='cc -c conftest.$ac_ext >&5' - { { case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; }; then - ac_try='cc -c conftest.$ac_ext -o conftest2.$ac_objext >&5' - rm -f conftest2.* - if { { case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && - test -f conftest2.$ac_objext && { { case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; - then - # cc works too. - : - else - # cc exists but doesn't like -o. 
- eval ac_cv_prog_cc_${ac_cc}_c_o=no - fi - fi - fi -else - eval ac_cv_prog_cc_${ac_cc}_c_o=no -fi -rm -f core conftest* - -fi -if eval test \$ac_cv_prog_cc_${ac_cc}_c_o = yes; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - -$as_echo "#define NO_MINUS_C_MINUS_O 1" >>confdefs.h - -fi - -# FIXME: we rely on the cache variable name because -# there is no other way. -set dummy $CC -am_cc=`echo $2 | sed 's/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/'` -eval am_t=\$ac_cv_prog_cc_${am_cc}_c_o -if test "$am_t" != yes; then - # Losing compiler, so override with the script. - # FIXME: It is wrong to rewrite CC. - # But if we don't then we get into trouble of one sort or another. - # A longer-term fix would be to have automake use am__CC in this case, - # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)" - CC="$am_aux_dir/compile $CC" -fi - - -# By default we simply use the C compiler to build assembly code. - -test "${CCAS+set}" = set || CCAS=$CC -test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS - - - -depcc="$CCAS" am_compiler_list= - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 -$as_echo_n "checking dependency style of $depcc... " >&6; } -if ${am_cv_CCAS_dependencies_compiler_type+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then - # We make a subdir and do the tests there. Otherwise we can end up - # making bogus files that we don't know about and never remove. For - # instance it was reported that on HP-UX the gcc test will end up - # making a dummy file named `D' -- because `-MD' means `put the output - # in D'. - rm -rf conftest.dir - mkdir conftest.dir - # Copy depcomp to subdir because otherwise we won't find it if we're - # using a relative directory. - cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. - mkdir sub - - am_cv_CCAS_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` - fi - am__universal=false - - - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with - # Solaris 8's {/usr,}/bin/sh. - touch sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - - # We check with `-c' and `-o' for the sake of the "dashmstdout" - # mode. It turns out that the SunPro C++ compiler does not properly - # handle `-M -o', and we need to detect this. 
Also, some Intel - # versions had trouble with output in subdirs - am__obj=sub/conftest.${OBJEXT-o} - am__minus_obj="-o $am__obj" - case $depmode in - gcc) - # This depmode causes a compiler race in universal mode. - test "$am__universal" = false || continue - ;; - nosideeffect) - # after this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested - if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - msvc7 | msvc7msys | msvisualcpp | msvcmsys) - # This compiler won't grok `-c -o', but also, the minuso test has - # not run yet. These depmodes are late enough in the game, and - # so weak that their functioning should not be impacted. - am__obj=conftest.${OBJEXT-o} - am__minus_obj= - ;; - none) break ;; - esac - if depmode=$depmode \ - source=sub/conftest.c object=$am__obj \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep $am__obj sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_CCAS_dependencies_compiler_type=$depmode - break - fi - fi - done - - cd .. - rm -rf conftest.dir -else - am_cv_CCAS_dependencies_compiler_type=none -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CCAS_dependencies_compiler_type" >&5 -$as_echo "$am_cv_CCAS_dependencies_compiler_type" >&6; } -CCASDEPMODE=depmode=$am_cv_CCAS_dependencies_compiler_type - - if - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_CCAS_dependencies_compiler_type" = gcc3; then - am__fastdepCCAS_TRUE= - am__fastdepCCAS_FALSE='#' -else - am__fastdepCCAS_TRUE='#' - am__fastdepCCAS_FALSE= -fi - - -if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args. -set dummy ${ac_tool_prefix}ranlib; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_RANLIB+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$RANLIB"; then - ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -RANLIB=$ac_cv_prog_RANLIB -if test -n "$RANLIB"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5 -$as_echo "$RANLIB" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - -fi -if test -z "$ac_cv_prog_RANLIB"; then - ac_ct_RANLIB=$RANLIB - # Extract the first word of "ranlib", so it can be a program name with args. -set dummy ranlib; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_RANLIB+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$ac_ct_RANLIB"; then - ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_RANLIB="ranlib" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB -if test -n "$ac_ct_RANLIB"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5 -$as_echo "$ac_ct_RANLIB" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - if test "x$ac_ct_RANLIB" = x; then - RANLIB=":" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - RANLIB=$ac_ct_RANLIB - fi -else - RANLIB="$ac_cv_prog_RANLIB" -fi - -ac_ext=cpp -ac_cpp='$CXXCPP $CPPFLAGS' -ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_cxx_compiler_gnu -if test -z "$CXX"; then - if test -n "$CCC"; then - CXX=$CCC - else - if test -n "$ac_tool_prefix"; then - for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC - do - # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. -set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_CXX+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$CXX"; then - ac_cv_prog_CXX="$CXX" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_CXX="$ac_tool_prefix$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -CXX=$ac_cv_prog_CXX -if test -n "$CXX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5 -$as_echo "$CXX" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - test -n "$CXX" && break - done -fi -if test -z "$CXX"; then - ac_ct_CXX=$CXX - for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_ac_ct_CXX+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$ac_ct_CXX"; then - ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_CXX="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -ac_ct_CXX=$ac_cv_prog_ac_ct_CXX -if test -n "$ac_ct_CXX"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5 -$as_echo "$ac_ct_CXX" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - test -n "$ac_ct_CXX" && break -done - - if test "x$ac_ct_CXX" = x; then - CXX="g++" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - CXX=$ac_ct_CXX - fi -fi - - fi -fi -# Provide some information about the compiler. -$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5 -set X $ac_compile -ac_compiler=$2 -for ac_option in --version -v -V -qversion; do - { { ac_try="$ac_compiler $ac_option >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_compiler $ac_option >&5") 2>conftest.err - ac_status=$? - if test -s conftest.err; then - sed '10a\ -... rest of stderr output deleted ... - 10q' conftest.err >conftest.er1 - cat conftest.er1 >&5 - fi - rm -f conftest.er1 conftest.err - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } -done - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5 -$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; } -if ${ac_cv_cxx_compiler_gnu+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ - -int -main () -{ -#ifndef __GNUC__ - choke me -#endif - - ; - return 0; -} -_ACEOF -if ac_fn_cxx_try_compile "$LINENO"; then : - ac_compiler_gnu=yes -else - ac_compiler_gnu=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -ac_cv_cxx_compiler_gnu=$ac_compiler_gnu - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5 -$as_echo "$ac_cv_cxx_compiler_gnu" >&6; } -if test $ac_compiler_gnu = yes; then - GXX=yes -else - GXX= -fi -ac_test_CXXFLAGS=${CXXFLAGS+set} -ac_save_CXXFLAGS=$CXXFLAGS -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5 -$as_echo_n "checking whether $CXX accepts -g... " >&6; } -if ${ac_cv_prog_cxx_g+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_save_cxx_werror_flag=$ac_cxx_werror_flag - ac_cxx_werror_flag=yes - ac_cv_prog_cxx_g=no - CXXFLAGS="-g" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_cxx_try_compile "$LINENO"; then : - ac_cv_prog_cxx_g=yes -else - CXXFLAGS="" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_cxx_try_compile "$LINENO"; then : - -else - ac_cxx_werror_flag=$ac_save_cxx_werror_flag - CXXFLAGS="-g" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_cxx_try_compile "$LINENO"; then : - ac_cv_prog_cxx_g=yes -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_cxx_werror_flag=$ac_save_cxx_werror_flag -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5 -$as_echo "$ac_cv_prog_cxx_g" >&6; } -if test "$ac_test_CXXFLAGS" = set; then - CXXFLAGS=$ac_save_CXXFLAGS -elif test $ac_cv_prog_cxx_g = yes; then - if test "$GXX" = yes; then - CXXFLAGS="-g -O2" - else - CXXFLAGS="-g" - fi -else - if test "$GXX" = yes; then - CXXFLAGS="-O2" - else - CXXFLAGS= - fi -fi -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - -depcc="$CXX" am_compiler_list= - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5 -$as_echo_n "checking dependency style of $depcc... " >&6; } -if ${am_cv_CXX_dependencies_compiler_type+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then - # We make a subdir and do the tests there. Otherwise we can end up - # making bogus files that we don't know about and never remove. For - # instance it was reported that on HP-UX the gcc test will end up - # making a dummy file named `D' -- because `-MD' means `put the output - # in D'. - rm -rf conftest.dir - mkdir conftest.dir - # Copy depcomp to subdir because otherwise we won't find it if we're - # using a relative directory. - cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. 
- mkdir sub - - am_cv_CXX_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` - fi - am__universal=false - case " $depcc " in #( - *\ -arch\ *\ -arch\ *) am__universal=true ;; - esac - - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with - # Solaris 8's {/usr,}/bin/sh. - touch sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - - # We check with `-c' and `-o' for the sake of the "dashmstdout" - # mode. It turns out that the SunPro C++ compiler does not properly - # handle `-M -o', and we need to detect this. Also, some Intel - # versions had trouble with output in subdirs - am__obj=sub/conftest.${OBJEXT-o} - am__minus_obj="-o $am__obj" - case $depmode in - gcc) - # This depmode causes a compiler race in universal mode. - test "$am__universal" = false || continue - ;; - nosideeffect) - # after this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested - if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - msvc7 | msvc7msys | msvisualcpp | msvcmsys) - # This compiler won't grok `-c -o', but also, the minuso test has - # not run yet. These depmodes are late enough in the game, and - # so weak that their functioning should not be impacted. - am__obj=conftest.${OBJEXT-o} - am__minus_obj= - ;; - none) break ;; - esac - if depmode=$depmode \ - source=sub/conftest.c object=$am__obj \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep $am__obj sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_CXX_dependencies_compiler_type=$depmode - break - fi - fi - done - - cd .. 
- rm -rf conftest.dir -else - am_cv_CXX_dependencies_compiler_type=none -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CXX_dependencies_compiler_type" >&5 -$as_echo "$am_cv_CXX_dependencies_compiler_type" >&6; } -CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type - - if - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then - am__fastdepCXX_TRUE= - am__fastdepCXX_FALSE='#' -else - am__fastdepCXX_TRUE='#' - am__fastdepCXX_FALSE= -fi - - - - OPENMP_CFLAGS= - # Check whether --enable-openmp was given. -if test "${enable_openmp+set}" = set; then : - enableval=$enable_openmp; -fi - - if test "$enable_openmp" != no; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to support OpenMP" >&5 -$as_echo_n "checking for $CC option to support OpenMP... " >&6; } -if ${ac_cv_prog_c_openmp+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -#ifndef _OPENMP - choke me -#endif -#include <omp.h> -int main () { return omp_get_num_threads (); } - -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_prog_c_openmp='none needed' -else - ac_cv_prog_c_openmp='unsupported' - for ac_option in -fopenmp -xopenmp -openmp -mp -omp -qsmp=omp; do - ac_save_CFLAGS=$CFLAGS - CFLAGS="$CFLAGS $ac_option" - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -#ifndef _OPENMP - choke me -#endif -#include <omp.h> -int main () { return omp_get_num_threads (); } - -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_prog_c_openmp=$ac_option -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext - CFLAGS=$ac_save_CFLAGS - if test "$ac_cv_prog_c_openmp" != unsupported; then - break - fi - done -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_c_openmp" >&5 -$as_echo "$ac_cv_prog_c_openmp" >&6; } - case $ac_cv_prog_c_openmp in #( - "none needed" | unsupported) - ;; #( - *) - OPENMP_CFLAGS=$ac_cv_prog_c_openmp ;; - esac - fi - - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 -$as_echo_n "checking for ANSI C header files... " >&6; } -if ${ac_cv_header_stdc+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include <stdlib.h> -#include <stdarg.h> -#include <string.h> -#include <float.h> - -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - ac_cv_header_stdc=yes -else - ac_cv_header_stdc=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - -if test $ac_cv_header_stdc = yes; then - # SunOS 4.x string.h does not declare mem*, contrary to ANSI. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include <string.h> - -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "memchr" >/dev/null 2>&1; then : - -else - ac_cv_header_stdc=no -fi -rm -f conftest* - -fi - -if test $ac_cv_header_stdc = yes; then - # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include <stdlib.h> - -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "free" >/dev/null 2>&1; then : - -else - ac_cv_header_stdc=no -fi -rm -f conftest* - -fi - -if test $ac_cv_header_stdc = yes; then - # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
- if test "$cross_compiling" = yes; then : - : -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -#include -#if ((' ' & 0x0FF) == 0x020) -# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') -# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) -#else -# define ISLOWER(c) \ - (('a' <= (c) && (c) <= 'i') \ - || ('j' <= (c) && (c) <= 'r') \ - || ('s' <= (c) && (c) <= 'z')) -# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c)) -#endif - -#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) -int -main () -{ - int i; - for (i = 0; i < 256; i++) - if (XOR (islower (i), ISLOWER (i)) - || toupper (i) != TOUPPER (i)) - return 2; - return 0; -} -_ACEOF -if ac_fn_c_try_run "$LINENO"; then : - -else - ac_cv_header_stdc=no -fi -rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ - conftest.$ac_objext conftest.beam conftest.$ac_ext -fi - -fi -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 -$as_echo "$ac_cv_header_stdc" >&6; } -if test $ac_cv_header_stdc = yes; then - -$as_echo "#define STDC_HEADERS 1" >>confdefs.h - -fi - -# On IRIX 5.3, sys/types and inttypes.h are conflicting. -for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ - inttypes.h stdint.h unistd.h -do : - as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` -ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default -" -if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : - cat >>confdefs.h <<_ACEOF -#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 -_ACEOF - -fi - -done - - -for ac_header in sys/endian.h sys/param.h syslog.h -do : - as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` -ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" -if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : - cat >>confdefs.h <<_ACEOF -#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 -_ACEOF - -fi - -done - -# sys/sysctl.h requires sys/types.h on FreeBSD -# sys/sysctl.h requires sys/param.h on OpenBSD -for ac_header in sys/sysctl.h -do : - ac_fn_c_check_header_compile "$LINENO" "sys/sysctl.h" "ac_cv_header_sys_sysctl_h" "#include -#ifdef HAVE_SYS_PARAM_H -#include -#endif - -" -if test "x$ac_cv_header_sys_sysctl_h" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_SYS_SYSCTL_H 1 -_ACEOF - -fi - -done - - -ac_fn_c_check_decl "$LINENO" "be32dec" "ac_cv_have_decl_be32dec" "$ac_includes_default -#ifdef HAVE_SYS_ENDIAN_H -#include -#endif - -" -if test "x$ac_cv_have_decl_be32dec" = xyes; then : - ac_have_decl=1 -else - ac_have_decl=0 -fi - -cat >>confdefs.h <<_ACEOF -#define HAVE_DECL_BE32DEC $ac_have_decl -_ACEOF -ac_fn_c_check_decl "$LINENO" "le32dec" "ac_cv_have_decl_le32dec" "$ac_includes_default -#ifdef HAVE_SYS_ENDIAN_H -#include -#endif - -" -if test "x$ac_cv_have_decl_le32dec" = xyes; then : - ac_have_decl=1 -else - ac_have_decl=0 -fi - -cat >>confdefs.h <<_ACEOF -#define HAVE_DECL_LE32DEC $ac_have_decl -_ACEOF -ac_fn_c_check_decl "$LINENO" "be32enc" "ac_cv_have_decl_be32enc" "$ac_includes_default -#ifdef HAVE_SYS_ENDIAN_H -#include -#endif - -" -if test "x$ac_cv_have_decl_be32enc" = xyes; then : - ac_have_decl=1 -else - ac_have_decl=0 -fi - -cat >>confdefs.h <<_ACEOF -#define HAVE_DECL_BE32ENC $ac_have_decl -_ACEOF -ac_fn_c_check_decl "$LINENO" "le32enc" "ac_cv_have_decl_le32enc" "$ac_includes_default -#ifdef HAVE_SYS_ENDIAN_H -#include -#endif - -" -if test "x$ac_cv_have_decl_le32enc" = xyes; then : - ac_have_decl=1 -else - ac_have_decl=0 -fi - 
-cat >>confdefs.h <<_ACEOF -#define HAVE_DECL_LE32ENC $ac_have_decl -_ACEOF - - -ac_fn_c_check_type "$LINENO" "size_t" "ac_cv_type_size_t" "$ac_includes_default" -if test "x$ac_cv_type_size_t" = xyes; then : - -else - -cat >>confdefs.h <<_ACEOF -#define size_t unsigned int -_ACEOF - -fi - -# The Ultrix 4.2 mips builtin alloca declared by alloca.h only works -# for constant arguments. Useless! -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for working alloca.h" >&5 -$as_echo_n "checking for working alloca.h... " >&6; } -if ${ac_cv_working_alloca_h+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -int -main () -{ -char *p = (char *) alloca (2 * sizeof (int)); - if (p) return 0; - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_working_alloca_h=yes -else - ac_cv_working_alloca_h=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_working_alloca_h" >&5 -$as_echo "$ac_cv_working_alloca_h" >&6; } -if test $ac_cv_working_alloca_h = yes; then - -$as_echo "#define HAVE_ALLOCA_H 1" >>confdefs.h - -fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for alloca" >&5 -$as_echo_n "checking for alloca... " >&6; } -if ${ac_cv_func_alloca_works+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#ifdef __GNUC__ -# define alloca __builtin_alloca -#else -# ifdef _MSC_VER -# include -# define alloca _alloca -# else -# ifdef HAVE_ALLOCA_H -# include -# else -# ifdef _AIX - #pragma alloca -# else -# ifndef alloca /* predefined by HP cc +Olibcalls */ -void *alloca (size_t); -# endif -# endif -# endif -# endif -#endif - -int -main () -{ -char *p = (char *) alloca (1); - if (p) return 0; - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_func_alloca_works=yes -else - ac_cv_func_alloca_works=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_alloca_works" >&5 -$as_echo "$ac_cv_func_alloca_works" >&6; } - -if test $ac_cv_func_alloca_works = yes; then - -$as_echo "#define HAVE_ALLOCA 1" >>confdefs.h - -else - # The SVR3 libPW and SVR4 libucb both contain incompatible functions -# that cause trouble. Some versions do not even contain alloca or -# contain a buggy version. If you still want to use their alloca, -# use ar to extract alloca.o from them instead of compiling alloca.c. - -ALLOCA=\${LIBOBJDIR}alloca.$ac_objext - -$as_echo "#define C_ALLOCA 1" >>confdefs.h - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether \`alloca.c' needs Cray hooks" >&5 -$as_echo_n "checking whether \`alloca.c' needs Cray hooks... " >&6; } -if ${ac_cv_os_cray+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#if defined CRAY && ! 
defined CRAY2 -webecray -#else -wenotbecray -#endif - -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "webecray" >/dev/null 2>&1; then : - ac_cv_os_cray=yes -else - ac_cv_os_cray=no -fi -rm -f conftest* - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_os_cray" >&5 -$as_echo "$ac_cv_os_cray" >&6; } -if test $ac_cv_os_cray = yes; then - for ac_func in _getb67 GETB67 getb67; do - as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` -ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" -if eval test \"x\$"$as_ac_var"\" = x"yes"; then : - -cat >>confdefs.h <<_ACEOF -#define CRAY_STACKSEG_END $ac_func -_ACEOF - - break -fi - - done -fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking stack direction for C alloca" >&5 -$as_echo_n "checking stack direction for C alloca... " >&6; } -if ${ac_cv_c_stack_direction+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test "$cross_compiling" = yes; then : - ac_cv_c_stack_direction=0 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -$ac_includes_default -int -find_stack_direction () -{ - static char *addr = 0; - auto char dummy; - if (addr == 0) - { - addr = &dummy; - return find_stack_direction (); - } - else - return (&dummy > addr) ? 1 : -1; -} - -int -main () -{ - return find_stack_direction () < 0; -} -_ACEOF -if ac_fn_c_try_run "$LINENO"; then : - ac_cv_c_stack_direction=1 -else - ac_cv_c_stack_direction=-1 -fi -rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ - conftest.$ac_objext conftest.beam conftest.$ac_ext -fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_stack_direction" >&5 -$as_echo "$ac_cv_c_stack_direction" >&6; } -cat >>confdefs.h <<_ACEOF -#define STACK_DIRECTION $ac_cv_c_stack_direction -_ACEOF - - -fi - -for ac_func in getopt_long -do : - ac_fn_c_check_func "$LINENO" "getopt_long" "ac_cv_func_getopt_long" -if test "x$ac_cv_func_getopt_long" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_GETOPT_LONG 1 -_ACEOF - -fi -done - - -case $target in - i*86-*-*) - have_x86=true - ;; - x86_64-*-*) - have_x86=true - have_x86_64=true - ;; - amd64-*-*) - have_x86=true - have_x86_64=true - ;; -esac - -PTHREAD_FLAGS="-pthread" -WS2_LIBS="" - -case $target in - *-*-mingw*) - have_win32=true - PTHREAD_FLAGS="" - WS2_LIBS="-lws2_32" - ;; -esac - -if test x$have_x86 = xtrue -then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX code" >&5 -$as_echo_n "checking whether we can compile AVX code... " >&6; } - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ -asm ("vmovdqa %ymm0, %ymm1"); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - -$as_echo "#define USE_AVX 1" >>confdefs.h - - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile XOP code" >&5 -$as_echo_n "checking whether we can compile XOP code... " >&6; } - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ -asm ("vprotd \$7, %xmm0, %xmm1"); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - -$as_echo "#define USE_XOP 1" >>confdefs.h - - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the XOP instruction set." 
>&5 -$as_echo "$as_me: WARNING: The assembler does not support the XOP instruction set." >&2;} - -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we can compile AVX2 code" >&5 -$as_echo_n "checking whether we can compile AVX2 code... " >&6; } - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ -asm ("vpaddd %ymm0, %ymm1, %ymm2"); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - -$as_echo "#define USE_AVX2 1" >>confdefs.h - - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX2 instruction set." >&5 -$as_echo "$as_me: WARNING: The assembler does not support the AVX2 instruction set." >&2;} - -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: The assembler does not support the AVX instruction set." >&5 -$as_echo "$as_me: WARNING: The assembler does not support the AVX instruction set." >&2;} - -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for json_loads in -ljansson" >&5 -$as_echo_n "checking for json_loads in -ljansson... " >&6; } -if ${ac_cv_lib_jansson_json_loads+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_check_lib_save_LIBS=$LIBS -LIBS="-ljansson $LIBS" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char json_loads (); -int -main () -{ -return json_loads (); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_lib_jansson_json_loads=yes -else - ac_cv_lib_jansson_json_loads=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_jansson_json_loads" >&5 -$as_echo "$ac_cv_lib_jansson_json_loads" >&6; } -if test "x$ac_cv_lib_jansson_json_loads" = xyes; then : - request_jansson=false -else - request_jansson=true -fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5 -$as_echo_n "checking for pthread_create in -lpthread... " >&6; } -if ${ac_cv_lib_pthread_pthread_create+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lpthread $LIBS" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. 
*/ -#ifdef __cplusplus -extern "C" -#endif -char pthread_create (); -int -main () -{ -return pthread_create (); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_lib_pthread_pthread_create=yes -else - ac_cv_lib_pthread_pthread_create=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthread_pthread_create" >&5 -$as_echo "$ac_cv_lib_pthread_pthread_create" >&6; } -if test "x$ac_cv_lib_pthread_pthread_create" = xyes; then : - PTHREAD_LIBS="-lpthread" -else - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC2" >&5 -$as_echo_n "checking for pthread_create in -lpthreadGC2... " >&6; } -if ${ac_cv_lib_pthreadGC2_pthread_create+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lpthreadGC2 $LIBS" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char pthread_create (); -int -main () -{ -return pthread_create (); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_lib_pthreadGC2_pthread_create=yes -else - ac_cv_lib_pthreadGC2_pthread_create=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC2_pthread_create" >&5 -$as_echo "$ac_cv_lib_pthreadGC2_pthread_create" >&6; } -if test "x$ac_cv_lib_pthreadGC2_pthread_create" = xyes; then : - PTHREAD_LIBS="-lpthreadGC2" -else - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC1" >&5 -$as_echo_n "checking for pthread_create in -lpthreadGC1... " >&6; } -if ${ac_cv_lib_pthreadGC1_pthread_create+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lpthreadGC1 $LIBS" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char pthread_create (); -int -main () -{ -return pthread_create (); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_lib_pthreadGC1_pthread_create=yes -else - ac_cv_lib_pthreadGC1_pthread_create=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC1_pthread_create" >&5 -$as_echo "$ac_cv_lib_pthreadGC1_pthread_create" >&6; } -if test "x$ac_cv_lib_pthreadGC1_pthread_create" = xyes; then : - PTHREAD_LIBS="-lpthreadGC1" -else - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthreadGC" >&5 -$as_echo_n "checking for pthread_create in -lpthreadGC... " >&6; } -if ${ac_cv_lib_pthreadGC_pthread_create+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lpthreadGC $LIBS" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. 
- Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char pthread_create (); -int -main () -{ -return pthread_create (); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_lib_pthreadGC_pthread_create=yes -else - ac_cv_lib_pthreadGC_pthread_create=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pthreadGC_pthread_create" >&5 -$as_echo "$ac_cv_lib_pthreadGC_pthread_create" >&6; } -if test "x$ac_cv_lib_pthreadGC_pthread_create" = xyes; then : - PTHREAD_LIBS="-lpthreadGC" - -fi - -fi - -fi - -fi - - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for SSL_library_init in -lssl" >&5 -$as_echo_n "checking for SSL_library_init in -lssl... " >&6; } -if ${ac_cv_lib_ssl_SSL_library_init+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lssl $LIBS" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char SSL_library_init (); -int -main () -{ -return SSL_library_init (); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_lib_ssl_SSL_library_init=yes -else - ac_cv_lib_ssl_SSL_library_init=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_ssl_SSL_library_init" >&5 -$as_echo "$ac_cv_lib_ssl_SSL_library_init" >&6; } -if test "x$ac_cv_lib_ssl_SSL_library_init" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_LIBSSL 1 -_ACEOF - - LIBS="-lssl $LIBS" - -else - as_fn_error $? "OpenSSL library required" "$LINENO" 5 -fi - -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for EVP_DigestFinal_ex in -lcrypto" >&5 -$as_echo_n "checking for EVP_DigestFinal_ex in -lcrypto... " >&6; } -if ${ac_cv_lib_crypto_EVP_DigestFinal_ex+:} false; then : - $as_echo_n "(cached) " >&6 -else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lcrypto $LIBS" -cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char EVP_DigestFinal_ex (); -int -main () -{ -return EVP_DigestFinal_ex (); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - ac_cv_lib_crypto_EVP_DigestFinal_ex=yes -else - ac_cv_lib_crypto_EVP_DigestFinal_ex=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_crypto_EVP_DigestFinal_ex" >&5 -$as_echo "$ac_cv_lib_crypto_EVP_DigestFinal_ex" >&6; } -if test "x$ac_cv_lib_crypto_EVP_DigestFinal_ex" = xyes; then : - cat >>confdefs.h <<_ACEOF -#define HAVE_LIBCRYPTO 1 -_ACEOF - - LIBS="-lcrypto $LIBS" - -else - as_fn_error $? 
"OpenSSL library required" "$LINENO" 5 -fi - - - if test x$request_jansson = xtrue; then - WANT_JANSSON_TRUE= - WANT_JANSSON_FALSE='#' -else - WANT_JANSSON_TRUE='#' - WANT_JANSSON_FALSE= -fi - - if test x$have_win32 = xtrue; then - HAVE_WINDOWS_TRUE= - HAVE_WINDOWS_FALSE='#' -else - HAVE_WINDOWS_TRUE='#' - HAVE_WINDOWS_FALSE= -fi - - if test x$have_x86 = xtrue; then - ARCH_x86_TRUE= - ARCH_x86_FALSE='#' -else - ARCH_x86_TRUE='#' - ARCH_x86_FALSE= -fi - - if test x$have_x86_64 = xtrue; then - ARCH_x86_64_TRUE= - ARCH_x86_64_FALSE='#' -else - ARCH_x86_64_TRUE='#' - ARCH_x86_64_FALSE= -fi - - -if test x$request_jansson = xtrue -then - JANSSON_LIBS="compat/jansson/libjansson.a" -else - JANSSON_LIBS=-ljansson -fi - - - - - - - - - - - - - - - - - - - - - - - - - - -# Check whether --with-libcurl was given. -if test "${with_libcurl+set}" = set; then : - withval=$with_libcurl; _libcurl_with=$withval -else - _libcurl_with=yes -fi - - - if test "$_libcurl_with" != "no" ; then - - for ac_prog in gawk mawk nawk awk -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_AWK+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$AWK"; then - ac_cv_prog_AWK="$AWK" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_AWK="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -AWK=$ac_cv_prog_AWK -if test -n "$AWK"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AWK" >&5 -$as_echo "$AWK" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - test -n "$AWK" && break -done - - - _libcurl_version_parse="eval $AWK '{split(\$NF,A,\".\"); X=256*256*A[1]+256*A[2]+A[3]; print X;}'" - - _libcurl_try_link=yes - - if test -d "$_libcurl_with" ; then - LIBCURL_CPPFLAGS="-I$withval/include" - _libcurl_ldflags="-L$withval/lib" - # Extract the first word of "curl-config", so it can be a program name with args. -set dummy curl-config; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path__libcurl_config+:} false; then : - $as_echo_n "(cached) " >&6 -else - case $_libcurl_config in - [\\/]* | ?:[\\/]*) - ac_cv_path__libcurl_config="$_libcurl_config" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in "$withval/bin" -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_path__libcurl_config="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - - ;; -esac -fi -_libcurl_config=$ac_cv_path__libcurl_config -if test -n "$_libcurl_config"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $_libcurl_config" >&5 -$as_echo "$_libcurl_config" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - else - # Extract the first word of "curl-config", so it can be a program name with args. -set dummy curl-config; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_path__libcurl_config+:} false; then : - $as_echo_n "(cached) " >&6 -else - case $_libcurl_config in - [\\/]* | ?:[\\/]*) - ac_cv_path__libcurl_config="$_libcurl_config" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_path__libcurl_config="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - - ;; -esac -fi -_libcurl_config=$ac_cv_path__libcurl_config -if test -n "$_libcurl_config"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $_libcurl_config" >&5 -$as_echo "$_libcurl_config" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - - fi - - if test x$_libcurl_config != "x" ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for the version of libcurl" >&5 -$as_echo_n "checking for the version of libcurl... " >&6; } -if ${libcurl_cv_lib_curl_version+:} false; then : - $as_echo_n "(cached) " >&6 -else - libcurl_cv_lib_curl_version=`$_libcurl_config --version | $AWK '{print $2}'` -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libcurl_cv_lib_curl_version" >&5 -$as_echo "$libcurl_cv_lib_curl_version" >&6; } - - _libcurl_version=`echo $libcurl_cv_lib_curl_version | $_libcurl_version_parse` - _libcurl_wanted=`echo 7.15.2 | $_libcurl_version_parse` - - if test $_libcurl_wanted -gt 0 ; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for libcurl >= version 7.15.2" >&5 -$as_echo_n "checking for libcurl >= version 7.15.2... " >&6; } -if ${libcurl_cv_lib_version_ok+:} false; then : - $as_echo_n "(cached) " >&6 -else - - if test $_libcurl_version -ge $_libcurl_wanted ; then - libcurl_cv_lib_version_ok=yes - else - libcurl_cv_lib_version_ok=no - fi - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libcurl_cv_lib_version_ok" >&5 -$as_echo "$libcurl_cv_lib_version_ok" >&6; } - fi - - if test $_libcurl_wanted -eq 0 || test x$libcurl_cv_lib_version_ok = xyes ; then - if test x"$LIBCURL_CPPFLAGS" = "x" ; then - LIBCURL_CPPFLAGS=`$_libcurl_config --cflags` - fi - if test x"$LIBCURL" = "x" ; then - LIBCURL=`$_libcurl_config --libs` - - # This is so silly, but Apple actually has a bug in their - # curl-config script. Fixed in Tiger, but there are still - # lots of Panther installs around. 
- case "${host}" in - powerpc-apple-darwin7*) - LIBCURL=`echo $LIBCURL | sed -e 's|-arch i386||g'` - ;; - esac - fi - - # All curl-config scripts support --feature - _libcurl_features=`$_libcurl_config --feature` - - # Is it modern enough to have --protocols? (7.12.4) - if test $_libcurl_version -ge 461828 ; then - _libcurl_protocols=`$_libcurl_config --protocols` - fi - else - _libcurl_try_link=no - fi - - unset _libcurl_wanted - fi - - if test $_libcurl_try_link = yes ; then - - # we didn't find curl-config, so let's see if the user-supplied - # link line (or failing that, "-lcurl") is enough. - LIBCURL=${LIBCURL-"$_libcurl_ldflags -lcurl"} - - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether libcurl is usable" >&5 -$as_echo_n "checking whether libcurl is usable... " >&6; } -if ${libcurl_cv_lib_curl_usable+:} false; then : - $as_echo_n "(cached) " >&6 -else - - _libcurl_save_cppflags=$CPPFLAGS - CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS" - _libcurl_save_libs=$LIBS - LIBS="$LIBCURL $LIBS" - - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ -#include -int -main () -{ - -/* Try and use a few common options to force a failure if we are - missing symbols or can't link. */ -int x; -curl_easy_setopt(NULL,CURLOPT_URL,NULL); -x=CURL_ERROR_SIZE; -x=CURLOPT_WRITEFUNCTION; -x=CURLOPT_FILE; -x=CURLOPT_ERRORBUFFER; -x=CURLOPT_STDERR; -x=CURLOPT_VERBOSE; - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - libcurl_cv_lib_curl_usable=yes -else - libcurl_cv_lib_curl_usable=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext - - CPPFLAGS=$_libcurl_save_cppflags - LIBS=$_libcurl_save_libs - unset _libcurl_save_cppflags - unset _libcurl_save_libs - -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libcurl_cv_lib_curl_usable" >&5 -$as_echo "$libcurl_cv_lib_curl_usable" >&6; } - - if test $libcurl_cv_lib_curl_usable = yes ; then - - # Does curl_free() exist in this version of libcurl? 
- # If not, fake it with free() - - _libcurl_save_cppflags=$CPPFLAGS - CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS" - _libcurl_save_libs=$LIBS - LIBS="$LIBS $LIBCURL" - - ac_fn_c_check_func "$LINENO" "curl_free" "ac_cv_func_curl_free" -if test "x$ac_cv_func_curl_free" = xyes; then : - -else - -$as_echo "#define curl_free free" >>confdefs.h - -fi - - - CPPFLAGS=$_libcurl_save_cppflags - LIBS=$_libcurl_save_libs - unset _libcurl_save_cppflags - unset _libcurl_save_libs - - -$as_echo "#define HAVE_LIBCURL 1" >>confdefs.h - - - - - for _libcurl_feature in $_libcurl_features ; do - cat >>confdefs.h <<_ACEOF -#define `$as_echo "libcurl_feature_$_libcurl_feature" | $as_tr_cpp` 1 -_ACEOF - - eval `$as_echo "libcurl_feature_$_libcurl_feature" | $as_tr_sh`=yes - done - - if test "x$_libcurl_protocols" = "x" ; then - - # We don't have --protocols, so just assume that all - # protocols are available - _libcurl_protocols="HTTP FTP FILE TELNET LDAP DICT TFTP" - - if test x$libcurl_feature_SSL = xyes ; then - _libcurl_protocols="$_libcurl_protocols HTTPS" - - # FTPS wasn't standards-compliant until version - # 7.11.0 (0x070b00 == 461568) - if test $_libcurl_version -ge 461568; then - _libcurl_protocols="$_libcurl_protocols FTPS" - fi - fi - - # RTSP, IMAP, POP3 and SMTP were added in - # 7.20.0 (0x071400 == 463872) - if test $_libcurl_version -ge 463872; then - _libcurl_protocols="$_libcurl_protocols RTSP IMAP POP3 SMTP" - fi - fi - - for _libcurl_protocol in $_libcurl_protocols ; do - cat >>confdefs.h <<_ACEOF -#define `$as_echo "libcurl_protocol_$_libcurl_protocol" | $as_tr_cpp` 1 -_ACEOF - - eval `$as_echo "libcurl_protocol_$_libcurl_protocol" | $as_tr_sh`=yes - done - else - unset LIBCURL - unset LIBCURL_CPPFLAGS - fi - fi - - unset _libcurl_try_link - unset _libcurl_version_parse - unset _libcurl_config - unset _libcurl_feature - unset _libcurl_features - unset _libcurl_protocol - unset _libcurl_protocols - unset _libcurl_version - unset _libcurl_ldflags - fi - - if test x$_libcurl_with = xno || test x$libcurl_cv_lib_curl_usable != xyes ; then - # This is the IF-NO path - as_fn_error $? "Missing required libcurl >= 7.15.2" "$LINENO" 5 - else - # This is the IF-YES path - : - fi - - unset _libcurl_with - - - - - - - -ac_config_files="$ac_config_files Makefile compat/Makefile compat/jansson/Makefile" - - -ARCH=`uname -m` -if [ $ARCH == "x86_64" ]; -then - SUFFIX="64" -else - SUFFIX="" -fi - - -# Check whether --with-cuda was given. -if test "${with_cuda+set}" = set; then : - withval=$with_cuda; -fi - - -if test -n "$with_cuda" -then - CUDA_CFLAGS="-I$with_cuda/include" - CUDA_LIBS="-lcudart" - CUDA_LDFLAGS="-L$with_cuda/lib$SUFFIX" - NVCC="$with_cuda/bin/nvcc" -else - CUDA_CFLAGS="-I/usr/local/cuda/include" - CUDA_LIBS="-lcudart -static-libstdc++" - CUDA_LDFLAGS="-L/usr/local/cuda/lib$SUFFIX" - NVCC="nvcc" -fi - - - - - - - -cat >confcache <<\_ACEOF -# This file is a shell script that caches the results of configure -# tests run on this system so they can be shared between configure -# scripts and configure runs, see configure's option --config-cache. -# It is not useful on other systems. If it contains results you don't -# want to keep, you may remove or edit it. -# -# config.status only pays attention to the cache file if you give it -# the --recheck option to rerun configure. -# -# `ac_cv_env_foo' variables (set or unset) will be overridden when -# loading this file, other *unset* `ac_cv_foo' will be assigned the -# following values. 
- -_ACEOF - -# The following way of writing the cache mishandles newlines in values, -# but we know of no workaround that is simple, portable, and efficient. -# So, we kill variables containing newlines. -# Ultrix sh set writes to stderr and can't be redirected directly, -# and sets the high bit in the cache file unless we assign to the vars. -( - for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do - eval ac_val=\$$ac_var - case $ac_val in #( - *${as_nl}*) - case $ac_var in #( - *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 -$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; - esac - case $ac_var in #( - _ | IFS | as_nl) ;; #( - BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( - *) { eval $ac_var=; unset $ac_var;} ;; - esac ;; - esac - done - - (set) 2>&1 | - case $as_nl`(ac_space=' '; set) 2>&1` in #( - *${as_nl}ac_space=\ *) - # `set' does not quote correctly, so add quotes: double-quote - # substitution turns \\\\ into \\, and sed turns \\ into \. - sed -n \ - "s/'/'\\\\''/g; - s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" - ;; #( - *) - # `set' quotes correctly as required by POSIX, so do not add quotes. - sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" - ;; - esac | - sort -) | - sed ' - /^ac_cv_env_/b end - t clear - :clear - s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ - t end - s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ - :end' >>confcache -if diff "$cache_file" confcache >/dev/null 2>&1; then :; else - if test -w "$cache_file"; then - if test "x$cache_file" != "x/dev/null"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 -$as_echo "$as_me: updating cache $cache_file" >&6;} - if test ! -f "$cache_file" || test -h "$cache_file"; then - cat confcache >"$cache_file" - else - case $cache_file in #( - */* | ?:*) - mv -f confcache "$cache_file"$$ && - mv -f "$cache_file"$$ "$cache_file" ;; #( - *) - mv -f confcache "$cache_file" ;; - esac - fi - fi - else - { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 -$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} - fi -fi -rm -f confcache - -test "x$prefix" = xNONE && prefix=$ac_default_prefix -# Let make expand exec_prefix. -test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' - -DEFS=-DHAVE_CONFIG_H - -ac_libobjs= -ac_ltlibobjs= -U= -for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue - # 1. Remove the extension, and $U if already installed. - ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' - ac_i=`$as_echo "$ac_i" | sed "$ac_script"` - # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR - # will be set to the directory where LIBOBJS objects are built. - as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" - as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo' -done -LIBOBJS=$ac_libobjs - -LTLIBOBJS=$ac_ltlibobjs - - - if test -n "$EXEEXT"; then - am__EXEEXT_TRUE= - am__EXEEXT_FALSE='#' -else - am__EXEEXT_TRUE='#' - am__EXEEXT_FALSE= -fi - -if test -z "${MAINTAINER_MODE_TRUE}" && test -z "${MAINTAINER_MODE_FALSE}"; then - as_fn_error $? "conditional \"MAINTAINER_MODE\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi -if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then - as_fn_error $? "conditional \"AMDEP\" was never defined. -Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 -fi -if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then - as_fn_error $? "conditional \"am__fastdepCC\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi -if test -z "${am__fastdepCCAS_TRUE}" && test -z "${am__fastdepCCAS_FALSE}"; then - as_fn_error $? "conditional \"am__fastdepCCAS\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi -if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then - as_fn_error $? "conditional \"am__fastdepCXX\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi -if test -z "${WANT_JANSSON_TRUE}" && test -z "${WANT_JANSSON_FALSE}"; then - as_fn_error $? "conditional \"WANT_JANSSON\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi -if test -z "${HAVE_WINDOWS_TRUE}" && test -z "${HAVE_WINDOWS_FALSE}"; then - as_fn_error $? "conditional \"HAVE_WINDOWS\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi -if test -z "${ARCH_x86_TRUE}" && test -z "${ARCH_x86_FALSE}"; then - as_fn_error $? "conditional \"ARCH_x86\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi -if test -z "${ARCH_x86_64_TRUE}" && test -z "${ARCH_x86_64_FALSE}"; then - as_fn_error $? "conditional \"ARCH_x86_64\" was never defined. -Usually this means the macro was only invoked conditionally." "$LINENO" 5 -fi - -: "${CONFIG_STATUS=./config.status}" -ac_write_fail=0 -ac_clean_files_save=$ac_clean_files -ac_clean_files="$ac_clean_files $CONFIG_STATUS" -{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 -$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} -as_write_fail=0 -cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 -#! $SHELL -# Generated by $as_me. -# Run this file to recreate the current configuration. -# Compiler output produced by configure, useful for debugging -# configure, is in config.log if it exists. - -debug=false -ac_cs_recheck=false -ac_cs_silent=false - -SHELL=\${CONFIG_SHELL-$SHELL} -export SHELL -_ASEOF -cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 -## -------------------- ## -## M4sh Initialization. ## -## -------------------- ## - -# Be more Bourne compatible -DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : - emulate sh - NULLCMD=: - # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which - # is contrary to our usage. Disable this feature. - alias -g '${1+"$@"}'='"$@"' - setopt NO_GLOB_SUBST -else - case `(set -o) 2>/dev/null` in #( - *posix*) : - set -o posix ;; #( - *) : - ;; -esac -fi - - -as_nl=' -' -export as_nl -# Printing a long string crashes Solaris 7 /usr/bin/printf. -as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo -# Prefer a ksh shell builtin over an external printf program on Solaris, -# but without wasting forks for bash or zsh. 
-if test -z "$BASH_VERSION$ZSH_VERSION" \ - && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='print -r --' - as_echo_n='print -rn --' -elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='printf %s\n' - as_echo_n='printf %s' -else - if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then - as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' - as_echo_n='/usr/ucb/echo -n' - else - as_echo_body='eval expr "X$1" : "X\\(.*\\)"' - as_echo_n_body='eval - arg=$1; - case $arg in #( - *"$as_nl"*) - expr "X$arg" : "X\\(.*\\)$as_nl"; - arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; - esac; - expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" - ' - export as_echo_n_body - as_echo_n='sh -c $as_echo_n_body as_echo' - fi - export as_echo_body - as_echo='sh -c $as_echo_body as_echo' -fi - -# The user is always right. -if test "${PATH_SEPARATOR+set}" != set; then - PATH_SEPARATOR=: - (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { - (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || - PATH_SEPARATOR=';' - } -fi - - -# IFS -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent editors from complaining about space-tab. -# (If _AS_PATH_WALK were called with IFS unset, it would disable word -# splitting by setting IFS to empty value.) -IFS=" "" $as_nl" - -# Find who we are. Look in the path if we contain no directory separator. -as_myself= -case $0 in #(( - *[\\/]* ) as_myself=$0 ;; - *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break - done -IFS=$as_save_IFS - - ;; -esac -# We did not find ourselves, most probably we were run as `sh COMMAND' -# in which case we are not to be found in the path. -if test "x$as_myself" = x; then - as_myself=$0 -fi -if test ! -f "$as_myself"; then - $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 - exit 1 -fi - -# Unset variables that we do not need and which cause bugs (e.g. in -# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" -# suppresses any "Segmentation fault" message there. '((' could -# trigger a bug in pdksh 5.2.14. -for as_var in BASH_ENV ENV MAIL MAILPATH -do eval test x\${$as_var+set} = xset \ - && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : -done -PS1='$ ' -PS2='> ' -PS4='+ ' - -# NLS nuisances. -LC_ALL=C -export LC_ALL -LANGUAGE=C -export LANGUAGE - -# CDPATH. -(unset CDPATH) >/dev/null 2>&1 && unset CDPATH - - -# as_fn_error STATUS ERROR [LINENO LOG_FD] -# ---------------------------------------- -# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are -# provided, also output the error to LOG_FD, referencing LINENO. Then exit the -# script with STATUS, using 1 if that was 0. -as_fn_error () -{ - as_status=$1; test $as_status -eq 0 && as_status=1 - if test "$4"; then - as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 - fi - $as_echo "$as_me: error: $2" >&2 - as_fn_exit $as_status -} # as_fn_error - - -# as_fn_set_status STATUS -# ----------------------- -# Set $? to STATUS, without forking. -as_fn_set_status () -{ - return $1 -} # as_fn_set_status - -# as_fn_exit STATUS -# ----------------- -# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. 
-as_fn_exit () -{ - set +e - as_fn_set_status $1 - exit $1 -} # as_fn_exit - -# as_fn_unset VAR -# --------------- -# Portably unset VAR. -as_fn_unset () -{ - { eval $1=; unset $1;} -} -as_unset=as_fn_unset -# as_fn_append VAR VALUE -# ---------------------- -# Append the text in VALUE to the end of the definition contained in VAR. Take -# advantage of any shell optimizations that allow amortized linear growth over -# repeated appends, instead of the typical quadratic growth present in naive -# implementations. -if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : - eval 'as_fn_append () - { - eval $1+=\$2 - }' -else - as_fn_append () - { - eval $1=\$$1\$2 - } -fi # as_fn_append - -# as_fn_arith ARG... -# ------------------ -# Perform arithmetic evaluation on the ARGs, and store the result in the -# global $as_val. Take advantage of shells that can avoid forks. The arguments -# must be portable across $(()) and expr. -if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : - eval 'as_fn_arith () - { - as_val=$(( $* )) - }' -else - as_fn_arith () - { - as_val=`expr "$@" || test $? -eq 1` - } -fi # as_fn_arith - - -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then - as_basename=basename -else - as_basename=false -fi - -if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then - as_dirname=dirname -else - as_dirname=false -fi - -as_me=`$as_basename -- "$0" || -$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ - X"$0" : 'X\(//\)$' \| \ - X"$0" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X/"$0" | - sed '/^.*\/\([^/][^/]*\)\/*$/{ - s//\1/ - q - } - /^X\/\(\/\/\)$/{ - s//\1/ - q - } - /^X\/\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - -# Avoid depending upon Character Ranges. -as_cr_letters='abcdefghijklmnopqrstuvwxyz' -as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' -as_cr_Letters=$as_cr_letters$as_cr_LETTERS -as_cr_digits='0123456789' -as_cr_alnum=$as_cr_Letters$as_cr_digits - -ECHO_C= ECHO_N= ECHO_T= -case `echo -n x` in #((((( --n*) - case `echo 'xy\c'` in - *c*) ECHO_T=' ';; # ECHO_T is single tab character. - xy) ECHO_C='\c';; - *) echo `echo ksh88 bug on AIX 6.1` > /dev/null - ECHO_T=' ';; - esac;; -*) - ECHO_N='-n';; -esac - -rm -f conf$$ conf$$.exe conf$$.file -if test -d conf$$.dir; then - rm -f conf$$.dir/conf$$.file -else - rm -f conf$$.dir - mkdir conf$$.dir 2>/dev/null -fi -if (echo >conf$$.file) 2>/dev/null; then - if ln -s conf$$.file conf$$ 2>/dev/null; then - as_ln_s='ln -s' - # ... but there are two gotchas: - # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. - # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. - # In both cases, we have to default to `cp -p'. - ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || - as_ln_s='cp -p' - elif ln conf$$.file conf$$ 2>/dev/null; then - as_ln_s=ln - else - as_ln_s='cp -p' - fi -else - as_ln_s='cp -p' -fi -rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file -rmdir conf$$.dir 2>/dev/null - - -# as_fn_mkdir_p -# ------------- -# Create "$as_dir" as a directory, including parents if necessary. 
-as_fn_mkdir_p () -{ - - case $as_dir in #( - -*) as_dir=./$as_dir;; - esac - test -d "$as_dir" || eval $as_mkdir_p || { - as_dirs= - while :; do - case $as_dir in #( - *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( - *) as_qdir=$as_dir;; - esac - as_dirs="'$as_qdir' $as_dirs" - as_dir=`$as_dirname -- "$as_dir" || -$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$as_dir" : 'X\(//\)[^/]' \| \ - X"$as_dir" : 'X\(//\)$' \| \ - X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_dir" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - test -d "$as_dir" && break - done - test -z "$as_dirs" || eval "mkdir $as_dirs" - } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" - - -} # as_fn_mkdir_p -if mkdir -p . 2>/dev/null; then - as_mkdir_p='mkdir -p "$as_dir"' -else - test -d ./-p && rmdir ./-p - as_mkdir_p=false -fi - -if test -x / >/dev/null 2>&1; then - as_test_x='test -x' -else - if ls -dL / >/dev/null 2>&1; then - as_ls_L_option=L - else - as_ls_L_option= - fi - as_test_x=' - eval sh -c '\'' - if test -d "$1"; then - test -d "$1/."; - else - case $1 in #( - -*)set "./$1";; - esac; - case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #(( - ???[sx]*):;;*)false;;esac;fi - '\'' sh - ' -fi -as_executable_p=$as_test_x - -# Sed expression to map a string onto a valid CPP name. -as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" - -# Sed expression to map a string onto a valid variable name. -as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" - - -exec 6>&1 -## ----------------------------------- ## -## Main body of $CONFIG_STATUS script. ## -## ----------------------------------- ## -_ASEOF -test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1 - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -# Save the log message, to keep $0 and so on meaningful, and to -# report actual input values of CONFIG_FILES etc. instead of their -# values after options handling. -ac_log=" -This file was extended by ccminer $as_me 2014.05.03, which was -generated by GNU Autoconf 2.68. Invocation command line was - - CONFIG_FILES = $CONFIG_FILES - CONFIG_HEADERS = $CONFIG_HEADERS - CONFIG_LINKS = $CONFIG_LINKS - CONFIG_COMMANDS = $CONFIG_COMMANDS - $ $0 $@ - -on `(hostname || uname -n) 2>/dev/null | sed 1q` -" - -_ACEOF - -case $ac_config_files in *" -"*) set x $ac_config_files; shift; ac_config_files=$*;; -esac - -case $ac_config_headers in *" -"*) set x $ac_config_headers; shift; ac_config_headers=$*;; -esac - - -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -# Files that config.status was made for. -config_files="$ac_config_files" -config_headers="$ac_config_headers" -config_commands="$ac_config_commands" - -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -ac_cs_usage="\ -\`$as_me' instantiates files and other configuration actions -from templates according to the current configuration. Unless the files -and actions are specified as TAGs, all are instantiated by default. - -Usage: $0 [OPTION]... [TAG]... 
- - -h, --help print this help, then exit - -V, --version print version number and configuration settings, then exit - --config print configuration, then exit - -q, --quiet, --silent - do not print progress messages - -d, --debug don't remove temporary files - --recheck update $as_me by reconfiguring in the same conditions - --file=FILE[:TEMPLATE] - instantiate the configuration file FILE - --header=FILE[:TEMPLATE] - instantiate the configuration header FILE - -Configuration files: -$config_files - -Configuration headers: -$config_headers - -Configuration commands: -$config_commands - -Report bugs to the package provider." - -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" -ac_cs_version="\\ -ccminer config.status 2014.05.03 -configured by $0, generated by GNU Autoconf 2.68, - with options \\"\$ac_cs_config\\" - -Copyright (C) 2010 Free Software Foundation, Inc. -This config.status script is free software; the Free Software Foundation -gives unlimited permission to copy, distribute and modify it." - -ac_pwd='$ac_pwd' -srcdir='$srcdir' -INSTALL='$INSTALL' -MKDIR_P='$MKDIR_P' -AWK='$AWK' -test -n "\$AWK" || AWK=awk -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -# The default lists apply if the user does not specify any file. -ac_need_defaults=: -while test $# != 0 -do - case $1 in - --*=?*) - ac_option=`expr "X$1" : 'X\([^=]*\)='` - ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` - ac_shift=: - ;; - --*=) - ac_option=`expr "X$1" : 'X\([^=]*\)='` - ac_optarg= - ac_shift=: - ;; - *) - ac_option=$1 - ac_optarg=$2 - ac_shift=shift - ;; - esac - - case $ac_option in - # Handling of the options. - -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) - ac_cs_recheck=: ;; - --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) - $as_echo "$ac_cs_version"; exit ;; - --config | --confi | --conf | --con | --co | --c ) - $as_echo "$ac_cs_config"; exit ;; - --debug | --debu | --deb | --de | --d | -d ) - debug=: ;; - --file | --fil | --fi | --f ) - $ac_shift - case $ac_optarg in - *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; - '') as_fn_error $? "missing file argument" ;; - esac - as_fn_append CONFIG_FILES " '$ac_optarg'" - ac_need_defaults=false;; - --header | --heade | --head | --hea ) - $ac_shift - case $ac_optarg in - *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; - esac - as_fn_append CONFIG_HEADERS " '$ac_optarg'" - ac_need_defaults=false;; - --he | --h) - # Conflict between --help and --header - as_fn_error $? "ambiguous option: \`$1' -Try \`$0 --help' for more information.";; - --help | --hel | -h ) - $as_echo "$ac_cs_usage"; exit ;; - -q | -quiet | --quiet | --quie | --qui | --qu | --q \ - | -silent | --silent | --silen | --sile | --sil | --si | --s) - ac_cs_silent=: ;; - - # This is an error. - -*) as_fn_error $? "unrecognized option: \`$1' -Try \`$0 --help' for more information." 
;; - - *) as_fn_append ac_config_targets " $1" - ac_need_defaults=false ;; - - esac - shift -done - -ac_configure_extra_args= - -if $ac_cs_silent; then - exec 6>/dev/null - ac_configure_extra_args="$ac_configure_extra_args --silent" -fi - -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -if \$ac_cs_recheck; then - set X '$SHELL' '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion - shift - \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 - CONFIG_SHELL='$SHELL' - export CONFIG_SHELL - exec "\$@" -fi - -_ACEOF -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -exec 5>>config.log -{ - echo - sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX -## Running $as_me. ## -_ASBOX - $as_echo "$ac_log" -} >&5 - -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -# -# INIT-COMMANDS -# -AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir" - -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 - -# Handling of arguments. -for ac_config_target in $ac_config_targets -do - case $ac_config_target in - "cpuminer-config.h") CONFIG_HEADERS="$CONFIG_HEADERS cpuminer-config.h" ;; - "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;; - "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; - "compat/Makefile") CONFIG_FILES="$CONFIG_FILES compat/Makefile" ;; - "compat/jansson/Makefile") CONFIG_FILES="$CONFIG_FILES compat/jansson/Makefile" ;; - - *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;; - esac -done - - -# If the user did not use the arguments to specify the items to instantiate, -# then the envvar interface is used. Set only those that are not. -# We use the long form for the default assignment because of an extremely -# bizarre bug on SunOS 4.1.3. -if $ac_need_defaults; then - test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files - test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers - test "${CONFIG_COMMANDS+set}" = set || CONFIG_COMMANDS=$config_commands -fi - -# Have a temporary directory for convenience. Make it in the build tree -# simply because there is no reason against having it here, and in addition, -# creating and moving files from /tmp can sometimes cause problems. -# Hook for its removal unless debugging. -# Note that there is a small window in which the directory will not be cleaned: -# after its creation but before its name has been assigned to `$tmp'. -$debug || -{ - tmp= ac_tmp= - trap 'exit_status=$? - : "${ac_tmp:=$tmp}" - { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status -' 0 - trap 'as_fn_exit 1' 1 2 13 15 -} -# Create a (secure) tmp directory for tmp files. - -{ - tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && - test -d "$tmp" -} || -{ - tmp=./conf$$-$RANDOM - (umask 077 && mkdir "$tmp") -} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5 -ac_tmp=$tmp - -# Set up the scripts for CONFIG_FILES section. -# No need to generate them if there are no CONFIG_FILES. -# This happens for instance with `./config.status config.h'. -if test -n "$CONFIG_FILES"; then - - -ac_cr=`echo X | tr X '\015'` -# On cygwin, bash can eat \r inside `` if the user requested igncr. -# But we know of no other shell where ac_cr would be empty at this -# point, so we can use a bashism as a fallback. 
-if test "x$ac_cr" = x; then - eval ac_cr=\$\'\\r\' -fi -ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null` -if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then - ac_cs_awk_cr='\\r' -else - ac_cs_awk_cr=$ac_cr -fi - -echo 'BEGIN {' >"$ac_tmp/subs1.awk" && -_ACEOF - - -{ - echo "cat >conf$$subs.awk <<_ACEOF" && - echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && - echo "_ACEOF" -} >conf$$subs.sh || - as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 -ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'` -ac_delim='%!_!# ' -for ac_last_try in false false false false false :; do - . ./conf$$subs.sh || - as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 - - ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` - if test $ac_delim_n = $ac_delim_num; then - break - elif $ac_last_try; then - as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 - else - ac_delim="$ac_delim!$ac_delim _$ac_delim!! " - fi -done -rm -f conf$$subs.sh - -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK && -_ACEOF -sed -n ' -h -s/^/S["/; s/!.*/"]=/ -p -g -s/^[^!]*!// -:repl -t repl -s/'"$ac_delim"'$// -t delim -:nl -h -s/\(.\{148\}\)..*/\1/ -t more1 -s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ -p -n -b repl -:more1 -s/["\\]/\\&/g; s/^/"/; s/$/"\\/ -p -g -s/.\{148\}// -t nl -:delim -h -s/\(.\{148\}\)..*/\1/ -t more2 -s/["\\]/\\&/g; s/^/"/; s/$/"/ -p -b -:more2 -s/["\\]/\\&/g; s/^/"/; s/$/"\\/ -p -g -s/.\{148\}// -t delim -' >$CONFIG_STATUS || ac_write_fail=1 -rm -f conf$$subs.awk -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -_ACAWK -cat >>"\$ac_tmp/subs1.awk" <<_ACAWK && - for (key in S) S_is_set[key] = 1 - FS = "" - -} -{ - line = $ 0 - nfields = split(line, field, "@") - substed = 0 - len = length(field[1]) - for (i = 2; i < nfields; i++) { - key = field[i] - keylen = length(key) - if (S_is_set[key]) { - value = S[key] - line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) - len += length(value) + length(field[++i]) - substed = 1 - } else - len += 1 + keylen - } - - print line -} - -_ACAWK -_ACEOF -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then - sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" -else - cat -fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \ - || as_fn_error $? "could not setup config files machinery" "$LINENO" 5 -_ACEOF - -# VPATH may cause trouble with some makes, so we remove sole $(srcdir), -# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and -# trailing colons and then remove the whole line if VPATH becomes empty -# (actually we leave an empty line to preserve line numbers). -if test "x$srcdir" = x.; then - ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{ -h -s/// -s/^/:/ -s/[ ]*$/:/ -s/:\$(srcdir):/:/g -s/:\${srcdir}:/:/g -s/:@srcdir@:/:/g -s/^:*// -s/:*$// -x -s/\(=[ ]*\).*/\1/ -G -s/\n// -s/^[^=]*=[ ]*$// -}' -fi - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -fi # test -n "$CONFIG_FILES" - -# Set up the scripts for CONFIG_HEADERS section. -# No need to generate them if there are no CONFIG_HEADERS. -# This happens for instance with `./config.status Makefile'. -if test -n "$CONFIG_HEADERS"; then -cat >"$ac_tmp/defines.awk" <<\_ACAWK || -BEGIN { -_ACEOF - -# Transform confdefs.h into an awk script `defines.awk', embedded as -# here-document in config.status, that substitutes the proper values into -# config.h.in to produce config.h. - -# Create a delimiter string that does not exist in confdefs.h, to ease -# handling of long lines. 
-ac_delim='%!_!# ' -for ac_last_try in false false :; do - ac_tt=`sed -n "/$ac_delim/p" confdefs.h` - if test -z "$ac_tt"; then - break - elif $ac_last_try; then - as_fn_error $? "could not make $CONFIG_HEADERS" "$LINENO" 5 - else - ac_delim="$ac_delim!$ac_delim _$ac_delim!! " - fi -done - -# For the awk script, D is an array of macro values keyed by name, -# likewise P contains macro parameters if any. Preserve backslash -# newline sequences. - -ac_word_re=[_$as_cr_Letters][_$as_cr_alnum]* -sed -n ' -s/.\{148\}/&'"$ac_delim"'/g -t rset -:rset -s/^[ ]*#[ ]*define[ ][ ]*/ / -t def -d -:def -s/\\$// -t bsnl -s/["\\]/\\&/g -s/^ \('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/P["\1"]="\2"\ -D["\1"]=" \3"/p -s/^ \('"$ac_word_re"'\)[ ]*\(.*\)/D["\1"]=" \2"/p -d -:bsnl -s/["\\]/\\&/g -s/^ \('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/P["\1"]="\2"\ -D["\1"]=" \3\\\\\\n"\\/p -t cont -s/^ \('"$ac_word_re"'\)[ ]*\(.*\)/D["\1"]=" \2\\\\\\n"\\/p -t cont -d -:cont -n -s/.\{148\}/&'"$ac_delim"'/g -t clear -:clear -s/\\$// -t bsnlc -s/["\\]/\\&/g; s/^/"/; s/$/"/p -d -:bsnlc -s/["\\]/\\&/g; s/^/"/; s/$/\\\\\\n"\\/p -b cont -' >$CONFIG_STATUS || ac_write_fail=1 - -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 - for (key in D) D_is_set[key] = 1 - FS = "" -} -/^[\t ]*#[\t ]*(define|undef)[\t ]+$ac_word_re([\t (]|\$)/ { - line = \$ 0 - split(line, arg, " ") - if (arg[1] == "#") { - defundef = arg[2] - mac1 = arg[3] - } else { - defundef = substr(arg[1], 2) - mac1 = arg[2] - } - split(mac1, mac2, "(") #) - macro = mac2[1] - prefix = substr(line, 1, index(line, defundef) - 1) - if (D_is_set[macro]) { - # Preserve the white space surrounding the "#". - print prefix "define", macro P[macro] D[macro] - next - } else { - # Replace #undef with comments. This is necessary, for example, - # in the case of _POSIX_SOURCE, which is predefined and required - # on some systems where configure will not decide to define it. - if (defundef == "undef") { - print "/*", prefix defundef, macro, "*/" - next - } - } -} -{ print } -_ACAWK -_ACEOF -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 - as_fn_error $? "could not setup config headers machinery" "$LINENO" 5 -fi # test -n "$CONFIG_HEADERS" - - -eval set X " :F $CONFIG_FILES :H $CONFIG_HEADERS :C $CONFIG_COMMANDS" -shift -for ac_tag -do - case $ac_tag in - :[FHLC]) ac_mode=$ac_tag; continue;; - esac - case $ac_mode$ac_tag in - :[FHL]*:*);; - :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;; - :[FH]-) ac_tag=-:-;; - :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; - esac - ac_save_IFS=$IFS - IFS=: - set x $ac_tag - IFS=$ac_save_IFS - shift - ac_file=$1 - shift - - case $ac_mode in - :L) ac_source=$1;; - :[FH]) - ac_file_inputs= - for ac_f - do - case $ac_f in - -) ac_f="$ac_tmp/stdin";; - *) # Look for the file first in the build tree, then in the source tree - # (if the path is not absolute). The absolute path cannot be DOS-style, - # because $ac_f cannot contain `:'. - test -f "$ac_f" || - case $ac_f in - [\\/$]*) false;; - *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; - esac || - as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; - esac - case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac - as_fn_append ac_file_inputs " '$ac_f'" - done - - # Let's still pretend it is `configure' which instantiates (i.e., don't - # use $as_me), people would be surprised to read: - # /* config.h. Generated by config.status. */ - configure_input='Generated from '` - $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' - `' by configure.' 
- if test x"$ac_file" != x-; then - configure_input="$ac_file. $configure_input" - { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 -$as_echo "$as_me: creating $ac_file" >&6;} - fi - # Neutralize special characters interpreted by sed in replacement strings. - case $configure_input in #( - *\&* | *\|* | *\\* ) - ac_sed_conf_input=`$as_echo "$configure_input" | - sed 's/[\\\\&|]/\\\\&/g'`;; #( - *) ac_sed_conf_input=$configure_input;; - esac - - case $ac_tag in - *:-:* | *:-) cat >"$ac_tmp/stdin" \ - || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; - esac - ;; - esac - - ac_dir=`$as_dirname -- "$ac_file" || -$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$ac_file" : 'X\(//\)[^/]' \| \ - X"$ac_file" : 'X\(//\)$' \| \ - X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$ac_file" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - as_dir="$ac_dir"; as_fn_mkdir_p - ac_builddir=. - -case "$ac_dir" in -.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; -*) - ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` - # A ".." for each directory in $ac_dir_suffix. - ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` - case $ac_top_builddir_sub in - "") ac_top_builddir_sub=. ac_top_build_prefix= ;; - *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; - esac ;; -esac -ac_abs_top_builddir=$ac_pwd -ac_abs_builddir=$ac_pwd$ac_dir_suffix -# for backward compatibility: -ac_top_builddir=$ac_top_build_prefix - -case $srcdir in - .) # We are building in place. - ac_srcdir=. - ac_top_srcdir=$ac_top_builddir_sub - ac_abs_top_srcdir=$ac_pwd ;; - [\\/]* | ?:[\\/]* ) # Absolute name. - ac_srcdir=$srcdir$ac_dir_suffix; - ac_top_srcdir=$srcdir - ac_abs_top_srcdir=$srcdir ;; - *) # Relative name. - ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix - ac_top_srcdir=$ac_top_build_prefix$srcdir - ac_abs_top_srcdir=$ac_pwd/$srcdir ;; -esac -ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix - - - case $ac_mode in - :F) - # - # CONFIG_FILE - # - - case $INSTALL in - [\\/$]* | ?:[\\/]* ) ac_INSTALL=$INSTALL ;; - *) ac_INSTALL=$ac_top_build_prefix$INSTALL ;; - esac - ac_MKDIR_P=$MKDIR_P - case $MKDIR_P in - [\\/$]* | ?:[\\/]* ) ;; - */*) ac_MKDIR_P=$ac_top_build_prefix$MKDIR_P ;; - esac -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -# If the template does not know about datarootdir, expand it. -# FIXME: This hack should be removed a few years after 2.60. -ac_datarootdir_hack=; ac_datarootdir_seen= -ac_sed_dataroot=' -/datarootdir/ { - p - q -} -/@datadir@/p -/@docdir@/p -/@infodir@/p -/@localedir@/p -/@mandir@/p' -case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in -*datarootdir*) ac_datarootdir_seen=yes;; -*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 -$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 - ac_datarootdir_hack=' - s&@datadir@&$datadir&g - s&@docdir@&$docdir&g - s&@infodir@&$infodir&g - s&@localedir@&$localedir&g - s&@mandir@&$mandir&g - s&\\\${datarootdir}&$datarootdir&g' ;; -esac -_ACEOF - -# Neutralize VPATH when `$srcdir' = `.'. -# Shell code in configure.ac might set extrasub. -# FIXME: do we really want to maintain this feature? 
-cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -ac_sed_extra="$ac_vpsub -$extrasub -_ACEOF -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -:t -/@[a-zA-Z_][a-zA-Z_0-9]*@/!b -s|@configure_input@|$ac_sed_conf_input|;t t -s&@top_builddir@&$ac_top_builddir_sub&;t t -s&@top_build_prefix@&$ac_top_build_prefix&;t t -s&@srcdir@&$ac_srcdir&;t t -s&@abs_srcdir@&$ac_abs_srcdir&;t t -s&@top_srcdir@&$ac_top_srcdir&;t t -s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t -s&@builddir@&$ac_builddir&;t t -s&@abs_builddir@&$ac_abs_builddir&;t t -s&@abs_top_builddir@&$ac_abs_top_builddir&;t t -s&@INSTALL@&$ac_INSTALL&;t t -s&@MKDIR_P@&$ac_MKDIR_P&;t t -$ac_datarootdir_hack -" -eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \ - >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5 - -test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && - { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && - { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ - "$ac_tmp/out"`; test -z "$ac_out"; } && - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' -which seems to be undefined. Please make sure it is defined" >&5 -$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' -which seems to be undefined. Please make sure it is defined" >&2;} - - rm -f "$ac_tmp/stdin" - case $ac_file in - -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";; - *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";; - esac \ - || as_fn_error $? "could not create $ac_file" "$LINENO" 5 - ;; - :H) - # - # CONFIG_HEADER - # - if test x"$ac_file" != x-; then - { - $as_echo "/* $configure_input */" \ - && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" - } >"$ac_tmp/config.h" \ - || as_fn_error $? "could not create $ac_file" "$LINENO" 5 - if diff "$ac_file" "$ac_tmp/config.h" >/dev/null 2>&1; then - { $as_echo "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5 -$as_echo "$as_me: $ac_file is unchanged" >&6;} - else - rm -f "$ac_file" - mv "$ac_tmp/config.h" "$ac_file" \ - || as_fn_error $? "could not create $ac_file" "$LINENO" 5 - fi - else - $as_echo "/* $configure_input */" \ - && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" \ - || as_fn_error $? "could not create -" "$LINENO" 5 - fi -# Compute "$ac_file"'s index in $config_headers. -_am_arg="$ac_file" -_am_stamp_count=1 -for _am_header in $config_headers :; do - case $_am_header in - $_am_arg | $_am_arg:* ) - break ;; - * ) - _am_stamp_count=`expr $_am_stamp_count + 1` ;; - esac -done -echo "timestamp for $_am_arg" >`$as_dirname -- "$_am_arg" || -$as_expr X"$_am_arg" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$_am_arg" : 'X\(//\)[^/]' \| \ - X"$_am_arg" : 'X\(//\)$' \| \ - X"$_am_arg" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$_am_arg" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'`/stamp-h$_am_stamp_count - ;; - - :C) { $as_echo "$as_me:${as_lineno-$LINENO}: executing $ac_file commands" >&5 -$as_echo "$as_me: executing $ac_file commands" >&6;} - ;; - esac - - - case $ac_file$ac_mode in - "depfiles":C) test x"$AMDEP_TRUE" != x"" || { - # Autoconf 2.62 quotes --file arguments for eval, but not when files - # are listed without --file. Let's play safe and only enable the eval - # if we detect the quoting. 
- case $CONFIG_FILES in - *\'*) eval set x "$CONFIG_FILES" ;; - *) set x $CONFIG_FILES ;; - esac - shift - for mf - do - # Strip MF so we end up with the name of the file. - mf=`echo "$mf" | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile or not. - # We used to match only the files named `Makefile.in', but - # some people rename them; so instead we look at the file content. - # Grep'ing the first line is not enough: some people post-process - # each Makefile.in and add a new line on top of each file to say so. - # Grep'ing the whole file is not good either: AIX grep has a line - # limit of 2048, but all sed's we know have understand at least 4000. - if sed -n 's,^#.*generated by automake.*,X,p' "$mf" | grep X >/dev/null 2>&1; then - dirpart=`$as_dirname -- "$mf" || -$as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$mf" : 'X\(//\)[^/]' \| \ - X"$mf" : 'X\(//\)$' \| \ - X"$mf" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$mf" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - else - continue - fi - # Extract the definition of DEPDIR, am__include, and am__quote - # from the Makefile without running `make'. - DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` - test -z "$DEPDIR" && continue - am__include=`sed -n 's/^am__include = //p' < "$mf"` - test -z "am__include" && continue - am__quote=`sed -n 's/^am__quote = //p' < "$mf"` - # When using ansi2knr, U may be empty or an underscore; expand it - U=`sed -n 's/^U = //p' < "$mf"` - # Find all dependency output files, they are included files with - # $(DEPDIR) in their names. We invoke sed twice because it is the - # simplest approach to changing $(DEPDIR) to its actual value in the - # expansion. - for file in `sed -n " - s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ - sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do - # Make sure the directory exists. - test -f "$dirpart/$file" && continue - fdir=`$as_dirname -- "$file" || -$as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$file" : 'X\(//\)[^/]' \| \ - X"$file" : 'X\(//\)$' \| \ - X"$file" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$file" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - as_dir=$dirpart/$fdir; as_fn_mkdir_p - # echo "creating $dirpart/$file" - echo '# dummy' > "$dirpart/$file" - done - done -} - ;; - - esac -done # for ac_tag - - -as_fn_exit 0 -_ACEOF -ac_clean_files=$ac_clean_files_save - -test $ac_write_fail = 0 || - as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5 - - -# configure is writing to config.log, and then calls config.status. -# config.status does its own redirection, appending to config.log. -# Unfortunately, on DOS this fails, as config.log is still kept open -# by configure, so config.status won't be able to write to it; its -# output is simply discarded. So we exec the FD to /dev/null, -# effectively closing config.log, so it can be properly (re)opened and -# appended to by config.status. When coming back to configure, we -# need to make the FD available again. 
-if test "$no_create" != yes; then - ac_cs_success=: - ac_config_status_args= - test "$silent" = yes && - ac_config_status_args="$ac_config_status_args --quiet" - exec 5>/dev/null - $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false - exec 5>>config.log - # Use ||, not &&, to avoid exiting from the if with $? = 1, which - # would make configure fail if this is the last instruction. - $ac_cs_success || as_fn_exit 1 -fi -if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 -$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} -fi - diff --git a/configure.ac b/configure.ac index 0cb83db9e1..6bb2209c8d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,10 +1,10 @@ -AC_INIT([ccminer], [2014.06.15]) +AC_INIT([ccminer], [2.3.1], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM -AC_CONFIG_SRCDIR([cpu-miner.c]) -AM_INIT_AUTOMAKE([gnu]) -AC_CONFIG_HEADERS([cpuminer-config.h]) +AC_CONFIG_SRCDIR([ccminer.cpp]) +AM_INIT_AUTOMAKE([foreign subdir-objects]) +AC_CONFIG_HEADERS([ccminer-config.h]) dnl Make sure anyone changing configure.ac/Makefile.am has a clue AM_MAINTAINER_MODE @@ -63,36 +63,12 @@ case $target in PTHREAD_FLAGS="" WS2_LIBS="-lws2_32" ;; + *86*-apple-darwin*) + have_osx=true + PTHREAD_FLAGS="" + ;; esac -if test x$have_x86 = xtrue -then - AC_MSG_CHECKING(whether we can compile AVX code) - AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])], - AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.]) - AC_MSG_RESULT(yes) - AC_MSG_CHECKING(whether we can compile XOP code) - AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vprotd \$7, %xmm0, %xmm1");])], - AC_DEFINE(USE_XOP, 1, [Define to 1 if XOP assembly is available.]) - AC_MSG_RESULT(yes) - , - AC_MSG_RESULT(no) - AC_MSG_WARN([The assembler does not support the XOP instruction set.]) - ) - AC_MSG_CHECKING(whether we can compile AVX2 code) - AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])], - AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.]) - AC_MSG_RESULT(yes) - , - AC_MSG_RESULT(no) - AC_MSG_WARN([The assembler does not support the AVX2 instruction set.]) - ) - , - AC_MSG_RESULT(no) - AC_MSG_WARN([The assembler does not support the AVX instruction set.]) - ) -fi - AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true) AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread", AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2", @@ -100,11 +76,13 @@ AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread", AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC" )))) -AC_CHECK_LIB([ssl],[SSL_library_init], [], [AC_MSG_ERROR([OpenSSL library required])]) +AC_CHECK_LIB([z],[gzopen], [], []) +AC_CHECK_LIB([ssl],[SSL_free], [], [AC_MSG_ERROR([OpenSSL library required])]) AC_CHECK_LIB([crypto],[EVP_DigestFinal_ex], [], [AC_MSG_ERROR([OpenSSL library required])]) AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue]) AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue]) +AM_CONDITIONAL([HAVE_OSX], [test x$have_osx = xtrue]) AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue]) AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue]) @@ -138,23 +116,43 @@ else SUFFIX="" fi +if test x$have_osx = xtrue +then + SUFFIX="" +fi + dnl Setup CUDA paths AC_ARG_WITH([cuda], [ 
--with-cuda=PATH prefix where cuda is installed [default=/usr/local/cuda]]) -if test -n "$with_cuda" -then - CUDA_CFLAGS="-I$with_cuda/include" - CUDA_LIBS="-lcudart" - CUDA_LDFLAGS="-L$with_cuda/lib$SUFFIX" - NVCC="$with_cuda/bin/nvcc" +AC_ARG_WITH([nvml], + [ --with-nvml=PATH prefix where libnvml is installed [default=/usr/lib]]) + +AM_CONDITIONAL([HAVE_NVML], [test -n "$with_nvml"]) + +NVCC="nvcc" + +if test -n "$with_cuda" ; then + CUDA_INCLUDES="-I$with_cuda/include" + CUDA_LIBS="-lcudart" + CUDA_LDFLAGS="-L$with_cuda/lib$SUFFIX" + if test -x "$with_cuda/bin/nvcc" ; then + NVCC="$with_cuda/bin/nvcc" + fi else - CUDA_CFLAGS="-I/usr/local/cuda/include" - CUDA_LIBS="-lcudart -static-libstdc++" - CUDA_LDFLAGS="-L/usr/local/cuda/lib$SUFFIX" - NVCC="nvcc" + CUDA_INCLUDES="-I/usr/local/cuda/include" + CUDA_LIBS="-lcudart -static-libstdc++" + CUDA_LDFLAGS="-L/usr/local/cuda/lib$SUFFIX" +fi + +if test -n "$with_nvml" ; then + NVML_LIBPATH=$with_nvml + CUDA_LDFLAGS="$CUDA_LDFLAGS -ldl" fi +AC_SUBST(NVML_LIBPATH) + AC_SUBST(CUDA_CFLAGS) +AC_SUBST(CUDA_INCLUDES) AC_SUBST(CUDA_LIBS) AC_SUBST(CUDA_LDFLAGS) AC_SUBST(NVCC) diff --git a/configure.sh b/configure.sh old mode 100644 new mode 100755 index 134abd163b..a4c4f46e08 --- a/configure.sh +++ b/configure.sh @@ -1 +1,7 @@ -./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda +# To change the cuda arch, edit Makefile.am and run ./build.sh + +extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16" + +CUDA_CFLAGS="-O3 -lineno -Xcompiler -Wall -D_FORCE_INLINES" \ + ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda --with-nvml=libnvidia-ml.so + diff --git a/cpu-miner.c b/cpu-miner.c deleted file mode 100644 index 9e3c3b0ac3..0000000000 --- a/cpu-miner.c +++ /dev/null @@ -1,1665 +0,0 @@ -/* - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. 
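Alongside the CUDA path handling, configure.ac now accepts --with-nvml=PATH and appends -ldl to CUDA_LDFLAGS, and configure.sh passes --with-nvml=libnvidia-ml.so, which suggests the NVML library is opened at run time rather than linked at build time. A minimal sketch of that pattern, assuming only the stock nvmlInit / nvmlDeviceGetCount / nvmlShutdown entry points (the wrapper actually used by the miner may expose more and differ in detail), is:

/* Run-time NVML loading sketch; build with -ldl as configure.ac now arranges. */
#include <stdio.h>
#include <dlfcn.h>

typedef int (*nvml_simple_fn)(void);               /* nvmlInit, nvmlShutdown */
typedef int (*nvml_count_fn)(unsigned int *count); /* nvmlDeviceGetCount */

int main(void)
{
	/* the library name/path is what --with-nvml passes in */
	void *h = dlopen("libnvidia-ml.so", RTLD_LAZY);
	if (!h) {
		fprintf(stderr, "NVML not available: %s\n", dlerror());
		return 1;
	}
	nvml_simple_fn nvml_init     = (nvml_simple_fn) dlsym(h, "nvmlInit");
	nvml_count_fn  nvml_count    = (nvml_count_fn)  dlsym(h, "nvmlDeviceGetCount");
	nvml_simple_fn nvml_shutdown = (nvml_simple_fn) dlsym(h, "nvmlShutdown");

	unsigned int gpus = 0;
	if (nvml_init && nvml_count && nvml_init() == 0 && nvml_count(&gpus) == 0)
		printf("NVML reports %u GPU(s)\n", gpus);
	if (nvml_shutdown)
		nvml_shutdown();
	dlclose(h);
	return 0;
}

Keeping the dependency behind dlopen means a build on a machine without the NVIDIA driver still links and runs; hardware monitoring is simply disabled when dlopen fails.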
- */ - -#include "cpuminer-config.h" -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef WIN32 -#include -#else -#include -#include -#include -#if HAVE_SYS_SYSCTL_H -#include -#if HAVE_SYS_PARAM_H -#include -#endif -#include -#endif -#endif -#include -#include -#include -#include "compat.h" -#include "miner.h" - -#ifdef WIN32 -#include -#pragma comment(lib, "winmm.lib") -#endif - -#define PROGRAM_NAME "minerd" -#define LP_SCANTIME 60 -#define HEAVYCOIN_BLKHDR_SZ 84 -#define MNR_BLKHDR_SZ 80 - -// from heavy.cu -#ifdef __cplusplus -extern "C" -{ -#endif -int cuda_num_devices(); -void cuda_devicenames(); -int cuda_finddevice(char *name); -#ifdef __cplusplus -} -#endif - - -#ifdef __linux /* Linux specific policy and affinity management */ -#include -static inline void drop_policy(void) -{ - struct sched_param param; - param.sched_priority = 0; - -#ifdef SCHED_IDLE - if (unlikely(sched_setscheduler(0, SCHED_IDLE, ¶m) == -1)) -#endif -#ifdef SCHED_BATCH - sched_setscheduler(0, SCHED_BATCH, ¶m); -#endif -} - -static inline void affine_to_cpu(int id, int cpu) -{ - cpu_set_t set; - - CPU_ZERO(&set); - CPU_SET(cpu, &set); - sched_setaffinity(0, sizeof(&set), &set); -} -#elif defined(__FreeBSD__) /* FreeBSD specific policy and affinity management */ -#include -static inline void drop_policy(void) -{ -} - -static inline void affine_to_cpu(int id, int cpu) -{ - cpuset_t set; - CPU_ZERO(&set); - CPU_SET(cpu, &set); - cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset_t), &set); -} -#else -static inline void drop_policy(void) -{ -} - -static inline void affine_to_cpu(int id, int cpu) -{ -} -#endif - -enum workio_commands { - WC_GET_WORK, - WC_SUBMIT_WORK, -}; - -struct workio_cmd { - enum workio_commands cmd; - struct thr_info *thr; - union { - struct work *work; - } u; -}; - -typedef enum { - ALGO_HEAVY, /* Heavycoin hash */ - ALGO_MJOLLNIR, /* Mjollnir hash */ - ALGO_FUGUE256, /* Fugue256 */ - ALGO_GROESTL, - ALGO_MYR_GR, - ALGO_JACKPOT, - ALGO_QUARK, - ALGO_ANIME, - ALGO_NIST5, - ALGO_X11, - ALGO_X13, - ALGO_DMD_GR, -} sha256_algos; - -static const char *algo_names[] = { - "heavy", - "mjollnir", - "fugue256", - "groestl", - "myr-gr", - "jackpot", - "quark", - "anime", - "nist5", - "x11", - "x13", - "dmd-gr", -}; - -bool opt_debug = false; -bool opt_protocol = false; -bool opt_benchmark = false; -bool want_longpoll = true; -bool have_longpoll = false; -bool want_stratum = true; -bool have_stratum = false; -static bool submit_old = false; -bool use_syslog = false; -static bool opt_background = false; -static bool opt_quiet = false; -static int opt_retries = -1; -static int opt_fail_pause = 30; -int opt_timeout = 270; -static int opt_scantime = 5; -static json_t *opt_config; -static const bool opt_time = true; -static sha256_algos opt_algo = ALGO_HEAVY; -static int opt_n_threads = 0; -static double opt_difficulty = 1; // CH -bool opt_trust_pool = false; -uint16_t opt_vote = 9999; -static int num_processors; -int device_map[8] = {0,1,2,3,4,5,6,7}; // CB -char *device_name[8]; // CB -static char *rpc_url; -static char *rpc_userpass; -static char *rpc_user, *rpc_pass; -char *opt_cert; -char *opt_proxy; -long opt_proxy_type; -struct thr_info *thr_info; -static int work_thr_id; -int longpoll_thr_id = -1; -int stratum_thr_id = -1; -struct work_restart *work_restart = NULL; -static struct stratum_ctx stratum; - -pthread_mutex_t applog_lock; -static pthread_mutex_t stats_lock; - -static unsigned long accepted_count = 0L; -static 
unsigned long rejected_count = 0L; -static double *thr_hashrates; - -#ifdef HAVE_GETOPT_LONG -#include -#else -struct option { - const char *name; - int has_arg; - int *flag; - int val; -}; -#endif - -static char const usage[] = "\ -Usage: " PROGRAM_NAME " [OPTIONS]\n\ -Options:\n\ - -a, --algo=ALGO specify the algorithm to use\n\ - fugue256 Fuguecoin hash\n\ - heavy Heavycoin hash\n\ - mjollnir Mjollnircoin hash\n\ - groestl Groestlcoin hash\n\ - myr-gr Myriad-Groestl hash\n\ - jackpot Jackpot hash\n\ - quark Quark hash\n\ - anime Animecoin hash\n\ - nist5 NIST5 (TalkCoin) hash\n\ - x11 X11 (DarkCoin) hash\n\ - x13 X13 (MaruCoin) hash\n\ - dmd-gr Diamond-Groestl hash\n\ - -d, --devices takes a comma separated list of CUDA devices to use.\n\ - Device IDs start counting from 0! Alternatively takes\n\ - string names of your cards like gtx780ti or gt640#2\n\ - (matching 2nd gt640 in the PC)\n\ - -f, --diff Divide difficulty by this factor (std is 1) \n\ - -v, --vote=VOTE block reward vote (for HeavyCoin)\n\ - -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ - -o, --url=URL URL of mining server\n\ - -O, --userpass=U:P username:password pair for mining server\n\ - -u, --user=USERNAME username for mining server\n\ - -p, --pass=PASSWORD password for mining server\n\ - --cert=FILE certificate for mining server using SSL\n\ - -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy\n\ - -t, --threads=N number of miner threads (default: number of nVidia GPUs)\n\ - -r, --retries=N number of times to retry if a network call fails\n\ - (default: retry indefinitely)\n\ - -R, --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ - -T, --timeout=N network timeout, in seconds (default: 270)\n\ - -s, --scantime=N upper bound on time spent scanning current work when\n\ - long polling is unavailable, in seconds (default: 5)\n\ - --no-longpoll disable X-Long-Polling support\n\ - --no-stratum disable X-Stratum support\n\ - -q, --quiet disable per-thread hashmeter output\n\ - -D, --debug enable debug output\n\ - -P, --protocol-dump verbose dump of protocol-level activities\n" -#ifdef HAVE_SYSLOG_H -"\ - -S, --syslog use system log for output messages\n" -#endif -#ifndef WIN32 -"\ - -B, --background run the miner in the background\n" -#endif -"\ - --benchmark run in offline benchmark mode\n\ - -c, --config=FILE load a JSON-format configuration file\n\ - -V, --version display version information and exit\n\ - -h, --help display this help text and exit\n\ -"; - -static char const short_options[] = -#ifndef WIN32 - "B" -#endif -#ifdef HAVE_SYSLOG_H - "S" -#endif - "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:f:mv:"; - -static struct option const options[] = { - { "algo", 1, NULL, 'a' }, -#ifndef WIN32 - { "background", 0, NULL, 'B' }, -#endif - { "benchmark", 0, NULL, 1005 }, - { "cert", 1, NULL, 1001 }, - { "config", 1, NULL, 'c' }, - { "debug", 0, NULL, 'D' }, - { "help", 0, NULL, 'h' }, - { "no-longpoll", 0, NULL, 1003 }, - { "no-stratum", 0, NULL, 1007 }, - { "pass", 1, NULL, 'p' }, - { "protocol-dump", 0, NULL, 'P' }, - { "proxy", 1, NULL, 'x' }, - { "quiet", 0, NULL, 'q' }, - { "retries", 1, NULL, 'r' }, - { "retry-pause", 1, NULL, 'R' }, - { "scantime", 1, NULL, 's' }, -#ifdef HAVE_SYSLOG_H - { "syslog", 0, NULL, 'S' }, -#endif - { "threads", 1, NULL, 't' }, - { "vote", 1, NULL, 'v' }, - { "trust-pool", 0, NULL, 'm' }, - { "timeout", 1, NULL, 'T' }, - { "url", 1, NULL, 'o' }, - { "user", 1, NULL, 'u' }, - { "userpass", 1, NULL, 'O' }, - { "version", 0, NULL, 'V' }, - 
{ "devices", 1, NULL, 'd' }, - { "diff", 1, NULL, 'f' }, - { 0, 0, 0, 0 } -}; - -struct work { - uint32_t data[32]; - uint32_t target[8]; - uint32_t maxvote; - - char job_id[128]; - size_t xnonce2_len; - unsigned char xnonce2[32]; -}; - -static struct work g_work; -static time_t g_work_time; -static pthread_mutex_t g_work_lock; - -static bool jobj_binary(const json_t *obj, const char *key, - void *buf, size_t buflen) -{ - const char *hexstr; - json_t *tmp; - - tmp = json_object_get(obj, key); - if (unlikely(!tmp)) { - applog(LOG_ERR, "JSON key '%s' not found", key); - return false; - } - hexstr = json_string_value(tmp); - if (unlikely(!hexstr)) { - applog(LOG_ERR, "JSON key '%s' is not a string", key); - return false; - } - if (!hex2bin((unsigned char*)buf, hexstr, buflen)) - return false; - - return true; -} - -static bool work_decode(const json_t *val, struct work *work) -{ - int i; - - if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data)))) { - applog(LOG_ERR, "JSON inval data"); - goto err_out; - } - if (unlikely(!jobj_binary(val, "target", work->target, sizeof(work->target)))) { - applog(LOG_ERR, "JSON inval target"); - goto err_out; - } - if (opt_algo == ALGO_HEAVY) { - if (unlikely(!jobj_binary(val, "maxvote", &work->maxvote, sizeof(work->maxvote)))) { - work->maxvote = 1024; - } - } else work->maxvote = 0; - - for (i = 0; i < ARRAY_SIZE(work->data); i++) - work->data[i] = le32dec(work->data + i); - for (i = 0; i < ARRAY_SIZE(work->target); i++) - work->target[i] = le32dec(work->target + i); - - return true; - -err_out: - return false; -} - -static void share_result(int result, const char *reason) -{ - char s[345]; - double hashrate; - int i; - - hashrate = 0.; - pthread_mutex_lock(&stats_lock); - for (i = 0; i < opt_n_threads; i++) - hashrate += thr_hashrates[i]; - result ? accepted_count++ : rejected_count++; - pthread_mutex_unlock(&stats_lock); - - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); - applog(LOG_INFO, "accepted: %lu/%lu (%.2f%%), %s khash/s %s", - accepted_count, - accepted_count + rejected_count, - 100. * accepted_count / (accepted_count + rejected_count), - s, - result ? 
"(yay!!!)" : "(booooo)"); - - if (opt_debug && reason) - applog(LOG_DEBUG, "DEBUG: reject reason: %s", reason); -} - -static bool submit_upstream_work(CURL *curl, struct work *work) -{ - char *str = NULL; - json_t *val, *res, *reason; - char s[345]; - int i; - bool rc = false; - - /* pass if the previous hash is not the current previous hash */ - if (memcmp(work->data + 1, g_work.data + 1, 32)) { - if (opt_debug) - applog(LOG_DEBUG, "DEBUG: stale work detected, discarding"); - return true; - } - - if (have_stratum) { - uint32_t ntime, nonce; - uint16_t nvote; - char *ntimestr, *noncestr, *xnonce2str, *nvotestr; - - le32enc(&ntime, work->data[17]); - le32enc(&nonce, work->data[19]); - be16enc(&nvote, *((uint16_t*)&work->data[20])); - - ntimestr = bin2hex((const unsigned char *)(&ntime), 4); - noncestr = bin2hex((const unsigned char *)(&nonce), 4); - xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); - nvotestr = bin2hex((const unsigned char *)(&nvote), 2); - if (opt_algo == ALGO_HEAVY) { - sprintf(s, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr, nvotestr); - } else { - sprintf(s, - "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr); - } - free(ntimestr); - free(noncestr); - free(xnonce2str); - free(nvotestr); - - if (unlikely(!stratum_send_line(&stratum, s))) { - applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); - goto out; - } - } else { - - /* build hex string */ - - if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) { - for (i = 0; i < ARRAY_SIZE(work->data); i++) - le32enc(work->data + i, work->data[i]); - } - str = bin2hex((unsigned char *)work->data, sizeof(work->data)); - if (unlikely(!str)) { - applog(LOG_ERR, "submit_upstream_work OOM"); - goto out; - } - - /* build JSON-RPC request */ - sprintf(s, - "{\"method\": \"getwork\", \"params\": [ \"%s\" ], \"id\":1}\r\n", - str); - - /* issue JSON-RPC request */ - val = json_rpc_call(curl, rpc_url, rpc_userpass, s, false, false, NULL); - if (unlikely(!val)) { - applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); - goto out; - } - - res = json_object_get(val, "result"); - reason = json_object_get(val, "reject-reason"); - share_result(json_is_true(res), reason ? 
json_string_value(reason) : NULL); - - json_decref(val); - } - - rc = true; - -out: - free(str); - return rc; -} - -static const char *rpc_req = - "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; - -static bool get_upstream_work(CURL *curl, struct work *work) -{ - json_t *val; - bool rc; - struct timeval tv_start, tv_end, diff; - - gettimeofday(&tv_start, NULL); - val = json_rpc_call(curl, rpc_url, rpc_userpass, rpc_req, - want_longpoll, false, NULL); - gettimeofday(&tv_end, NULL); - - if (have_stratum) { - if (val) - json_decref(val); - return true; - } - - if (!val) - return false; - - rc = work_decode(json_object_get(val, "result"), work); - - if (opt_debug && rc) { - timeval_subtract(&diff, &tv_end, &tv_start); - applog(LOG_DEBUG, "DEBUG: got new work in %d ms", - diff.tv_sec * 1000 + diff.tv_usec / 1000); - } - - json_decref(val); - - return rc; -} - -static void workio_cmd_free(struct workio_cmd *wc) -{ - if (!wc) - return; - - switch (wc->cmd) { - case WC_SUBMIT_WORK: - free(wc->u.work); - break; - default: /* do nothing */ - break; - } - - memset(wc, 0, sizeof(*wc)); /* poison */ - free(wc); -} - -static bool workio_get_work(struct workio_cmd *wc, CURL *curl) -{ - struct work *ret_work; - int failures = 0; - - ret_work = (struct work*)calloc(1, sizeof(*ret_work)); - if (!ret_work) - return false; - - /* obtain new work from bitcoin via JSON-RPC */ - while (!get_upstream_work(curl, ret_work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { - applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); - free(ret_work); - return false; - } - - /* pause, then restart work-request loop */ - applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", - opt_fail_pause); - sleep(opt_fail_pause); - } - - /* send work to requesting thread */ - if (!tq_push(wc->thr->q, ret_work)) - free(ret_work); - - return true; -} - -static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) -{ - int failures = 0; - - /* submit solution to bitcoin via JSON-RPC */ - while (!submit_upstream_work(curl, wc->u.work)) { - if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { - applog(LOG_ERR, "...terminating workio thread"); - return false; - } - - /* pause, then restart work-request loop */ - applog(LOG_ERR, "...retry after %d seconds", - opt_fail_pause); - sleep(opt_fail_pause); - } - - return true; -} - -static void *workio_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info*)userdata; - CURL *curl; - bool ok = true; - - curl = curl_easy_init(); - if (unlikely(!curl)) { - applog(LOG_ERR, "CURL initialization failed"); - return NULL; - } - - while (ok) { - struct workio_cmd *wc; - - /* wait for workio_cmd sent to us, on our queue */ - wc = (struct workio_cmd *)tq_pop(mythr->q, NULL); - if (!wc) { - ok = false; - break; - } - - /* process workio_cmd */ - switch (wc->cmd) { - case WC_GET_WORK: - ok = workio_get_work(wc, curl); - break; - case WC_SUBMIT_WORK: - ok = workio_submit_work(wc, curl); - break; - - default: /* should never happen */ - ok = false; - break; - } - - workio_cmd_free(wc); - } - - tq_freeze(mythr->q); - curl_easy_cleanup(curl); - - return NULL; -} - -static bool get_work(struct thr_info *thr, struct work *work) -{ - struct workio_cmd *wc; - struct work *work_heap; - - if (opt_benchmark) { - memset(work->data, 0x55, 76); - work->data[17] = swab32((uint32_t)time(NULL)); - memset(work->data + 19, 0x00, 52); - work->data[20] = 0x80000000; - work->data[31] = 0x00000280; - memset(work->target, 0x00, 
sizeof(work->target)); - return true; - } - - /* fill out work request message */ - wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; - - wc->cmd = WC_GET_WORK; - wc->thr = thr; - - /* send work request to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) { - workio_cmd_free(wc); - return false; - } - - /* wait for response, a unit of work */ - work_heap = (struct work *)tq_pop(thr->q, NULL); - if (!work_heap) - return false; - - /* copy returned work into storage provided by caller */ - memcpy(work, work_heap, sizeof(*work)); - free(work_heap); - - return true; -} - -static bool submit_work(struct thr_info *thr, const struct work *work_in) -{ - struct workio_cmd *wc; - /* fill out work request message */ - wc = (struct workio_cmd *)calloc(1, sizeof(*wc)); - if (!wc) - return false; - - wc->u.work = (struct work *)malloc(sizeof(*work_in)); - if (!wc->u.work) - goto err_out; - - wc->cmd = WC_SUBMIT_WORK; - wc->thr = thr; - memcpy(wc->u.work, work_in, sizeof(*work_in)); - - /* send solution to workio thread */ - if (!tq_push(thr_info[work_thr_id].q, wc)) - goto err_out; - - return true; - -err_out: - workio_cmd_free(wc); - return false; -} - -static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) -{ - unsigned char merkle_root[64]; - int i; - - pthread_mutex_lock(&sctx->work_lock); - - strcpy(work->job_id, sctx->job.job_id); - work->xnonce2_len = sctx->xnonce2_size; - memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); - - /* Generate merkle root */ - if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) - heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - else - if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL) - SHA256((unsigned char*)sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*)merkle_root); - else - sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - - for (i = 0; i < sctx->job.merkle_count; i++) { - memcpy(merkle_root + 32, sctx->job.merkle[i], 32); - if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) - heavycoin_hash(merkle_root, merkle_root, 64); - else - sha256d(merkle_root, merkle_root, 64); - } - - /* Increment extranonce2 */ - for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); - - /* Assemble block header */ - memset(work->data, 0, 128); - work->data[0] = le32dec(sctx->job.version); - for (i = 0; i < 8; i++) - work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); - for (i = 0; i < 8; i++) - work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); - work->data[17] = le32dec(sctx->job.ntime); - work->data[18] = le32dec(sctx->job.nbits); - if (opt_algo == ALGO_MJOLLNIR) - { - for (i = 0; i < 20; i++) - work->data[i] = be32dec((uint32_t *)&work->data[i]); - } - - work->data[20] = 0x80000000; - work->data[31] = (opt_algo == ALGO_MJOLLNIR) ? 
0x000002A0 : 0x00000280; - - // HeavyCoin - if (opt_algo == ALGO_HEAVY) { - uint16_t *ext; - work->maxvote = 1024; - ext = (uint16_t*)(&work->data[20]); - ext[0] = opt_vote; - ext[1] = be16dec(sctx->job.nreward); - - for (i = 0; i < 20; i++) - work->data[i] = be32dec((uint32_t *)&work->data[i]); - } - // - - pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug) { - char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); - applog(LOG_DEBUG, "DEBUG: job_id='%s' extranonce2=%s ntime=%08x", - work->job_id, xnonce2str, swab32(work->data[17])); - free(xnonce2str); - } - - if (opt_algo == ALGO_JACKPOT) - diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); - else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR) - diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); - else - diff_to_target(work->target, sctx->job.diff / opt_difficulty); -} - -static void *miner_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - int thr_id = mythr->id; - struct work work; - uint32_t max_nonce; - uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; - unsigned char *scratchbuf = NULL; - char s[16]; - int i; - static int rounds = 0; - - memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized - - /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE - * and if that fails, then SCHED_BATCH. No need for this to be an - * error if it fails */ - if (!opt_benchmark) { - setpriority(PRIO_PROCESS, 0, 19); - drop_policy(); - } - - /* Cpu affinity only makes sense if the number of threads is a multiple - * of the number of CPUs */ - if (num_processors > 1 && opt_n_threads % num_processors == 0) { - if (!opt_quiet) - applog(LOG_INFO, "Binding thread %d to cpu %d", - thr_id, thr_id % num_processors); - affine_to_cpu(thr_id, thr_id % num_processors); - } - - while (1) { - unsigned long hashes_done; - struct timeval tv_start, tv_end, diff; - int64_t max64; - int rc; - - if (have_stratum) { - while (time(NULL) >= g_work_time + 120) - sleep(1); - pthread_mutex_lock(&g_work_lock); - if (work.data[19] >= end_nonce) - stratum_gen_work(&stratum, &g_work); - } else { - /* obtain new work from internal workio thread */ - pthread_mutex_lock(&g_work_lock); - if (!have_stratum && (!have_longpoll || - time(NULL) >= g_work_time + LP_SCANTIME*3/4 || - work.data[19] >= end_nonce)) { - if (unlikely(!get_work(mythr, &g_work))) { - applog(LOG_ERR, "work retrieval failed, exiting " - "mining thread %d", mythr->id); - pthread_mutex_unlock(&g_work_lock); - goto out; - } - g_work_time = have_stratum ? 0 : time(NULL); - } - if (have_stratum) { - pthread_mutex_unlock(&g_work_lock); - continue; - } - } - if (memcmp(work.data, g_work.data, 76)) { - memcpy(&work, &g_work, sizeof(struct work)); - work.data[19] = 0xffffffffU / opt_n_threads * thr_id; - } else - work.data[19]++; - pthread_mutex_unlock(&g_work_lock); - work_restart[thr_id].restart = 0; - - /* adjust max_nonce to meet target scan time */ - if (have_stratum) - max64 = LP_SCANTIME; - else - max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime) - - time(NULL); - max64 *= (int64_t)thr_hashrates[thr_id]; - if (max64 <= 0) - max64 = (opt_algo == ALGO_JACKPOT) ? 
0x1fffLL : 0xfffffLL; - if ((int64_t)work.data[19] + max64 > end_nonce) - max_nonce = end_nonce; - else - max_nonce = (uint32_t)(work.data[19] + max64); - - hashes_done = 0; - gettimeofday(&tv_start, NULL); - - /* scan nonces for a proof-of-work hash */ - switch (opt_algo) { - - case ALGO_HEAVY: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); - break; - - case ALGO_MJOLLNIR: - rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ); - break; - - case ALGO_FUGUE256: - rc = scanhash_fugue256(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_GROESTL: - case ALGO_DMD_GR: - rc = scanhash_groestlcoin(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_MYR_GR: - rc = scanhash_myriad(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_JACKPOT: - rc = scanhash_jackpot(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_QUARK: - rc = scanhash_quark(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_ANIME: - rc = scanhash_anime(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_NIST5: - rc = scanhash_nist5(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X11: - rc = scanhash_x11(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - case ALGO_X13: - rc = scanhash_x13(thr_id, work.data, work.target, - max_nonce, &hashes_done); - break; - - default: - /* should never happen */ - goto out; - } - -// if (opt_benchmark) -// if (++rounds == 1) exit(0); - - /* record scanhash elapsed time */ - gettimeofday(&tv_end, NULL); - timeval_subtract(&diff, &tv_end, &tv_start); - if (diff.tv_usec || diff.tv_sec) { - pthread_mutex_lock(&stats_lock); - thr_hashrates[thr_id] = - hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); - pthread_mutex_unlock(&stats_lock); - } - if (!opt_quiet) { - sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f", - 1e-3 * thr_hashrates[thr_id]); - applog(LOG_INFO, "GPU #%d: %s, %s khash/s", - device_map[thr_id], device_name[thr_id], s); -// applog(LOG_INFO, "thread %d: %lu hashes, %s khash/s", -// thr_id, hashes_done, s); - } - if (opt_benchmark && thr_id == opt_n_threads - 1) { - double hashrate = 0.; - for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++) - hashrate += thr_hashrates[i]; - if (i == opt_n_threads) { - sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); - applog(LOG_INFO, "Total: %s khash/s", s); - } - } - - /* if nonce found, submit work */ - if (rc && !opt_benchmark && !submit_work(mythr, &work)) - break; - } - -out: - tq_freeze(mythr->q); - - return NULL; -} - -static void restart_threads(void) -{ - int i; - - for (i = 0; i < opt_n_threads; i++) - work_restart[i].restart = 1; -} - -static void *longpoll_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - CURL *curl = NULL; - char *copy_start, *hdr_path = NULL, *lp_url = NULL; - bool need_slash = false; - - curl = curl_easy_init(); - if (unlikely(!curl)) { - applog(LOG_ERR, "CURL initialization failed"); - goto out; - } - -start: - hdr_path = (char*)tq_pop(mythr->q, NULL); - if (!hdr_path) - goto out; - - /* full URL */ - if (strstr(hdr_path, "://")) { - lp_url = hdr_path; - hdr_path = NULL; - } - - /* absolute path, on current server */ - else { - copy_start = (*hdr_path == '/') ? 
(hdr_path + 1) : hdr_path; - if (rpc_url[strlen(rpc_url) - 1] != '/') - need_slash = true; - - lp_url = (char*)malloc(strlen(rpc_url) + strlen(copy_start) + 2); - if (!lp_url) - goto out; - - sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? "/" : "", copy_start); - } - - applog(LOG_INFO, "Long-polling activated for %s", lp_url); - - while (1) { - json_t *val, *soval; - int err; - - val = json_rpc_call(curl, lp_url, rpc_userpass, rpc_req, - false, true, &err); - if (have_stratum) { - if (val) - json_decref(val); - goto out; - } - if (likely(val)) { - if (!opt_quiet) applog(LOG_INFO, "LONGPOLL detected new block"); - soval = json_object_get(json_object_get(val, "result"), "submitold"); - submit_old = soval ? json_is_true(soval) : false; - pthread_mutex_lock(&g_work_lock); - if (work_decode(json_object_get(val, "result"), &g_work)) { - if (opt_debug) - applog(LOG_DEBUG, "DEBUG: got new work"); - time(&g_work_time); - restart_threads(); - } - pthread_mutex_unlock(&g_work_lock); - json_decref(val); - } else { - pthread_mutex_lock(&g_work_lock); - g_work_time -= LP_SCANTIME; - pthread_mutex_unlock(&g_work_lock); - if (err == CURLE_OPERATION_TIMEDOUT) { - restart_threads(); - } else { - have_longpoll = false; - restart_threads(); - free(hdr_path); - free(lp_url); - lp_url = NULL; - sleep(opt_fail_pause); - goto start; - } - } - } - -out: - free(hdr_path); - free(lp_url); - tq_freeze(mythr->q); - if (curl) - curl_easy_cleanup(curl); - - return NULL; -} - -static bool stratum_handle_response(char *buf) -{ - json_t *val, *err_val, *res_val, *id_val; - json_error_t err; - bool ret = false; - - val = JSON_LOADS(buf, &err); - if (!val) { - applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - id_val = json_object_get(val, "id"); - - if (!id_val || json_is_null(id_val) || !res_val) - goto out; - - share_result(json_is_true(res_val), - err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); - - ret = true; -out: - if (val) - json_decref(val); - - return ret; -} - -static void *stratum_thread(void *userdata) -{ - struct thr_info *mythr = (struct thr_info *)userdata; - char *s; - - stratum.url = (char*)tq_pop(mythr->q, NULL); - if (!stratum.url) - goto out; - applog(LOG_INFO, "Starting Stratum on %s", stratum.url); - - while (1) { - int failures = 0; - - while (!stratum.curl) { - pthread_mutex_lock(&g_work_lock); - g_work_time = 0; - pthread_mutex_unlock(&g_work_lock); - restart_threads(); - - if (!stratum_connect(&stratum, stratum.url) || - !stratum_subscribe(&stratum) || - !stratum_authorize(&stratum, rpc_user, rpc_pass)) { - stratum_disconnect(&stratum); - if (opt_retries >= 0 && ++failures > opt_retries) { - applog(LOG_ERR, "...terminating workio thread"); - tq_push(thr_info[work_thr_id].q, NULL); - goto out; - } - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - sleep(opt_fail_pause); - } - } - - if (stratum.job.job_id && - (strcmp(stratum.job.job_id, g_work.job_id) || !g_work_time)) { - pthread_mutex_lock(&g_work_lock); - stratum_gen_work(&stratum, &g_work); - time(&g_work_time); - pthread_mutex_unlock(&g_work_lock); - if (stratum.job.clean) { - if (!opt_quiet) applog(LOG_INFO, "Stratum detected new block"); - restart_threads(); - } - } - - if (!stratum_socket_full(&stratum, 120)) { - applog(LOG_ERR, "Stratum connection timed out"); - s = NULL; - } else - s = stratum_recv_line(&stratum); - if (!s) { - stratum_disconnect(&stratum); - applog(LOG_ERR, "Stratum connection interrupted"); - continue; - } - if (!stratum_handle_method(&stratum, s)) - stratum_handle_response(s); - free(s); - } - -out: - return NULL; -} - -static void show_version_and_exit(void) -{ - printf("%s\n%s\n", PACKAGE_STRING, curl_version()); - exit(0); -} - -static void show_usage_and_exit(int status) -{ - if (status) - fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); - else - printf(usage); - exit(status); -} - -static void parse_arg (int key, char *arg) -{ - char *p; - int v, i; - double d; - - switch(key) { - case 'a': - for (i = 0; i < ARRAY_SIZE(algo_names); i++) { - if (algo_names[i] && - !strcmp(arg, algo_names[i])) { - opt_algo = (sha256_algos)i; - break; - } - } - if (i == ARRAY_SIZE(algo_names)) - show_usage_and_exit(1); - break; - case 'B': - opt_background = true; - break; - case 'c': { - json_error_t err; - if (opt_config) - json_decref(opt_config); -#if JANSSON_VERSION_HEX >= 0x020000 - opt_config = json_load_file(arg, 0, &err); -#else - opt_config = json_load_file(arg, &err); -#endif - if (!json_is_object(opt_config)) { - applog(LOG_ERR, "JSON decode of %s failed", arg); - exit(1); - } - break; - } - case 'q': - opt_quiet = true; - break; - case 'D': - opt_debug = true; - break; - case 'p': - free(rpc_pass); - rpc_pass = strdup(arg); - break; - case 'P': - opt_protocol = true; - break; - case 'r': - v = atoi(arg); - if (v < -1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_retries = v; - break; - case 'R': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_fail_pause = v; - break; - case 's': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - opt_scantime = v; - break; - case 'T': - v = atoi(arg); - if (v < 1 || v > 99999) /* sanity check */ - show_usage_and_exit(1); - opt_timeout = v; - break; - case 't': - v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ - show_usage_and_exit(1); - 
opt_n_threads = v; - break; - case 'v': - v = atoi(arg); - if (v < 0 || v > 1024) /* sanity check */ - show_usage_and_exit(1); - opt_vote = (uint16_t)v; - break; - case 'm': - opt_trust_pool = true; - break; - case 'u': - free(rpc_user); - rpc_user = strdup(arg); - break; - case 'o': /* --url */ - p = strstr(arg, "://"); - if (p) { - if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && - strncasecmp(arg, "stratum+tcp://", 14)) - show_usage_and_exit(1); - free(rpc_url); - rpc_url = strdup(arg); - } else { - if (!strlen(arg) || *arg == '/') - show_usage_and_exit(1); - free(rpc_url); - rpc_url = (char*)malloc(strlen(arg) + 8); - sprintf(rpc_url, "http://%s", arg); - } - p = strrchr(rpc_url, '@'); - if (p) { - char *sp, *ap; - *p = '\0'; - ap = strstr(rpc_url, "://") + 3; - sp = strchr(ap, ':'); - if (sp) { - free(rpc_userpass); - rpc_userpass = strdup(ap); - free(rpc_user); - rpc_user = (char*)calloc(sp - ap + 1, 1); - strncpy(rpc_user, ap, sp - ap); - free(rpc_pass); - rpc_pass = strdup(sp + 1); - } else { - free(rpc_user); - rpc_user = strdup(ap); - } - memmove(ap, p + 1, strlen(p + 1) + 1); - } - have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); - break; - case 'O': /* --userpass */ - p = strchr(arg, ':'); - if (!p) - show_usage_and_exit(1); - free(rpc_userpass); - rpc_userpass = strdup(arg); - free(rpc_user); - rpc_user = (char*)calloc(p - arg + 1, 1); - strncpy(rpc_user, arg, p - arg); - free(rpc_pass); - rpc_pass = strdup(p + 1); - break; - case 'x': /* --proxy */ - if (!strncasecmp(arg, "socks4://", 9)) - opt_proxy_type = CURLPROXY_SOCKS4; - else if (!strncasecmp(arg, "socks5://", 9)) - opt_proxy_type = CURLPROXY_SOCKS5; -#if LIBCURL_VERSION_NUM >= 0x071200 - else if (!strncasecmp(arg, "socks4a://", 10)) - opt_proxy_type = CURLPROXY_SOCKS4A; - else if (!strncasecmp(arg, "socks5h://", 10)) - opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; -#endif - else - opt_proxy_type = CURLPROXY_HTTP; - free(opt_proxy); - opt_proxy = strdup(arg); - break; - case 1001: - free(opt_cert); - opt_cert = strdup(arg); - break; - case 1005: - opt_benchmark = true; - want_longpoll = false; - want_stratum = false; - have_stratum = false; - break; - case 1003: - want_longpoll = false; - break; - case 1007: - want_stratum = false; - break; - case 'S': - use_syslog = true; - break; - case 'd': // CB - { - char * pch = strtok (arg,","); - opt_n_threads = 0; - while (pch != NULL) { - if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') - { - if (atoi(pch) < num_processors) - device_map[opt_n_threads++] = atoi(pch); - else { - applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); - exit(1); - } - } else { - int device = cuda_finddevice(pch); - if (device >= 0 && device < num_processors) - device_map[opt_n_threads++] = device; - else { - applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); - exit(1); - } - } - pch = strtok (NULL, ","); - } - } - break; - case 'f': // CH - Divisor for Difficulty - d = atof(arg); - if (d == 0) /* sanity check */ - show_usage_and_exit(1); - opt_difficulty = d; - break; - case 'V': - show_version_and_exit(); - case 'h': - show_usage_and_exit(0); - default: - show_usage_and_exit(1); - } -} - -static void parse_config(void) -{ - int i; - json_t *val; - - if (!json_is_object(opt_config)) - return; - - for (i = 0; i < ARRAY_SIZE(options); i++) { - if (!options[i].name) - break; - if (!strcmp(options[i].name, "config")) - continue; - - val = json_object_get(opt_config, options[i].name); - if (!val) - 
continue; - - if (options[i].has_arg && json_is_string(val)) { - char *s = strdup(json_string_value(val)); - if (!s) - break; - parse_arg(options[i].val, s); - free(s); - } else if (!options[i].has_arg && json_is_true(val)) - parse_arg(options[i].val, ""); - else - applog(LOG_ERR, "JSON option %s invalid", - options[i].name); - } - - if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { - fprintf(stderr, "Heavycoin hash requires block reward vote parameter (see --vote)\n"); - show_usage_and_exit(1); - } -} - -static void parse_cmdline(int argc, char *argv[]) -{ - int key; - - while (1) { -#if HAVE_GETOPT_LONG - key = getopt_long(argc, argv, short_options, options, NULL); -#else - key = getopt(argc, argv, short_options); -#endif - if (key < 0) - break; - - parse_arg(key, optarg); - } - if (optind < argc) { - fprintf(stderr, "%s: unsupported non-option argument '%s'\n", - argv[0], argv[optind]); - show_usage_and_exit(1); - } - - if (opt_algo == ALGO_HEAVY && opt_vote == 9999) { - fprintf(stderr, "%s: Heavycoin hash requires block reward vote parameter (see --vote)\n", - argv[0]); - show_usage_and_exit(1); - } - - parse_config(); -} - -#ifndef WIN32 -static void signal_handler(int sig) -{ - switch (sig) { - case SIGHUP: - applog(LOG_INFO, "SIGHUP received"); - break; - case SIGINT: - applog(LOG_INFO, "SIGINT received, exiting"); - exit(0); - break; - case SIGTERM: - applog(LOG_INFO, "SIGTERM received, exiting"); - exit(0); - break; - } -} -#endif - -#define PROGRAM_VERSION "1.2" -int main(int argc, char *argv[]) -{ - struct thr_info *thr; - long flags; - int i; - -#ifdef WIN32 - SYSTEM_INFO sysinfo; -#endif - - printf(" *** ccMiner for nVidia GPUs by Christian Buchner and Christian H. ***\n"); - printf("\t This is version "PROGRAM_VERSION" (beta)\n"); - printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n"); - printf("\t based on pooler-cpuminer extension for HVC from\n\t https://github.com/heavycoin/cpuminer-heavycoin\n"); - printf("\t\t\tand\n\t http://hvc.1gh.com/\n"); - printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n"); - printf("\t LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm\n"); - printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n"); - printf("\t YAC donation address: Y87sptDEcpLkLeAuex6qZioDbvy1qXZEj4\n"); - - rpc_user = strdup(""); - rpc_pass = strdup(""); - - pthread_mutex_init(&applog_lock, NULL); - num_processors = cuda_num_devices(); - - /* parse command line */ - parse_cmdline(argc, argv); - - cuda_devicenames(); - - if (!opt_benchmark && !rpc_url) { - fprintf(stderr, "%s: no URL supplied\n", argv[0]); - show_usage_and_exit(1); - } - - if (!rpc_userpass) { - rpc_userpass = (char*)malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); - if (!rpc_userpass) - return 1; - sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); - } - - pthread_mutex_init(&stats_lock, NULL); - pthread_mutex_init(&g_work_lock, NULL); - pthread_mutex_init(&stratum.sock_lock, NULL); - pthread_mutex_init(&stratum.work_lock, NULL); - - flags = !opt_benchmark && strncmp(rpc_url, "https:", 6) - ? 
(CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) - : CURL_GLOBAL_ALL; - if (curl_global_init(flags)) { - applog(LOG_ERR, "CURL initialization failed"); - return 1; - } - -#ifndef WIN32 - if (opt_background) { - i = fork(); - if (i < 0) exit(1); - if (i > 0) exit(0); - i = setsid(); - if (i < 0) - applog(LOG_ERR, "setsid() failed (errno = %d)", errno); - i = chdir("/"); - if (i < 0) - applog(LOG_ERR, "chdir() failed (errno = %d)", errno); - signal(SIGHUP, signal_handler); - signal(SIGINT, signal_handler); - signal(SIGTERM, signal_handler); - } -#endif - - if (num_processors == 0) - { - applog(LOG_ERR, "No CUDA devices found! terminating."); - exit(1); - } - if (!opt_n_threads) - opt_n_threads = num_processors; - -#ifdef HAVE_SYSLOG_H - if (use_syslog) - openlog("cpuminer", LOG_PID, LOG_USER); -#endif - - work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); - if (!work_restart) - return 1; - - thr_info = (struct thr_info *)calloc(opt_n_threads + 3, sizeof(*thr)); - if (!thr_info) - return 1; - - thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); - if (!thr_hashrates) - return 1; - - /* init workio thread info */ - work_thr_id = opt_n_threads; - thr = &thr_info[work_thr_id]; - thr->id = work_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start work I/O thread */ - if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { - applog(LOG_ERR, "workio thread create failed"); - return 1; - } - - if (want_longpoll && !have_stratum) { - /* init longpoll thread info */ - longpoll_thr_id = opt_n_threads + 1; - thr = &thr_info[longpoll_thr_id]; - thr->id = longpoll_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start longpoll thread */ - if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { - applog(LOG_ERR, "longpoll thread create failed"); - return 1; - } - } - if (want_stratum) { - /* init stratum thread info */ - stratum_thr_id = opt_n_threads + 2; - thr = &thr_info[stratum_thr_id]; - thr->id = stratum_thr_id; - thr->q = tq_new(); - if (!thr->q) - return 1; - - /* start stratum thread */ - if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { - applog(LOG_ERR, "stratum thread create failed"); - return 1; - } - - if (have_stratum) - tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); - } - - /* start mining threads */ - for (i = 0; i < opt_n_threads; i++) { - thr = &thr_info[i]; - - thr->id = i; - thr->q = tq_new(); - if (!thr->q) - return 1; - - if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { - applog(LOG_ERR, "thread %d create failed", i); - return 1; - } - } - - applog(LOG_INFO, "%d miner threads started, " - "using '%s' algorithm.", - opt_n_threads, - algo_names[opt_algo]); - -#ifdef WIN32 - timeBeginPeriod(1); // enable high timer precision (similar to Google Chrome Trick) -#endif - - /* main loop - simply wait for workio thread to exit */ - pthread_join(thr_info[work_thr_id].pth, NULL); - -#ifdef WIN32 - timeEndPeriod(1); // be nice and forego high timer precision -#endif - - applog(LOG_INFO, "workio thread dead, exiting."); - - return 0; -} diff --git a/cpuminer-config.h b/cpuminer-config.h deleted file mode 100644 index 21c0a1b40c..0000000000 --- a/cpuminer-config.h +++ /dev/null @@ -1,190 +0,0 @@ -/* cpuminer-config.h.in. Generated from configure.ac by autoheader. */ - -/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP - systems. This function is required for `alloca.c' support on those systems. 
- */ -#undef CRAY_STACKSEG_END - -/* Define to 1 if using `alloca.c'. */ -#undef C_ALLOCA - -/* Define to 1 if you have `alloca', as a function or macro. */ -#undef HAVE_ALLOCA - -/* Define to 1 if you have and it should be used (not on Ultrix). - */ -#undef HAVE_ALLOCA_H - -/* Define to 1 if you have the declaration of `be32dec', and to 0 if you - don't. */ -#undef HAVE_DECL_BE32DEC - -/* Define to 1 if you have the declaration of `be32enc', and to 0 if you - don't. */ -#undef HAVE_DECL_BE32ENC - -/* Define to 1 if you have the declaration of `le32dec', and to 0 if you - don't. */ -#undef HAVE_DECL_LE32DEC - -/* Define to 1 if you have the declaration of `le32enc', and to 0 if you - don't. */ -#undef HAVE_DECL_LE32ENC - -/* Define to 1 if you have the `getopt_long' function. */ -#define HAVE_GETOPT_LONG 1 - -/* Define to 1 if you have the header file. */ -#undef HAVE_INTTYPES_H - -/* Define to 1 if you have a functional curl library. */ -#undef HAVE_LIBCURL - -/* Define to 1 if you have the header file. */ -#undef HAVE_MEMORY_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDINT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDLIB_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRINGS_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRING_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYSLOG_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_ENDIAN_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_PARAM_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_STAT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_SYSCTL_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_TYPES_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_UNISTD_H - -/* Defined if libcurl supports AsynchDNS */ -#undef LIBCURL_FEATURE_ASYNCHDNS - -/* Defined if libcurl supports IDN */ -#undef LIBCURL_FEATURE_IDN - -/* Defined if libcurl supports IPv6 */ -#undef LIBCURL_FEATURE_IPV6 - -/* Defined if libcurl supports KRB4 */ -#undef LIBCURL_FEATURE_KRB4 - -/* Defined if libcurl supports libz */ -#undef LIBCURL_FEATURE_LIBZ - -/* Defined if libcurl supports NTLM */ -#undef LIBCURL_FEATURE_NTLM - -/* Defined if libcurl supports SSL */ -#undef LIBCURL_FEATURE_SSL - -/* Defined if libcurl supports SSPI */ -#undef LIBCURL_FEATURE_SSPI - -/* Defined if libcurl supports DICT */ -#undef LIBCURL_PROTOCOL_DICT - -/* Defined if libcurl supports FILE */ -#undef LIBCURL_PROTOCOL_FILE - -/* Defined if libcurl supports FTP */ -#undef LIBCURL_PROTOCOL_FTP - -/* Defined if libcurl supports FTPS */ -#undef LIBCURL_PROTOCOL_FTPS - -/* Defined if libcurl supports HTTP */ -#undef LIBCURL_PROTOCOL_HTTP - -/* Defined if libcurl supports HTTPS */ -#undef LIBCURL_PROTOCOL_HTTPS - -/* Defined if libcurl supports IMAP */ -#undef LIBCURL_PROTOCOL_IMAP - -/* Defined if libcurl supports LDAP */ -#undef LIBCURL_PROTOCOL_LDAP - -/* Defined if libcurl supports POP3 */ -#undef LIBCURL_PROTOCOL_POP3 - -/* Defined if libcurl supports RTSP */ -#undef LIBCURL_PROTOCOL_RTSP - -/* Defined if libcurl supports SMTP */ -#undef LIBCURL_PROTOCOL_SMTP - -/* Defined if libcurl supports TELNET */ -#undef LIBCURL_PROTOCOL_TELNET - -/* Defined if libcurl supports TFTP */ -#undef LIBCURL_PROTOCOL_TFTP - -/* Define to 1 if your C compiler doesn't accept -c and -o together. 
*/ -#undef NO_MINUS_C_MINUS_O - -/* Name of package */ -#undef PACKAGE - -/* Define to the address where bug reports for this package should be sent. */ -#undef PACKAGE_BUGREPORT - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "ccminer" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 2014.06.15" - -/* Define to the one symbol short name of this package. */ -#undef PACKAGE_TARNAME - -/* Define to the home page for this package. */ -#undef PACKAGE_URL - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "2014.06.15" - -/* If using the C implementation of alloca, define if you know the - direction of stack growth for your system; otherwise it will be - automatically deduced at runtime. - STACK_DIRECTION > 0 => grows toward higher addresses - STACK_DIRECTION < 0 => grows toward lower addresses - STACK_DIRECTION = 0 => direction of growth unknown */ -#undef STACK_DIRECTION - -/* Define to 1 if you have the ANSI C header files. */ -#undef STDC_HEADERS - -/* Define to 1 if AVX assembly is available. */ -#undef USE_AVX - -/* Define to 1 if XOP assembly is available. */ -#undef USE_XOP - -/* Version number of package */ -#undef VERSION - -/* Define curl_free() as free() if our version of curl lacks curl_free. */ -#undef curl_free - -/* Define to `unsigned int' if does not define. */ -#undef size_t diff --git a/cpuminer-config.h.in b/cpuminer-config.h.in deleted file mode 100644 index b8668be60d..0000000000 --- a/cpuminer-config.h.in +++ /dev/null @@ -1,199 +0,0 @@ -/* cpuminer-config.h.in. Generated from configure.ac by autoheader. */ - -/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP - systems. This function is required for `alloca.c' support on those systems. - */ -#undef CRAY_STACKSEG_END - -/* Define to 1 if using `alloca.c'. */ -#undef C_ALLOCA - -/* Define to 1 if you have `alloca', as a function or macro. */ -#undef HAVE_ALLOCA - -/* Define to 1 if you have and it should be used (not on Ultrix). - */ -#undef HAVE_ALLOCA_H - -/* Define to 1 if you have the declaration of `be32dec', and to 0 if you - don't. */ -#undef HAVE_DECL_BE32DEC - -/* Define to 1 if you have the declaration of `be32enc', and to 0 if you - don't. */ -#undef HAVE_DECL_BE32ENC - -/* Define to 1 if you have the declaration of `le32dec', and to 0 if you - don't. */ -#undef HAVE_DECL_LE32DEC - -/* Define to 1 if you have the declaration of `le32enc', and to 0 if you - don't. */ -#undef HAVE_DECL_LE32ENC - -/* Define to 1 if you have the `getopt_long' function. */ -#undef HAVE_GETOPT_LONG - -/* Define to 1 if you have the header file. */ -#undef HAVE_INTTYPES_H - -/* Define to 1 if you have the `crypto' library (-lcrypto). */ -#undef HAVE_LIBCRYPTO - -/* Define to 1 if you have a functional curl library. */ -#undef HAVE_LIBCURL - -/* Define to 1 if you have the `ssl' library (-lssl). */ -#undef HAVE_LIBSSL - -/* Define to 1 if you have the header file. */ -#undef HAVE_MEMORY_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDINT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDLIB_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRINGS_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRING_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYSLOG_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_ENDIAN_H - -/* Define to 1 if you have the header file. 
*/ -#undef HAVE_SYS_PARAM_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_STAT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_SYSCTL_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_TYPES_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_UNISTD_H - -/* Defined if libcurl supports AsynchDNS */ -#undef LIBCURL_FEATURE_ASYNCHDNS - -/* Defined if libcurl supports IDN */ -#undef LIBCURL_FEATURE_IDN - -/* Defined if libcurl supports IPv6 */ -#undef LIBCURL_FEATURE_IPV6 - -/* Defined if libcurl supports KRB4 */ -#undef LIBCURL_FEATURE_KRB4 - -/* Defined if libcurl supports libz */ -#undef LIBCURL_FEATURE_LIBZ - -/* Defined if libcurl supports NTLM */ -#undef LIBCURL_FEATURE_NTLM - -/* Defined if libcurl supports SSL */ -#undef LIBCURL_FEATURE_SSL - -/* Defined if libcurl supports SSPI */ -#undef LIBCURL_FEATURE_SSPI - -/* Defined if libcurl supports DICT */ -#undef LIBCURL_PROTOCOL_DICT - -/* Defined if libcurl supports FILE */ -#undef LIBCURL_PROTOCOL_FILE - -/* Defined if libcurl supports FTP */ -#undef LIBCURL_PROTOCOL_FTP - -/* Defined if libcurl supports FTPS */ -#undef LIBCURL_PROTOCOL_FTPS - -/* Defined if libcurl supports HTTP */ -#undef LIBCURL_PROTOCOL_HTTP - -/* Defined if libcurl supports HTTPS */ -#undef LIBCURL_PROTOCOL_HTTPS - -/* Defined if libcurl supports IMAP */ -#undef LIBCURL_PROTOCOL_IMAP - -/* Defined if libcurl supports LDAP */ -#undef LIBCURL_PROTOCOL_LDAP - -/* Defined if libcurl supports POP3 */ -#undef LIBCURL_PROTOCOL_POP3 - -/* Defined if libcurl supports RTSP */ -#undef LIBCURL_PROTOCOL_RTSP - -/* Defined if libcurl supports SMTP */ -#undef LIBCURL_PROTOCOL_SMTP - -/* Defined if libcurl supports TELNET */ -#undef LIBCURL_PROTOCOL_TELNET - -/* Defined if libcurl supports TFTP */ -#undef LIBCURL_PROTOCOL_TFTP - -/* Define to 1 if your C compiler doesn't accept -c and -o together. */ -#undef NO_MINUS_C_MINUS_O - -/* Name of package */ -#undef PACKAGE - -/* Define to the address where bug reports for this package should be sent. */ -#undef PACKAGE_BUGREPORT - -/* Define to the full name of this package. */ -#undef PACKAGE_NAME - -/* Define to the full name and version of this package. */ -#undef PACKAGE_STRING - -/* Define to the one symbol short name of this package. */ -#undef PACKAGE_TARNAME - -/* Define to the home page for this package. */ -#undef PACKAGE_URL - -/* Define to the version of this package. */ -#undef PACKAGE_VERSION - -/* If using the C implementation of alloca, define if you know the - direction of stack growth for your system; otherwise it will be - automatically deduced at runtime. - STACK_DIRECTION > 0 => grows toward higher addresses - STACK_DIRECTION < 0 => grows toward lower addresses - STACK_DIRECTION = 0 => direction of growth unknown */ -#undef STACK_DIRECTION - -/* Define to 1 if you have the ANSI C header files. */ -#undef STDC_HEADERS - -/* Define to 1 if AVX assembly is available. */ -#undef USE_AVX - -/* Define to 1 if AVX2 assembly is available. */ -#undef USE_AVX2 - -/* Define to 1 if XOP assembly is available. */ -#undef USE_XOP - -/* Version number of package */ -#undef VERSION - -/* Define curl_free() as free() if our version of curl lacks curl_free. */ -#undef curl_free - -/* Define to `unsigned int' if does not define. */ -#undef size_t diff --git a/crc32.c b/crc32.c new file mode 100644 index 0000000000..f036bcbd3b --- /dev/null +++ b/crc32.c @@ -0,0 +1,119 @@ +/*- + * COPYRIGHT (C) 1986 Gary S. Brown. 
You may use this program, or + * code or tables extracted from it, as desired without restriction. + * + * First, the polynomial itself and its table of feedback terms. The + * polynomial is + * X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0 + * + * Note that we take it "backwards" and put the highest-order term in + * the lowest-order bit. The X^32 term is "implied"; the LSB is the + * X^31 term, etc. The X^0 term (usually shown as "+1") results in + * the MSB being 1. + * + * Note that the usual hardware shift register implementation, which + * is what we're using (we're merely optimizing it by doing eight-bit + * chunks at a time) shifts bits into the lowest-order term. In our + * implementation, that means shifting towards the right. Why do we + * do it this way? Because the calculated CRC must be transmitted in + * order from highest-order term to lowest-order term. UARTs transmit + * characters in order from LSB to MSB. By storing the CRC this way + * we hand it to the UART in the order low-byte to high-byte; the UART + * sends each low-bit to high-bit; and the result is transmission bit + * by bit from highest- to lowest-order term without requiring any bit + * shuffling on our part. Reception works similarly. + * + * The feedback terms table consists of 256, 32-bit entries. Notes + * + * The table can be generated at runtime if desired; code to do so + * is shown later. It might not be obvious, but the feedback + * terms simply represent the results of eight shift/xor opera- + * tions for all combinations of data and CRC register values. + * + * The values must be right-shifted by eight bits by the "updcrc" + * logic; the shift must be unsigned (bring in zeroes). On some + * hardware you could probably optimize the shift in assembler by + * using byte-swap instructions. + * polynomial $edb88320 + * + * + * CRC32 code derived from work by Gary S. Brown.
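+ *
+ * A minimal sketch (illustrative only, not taken from the original
+ * sources) of how the crc32_tab[] table defined below could be rebuilt
+ * at runtime, assuming the same reflected polynomial 0xedb88320:
+ *
+ *   for (uint32_t i = 0; i < 256; i++) {
+ *       uint32_t c = i;
+ *       for (int k = 0; k < 8; k++)
+ *           c = (c & 1) ? (0xedb88320 ^ (c >> 1)) : (c >> 1);
+ *       crc32_tab[i] = c;   // e.g. crc32_tab[1] == 0x77073096
+ *   }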
+ */ + +#include +#include + +static uint32_t crc32_tab[] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, + 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, + 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, + 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, + 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, + 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, + 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, + 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, + 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, + 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, + 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, + 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, + 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, + 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, + 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, + 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, + 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, + 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d +}; + +/* Real CRC32 Function */ +extern uint32_t crc32(uint32_t crc, const void *buf, size_t size) +{ + const uint8_t *p; + + p = buf; + crc = crc ^ ~0U; + + while (size--) + crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); + + return crc ^ ~0U; +} + +/* CRC32 Function simplified for ccminer */ +extern uint32_t crc32_u32t(const 
uint32_t *buf, size_t size) +{ + const uint8_t *p; + uint32_t crc = 0; + + p = (uint8_t *) buf; + crc = crc ^ ~0U; + + while (size--) + crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); + + return crc ^ ~0U; +} diff --git a/crypto/aesb.cpp b/crypto/aesb.cpp new file mode 100644 index 0000000000..2e18b2b852 --- /dev/null +++ b/crypto/aesb.cpp @@ -0,0 +1,170 @@ +/* +--------------------------------------------------------------------------- +Copyright (c) 1998-2013, Brian Gladman, Worcester, UK. All rights reserved. + +The redistribution and use of this software (with or without changes) +is allowed without the payment of fees or royalties provided that: + + source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation. + +This software is provided 'as is' with no explicit or implied warranties +in respect of its operation, including, but not limited to, correctness +and fitness for purpose. +--------------------------------------------------------------------------- +Issue Date: 20/12/2007 +*/ + +#include + +#if defined(__cplusplus) +extern "C" +{ +#endif + +#define TABLE_ALIGN 32 +#define WPOLY 0x011b +#define N_COLS 4 +#define AES_BLOCK_SIZE 16 +#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) + +#if defined(_MSC_VER) +#define ALIGN __declspec(align(TABLE_ALIGN)) +#elif defined(__GNUC__) +#define ALIGN __attribute__ ((aligned(16))) +#else +#define ALIGN +#endif + +#define rf1(r,c) (r) +#define word_in(x,c) (*((uint32_t*)(x)+(c))) +#define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v)) + +#define s(x,c) x[c] +#define si(y,x,c) (s(y,c) = word_in(x, c)) +#define so(y,x,c) word_out(y, c, s(x,c)) +#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3) +#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3) +#define round(y,x,k) \ +y[0] = (k)[0] ^ (t_fn[0][x[0] & 0xff] ^ t_fn[1][(x[1] >> 8) & 0xff] ^ t_fn[2][(x[2] >> 16) & 0xff] ^ t_fn[3][x[3] >> 24]); \ +y[1] = (k)[1] ^ (t_fn[0][x[1] & 0xff] ^ t_fn[1][(x[2] >> 8) & 0xff] ^ t_fn[2][(x[3] >> 16) & 0xff] ^ t_fn[3][x[0] >> 24]); \ +y[2] = (k)[2] ^ (t_fn[0][x[2] & 0xff] ^ t_fn[1][(x[3] >> 8) & 0xff] ^ t_fn[2][(x[0] >> 16) & 0xff] ^ t_fn[3][x[1] >> 24]); \ +y[3] = (k)[3] ^ (t_fn[0][x[3] & 0xff] ^ t_fn[1][(x[0] >> 8) & 0xff] ^ t_fn[2][(x[1] >> 16) & 0xff] ^ t_fn[3][x[2] >> 24]); +#define to_byte(x) ((x) & 0xff) +#define bval(x,n) to_byte((x) >> (8 * (n))) + +#define fwd_var(x,r,c)\ + ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ + : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\ + : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ + : ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? 
s(x,1) : s(x,2))) + +#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c)) + +#define sb_data(w) {\ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ + w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ + w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ + w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ + w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ + w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ + w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ + w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ + w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ + w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ + w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ + w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ + w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ + w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ + w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ + w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ + w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ + w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ + w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ + w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ + w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ + w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ + w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ + w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ + w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ + w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ + w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ + w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } + +#define rc_data(w) {\ + w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\ + w(0x1b), w(0x36) } + +#define bytes2word(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \ + ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0)) + +#define h0(x) (x) +#define w0(p) bytes2word(p, 0, 0, 0) +#define w1(p) bytes2word(0, p, 0, 0) +#define w2(p) bytes2word(0, 0, p, 0) +#define w3(p) bytes2word(0, 0, 0, p) + +#define u0(p) bytes2word(f2(p), p, p, f3(p)) +#define u1(p) bytes2word(f3(p), f2(p), p, p) +#define u2(p) bytes2word(p, f3(p), f2(p), p) +#define u3(p) bytes2word(p, p, f3(p), f2(p)) + +#define v0(p) bytes2word(fe(p), f9(p), fd(p), fb(p)) +#define v1(p) bytes2word(fb(p), fe(p), f9(p), fd(p)) +#define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p)) +#define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p)) + +#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY)) +#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY)) +#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) ^ 
(((x>>5) & 4) * WPOLY)) +#define f3(x) (f2(x) ^ x) +#define f9(x) (f8(x) ^ x) +#define fb(x) (f8(x) ^ f2(x) ^ x) +#define fd(x) (f8(x) ^ f4(x) ^ x) +#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) + +#define t_dec(m,n) t_##m##n +#define t_set(m,n) t_##m##n +#define t_use(m,n) t_##m##n + +#define d_4(t,n,b,e,f,g,h) ALIGN const t n[4][256] = { b(e), b(f), b(g), b(h) } + +#define four_tables(x,tab,vf,rf,c) \ + (tab[0][bval(vf(x,0,c),rf(0,c))] \ + ^ tab[1][bval(vf(x,1,c),rf(1,c))] \ + ^ tab[2][bval(vf(x,2,c),rf(2,c))] \ + ^ tab[3][bval(vf(x,3,c),rf(3,c))]) + +d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3); + +void aesb_single_round(const uint8_t *in, uint8_t *out, uint8_t *expandedKey) +{ + round(((uint32_t*) out), ((uint32_t*) in), ((uint32_t*) expandedKey)); +} + +void aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey) +{ + uint32_t b1[4]; + round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey)); + round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 1 * N_COLS); + round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 2 * N_COLS); + round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 3 * N_COLS); + round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 4 * N_COLS); + round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 5 * N_COLS); + round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 6 * N_COLS); + round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 7 * N_COLS); + round(b1, ((uint32_t*) val), ((const uint32_t *) expandedKey) + 8 * N_COLS); + round(((uint32_t*) val), b1, ((const uint32_t *) expandedKey) + 9 * N_COLS); +} + + +#if defined(__cplusplus) +} +#endif diff --git a/crypto/cn_aes.cuh b/crypto/cn_aes.cuh new file mode 100644 index 0000000000..99ad212d67 --- /dev/null +++ b/crypto/cn_aes.cuh @@ -0,0 +1,990 @@ + +#define N_COLS 4 +#define WPOLY 0x011b +/* +static const __device__ __align__(16) uint32_t d_t_fn[1024] = { + 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U, + 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU, + 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU, + 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U, 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU, + 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU, 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U, + 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U, 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU, + 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU, 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU, + 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU, 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU, + 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U, 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU, + 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU, 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U, + 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U, 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U, + 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U, 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U, + 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU, 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U, + 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU, 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU, + 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U, 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U, + 0xdfbcbc63U, 
0xc1b6b677U, 0x75dadaafU, 0x63212142U, 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU, + 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U, 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU, + 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU, 0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U, + 0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U, 0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU, + 0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U, 0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU, + 0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U, 0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U, + 0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U, 0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U, + 0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU, 0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U, + 0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU, 0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U, + 0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU, 0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U, + 0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU, 0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU, + 0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU, 0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU, + 0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U, 0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U, + 0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U, 0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U, + 0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U, 0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U, + 0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU, 0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U, + 0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU, 0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU, + + 0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU, 0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U, + 0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU, 0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU, + 0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U, 0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU, + 0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU, 0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU, + 0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU, 0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU, + 0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U, 0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU, + 0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU, 0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U, + 0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU, 0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU, + 0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU, 0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU, + 0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU, 0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U, + 0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU, 0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU, + 0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU, 0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU, + 0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U, 0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U, + 0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U, 0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U, + 0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU, 0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U, + 0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U, 0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU, + 0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU, 0x5f5fbee1U, 
0x979735a2U, 0x444488ccU, 0x17172e39U, + 0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U, 0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U, + 0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU, 0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U, + 0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU, 0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U, + 0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU, 0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U, + 0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U, 0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU, + 0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U, 0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U, + 0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U, 0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U, + 0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U, 0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U, + 0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U, 0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U, + 0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU, 0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U, + 0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U, 0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U, + 0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U, 0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U, + 0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U, 0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU, + 0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U, 0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U, + 0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U, 0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU, + + 0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU, 0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U, + 0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU, 0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U, + 0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU, 0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U, + 0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU, 0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U, + 0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U, 0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU, + 0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U, 0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U, + 0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U, 0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU, + 0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U, 0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U, + 0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU, 0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U, + 0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U, 0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U, + 0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU, 0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU, + 0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U, 0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU, + 0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU, 0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U, + 0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU, 0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U, + 0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU, 0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U, + 0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U, 0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U, + 0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU, 0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U, + 0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU, 0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U, + 0x60c0a060U, 
0x81199881U, 0x4f9ed14fU, 0xdca37fdcU, 0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U, + 0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U, 0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU, + 0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU, 0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU, + 0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U, 0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U, + 0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU, 0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U, + 0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU, 0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U, + 0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU, 0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U, + 0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU, 0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU, + 0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U, 0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU, + 0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U, 0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU, + 0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U, 0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U, + 0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U, 0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU, + 0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU, 0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U, + 0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU, 0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U, + + 0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU, 0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U, + 0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU, 0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U, + 0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU, 0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U, + 0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU, 0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U, + 0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U, 0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU, + 0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U, 0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U, + 0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U, 0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU, + 0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U, 0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U, + 0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU, 0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U, + 0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U, 0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U, + 0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU, 0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU, + 0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U, 0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU, + 0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU, 0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U, + 0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU, 0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U, + 0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU, 0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U, + 0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U, 0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U, + 0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU, 0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U, + 0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU, 0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U, + 0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU, 0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U, + 0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U, 0xa779dedeU, 
0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU, + 0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU, 0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU, + 0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U, 0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U, + 0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU, 0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U, + 0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU, 0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U, + 0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU, 0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U, + 0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU, 0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU, + 0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U, 0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU, + 0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U, 0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU, + 0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U, 0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U, + 0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U, 0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU, + 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU, 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U, + 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU, 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U +}; +*/ + +#define AS_U32(addr) *((uint32_t*)(addr)) +#define AS_U64(addr) *((uint64_t*)(addr)) +#define AS_UINT2(addr) *((uint2*)(addr)) +#define AS_UINT4(addr) *((uint4*)(addr)) +#define AS_UL2(addr) *((ulonglong2*)(addr)) + +#define t_fn0(x) (sharedMemory[x]) +#define t_fn1(x) (sharedMemory[0x100U | (x)]) +#define t_fn2(x) (sharedMemory[0x200U | (x)]) +#define t_fn3(x) (sharedMemory[0x300U | (x)]) + +#define round(shared, out, x, k) \ + out[0] = (k)[0] ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24) & 0xff)); \ + out[1] = (k)[1] ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24) & 0xff)); \ + out[2] = (k)[2] ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24) & 0xff)); \ + out[3] = (k)[3] ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) & 0xff)); + +#define round_u4(shared, out, in, k) \ + ((uint32_t*)out)[0] = (k)[0] ^ t_fn0(in[0].x) ^ t_fn1(in[1].y) ^ t_fn2(in[2].z) ^ t_fn3(in[3].w); \ + ((uint32_t*)out)[1] = (k)[1] ^ t_fn0(in[1].x) ^ t_fn1(in[2].y) ^ t_fn2(in[3].z) ^ t_fn3(in[0].w); \ + ((uint32_t*)out)[2] = (k)[2] ^ t_fn0(in[2].x) ^ t_fn1(in[3].y) ^ t_fn2(in[0].z) ^ t_fn3(in[1].w); \ + ((uint32_t*)out)[3] = (k)[3] ^ t_fn0(in[3].x) ^ t_fn1(in[0].y) ^ t_fn2(in[1].z) ^ t_fn3(in[2].w); + +#ifdef __INTELLISENSE__ +#define __byte_perm(a,b,c) a +#endif + +#define OFF32_0(x) (x & 0xFFu) +#define OFF32_1(x) __byte_perm(x, 0x01, 0x5541) +#define OFF32_2(x) __byte_perm(x, 0x02, 0x5542) +#define OFF32_3(x) __byte_perm(x, 0x03, 0x5543) + +#define SHARED_0(x) sharedMemory[OFF32_0(x)] +#define SHARED_1(x) sharedMemory[OFF32_1(x)] +#define SHARED_2(x) sharedMemory[OFF32_2(x)] +#define SHARED_3(x) sharedMemory[OFF32_3(x)] + +__device__ __forceinline__ +void cn_aes_single_round(uint32_t * const sharedMemory, uint32_t * const in, uint32_t * out, uint32_t* expandedKey) +{ + asm("// aes_single_round"); + out[0] = expandedKey[0] ^ SHARED_0(in[0]) ^ SHARED_1(in[1]) ^ SHARED_2(in[2]) ^ SHARED_3(in[3]); + out[1] = expandedKey[1] ^ SHARED_0(in[1]) ^ SHARED_1(in[2]) ^ SHARED_2(in[3]) ^ SHARED_3(in[0]); + out[2] = expandedKey[2] 
^ SHARED_0(in[2]) ^ SHARED_1(in[3]) ^ SHARED_2(in[0]) ^ SHARED_3(in[1]); + out[3] = expandedKey[3] ^ SHARED_0(in[3]) ^ SHARED_1(in[0]) ^ SHARED_2(in[1]) ^ SHARED_3(in[2]); +} + +// +#ifdef _WIN64 +/* do a mul.wide.u32 to prevent a shl + cvt 32 to 64 on ld.shared [ptr] */ +#define OFF8_0(x) (x & 0xFFu) * sizeof(uint32_t) +#define OFF8_1(x) __byte_perm(x, 0x01, 0x5541) * sizeof(uint32_t) +#define OFF8_2(x) __byte_perm(x, 0x02, 0x5542) * sizeof(uint32_t) +#define OFF8_3(x) __byte_perm(x, 0x03, 0x5543) * sizeof(uint32_t) +#else +#define OFF8_0(x) (x & 0xFFu) << 2 +#define OFF8_1(x) __byte_perm(x, 0x01, 0x5541) << 2 +#define OFF8_2(x) __byte_perm(x, 0x02, 0x5542) << 2 +#define OFF8_3(x) __byte_perm(x, 0x03, 0x5543) << 2 +#endif + +#define SHAR8_0(x) AS_U32(&sharedMemory[OFF8_0(x)]) +#define SHAR8_1(x) AS_U32(&sharedMemory[OFF8_1(x)]) +#define SHAR8_2(x) AS_U32(&sharedMemory[OFF8_2(x)]) +#define SHAR8_3(x) AS_U32(&sharedMemory[OFF8_3(x)]) + +__device__ __forceinline__ +void cn_aes_single_round_b(uint8_t * const sharedMemory, void * const long_state, const uint4 key, uint4 *res) +{ + asm("// aes_single_round_b"); + uint4 in = AS_UINT4(long_state); + *res = key; + res->x ^= SHAR8_0(in.x) ^ SHAR8_1(in.y) ^ SHAR8_2(in.z) ^ SHAR8_3(in.w); + res->y ^= SHAR8_0(in.y) ^ SHAR8_1(in.z) ^ SHAR8_2(in.w) ^ SHAR8_3(in.x); + res->z ^= SHAR8_0(in.z) ^ SHAR8_1(in.w) ^ SHAR8_2(in.x) ^ SHAR8_3(in.y); + res->w ^= SHAR8_0(in.w) ^ SHAR8_1(in.x) ^ SHAR8_2(in.y) ^ SHAR8_3(in.z); +} + +#define round_perm(shared, out, in, k) \ + out[0] = (k)[0] ^ SHARED_0(in[0]) ^ SHARED_1(in[1]) ^ SHARED_2(in[2]) ^ SHARED_3(in[3]); \ + out[1] = (k)[1] ^ SHARED_0(in[1]) ^ SHARED_1(in[2]) ^ SHARED_2(in[3]) ^ SHARED_3(in[0]); \ + out[2] = (k)[2] ^ SHARED_0(in[2]) ^ SHARED_1(in[3]) ^ SHARED_2(in[0]) ^ SHARED_3(in[1]); \ + out[3] = (k)[3] ^ SHARED_0(in[3]) ^ SHARED_1(in[0]) ^ SHARED_2(in[1]) ^ SHARED_3(in[2]); + +__device__ __forceinline__ +void cn_aes_pseudo_round_mut(const uint32_t * sharedMemory, uint32_t * val, uint32_t const * expandedKey) +{ + asm("// aes_pseudo_round_mut"); + uint32_t b[4]; + round_perm(sharedMemory, b, val, expandedKey); + round_perm(sharedMemory, val, b, expandedKey + (1 * N_COLS)); + round_perm(sharedMemory, b, val, expandedKey + (2 * N_COLS)); + round_perm(sharedMemory, val, b, expandedKey + (3 * N_COLS)); + round_perm(sharedMemory, b, val, expandedKey + (4 * N_COLS)); + round_perm(sharedMemory, val, b, expandedKey + (5 * N_COLS)); + round_perm(sharedMemory, b, val, expandedKey + (6 * N_COLS)); + round_perm(sharedMemory, val, b, expandedKey + (7 * N_COLS)); + round_perm(sharedMemory, b, val, expandedKey + (8 * N_COLS)); + round_perm(sharedMemory, val, b, expandedKey + (9 * N_COLS)); +} + +static __forceinline__ __device__ uint4 operator ^ (const uint4 &a, const uint4 &b) { + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +#define round_perm4(in, k) {\ + uint4 tmp; \ + tmp.x = SHARED_0(in.x) ^ SHARED_1(in.y) ^ SHARED_2(in.z) ^ SHARED_3(in.w); \ + tmp.y = SHARED_0(in.y) ^ SHARED_1(in.z) ^ SHARED_2(in.w) ^ SHARED_3(in.x); \ + tmp.z = SHARED_0(in.z) ^ SHARED_1(in.w) ^ SHARED_2(in.x) ^ SHARED_3(in.y); \ + tmp.w = SHARED_0(in.w) ^ SHARED_1(in.x) ^ SHARED_2(in.y) ^ SHARED_3(in.z); \ + val = tmp ^ key[k]; \ +} + +__device__ __forceinline__ +void cn_aes_pseudo_round_mut_uint4(uint32_t * const sharedMemory, uint4 &val, uint4 const key[10]) +{ + asm("// aes_pseudo_round_mut_uint4"); + round_perm4(val, 0); + round_perm4(val, 1); + round_perm4(val, 2); + round_perm4(val, 3); + round_perm4(val, 4); + 
round_perm4(val, 5); + round_perm4(val, 6); + round_perm4(val, 7); + round_perm4(val, 8); + round_perm4(val, 9); +} + +/* +__device__ __forceinline__ +void cn_aes_gpu_init2(uint32_t* sharedMemory) +{ +#if 0 + if(blockDim.x >= 64) + { + if(threadIdx.x < 64) { + #define thrX (threadIdx.x << 2U) // ensure offsets aligned (16) to vector + #pragma unroll 4 + for (uint32_t i = 0; i < 1024U; i += 256U) // 32x32 = 1024, 4 * 256 also + AS_UINT4(&sharedMemory[i + thrX]) = AS_UINT4(&d_t_fn[i + thrX]); + } + + } else +#endif + if(blockDim.x >= 32) { + + if(threadIdx.x < 32) { +#if 0 + #pragma unroll 32 + for(uint32_t i = 0; i < 1024; i += 32) + sharedMemory[threadIdx.x + i] = d_t_fn[threadIdx.x + i]; +#else + #define thrX (threadIdx.x << 2U) // ensure offsets aligned (16) to vector + #pragma unroll 8 + for (uint32_t i = 0; i < 1024; i += 128U) // 32x32 = 1024, 8 * 128 also + AS_UINT4(&sharedMemory[i + thrX]) = AS_UINT4(&d_t_fn[i + thrX]); +#endif + } + + } else { + + if(threadIdx.x < 4) { +#if 0 + for (uint32_t i = 0; i < 1024; i += 4) + sharedMemory[threadIdx.x + i] = d_t_fn[threadIdx.x + i]; +#else + #define thrX (threadIdx.x << 2U) // ensure offsets aligned (16) to vector + #pragma unroll 64 + for (uint32_t i = 0; i < 1024; i += 16U) + AS_UINT4(&sharedMemory[i + thrX]) = AS_UINT4(&d_t_fn[i + thrX]); +#endif + } + } +} +*/ + +__device__ __forceinline__ +void cn_aes_gpu_init(uint32_t* sharedMemory) +{ + // AES 0 + switch (threadIdx.x) { + case 0: + AS_UL2(&sharedMemory[0x000]) = make_ulonglong2(0x847c7cf8a56363c6, 0x8d7b7bf6997777ee); + AS_UL2(&sharedMemory[0x004]) = make_ulonglong2(0xbd6b6bd60df2f2ff, 0x54c5c591b16f6fde); + AS_UL2(&sharedMemory[0x008]) = make_ulonglong2(0x0301010250303060, 0x7d2b2b56a96767ce); + AS_UL2(&sharedMemory[0x00C]) = make_ulonglong2(0x62d7d7b519fefee7, 0x9a7676ece6abab4d); + AS_UL2(&sharedMemory[0x010]) = make_ulonglong2(0x9d82821f45caca8f, 0x877d7dfa40c9c989); + AS_UL2(&sharedMemory[0x014]) = make_ulonglong2(0xeb5959b215fafaef, 0x0bf0f0fbc947478e); + AS_UL2(&sharedMemory[0x018]) = make_ulonglong2(0x67d4d4b3ecadad41, 0xeaafaf45fda2a25f); + AS_UL2(&sharedMemory[0x01C]) = make_ulonglong2(0xf7a4a453bf9c9c23, 0x5bc0c09b967272e4); + break; + case 1: + AS_UL2(&sharedMemory[0x020]) = make_ulonglong2(0x1cfdfde1c2b7b775, 0x6a26264cae93933d); + AS_UL2(&sharedMemory[0x024]) = make_ulonglong2(0x413f3f7e5a36366c, 0x4fcccc8302f7f7f5); + AS_UL2(&sharedMemory[0x028]) = make_ulonglong2(0xf4a5a5515c343468, 0x08f1f1f934e5e5d1); + AS_UL2(&sharedMemory[0x02C]) = make_ulonglong2(0x73d8d8ab937171e2, 0x3f15152a53313162); + AS_UL2(&sharedMemory[0x030]) = make_ulonglong2(0x52c7c7950c040408, 0x5ec3c39d65232346); + AS_UL2(&sharedMemory[0x034]) = make_ulonglong2(0xa196963728181830, 0xb59a9a2f0f05050a); + AS_UL2(&sharedMemory[0x038]) = make_ulonglong2(0x361212240907070e, 0x3de2e2df9b80801b); + AS_UL2(&sharedMemory[0x03C]) = make_ulonglong2(0x6927274e26ebebcd, 0x9f7575eacdb2b27f); + break; + case 2: + AS_UL2(&sharedMemory[0x040]) = make_ulonglong2(0x9e83831d1b090912, 0x2e1a1a34742c2c58); + AS_UL2(&sharedMemory[0x044]) = make_ulonglong2(0xb26e6edc2d1b1b36, 0xfba0a05bee5a5ab4); + AS_UL2(&sharedMemory[0x048]) = make_ulonglong2(0x4d3b3b76f65252a4, 0xceb3b37d61d6d6b7); + AS_UL2(&sharedMemory[0x04C]) = make_ulonglong2(0x3ee3e3dd7b292952, 0x97848413712f2f5e); + AS_UL2(&sharedMemory[0x050]) = make_ulonglong2(0x68d1d1b9f55353a6, 0x2cededc100000000); + AS_UL2(&sharedMemory[0x054]) = make_ulonglong2(0x1ffcfce360202040, 0xed5b5bb6c8b1b179); + AS_UL2(&sharedMemory[0x058]) = make_ulonglong2(0x46cbcb8dbe6a6ad4, 
0x4b393972d9bebe67); + AS_UL2(&sharedMemory[0x05C]) = make_ulonglong2(0xd44c4c98de4a4a94, 0x4acfcf85e85858b0); + break; + case 3: + AS_UL2(&sharedMemory[0x060]) = make_ulonglong2(0x2aefefc56bd0d0bb, 0x16fbfbede5aaaa4f); + AS_UL2(&sharedMemory[0x064]) = make_ulonglong2(0xd74d4d9ac5434386, 0x9485851155333366); + AS_UL2(&sharedMemory[0x068]) = make_ulonglong2(0x10f9f9e9cf45458a, 0x817f7ffe06020204); + AS_UL2(&sharedMemory[0x06C]) = make_ulonglong2(0x443c3c78f05050a0, 0xe3a8a84bba9f9f25); + AS_UL2(&sharedMemory[0x070]) = make_ulonglong2(0xfea3a35df35151a2, 0x8a8f8f05c0404080); + AS_UL2(&sharedMemory[0x074]) = make_ulonglong2(0xbc9d9d21ad92923f, 0x04f5f5f148383870); + AS_UL2(&sharedMemory[0x078]) = make_ulonglong2(0xc1b6b677dfbcbc63, 0x6321214275dadaaf); + AS_UL2(&sharedMemory[0x07C]) = make_ulonglong2(0x1affffe530101020, 0x6dd2d2bf0ef3f3fd); + break; + case 4: + AS_UL2(&sharedMemory[0x080]) = make_ulonglong2(0x140c0c184ccdcd81, 0x2fececc335131326); + AS_UL2(&sharedMemory[0x084]) = make_ulonglong2(0xa2979735e15f5fbe, 0x3917172ecc444488); + AS_UL2(&sharedMemory[0x088]) = make_ulonglong2(0xf2a7a75557c4c493, 0x473d3d7a827e7efc); + AS_UL2(&sharedMemory[0x08C]) = make_ulonglong2(0xe75d5dbaac6464c8, 0x957373e62b191932); + AS_UL2(&sharedMemory[0x090]) = make_ulonglong2(0x98818119a06060c0, 0x7fdcdca3d14f4f9e); + AS_UL2(&sharedMemory[0x094]) = make_ulonglong2(0x7e2a2a5466222244, 0x8388880bab90903b); + AS_UL2(&sharedMemory[0x098]) = make_ulonglong2(0x29eeeec7ca46468c, 0x3c141428d3b8b86b); + AS_UL2(&sharedMemory[0x09C]) = make_ulonglong2(0xe25e5ebc79dedea7, 0x76dbdbad1d0b0b16); + break; + case 5: + AS_UL2(&sharedMemory[0x0A0]) = make_ulonglong2(0x563232643be0e0db, 0x1e0a0a144e3a3a74); + AS_UL2(&sharedMemory[0x0A4]) = make_ulonglong2(0x0a06060cdb494992, 0xe45c5cb86c242448); + AS_UL2(&sharedMemory[0x0A8]) = make_ulonglong2(0x6ed3d3bd5dc2c29f, 0xa66262c4efacac43); + AS_UL2(&sharedMemory[0x0AC]) = make_ulonglong2(0xa4959531a8919139, 0x8b7979f237e4e4d3); + AS_UL2(&sharedMemory[0x0B0]) = make_ulonglong2(0x43c8c88b32e7e7d5, 0xb76d6dda5937376e); + AS_UL2(&sharedMemory[0x0B4]) = make_ulonglong2(0x64d5d5b18c8d8d01, 0xe0a9a949d24e4e9c); + AS_UL2(&sharedMemory[0x0B8]) = make_ulonglong2(0xfa5656acb46c6cd8, 0x25eaeacf07f4f4f3); + AS_UL2(&sharedMemory[0x0BC]) = make_ulonglong2(0x8e7a7af4af6565ca, 0x18080810e9aeae47); + break; + case 6: + AS_UL2(&sharedMemory[0x0C0]) = make_ulonglong2(0x887878f0d5baba6f, 0x722e2e5c6f25254a); + AS_UL2(&sharedMemory[0x0C4]) = make_ulonglong2(0xf1a6a657241c1c38, 0x51c6c697c7b4b473); + AS_UL2(&sharedMemory[0x0C8]) = make_ulonglong2(0x7cdddda123e8e8cb, 0x211f1f3e9c7474e8); + AS_UL2(&sharedMemory[0x0CC]) = make_ulonglong2(0xdcbdbd61dd4b4b96, 0x858a8a0f868b8b0d); + AS_UL2(&sharedMemory[0x0D0]) = make_ulonglong2(0x423e3e7c907070e0, 0xaa6666ccc4b5b571); + AS_UL2(&sharedMemory[0x0D4]) = make_ulonglong2(0x05030306d8484890, 0x120e0e1c01f6f6f7); + AS_UL2(&sharedMemory[0x0D8]) = make_ulonglong2(0x5f35356aa36161c2, 0xd0b9b969f95757ae); + AS_UL2(&sharedMemory[0x0DC]) = make_ulonglong2(0x58c1c19991868617, 0xb99e9e27271d1d3a); + break; + case 7: + AS_UL2(&sharedMemory[0x0E0]) = make_ulonglong2(0x13f8f8eb38e1e1d9, 0x33111122b398982b); + AS_UL2(&sharedMemory[0x0E4]) = make_ulonglong2(0x70d9d9a9bb6969d2, 0xa7949433898e8e07); + AS_UL2(&sharedMemory[0x0E8]) = make_ulonglong2(0x221e1e3cb69b9b2d, 0x20e9e9c992878715); + AS_UL2(&sharedMemory[0x0EC]) = make_ulonglong2(0xff5555aa49cece87, 0x7adfdfa578282850); + AS_UL2(&sharedMemory[0x0F0]) = make_ulonglong2(0xf8a1a1598f8c8c03, 0x170d0d1a80898909); + 
AS_UL2(&sharedMemory[0x0F4]) = make_ulonglong2(0x31e6e6d7dabfbf65, 0xb86868d0c6424284); + AS_UL2(&sharedMemory[0x0F8]) = make_ulonglong2(0xb0999929c3414182, 0x110f0f1e772d2d5a); + AS_UL2(&sharedMemory[0x0FC]) = make_ulonglong2(0xfc5454a8cbb0b07b, 0x3a16162cd6bbbb6d); + break; + } + // AES 1 + switch (threadIdx.x) { + case 0: + AS_UL2(&sharedMemory[0x100]) = make_ulonglong2(0x7c7cf8846363c6a5, 0x7b7bf68d7777ee99); + AS_UL2(&sharedMemory[0x104]) = make_ulonglong2(0x6b6bd6bdf2f2ff0d, 0xc5c591546f6fdeb1); + AS_UL2(&sharedMemory[0x108]) = make_ulonglong2(0x0101020330306050, 0x2b2b567d6767cea9); + AS_UL2(&sharedMemory[0x10C]) = make_ulonglong2(0xd7d7b562fefee719, 0x7676ec9aabab4de6); + AS_UL2(&sharedMemory[0x110]) = make_ulonglong2(0x82821f9dcaca8f45, 0x7d7dfa87c9c98940); + AS_UL2(&sharedMemory[0x114]) = make_ulonglong2(0x5959b2ebfafaef15, 0xf0f0fb0b47478ec9); + AS_UL2(&sharedMemory[0x118]) = make_ulonglong2(0xd4d4b367adad41ec, 0xafaf45eaa2a25ffd); + AS_UL2(&sharedMemory[0x11C]) = make_ulonglong2(0xa4a453f79c9c23bf, 0xc0c09b5b7272e496); + break; + case 1: + AS_UL2(&sharedMemory[0x120]) = make_ulonglong2(0xfdfde11cb7b775c2, 0x26264c6a93933dae); + AS_UL2(&sharedMemory[0x124]) = make_ulonglong2(0x3f3f7e4136366c5a, 0xcccc834ff7f7f502); + AS_UL2(&sharedMemory[0x128]) = make_ulonglong2(0xa5a551f43434685c, 0xf1f1f908e5e5d134); + AS_UL2(&sharedMemory[0x12C]) = make_ulonglong2(0xd8d8ab737171e293, 0x15152a3f31316253); + AS_UL2(&sharedMemory[0x130]) = make_ulonglong2(0xc7c795520404080c, 0xc3c39d5e23234665); + AS_UL2(&sharedMemory[0x134]) = make_ulonglong2(0x969637a118183028, 0x9a9a2fb505050a0f); + AS_UL2(&sharedMemory[0x138]) = make_ulonglong2(0x1212243607070e09, 0xe2e2df3d80801b9b); + AS_UL2(&sharedMemory[0x13C]) = make_ulonglong2(0x27274e69ebebcd26, 0x7575ea9fb2b27fcd); + break; + case 2: + AS_UL2(&sharedMemory[0x140]) = make_ulonglong2(0x83831d9e0909121b, 0x1a1a342e2c2c5874); + AS_UL2(&sharedMemory[0x144]) = make_ulonglong2(0x6e6edcb21b1b362d, 0xa0a05bfb5a5ab4ee); + AS_UL2(&sharedMemory[0x148]) = make_ulonglong2(0x3b3b764d5252a4f6, 0xb3b37dced6d6b761); + AS_UL2(&sharedMemory[0x14C]) = make_ulonglong2(0xe3e3dd3e2929527b, 0x848413972f2f5e71); + AS_UL2(&sharedMemory[0x150]) = make_ulonglong2(0xd1d1b9685353a6f5, 0xededc12c00000000); + AS_UL2(&sharedMemory[0x154]) = make_ulonglong2(0xfcfce31f20204060, 0x5b5bb6edb1b179c8); + AS_UL2(&sharedMemory[0x158]) = make_ulonglong2(0xcbcb8d466a6ad4be, 0x3939724bbebe67d9); + AS_UL2(&sharedMemory[0x15C]) = make_ulonglong2(0x4c4c98d44a4a94de, 0xcfcf854a5858b0e8); + break; + case 3: + AS_UL2(&sharedMemory[0x160]) = make_ulonglong2(0xefefc52ad0d0bb6b, 0xfbfbed16aaaa4fe5); + AS_UL2(&sharedMemory[0x164]) = make_ulonglong2(0x4d4d9ad7434386c5, 0x8585119433336655); + AS_UL2(&sharedMemory[0x168]) = make_ulonglong2(0xf9f9e91045458acf, 0x7f7ffe8102020406); + AS_UL2(&sharedMemory[0x16C]) = make_ulonglong2(0x3c3c78445050a0f0, 0xa8a84be39f9f25ba); + AS_UL2(&sharedMemory[0x170]) = make_ulonglong2(0xa3a35dfe5151a2f3, 0x8f8f058a404080c0); + AS_UL2(&sharedMemory[0x174]) = make_ulonglong2(0x9d9d21bc92923fad, 0xf5f5f10438387048); + AS_UL2(&sharedMemory[0x178]) = make_ulonglong2(0xb6b677c1bcbc63df, 0x21214263dadaaf75); + AS_UL2(&sharedMemory[0x17C]) = make_ulonglong2(0xffffe51a10102030, 0xd2d2bf6df3f3fd0e); + break; + case 4: + AS_UL2(&sharedMemory[0x180]) = make_ulonglong2(0x0c0c1814cdcd814c, 0xececc32f13132635); + AS_UL2(&sharedMemory[0x184]) = make_ulonglong2(0x979735a25f5fbee1, 0x17172e39444488cc); + AS_UL2(&sharedMemory[0x188]) = make_ulonglong2(0xa7a755f2c4c49357, 0x3d3d7a477e7efc82); + 
AS_UL2(&sharedMemory[0x18C]) = make_ulonglong2(0x5d5dbae76464c8ac, 0x7373e6951919322b); + AS_UL2(&sharedMemory[0x190]) = make_ulonglong2(0x818119986060c0a0, 0xdcdca37f4f4f9ed1); + AS_UL2(&sharedMemory[0x194]) = make_ulonglong2(0x2a2a547e22224466, 0x88880b8390903bab); + AS_UL2(&sharedMemory[0x198]) = make_ulonglong2(0xeeeec72946468cca, 0x1414283cb8b86bd3); + AS_UL2(&sharedMemory[0x19C]) = make_ulonglong2(0x5e5ebce2dedea779, 0xdbdbad760b0b161d); + break; + case 5: + AS_UL2(&sharedMemory[0x1A0]) = make_ulonglong2(0x32326456e0e0db3b, 0x0a0a141e3a3a744e); + AS_UL2(&sharedMemory[0x1A4]) = make_ulonglong2(0x06060c0a494992db, 0x5c5cb8e42424486c); + AS_UL2(&sharedMemory[0x1A8]) = make_ulonglong2(0xd3d3bd6ec2c29f5d, 0x6262c4a6acac43ef); + AS_UL2(&sharedMemory[0x1AC]) = make_ulonglong2(0x959531a4919139a8, 0x7979f28be4e4d337); + AS_UL2(&sharedMemory[0x1B0]) = make_ulonglong2(0xc8c88b43e7e7d532, 0x6d6ddab737376e59); + AS_UL2(&sharedMemory[0x1B4]) = make_ulonglong2(0xd5d5b1648d8d018c, 0xa9a949e04e4e9cd2); + AS_UL2(&sharedMemory[0x1B8]) = make_ulonglong2(0x5656acfa6c6cd8b4, 0xeaeacf25f4f4f307); + AS_UL2(&sharedMemory[0x1BC]) = make_ulonglong2(0x7a7af48e6565caaf, 0x08081018aeae47e9); + break; + case 6: + AS_UL2(&sharedMemory[0x1C0]) = make_ulonglong2(0x7878f088baba6fd5, 0x2e2e5c7225254a6f); + AS_UL2(&sharedMemory[0x1C4]) = make_ulonglong2(0xa6a657f11c1c3824, 0xc6c69751b4b473c7); + AS_UL2(&sharedMemory[0x1C8]) = make_ulonglong2(0xdddda17ce8e8cb23, 0x1f1f3e217474e89c); + AS_UL2(&sharedMemory[0x1CC]) = make_ulonglong2(0xbdbd61dc4b4b96dd, 0x8a8a0f858b8b0d86); + AS_UL2(&sharedMemory[0x1D0]) = make_ulonglong2(0x3e3e7c427070e090, 0x6666ccaab5b571c4); + AS_UL2(&sharedMemory[0x1D4]) = make_ulonglong2(0x03030605484890d8, 0x0e0e1c12f6f6f701); + AS_UL2(&sharedMemory[0x1D8]) = make_ulonglong2(0x35356a5f6161c2a3, 0xb9b969d05757aef9); + AS_UL2(&sharedMemory[0x1DC]) = make_ulonglong2(0xc1c1995886861791, 0x9e9e27b91d1d3a27); + break; + case 7: + AS_UL2(&sharedMemory[0x1E0]) = make_ulonglong2(0xf8f8eb13e1e1d938, 0x1111223398982bb3); + AS_UL2(&sharedMemory[0x1E4]) = make_ulonglong2(0xd9d9a9706969d2bb, 0x949433a78e8e0789); + AS_UL2(&sharedMemory[0x1E8]) = make_ulonglong2(0x1e1e3c229b9b2db6, 0xe9e9c92087871592); + AS_UL2(&sharedMemory[0x1EC]) = make_ulonglong2(0x5555aaffcece8749, 0xdfdfa57a28285078); + AS_UL2(&sharedMemory[0x1F0]) = make_ulonglong2(0xa1a159f88c8c038f, 0x0d0d1a1789890980); + AS_UL2(&sharedMemory[0x1F4]) = make_ulonglong2(0xe6e6d731bfbf65da, 0x6868d0b8424284c6); + AS_UL2(&sharedMemory[0x1F8]) = make_ulonglong2(0x999929b0414182c3, 0x0f0f1e112d2d5a77); + AS_UL2(&sharedMemory[0x1FC]) = make_ulonglong2(0x5454a8fcb0b07bcb, 0x16162c3abbbb6dd6); + break; + } + // AES 2 + switch (threadIdx.x) { + case 0: + AS_UL2(&sharedMemory[0x200]) = make_ulonglong2(0x7cf8847c63c6a563, 0x7bf68d7b77ee9977); + AS_UL2(&sharedMemory[0x204]) = make_ulonglong2(0x6bd6bd6bf2ff0df2, 0xc59154c56fdeb16f); + AS_UL2(&sharedMemory[0x208]) = make_ulonglong2(0x0102030130605030, 0x2b567d2b67cea967); + AS_UL2(&sharedMemory[0x20C]) = make_ulonglong2(0xd7b562d7fee719fe, 0x76ec9a76ab4de6ab); + AS_UL2(&sharedMemory[0x210]) = make_ulonglong2(0x821f9d82ca8f45ca, 0x7dfa877dc98940c9); + AS_UL2(&sharedMemory[0x214]) = make_ulonglong2(0x59b2eb59faef15fa, 0xf0fb0bf0478ec947); + AS_UL2(&sharedMemory[0x218]) = make_ulonglong2(0xd4b367d4ad41ecad, 0xaf45eaafa25ffda2); + AS_UL2(&sharedMemory[0x21C]) = make_ulonglong2(0xa453f7a49c23bf9c, 0xc09b5bc072e49672); + break; + case 1: + AS_UL2(&sharedMemory[0x220]) = make_ulonglong2(0xfde11cfdb775c2b7, 0x264c6a26933dae93); + 
AS_UL2(&sharedMemory[0x224]) = make_ulonglong2(0x3f7e413f366c5a36, 0xcc834fccf7f502f7); + AS_UL2(&sharedMemory[0x228]) = make_ulonglong2(0xa551f4a534685c34, 0xf1f908f1e5d134e5); + AS_UL2(&sharedMemory[0x22C]) = make_ulonglong2(0xd8ab73d871e29371, 0x152a3f1531625331); + AS_UL2(&sharedMemory[0x230]) = make_ulonglong2(0xc79552c704080c04, 0xc39d5ec323466523); + AS_UL2(&sharedMemory[0x234]) = make_ulonglong2(0x9637a19618302818, 0x9a2fb59a050a0f05); + AS_UL2(&sharedMemory[0x238]) = make_ulonglong2(0x12243612070e0907, 0xe2df3de2801b9b80); + AS_UL2(&sharedMemory[0x23C]) = make_ulonglong2(0x274e6927ebcd26eb, 0x75ea9f75b27fcdb2); + break; + case 2: + AS_UL2(&sharedMemory[0x240]) = make_ulonglong2(0x831d9e8309121b09, 0x1a342e1a2c58742c); + AS_UL2(&sharedMemory[0x244]) = make_ulonglong2(0x6edcb26e1b362d1b, 0xa05bfba05ab4ee5a); + AS_UL2(&sharedMemory[0x248]) = make_ulonglong2(0x3b764d3b52a4f652, 0xb37dceb3d6b761d6); + AS_UL2(&sharedMemory[0x24C]) = make_ulonglong2(0xe3dd3ee329527b29, 0x841397842f5e712f); + AS_UL2(&sharedMemory[0x250]) = make_ulonglong2(0xd1b968d153a6f553, 0xedc12ced00000000); + AS_UL2(&sharedMemory[0x254]) = make_ulonglong2(0xfce31ffc20406020, 0x5bb6ed5bb179c8b1); + AS_UL2(&sharedMemory[0x258]) = make_ulonglong2(0xcb8d46cb6ad4be6a, 0x39724b39be67d9be); + AS_UL2(&sharedMemory[0x25C]) = make_ulonglong2(0x4c98d44c4a94de4a, 0xcf854acf58b0e858); + break; + case 3: + AS_UL2(&sharedMemory[0x260]) = make_ulonglong2(0xefc52aefd0bb6bd0, 0xfbed16fbaa4fe5aa); + AS_UL2(&sharedMemory[0x264]) = make_ulonglong2(0x4d9ad74d4386c543, 0x8511948533665533); + AS_UL2(&sharedMemory[0x268]) = make_ulonglong2(0xf9e910f9458acf45, 0x7ffe817f02040602); + AS_UL2(&sharedMemory[0x26C]) = make_ulonglong2(0x3c78443c50a0f050, 0xa84be3a89f25ba9f); + AS_UL2(&sharedMemory[0x270]) = make_ulonglong2(0xa35dfea351a2f351, 0x8f058a8f4080c040); + AS_UL2(&sharedMemory[0x274]) = make_ulonglong2(0x9d21bc9d923fad92, 0xf5f104f538704838); + AS_UL2(&sharedMemory[0x278]) = make_ulonglong2(0xb677c1b6bc63dfbc, 0x21426321daaf75da); + AS_UL2(&sharedMemory[0x27C]) = make_ulonglong2(0xffe51aff10203010, 0xd2bf6dd2f3fd0ef3); + break; + case 4: + AS_UL2(&sharedMemory[0x280]) = make_ulonglong2(0x0c18140ccd814ccd, 0xecc32fec13263513); + AS_UL2(&sharedMemory[0x284]) = make_ulonglong2(0x9735a2975fbee15f, 0x172e39174488cc44); + AS_UL2(&sharedMemory[0x288]) = make_ulonglong2(0xa755f2a7c49357c4, 0x3d7a473d7efc827e); + AS_UL2(&sharedMemory[0x28C]) = make_ulonglong2(0x5dbae75d64c8ac64, 0x73e6957319322b19); + AS_UL2(&sharedMemory[0x290]) = make_ulonglong2(0x8119988160c0a060, 0xdca37fdc4f9ed14f); + AS_UL2(&sharedMemory[0x294]) = make_ulonglong2(0x2a547e2a22446622, 0x880b8388903bab90); + AS_UL2(&sharedMemory[0x298]) = make_ulonglong2(0xeec729ee468cca46, 0x14283c14b86bd3b8); + AS_UL2(&sharedMemory[0x29C]) = make_ulonglong2(0x5ebce25edea779de, 0xdbad76db0b161d0b); + break; + case 5: + AS_UL2(&sharedMemory[0x2A0]) = make_ulonglong2(0x32645632e0db3be0, 0x0a141e0a3a744e3a); + AS_UL2(&sharedMemory[0x2A4]) = make_ulonglong2(0x060c0a064992db49, 0x5cb8e45c24486c24); + AS_UL2(&sharedMemory[0x2A8]) = make_ulonglong2(0xd3bd6ed3c29f5dc2, 0x62c4a662ac43efac); + AS_UL2(&sharedMemory[0x2AC]) = make_ulonglong2(0x9531a4959139a891, 0x79f28b79e4d337e4); + AS_UL2(&sharedMemory[0x2B0]) = make_ulonglong2(0xc88b43c8e7d532e7, 0x6ddab76d376e5937); + AS_UL2(&sharedMemory[0x2B4]) = make_ulonglong2(0xd5b164d58d018c8d, 0xa949e0a94e9cd24e); + AS_UL2(&sharedMemory[0x2B8]) = make_ulonglong2(0x56acfa566cd8b46c, 0xeacf25eaf4f307f4); + AS_UL2(&sharedMemory[0x2BC]) = 
make_ulonglong2(0x7af48e7a65caaf65, 0x08101808ae47e9ae); + break; + case 6: + AS_UL2(&sharedMemory[0x2C0]) = make_ulonglong2(0x78f08878ba6fd5ba, 0x2e5c722e254a6f25); + AS_UL2(&sharedMemory[0x2C4]) = make_ulonglong2(0xa657f1a61c38241c, 0xc69751c6b473c7b4); + AS_UL2(&sharedMemory[0x2C8]) = make_ulonglong2(0xdda17cdde8cb23e8, 0x1f3e211f74e89c74); + AS_UL2(&sharedMemory[0x2CC]) = make_ulonglong2(0xbd61dcbd4b96dd4b, 0x8a0f858a8b0d868b); + AS_UL2(&sharedMemory[0x2D0]) = make_ulonglong2(0x3e7c423e70e09070, 0x66ccaa66b571c4b5); + AS_UL2(&sharedMemory[0x2D4]) = make_ulonglong2(0x030605034890d848, 0x0e1c120ef6f701f6); + AS_UL2(&sharedMemory[0x2D8]) = make_ulonglong2(0x356a5f3561c2a361, 0xb969d0b957aef957); + AS_UL2(&sharedMemory[0x2DC]) = make_ulonglong2(0xc19958c186179186, 0x9e27b99e1d3a271d); + break; + case 7: + AS_UL2(&sharedMemory[0x2E0]) = make_ulonglong2(0xf8eb13f8e1d938e1, 0x11223311982bb398); + AS_UL2(&sharedMemory[0x2E4]) = make_ulonglong2(0xd9a970d969d2bb69, 0x9433a7948e07898e); + AS_UL2(&sharedMemory[0x2E8]) = make_ulonglong2(0x1e3c221e9b2db69b, 0xe9c920e987159287); + AS_UL2(&sharedMemory[0x2EC]) = make_ulonglong2(0x55aaff55ce8749ce, 0xdfa57adf28507828); + AS_UL2(&sharedMemory[0x2F0]) = make_ulonglong2(0xa159f8a18c038f8c, 0x0d1a170d89098089); + AS_UL2(&sharedMemory[0x2F4]) = make_ulonglong2(0xe6d731e6bf65dabf, 0x68d0b8684284c642); + AS_UL2(&sharedMemory[0x2F8]) = make_ulonglong2(0x9929b0994182c341, 0x0f1e110f2d5a772d); + AS_UL2(&sharedMemory[0x2FC]) = make_ulonglong2(0x54a8fc54b07bcbb0, 0x162c3a16bb6dd6bb); + break; + } + // AES 3 + switch (threadIdx.x) { + case 0: + AS_UL2(&sharedMemory[0x300]) = make_ulonglong2(0xf8847c7cc6a56363, 0xf68d7b7bee997777); + AS_UL2(&sharedMemory[0x304]) = make_ulonglong2(0xd6bd6b6bff0df2f2, 0x9154c5c5deb16f6f); + AS_UL2(&sharedMemory[0x308]) = make_ulonglong2(0x0203010160503030, 0x567d2b2bcea96767); + AS_UL2(&sharedMemory[0x30C]) = make_ulonglong2(0xb562d7d7e719fefe, 0xec9a76764de6abab); + AS_UL2(&sharedMemory[0x310]) = make_ulonglong2(0x1f9d82828f45caca, 0xfa877d7d8940c9c9); + AS_UL2(&sharedMemory[0x314]) = make_ulonglong2(0xb2eb5959ef15fafa, 0xfb0bf0f08ec94747); + AS_UL2(&sharedMemory[0x318]) = make_ulonglong2(0xb367d4d441ecadad, 0x45eaafaf5ffda2a2); + AS_UL2(&sharedMemory[0x31C]) = make_ulonglong2(0x53f7a4a423bf9c9c, 0x9b5bc0c0e4967272); + break; + case 1: + AS_UL2(&sharedMemory[0x320]) = make_ulonglong2(0xe11cfdfd75c2b7b7, 0x4c6a26263dae9393); + AS_UL2(&sharedMemory[0x324]) = make_ulonglong2(0x7e413f3f6c5a3636, 0x834fccccf502f7f7); + AS_UL2(&sharedMemory[0x328]) = make_ulonglong2(0x51f4a5a5685c3434, 0xf908f1f1d134e5e5); + AS_UL2(&sharedMemory[0x32C]) = make_ulonglong2(0xab73d8d8e2937171, 0x2a3f151562533131); + AS_UL2(&sharedMemory[0x330]) = make_ulonglong2(0x9552c7c7080c0404, 0x9d5ec3c346652323); + AS_UL2(&sharedMemory[0x334]) = make_ulonglong2(0x37a1969630281818, 0x2fb59a9a0a0f0505); + AS_UL2(&sharedMemory[0x338]) = make_ulonglong2(0x243612120e090707, 0xdf3de2e21b9b8080); + AS_UL2(&sharedMemory[0x33C]) = make_ulonglong2(0x4e692727cd26ebeb, 0xea9f75757fcdb2b2); + break; + case 2: + AS_UL2(&sharedMemory[0x340]) = make_ulonglong2(0x1d9e8383121b0909, 0x342e1a1a58742c2c); + AS_UL2(&sharedMemory[0x344]) = make_ulonglong2(0xdcb26e6e362d1b1b, 0x5bfba0a0b4ee5a5a); + AS_UL2(&sharedMemory[0x348]) = make_ulonglong2(0x764d3b3ba4f65252, 0x7dceb3b3b761d6d6); + AS_UL2(&sharedMemory[0x34C]) = make_ulonglong2(0xdd3ee3e3527b2929, 0x139784845e712f2f); + AS_UL2(&sharedMemory[0x350]) = make_ulonglong2(0xb968d1d1a6f55353, 0xc12ceded00000000); + AS_UL2(&sharedMemory[0x354]) = 
make_ulonglong2(0xe31ffcfc40602020, 0xb6ed5b5b79c8b1b1); + AS_UL2(&sharedMemory[0x358]) = make_ulonglong2(0x8d46cbcbd4be6a6a, 0x724b393967d9bebe); + AS_UL2(&sharedMemory[0x35C]) = make_ulonglong2(0x98d44c4c94de4a4a, 0x854acfcfb0e85858); + break; + case 3: + AS_UL2(&sharedMemory[0x360]) = make_ulonglong2(0xc52aefefbb6bd0d0, 0xed16fbfb4fe5aaaa); + AS_UL2(&sharedMemory[0x364]) = make_ulonglong2(0x9ad74d4d86c54343, 0x1194858566553333); + AS_UL2(&sharedMemory[0x368]) = make_ulonglong2(0xe910f9f98acf4545, 0xfe817f7f04060202); + AS_UL2(&sharedMemory[0x36C]) = make_ulonglong2(0x78443c3ca0f05050, 0x4be3a8a825ba9f9f); + AS_UL2(&sharedMemory[0x370]) = make_ulonglong2(0x5dfea3a3a2f35151, 0x058a8f8f80c04040); + AS_UL2(&sharedMemory[0x374]) = make_ulonglong2(0x21bc9d9d3fad9292, 0xf104f5f570483838); + AS_UL2(&sharedMemory[0x378]) = make_ulonglong2(0x77c1b6b663dfbcbc, 0x42632121af75dada); + AS_UL2(&sharedMemory[0x37C]) = make_ulonglong2(0xe51affff20301010, 0xbf6dd2d2fd0ef3f3); + break; + case 4: + AS_UL2(&sharedMemory[0x380]) = make_ulonglong2(0x18140c0c814ccdcd, 0xc32fecec26351313); + AS_UL2(&sharedMemory[0x384]) = make_ulonglong2(0x35a29797bee15f5f, 0x2e39171788cc4444); + AS_UL2(&sharedMemory[0x388]) = make_ulonglong2(0x55f2a7a79357c4c4, 0x7a473d3dfc827e7e); + AS_UL2(&sharedMemory[0x38C]) = make_ulonglong2(0xbae75d5dc8ac6464, 0xe6957373322b1919); + AS_UL2(&sharedMemory[0x390]) = make_ulonglong2(0x19988181c0a06060, 0xa37fdcdc9ed14f4f); + AS_UL2(&sharedMemory[0x394]) = make_ulonglong2(0x547e2a2a44662222, 0x0b8388883bab9090); + AS_UL2(&sharedMemory[0x398]) = make_ulonglong2(0xc729eeee8cca4646, 0x283c14146bd3b8b8); + AS_UL2(&sharedMemory[0x39C]) = make_ulonglong2(0xbce25e5ea779dede, 0xad76dbdb161d0b0b); + break; + case 5: + AS_UL2(&sharedMemory[0x3A0]) = make_ulonglong2(0x64563232db3be0e0, 0x141e0a0a744e3a3a); + AS_UL2(&sharedMemory[0x3A4]) = make_ulonglong2(0x0c0a060692db4949, 0xb8e45c5c486c2424); + AS_UL2(&sharedMemory[0x3A8]) = make_ulonglong2(0xbd6ed3d39f5dc2c2, 0xc4a6626243efacac); + AS_UL2(&sharedMemory[0x3AC]) = make_ulonglong2(0x31a4959539a89191, 0xf28b7979d337e4e4); + AS_UL2(&sharedMemory[0x3B0]) = make_ulonglong2(0x8b43c8c8d532e7e7, 0xdab76d6d6e593737); + AS_UL2(&sharedMemory[0x3B4]) = make_ulonglong2(0xb164d5d5018c8d8d, 0x49e0a9a99cd24e4e); + AS_UL2(&sharedMemory[0x3B8]) = make_ulonglong2(0xacfa5656d8b46c6c, 0xcf25eaeaf307f4f4); + AS_UL2(&sharedMemory[0x3BC]) = make_ulonglong2(0xf48e7a7acaaf6565, 0x1018080847e9aeae); + break; + case 6: + AS_UL2(&sharedMemory[0x3C0]) = make_ulonglong2(0xf08878786fd5baba, 0x5c722e2e4a6f2525); + AS_UL2(&sharedMemory[0x3C4]) = make_ulonglong2(0x57f1a6a638241c1c, 0x9751c6c673c7b4b4); + AS_UL2(&sharedMemory[0x3C8]) = make_ulonglong2(0xa17cddddcb23e8e8, 0x3e211f1fe89c7474); + AS_UL2(&sharedMemory[0x3CC]) = make_ulonglong2(0x61dcbdbd96dd4b4b, 0x0f858a8a0d868b8b); + AS_UL2(&sharedMemory[0x3D0]) = make_ulonglong2(0x7c423e3ee0907070, 0xccaa666671c4b5b5); + AS_UL2(&sharedMemory[0x3D4]) = make_ulonglong2(0x0605030390d84848, 0x1c120e0ef701f6f6); + AS_UL2(&sharedMemory[0x3D8]) = make_ulonglong2(0x6a5f3535c2a36161, 0x69d0b9b9aef95757); + AS_UL2(&sharedMemory[0x3DC]) = make_ulonglong2(0x9958c1c117918686, 0x27b99e9e3a271d1d); + break; + case 7: + AS_UL2(&sharedMemory[0x3E0]) = make_ulonglong2(0xeb13f8f8d938e1e1, 0x223311112bb39898); + AS_UL2(&sharedMemory[0x3E4]) = make_ulonglong2(0xa970d9d9d2bb6969, 0x33a7949407898e8e); + AS_UL2(&sharedMemory[0x3E8]) = make_ulonglong2(0x3c221e1e2db69b9b, 0xc920e9e915928787); + AS_UL2(&sharedMemory[0x3EC]) = make_ulonglong2(0xaaff55558749cece, 
0xa57adfdf50782828); + AS_UL2(&sharedMemory[0x3F0]) = make_ulonglong2(0x59f8a1a1038f8c8c, 0x1a170d0d09808989); + AS_UL2(&sharedMemory[0x3F4]) = make_ulonglong2(0xd731e6e665dabfbf, 0xd0b8686884c64242); + AS_UL2(&sharedMemory[0x3F8]) = make_ulonglong2(0x29b0999982c34141, 0x1e110f0f5a772d2d); + AS_UL2(&sharedMemory[0x3FC]) = make_ulonglong2(0xa8fc54547bcbb0b0, 0x2c3a16166dd6bbbb); + break; + } +} + +__device__ __forceinline__ +void cn_aes_gpu_init_u4(uint32_t* sharedMemory) +{ + // AES 0 + switch (threadIdx.x) { + case 0: + AS_UINT4(&sharedMemory[0x000]) = make_uint4(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); + AS_UINT4(&sharedMemory[0x004]) = make_uint4(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); + AS_UINT4(&sharedMemory[0x008]) = make_uint4(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56); + AS_UINT4(&sharedMemory[0x00C]) = make_uint4(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec); + AS_UINT4(&sharedMemory[0x010]) = make_uint4(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa); + AS_UINT4(&sharedMemory[0x014]) = make_uint4(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb); + AS_UINT4(&sharedMemory[0x018]) = make_uint4(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45); + AS_UINT4(&sharedMemory[0x01C]) = make_uint4(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b); + break; + case 1: + AS_UINT4(&sharedMemory[0x020]) = make_uint4(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c); + AS_UINT4(&sharedMemory[0x024]) = make_uint4(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83); + AS_UINT4(&sharedMemory[0x028]) = make_uint4(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9); + AS_UINT4(&sharedMemory[0x02C]) = make_uint4(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a); + AS_UINT4(&sharedMemory[0x030]) = make_uint4(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d); + AS_UINT4(&sharedMemory[0x034]) = make_uint4(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f); + AS_UINT4(&sharedMemory[0x038]) = make_uint4(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df); + AS_UINT4(&sharedMemory[0x03C]) = make_uint4(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea); + break; + case 2: + AS_UINT4(&sharedMemory[0x040]) = make_uint4(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34); + AS_UINT4(&sharedMemory[0x044]) = make_uint4(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b); + AS_UINT4(&sharedMemory[0x048]) = make_uint4(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d); + AS_UINT4(&sharedMemory[0x04C]) = make_uint4(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413); + AS_UINT4(&sharedMemory[0x050]) = make_uint4(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1); + AS_UINT4(&sharedMemory[0x054]) = make_uint4(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6); + AS_UINT4(&sharedMemory[0x058]) = make_uint4(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972); + AS_UINT4(&sharedMemory[0x05C]) = make_uint4(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85); + break; + case 3: + AS_UINT4(&sharedMemory[0x060]) = make_uint4(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed); + AS_UINT4(&sharedMemory[0x064]) = make_uint4(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511); + AS_UINT4(&sharedMemory[0x068]) = make_uint4(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe); + AS_UINT4(&sharedMemory[0x06C]) = make_uint4(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b); + AS_UINT4(&sharedMemory[0x070]) = make_uint4(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05); + AS_UINT4(&sharedMemory[0x074]) = make_uint4(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1); + AS_UINT4(&sharedMemory[0x078]) = make_uint4(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142); + 
AS_UINT4(&sharedMemory[0x07C]) = make_uint4(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf); + break; + case 4: + AS_UINT4(&sharedMemory[0x080]) = make_uint4(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3); + AS_UINT4(&sharedMemory[0x084]) = make_uint4(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e); + AS_UINT4(&sharedMemory[0x088]) = make_uint4(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a); + AS_UINT4(&sharedMemory[0x08C]) = make_uint4(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6); + AS_UINT4(&sharedMemory[0x090]) = make_uint4(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3); + AS_UINT4(&sharedMemory[0x094]) = make_uint4(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b); + AS_UINT4(&sharedMemory[0x098]) = make_uint4(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428); + AS_UINT4(&sharedMemory[0x09C]) = make_uint4(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad); + break; + case 5: + AS_UINT4(&sharedMemory[0x0A0]) = make_uint4(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14); + AS_UINT4(&sharedMemory[0x0A4]) = make_uint4(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8); + AS_UINT4(&sharedMemory[0x0A8]) = make_uint4(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4); + AS_UINT4(&sharedMemory[0x0AC]) = make_uint4(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2); + AS_UINT4(&sharedMemory[0x0B0]) = make_uint4(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda); + AS_UINT4(&sharedMemory[0x0B4]) = make_uint4(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949); + AS_UINT4(&sharedMemory[0x0B8]) = make_uint4(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf); + AS_UINT4(&sharedMemory[0x0BC]) = make_uint4(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810); + break; + case 6: + AS_UINT4(&sharedMemory[0x0C0]) = make_uint4(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c); + AS_UINT4(&sharedMemory[0x0C4]) = make_uint4(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697); + AS_UINT4(&sharedMemory[0x0C8]) = make_uint4(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e); + AS_UINT4(&sharedMemory[0x0CC]) = make_uint4(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f); + AS_UINT4(&sharedMemory[0x0D0]) = make_uint4(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc); + AS_UINT4(&sharedMemory[0x0D4]) = make_uint4(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c); + AS_UINT4(&sharedMemory[0x0D8]) = make_uint4(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969); + AS_UINT4(&sharedMemory[0x0DC]) = make_uint4(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27); + break; + case 7: + AS_UINT4(&sharedMemory[0x0E0]) = make_uint4(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122); + AS_UINT4(&sharedMemory[0x0E4]) = make_uint4(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433); + AS_UINT4(&sharedMemory[0x0E8]) = make_uint4(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9); + AS_UINT4(&sharedMemory[0x0EC]) = make_uint4(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5); + AS_UINT4(&sharedMemory[0x0F0]) = make_uint4(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a); + AS_UINT4(&sharedMemory[0x0F4]) = make_uint4(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); + AS_UINT4(&sharedMemory[0x0F8]) = make_uint4(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); + AS_UINT4(&sharedMemory[0x0FC]) = make_uint4(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); + break; + } + // AES 1 + switch (threadIdx.x) { + case 0: + AS_UINT4(&sharedMemory[0x100]) = make_uint4(0x6363c6a5, 0x7c7cf884, 0x7777ee99, 0x7b7bf68d); + AS_UINT4(&sharedMemory[0x104]) = make_uint4(0xf2f2ff0d, 0x6b6bd6bd, 0x6f6fdeb1, 0xc5c59154); + AS_UINT4(&sharedMemory[0x108]) = make_uint4(0x30306050, 0x01010203, 0x6767cea9, 0x2b2b567d); + 
AS_UINT4(&sharedMemory[0x10C]) = make_uint4(0xfefee719, 0xd7d7b562, 0xabab4de6, 0x7676ec9a); + AS_UINT4(&sharedMemory[0x110]) = make_uint4(0xcaca8f45, 0x82821f9d, 0xc9c98940, 0x7d7dfa87); + AS_UINT4(&sharedMemory[0x114]) = make_uint4(0xfafaef15, 0x5959b2eb, 0x47478ec9, 0xf0f0fb0b); + AS_UINT4(&sharedMemory[0x118]) = make_uint4(0xadad41ec, 0xd4d4b367, 0xa2a25ffd, 0xafaf45ea); + AS_UINT4(&sharedMemory[0x11C]) = make_uint4(0x9c9c23bf, 0xa4a453f7, 0x7272e496, 0xc0c09b5b); + break; + case 1: + AS_UINT4(&sharedMemory[0x120]) = make_uint4(0xb7b775c2, 0xfdfde11c, 0x93933dae, 0x26264c6a); + AS_UINT4(&sharedMemory[0x124]) = make_uint4(0x36366c5a, 0x3f3f7e41, 0xf7f7f502, 0xcccc834f); + AS_UINT4(&sharedMemory[0x128]) = make_uint4(0x3434685c, 0xa5a551f4, 0xe5e5d134, 0xf1f1f908); + AS_UINT4(&sharedMemory[0x12C]) = make_uint4(0x7171e293, 0xd8d8ab73, 0x31316253, 0x15152a3f); + AS_UINT4(&sharedMemory[0x130]) = make_uint4(0x0404080c, 0xc7c79552, 0x23234665, 0xc3c39d5e); + AS_UINT4(&sharedMemory[0x134]) = make_uint4(0x18183028, 0x969637a1, 0x05050a0f, 0x9a9a2fb5); + AS_UINT4(&sharedMemory[0x138]) = make_uint4(0x07070e09, 0x12122436, 0x80801b9b, 0xe2e2df3d); + AS_UINT4(&sharedMemory[0x13C]) = make_uint4(0xebebcd26, 0x27274e69, 0xb2b27fcd, 0x7575ea9f); + break; + case 2: + AS_UINT4(&sharedMemory[0x140]) = make_uint4(0x0909121b, 0x83831d9e, 0x2c2c5874, 0x1a1a342e); + AS_UINT4(&sharedMemory[0x144]) = make_uint4(0x1b1b362d, 0x6e6edcb2, 0x5a5ab4ee, 0xa0a05bfb); + AS_UINT4(&sharedMemory[0x148]) = make_uint4(0x5252a4f6, 0x3b3b764d, 0xd6d6b761, 0xb3b37dce); + AS_UINT4(&sharedMemory[0x14C]) = make_uint4(0x2929527b, 0xe3e3dd3e, 0x2f2f5e71, 0x84841397); + AS_UINT4(&sharedMemory[0x150]) = make_uint4(0x5353a6f5, 0xd1d1b968, 0x00000000, 0xededc12c); + AS_UINT4(&sharedMemory[0x154]) = make_uint4(0x20204060, 0xfcfce31f, 0xb1b179c8, 0x5b5bb6ed); + AS_UINT4(&sharedMemory[0x158]) = make_uint4(0x6a6ad4be, 0xcbcb8d46, 0xbebe67d9, 0x3939724b); + AS_UINT4(&sharedMemory[0x15C]) = make_uint4(0x4a4a94de, 0x4c4c98d4, 0x5858b0e8, 0xcfcf854a); + break; + case 3: + AS_UINT4(&sharedMemory[0x160]) = make_uint4(0xd0d0bb6b, 0xefefc52a, 0xaaaa4fe5, 0xfbfbed16); + AS_UINT4(&sharedMemory[0x164]) = make_uint4(0x434386c5, 0x4d4d9ad7, 0x33336655, 0x85851194); + AS_UINT4(&sharedMemory[0x168]) = make_uint4(0x45458acf, 0xf9f9e910, 0x02020406, 0x7f7ffe81); + AS_UINT4(&sharedMemory[0x16C]) = make_uint4(0x5050a0f0, 0x3c3c7844, 0x9f9f25ba, 0xa8a84be3); + AS_UINT4(&sharedMemory[0x170]) = make_uint4(0x5151a2f3, 0xa3a35dfe, 0x404080c0, 0x8f8f058a); + AS_UINT4(&sharedMemory[0x174]) = make_uint4(0x92923fad, 0x9d9d21bc, 0x38387048, 0xf5f5f104); + AS_UINT4(&sharedMemory[0x178]) = make_uint4(0xbcbc63df, 0xb6b677c1, 0xdadaaf75, 0x21214263); + AS_UINT4(&sharedMemory[0x17C]) = make_uint4(0x10102030, 0xffffe51a, 0xf3f3fd0e, 0xd2d2bf6d); + break; + case 4: + AS_UINT4(&sharedMemory[0x180]) = make_uint4(0xcdcd814c, 0x0c0c1814, 0x13132635, 0xececc32f); + AS_UINT4(&sharedMemory[0x184]) = make_uint4(0x5f5fbee1, 0x979735a2, 0x444488cc, 0x17172e39); + AS_UINT4(&sharedMemory[0x188]) = make_uint4(0xc4c49357, 0xa7a755f2, 0x7e7efc82, 0x3d3d7a47); + AS_UINT4(&sharedMemory[0x18C]) = make_uint4(0x6464c8ac, 0x5d5dbae7, 0x1919322b, 0x7373e695); + AS_UINT4(&sharedMemory[0x190]) = make_uint4(0x6060c0a0, 0x81811998, 0x4f4f9ed1, 0xdcdca37f); + AS_UINT4(&sharedMemory[0x194]) = make_uint4(0x22224466, 0x2a2a547e, 0x90903bab, 0x88880b83); + AS_UINT4(&sharedMemory[0x198]) = make_uint4(0x46468cca, 0xeeeec729, 0xb8b86bd3, 0x1414283c); + AS_UINT4(&sharedMemory[0x19C]) = make_uint4(0xdedea779, 
0x5e5ebce2, 0x0b0b161d, 0xdbdbad76); + break; + case 5: + AS_UINT4(&sharedMemory[0x1A0]) = make_uint4(0xe0e0db3b, 0x32326456, 0x3a3a744e, 0x0a0a141e); + AS_UINT4(&sharedMemory[0x1A4]) = make_uint4(0x494992db, 0x06060c0a, 0x2424486c, 0x5c5cb8e4); + AS_UINT4(&sharedMemory[0x1A8]) = make_uint4(0xc2c29f5d, 0xd3d3bd6e, 0xacac43ef, 0x6262c4a6); + AS_UINT4(&sharedMemory[0x1AC]) = make_uint4(0x919139a8, 0x959531a4, 0xe4e4d337, 0x7979f28b); + AS_UINT4(&sharedMemory[0x1B0]) = make_uint4(0xe7e7d532, 0xc8c88b43, 0x37376e59, 0x6d6ddab7); + AS_UINT4(&sharedMemory[0x1B4]) = make_uint4(0x8d8d018c, 0xd5d5b164, 0x4e4e9cd2, 0xa9a949e0); + AS_UINT4(&sharedMemory[0x1B8]) = make_uint4(0x6c6cd8b4, 0x5656acfa, 0xf4f4f307, 0xeaeacf25); + AS_UINT4(&sharedMemory[0x1BC]) = make_uint4(0x6565caaf, 0x7a7af48e, 0xaeae47e9, 0x08081018); + break; + case 6: + AS_UINT4(&sharedMemory[0x1C0]) = make_uint4(0xbaba6fd5, 0x7878f088, 0x25254a6f, 0x2e2e5c72); + AS_UINT4(&sharedMemory[0x1C4]) = make_uint4(0x1c1c3824, 0xa6a657f1, 0xb4b473c7, 0xc6c69751); + AS_UINT4(&sharedMemory[0x1C8]) = make_uint4(0xe8e8cb23, 0xdddda17c, 0x7474e89c, 0x1f1f3e21); + AS_UINT4(&sharedMemory[0x1CC]) = make_uint4(0x4b4b96dd, 0xbdbd61dc, 0x8b8b0d86, 0x8a8a0f85); + AS_UINT4(&sharedMemory[0x1D0]) = make_uint4(0x7070e090, 0x3e3e7c42, 0xb5b571c4, 0x6666ccaa); + AS_UINT4(&sharedMemory[0x1D4]) = make_uint4(0x484890d8, 0x03030605, 0xf6f6f701, 0x0e0e1c12); + AS_UINT4(&sharedMemory[0x1D8]) = make_uint4(0x6161c2a3, 0x35356a5f, 0x5757aef9, 0xb9b969d0); + AS_UINT4(&sharedMemory[0x1DC]) = make_uint4(0x86861791, 0xc1c19958, 0x1d1d3a27, 0x9e9e27b9); + break; + case 7: + AS_UINT4(&sharedMemory[0x1E0]) = make_uint4(0xe1e1d938, 0xf8f8eb13, 0x98982bb3, 0x11112233); + AS_UINT4(&sharedMemory[0x1E4]) = make_uint4(0x6969d2bb, 0xd9d9a970, 0x8e8e0789, 0x949433a7); + AS_UINT4(&sharedMemory[0x1E8]) = make_uint4(0x9b9b2db6, 0x1e1e3c22, 0x87871592, 0xe9e9c920); + AS_UINT4(&sharedMemory[0x1EC]) = make_uint4(0xcece8749, 0x5555aaff, 0x28285078, 0xdfdfa57a); + AS_UINT4(&sharedMemory[0x1F0]) = make_uint4(0x8c8c038f, 0xa1a159f8, 0x89890980, 0x0d0d1a17); + AS_UINT4(&sharedMemory[0x1F4]) = make_uint4(0xbfbf65da, 0xe6e6d731, 0x424284c6, 0x6868d0b8); + AS_UINT4(&sharedMemory[0x1F8]) = make_uint4(0x414182c3, 0x999929b0, 0x2d2d5a77, 0x0f0f1e11); + AS_UINT4(&sharedMemory[0x1FC]) = make_uint4(0xb0b07bcb, 0x5454a8fc, 0xbbbb6dd6, 0x16162c3a); + break; + } + // AES 2 + switch (threadIdx.x) { + case 0: + AS_UINT4(&sharedMemory[0x200]) = make_uint4(0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b); + AS_UINT4(&sharedMemory[0x204]) = make_uint4(0xf2ff0df2, 0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5); + AS_UINT4(&sharedMemory[0x208]) = make_uint4(0x30605030, 0x01020301, 0x67cea967, 0x2b567d2b); + AS_UINT4(&sharedMemory[0x20C]) = make_uint4(0xfee719fe, 0xd7b562d7, 0xab4de6ab, 0x76ec9a76); + AS_UINT4(&sharedMemory[0x210]) = make_uint4(0xca8f45ca, 0x821f9d82, 0xc98940c9, 0x7dfa877d); + AS_UINT4(&sharedMemory[0x214]) = make_uint4(0xfaef15fa, 0x59b2eb59, 0x478ec947, 0xf0fb0bf0); + AS_UINT4(&sharedMemory[0x218]) = make_uint4(0xad41ecad, 0xd4b367d4, 0xa25ffda2, 0xaf45eaaf); + AS_UINT4(&sharedMemory[0x21C]) = make_uint4(0x9c23bf9c, 0xa453f7a4, 0x72e49672, 0xc09b5bc0); + break; + case 1: + AS_UINT4(&sharedMemory[0x220]) = make_uint4(0xb775c2b7, 0xfde11cfd, 0x933dae93, 0x264c6a26); + AS_UINT4(&sharedMemory[0x224]) = make_uint4(0x366c5a36, 0x3f7e413f, 0xf7f502f7, 0xcc834fcc); + AS_UINT4(&sharedMemory[0x228]) = make_uint4(0x34685c34, 0xa551f4a5, 0xe5d134e5, 0xf1f908f1); + AS_UINT4(&sharedMemory[0x22C]) = make_uint4(0x71e29371, 
0xd8ab73d8, 0x31625331, 0x152a3f15); + AS_UINT4(&sharedMemory[0x230]) = make_uint4(0x04080c04, 0xc79552c7, 0x23466523, 0xc39d5ec3); + AS_UINT4(&sharedMemory[0x234]) = make_uint4(0x18302818, 0x9637a196, 0x050a0f05, 0x9a2fb59a); + AS_UINT4(&sharedMemory[0x238]) = make_uint4(0x070e0907, 0x12243612, 0x801b9b80, 0xe2df3de2); + AS_UINT4(&sharedMemory[0x23C]) = make_uint4(0xebcd26eb, 0x274e6927, 0xb27fcdb2, 0x75ea9f75); + break; + case 2: + AS_UINT4(&sharedMemory[0x240]) = make_uint4(0x09121b09, 0x831d9e83, 0x2c58742c, 0x1a342e1a); + AS_UINT4(&sharedMemory[0x244]) = make_uint4(0x1b362d1b, 0x6edcb26e, 0x5ab4ee5a, 0xa05bfba0); + AS_UINT4(&sharedMemory[0x248]) = make_uint4(0x52a4f652, 0x3b764d3b, 0xd6b761d6, 0xb37dceb3); + AS_UINT4(&sharedMemory[0x24C]) = make_uint4(0x29527b29, 0xe3dd3ee3, 0x2f5e712f, 0x84139784); + AS_UINT4(&sharedMemory[0x250]) = make_uint4(0x53a6f553, 0xd1b968d1, 0x00000000, 0xedc12ced); + AS_UINT4(&sharedMemory[0x254]) = make_uint4(0x20406020, 0xfce31ffc, 0xb179c8b1, 0x5bb6ed5b); + AS_UINT4(&sharedMemory[0x258]) = make_uint4(0x6ad4be6a, 0xcb8d46cb, 0xbe67d9be, 0x39724b39); + AS_UINT4(&sharedMemory[0x25C]) = make_uint4(0x4a94de4a, 0x4c98d44c, 0x58b0e858, 0xcf854acf); + break; + case 3: + AS_UINT4(&sharedMemory[0x260]) = make_uint4(0xd0bb6bd0, 0xefc52aef, 0xaa4fe5aa, 0xfbed16fb); + AS_UINT4(&sharedMemory[0x264]) = make_uint4(0x4386c543, 0x4d9ad74d, 0x33665533, 0x85119485); + AS_UINT4(&sharedMemory[0x268]) = make_uint4(0x458acf45, 0xf9e910f9, 0x02040602, 0x7ffe817f); + AS_UINT4(&sharedMemory[0x26C]) = make_uint4(0x50a0f050, 0x3c78443c, 0x9f25ba9f, 0xa84be3a8); + AS_UINT4(&sharedMemory[0x270]) = make_uint4(0x51a2f351, 0xa35dfea3, 0x4080c040, 0x8f058a8f); + AS_UINT4(&sharedMemory[0x274]) = make_uint4(0x923fad92, 0x9d21bc9d, 0x38704838, 0xf5f104f5); + AS_UINT4(&sharedMemory[0x278]) = make_uint4(0xbc63dfbc, 0xb677c1b6, 0xdaaf75da, 0x21426321); + AS_UINT4(&sharedMemory[0x27C]) = make_uint4(0x10203010, 0xffe51aff, 0xf3fd0ef3, 0xd2bf6dd2); + break; + case 4: + AS_UINT4(&sharedMemory[0x280]) = make_uint4(0xcd814ccd, 0x0c18140c, 0x13263513, 0xecc32fec); + AS_UINT4(&sharedMemory[0x284]) = make_uint4(0x5fbee15f, 0x9735a297, 0x4488cc44, 0x172e3917); + AS_UINT4(&sharedMemory[0x288]) = make_uint4(0xc49357c4, 0xa755f2a7, 0x7efc827e, 0x3d7a473d); + AS_UINT4(&sharedMemory[0x28C]) = make_uint4(0x64c8ac64, 0x5dbae75d, 0x19322b19, 0x73e69573); + AS_UINT4(&sharedMemory[0x290]) = make_uint4(0x60c0a060, 0x81199881, 0x4f9ed14f, 0xdca37fdc); + AS_UINT4(&sharedMemory[0x294]) = make_uint4(0x22446622, 0x2a547e2a, 0x903bab90, 0x880b8388); + AS_UINT4(&sharedMemory[0x298]) = make_uint4(0x468cca46, 0xeec729ee, 0xb86bd3b8, 0x14283c14); + AS_UINT4(&sharedMemory[0x29C]) = make_uint4(0xdea779de, 0x5ebce25e, 0x0b161d0b, 0xdbad76db); + break; + case 5: + AS_UINT4(&sharedMemory[0x2A0]) = make_uint4(0xe0db3be0, 0x32645632, 0x3a744e3a, 0x0a141e0a); + AS_UINT4(&sharedMemory[0x2A4]) = make_uint4(0x4992db49, 0x060c0a06, 0x24486c24, 0x5cb8e45c); + AS_UINT4(&sharedMemory[0x2A8]) = make_uint4(0xc29f5dc2, 0xd3bd6ed3, 0xac43efac, 0x62c4a662); + AS_UINT4(&sharedMemory[0x2AC]) = make_uint4(0x9139a891, 0x9531a495, 0xe4d337e4, 0x79f28b79); + AS_UINT4(&sharedMemory[0x2B0]) = make_uint4(0xe7d532e7, 0xc88b43c8, 0x376e5937, 0x6ddab76d); + AS_UINT4(&sharedMemory[0x2B4]) = make_uint4(0x8d018c8d, 0xd5b164d5, 0x4e9cd24e, 0xa949e0a9); + AS_UINT4(&sharedMemory[0x2B8]) = make_uint4(0x6cd8b46c, 0x56acfa56, 0xf4f307f4, 0xeacf25ea); + AS_UINT4(&sharedMemory[0x2BC]) = make_uint4(0x65caaf65, 0x7af48e7a, 0xae47e9ae, 0x08101808); + break; + case 6: + 
AS_UINT4(&sharedMemory[0x2C0]) = make_uint4(0xba6fd5ba, 0x78f08878, 0x254a6f25, 0x2e5c722e); + AS_UINT4(&sharedMemory[0x2C4]) = make_uint4(0x1c38241c, 0xa657f1a6, 0xb473c7b4, 0xc69751c6); + AS_UINT4(&sharedMemory[0x2C8]) = make_uint4(0xe8cb23e8, 0xdda17cdd, 0x74e89c74, 0x1f3e211f); + AS_UINT4(&sharedMemory[0x2CC]) = make_uint4(0x4b96dd4b, 0xbd61dcbd, 0x8b0d868b, 0x8a0f858a); + AS_UINT4(&sharedMemory[0x2D0]) = make_uint4(0x70e09070, 0x3e7c423e, 0xb571c4b5, 0x66ccaa66); + AS_UINT4(&sharedMemory[0x2D4]) = make_uint4(0x4890d848, 0x03060503, 0xf6f701f6, 0x0e1c120e); + AS_UINT4(&sharedMemory[0x2D8]) = make_uint4(0x61c2a361, 0x356a5f35, 0x57aef957, 0xb969d0b9); + AS_UINT4(&sharedMemory[0x2DC]) = make_uint4(0x86179186, 0xc19958c1, 0x1d3a271d, 0x9e27b99e); + break; + case 7: + AS_UINT4(&sharedMemory[0x2E0]) = make_uint4(0xe1d938e1, 0xf8eb13f8, 0x982bb398, 0x11223311); + AS_UINT4(&sharedMemory[0x2E4]) = make_uint4(0x69d2bb69, 0xd9a970d9, 0x8e07898e, 0x9433a794); + AS_UINT4(&sharedMemory[0x2E8]) = make_uint4(0x9b2db69b, 0x1e3c221e, 0x87159287, 0xe9c920e9); + AS_UINT4(&sharedMemory[0x2EC]) = make_uint4(0xce8749ce, 0x55aaff55, 0x28507828, 0xdfa57adf); + AS_UINT4(&sharedMemory[0x2F0]) = make_uint4(0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d); + AS_UINT4(&sharedMemory[0x2F4]) = make_uint4(0xbf65dabf, 0xe6d731e6, 0x4284c642, 0x68d0b868); + AS_UINT4(&sharedMemory[0x2F8]) = make_uint4(0x4182c341, 0x9929b099, 0x2d5a772d, 0x0f1e110f); + AS_UINT4(&sharedMemory[0x2FC]) = make_uint4(0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb, 0x162c3a16); + break; + } + // AES 3 + switch (threadIdx.x) { + case 0: + AS_UINT4(&sharedMemory[0x300]) = make_uint4(0xc6a56363, 0xf8847c7c, 0xee997777, 0xf68d7b7b); + AS_UINT4(&sharedMemory[0x304]) = make_uint4(0xff0df2f2, 0xd6bd6b6b, 0xdeb16f6f, 0x9154c5c5); + AS_UINT4(&sharedMemory[0x308]) = make_uint4(0x60503030, 0x02030101, 0xcea96767, 0x567d2b2b); + AS_UINT4(&sharedMemory[0x30C]) = make_uint4(0xe719fefe, 0xb562d7d7, 0x4de6abab, 0xec9a7676); + AS_UINT4(&sharedMemory[0x310]) = make_uint4(0x8f45caca, 0x1f9d8282, 0x8940c9c9, 0xfa877d7d); + AS_UINT4(&sharedMemory[0x314]) = make_uint4(0xef15fafa, 0xb2eb5959, 0x8ec94747, 0xfb0bf0f0); + AS_UINT4(&sharedMemory[0x318]) = make_uint4(0x41ecadad, 0xb367d4d4, 0x5ffda2a2, 0x45eaafaf); + AS_UINT4(&sharedMemory[0x31C]) = make_uint4(0x23bf9c9c, 0x53f7a4a4, 0xe4967272, 0x9b5bc0c0); + break; + case 1: + AS_UINT4(&sharedMemory[0x320]) = make_uint4(0x75c2b7b7, 0xe11cfdfd, 0x3dae9393, 0x4c6a2626); + AS_UINT4(&sharedMemory[0x324]) = make_uint4(0x6c5a3636, 0x7e413f3f, 0xf502f7f7, 0x834fcccc); + AS_UINT4(&sharedMemory[0x328]) = make_uint4(0x685c3434, 0x51f4a5a5, 0xd134e5e5, 0xf908f1f1); + AS_UINT4(&sharedMemory[0x32C]) = make_uint4(0xe2937171, 0xab73d8d8, 0x62533131, 0x2a3f1515); + AS_UINT4(&sharedMemory[0x330]) = make_uint4(0x080c0404, 0x9552c7c7, 0x46652323, 0x9d5ec3c3); + AS_UINT4(&sharedMemory[0x334]) = make_uint4(0x30281818, 0x37a19696, 0x0a0f0505, 0x2fb59a9a); + AS_UINT4(&sharedMemory[0x338]) = make_uint4(0x0e090707, 0x24361212, 0x1b9b8080, 0xdf3de2e2); + AS_UINT4(&sharedMemory[0x33C]) = make_uint4(0xcd26ebeb, 0x4e692727, 0x7fcdb2b2, 0xea9f7575); + break; + case 2: + AS_UINT4(&sharedMemory[0x340]) = make_uint4(0x121b0909, 0x1d9e8383, 0x58742c2c, 0x342e1a1a); + AS_UINT4(&sharedMemory[0x344]) = make_uint4(0x362d1b1b, 0xdcb26e6e, 0xb4ee5a5a, 0x5bfba0a0); + AS_UINT4(&sharedMemory[0x348]) = make_uint4(0xa4f65252, 0x764d3b3b, 0xb761d6d6, 0x7dceb3b3); + AS_UINT4(&sharedMemory[0x34C]) = make_uint4(0x527b2929, 0xdd3ee3e3, 0x5e712f2f, 0x13978484); + 
AS_UINT4(&sharedMemory[0x350]) = make_uint4(0xa6f55353, 0xb968d1d1, 0x00000000, 0xc12ceded); + AS_UINT4(&sharedMemory[0x354]) = make_uint4(0x40602020, 0xe31ffcfc, 0x79c8b1b1, 0xb6ed5b5b); + AS_UINT4(&sharedMemory[0x358]) = make_uint4(0xd4be6a6a, 0x8d46cbcb, 0x67d9bebe, 0x724b3939); + AS_UINT4(&sharedMemory[0x35C]) = make_uint4(0x94de4a4a, 0x98d44c4c, 0xb0e85858, 0x854acfcf); + break; + case 3: + AS_UINT4(&sharedMemory[0x360]) = make_uint4(0xbb6bd0d0, 0xc52aefef, 0x4fe5aaaa, 0xed16fbfb); + AS_UINT4(&sharedMemory[0x364]) = make_uint4(0x86c54343, 0x9ad74d4d, 0x66553333, 0x11948585); + AS_UINT4(&sharedMemory[0x368]) = make_uint4(0x8acf4545, 0xe910f9f9, 0x04060202, 0xfe817f7f); + AS_UINT4(&sharedMemory[0x36C]) = make_uint4(0xa0f05050, 0x78443c3c, 0x25ba9f9f, 0x4be3a8a8); + AS_UINT4(&sharedMemory[0x370]) = make_uint4(0xa2f35151, 0x5dfea3a3, 0x80c04040, 0x058a8f8f); + AS_UINT4(&sharedMemory[0x374]) = make_uint4(0x3fad9292, 0x21bc9d9d, 0x70483838, 0xf104f5f5); + AS_UINT4(&sharedMemory[0x378]) = make_uint4(0x63dfbcbc, 0x77c1b6b6, 0xaf75dada, 0x42632121); + AS_UINT4(&sharedMemory[0x37C]) = make_uint4(0x20301010, 0xe51affff, 0xfd0ef3f3, 0xbf6dd2d2); + break; + case 4: + AS_UINT4(&sharedMemory[0x380]) = make_uint4(0x814ccdcd, 0x18140c0c, 0x26351313, 0xc32fecec); + AS_UINT4(&sharedMemory[0x384]) = make_uint4(0xbee15f5f, 0x35a29797, 0x88cc4444, 0x2e391717); + AS_UINT4(&sharedMemory[0x388]) = make_uint4(0x9357c4c4, 0x55f2a7a7, 0xfc827e7e, 0x7a473d3d); + AS_UINT4(&sharedMemory[0x38C]) = make_uint4(0xc8ac6464, 0xbae75d5d, 0x322b1919, 0xe6957373); + AS_UINT4(&sharedMemory[0x390]) = make_uint4(0xc0a06060, 0x19988181, 0x9ed14f4f, 0xa37fdcdc); + AS_UINT4(&sharedMemory[0x394]) = make_uint4(0x44662222, 0x547e2a2a, 0x3bab9090, 0x0b838888); + AS_UINT4(&sharedMemory[0x398]) = make_uint4(0x8cca4646, 0xc729eeee, 0x6bd3b8b8, 0x283c1414); + AS_UINT4(&sharedMemory[0x39C]) = make_uint4(0xa779dede, 0xbce25e5e, 0x161d0b0b, 0xad76dbdb); + break; + case 5: + AS_UINT4(&sharedMemory[0x3A0]) = make_uint4(0xdb3be0e0, 0x64563232, 0x744e3a3a, 0x141e0a0a); + AS_UINT4(&sharedMemory[0x3A4]) = make_uint4(0x92db4949, 0x0c0a0606, 0x486c2424, 0xb8e45c5c); + AS_UINT4(&sharedMemory[0x3A8]) = make_uint4(0x9f5dc2c2, 0xbd6ed3d3, 0x43efacac, 0xc4a66262); + AS_UINT4(&sharedMemory[0x3AC]) = make_uint4(0x39a89191, 0x31a49595, 0xd337e4e4, 0xf28b7979); + AS_UINT4(&sharedMemory[0x3B0]) = make_uint4(0xd532e7e7, 0x8b43c8c8, 0x6e593737, 0xdab76d6d); + AS_UINT4(&sharedMemory[0x3B4]) = make_uint4(0x018c8d8d, 0xb164d5d5, 0x9cd24e4e, 0x49e0a9a9); + AS_UINT4(&sharedMemory[0x3B8]) = make_uint4(0xd8b46c6c, 0xacfa5656, 0xf307f4f4, 0xcf25eaea); + AS_UINT4(&sharedMemory[0x3BC]) = make_uint4(0xcaaf6565, 0xf48e7a7a, 0x47e9aeae, 0x10180808); + break; + case 6: + AS_UINT4(&sharedMemory[0x3C0]) = make_uint4(0x6fd5baba, 0xf0887878, 0x4a6f2525, 0x5c722e2e); + AS_UINT4(&sharedMemory[0x3C4]) = make_uint4(0x38241c1c, 0x57f1a6a6, 0x73c7b4b4, 0x9751c6c6); + AS_UINT4(&sharedMemory[0x3C8]) = make_uint4(0xcb23e8e8, 0xa17cdddd, 0xe89c7474, 0x3e211f1f); + AS_UINT4(&sharedMemory[0x3CC]) = make_uint4(0x96dd4b4b, 0x61dcbdbd, 0x0d868b8b, 0x0f858a8a); + AS_UINT4(&sharedMemory[0x3D0]) = make_uint4(0xe0907070, 0x7c423e3e, 0x71c4b5b5, 0xccaa6666); + AS_UINT4(&sharedMemory[0x3D4]) = make_uint4(0x90d84848, 0x06050303, 0xf701f6f6, 0x1c120e0e); + AS_UINT4(&sharedMemory[0x3D8]) = make_uint4(0xc2a36161, 0x6a5f3535, 0xaef95757, 0x69d0b9b9); + AS_UINT4(&sharedMemory[0x3DC]) = make_uint4(0x17918686, 0x9958c1c1, 0x3a271d1d, 0x27b99e9e); + break; + case 7: + AS_UINT4(&sharedMemory[0x3E0]) = 
make_uint4(0xd938e1e1, 0xeb13f8f8, 0x2bb39898, 0x22331111); + AS_UINT4(&sharedMemory[0x3E4]) = make_uint4(0xd2bb6969, 0xa970d9d9, 0x07898e8e, 0x33a79494); + AS_UINT4(&sharedMemory[0x3E8]) = make_uint4(0x2db69b9b, 0x3c221e1e, 0x15928787, 0xc920e9e9); + AS_UINT4(&sharedMemory[0x3EC]) = make_uint4(0x8749cece, 0xaaff5555, 0x50782828, 0xa57adfdf); + AS_UINT4(&sharedMemory[0x3F0]) = make_uint4(0x038f8c8c, 0x59f8a1a1, 0x09808989, 0x1a170d0d); + AS_UINT4(&sharedMemory[0x3F4]) = make_uint4(0x65dabfbf, 0xd731e6e6, 0x84c64242, 0xd0b86868); + AS_UINT4(&sharedMemory[0x3F8]) = make_uint4(0x82c34141, 0x29b09999, 0x5a772d2d, 0x1e110f0f); + AS_UINT4(&sharedMemory[0x3FC]) = make_uint4(0x7bcbb0b0, 0xa8fc5454, 0x6dd6bbbb, 0x2c3a1616); + break; + } +} diff --git a/crypto/cn_blake.cuh b/crypto/cn_blake.cuh new file mode 100644 index 0000000000..bd2ba430ef --- /dev/null +++ b/crypto/cn_blake.cuh @@ -0,0 +1,180 @@ +//#include + +typedef struct { + uint32_t h[8], s[4], t[2]; + int buflen, nullt; + uint8_t buf[64]; +} blake_state; + +#define U8TO32(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + +#define U32TO8(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define BLAKE_ROT(x,n) ROTR32(x, n) +#define BLAKE_G(a,b,c,d,e) \ + v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = BLAKE_ROT(v[b] ^ v[c],12); \ + v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]]) + v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = BLAKE_ROT(v[b] ^ v[c], 7); + +__constant__ uint8_t d_blake_sigma[14][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8} +}; + +__constant__ uint32_t d_blake_cst[16] = { + 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 +}; + +__device__ +void cn_blake_compress(blake_state * __restrict__ S, const uint8_t * __restrict__ block) +{ + uint32_t v[16], m[16], i; + + for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); + for (i = 0; i < 8; ++i) v[i] = S->h[i]; + v[ 8] = S->s[0] ^ 0x243F6A88; + v[ 9] = S->s[1] ^ 0x85A308D3; + v[10] = S->s[2] ^ 0x13198A2E; + v[11] = S->s[3] ^ 0x03707344; + v[12] = 0xA4093822; + v[13] = 0x299F31D0; + v[14] = 0x082EFA98; + v[15] = 0xEC4E6C89; + + if (S->nullt == 0) { + v[12] ^= S->t[0]; + v[13] ^= S->t[0]; + v[14] ^= S->t[1]; + v[15] ^= S->t[1]; + } + + for (i = 0; i < 14; ++i) { + BLAKE_G(0, 4, 8, 12, 0); + BLAKE_G(1, 5, 9, 13, 2); + BLAKE_G(2, 6, 10, 14, 4); + BLAKE_G(3, 7, 11, 15, 6); + 
BLAKE_G(3, 4, 9, 14, 14); + BLAKE_G(2, 7, 8, 13, 12); + BLAKE_G(0, 5, 10, 15, 8); + BLAKE_G(1, 6, 11, 12, 10); + } + + for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; + for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; +} + +__device__ void cn_blake_update(blake_state * __restrict__ S, const uint8_t * __restrict__ data, uint64_t datalen) +{ + int left = S->buflen >> 3; + int fill = 64 - left; + + if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) { + memcpy((void *) (S->buf + left), (void *) data, fill); + S->t[0] += 512; + if (S->t[0] == 0) S->t[1]++; + cn_blake_compress(S, S->buf); + data += fill; + datalen -= (fill << 3); + left = 0; + } + + while (datalen >= 512) { + S->t[0] += 512; + if (S->t[0] == 0) S->t[1]++; + cn_blake_compress(S, data); + data += 64; + datalen -= 512; + } + + if (datalen > 0) { + memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); + S->buflen = (left << 3) + datalen; + } else { + S->buflen = 0; + } +} + +__device__ +void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest) +{ + const uint8_t padding[] = { + 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + }; + uint8_t pa = 0x81, pb = 0x01; + uint8_t msglen[8]; + uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; + if (lo < (unsigned) S->buflen) hi++; + U32TO8(msglen + 0, hi); + U32TO8(msglen + 4, lo); + + if (S->buflen == 440) { + S->t[0] -= 8; + cn_blake_update(S, &pa, 8); + } else { + if (S->buflen < 440) { + if (S->buflen == 0) S->nullt = 1; + S->t[0] -= 440 - S->buflen; + cn_blake_update(S, padding, 440 - S->buflen); + } else { + S->t[0] -= 512 - S->buflen; + cn_blake_update(S, padding, 512 - S->buflen); + S->t[0] -= 440; + cn_blake_update(S, padding + 1, 440); + S->nullt = 1; + } + cn_blake_update(S, &pb, 8); + S->t[0] -= 8; + } + S->t[0] -= 64; + cn_blake_update(S, msglen, 64); + + U32TO8(digest + 0, S->h[0]); + U32TO8(digest + 4, S->h[1]); + U32TO8(digest + 8, S->h[2]); + U32TO8(digest + 12, S->h[3]); + U32TO8(digest + 16, S->h[4]); + U32TO8(digest + 20, S->h[5]); + U32TO8(digest + 24, S->h[6]); + U32TO8(digest + 28, S->h[7]); +} + +__device__ +void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint32_t * out) +{ + blake_state bs; + blake_state *S = (blake_state *)&bs; + + S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372; + S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C; + S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19; + S->t[0] = S->t[1] = S->buflen = S->nullt = 0; + S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; + + cn_blake_update(S, (uint8_t *)in, inlen * 8); + cn_blake_final(S, (uint8_t *)out); +} diff --git a/crypto/cn_groestl.cuh b/crypto/cn_groestl.cuh new file mode 100644 index 0000000000..425e0628df --- /dev/null +++ b/crypto/cn_groestl.cuh @@ -0,0 +1,348 @@ + +#define GROESTL_ROWS 8 +#define GROESTL_LENGTHFIELDLEN GROESTL_ROWS +#define GROESTL_COLS512 8 +#define GROESTL_SIZE512 (GROESTL_ROWS*GROESTL_COLS512) +#define GROESTL_ROUNDS512 10 +#define GROESTL_HASH_BIT_LEN 256 + +#define GROESTL_ROTL32(v, n) ROTL32(v, n) + +#define li_32(h) 0x##h##u +#define GROESTL_EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n))) +#define u32BIG(a) \ + ((GROESTL_ROTL32(a,8) & li_32(00FF00FF)) | \ + (GROESTL_ROTL32(a,24) & li_32(FF00FF00))) + +typedef struct { + uint32_t chaining[GROESTL_SIZE512/sizeof(uint32_t)]; + uint32_t block_counter1, block_counter2; + BitSequence buffer[GROESTL_SIZE512]; + int buf_ptr; + int bits_in_last_byte; +} groestlHashState; 
+ +__constant__ uint32_t d_groestl_T[512] = { + 0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0x0d17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc, + 0x50f09060, 0x6050c0f0, 0x03050702, 0x02030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5, + 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0x0b1de6fb, 0xfb0bed1d, + 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded, + 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x0206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1, + 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x0818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441, + 0x0c141c08, 0x080c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0x0f111b0a, 0x0a0f1411, 0xb5c4eb2f, 0x2fb55ec4, + 0x091b150e, 0x0e091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba, + 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616, + 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2, + 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0, 0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c, + 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de, + 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7, + 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x060a0e04, 0x0406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e, + 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x058a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x040cfdf1, 0xf104f90c, + 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0x0e12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7, + 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b, + 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4, + 0xa0fb3bc0, 0xc0a09bfb, 
0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e, + 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a, + 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0x0a1e120c, 0x0c0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37, + 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86, + 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x018c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b, + 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x0709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028, + 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3, + 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0x0d861a91, 0x85949b0f, 0xf851e94, + 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x050f0906, 0x06050c0f, 0x0103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836, + 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0, + 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x07890e80, 0xa7f2c133, 0x33a766f2, + 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e, + 0x8f8a8903, 0x038f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x0980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3, + 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e +}; + +#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) { \ + temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \ + v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \ + v1 = temp_var; \ +} + +#define GROESTL_COLUMN(x,y,i, c0,c1,c2,c3,c4,c5,c6,c7, tv1,tv2,tu,tl,t) \ + tu = d_groestl_T[2*(uint32_t)x[4*c0+0]]; \ + tl = d_groestl_T[2*(uint32_t)x[4*c0+0]+1]; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c1+1]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c1+1]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c2+2]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c2+2]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c3+3]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c3+3]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) 
\ + tu ^= tv1; \ + tl ^= tv2; \ + tl ^= d_groestl_T[2*(uint32_t)x[4*c4+0]]; \ + tu ^= d_groestl_T[2*(uint32_t)x[4*c4+0]+1]; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c5+1]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c5+1]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c6+2]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c6+2]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = d_groestl_T[2*(uint32_t)x[4*c7+3]]; \ + tv2 = d_groestl_T[2*(uint32_t)x[4*c7+3]+1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ + tl ^= tv1; \ + tu ^= tv2; \ + y[i] = tu; \ + y[i+1] = tl; + +__device__ +void cn_groestl_RND512P(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r) +{ + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[ 0] ^= 0x00000000^r; + x32[ 2] ^= 0x00000010^r; + x32[ 4] ^= 0x00000020^r; + x32[ 6] ^= 0x00000030^r; + x32[ 8] ^= 0x00000040^r; + x32[10] ^= 0x00000050^r; + x32[12] ^= 0x00000060^r; + x32[14] ^= 0x00000070^r; + GROESTL_COLUMN(x,y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +} + +__device__ +void cn_groestl_RND512Q(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r) +{ + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[ 0] = ~x32[ 0]; + x32[ 1] ^= 0xffffffff^r; + x32[ 2] = ~x32[ 2]; + x32[ 3] ^= 0xefffffff^r; + x32[ 4] = ~x32[ 4]; + x32[ 5] ^= 0xdfffffff^r; + x32[ 6] = ~x32[ 6]; + x32[ 7] ^= 0xcfffffff^r; + x32[ 8] = ~x32[ 8]; + x32[ 9] ^= 0xbfffffff^r; + x32[10] = ~x32[10]; + x32[11] ^= 0xafffffff^r; + x32[12] = ~x32[12]; + x32[13] ^= 0x9fffffff^r; + x32[14] = ~x32[14]; + x32[15] ^= 0x8fffffff^r; + GROESTL_COLUMN(x,y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x,y,14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +} + +__device__ +void cn_groestl_F512(uint32_t * 
__restrict__ h, const uint32_t * __restrict__ m) +{ + int i; + uint32_t Ptmp[2*GROESTL_COLS512]; + uint32_t Qtmp[2*GROESTL_COLS512]; + uint32_t y[2*GROESTL_COLS512]; + uint32_t z[2*GROESTL_COLS512]; + + for (i = 0; i < 2*GROESTL_COLS512; i++) { + z[i] = m[i]; + Ptmp[i] = h[i]^m[i]; + } + + cn_groestl_RND512Q((uint8_t*)z, y, 0x00000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x01000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x02000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x03000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x04000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x05000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x06000000); + cn_groestl_RND512Q((uint8_t*)y, z, 0x07000000); + cn_groestl_RND512Q((uint8_t*)z, y, 0x08000000); + cn_groestl_RND512Q((uint8_t*)y, Qtmp, 0x09000000); + + cn_groestl_RND512P((uint8_t*)Ptmp, y, 0x00000000); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000001); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000002); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000003); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000004); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000005); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000006); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000007); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000008); + cn_groestl_RND512P((uint8_t*)y, Ptmp, 0x00000009); + + for (i = 0; i < 2*GROESTL_COLS512; i++) + h[i] ^= Ptmp[i]^Qtmp[i]; +} + +__device__ +void cn_groestl_outputtransformation(groestlHashState *ctx) +{ + int j; + uint32_t temp[2*GROESTL_COLS512]; + uint32_t y[2*GROESTL_COLS512]; + uint32_t z[2*GROESTL_COLS512]; + + for (j = 0; j < 2*GROESTL_COLS512; j++) + temp[j] = ctx->chaining[j]; + + cn_groestl_RND512P((uint8_t*)temp, y, 0x00000000); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000001); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000002); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000003); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000004); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000005); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000006); + cn_groestl_RND512P((uint8_t*)y, z, 0x00000007); + cn_groestl_RND512P((uint8_t*)z, y, 0x00000008); + cn_groestl_RND512P((uint8_t*)y, temp, 0x00000009); + + for (j = 0; j < 2*GROESTL_COLS512; j++) + ctx->chaining[j] ^= temp[j]; +} + +__device__ +void cn_groestl_transform(groestlHashState * __restrict__ ctx, + const uint8_t * __restrict__ input, int msglen) +{ + for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) { + cn_groestl_F512(ctx->chaining,(uint32_t*)input); + ctx->block_counter1++; + if (ctx->block_counter1 == 0) ctx->block_counter2++; + } +} + +__device__ +void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restrict__ output) +{ + int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN/8; + uint8_t *s = (BitSequence*)ctx->chaining; + + if (ctx->bits_in_last_byte) { + ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<ctx->bits_in_last_byte)-1)<<(8-ctx->bits_in_last_byte); + ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-ctx->bits_in_last_byte); + ctx->bits_in_last_byte = 0; + } + else ctx->buffer[(int)ctx->buf_ptr++] = 0x80; + + if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) { + while (ctx->buf_ptr < GROESTL_SIZE512) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); + ctx->buf_ptr = 0; + } + while (ctx->buf_ptr < GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + + ctx->block_counter1++; + if (ctx->block_counter1 == 0) ctx->block_counter2++; + ctx->buf_ptr = GROESTL_SIZE512; + + while 
(ctx->buf_ptr > GROESTL_SIZE512-(int)sizeof(uint32_t)) { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; + ctx->block_counter1 >>= 8; + } + while (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; + ctx->block_counter2 >>= 8; + } + cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); + cn_groestl_outputtransformation(ctx); + + for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) { + output[j] = s[i]; + } +#if 0 + for (i = 0; i < GROESTL_COLS512; i++) { + ctx->chaining[i] = 0; + } + for (i = 0; i < GROESTL_SIZE512; i++) { + ctx->buffer[i] = 0; + } +#endif +} + +__device__ +void cn_groestl_update(groestlHashState* __restrict__ ctx, + const BitSequence* __restrict__ input, DataLength databitlen) +{ + int index = 0; + int msglen = (int)(databitlen/8); + int rem = (int)(databitlen%8); + + if (ctx->buf_ptr) { + while (ctx->buf_ptr < GROESTL_SIZE512 && index < msglen) { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } + if (ctx->buf_ptr < GROESTL_SIZE512) { + if (rem) { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } + return; + } + + ctx->buf_ptr = 0; + cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); + } + + cn_groestl_transform(ctx, input+index, msglen-index); + index += ((msglen-index)/GROESTL_SIZE512)*GROESTL_SIZE512; + + while (index < msglen) { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } + + if (rem) { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } +} + +__device__ +void cn_groestl_init(groestlHashState* ctx) +{ + int i = 0; + + for(;i<(GROESTL_SIZE512/sizeof(uint32_t));i++) + ctx->chaining[i] = 0; + + ctx->chaining[2*GROESTL_COLS512-1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN); + ctx->buf_ptr = 0; + ctx->block_counter1 = 0; + ctx->block_counter2 = 0; + ctx->bits_in_last_byte = 0; +} + +__device__ +void cn_groestl(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval) +{ + DataLength databitlen = len << 3; + groestlHashState context; + + cn_groestl_init(&context); + cn_groestl_update(&context, (BitSequence*) data, databitlen); + cn_groestl_final(&context, (BitSequence*) hashval); +} diff --git a/crypto/cn_jh.cuh b/crypto/cn_jh.cuh new file mode 100644 index 0000000000..b05380d4b3 --- /dev/null +++ b/crypto/cn_jh.cuh @@ -0,0 +1,296 @@ + +typedef struct { + int hashbitlen; + unsigned long long databitlen; + unsigned long long datasize_in_buffer; + uint64_t x[8][2]; + unsigned char buffer[64]; +} jhHashState; + +__constant__ unsigned char d_JH256_H0[512] = { + 0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, + 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, + 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, + 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, + 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62, + 0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, + 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, + 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69 +}; + +__constant__ unsigned char d_E8_rc[42][32] = { + {0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 
0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40}, + {0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31}, + {0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc}, + {0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3}, + {0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23}, + {0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97}, + {0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14}, + {0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4}, + {0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36}, + {0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f}, + {0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b}, + {0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62}, + {0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5}, + {0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f}, + {0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a}, + {0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf}, + {0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0}, + {0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a}, + {0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 
0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6}, + {0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67}, + {0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18}, + {0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e}, + {0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1}, + {0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83}, + {0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef}, + {0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65}, + {0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c}, + {0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71}, + {0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0}, + {0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f}, + {0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad}, + {0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6}, + {0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63}, + {0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f}, + {0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a}, + {0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5}, + {0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48}, + {0x38, 
0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e}, + {0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7}, + {0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde}, + {0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a}, + {0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2} +}; + +#define JH_SWAP1(x) (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1)); +#define JH_SWAP2(x) (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2)); +#define JH_SWAP4(x) (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4)); +#define JH_SWAP8(x) (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8)); +#define JH_SWAP16(x) (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16)); +#define JH_SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); + +#define JH_L(m0,m1,m2,m3,m4,m5,m6,m7) \ + (m4) ^= (m1); \ + (m5) ^= (m2); \ + (m6) ^= (m0) ^ (m3); \ + (m7) ^= (m0); \ + (m0) ^= (m5); \ + (m1) ^= (m6); \ + (m2) ^= (m4) ^ (m7); \ + (m3) ^= (m4); + +#define JH_SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1) \ + m3 = ~(m3); \ + m7 = ~(m7); \ + m0 ^= ((~(m2)) & (cc0)); \ + m4 ^= ((~(m6)) & (cc1)); \ + temp0 = (cc0) ^ ((m0) & (m1));\ + temp1 = (cc1) ^ ((m4) & (m5));\ + m0 ^= ((m2) & (m3)); \ + m4 ^= ((m6) & (m7)); \ + m3 ^= ((~(m1)) & (m2)); \ + m7 ^= ((~(m5)) & (m6)); \ + m1 ^= ((m0) & (m2)); \ + m5 ^= ((m4) & (m6)); \ + m2 ^= ((m0) & (~(m3))); \ + m6 ^= ((m4) & (~(m7))); \ + m0 ^= ((m1) | (m3)); \ + m4 ^= ((m5) | (m7)); \ + m3 ^= ((m1) & (m2)); \ + m7 ^= ((m5) & (m6)); \ + m1 ^= (temp0 & (m0)); \ + m5 ^= (temp1 & (m4)); \ + m2 ^= temp0; \ + m6 ^= temp1; + +__device__ +void cn_jh_E8(jhHashState *state) +{ + uint64_t i,roundnumber,temp0,temp1; + + for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) + { + for (i = 0; i < 2; i++) { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i], + ((uint64_t *)d_E8_rc[roundnumber+0])[i],((uint64_t *)d_E8_rc[roundnumber+0])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP1(state->x[1][i]); JH_SWAP1(state->x[3][i]); JH_SWAP1(state->x[5][i]); JH_SWAP1(state->x[7][i]); + } + + for (i = 0; i < 2; i++) { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i], + ((uint64_t *)d_E8_rc[roundnumber+1])[i],((uint64_t *)d_E8_rc[roundnumber+1])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP2(state->x[1][i]); JH_SWAP2(state->x[3][i]); JH_SWAP2(state->x[5][i]); JH_SWAP2(state->x[7][i]); + } + + for (i = 0; i < 2; i++) { + 
JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i], + ((uint64_t *)d_E8_rc[roundnumber+2])[i],((uint64_t *)d_E8_rc[roundnumber+2])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP4(state->x[1][i]); JH_SWAP4(state->x[3][i]); JH_SWAP4(state->x[5][i]); JH_SWAP4(state->x[7][i]); + } + + for (i = 0; i < 2; i++) { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i], + ((uint64_t *)d_E8_rc[roundnumber+3])[i],((uint64_t *)d_E8_rc[roundnumber+3])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP8(state->x[1][i]); JH_SWAP8(state->x[3][i]); JH_SWAP8(state->x[5][i]); JH_SWAP8(state->x[7][i]); + } + + for (i = 0; i < 2; i++) { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i], + ((uint64_t *)d_E8_rc[roundnumber+4])[i],((uint64_t *)d_E8_rc[roundnumber+4])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP16(state->x[1][i]); JH_SWAP16(state->x[3][i]); JH_SWAP16(state->x[5][i]); JH_SWAP16(state->x[7][i]); + } + + for (i = 0; i < 2; i++) { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i], + ((uint64_t *)d_E8_rc[roundnumber+5])[i],((uint64_t *)d_E8_rc[roundnumber+5])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SWAP32(state->x[1][i]); JH_SWAP32(state->x[3][i]); JH_SWAP32(state->x[5][i]); JH_SWAP32(state->x[7][i]); + } + + for (i = 0; i < 2; i++) { + JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i], + ((uint64_t *)d_E8_rc[roundnumber+6])[i],((uint64_t *)d_E8_rc[roundnumber+6])[i+2] ); + JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + } + + for (i = 1; i < 8; i = i+2) { + temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; state->x[i][1] = temp0; + } + } +} + +__device__ +void cn_jh_F8(jhHashState *state) +{ + uint64_t i; + for (i = 0; i < 8; i++) { + state->x[i >> 1][i & 1] ^= ((uint64_t *)state->buffer)[i]; + } + + cn_jh_E8(state); + + for (i = 0; i < 8; i++) { + state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64_t *)state->buffer)[i]; + } +} + +__device__ +void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen) +{ + DataLength index; + + state->databitlen += databitlen; + index = 0; + + if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) + { + if ( (databitlen & 7) == 0 ) { + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)); + } + else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1); + state->datasize_in_buffer += databitlen; + databitlen = 0; + } + + if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) + { + memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ); + index = 
64-(state->datasize_in_buffer >> 3); + databitlen = databitlen - (512 - state->datasize_in_buffer); + cn_jh_F8(state); + state->datasize_in_buffer = 0; + } + + for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) { + memcpy(state->buffer, data+index, 64); + cn_jh_F8(state); + } + + if ( databitlen > 0) { + if ((databitlen & 7) == 0) + memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3); + else + memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1); + state->datasize_in_buffer = databitlen; + } +} + +/* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */ +__device__ +void cn_jh_final(jhHashState * __restrict__ state, uint8_t * __restrict__ hashval) +{ + unsigned int i; + //uint32_t *bufptr = (uint32_t *)state->buffer; + + if ( (state->databitlen & 0x1ff) == 0 ) { + /* pad the message when databitlen is multiple of 512 bits, then process the padded block */ + memset(state->buffer, 0, 64); + //for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000; + state->buffer[0] = 0x80; + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + cn_jh_F8(state); + + } else { + + /* set the rest of the bytes in the buffer to 0 */ + if ( (state->datasize_in_buffer & 7) == 0) { + for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0; + } else { + for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) state->buffer[i] = 0; + } + + /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ + state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7)); + + cn_jh_F8(state); + memset(state->buffer, 0, 64); + //for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000; + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + cn_jh_F8(state); + } + + memcpy(hashval, ((unsigned char*)state->x) + 64 + 32, 32); + //MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8); +} + +__device__ +void cn_jh_init(jhHashState *state, int hashbitlen) +{ + state->databitlen = 0; + state->datasize_in_buffer = 0; + state->hashbitlen = hashbitlen; + memcpy(state->x, d_JH256_H0, 128); + //MEMCPY8(state->x, d_JH256_H0, 128 / 8); +} + +__device__ +void cn_jh(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval) +{ + const int hashbitlen = 256; + DataLength databitlen = len << 3; + jhHashState state; + + cn_jh_init(&state, hashbitlen); + cn_jh_update(&state, data, databitlen); + cn_jh_final(&state, (uint8_t*) hashval); +} diff --git a/crypto/cn_keccak.cuh b/crypto/cn_keccak.cuh new file mode 100644 index 0000000000..c6f5908f61 --- /dev/null +++ b/crypto/cn_keccak.cuh @@ -0,0 +1,211 @@ +__constant__ uint64_t keccakf_rndc[24] = { + 0x0000000000000001, 0x0000000000008082, 
0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +#if __CUDA_ARCH__ >= 350 + __forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) + { + uint2 result; + if(offset >= 32) + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } + else + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); + } + #define rotl64_1(x, y) (cuda_rotl64((x), (y))) +#else + #define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y)))) +#endif + +#define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y)) +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) + +__device__ __forceinline__ +void cn_keccakf2(uint64_t *s) +{ + uint8_t i; + + for(i = 0; i < 24; ++i) + { + uint64_t bc[5], tmpxor[5], tmp1, tmp2; + + tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotl64_2(s[6] ^ bc[0], 12); + s[6] = rotl64_1(s[9] ^ bc[3], 20); + s[9] = rotl64_2(s[22] ^ bc[1], 29); + s[22] = rotl64_2(s[14] ^ bc[3], 7); + s[14] = rotl64_1(s[20] ^ bc[4], 18); + s[20] = rotl64_2(s[2] ^ bc[1], 30); + s[2] = rotl64_2(s[12] ^ bc[1], 11); + s[12] = rotl64_1(s[13] ^ bc[2], 25); + s[13] = rotl64_1(s[19] ^ bc[3], 8); + s[19] = rotl64_2(s[23] ^ bc[2], 24); + s[23] = rotl64_2(s[15] ^ bc[4], 9); + s[15] = rotl64_1(s[4] ^ bc[3], 27); + s[4] = rotl64_1(s[24] ^ bc[3], 14); + s[24] = rotl64_1(s[21] ^ bc[0], 2); + s[21] = rotl64_2(s[8] ^ bc[2], 23); + s[8] = rotl64_2(s[16] ^ bc[0], 13); + s[16] = rotl64_2(s[5] ^ bc[4], 4); + s[5] = rotl64_1(s[3] ^ bc[2], 28); + s[3] = rotl64_1(s[18] ^ bc[2], 21); + s[18] = rotl64_1(s[17] ^ bc[1], 15); + s[17] = rotl64_1(s[11] ^ bc[0], 10); + s[11] = rotl64_1(s[7] ^ bc[1], 6); + s[7] = rotl64_1(s[10] ^ bc[4], 3); + s[10] = rotl64_1(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = 
bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccakf_rndc[i]; + } +} + +__device__ __forceinline__ +void cn_keccakf(uint64_t *s) +{ + uint64_t bc[5], tmpxor[5], tmp1, tmp2; + + tmpxor[0] = s[0] ^ s[5]; + tmpxor[1] = s[1] ^ s[6] ^ 0x8000000000000000ULL; + tmpxor[2] = s[2] ^ s[7]; + tmpxor[3] = s[3] ^ s[8]; + tmpxor[4] = s[4] ^ s[9]; + + bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotl64_2(s[6] ^ bc[0], 12); + s[6] = rotl64_1(s[9] ^ bc[3], 20); + s[9] = rotl64_2(bc[1], 29); + s[22] = rotl64_2(bc[3], 7); + s[14] = rotl64_1(bc[4], 18); + s[20] = rotl64_2(s[2] ^ bc[1], 30); + s[2] = rotl64_2(bc[1], 11); + s[12] = rotl64_1(bc[2], 25); + s[13] = rotl64_1(bc[3], 8); + s[19] = rotl64_2(bc[2], 24); + s[23] = rotl64_2(bc[4], 9); + s[15] = rotl64_1(s[4] ^ bc[3], 27); + s[4] = rotl64_1(bc[3], 14); + s[24] = rotl64_1(bc[0], 2); + s[21] = rotl64_2(s[8] ^ bc[2], 23); + s[8] = rotl64_2(0x8000000000000000ULL ^ bc[0], 13); + s[16] = rotl64_2(s[5] ^ bc[4], 4); + s[5] = rotl64_1(s[3] ^ bc[2], 28); + s[3] = rotl64_1(bc[2], 21); + s[18] = rotl64_1(bc[1], 15); + s[17] = rotl64_1(bc[0], 10); + s[11] = rotl64_1(s[7] ^ bc[1], 6); + s[7] = rotl64_1(bc[4], 3); + s[10] = rotl64_1(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = 
bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= 0x0000000000000001; + + for(int i = 1; i < 24; ++i) + { + tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + tmpxor[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + tmpxor[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + tmpxor[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotl64_2(s[6] ^ bc[0], 12); + s[6] = rotl64_1(s[9] ^ bc[3], 20); + s[9] = rotl64_2(s[22] ^ bc[1], 29); + s[22] = rotl64_2(s[14] ^ bc[3], 7); + s[14] = rotl64_1(s[20] ^ bc[4], 18); + s[20] = rotl64_2(s[2] ^ bc[1], 30); + s[2] = rotl64_2(s[12] ^ bc[1], 11); + s[12] = rotl64_1(s[13] ^ bc[2], 25); + s[13] = rotl64_1(s[19] ^ bc[3], 8); + s[19] = rotl64_2(s[23] ^ bc[2], 24); + s[23] = rotl64_2(s[15] ^ bc[4], 9); + s[15] = rotl64_1(s[4] ^ bc[3], 27); + s[4] = rotl64_1(s[24] ^ bc[3], 14); + s[24] = rotl64_1(s[21] ^ bc[0], 2); + s[21] = rotl64_2(s[8] ^ bc[2], 23); + s[8] = rotl64_2(s[16] ^ bc[0], 13); + s[16] = rotl64_2(s[5] ^ bc[4], 4); + s[5] = rotl64_1(s[3] ^ bc[2], 28); + s[3] = rotl64_1(s[18] ^ bc[2], 21); + s[18] = rotl64_1(s[17] ^ bc[1], 15); + s[17] = rotl64_1(s[11] ^ bc[0], 10); + s[11] = rotl64_1(s[7] ^ bc[1], 6); + s[7] = rotl64_1(s[10] ^ bc[4], 3); + s[10] = rotl64_1(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccakf_rndc[i]; + } +} + +__device__ __forceinline__ +void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md) +{ + uint64_t st[25]; + + MEMCPY4(st, in, 19); + MEMSET8(&st[10], 0x00, 15); + st[9] = (st[9] & 0x00000000FFFFFFFFULL) | 0x0000000100000000ULL; + st[16] = 0x8000000000000000ULL; + + cn_keccakf(st); + + MEMCPY8(md, st, 25); + return; +} diff --git a/crypto/cn_skein.cuh b/crypto/cn_skein.cuh new file mode 100644 index 0000000000..0e68143bad --- /dev/null +++ b/crypto/cn_skein.cuh @@ -0,0 +1,319 @@ +typedef unsigned int uint_t; /* native unsigned integer */ + +#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ + 
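+/* T[0] carries the running byte count, T[1] the first/final flags and block type (see the SKEIN_T1_* macros below) */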
+#define SKEIN_256_STATE_WORDS ( 4) +#define SKEIN_512_STATE_WORDS ( 8) + +#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) + +#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) + +#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) + +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ + +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ + +#define SKEIN_T1_FLAG_FIRST (((uint64_t) 1 ) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t) 1 ) << SKEIN_T1_POS_BIT_PAD) +#define SKEIN_T1_FLAG_FINAL (((uint64_t) 1 ) << SKEIN_T1_POS_FINAL) + +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ + +#define SKEIN_T1_BLK_TYPE(T) (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) + +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ + +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} + +#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) +#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) + +#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \ + Skein_Set_T0(ctxPtr,(T0)); \ + Skein_Set_T1(ctxPtr,(T1)); } + +#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ +{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } + +#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } + +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \ + X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6; + +#define I512(R) \ + X0 += ks[((R)+1) % 9]; \ + X1 += ks[((R)+2) % 9]; \ + X2 += ks[((R)+3) % 9]; \ + X3 += ks[((R)+4) % 9]; \ + X4 += ks[((R)+5) % 9]; \ + X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ + X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ + X7 += ks[((R)+8) % 9] + (R)+1; + + +#define R512_8_rounds(R) \ + R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ + R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ + R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ + R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ + I512(2*(R)); \ + R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ + R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ + R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ + R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ + I512(2*(R)+1); + +typedef struct +{ + size_t hashBitLen; + size_t bCnt; + uint64_t T[SKEIN_MODIFIER_WORDS]; +} Skein_Ctxt_Hdr_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t X[SKEIN_256_STATE_WORDS]; + uint8_t 
b[SKEIN_256_BLOCK_BYTES]; +} Skein_256_Ctxt_t; + +typedef struct { + Skein_Ctxt_Hdr_t h; + uint64_t X[SKEIN_512_STATE_WORDS]; + uint8_t b[SKEIN_512_BLOCK_BYTES]; +} Skein_512_Ctxt_t; + +typedef struct { + uint_t statebits; + union { + Skein_Ctxt_Hdr_t h; + Skein_256_Ctxt_t ctx_256; + Skein_512_Ctxt_t ctx_512; + } u; +} skeinHashState; + +__device__ +void cn_skein_init(skeinHashState *state, size_t hashBitLen) +{ + const uint64_t SKEIN_512_IV_256[] = + { + SKEIN_MK_64(0xCCD044A1,0x2FDB3E13), + SKEIN_MK_64(0xE8359030,0x1A79A9EB), + SKEIN_MK_64(0x55AEA061,0x4F816E6F), + SKEIN_MK_64(0x2A2767A4,0xAE9B94DB), + SKEIN_MK_64(0xEC06025E,0x74DD7683), + SKEIN_MK_64(0xE7A436CD,0xC4746251), + SKEIN_MK_64(0xC36FBAF9,0x393AD185), + SKEIN_MK_64(0x3EEDBA18,0x33EDFC13) + }; + + Skein_512_Ctxt_t *ctx = &state->u.ctx_512; + + ctx->h.hashBitLen = hashBitLen; + + memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X)); + + Skein_Start_New_Type(ctx, MSG); +} + +__device__ +void cn_skein_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd) +{ + enum { + R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, + R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, + R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, + R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, + R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, + R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, + R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, + R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22 + }; + uint64_t X0,X1,X2,X3,X4,X5,X6,X7; + uint64_t w[SKEIN_512_STATE_WORDS]; + uint64_t kw[SKEIN_512_STATE_WORDS+4]; + + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + + do { + + ts[0] += byteCntAdd; + + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + memcpy(w, blkPtr, SKEIN_512_STATE_WORDS << 3); + + X0 = w[0] + ks[0]; + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + R512_8_rounds( 0); + R512_8_rounds( 1); + R512_8_rounds( 2); + R512_8_rounds( 3); + R512_8_rounds( 4); + R512_8_rounds( 5); + R512_8_rounds( 6); + R512_8_rounds( 7); + R512_8_rounds( 8); + + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } while (--blkCnt); + + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +__device__ +void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt) +{ + size_t n; + + if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + { + if (ctx->h.bCnt) { + + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; + + if (n) { + + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + + cn_skein_processblock(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + + if (msgByteCnt > SKEIN_512_BLOCK_BYTES) { + + n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES; + cn_skein_processblock(ctx, msg, n, SKEIN_512_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; + } + } + + if 
(msgByteCnt) { + + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } +} + +__device__ +void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen) +{ + if ((databitlen & 7) == 0) { + cn_skein_block(&state->u.ctx_512, data, databitlen >> 3); + } + else { + size_t bCnt = (databitlen >> 3) + 1; + uint8_t b,mask; + + mask = (uint8_t) (1u << (7 - (databitlen & 7))); + b = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask); + + cn_skein_block(&state->u.ctx_512, data, bCnt - 1); + cn_skein_block(&state->u.ctx_512, &b, 1); + + Skein_Set_Bit_Pad_Flag(state->u.h); + } +} + +__device__ +void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal) +{ + uint64_t X[SKEIN_512_STATE_WORDS]; + Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512; + const int byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; + + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) + { + memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + } + + cn_skein_processblock(ctx, ctx->b, 1, ctx->h.bCnt); + + memset(ctx->b, 0, sizeof(ctx->b)); + memcpy(X, ctx->X, sizeof(X)); + + for (int i = 0; i*SKEIN_512_BLOCK_BYTES < byteCnt; i++) + { + int n = byteCnt - i*SKEIN_512_BLOCK_BYTES; + if (n > SKEIN_512_BLOCK_BYTES) n = SKEIN_512_BLOCK_BYTES; + ((uint64_t *)ctx->b)[0] = (uint64_t)i; + Skein_Start_New_Type(ctx, OUT_FINAL); + cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t)); + memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n); + memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time + } +} + +__device__ +void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval) +{ + int hashbitlen = 256; + DataLength databitlen = len << 3; + skeinHashState state; + + state.statebits = 64*SKEIN_512_STATE_WORDS; + + cn_skein_init(&state, hashbitlen); + cn_skein_update(&state, data, databitlen); + cn_skein_final(&state, (uint8_t*) hashval); +} diff --git a/crypto/cpu/c_keccak.c b/crypto/cpu/c_keccak.c new file mode 100644 index 0000000000..d6721bf7a6 --- /dev/null +++ b/crypto/cpu/c_keccak.c @@ -0,0 +1,122 @@ +// keccak.c +// 19-Nov-11 Markku-Juhani O. Saarinen +// A baseline Keccak (3rd round) implementation. 
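+// Host-side reference Keccak; the miner's CPU verification path (cryptolight-cpu.cpp) uses keccakf()/keccak1600() from here.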
+ +#include "c_keccak.h" + +const uint64_t keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +const int keccakf_rotc[24] = +{ + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 +}; + +const int keccakf_piln[24] = +{ + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 +}; + +// update the state with given number of rounds + +void keccakf(uint64_t st[25], int rounds) +{ + int i, j, round; + uint64_t t, bc[5]; + + for (round = 0; round < rounds; ++round) { + + // Theta + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; + + for (i = 0; i < 5; ++i) { + t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); + st[i ] ^= t; + st[i + 5] ^= t; + st[i + 10] ^= t; + st[i + 15] ^= t; + st[i + 20] ^= t; + } + + // Rho Pi + t = st[1]; + for (i = 0; i < 24; ++i) { + bc[0] = st[keccakf_piln[i]]; + st[keccakf_piln[i]] = ROTL64(t, keccakf_rotc[i]); + t = bc[0]; + } + + // Chi + for (j = 0; j < 25; j += 5) { + bc[0] = st[j ]; + bc[1] = st[j + 1]; + bc[2] = st[j + 2]; + bc[3] = st[j + 3]; + bc[4] = st[j + 4]; + st[j ] ^= (~bc[1]) & bc[2]; + st[j + 1] ^= (~bc[2]) & bc[3]; + st[j + 2] ^= (~bc[3]) & bc[4]; + st[j + 3] ^= (~bc[4]) & bc[0]; + st[j + 4] ^= (~bc[0]) & bc[1]; + } + + // Iota + st[0] ^= keccakf_rndc[round]; + } +} + +// compute a keccak hash (md) of given byte length from "in" +typedef uint64_t state_t[25]; + +int keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen) +{ + state_t st; + uint8_t temp[144]; + int i, rsiz, rsizw; + + rsiz = sizeof(state_t) == mdlen ? HASH_DATA_AREA : 200 - 2 * mdlen; + rsizw = rsiz / 8; + + memset(st, 0, sizeof(st)); + + for ( ; inlen >= rsiz; inlen -= rsiz, in += rsiz) { + for (i = 0; i < rsizw; i++) + st[i] ^= ((uint64_t *) in)[i]; + keccakf(st, KECCAK_ROUNDS); + } + + // last block and padding + memcpy(temp, in, inlen); + temp[inlen++] = 1; + memset(temp + inlen, 0, rsiz - inlen); + temp[rsiz - 1] |= 0x80; + + for (i = 0; i < rsizw; i++) + st[i] ^= ((uint64_t *) temp)[i]; + + keccakf(st, KECCAK_ROUNDS); + + memcpy(md, st, mdlen); + + return 0; +} + +void keccak1600(const uint8_t *in, int inlen, uint8_t *md) +{ + keccak(in, inlen, md, sizeof(state_t)); +} diff --git a/crypto/cpu/c_keccak.h b/crypto/cpu/c_keccak.h new file mode 100644 index 0000000000..566a305c5b --- /dev/null +++ b/crypto/cpu/c_keccak.h @@ -0,0 +1,34 @@ +// keccak.h +// 19-Nov-11 Markku-Juhani O. 
Saarinen + +#ifndef KECCAK_H +#define KECCAK_H + +#include +#include + +#ifndef KECCAK_ROUNDS +#define KECCAK_ROUNDS 24 +#endif + +#ifndef ROTL64 +#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y)))) +#endif + +#ifndef HASH_SIZE +#define HASH_SIZE 32 +#endif + +#ifndef HASH_DATA_AREA +#define HASH_DATA_AREA 136 +#endif + +// compute a keccak hash (md) of given byte length from "in" +int keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen); + +// update the state +void keccakf(uint64_t st[25], int norounds); + +void keccak1600(const uint8_t *in, int inlen, uint8_t *md); + +#endif diff --git a/crypto/cryptolight-core.cu b/crypto/cryptolight-core.cu new file mode 100644 index 0000000000..8f0bb75e61 --- /dev/null +++ b/crypto/cryptolight-core.cu @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include + +#include "cryptolight.h" +#define LONG_SHL_IDX 18 +#define LONG_LOOPS32 0x40000 + +#include "cn_aes.cuh" + +#define MUL_SUM_XOR_DST(a,c,dst) { \ + uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \ + hi += ((uint64_t *)c)[0]; \ + ((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \ + ((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \ + ((uint64_t *)dst)[0] = hi; \ + ((uint64_t *)dst)[1] = lo; } + +__device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) +{ + *product_hi = __umul64hi(multiplier, multiplicand); + return(multiplier * multiplicand); +} + +__global__ +void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key1) +{ + __shared__ uint32_t __align__(16) sharedMemory[1024]; + + cn_aes_gpu_init(sharedMemory); + + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + const int sub = (threadIdx.x & 7) << 2; + + if(thread < threads) + { + const int oft = thread * 50 + sub + 16; // not aligned 16! 
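+ // ctx_state holds one 200-byte Keccak state (50 u32) per thread; the +16 word (64-byte) offset points at the state.init area, matching the CPU implementation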
+ const int long_oft = (thread << LONG_SHL_IDX) + sub; + uint32_t __align__(16) key[40]; + uint32_t __align__(16) text[4]; + + // copy 160 bytes + #pragma unroll + for (int i = 0; i < 40; i += 4) + AS_UINT4(&key[i]) = AS_UINT4(ctx_key1 + thread * 40 + i); + + AS_UINT2(&text[0]) = AS_UINT2(&ctx_state[oft]); + AS_UINT2(&text[2]) = AS_UINT2(&ctx_state[oft + 2]); + + __syncthreads(); + for(int i = 0; i < LONG_LOOPS32; i += 32) { + cn_aes_pseudo_round_mut(sharedMemory, text, key); + AS_UINT4(&long_state[long_oft + i]) = AS_UINT4(text); + } + } +} + +// -------------------------------------------------------------------------------------------------------------- + +__global__ +void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) +{ + __shared__ uint32_t __align__(16) sharedMemory[1024]; + + cn_aes_gpu_init(sharedMemory); + + __syncthreads(); + +#if 0 && __CUDA_ARCH__ >= 300 + + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; + const int sub = threadIdx.x & 3; + + if(thread < threads) + { + const int batchsize = ITER >> (2 + bfactor); + const int start = partidx * batchsize; + const int end = start + batchsize; + uint32_t * __restrict__ long_state = &d_long_state[thread << LONG_SHL_IDX]; + uint32_t * __restrict__ ctx_a = d_ctx_a + thread * 4; + uint32_t * __restrict__ ctx_b = d_ctx_b + thread * 4; + uint32_t a, b, c, x[4]; + uint32_t t1[4], t2[4], res; + uint64_t reshi, reslo; + int j; + + a = ctx_a[sub]; + b = ctx_b[sub]; + + #pragma unroll 8 + for(int i = start; i < end; ++i) + { + //j = ((uint32_t *)a)[0] & 0xFFFF0; + j = (__shfl((int)a, 0, 4) & E2I_MASK1) >> 2; + + //cn_aes_single_round(sharedMemory, &long_state[j], c, a); + x[0] = long_state[j + sub]; + x[1] = __shfl((int)x[0], sub + 1, 4); + x[2] = __shfl((int)x[0], sub + 2, 4); + x[3] = __shfl((int)x[0], sub + 3, 4); + c = a ^ + t_fn0(x[0] & 0xff) ^ + t_fn1((x[1] >> 8) & 0xff) ^ + t_fn2((x[2] >> 16) & 0xff) ^ + t_fn3((x[3] >> 24) & 0xff); + + //XOR_BLOCKS_DST(c, b, &long_state[j]); + long_state[j + sub] = c ^ b; + + //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0xFFFF0]); + j = (__shfl((int)c, 0, 4) & E2I_MASK1) >> 2; + #pragma unroll + for(int k = 0; k < 2; k++) + t1[k] = __shfl((int)c, k, 4); + #pragma unroll + for(int k = 0; k < 4; k++) + t2[k] = __shfl((int)a, k, 4); + asm( + "mad.lo.u64 %0, %2, %3, %4;\n\t" + "mad.hi.u64 %1, %2, %3, %5;\n\t" + : "=l"(reslo), "=l"(reshi) + : "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)long_state)[j >> 1]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0])); + res = (sub & 2 ? reslo : reshi) >> (sub & 1 ? 
32 : 0); + a = long_state[j + sub] ^ res; + long_state[j + sub] = res; + + //j = ((uint32_t *)a)[0] & 0xFFFF0; + j = (__shfl((int)a, 0, 4) & E2I_MASK1) >> 2; + + //cn_aes_single_round(sharedMemory, &long_state[j], b, a); + x[0] = long_state[j + sub]; + x[1] = __shfl((int)x[0], sub + 1, 4); + x[2] = __shfl((int)x[0], sub + 2, 4); + x[3] = __shfl((int)x[0], sub + 3, 4); + b = a ^ + t_fn0(x[0] & 0xff) ^ + t_fn1((x[1] >> 8) & 0xff) ^ + t_fn2((x[2] >> 16) & 0xff) ^ + t_fn3((x[3] >> 24) & 0xff); + + //XOR_BLOCKS_DST(b, c, &long_state[j]); + long_state[j + sub] = c ^ b; + + //MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0xFFFF0]); + j = (__shfl((int)b, 0, 4) & E2I_MASK1) >> 2; + + #pragma unroll + for(int k = 0; k < 2; k++) + t1[k] = __shfl((int)b, k, 4); + + #pragma unroll + for(int k = 0; k < 4; k++) + t2[k] = __shfl((int)a, k, 4); + asm( + "mad.lo.u64 %0, %2, %3, %4;\n\t" + "mad.hi.u64 %1, %2, %3, %5;\n\t" + : "=l"(reslo), "=l"(reshi) + : "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)long_state)[j >> 1]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0])); + res = (sub & 2 ? reslo : reshi) >> (sub & 1 ? 32 : 0); + a = long_state[j + sub] ^ res; + long_state[j + sub] = res; + } + + if(bfactor > 0) + { + ctx_a[sub] = a; + ctx_b[sub] = b; + } + } + +#else // __CUDA_ARCH__ < 300 + + const int thread = blockDim.x * blockIdx.x + threadIdx.x; + + if (thread < threads) + { + const int batchsize = ITER >> (2 + bfactor); + const int start = partidx * batchsize; + const int end = start + batchsize; + const int longptr = thread << LONG_SHL_IDX; + uint32_t * long_state = &d_long_state[longptr]; + + uint64_t * ctx_a = (uint64_t*)(&d_ctx_a[thread * 4]); + uint64_t * ctx_b = (uint64_t*)(&d_ctx_b[thread * 4]); + uint4 A = AS_UINT4(ctx_a); + uint4 B = AS_UINT4(ctx_b); + uint32_t* a = (uint32_t*)&A; + uint32_t* b = (uint32_t*)&B; + + for (int i = start; i < end; i++) // end = 262144 + { + uint32_t c[4]; + uint32_t j = (a[0] >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], c, a); + XOR_BLOCKS_DST(c, b, &long_state[j]); + MUL_SUM_XOR_DST(c, a, &long_state[(c[0] >> 2) & E2I_MASK2]); + + j = (a[0] >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], b, a); + XOR_BLOCKS_DST(b, c, &long_state[j]); + MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & E2I_MASK2]); + } + + if (bfactor > 0) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +#endif // __CUDA_ARCH__ >= 300 +} + +__device__ __forceinline__ void store_variant1(uint32_t* long_state) +{ + uint4* Z = (uint4*) long_state; + const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773); + const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1; + Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24); +} + +#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \ + uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \ + hi += ((uint64_t *)c)[0]; \ + ((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \ + ((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \ + ((uint64_t *)dst)[0] = hi; \ + ((uint64_t *)dst)[1] = lo ^ tweak; } + +__global__ +void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint64_t * __restrict__ d_tweak) +{ + __shared__ __align__(16) uint32_t sharedMemory[1024]; + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if 
(thread < threads) + { + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + const uint32_t longptr = thread << LONG_SHL_IDX; + uint32_t * long_state = &d_long_state[longptr]; + uint64_t tweak = d_tweak[thread]; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2]); + void * ctx_b = (void*)(&d_ctx_b[thread << 2]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); + uint32_t* a = (uint32_t*)&A; + uint32_t* b = (uint32_t*)&B; + + for (int i = start; i < end; i++) + { + uint32_t c[4]; + uint32_t j = (A.x >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], c, a); + XOR_BLOCKS_DST(c, b, &long_state[j]); + store_variant1(&long_state[j]); + MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak); + + j = (A.x >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], b, a); + XOR_BLOCKS_DST(b, c, &long_state[j]); + store_variant1(&long_state[j]); + MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak); + } + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + +__global__ +void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2) +{ + __shared__ uint32_t __align__(16) sharedMemory[1024]; + + cn_aes_gpu_init(sharedMemory); + + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + const int sub = (threadIdx.x & 7) << 2; + + if(thread < threads) + { + const int long_oft = (thread << LONG_SHL_IDX) + sub; + const int oft = thread * 50 + sub + 16; + uint32_t __align__(16) key[40]; + uint32_t __align__(16) text[4]; + + #pragma unroll + for (int i = 0; i < 40; i += 4) + AS_UINT4(&key[i]) = AS_UINT4(ctx_key2 + thread * 40 + i); + + AS_UINT2(&text[0]) = AS_UINT2(&ctx_state[oft + 0]); + AS_UINT2(&text[2]) = AS_UINT2(&ctx_state[oft + 2]); + + __syncthreads(); + for(int i = 0; i < LONG_LOOPS32; i += 32) + { + #pragma unroll + for(int j = 0; j < 4; j++) + text[j] ^= long_state[long_oft + i + j]; + + cn_aes_pseudo_round_mut(sharedMemory, text, key); + } + + AS_UINT2(&ctx_state[oft + 0]) = AS_UINT2(&text[0]); + AS_UINT2(&ctx_state[oft + 2]) = AS_UINT2(&text[2]); + } +} + +extern int device_bfactor[MAX_GPUS]; + +__host__ +void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, + uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak) +{ + dim3 grid(blocks); + dim3 block(threads); + dim3 block4(threads << 2); + dim3 block8(threads << 3); + + const int bfactor = device_bfactor[thr_id]; + const int bsleep = bfactor ? 100 : 0; + + int i, partcount = 1 << bfactor; + int dev_id = device_map[thr_id]; + + cryptolight_core_gpu_phase1 <<>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + if(partcount > 1) usleep(bsleep); + + for(i = 0; i < partcount; i++) + { + dim3 b = device_sm[dev_id] >= 300 ? 
block4 : block; + if (variant == 0) + cryptolight_old_gpu_phase2 <<>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + else + cryptolight_gpu_phase2 <<>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + if(partcount > 1) usleep(bsleep); + } + + cryptolight_core_gpu_phase3 <<>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); +} diff --git a/crypto/cryptolight-cpu.cpp b/crypto/cryptolight-cpu.cpp new file mode 100644 index 0000000000..14cd3af2ee --- /dev/null +++ b/crypto/cryptolight-cpu.cpp @@ -0,0 +1,246 @@ +#include +#include + +#include "oaes_lib.h" +#include "cryptolight.h" + +extern "C" { +#include +#include +#include +#include +#include "cpu/c_keccak.h" +} + +struct cryptonight_ctx { + uint8_t long_state[MEMORY]; + union cn_slow_hash_state state; + uint8_t text[INIT_SIZE_BYTE]; + uint8_t a[AES_BLOCK_SIZE]; + uint8_t b[AES_BLOCK_SIZE]; + uint8_t c[AES_BLOCK_SIZE]; + oaes_ctx* aes_ctx; +}; + +static void do_blake_hash(const void* input, int len, void* output) +{ + uchar hash[32]; + sph_blake256_context ctx; + sph_blake256_set_rounds(14); + sph_blake256_init(&ctx); + sph_blake256(&ctx, input, len); + sph_blake256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +static void do_groestl_hash(const void* input, int len, void* output) +{ + uchar hash[32]; + sph_groestl256_context ctx; + sph_groestl256_init(&ctx); + sph_groestl256(&ctx, input, len); + sph_groestl256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +static void do_jh_hash(const void* input, int len, void* output) +{ + uchar hash[64]; + sph_jh256_context ctx; + sph_jh256_init(&ctx); + sph_jh256(&ctx, input, len); + sph_jh256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +static void do_skein_hash(const void* input, int len, void* output) +{ + uchar hash[32]; + sph_skein256_context ctx; + sph_skein256_init(&ctx); + sph_skein256(&ctx, input, len); + sph_skein256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +// todo: use sph if possible +static void keccak_hash_permutation(union hash_state *state) { + keccakf((uint64_t*)state, 24); +} + +static void keccak_hash_process(union hash_state *state, const uint8_t *buf, int count) { + keccak1600(buf, (int)count, (uint8_t*)state); +} + +extern "C" int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey); +extern "C" int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey); +extern "C" int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey); +extern "C" int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey); + +static void (* const extra_hashes[4])(const void*, int, void *) = { + do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash +}; + +static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) +{ + // multiplier = ab = a * 2^32 + b + // multiplicand = cd = c * 2^32 + d + // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d + uint64_t a = hi_dword(multiplier); + uint64_t b = lo_dword(multiplier); + uint64_t c = hi_dword(multiplicand); + uint64_t d = lo_dword(multiplicand); + + uint64_t ac = a * c; + uint64_t ad = a * d; + uint64_t bc = b * c; + uint64_t bd = b * d; + + uint64_t adbc = ad + bc; + uint64_t adbc_carry = adbc < ad ? 1 : 0; + + // multiplier * multiplicand = product_hi * 2^64 + product_lo + uint64_t product_lo = bd + (adbc << 32); + uint64_t product_lo_carry = product_lo < bd ? 
1 : 0; + *product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry; + + return product_lo; +} + +static size_t e2i(const uint8_t* a) { + //const uint32_t mask = (MEMORY / AES_BLOCK_SIZE - 1); + //return (*((uint64_t*) a) / AES_BLOCK_SIZE) & mask; + return *((uint64_t*) a) & 0xFFFF0; /* mask * AES_BLOCK_SIZE */ +} + +static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { + ((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res); +} + +static void sum_half_blocks(uint8_t* a, const uint8_t* b) { + ((uint64_t*) a)[0] += ((uint64_t*) b)[0]; + ((uint64_t*) a)[1] += ((uint64_t*) b)[1]; +} + +static void sum_half_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { + ((uint64_t*) dst)[0] = ((uint64_t*) a)[0] + ((uint64_t*) b)[0]; + ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] + ((uint64_t*) b)[1]; +} + +static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, uint8_t* dst) { + ((uint64_t*) dst)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) dst) + ((uint64_t*) c)[1]; + ((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; +} + +static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) { + uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; + hi += ((uint64_t*) c)[0]; + ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; + ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; + ((uint64_t*) dst)[0] = hi; + ((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo; +} + +static void copy_block(uint8_t* dst, const uint8_t* src) { + ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; + ((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; +} + +static void xor_blocks(uint8_t* a, const uint8_t* b) { + ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; + ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; +} + +static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { + ((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0]; + ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; +} + +static void cryptolight_store_variant(void* state, int variant) { + if (variant == 1) { + // use variant 1 like monero since june 2018 + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } +} + +static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant) +{ + size_t i, j; + + keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); + ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); + memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + + const uint64_t tweak = variant ? 
*((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; + + oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); + for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { + #undef RND + #define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); + RND(0); + RND(1); + RND(2); + RND(3); + RND(4); + RND(5); + RND(6); + RND(7); + memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE); + } + + xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a); + xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b); + + for (i = 0; likely(i < ITER / 4); ++i) { + j = e2i(ctx->a); + aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); + xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); + cryptolight_store_variant(&ctx->long_state[j], variant); + mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak); + + j = e2i(ctx->a); + aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); + xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); + cryptolight_store_variant(&ctx->long_state[j], variant); + mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak); + } + + memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE); + for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { + #undef RND + #define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ + aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); + RND(0); + RND(1); + RND(2); + RND(3); + RND(4); + RND(5); + RND(6); + RND(7); + } + memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE); + keccak_hash_permutation(&ctx->state.hs); + + int extra_algo = ctx->state.hs.b[0] & 3; + extra_hashes[extra_algo](&ctx->state, 200, output); + if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); + + oaes_free((OAES_CTX **) &ctx->aes_ctx); +} + +void cryptolight_hash_variant(void* output, const void* input, int len, int variant) +{ + struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); + cryptolight_hash_ctx(output, input, len, ctx, variant); + free(ctx); +} + +void cryptolight_hash(void* output, const void* input) +{ + cryptolight_hash_variant(output, input, 76, 1); +} + diff --git a/crypto/cryptolight.cu b/crypto/cryptolight.cu new file mode 100644 index 0000000000..c2a10e419d --- /dev/null +++ b/crypto/cryptolight.cu @@ -0,0 +1,176 @@ + +#include "cryptolight.h" + +extern char *device_config[MAX_GPUS]; // -l 32x16 + +static __thread uint32_t cn_blocks = 32; +static __thread uint32_t cn_threads = 16; + +static uint32_t *d_long_state[MAX_GPUS]; +static uint32_t *d_ctx_state[MAX_GPUS]; +static uint32_t *d_ctx_key1[MAX_GPUS]; +static uint32_t *d_ctx_key2[MAX_GPUS]; +static uint32_t *d_ctx_text[MAX_GPUS]; +static uint64_t *d_ctx_tweak[MAX_GPUS]; +static uint32_t *d_ctx_a[MAX_GPUS]; +static uint32_t *d_ctx_b[MAX_GPUS]; + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant) +{ + int res = 0; + uint32_t throughput = 0; + + uint32_t *ptarget = work->target; + uint8_t *pdata = (uint8_t*) work->data; + uint32_t *nonceptr = (uint32_t*) (&pdata[39]); + const uint32_t first_nonce = *nonceptr; + uint32_t nonce = first_nonce; + int dev_id = device_map[thr_id]; + + if(opt_benchmark) { + ptarget[7] = 0x00ff; + } + + if(!init[thr_id]) + { + if (!device_config[thr_id] 
&& strcmp(device_name[dev_id], "TITAN V") == 0) { + device_config[thr_id] = strdup("80x32"); + } + + if (device_config[thr_id]) { + sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads); + throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); + gpulog(LOG_INFO, thr_id, "Using %u x %u kernel launch config, %u threads", + cn_blocks, cn_threads, throughput); + } else { + throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); + if (throughput != cn_blocks*cn_threads && cn_threads) { + cn_blocks = throughput / cn_threads; + throughput = cn_threads * cn_blocks; + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u threads (%ux%u)", + throughput2intensity(throughput), throughput, cn_blocks, cn_threads); + } + + if(sizeof(size_t) == 4 && throughput > UINT32_MAX / MEMORY) { + gpulog(LOG_ERR, thr_id, "THE 32bit VERSION CAN'T ALLOCATE MORE THAN 4GB OF MEMORY!"); + gpulog(LOG_ERR, thr_id, "PLEASE REDUCE THE NUMBER OF THREADS OR BLOCKS"); + exit(1); + } + + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_LOG_ERROR(); + } + + const size_t alloc = MEMORY * throughput; + cryptonight_extra_init(thr_id); + + cudaMalloc(&d_long_state[thr_id], alloc); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_key2[thr_id], 40 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_text[thr_id], 32 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_a[thr_id], 4 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput); + + init[thr_id] = true; + } + + throughput = cn_blocks*cn_threads; + + do + { + const uint32_t Htarg = ptarget[7]; + uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; + + cryptonight_extra_setData(thr_id, pdata, ptarget); + cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); + + *hashes_done = nonce - first_nonce + throughput; + + if(resNonces[0] != UINT32_MAX) + { + uint32_t vhash[8]; + uint32_t tempdata[19]; + uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); + memcpy(tempdata, pdata, 76); + *tempnonceptr = resNonces[0]; + cryptolight_hash_variant(vhash, tempdata, 76, variant); + if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + res = 1; + work->nonces[0] = resNonces[0]; + work_set_target_ratio(work, vhash); + // second nonce + if(resNonces[1] != UINT32_MAX) + { + *tempnonceptr = resNonces[1]; + cryptolight_hash_variant(vhash, tempdata, 76, variant); + if(vhash[7] <= Htarg && 
fulltest(vhash, ptarget)) { + res++; + work->nonces[1] = resNonces[1]; + } else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + } + } + goto done; + } else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", resNonces[0]); + } + } + + if ((uint64_t) throughput + nonce >= max_nonce - 127) { + nonce = max_nonce; + break; + } + + nonce += throughput; + gpulog(LOG_DEBUG, thr_id, "nonce %08x", nonce); + + } while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + nonce); + +done: + gpulog(LOG_DEBUG, thr_id, "nonce %08x exit", nonce); + work->valid_nonces = res; + *nonceptr = nonce; + return res; +} + +void free_cryptolight(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaFree(d_long_state[thr_id]); + cudaFree(d_ctx_state[thr_id]); + cudaFree(d_ctx_key1[thr_id]); + cudaFree(d_ctx_key2[thr_id]); + cudaFree(d_ctx_text[thr_id]); + cudaFree(d_ctx_tweak[thr_id]); + cudaFree(d_ctx_a[thr_id]); + cudaFree(d_ctx_b[thr_id]); + + cryptonight_extra_free(thr_id); + + cudaDeviceSynchronize(); + + init[thr_id] = false; +} diff --git a/crypto/cryptolight.h b/crypto/cryptolight.h new file mode 100644 index 0000000000..482d0f8f7c --- /dev/null +++ b/crypto/cryptolight.h @@ -0,0 +1,144 @@ +#pragma once +#include +#include + +#ifdef __INTELLISENSE__ +/* avoid red underlining */ +#define __CUDA_ARCH__ 520 +struct uint3 { + unsigned int x, y, z; +}; +struct uint3 threadIdx; +struct uint3 blockIdx; +struct uint3 blockDim; +#define atomicExch(p,y) (*p) = y +#define __funnelshift_r(a,b,c) 1 +#define __syncthreads() +#define __threadfence_block() +#define asm(x) +#define __shfl(a,b,c) 1 +#define __umul64hi(a,b) a*b +#endif + +#define MEMORY (1UL << 20) /* 1 MiB - 1048576 */ +#define ITER (1UL << 19) /* 512k */ +#define E2I_MASK1 0xFFFF0 /* MEMORY / AES_BLOCK_SIZE - 1 = 0xFFFF */ +#define E2I_MASK2 0x3FFFC /* 0xFFFF0 >> 2 */ + +#define AES_BLOCK_SIZE 16 +#define AES_KEY_SIZE 32 +#define INIT_SIZE_BLK 8 +#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B + +#define AES_RKEY_LEN 4 +#define AES_COL_LEN 4 +#define AES_ROUND_BASE 7 + +#ifndef HASH_SIZE +#define HASH_SIZE 32 +#endif + +#ifndef HASH_DATA_AREA +#define HASH_DATA_AREA 136 +#endif + +#define hi_dword(x) (x >> 32) +#define lo_dword(x) (x & 0xFFFFFFFF) + +#define C32(x) ((uint32_t)(x ## U)) +#define T32(x) ((x) & C32(0xFFFFFFFF)) + +#ifndef ROTL64 + #if __CUDA_ARCH__ >= 350 + __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); + } + #define ROTL64(x, n) (cuda_ROTL64(x, n)) + #else + #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) + #endif +#endif + +#ifndef ROTL32 + #if 
__CUDA_ARCH__ < 350 + #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) + #else + #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) + #endif +#endif + +#ifndef ROTR32 + #if __CUDA_ARCH__ < 350 + #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) + #else + #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) + #endif +#endif + +#define MEMSET8(dst,what,cnt) { \ + int i_memset8; \ + uint64_t *out_memset8 = (uint64_t *)(dst); \ + for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \ + out_memset8[i_memset8] = (what); } + +#define MEMSET4(dst,what,cnt) { \ + int i_memset4; \ + uint32_t *out_memset4 = (uint32_t *)(dst); \ + for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \ + out_memset4[i_memset4] = (what); } + +#define MEMCPY8(dst,src,cnt) { \ + int i_memcpy8; \ + uint64_t *in_memcpy8 = (uint64_t *)(src); \ + uint64_t *out_memcpy8 = (uint64_t *)(dst); \ + for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \ + out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; } + +#define MEMCPY4(dst,src,cnt) { \ + int i_memcpy4; \ + uint32_t *in_memcpy4 = (uint32_t *)(src); \ + uint32_t *out_memcpy4 = (uint32_t *)(dst); \ + for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \ + out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; } + +#define XOR_BLOCKS_DST(x,y,z) { \ + ((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \ + ((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; } + +union hash_state { + uint8_t b[200]; + uint64_t w[25]; +}; + +union cn_slow_hash_state { + union hash_state hs; + struct { + uint8_t k[64]; + uint8_t init[INIT_SIZE_BYTE]; + }; +}; + +static inline void exit_if_cudaerror(int thr_id, const char *src, int line) +{ + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) { + gpulog(LOG_ERR, thr_id, "%s %s line %d", cudaGetErrorString(err), src, line); + exit(1); + } +} + +void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); + +void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget); +void cryptonight_extra_init(int thr_id/*, uint32_t threads*/); +void cryptonight_extra_free(int thr_id); +void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); +void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state); diff --git a/crypto/cryptonight-core.cu b/crypto/cryptonight-core.cu new file mode 100644 index 0000000000..90f024f0fc --- /dev/null +++ b/crypto/cryptonight-core.cu @@ -0,0 +1,316 @@ +#include +#include +#include +#include +#ifndef _WIN32 +#include +#endif + +#include +#include + +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#undef __shfl +#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width) +#endif + +#include "cryptonight.h" + +#define LONG_SHL32 19 // 1<<19 (uint32_t* index) +#define LONG_SHL64 18 // 1<<18 (uint64_t* index) +#define LONG_LOOPS32 0x80000U + +#include "cn_aes.cuh" + +__global__ +void cryptonight_gpu_phase1(const uint32_t threads, uint32_t * __restrict__ d_long_state, + uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1) +{ + __shared__ uint32_t sharedMemory[1024]; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + 
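+ /* Note: eight consecutive threads cooperate on one hash in this phase. The
+ global thread index is divided by 8 to select the hash, and below
+ (threadIdx.x & 7) << 2 picks which 16-byte slice (4 uint32_t) of the
+ 128-byte text buffer this lane expands into its part of the scratchpad
+ (LONG_SHL32 = 19, i.e. 2^19 uint32_t = 2 MiB per hash). */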
if(thread < threads) + { + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t sub = (threadIdx.x & 0x7U) << 2; + uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub]; + uint32_t __align__(8) key[40]; + MEMCPY8(key, &ctx_key1[thread * 40U], 20); + uint32_t __align__(8) text[4]; + MEMCPY8(text, &ctx_state[thread * 50U + sub + 16U], 2); + + for(int i = 0; i < LONG_LOOPS32; i += 32) + { + cn_aes_pseudo_round_mut(sharedMemory, text, key); + MEMCPY8(&longstate[i], text, 2); + } + } +} + +// -------------------------------------------------------------------------------------------------------------- + +__device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand) +{ + ulonglong2 product; + product.x = __umul64hi(multiplier, multiplicand); + product.y = multiplier * multiplicand; + return product; +} + +static __forceinline__ __device__ void operator += (ulonglong2 &a, const ulonglong2 b) { + a.x += b.x; a.y += b.y; +} + +static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, const ulonglong2 &b) { + return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); +} + +__device__ __forceinline__ void MUL_SUM_XOR_DST_0(const uint64_t m, uint4 &a, void* far_dst) +{ + ulonglong2 d = AS_UL2(far_dst); + ulonglong2 p = cuda_mul128(m, d.x); + p += AS_UL2(&a); + AS_UL2(&a) = p ^ d; + AS_UL2(far_dst) = p; +} + +__global__ +#if __CUDA_ARCH__ >= 500 +//__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */ +#endif +void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b) +{ + __shared__ __align__(16) uint32_t sharedMemory[1024]; + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + if (thread < threads) + { + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2U]); + void * ctx_b = (void*)(&d_ctx_b[thread << 2U]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); + + uint64_t * long_state = &d_long_state[thread << LONG_SHL64]; + for (int i = start; i < end; i++) // end = 262144 + { + uint4 C; + + uint32_t j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C); + AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4 + MUL_SUM_XOR_DST_0((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]); + + j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B); + AS_UINT4(&long_state[j]) = C ^ B; + MUL_SUM_XOR_DST_0((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]); + } + + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + +// -------------------------------------------------------------------------------------------------------------- + +__device__ __forceinline__ void store_variant1(uint64_t* long_state, uint4 Z) +{ + const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773); + const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1; + Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24); + AS_UINT4(long_state) = Z; +} + +__device__ __forceinline__ void store_variant2(uint64_t* long_state, uint4 Z) +{ + const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773); + const uint32_t index = (((tmp >> 4) & 6u) | 
(tmp & 1u)) << 1; + Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75312u >> index) & 0x30u)) << 24); + AS_UINT4(long_state) = Z; +} + +__device__ __forceinline__ void MUL_SUM_XOR_DST_1(const uint64_t m, uint4 &a, void* far_dst, uint64_t tweak) +{ + ulonglong2 d = AS_UL2(far_dst); + ulonglong2 p = cuda_mul128(m, d.x); + p += AS_UL2(&a); + AS_UL2(&a) = p ^ d; + p.y = p.y ^ tweak; + AS_UL2(far_dst) = p; +} + +__global__ +void monero_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint64_t * __restrict__ d_tweak) +{ + __shared__ __align__(16) uint32_t sharedMemory[1024]; + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + uint64_t tweak = d_tweak[thread]; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2]); + void * ctx_b = (void*)(&d_ctx_b[thread << 2]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); + + uint64_t * long_state = &d_long_state[thread << LONG_SHL64]; + for (int i = start; i < end; i++) // end = 262144 + { + uint4 C; + uint32_t j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C); + store_variant1(&long_state[j], C ^ B); // st.global + MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak); + + j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B); + store_variant1(&long_state[j], C ^ B); + MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak); + } + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + +// -------------------------------------------------------------------------------------------------------------- + +__global__ +void stellite_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint64_t * __restrict__ d_tweak) +{ + __shared__ __align__(16) uint32_t sharedMemory[1024]; + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + uint64_t tweak = d_tweak[thread]; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2]); + void * ctx_b = (void*)(&d_ctx_b[thread << 2]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); + + uint64_t * long_state = &d_long_state[thread << LONG_SHL64]; + for (int i = start; i < end; i++) // end = 262144 + { + uint4 C; + uint32_t j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C); + store_variant2(&long_state[j], C ^ B); // st.global + MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak); + + j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B); + store_variant2(&long_state[j], C ^ B); + MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak); + } + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + +// 
-------------------------------------------------------------------------------------------------------------- + +__global__ +void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ d_long_state, + uint32_t * __restrict__ d_ctx_state, const uint32_t * __restrict__ d_ctx_key2) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + + if(thread < threads) + { + const int sub = (threadIdx.x & 7) << 2; + const uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub]; + uint32_t key[40], text[4]; + MEMCPY8(key, d_ctx_key2 + thread * 40, 20); + MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2); + + for(int i = 0; i < LONG_LOOPS32; i += 32) + { + #pragma unroll + for(int j = 0; j < 4; ++j) + text[j] ^= longstate[i + j]; + + cn_aes_pseudo_round_mut(sharedMemory, text, key); + } + + MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2); + } +} + +// -------------------------------------------------------------------------------------------------------------- + +extern int device_bfactor[MAX_GPUS]; + +__host__ +void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, + uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak) +{ + dim3 grid(blocks); + dim3 block(threads); + dim3 block4(threads << 2); + dim3 block8(threads << 3); + + const uint16_t bfactor = (uint16_t) device_bfactor[thr_id]; + const uint32_t partcount = 1U << bfactor; + const uint32_t throughput = (uint32_t) (blocks*threads); + + const int bsleep = bfactor ? 100 : 0; + const int dev_id = device_map[thr_id]; + + cryptonight_gpu_phase1 <<>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key1); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + if(partcount > 1) usleep(bsleep); + + for (uint32_t i = 0; i < partcount; i++) + { + dim3 b = device_sm[dev_id] >= 300 ? 
block4 : block; + if (variant == 0) + cryptonight_gpu_phase2 <<>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + else if (variant == 1 || cryptonight_fork == 8) + monero_gpu_phase2 <<>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak); + else if (variant == 2 && cryptonight_fork == 3) + stellite_gpu_phase2 <<>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + if(partcount > 1) usleep(bsleep); + } + //cudaDeviceSynchronize(); + //exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cryptonight_gpu_phase3 <<>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key2); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); +} diff --git a/crypto/cryptonight-cpu.cpp b/crypto/cryptonight-cpu.cpp new file mode 100644 index 0000000000..582d096dc8 --- /dev/null +++ b/crypto/cryptonight-cpu.cpp @@ -0,0 +1,269 @@ +#include +#include + +#include "oaes_lib.h" +#include "cryptonight.h" + +extern "C" { +#include +#include +#include +#include +#include "cpu/c_keccak.h" +} + +struct cryptonight_ctx { + uint8_t long_state[MEMORY]; + union cn_slow_hash_state state; + uint8_t text[INIT_SIZE_BYTE]; + uint8_t a[AES_BLOCK_SIZE]; + uint8_t b[AES_BLOCK_SIZE]; + uint8_t c[AES_BLOCK_SIZE]; + oaes_ctx* aes_ctx; +}; + +static void do_blake_hash(const void* input, size_t len, void* output) +{ + uchar hash[32]; + sph_blake256_context ctx; + sph_blake256_set_rounds(14); + sph_blake256_init(&ctx); + sph_blake256(&ctx, input, len); + sph_blake256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +static void do_groestl_hash(const void* input, size_t len, void* output) +{ + uchar hash[32]; + sph_groestl256_context ctx; + sph_groestl256_init(&ctx); + sph_groestl256(&ctx, input, len); + sph_groestl256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +static void do_jh_hash(const void* input, size_t len, void* output) +{ + uchar hash[64]; + sph_jh256_context ctx; + sph_jh256_init(&ctx); + sph_jh256(&ctx, input, len); + sph_jh256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +static void do_skein_hash(const void* input, size_t len, void* output) +{ + uchar hash[32]; + sph_skein256_context ctx; + sph_skein256_init(&ctx); + sph_skein256(&ctx, input, len); + sph_skein256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +// todo: use sph if possible +static void keccak_hash_permutation(union hash_state *state) { + keccakf((uint64_t*)state, 24); +} + +static void keccak_hash_process(union hash_state *state, const uint8_t *buf, size_t count) { + keccak1600(buf, (int)count, (uint8_t*)state); +} + +extern "C" int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey); +extern "C" int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey); +extern "C" int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey); +extern "C" int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey); + +static void (* const extra_hashes[4])(const void*, size_t, void *) = { + do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash +}; + +static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) +{ + // multiplier = ab = a * 2^32 + b + // multiplicand = cd = c * 2^32 + d + // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d + uint64_t a = hi_dword(multiplier); + uint64_t b = lo_dword(multiplier); + uint64_t c = hi_dword(multiplicand); + uint64_t d = lo_dword(multiplicand); + + uint64_t ac = a * c; + uint64_t ad = a * d; + uint64_t 
bc = b * c; + uint64_t bd = b * d; + + uint64_t adbc = ad + bc; + uint64_t adbc_carry = adbc < ad ? 1 : 0; + + // multiplier * multiplicand = product_hi * 2^64 + product_lo + uint64_t product_lo = bd + (adbc << 32); + uint64_t product_lo_carry = product_lo < bd ? 1 : 0; + *product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry; + + return product_lo; +} + +static size_t e2i(const uint8_t* a) { + return (*((uint64_t*) a) / AES_BLOCK_SIZE) & (MEMORY / AES_BLOCK_SIZE - 1); +} + +static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { + ((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res); +} + +static void sum_half_blocks(uint8_t* a, const uint8_t* b) { + ((uint64_t*) a)[0] += ((uint64_t*) b)[0]; + ((uint64_t*) a)[1] += ((uint64_t*) b)[1]; +} + +static void sum_half_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { + ((uint64_t*) dst)[0] = ((uint64_t*) a)[0] + ((uint64_t*) b)[0]; + ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] + ((uint64_t*) b)[1]; +} + +static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, uint8_t* dst) { + ((uint64_t*) dst)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) dst) + ((uint64_t*) c)[1]; + ((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; +} + +static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) { + uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; + hi += ((uint64_t*) c)[0]; + + ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; + ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; + ((uint64_t*) dst)[0] = hi; + ((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo; +} + +static void copy_block(uint8_t* dst, const uint8_t* src) { + ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; + ((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; +} + +static void xor_blocks(uint8_t* a, const uint8_t* b) { + ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; + ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; +} + +static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { + ((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0]; + ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; +} + +static void cryptonight_store_variant(void* state, int variant) { + if (variant == 1 || cryptonight_fork == 8) { + // monero and graft + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } else if (variant == 2 && cryptonight_fork == 3) { + // stellite + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30); + } +} + +static void cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant) +{ + size_t i, j; + + keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); + ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); + memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + + const uint64_t tweak = variant ? 
*((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; + + oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); + for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { + #undef RND + #define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); + RND(0); + RND(1); + RND(2); + RND(3); + RND(4); + RND(5); + RND(6); + RND(7); + memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE); + } + + xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a); + xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b); + + for (i = 0; likely(i < ITER / 4); ++i) { + j = e2i(ctx->a) * AES_BLOCK_SIZE; + aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); + xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); + cryptonight_store_variant(&ctx->long_state[j], variant); + mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak); + + j = e2i(ctx->a) * AES_BLOCK_SIZE; + aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); + xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); + cryptonight_store_variant(&ctx->long_state[j], variant); + mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak); + } + + memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE); + for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { + #undef RND + #define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ + aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); + RND(0); + RND(1); + RND(2); + RND(3); + RND(4); + RND(5); + RND(6); + RND(7); + } + memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE); + keccak_hash_permutation(&ctx->state.hs); + + int extra_algo = ctx->state.hs.b[0] & 3; + extra_hashes[extra_algo](&ctx->state, 200, output); + if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); + + oaes_free((OAES_CTX **) &ctx->aes_ctx); +} + +void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant) +{ + struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); + cryptonight_hash_ctx(output, input, len, ctx, variant); + free(ctx); +} + +void cryptonight_hash(void* output, const void* input) +{ + cryptonight_fork = 1; + cryptonight_hash_variant(output, input, 76, 0); +} + +void graft_hash(void* output, const void* input) +{ + cryptonight_fork = 8; + cryptonight_hash_variant(output, input, 76, 1); +} + +void monero_hash(void* output, const void* input) +{ + cryptonight_fork = 7; + cryptonight_hash_variant(output, input, 76, 1); +} + +void stellite_hash(void* output, const void* input) +{ + cryptonight_fork = 3; + cryptonight_hash_variant(output, input, 76, 2); +} + diff --git a/crypto/cryptonight-extra.cu b/crypto/cryptonight-extra.cu new file mode 100644 index 0000000000..c55c518e60 --- /dev/null +++ b/crypto/cryptonight-extra.cu @@ -0,0 +1,212 @@ +#include +#include +#include +#ifndef _WIN32 +#include +#endif + +#include +#include + +#include "cryptonight.h" + +static uint32_t *d_input[MAX_GPUS]; +static uint32_t *d_target[MAX_GPUS]; +static uint32_t *d_result[MAX_GPUS]; + +typedef uint8_t BitSequence; +typedef uint32_t DataLength; +#include "cn_keccak.cuh" +#include "cn_blake.cuh" +#include "cn_groestl.cuh" +#include "cn_jh.cuh" +#include "cn_skein.cuh" + +__constant__ uint8_t d_sub_byte[16][16] = { + {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 
0x2b, 0xfe, 0xd7, 0xab, 0x76}, + {0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0}, + {0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15}, + {0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75}, + {0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84}, + {0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf}, + {0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8}, + {0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2}, + {0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73}, + {0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb}, + {0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79}, + {0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08}, + {0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a}, + {0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e}, + {0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf}, + {0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16} +}; + +__device__ __forceinline__ +void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data) +{ + const uint32_t aes_gf[10] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 + }; + + MEMCPY4(key, data, 8); + #pragma unroll + for(int i = 8; i < 40; i++) + { + uint8_t temp[4]; + *(uint32_t *)temp = key[i - 1]; + + if(i % 8 == 0) { + *(uint32_t *)temp = ROTR32(*(uint32_t *)temp, 8); + for(int j = 0; j < 4; j++) + temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f]; + *(uint32_t *)temp ^= aes_gf[i / 8 - 1]; + } + else if(i % 8 == 4) { + #pragma unroll + for(int j = 0; j < 4; j++) + temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f]; + } + + key[i] = key[(i - 8)] ^ *(uint32_t *)temp; + } +} + +__global__ +void cryptonight_extra_gpu_prepare(const uint32_t threads, const uint32_t * __restrict__ d_input, uint32_t startNonce, + uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint64_t * d_ctx_tweak) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if(thread < threads) + { + uint64_t ctx_state[25]; + uint32_t ctx_a[4]; + uint32_t ctx_b[4]; + uint32_t ctx_key1[40]; + uint32_t ctx_key2[40]; + uint32_t input[19]; + + MEMCPY4(input, d_input, 19); + + uint32_t nonce = startNonce + thread; + *(((uint8_t *)input) + 39) = nonce & 0xff; + *(((uint8_t *)input) + 40) = (nonce >> 8) & 0xff; + *(((uint8_t *)input) + 41) = (nonce >> 16) & 0xff; + *(((uint8_t *)input) + 42) = (nonce >> 24) & 0xff; + + cn_keccak(input, ctx_state); + MEMCPY4(&d_ctx_state[thread * 50U], ctx_state, 50); + + cryptonight_aes_set_key(ctx_key1, (uint32_t*)(&ctx_state[0])); + cryptonight_aes_set_key(ctx_key2, (uint32_t*)(&ctx_state[4])); + MEMCPY4(&d_ctx_key1[thread * 40U], ctx_key1, 40); + MEMCPY4(&d_ctx_key2[thread * 40U], ctx_key2, 40); + + XOR_BLOCKS_DST(&ctx_state[0], &ctx_state[4], ctx_a); + 
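+ /* Mirrors the CPU path: the 16-byte working register a was just built as
+ state bytes 0..15 XOR bytes 32..47, and b (next line) as bytes 16..31 XOR
+ bytes 48..63, before both are stored to per-thread buffers for phase 2. */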
XOR_BLOCKS_DST(&ctx_state[2], &ctx_state[6], ctx_b); + MEMCPY4(&d_ctx_a[thread * 4U], ctx_a, 4); + MEMCPY4(&d_ctx_b[thread * 4U], ctx_b, 4); + + if (variant) { + uint2 tweak = AS_UINT2(&ctx_state[24]); + //tweak.x ^= (input[8] >> 24) | (input[9] << 8); + tweak.x ^= __byte_perm(input[8], input[ 9], 0x6543); + tweak.y ^= __byte_perm(input[9], input[10], 0x6543); + MEMCPY4(&d_ctx_tweak[thread], &tweak, 2); + } + } +} + +__global__ +void cryptonight_extra_gpu_final(const uint32_t threads, uint32_t startNonce, const uint32_t * __restrict__ d_target, + uint32_t * __restrict__ resNonces, uint32_t * __restrict__ d_ctx_state) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if(thread < threads) + { + uint32_t *ctx_state = &d_ctx_state[thread * 50U]; + uint32_t hash[8]; + uint32_t state[50]; + + #pragma unroll 25 + for(int i = 0; i < 50; i+=2) + AS_UINT2(&state[i]) = AS_UINT2(&ctx_state[i]); + + cn_keccakf2((uint64_t *)state); + + int branch = ((uint8_t *)state)[0] & 0x03; + if(branch == 0) + cn_blake((const uint8_t *)state, 200, hash); + if(branch == 1) + cn_groestl((const uint8_t *)state, 200, hash); + if(branch == 2) + cn_jh((const uint8_t *)state, 200, hash); + if(branch == 3) + cn_skein((const uint8_t *)state, 200, hash); + + if(hash[7] <= d_target[1] && hash[6] <= d_target[0]) + { + const uint32_t nonce = startNonce + thread; + uint32_t tmp = atomicExch(resNonces, nonce); + if(tmp != UINT32_MAX) + resNonces[1] = tmp; + } + } +} + +__host__ +void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget) +{ + uint32_t *pTargetIn = (uint32_t*) ptarget; + cudaMemcpy(d_input[thr_id], data, 20 * sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2 * sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t)); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); +} + +__host__ +void cryptonight_extra_init(int thr_id) +{ + cudaMalloc(&d_input[thr_id], 20 * sizeof(uint32_t)); + cudaMalloc(&d_target[thr_id], 2 * sizeof(uint32_t)); + cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t)); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); +} + +__host__ +void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak) +{ + uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + cryptonight_extra_gpu_prepare <<>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); +} + +__host__ +void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state) +{ + uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cryptonight_extra_gpu_final <<>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMemcpy(resNonces, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); +} + +__host__ +void cryptonight_extra_free(int thr_id) +{ + if (d_input[thr_id]) { + cudaFree(d_input[thr_id]); + cudaFree(d_target[thr_id]); + 
cudaFree(d_result[thr_id]); + d_input[thr_id] = NULL; + } +} diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu new file mode 100644 index 0000000000..2c3a6cdc26 --- /dev/null +++ b/crypto/cryptonight.cu @@ -0,0 +1,196 @@ + +#include "cryptonight.h" + +extern char *device_config[MAX_GPUS]; // -l 32x16 + +static __thread uint32_t cn_blocks; +static __thread uint32_t cn_threads; + +// used for gpu intensity on algo init +static __thread bool gpu_init_shown = false; +#define gpulog_init(p,thr,fmt, ...) if (!gpu_init_shown) \ + gpulog(p, thr, fmt, ##__VA_ARGS__) + +static uint64_t *d_long_state[MAX_GPUS]; +static uint32_t *d_ctx_state[MAX_GPUS]; +static uint32_t *d_ctx_key1[MAX_GPUS]; +static uint32_t *d_ctx_key2[MAX_GPUS]; +static uint32_t *d_ctx_text[MAX_GPUS]; +static uint64_t *d_ctx_tweak[MAX_GPUS]; +static uint32_t *d_ctx_a[MAX_GPUS]; +static uint32_t *d_ctx_b[MAX_GPUS]; + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant) +{ + int res = 0; + uint32_t throughput = 0; + + uint32_t *ptarget = work->target; + uint8_t *pdata = (uint8_t*) work->data; + uint32_t *nonceptr = (uint32_t*) (&pdata[39]); + const uint32_t first_nonce = *nonceptr; + uint32_t nonce = first_nonce; + int dev_id = device_map[thr_id]; + + if(opt_benchmark) { + ptarget[7] = 0x00ff; + } + + if(!init[thr_id]) + { + int mem = cuda_available_memory(thr_id); + int mul = device_sm[dev_id] >= 300 ? 4 : 1; // see cryptonight-core.cu + cn_threads = device_sm[dev_id] >= 600 ? 16 : 8; // real TPB is x4 on SM3+ + cn_blocks = device_mpcount[dev_id] * 4; + if (cn_blocks*cn_threads*2.2 > mem) cn_blocks = device_mpcount[dev_id] * 2; + + if (!opt_quiet) + gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id], + mem, device_mpcount[dev_id]); + + if (!device_config[thr_id]) { + if(strcmp(device_name[dev_id], "TITAN V") == 0) + device_config[thr_id] = strdup("80x24"); + if(strstr(device_name[dev_id], "V100")) + device_config[thr_id] = strdup("80x24"); + } + + if (device_config[thr_id]) { + int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads); + throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); + gpulog_init(LOG_INFO, thr_id, "Using %ux%u(x%d) kernel launch config, %u threads", + cn_blocks, cn_threads, mul, throughput); + } else { + throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); + if (throughput != cn_blocks*cn_threads && cn_threads) { + cn_blocks = throughput / cn_threads; + throughput = cn_threads * cn_blocks; + } + gpulog_init(LOG_INFO, thr_id, "%u threads (%g) with %u blocks",// of %ux%d", + throughput, throughput2intensity(throughput), cn_blocks);//, cn_threads, mul); + } + + if(sizeof(size_t) == 4 && throughput > UINT32_MAX / MEMORY) { + gpulog(LOG_ERR, thr_id, "THE 32bit VERSION CAN'T ALLOCATE MORE THAN 4GB OF MEMORY!"); + gpulog(LOG_ERR, thr_id, "PLEASE REDUCE THE NUMBER OF THREADS OR BLOCKS"); + exit(1); + } + + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_LOG_ERROR(); + } + + const size_t alloc = MEMORY * size_t(throughput); + cryptonight_extra_init(thr_id); + + cudaMalloc(&d_long_state[thr_id], alloc); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, 
__FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_key2[thr_id], 40 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_text[thr_id], 32 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_a[thr_id], 4 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput); + exit_if_cudaerror(thr_id, __FILE__, __LINE__); + + gpu_init_shown = true; + init[thr_id] = true; + } + + throughput = cn_blocks*cn_threads; + + do + { + const uint32_t Htarg = ptarget[7]; + uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; + + cryptonight_extra_setData(thr_id, pdata, ptarget); + cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); + + *hashes_done = nonce - first_nonce + throughput; + + if(resNonces[0] != UINT32_MAX) + { + uint32_t vhash[8]; + uint32_t tempdata[19]; + uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); + memcpy(tempdata, pdata, 76); + *tempnonceptr = resNonces[0]; + cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + res = 1; + work->nonces[0] = resNonces[0]; + work_set_target_ratio(work, vhash); + // second nonce + if(resNonces[1] != UINT32_MAX) + { + *tempnonceptr = resNonces[1]; + cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + res++; + work->nonces[1] = resNonces[1]; + } else { + gpu_increment_reject(thr_id); + } + } + goto done; + } else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", resNonces[0]); + } + } + + if ((uint64_t) throughput + nonce >= max_nonce - 127) { + nonce = max_nonce; + break; + } + + nonce += throughput; + gpulog(LOG_DEBUG, thr_id, "nonce %08x", nonce); + + } while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + nonce); + +done: + gpulog(LOG_DEBUG, thr_id, "nonce %08x exit", nonce); + work->valid_nonces = res; + *nonceptr = nonce; + return res; +} + +void free_cryptonight(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaFree(d_long_state[thr_id]); + cudaFree(d_ctx_state[thr_id]); + cudaFree(d_ctx_key1[thr_id]); + cudaFree(d_ctx_key2[thr_id]); + cudaFree(d_ctx_text[thr_id]); + cudaFree(d_ctx_tweak[thr_id]); + cudaFree(d_ctx_a[thr_id]); + cudaFree(d_ctx_b[thr_id]); + + cryptonight_extra_free(thr_id); + + cudaDeviceSynchronize(); + + init[thr_id] = false; +} diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h new file mode 100644 index 0000000000..00417b950b --- /dev/null +++ b/crypto/cryptonight.h @@ -0,0 +1,144 @@ +#pragma once +#include +#include + +#ifdef __INTELLISENSE__ +/* avoid red underlining */ +#define __CUDA_ARCH__ 520 +struct uint3 { + unsigned int x, 
y, z; +}; +struct uint3 threadIdx; +struct uint3 blockIdx; +struct uint3 blockDim; +#define atomicExch(p,y) (*p) = y +#define __funnelshift_r(a,b,c) 1 +#define __syncthreads() +#define __threadfence_block() +#define asm(x) +#define __shfl(a,b,c) 1 +#define __umul64hi(a,b) a*b +#endif + +#define MEMORY (1U << 21) // 2 MiB / 2097152 B +#define ITER (1U << 20) // 1048576 +#define E2I_MASK 0x1FFFF0u + +#define AES_BLOCK_SIZE 16U +#define AES_KEY_SIZE 32 +#define INIT_SIZE_BLK 8 +#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B + +#define AES_RKEY_LEN 4 +#define AES_COL_LEN 4 +#define AES_ROUND_BASE 7 + +#ifndef HASH_SIZE +#define HASH_SIZE 32 +#endif + +#ifndef HASH_DATA_AREA +#define HASH_DATA_AREA 136 +#endif + +#define hi_dword(x) (x >> 32) +#define lo_dword(x) (x & 0xFFFFFFFF) + +#define C32(x) ((uint32_t)(x ## U)) +#define T32(x) ((x) & C32(0xFFFFFFFF)) + +#ifndef ROTL64 + #if __CUDA_ARCH__ >= 350 + __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); + } + #define ROTL64(x, n) (cuda_ROTL64(x, n)) + #else + #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) + #endif +#endif + +#ifndef ROTL32 + #if __CUDA_ARCH__ < 350 + #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) + #else + #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) + #endif +#endif + +#ifndef ROTR32 + #if __CUDA_ARCH__ < 350 + #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) + #else + #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) + #endif +#endif + +#define MEMSET8(dst,what,cnt) { \ + int i_memset8; \ + uint64_t *out_memset8 = (uint64_t *)(dst); \ + for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \ + out_memset8[i_memset8] = (what); } + +#define MEMSET4(dst,what,cnt) { \ + int i_memset4; \ + uint32_t *out_memset4 = (uint32_t *)(dst); \ + for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \ + out_memset4[i_memset4] = (what); } + +#define MEMCPY8(dst,src,cnt) { \ + int i_memcpy8; \ + uint64_t *in_memcpy8 = (uint64_t *)(src); \ + uint64_t *out_memcpy8 = (uint64_t *)(dst); \ + for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \ + out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; } + +#define MEMCPY4(dst,src,cnt) { \ + int i_memcpy4; \ + uint32_t *in_memcpy4 = (uint32_t *)(src); \ + uint32_t *out_memcpy4 = (uint32_t *)(dst); \ + for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \ + out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; } + +#define XOR_BLOCKS_DST(x,y,z) { \ + ((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \ + ((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; } + +#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff))) + +union hash_state { + uint8_t b[200]; + uint64_t w[25]; +}; + +union 
cn_slow_hash_state { + union hash_state hs; + struct { + uint8_t k[64]; + uint8_t init[INIT_SIZE_BYTE]; + }; +}; + +static inline void exit_if_cudaerror(int thr_id, const char *src, int line) +{ + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) { + gpulog(LOG_ERR, thr_id, "%s %s line %d", cudaGetErrorString(err), src, line); + exit(1); + } +} +void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); + +void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget); +void cryptonight_extra_init(int thr_id); +void cryptonight_extra_free(int thr_id); +void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); +void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state); diff --git a/crypto/int128_c.h b/crypto/int128_c.h new file mode 100644 index 0000000000..3db164a3ec --- /dev/null +++ b/crypto/int128_c.h @@ -0,0 +1,573 @@ +#ifndef UINT128_C_H +#define UINT128_C_H + +struct __uint128 { + uint64_t Hi; + uint64_t Lo; +}; +typedef struct __uint128 uint128; + +void Increment(uint128 * N) +{ + uint64_t T = (N->Lo + 1); + N->Hi += ((N->Lo ^T) & N->Lo) >> 63; + N->Lo = T; +} + +void Decrement(uint128 * N) +{ + uint64_t T = (N->Lo - 1); + N->Hi -= ((T ^ N->Lo) & T) >> 63; + N->Lo = T; +} + +void Add(uint128 * Ans, uint128 N, uint128 M) +{ + uint64_t C = (((N.Lo & M.Lo) & 1) + (N.Lo >> 1) + (M.Lo >> 1)) >> 63; + Ans->Hi = N.Hi + M.Hi + C; + Ans->Lo = N.Lo + M.Lo; +} + +void Subtract(uint128 * Ans, uint128 N, uint128 M) +{ + Ans->Lo = N.Lo - M.Lo; + uint64_t C = (((Ans->Lo & M.Lo) & 1) + (M.Lo >> 1) + (Ans->Lo >> 1)) >> 63; + Ans->Hi = N.Hi - (M.Hi + C); +} + +void inc128(uint128 N, uint128* A) +{ + A->Lo = (N.Lo + 1); + A->Hi = N.Hi + (((N.Lo ^ A->Lo) & N.Lo) >> 63); +} + +void dec128(uint128 N, uint128* A) +{ + A->Lo = N.Lo - 1; + A->Hi = N.Hi - (((A->Lo ^ N.Lo) & A->Lo) >> 63); +} + +void add128(uint128 N, uint128 M, uint128* A) +{ + uint64_t C = (((N.Lo & M.Lo) & 1) + (N.Lo >> 1) + (M.Lo >> 1)) >> 63; + A->Hi = N.Hi + M.Hi + C; + A->Lo = N.Lo + M.Lo; +} + +void sub128(uint128 N, uint128 M, uint128* A) +{ + A->Lo = N.Lo - M.Lo; + uint64_t C = (((A->Lo & M.Lo) & 1) + (M.Lo >> 1) + (A->Lo >> 1)) >> 63; + A->Hi = N.Hi - (M.Hi + C); +} + +void mult64to128(uint64_t u, uint64_t v, uint64_t * h, uint64_t *l) +{ + uint64_t u1 = (u & 0xffffffff); + uint64_t v1 = (v & 0xffffffff); + uint64_t t = (u1 * v1); + uint64_t w3 = (t & 0xffffffff); + uint64_t k = (t >> 32); + + u >>= 32; + t = (u * v1) + k; + k = (t & 0xffffffff); + uint64_t w1 = (t >> 32); + + v >>= 32; + t = (u1 * v) + k; + k = (t >> 32); + + *h = (u * v) + w1 + k; + *l = (t << 32) + w3; +} + +void mult128(uint128 N, uint128 M, uint128 * Ans) +{ + mult64to128(N.Lo, M.Lo, &Ans->Hi, &Ans->Lo); + Ans->Hi += (N.Hi * M.Lo) + (N.Lo * M.Hi); +} + +void mult128to256(uint128 N, uint128 M, uint128 * H, uint128 * L) +{ + mult64to128(N.Hi, M.Hi, &H->Hi, &H->Lo); + mult64to128(N.Lo, M.Lo, &L->Hi, &L->Lo); + + uint128 T; + mult64to128(N.Hi, M.Lo, &T.Hi, &T.Lo); + L->Hi += T.Lo; + if(L->Hi < T.Lo) // if L->Hi overflowed + { + Increment(H); + } + H->Lo += T.Hi; + if(H->Lo < T.Hi) // if H->Lo overflowed + { + ++H->Hi; + } + + 
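+ /* Schoolbook 128x128 -> 256 multiply:
+ N*M = N.Hi*M.Hi*2^128 + (N.Hi*M.Lo + N.Lo*M.Hi)*2^64 + N.Lo*M.Lo.
+ The first cross term N.Hi*M.Lo was folded in above at the 2^64 boundary
+ with carry propagation into H; the second cross term N.Lo*M.Hi below is
+ accumulated the same way. */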
mult64to128(N.Lo, M.Hi, &T.Hi, &T.Lo); + L->Hi += T.Lo; + if(L->Hi < T.Lo) // if L->Hi overflowed + { + Increment(H); + } + H->Lo += T.Hi; + if(H->Lo < T.Hi) // if H->Lo overflowed + { + ++H->Hi; + } +} + + +void sqr64to128(uint64_t r, uint64_t * h, uint64_t *l) +{ + uint64_t r1 = (r & 0xffffffff); + uint64_t t = (r1 * r1); + uint64_t w3 = (t & 0xffffffff); + uint64_t k = (t >> 32); + + r >>= 32; + uint64_t m = (r * r1); + t = m + k; + uint64_t w2 = (t & 0xffffffff); + uint64_t w1 = (t >> 32); + + t = m + w2; + k = (t >> 32); + *h = (r * r) + w1 + k; + *l = (t << 32) + w3; +} + +void sqr128(uint128 R, uint128 * Ans) +{ + sqr64to128(R.Lo, &Ans->Hi, &Ans->Lo); + Ans->Hi += (R.Hi * R.Lo) << 1; +} + +void sqr128to256(uint128 R, uint128 * H, uint128 * L) +{ + sqr64to128(R.Hi, &H->Hi, &H->Lo); + sqr64to128(R.Lo, &L->Hi, &L->Lo); + + uint128 T; + mult64to128(R.Hi, R.Lo, &T.Hi, &T.Lo); + + H->Hi += (T.Hi >> 63); + T.Hi = (T.Hi << 1) | (T.Lo >> 63); // Shift Left 1 bit + T.Lo <<= 1; + + L->Hi += T.Lo; + if(L->Hi < T.Lo) // if L->Hi overflowed + { + Increment(H); + } + + H->Lo += T.Hi; + if(H->Lo < T.Hi) // if H->Lo overflowed + { + ++H->Hi; + } +} + +void shiftleft128(uint128 N, size_t S, uint128 * A) +{ + uint64_t M1, M2; + S &= 127; + + M1 = ((((S + 127) | S) & 64) >> 6) - 1llu; + M2 = (S >> 6) - 1llu; + S &= 63; + A->Hi = (N.Lo << S) & (~M2); + A->Lo = (N.Lo << S) & M2; + A->Hi |= ((N.Hi << S) | ((N.Lo >> (64 - S)) & M1)) & M2; + +/* + S &= 127; + + if(S != 0) + { + if(S > 64) + { + A.Hi = N.Lo << (S - 64); + A.Lo = 0; + } + else if(S < 64) + { + A.Hi = (N.Hi << S) | (N.Lo >> (64 - S)); + A.Lo = N.Lo << S; + } + else + { + A.Hi = N.Lo; + A.Lo = 0; + } + } + else + { + A.Hi = N.Hi; + A.Lo = N.Lo; + } + //*/ +} + +void shiftright128(uint128 N, size_t S, uint128 * A) +{ + uint64_t M1, M2; + S &= 127; + + M1 = ((((S + 127) | S) & 64) >> 6) - 1llu; + M2 = (S >> 6) - 1llu; + S &= 63; + A->Lo = (N.Hi >> S) & (~M2); + A->Hi = (N.Hi >> S) & M2; + A->Lo |= ((N.Lo >> S) | ((N.Hi << (64 - S)) & M1)) & M2; + + /* + S &= 127; + + if(S != 0) + { + if(S > 64) + { + A.Hi = N.Hi >> (S - 64); + A.Lo = 0; + } + else if(S < 64) + { + A.Lo = (N.Lo >> S) | (N.Hi << (64 - S)); + A.Hi = N.Hi >> S; + } + else + { + A.Lo = N.Hi; + A.Hi = 0; + } + } + else + { + A.Hi = N.Hi; + A.Lo = N.Lo; + } + //*/ +} + + +void not128(uint128 N, uint128 * A) +{ + A->Hi = ~N.Hi; + A->Lo = ~N.Lo; +} + +void or128(uint128 N1, uint128 N2, uint128 * A) +{ + A->Hi = N1.Hi | N2.Hi; + A->Lo = N1.Lo | N2.Lo; +} + +void and128(uint128 N1, uint128 N2, uint128 * A) +{ + A->Hi = N1.Hi & N2.Hi; + A->Lo = N1.Lo & N2.Lo; +} + +void xor128(uint128 N1, uint128 N2, uint128 * A) +{ + A->Hi = N1.Hi ^ N2.Hi; + A->Lo = N1.Lo ^ N2.Lo; +} + +size_t nlz64(uint64_t N) +{ + uint64_t I; + size_t C; + + I = ~N; + C = ((I ^ (I + 1)) & I) >> 63; + + I = (N >> 32) + 0xffffffff; + I = ((I & 0x100000000) ^ 0x100000000) >> 27; + C += I; N <<= I; + + I = (N >> 48) + 0xffff; + I = ((I & 0x10000) ^ 0x10000) >> 12; + C += I; N <<= I; + + I = (N >> 56) + 0xff; + I = ((I & 0x100) ^ 0x100) >> 5; + C += I; N <<= I; + + I = (N >> 60) + 0xf; + I = ((I & 0x10) ^ 0x10) >> 2; + C += I; N <<= I; + + I = (N >> 62) + 3; + I = ((I & 4) ^ 4) >> 1; + C += I; N <<= I; + + C += (N >> 63) ^ 1; + + return C; +} + +size_t ntz64(uint64_t N) +{ + uint64_t I = ~N; + size_t C = ((I ^ (I + 1)) & I) >> 63; + + I = (N & 0xffffffff) + 0xffffffff; + I = ((I & 0x100000000) ^ 0x100000000) >> 27; + C += I; N >>= I; + + I = (N & 0xffff) + 0xffff; + I = ((I & 0x10000) ^ 0x10000) >> 12; + C += I; N >>= I; + + I 
= (N & 0xff) + 0xff; + I = ((I & 0x100) ^ 0x100) >> 5; + C += I; N >>= I; + + I = (N & 0xf) + 0xf; + I = ((I & 0x10) ^ 0x10) >> 2; + C += I; N >>= I; + + I = (N & 3) + 3; + I = ((I & 4) ^ 4) >> 1; + C += I; N >>= I; + + C += ((N & 1) ^ 1); + + return C; +} + +size_t popcnt64(uint64_t V) +{ + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + V -= ((V >> 1) & 0x5555555555555555); + V = (V & 0x3333333333333333) + ((V >> 2) & 0x3333333333333333); + return ((V + (V >> 4) & 0xF0F0F0F0F0F0F0F) * 0x101010101010101) >> 56; +} + +size_t popcnt128(uint128 N) +{ + return popcnt64(N.Hi) + popcnt64(N.Lo); +} + + +size_t nlz128(uint128 N) +{ + return (N.Hi == 0) ? nlz64(N.Lo) + 64 : nlz64(N.Hi); +} + +size_t ntz128(uint128 N) +{ + return (N.Lo == 0) ? ntz64(N.Hi) + 64 : ntz64(N.Lo); +} +int compare128(uint128 N1, uint128 N2) +{ + return (((N1.Hi > N2.Hi) || ((N1.Hi == N2.Hi) && (N1.Lo > N2.Lo))) ? 1 : 0) + - (((N1.Hi < N2.Hi) || ((N1.Hi == N2.Hi) && (N1.Lo < N2.Lo))) ? 1 : 0); +} + +void bindivmod128(uint128 M, uint128 N, uint128 * Q, uint128 *R) +{ + Q->Hi = Q->Lo = 0; + size_t Shift = nlz128(N) - nlz128(M); + shiftleft128(N, Shift, &N); + + do + { + shiftleft128(*Q, (size_t)1, Q); + if(compare128(M, N) >= 0) + { + sub128(M, N, &M); + Q->Lo |= 1; + } + + shiftright128(N, 1, &N); + }while(Shift-- != 0); + + R->Hi = M.Hi; + R->Lo = M.Lo; +} + +void divmod128by64(const uint64_t u1, const uint64_t u0, uint64_t v, uint64_t * q, uint64_t * r) +{ + const uint64_t b = 1ll << 32; + uint64_t un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat, left, right; + size_t s; + + s = nlz64(v); + v <<= s; + vn1 = v >> 32; + vn0 = v & 0xffffffff; + + if (s > 0) + { + un32 = (u1 << s) | (u0 >> (64 - s)); + un10 = u0 << s; + } + else + { + un32 = u1; + un10 = u0; + } + + un1 = un10 >> 32; + un0 = un10 & 0xffffffff; + + q1 = un32 / vn1; + rhat = un32 % vn1; + + left = q1 * vn0; + right = (rhat << 32) + un1; +again1: + if ((q1 >= b) || (left > right)) + { + --q1; + rhat += vn1; + if (rhat < b) + { + left -= vn0; + right = (rhat << 32) | un1; + goto again1; + } + } + + un21 = (un32 << 32) + (un1 - (q1 * v)); + + q0 = un21 / vn1; + rhat = un21 % vn1; + + left = q0 * vn0; + right = (rhat << 32) | un0; +again2: + if ((q0 >= b) || (left > right)) + { + --q0; + rhat += vn1; + if (rhat < b) + { + left -= vn0; + right = (rhat << 32) | un0; + goto again2; + } + } + + *r = ((un21 << 32) + (un0 - (q0 * v))) >> s; + *q = (q1 << 32) | q0; +} + +static void divmod128by128(uint128 M, uint128 N, uint128 * Q, uint128 * R) +{ + if (N.Hi == 0) + { + if (M.Hi < N.Lo) + { + divmod128by64(M.Hi, M.Lo, N.Lo, &Q->Lo, &R->Lo); + Q->Hi = 0; + R->Hi = 0; + return; + } + else + { + Q->Hi = M.Hi / N.Lo; + R->Hi = M.Hi % N.Lo; + divmod128by64(R->Hi, M.Lo, N.Lo, &Q->Lo, &R->Lo); + R->Hi = 0; + return; + } + } + else + { + size_t n = nlz64(N.Hi); + + uint128 v1; + shiftleft128(N, n, &v1); + + uint128 u1; + shiftright128(M, 1, &u1); + + uint128 q1; + divmod128by64(u1.Hi, u1.Lo, v1.Hi, &q1.Hi, &q1.Lo); + q1.Hi = 0; + shiftright128(q1, 63 - n, &q1); + + if ((q1.Hi | q1.Lo) != 0) + { + dec128(q1, &q1); + } + + Q->Hi = q1.Hi; + Q->Lo = q1.Lo; + mult128(q1, N, &q1); + sub128(M, q1, R); + + if (compare128(*R, N) >= 0) + { + inc128(*Q, Q); + sub128(*R, N, R); + } + + return; + } +} + +void divmod128(uint128 M, uint128 N, uint128 * Q, uint128 * R) +{ + size_t Nlz, Mlz, Ntz; + int C; + + Nlz = nlz128(N); + Mlz = nlz128(M); + Ntz = ntz128(N); + + if(Nlz == 128) + { + return; + } + else if((M.Hi | N.Hi) == 0) + { + Q->Hi = R->Hi = 0; + Q->Lo = 
M.Lo / N.Lo; + R->Lo = M.Lo % N.Lo; + return; + } + else if(Nlz == 127) + { + *Q = M; + R->Hi = R->Lo = 0; + return; + } + else if((Ntz + Nlz) == 127) + { + shiftright128(M, Ntz, Q); + dec128(N, &N); + and128(N, M, R); + return; + } + + C = compare128(M, N); + if(C < 0) + { + Q->Hi = Q->Lo = 0; + *R = M; + return; + } + else if(C == 0) + { + Q->Hi = R->Hi = R->Lo = 0; + Q->Lo = 1; + return; + } + + if((Nlz - Mlz) > 5) + { + divmod128by128(M, N, Q, R); + } + else + { + bindivmod128(M, N, Q, R); + } +} +#endif \ No newline at end of file diff --git a/crypto/mman.c b/crypto/mman.c new file mode 100644 index 0000000000..3f015f2a09 --- /dev/null +++ b/crypto/mman.c @@ -0,0 +1,180 @@ + +#include +#include +#include + +#include "mman.h" + +#ifndef FILE_MAP_EXECUTE +#define FILE_MAP_EXECUTE 0x0020 +#endif /* FILE_MAP_EXECUTE */ + +static int __map_mman_error(const DWORD err, const int deferr) +{ + if (err == 0) + return 0; + //TODO: implement + return err; +} + +static DWORD __map_mmap_prot_page(const int prot) +{ + DWORD protect = 0; + + if (prot == PROT_NONE) + return protect; + + if ((prot & PROT_EXEC) != 0) + { + protect = ((prot & PROT_WRITE) != 0) ? + PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ; + } + else + { + protect = ((prot & PROT_WRITE) != 0) ? + PAGE_READWRITE : PAGE_READONLY; + } + + return protect; +} + +static DWORD __map_mmap_prot_file(const int prot) +{ + DWORD desiredAccess = 0; + + if (prot == PROT_NONE) + return desiredAccess; + + if ((prot & PROT_READ) != 0) + desiredAccess |= FILE_MAP_READ; + if ((prot & PROT_WRITE) != 0) + desiredAccess |= FILE_MAP_WRITE; + if ((prot & PROT_EXEC) != 0) + desiredAccess |= FILE_MAP_EXECUTE; + + return desiredAccess; +} + +void* mmap(void *addr, size_t len, int prot, int flags, int fildes, OffsetType off) +{ + HANDLE fm, h; + + void * map = MAP_FAILED; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4293) +#endif + + const DWORD dwFileOffsetLow = (sizeof(OffsetType) <= sizeof(DWORD)) ? + (DWORD)off : (DWORD)(off & 0xFFFFFFFFL); + const DWORD dwFileOffsetHigh = (sizeof(OffsetType) <= sizeof(DWORD)) ? + (DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL); + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + const OffsetType maxSize = off + (OffsetType)len; + + const DWORD dwMaxSizeLow = (sizeof(OffsetType) <= sizeof(DWORD)) ? + (DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL); + const DWORD dwMaxSizeHigh = (sizeof(OffsetType) <= sizeof(DWORD)) ? + (DWORD)0 : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL); + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + errno = 0; + + if (len == 0 + /* Unsupported flag combinations */ + || (flags & MAP_FIXED) != 0 + /* Usupported protection combinations */ + || prot == PROT_EXEC) + { + errno = EINVAL; + return MAP_FAILED; + } + + h = ((flags & MAP_ANONYMOUS) == 0) ? 
+ (HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE; + + if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) + { + errno = EBADF; + return MAP_FAILED; + } + + fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL); + + if (fm == NULL) + { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len); + + CloseHandle(fm); + + if (map == NULL) + { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + return map; +} + +int munmap(void *addr, size_t len) +{ + if (UnmapViewOfFile(addr)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int _mprotect(void *addr, size_t len, int prot) +{ + DWORD newProtect = __map_mmap_prot_page(prot); + DWORD oldProtect = 0; + + if (VirtualProtect(addr, len, newProtect, &oldProtect)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int msync(void *addr, size_t len, int flags) +{ + if (FlushViewOfFile(addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int mlock(const void *addr, size_t len) +{ + if (VirtualLock((LPVOID)addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int munlock(const void *addr, size_t len) +{ + if (VirtualUnlock((LPVOID)addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} \ No newline at end of file diff --git a/crypto/mman.h b/crypto/mman.h new file mode 100644 index 0000000000..12df7b54f9 --- /dev/null +++ b/crypto/mman.h @@ -0,0 +1,63 @@ +/* + * sys/mman.h + * mman-win32 + */ + +#ifndef _SYS_MMAN_H_ +#define _SYS_MMAN_H_ + +#ifndef _WIN32_WINNT // Allow use of features specific to Windows XP or later. +#define _WIN32_WINNT 0x0501 // Change this to the appropriate value to target other versions of Windows. +#endif + +/* All the headers include this file. */ +#ifndef _MSC_VER +#include <_mingw.h> +#endif + +/* Determine offset type */ +#include +#if defined(_WIN64) +typedef int64_t OffsetType; +#else +typedef uint32_t OffsetType; +#endif + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define PROT_NONE 0 +#define PROT_READ 1 +#define PROT_WRITE 2 +#define PROT_EXEC 4 + +#define MAP_FILE 0 +#define MAP_SHARED 1 +#define MAP_PRIVATE 2 +#define MAP_TYPE 0xf +#define MAP_FIXED 0x10 +#define MAP_ANONYMOUS 0x20 +#define MAP_ANON MAP_ANONYMOUS + +#define MAP_FAILED ((void *)-1) + +/* Flags for msync. */ +#define MS_ASYNC 1 +#define MS_SYNC 2 +#define MS_INVALIDATE 4 + +void* mmap(void *addr, size_t len, int prot, int flags, int fildes, OffsetType off); +int munmap(void *addr, size_t len); +int _mprotect(void *addr, size_t len, int prot); +int msync(void *addr, size_t len, int flags); +int mlock(const void *addr, size_t len); +int munlock(const void *addr, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MMAN_H_ */ diff --git a/crypto/oaes_config.h b/crypto/oaes_config.h new file mode 100644 index 0000000000..78b8edb161 --- /dev/null +++ b/crypto/oaes_config.h @@ -0,0 +1,51 @@ +/* + * --------------------------------------------------------------------------- + * OpenAES License + * --------------------------------------------------------------------------- + * Copyright (c) 2012, Nabil S. Al Ramli, www.nalramli.com + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * --------------------------------------------------------------------------- + */ + +#ifndef _OAES_CONFIG_H +#define _OAES_CONFIG_H + +#ifdef __cplusplus +extern "C" { +#endif + +//#ifndef OAES_HAVE_ISAAC +//#define OAES_HAVE_ISAAC 1 +//#endif // OAES_HAVE_ISAAC + +//#ifndef OAES_DEBUG +//#define OAES_DEBUG 0 +//#endif // OAES_DEBUG +//#define OAES_DEBUG 1 + +#ifdef __cplusplus +} +#endif + +#endif // _OAES_CONFIG_H diff --git a/crypto/oaes_lib.cpp b/crypto/oaes_lib.cpp new file mode 100644 index 0000000000..9427d8282c --- /dev/null +++ b/crypto/oaes_lib.cpp @@ -0,0 +1,1447 @@ +/* + * --------------------------------------------------------------------------- + * OpenAES License + * --------------------------------------------------------------------------- + * Copyright (c) 2012, Nabil S. Al Ramli, www.nalramli.com + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * --------------------------------------------------------------------------- + */ +static const char _NR[] = { + 0x4e,0x61,0x62,0x69,0x6c,0x20,0x53,0x2e,0x20, + 0x41,0x6c,0x20,0x52,0x61,0x6d,0x6c,0x69,0x00 }; + +#include "miner.h" + +#include +#include +#include +#if !((defined(__FreeBSD__) && __FreeBSD__ >= 10) || defined(__APPLE__)) +#include +#endif +#include +#include +#include + +#ifdef _MSC_VER +#include +#define getpid() _getpid() +#else +#include +#include +#endif + +#include "oaes_config.h" +#include "oaes_lib.h" + +#ifdef OAES_HAVE_ISAAC +#include "rand.h" +#endif // OAES_HAVE_ISAAC + +#define OAES_RKEY_LEN 4 +#define OAES_COL_LEN 4 +#define OAES_ROUND_BASE 7 + +// the block is padded +#define OAES_FLAG_PAD 0x01 + +#ifndef min +# define min(a,b) (((a)<(b)) ? (a) : (b)) +#endif /* min */ + +// "OAES<8-bit header version><8-bit type><16-bit options><8-bit flags><56-bit reserved>" +static uint8_t oaes_header[OAES_BLOCK_SIZE] = { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, + /*0*/ 0x4f, 0x41, 0x45, 0x53, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; +static uint8_t oaes_gf_8[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; + +static uint8_t oaes_sub_byte_value[16][16] = { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, + /*0*/ { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 }, + /*1*/ { 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 }, + /*2*/ { 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 }, + /*3*/ { 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 }, + /*4*/ { 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 }, + /*5*/ { 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf }, + /*6*/ { 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 }, + /*7*/ { 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 }, + /*8*/ { 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 }, + /*9*/ { 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb }, + /*a*/ { 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 }, + /*b*/ { 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 }, + /*c*/ { 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a }, + /*d*/ { 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e }, + /*e*/ { 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf }, + /*f*/ { 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }, +}; + +static uint8_t oaes_inv_sub_byte_value[16][16] = { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, + /*0*/ { 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb }, + /*1*/ { 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb }, + /*2*/ { 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e }, + /*3*/ { 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 
0x49, 0x6d, 0x8b, 0xd1, 0x25 }, + /*4*/ { 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 }, + /*5*/ { 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 }, + /*6*/ { 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 }, + /*7*/ { 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b }, + /*8*/ { 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 }, + /*9*/ { 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e }, + /*a*/ { 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b }, + /*b*/ { 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 }, + /*c*/ { 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f }, + /*d*/ { 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef }, + /*e*/ { 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 }, + /*f*/ { 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d }, +}; + +static uint8_t oaes_gf_mul_2[16][16] = { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, + /*0*/ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e }, + /*1*/ { 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e }, + /*2*/ { 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e }, + /*3*/ { 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e }, + /*4*/ { 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e }, + /*5*/ { 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe }, + /*6*/ { 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde }, + /*7*/ { 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe }, + /*8*/ { 0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05 }, + /*9*/ { 0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25 }, + /*a*/ { 0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45 }, + /*b*/ { 0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65 }, + /*c*/ { 0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85 }, + /*d*/ { 0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5 }, + /*e*/ { 0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5 }, + /*f*/ { 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5 }, +}; + +static uint8_t oaes_gf_mul_3[16][16] = { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, + /*0*/ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 }, + /*1*/ { 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21 }, + /*2*/ { 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 
0x7d, 0x74, 0x77, 0x72, 0x71 }, + /*3*/ { 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41 }, + /*4*/ { 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1 }, + /*5*/ { 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1 }, + /*6*/ { 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1 }, + /*7*/ { 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81 }, + /*8*/ { 0x9b, 0x98, 0x9d, 0x9e, 0x97, 0x94, 0x91, 0x92, 0x83, 0x80, 0x85, 0x86, 0x8f, 0x8c, 0x89, 0x8a }, + /*9*/ { 0xab, 0xa8, 0xad, 0xae, 0xa7, 0xa4, 0xa1, 0xa2, 0xb3, 0xb0, 0xb5, 0xb6, 0xbf, 0xbc, 0xb9, 0xba }, + /*a*/ { 0xfb, 0xf8, 0xfd, 0xfe, 0xf7, 0xf4, 0xf1, 0xf2, 0xe3, 0xe0, 0xe5, 0xe6, 0xef, 0xec, 0xe9, 0xea }, + /*b*/ { 0xcb, 0xc8, 0xcd, 0xce, 0xc7, 0xc4, 0xc1, 0xc2, 0xd3, 0xd0, 0xd5, 0xd6, 0xdf, 0xdc, 0xd9, 0xda }, + /*c*/ { 0x5b, 0x58, 0x5d, 0x5e, 0x57, 0x54, 0x51, 0x52, 0x43, 0x40, 0x45, 0x46, 0x4f, 0x4c, 0x49, 0x4a }, + /*d*/ { 0x6b, 0x68, 0x6d, 0x6e, 0x67, 0x64, 0x61, 0x62, 0x73, 0x70, 0x75, 0x76, 0x7f, 0x7c, 0x79, 0x7a }, + /*e*/ { 0x3b, 0x38, 0x3d, 0x3e, 0x37, 0x34, 0x31, 0x32, 0x23, 0x20, 0x25, 0x26, 0x2f, 0x2c, 0x29, 0x2a }, + /*f*/ { 0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a }, +}; + +static uint8_t oaes_gf_mul_9[16][16] = { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, + /*0*/ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 }, + /*1*/ { 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7 }, + /*2*/ { 0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c }, + /*3*/ { 0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8, 0xc7, 0xce, 0xd5, 0xdc }, + /*4*/ { 0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49, 0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01 }, + /*5*/ { 0xe6, 0xef, 0xf4, 0xfd, 0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91 }, + /*6*/ { 0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e, 0x21, 0x28, 0x33, 0x3a }, + /*7*/ { 0xdd, 0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2, 0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa }, + /*8*/ { 0xec, 0xe5, 0xfe, 0xf7, 0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b }, + /*9*/ { 0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f, 0x10, 0x19, 0x02, 0x0b }, + /*a*/ { 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0 }, + /*b*/ { 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30 }, + /*c*/ { 0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9, 0xf6, 0xff, 0xe4, 0xed }, + /*d*/ { 0x0a, 0x03, 0x18, 0x11, 0x2e, 0x27, 0x3c, 0x35, 0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d }, + /*e*/ { 0xa1, 0xa8, 0xb3, 0xba, 0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6 }, + /*f*/ { 0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46 }, +}; + +static uint8_t oaes_gf_mul_b[16][16] = { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, + /*0*/ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 }, + /*1*/ { 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 
0xf5, 0xc4, 0xcf, 0xd2, 0xd9 }, + /*2*/ { 0x7b, 0x70, 0x6d, 0x66, 0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12 }, + /*3*/ { 0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e, 0xbf, 0xb4, 0xa9, 0xa2 }, + /*4*/ { 0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7, 0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f }, + /*5*/ { 0x46, 0x4d, 0x50, 0x5b, 0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f }, + /*6*/ { 0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8, 0xf9, 0xf2, 0xef, 0xe4 }, + /*7*/ { 0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c, 0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54 }, + /*8*/ { 0xf7, 0xfc, 0xe1, 0xea, 0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e }, + /*9*/ { 0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02, 0x33, 0x38, 0x25, 0x2e }, + /*a*/ { 0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd, 0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 0xf3, 0xee, 0xe5 }, + /*b*/ { 0x3c, 0x37, 0x2a, 0x21, 0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55 }, + /*c*/ { 0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68 }, + /*d*/ { 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8 }, + /*e*/ { 0x7a, 0x71, 0x6c, 0x67, 0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13 }, + /*f*/ { 0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3 }, +}; + +static uint8_t oaes_gf_mul_d[16][16] = { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, + /*0*/ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b }, + /*1*/ { 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b }, + /*2*/ { 0xbb, 0xb6, 0xa1, 0xac, 0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 0xf0 }, + /*3*/ { 0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14, 0x37, 0x3a, 0x2d, 0x20 }, + /*4*/ { 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26 }, + /*5*/ { 0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6 }, + /*6*/ { 0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9, 0x8a, 0x87, 0x90, 0x9d }, + /*7*/ { 0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25, 0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d }, + /*8*/ { 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91 }, + /*9*/ { 0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41 }, + /*a*/ { 0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42, 0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a }, + /*b*/ { 0xb1, 0xbc, 0xab, 0xa6, 0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa }, + /*c*/ { 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc }, + /*d*/ { 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c }, + /*e*/ { 0x0c, 0x01, 0x16, 0x1b, 0x38, 0x35, 0x22, 0x2f, 0x64, 0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47 }, + /*f*/ { 0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97 }, +}; + +static uint8_t oaes_gf_mul_e[16][16] = { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, + /*0*/ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 
0x62, 0x48, 0x46, 0x54, 0x5a }, + /*1*/ { 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba }, + /*2*/ { 0xdb, 0xd5, 0xc7, 0xc9, 0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81 }, + /*3*/ { 0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59, 0x73, 0x7d, 0x6f, 0x61 }, + /*4*/ { 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7 }, + /*5*/ { 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17 }, + /*6*/ { 0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14, 0x3e, 0x30, 0x22, 0x2c }, + /*7*/ { 0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc, 0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc }, + /*8*/ { 0x41, 0x4f, 0x5d, 0x53, 0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b }, + /*9*/ { 0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3, 0xe9, 0xe7, 0xf5, 0xfb }, + /*a*/ { 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0 }, + /*b*/ { 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20 }, + /*c*/ { 0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e, 0xa4, 0xaa, 0xb8, 0xb6 }, + /*d*/ { 0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26, 0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56 }, + /*e*/ { 0x37, 0x39, 0x2b, 0x25, 0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d }, + /*f*/ { 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d }, +}; + +static OAES_RET oaes_sub_byte( uint8_t * byte ) +{ + size_t _x, _y; + + if( unlikely(NULL == byte) ) + return OAES_RET_ARG1; + + _y = ((_x = *byte) >> 4) & 0x0f; + _x &= 0x0f; + *byte = oaes_sub_byte_value[_y][_x]; + + return OAES_RET_SUCCESS; +} + +static OAES_RET oaes_inv_sub_byte( uint8_t * byte ) +{ + size_t _x, _y; + + if( NULL == byte ) + return OAES_RET_ARG1; + + _x = _y = *byte; + _x &= 0x0f; + _y &= 0xf0; + _y >>= 4; + *byte = oaes_inv_sub_byte_value[_y][_x]; + + return OAES_RET_SUCCESS; +} +/* +static OAES_RET oaes_word_rot_right( uint8_t word[OAES_COL_LEN] ) +{ + uint8_t _temp[OAES_COL_LEN]; + + if( NULL == word ) + return OAES_RET_ARG1; + + memcpy( _temp + 1, word, OAES_COL_LEN - 1 ); + _temp[0] = word[OAES_COL_LEN - 1]; + memcpy( word, _temp, OAES_COL_LEN ); + + return OAES_RET_SUCCESS; +} +*/ +static OAES_RET oaes_word_rot_left( uint8_t word[OAES_COL_LEN] ) +{ + uint8_t _temp[OAES_COL_LEN]; + + if( NULL == word ) + return OAES_RET_ARG1; + + memcpy( _temp, word + 1, OAES_COL_LEN - 1 ); + _temp[OAES_COL_LEN - 1] = word[0]; + memcpy( word, _temp, OAES_COL_LEN ); + + return OAES_RET_SUCCESS; +} + +static OAES_RET oaes_shift_rows( uint8_t block[OAES_BLOCK_SIZE] ) +{ + uint8_t _temp[] = { block[0x03], block[0x02], block[0x01], block[0x06], block[0x0b] }; + + if( unlikely(NULL == block) ) + return OAES_RET_ARG1; + + block[0x0b] = block[0x07]; + block[0x01] = block[0x05]; + block[0x02] = block[0x0a]; + block[0x03] = block[0x0f]; + block[0x05] = block[0x09]; + block[0x06] = block[0x0e]; + block[0x07] = _temp[0]; + block[0x09] = block[0x0d]; + block[0x0a] = _temp[1]; + block[0x0d] = _temp[2]; + block[0x0e] = _temp[3]; + block[0x0f] = _temp[4]; + + return OAES_RET_SUCCESS; +} + +static OAES_RET oaes_inv_shift_rows( uint8_t block[OAES_BLOCK_SIZE] ) +{ + uint8_t _temp[OAES_BLOCK_SIZE]; + + if( NULL == block ) + return OAES_RET_ARG1; + + _temp[0x00] = 
block[0x00]; + _temp[0x01] = block[0x0d]; + _temp[0x02] = block[0x0a]; + _temp[0x03] = block[0x07]; + _temp[0x04] = block[0x04]; + _temp[0x05] = block[0x01]; + _temp[0x06] = block[0x0e]; + _temp[0x07] = block[0x0b]; + _temp[0x08] = block[0x08]; + _temp[0x09] = block[0x05]; + _temp[0x0a] = block[0x02]; + _temp[0x0b] = block[0x0f]; + _temp[0x0c] = block[0x0c]; + _temp[0x0d] = block[0x09]; + _temp[0x0e] = block[0x06]; + _temp[0x0f] = block[0x03]; + memcpy( block, _temp, OAES_BLOCK_SIZE ); + + return OAES_RET_SUCCESS; +} + +static uint8_t oaes_gf_mul(uint8_t left, uint8_t right) +{ + size_t _x, _y; + + _y = ((_x = left) >> 4) & 0x0f; + _x &= 0x0f; + + switch( right ) + { + case 0x02: + return oaes_gf_mul_2[_y][_x]; + break; + case 0x03: + return oaes_gf_mul_3[_y][_x]; + break; + case 0x09: + return oaes_gf_mul_9[_y][_x]; + break; + case 0x0b: + return oaes_gf_mul_b[_y][_x]; + break; + case 0x0d: + return oaes_gf_mul_d[_y][_x]; + break; + case 0x0e: + return oaes_gf_mul_e[_y][_x]; + break; + default: + return left; + break; + } +} + +static OAES_RET oaes_mix_cols( uint8_t word[OAES_COL_LEN] ) +{ + uint8_t _temp[OAES_COL_LEN]; + + if( unlikely(NULL == word) ) + return OAES_RET_ARG1; + + _temp[0] = oaes_gf_mul(word[0], 0x02) ^ oaes_gf_mul( word[1], 0x03 ) ^ + word[2] ^ word[3]; + _temp[1] = word[0] ^ oaes_gf_mul( word[1], 0x02 ) ^ + oaes_gf_mul( word[2], 0x03 ) ^ word[3]; + _temp[2] = word[0] ^ word[1] ^ + oaes_gf_mul( word[2], 0x02 ) ^ oaes_gf_mul( word[3], 0x03 ); + _temp[3] = oaes_gf_mul( word[0], 0x03 ) ^ word[1] ^ + word[2] ^ oaes_gf_mul( word[3], 0x02 ); + memcpy( word, _temp, OAES_COL_LEN ); + + return OAES_RET_SUCCESS; +} + +static OAES_RET oaes_inv_mix_cols( uint8_t word[OAES_COL_LEN] ) +{ + uint8_t _temp[OAES_COL_LEN]; + + if( NULL == word ) + return OAES_RET_ARG1; + + _temp[0] = oaes_gf_mul( word[0], 0x0e ) ^ oaes_gf_mul( word[1], 0x0b ) ^ + oaes_gf_mul( word[2], 0x0d ) ^ oaes_gf_mul( word[3], 0x09 ); + _temp[1] = oaes_gf_mul( word[0], 0x09 ) ^ oaes_gf_mul( word[1], 0x0e ) ^ + oaes_gf_mul( word[2], 0x0b ) ^ oaes_gf_mul( word[3], 0x0d ); + _temp[2] = oaes_gf_mul( word[0], 0x0d ) ^ oaes_gf_mul( word[1], 0x09 ) ^ + oaes_gf_mul( word[2], 0x0e ) ^ oaes_gf_mul( word[3], 0x0b ); + _temp[3] = oaes_gf_mul( word[0], 0x0b ) ^ oaes_gf_mul( word[1], 0x0d ) ^ + oaes_gf_mul( word[2], 0x09 ) ^ oaes_gf_mul( word[3], 0x0e ); + memcpy( word, _temp, OAES_COL_LEN ); + + return OAES_RET_SUCCESS; +} + +OAES_RET oaes_sprintf( + char * buf, size_t * buf_len, const uint8_t * data, size_t data_len ) +{ + size_t _i, _buf_len_in; + char _temp[4]; + + if( NULL == buf_len ) + return OAES_RET_ARG2; + + _buf_len_in = *buf_len; + *buf_len = data_len * 3 + data_len / OAES_BLOCK_SIZE + 1; + + if( NULL == buf ) + return OAES_RET_SUCCESS; + + if( *buf_len > _buf_len_in ) + return OAES_RET_BUF; + + if( NULL == data ) + return OAES_RET_ARG3; + + strcpy( buf, "" ); + + for( _i = 0; _i < data_len; _i++ ) + { + sprintf( _temp, "%02x ", data[_i] ); + strcat( buf, _temp ); + if( _i && 0 == ( _i + 1 ) % OAES_BLOCK_SIZE ) + strcat( buf, "\n" ); + } + + return OAES_RET_SUCCESS; +} + +#ifdef OAES_HAVE_ISAAC +static void oaes_get_seed( char buf[RANDSIZ + 1] ) +{ + struct timeb timer; + struct tm *gmTimer; + char * _test = NULL; + + ftime (&timer); + gmTimer = gmtime( &timer.time ); + _test = (char *) calloc( sizeof( char ), timer.millitm ); + sprintf( buf, "%04d%02d%02d%02d%02d%02d%03d%p%d", + gmTimer->tm_year + 1900, gmTimer->tm_mon + 1, gmTimer->tm_mday, + gmTimer->tm_hour, gmTimer->tm_min, gmTimer->tm_sec, timer.millitm, + _test + 
timer.millitm, getpid() ); + + if( _test ) + free( _test ); +} +#else +static uint32_t oaes_get_seed(void) +{ + struct timeb timer; + struct tm *gmTimer; + char * _test = NULL; + uint32_t _ret = 0; + + ftime (&timer); + gmTimer = gmtime( &timer.time ); + _test = (char *) calloc( sizeof( char ), timer.millitm ); + _ret = (uint32_t)(gmTimer->tm_year + 1900 + gmTimer->tm_mon + 1 + gmTimer->tm_mday + + gmTimer->tm_hour + gmTimer->tm_min + gmTimer->tm_sec + timer.millitm + + (uintptr_t) ( _test + timer.millitm ) + getpid()); + + if( _test ) + free( _test ); + + return _ret; +} +#endif // OAES_HAVE_ISAAC + +static OAES_RET oaes_key_destroy( oaes_key ** key ) +{ + if( NULL == *key ) + return OAES_RET_SUCCESS; + + if( (*key)->data ) + { + free( (*key)->data ); + (*key)->data = NULL; + } + + if( (*key)->exp_data ) + { + free( (*key)->exp_data ); + (*key)->exp_data = NULL; + } + + (*key)->data_len = 0; + (*key)->exp_data_len = 0; + (*key)->num_keys = 0; + (*key)->key_base = 0; + free( *key ); + *key = NULL; + + return OAES_RET_SUCCESS; +} + +static OAES_RET oaes_key_expand( OAES_CTX * ctx ) +{ + size_t _i, _j; + oaes_ctx * _ctx = (oaes_ctx *) ctx; + uint8_t _temp[OAES_COL_LEN]; + + if( NULL == _ctx ) + return OAES_RET_ARG1; + + if( NULL == _ctx->key ) + return OAES_RET_NOKEY; + + _ctx->key->key_base = _ctx->key->data_len / OAES_RKEY_LEN; + _ctx->key->num_keys = _ctx->key->key_base + OAES_ROUND_BASE; + + _ctx->key->exp_data_len = _ctx->key->num_keys * OAES_RKEY_LEN * OAES_COL_LEN; + _ctx->key->exp_data = (uint8_t *) + calloc( _ctx->key->exp_data_len, sizeof( uint8_t )); + + if( NULL == _ctx->key->exp_data ) + return OAES_RET_MEM; + + // the first _ctx->key->data_len are a direct copy + memcpy( _ctx->key->exp_data, _ctx->key->data, _ctx->key->data_len ); + + // apply ExpandKey algorithm for remainder + for( _i = _ctx->key->key_base; _i < _ctx->key->num_keys * OAES_RKEY_LEN; _i++ ) + { + + memcpy( _temp, + _ctx->key->exp_data + ( _i - 1 ) * OAES_RKEY_LEN, OAES_COL_LEN ); + + // transform key column + if( 0 == _i % _ctx->key->key_base ) + { + oaes_word_rot_left( _temp ); + + for( _j = 0; _j < OAES_COL_LEN; _j++ ) + oaes_sub_byte( _temp + _j ); + + _temp[0] = _temp[0] ^ oaes_gf_8[ _i / _ctx->key->key_base - 1 ]; + } + else if( _ctx->key->key_base > 6 && 4 == _i % _ctx->key->key_base ) + { + for( _j = 0; _j < OAES_COL_LEN; _j++ ) + oaes_sub_byte( _temp + _j ); + } + + for( _j = 0; _j < OAES_COL_LEN; _j++ ) + { + _ctx->key->exp_data[ _i * OAES_RKEY_LEN + _j ] = + _ctx->key->exp_data[ ( _i - _ctx->key->key_base ) * + OAES_RKEY_LEN + _j ] ^ _temp[_j]; + } + } + + return OAES_RET_SUCCESS; +} + +static OAES_RET oaes_key_gen( OAES_CTX * ctx, size_t key_size ) +{ + size_t _i; + oaes_key * _key = NULL; + oaes_ctx * _ctx = (oaes_ctx *) ctx; + OAES_RET _rc = OAES_RET_SUCCESS; + + if( NULL == _ctx ) + return OAES_RET_ARG1; + + _key = (oaes_key *) calloc( sizeof( oaes_key ), 1 ); + + if( NULL == _key ) + return OAES_RET_MEM; + + if( _ctx->key ) + oaes_key_destroy( &(_ctx->key) ); + + _key->data_len = key_size; + _key->data = (uint8_t *) calloc( key_size, sizeof( uint8_t )); + + if( NULL == _key->data ) + return OAES_RET_MEM; + + for( _i = 0; _i < key_size; _i++ ) +#ifdef OAES_HAVE_ISAAC + _key->data[_i] = (uint8_t) rand( _ctx->rctx ); +#else + _key->data[_i] = (uint8_t) rand(); +#endif // OAES_HAVE_ISAAC + + _ctx->key = _key; + _rc = _rc ? 
_rc : oaes_key_expand( ctx ); + + if( _rc != OAES_RET_SUCCESS ) + { + oaes_key_destroy( &(_ctx->key) ); + return _rc; + } + + return OAES_RET_SUCCESS; +} + +OAES_RET oaes_key_gen_128( OAES_CTX * ctx ) +{ + return oaes_key_gen( ctx, 16 ); +} + +OAES_RET oaes_key_gen_192( OAES_CTX * ctx ) +{ + return oaes_key_gen( ctx, 24 ); +} + +OAES_RET oaes_key_gen_256( OAES_CTX * ctx ) +{ + return oaes_key_gen( ctx, 32 ); +} + +OAES_RET oaes_key_export( OAES_CTX * ctx, + uint8_t * data, size_t * data_len ) +{ + size_t _data_len_in; + oaes_ctx * _ctx = (oaes_ctx *) ctx; + + if( NULL == _ctx ) + return OAES_RET_ARG1; + + if( NULL == _ctx->key ) + return OAES_RET_NOKEY; + + if( NULL == data_len ) + return OAES_RET_ARG3; + + _data_len_in = *data_len; + // data + header + *data_len = _ctx->key->data_len + OAES_BLOCK_SIZE; + + if( NULL == data ) + return OAES_RET_SUCCESS; + + if( _data_len_in < *data_len ) + return OAES_RET_BUF; + + // header + memcpy( data, oaes_header, OAES_BLOCK_SIZE ); + data[5] = 0x01; + data[7] = (uint8_t)(_ctx->key->data_len); + memcpy( data + OAES_BLOCK_SIZE, _ctx->key->data, _ctx->key->data_len ); + + return OAES_RET_SUCCESS; +} + +OAES_RET oaes_key_export_data( OAES_CTX * ctx, + uint8_t * data, size_t * data_len ) +{ + size_t _data_len_in; + oaes_ctx * _ctx = (oaes_ctx *) ctx; + + if( NULL == _ctx ) + return OAES_RET_ARG1; + + if( NULL == _ctx->key ) + return OAES_RET_NOKEY; + + if( NULL == data_len ) + return OAES_RET_ARG3; + + _data_len_in = *data_len; + *data_len = _ctx->key->data_len; + + if( NULL == data ) + return OAES_RET_SUCCESS; + + if( _data_len_in < *data_len ) + return OAES_RET_BUF; + + memcpy( data, _ctx->key->data, *data_len ); + + return OAES_RET_SUCCESS; +} + +OAES_RET oaes_key_import( OAES_CTX * ctx, + const uint8_t * data, size_t data_len ) +{ + oaes_ctx * _ctx = (oaes_ctx *) ctx; + OAES_RET _rc = OAES_RET_SUCCESS; + int _key_length; + + if( NULL == _ctx ) + return OAES_RET_ARG1; + + if( NULL == data ) + return OAES_RET_ARG2; + + switch( data_len ) + { + case 16 + OAES_BLOCK_SIZE: + case 24 + OAES_BLOCK_SIZE: + case 32 + OAES_BLOCK_SIZE: + break; + default: + return OAES_RET_ARG3; + } + + // header + if( 0 != memcmp( data, oaes_header, 4 ) ) + return OAES_RET_HEADER; + + // header version + switch( data[4] ) + { + case 0x01: + break; + default: + return OAES_RET_HEADER; + } + + // header type + switch( data[5] ) + { + case 0x01: + break; + default: + return OAES_RET_HEADER; + } + + // options + _key_length = data[7]; + switch( _key_length ) + { + case 16: + case 24: + case 32: + break; + default: + return OAES_RET_HEADER; + } + + if( (int)data_len != _key_length + OAES_BLOCK_SIZE ) + return OAES_RET_ARG3; + + if( _ctx->key ) + oaes_key_destroy( &(_ctx->key) ); + + _ctx->key = (oaes_key *) calloc( sizeof( oaes_key ), 1 ); + + if( NULL == _ctx->key ) + return OAES_RET_MEM; + + _ctx->key->data_len = _key_length; + _ctx->key->data = (uint8_t *) + calloc( _key_length, sizeof( uint8_t )); + + if( NULL == _ctx->key->data ) + { + oaes_key_destroy( &(_ctx->key) ); + return OAES_RET_MEM; + } + + memcpy( _ctx->key->data, data + OAES_BLOCK_SIZE, _key_length ); + _rc = _rc ? 
_rc : oaes_key_expand( ctx ); + + if( _rc != OAES_RET_SUCCESS ) + { + oaes_key_destroy( &(_ctx->key) ); + return _rc; + } + + return OAES_RET_SUCCESS; +} + +OAES_RET oaes_key_import_data( OAES_CTX * ctx, + const uint8_t * data, size_t data_len ) +{ + oaes_ctx * _ctx = (oaes_ctx *) ctx; + OAES_RET _rc = OAES_RET_SUCCESS; + + if( NULL == _ctx ) + return OAES_RET_ARG1; + + if( NULL == data ) + return OAES_RET_ARG2; + + switch( data_len ) + { + case 16: + case 24: + case 32: + break; + default: + return OAES_RET_ARG3; + } + + if( _ctx->key ) + oaes_key_destroy( &(_ctx->key) ); + + _ctx->key = (oaes_key *) calloc( sizeof( oaes_key ), 1 ); + + if( NULL == _ctx->key ) + return OAES_RET_MEM; + + _ctx->key->data_len = data_len; + _ctx->key->data = (uint8_t *) + calloc( data_len, sizeof( uint8_t )); + + if( NULL == _ctx->key->data ) + { + oaes_key_destroy( &(_ctx->key) ); + return OAES_RET_MEM; + } + + memcpy( _ctx->key->data, data, data_len ); + _rc = _rc ? _rc : oaes_key_expand( ctx ); + + if( _rc != OAES_RET_SUCCESS ) + { + oaes_key_destroy( &(_ctx->key) ); + return _rc; + } + + return OAES_RET_SUCCESS; +} + +OAES_CTX * oaes_alloc(void) +{ + oaes_ctx * _ctx = (oaes_ctx *) calloc(sizeof(oaes_ctx), 1); + if(!_ctx) + return NULL; + +#ifdef OAES_HAVE_ISAAC + { + ub4 _i = 0; + char _seed[RANDSIZ + 1]; + + _ctx->rctx = (randctx *) calloc(sizeof(randctx), 1); + if(!_ctx->rctx) { + free(_ctx); + return NULL; + } + + oaes_get_seed( _seed ); + memset( _ctx->rctx->randrsl, 0, RANDSIZ ); + memcpy( _ctx->rctx->randrsl, _seed, RANDSIZ ); + randinit( _ctx->rctx, TRUE); + } +#else + srand( oaes_get_seed() ); +#endif // OAES_HAVE_ISAAC + + _ctx->key = NULL; + oaes_set_option( _ctx, OAES_OPTION_CBC, NULL ); + +#ifdef OAES_DEBUG + _ctx->step_cb = NULL; + oaes_set_option( _ctx, OAES_OPTION_STEP_OFF, NULL ); +#endif // OAES_DEBUG + + return (OAES_CTX *) _ctx; +} + +OAES_RET oaes_free(OAES_CTX ** ctx) +{ + oaes_ctx ** _ctx = (oaes_ctx **) ctx; + + if(!_ctx) + return OAES_RET_ARG1; + + if(!*_ctx) + return OAES_RET_SUCCESS; + + if( (*_ctx)->key ) + oaes_key_destroy( &((*_ctx)->key) ); + +#ifdef OAES_HAVE_ISAAC + if( (*_ctx)->rctx ) + { + free( (*_ctx)->rctx ); + (*_ctx)->rctx = NULL; + } +#endif // OAES_HAVE_ISAAC + + free( *_ctx ); + *_ctx = NULL; + + return OAES_RET_SUCCESS; +} + +OAES_RET oaes_set_option(OAES_CTX * ctx, OAES_OPTION option, const void * value) +{ + size_t _i; + oaes_ctx * _ctx = (oaes_ctx *) ctx; + + if( NULL == _ctx ) + return OAES_RET_ARG1; + + switch( option ) + { + case OAES_OPTION_ECB: + _ctx->options &= ~OAES_OPTION_CBC; + memset( _ctx->iv, 0, OAES_BLOCK_SIZE ); + break; + + case OAES_OPTION_CBC: + _ctx->options &= ~OAES_OPTION_ECB; + if( value ) + memcpy( _ctx->iv, value, OAES_BLOCK_SIZE ); + else + { + for( _i = 0; _i < OAES_BLOCK_SIZE; _i++ ) +#ifdef OAES_HAVE_ISAAC + _ctx->iv[_i] = (uint8_t) rand( _ctx->rctx ); +#else + _ctx->iv[_i] = (uint8_t) rand(); +#endif // OAES_HAVE_ISAAC + } + break; + +#ifdef OAES_DEBUG + + case OAES_OPTION_STEP_ON: + if( value ) + { + _ctx->options &= ~OAES_OPTION_STEP_OFF; + _ctx->step_cb = value; + } + else + { + _ctx->options &= ~OAES_OPTION_STEP_ON; + _ctx->options |= OAES_OPTION_STEP_OFF; + _ctx->step_cb = NULL; + return OAES_RET_ARG3; + } + break; + + case OAES_OPTION_STEP_OFF: + _ctx->options &= ~OAES_OPTION_STEP_ON; + _ctx->step_cb = NULL; + break; + +#endif // OAES_DEBUG + + default: + return OAES_RET_ARG2; + } + + _ctx->options |= option; + + return OAES_RET_SUCCESS; +} + +static OAES_RET oaes_encrypt_block(OAES_CTX * ctx, uint8_t * c, size_t c_len) +{ + 
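+ // Encrypts one 16-byte block in place with the expanded key: AddRoundKey with
+ // the first round key, then (num_keys - 2) full rounds of SubBytes/ShiftRows/
+ // MixColumns/AddRoundKey, and a final round without MixColumns (FIPS-197 cipher).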
size_t _i, _j; + oaes_ctx * _ctx = (oaes_ctx *) ctx; + + if( NULL == _ctx ) + return OAES_RET_ARG1; + + if( NULL == c ) + return OAES_RET_ARG2; + + if( c_len != OAES_BLOCK_SIZE ) + return OAES_RET_ARG3; + + if( NULL == _ctx->key ) + return OAES_RET_NOKEY; + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "input", 1, NULL ); +#endif // OAES_DEBUG + + // AddRoundKey(State, K0) + for( _i = 0; _i < c_len; _i++ ) + c[_i] = c[_i] ^ _ctx->key->exp_data[_i]; + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + { + _ctx->step_cb( _ctx->key->exp_data, "k_sch", 1, NULL ); + _ctx->step_cb( c, "k_add", 1, NULL ); + } +#endif // OAES_DEBUG + + // for round = 1 step 1 to Nr–1 + for( _i = 1; _i < _ctx->key->num_keys - 1; _i++ ) + { + // SubBytes(state) + for( _j = 0; _j < c_len; _j++ ) + oaes_sub_byte( c + _j ); + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "s_box", _i, NULL ); +#endif // OAES_DEBUG + + // ShiftRows(state) + oaes_shift_rows( c ); + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "s_row", _i, NULL ); +#endif // OAES_DEBUG + + // MixColumns(state) + oaes_mix_cols( c ); + oaes_mix_cols( c + 4 ); + oaes_mix_cols( c + 8 ); + oaes_mix_cols( c + 12 ); + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "m_col", _i, NULL ); +#endif // OAES_DEBUG + + // AddRoundKey(state, w[round*Nb, (round+1)*Nb-1]) + for( _j = 0; _j < c_len; _j++ ) + c[_j] = c[_j] ^ + _ctx->key->exp_data[_i * OAES_RKEY_LEN * OAES_COL_LEN + _j]; + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + { + _ctx->step_cb( _ctx->key->exp_data + _i * OAES_RKEY_LEN * OAES_COL_LEN, + "k_sch", _i, NULL ); + _ctx->step_cb( c, "k_add", _i, NULL ); + } +#endif // OAES_DEBUG + + } + + // SubBytes(state) + for( _i = 0; _i < c_len; _i++ ) + oaes_sub_byte( c + _i ); + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "s_box", _ctx->key->num_keys - 1, NULL ); +#endif // OAES_DEBUG + + // ShiftRows(state) + oaes_shift_rows( c ); + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "s_row", _ctx->key->num_keys - 1, NULL ); +#endif // OAES_DEBUG + + // AddRoundKey(state, w[Nr*Nb, (Nr+1)*Nb-1]) + for( _i = 0; _i < c_len; _i++ ) + c[_i] = c[_i] ^ _ctx->key->exp_data[ + ( _ctx->key->num_keys - 1 ) * OAES_RKEY_LEN * OAES_COL_LEN + _i ]; + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + { + _ctx->step_cb( _ctx->key->exp_data + + ( _ctx->key->num_keys - 1 ) * OAES_RKEY_LEN * OAES_COL_LEN, + "k_sch", _ctx->key->num_keys - 1, NULL ); + _ctx->step_cb( c, "output", _ctx->key->num_keys - 1, NULL ); + } +#endif // OAES_DEBUG + + return OAES_RET_SUCCESS; +} + +static OAES_RET oaes_decrypt_block(OAES_CTX * ctx, uint8_t * c, size_t c_len) +{ + size_t _i, _j; + oaes_ctx * _ctx = (oaes_ctx *) ctx; + + if( NULL == _ctx ) + return OAES_RET_ARG1; + + if( NULL == c ) + return OAES_RET_ARG2; + + if( c_len != OAES_BLOCK_SIZE ) + return OAES_RET_ARG3; + + if( NULL == _ctx->key ) + return OAES_RET_NOKEY; + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "iinput", _ctx->key->num_keys - 1, NULL ); +#endif // OAES_DEBUG + + // AddRoundKey(state, w[Nr*Nb, (Nr+1)*Nb-1]) + for( _i = 0; _i < c_len; _i++ ) + c[_i] = c[_i] ^ _ctx->key->exp_data[ + ( _ctx->key->num_keys - 1 ) * OAES_RKEY_LEN * OAES_COL_LEN + _i ]; + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + { + _ctx->step_cb( _ctx->key->exp_data + + ( _ctx->key->num_keys - 1 ) * OAES_RKEY_LEN * OAES_COL_LEN, + "ik_sch", _ctx->key->num_keys - 1, NULL ); + _ctx->step_cb( c, "ik_add", _ctx->key->num_keys - 1, NULL ); + } +#endif // OAES_DEBUG + + for( _i = 
_ctx->key->num_keys - 2; _i > 0; _i-- ) + { + // InvShiftRows(state) + oaes_inv_shift_rows( c ); + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "is_row", _i, NULL ); +#endif // OAES_DEBUG + + // InvSubBytes(state) + for( _j = 0; _j < c_len; _j++ ) + oaes_inv_sub_byte( c + _j ); + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "is_box", _i, NULL ); +#endif // OAES_DEBUG + + // AddRoundKey(state, w[round*Nb, (round+1)*Nb-1]) + for( _j = 0; _j < c_len; _j++ ) + c[_j] = c[_j] ^ + _ctx->key->exp_data[_i * OAES_RKEY_LEN * OAES_COL_LEN + _j]; + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + { + _ctx->step_cb( _ctx->key->exp_data + _i * OAES_RKEY_LEN * OAES_COL_LEN, + "ik_sch", _i, NULL ); + _ctx->step_cb( c, "ik_add", _i, NULL ); + } +#endif // OAES_DEBUG + + // InvMixColums(state) + oaes_inv_mix_cols( c ); + oaes_inv_mix_cols( c + 4 ); + oaes_inv_mix_cols( c + 8 ); + oaes_inv_mix_cols( c + 12 ); + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "im_col", _i, NULL ); +#endif // OAES_DEBUG + + } + + // InvShiftRows(state) + oaes_inv_shift_rows( c ); + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "is_row", 1, NULL ); +#endif // OAES_DEBUG + + // InvSubBytes(state) + for( _i = 0; _i < c_len; _i++ ) + oaes_inv_sub_byte( c + _i ); + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + _ctx->step_cb( c, "is_box", 1, NULL ); +#endif // OAES_DEBUG + + // AddRoundKey(state, w[0, Nb-1]) + for( _i = 0; _i < c_len; _i++ ) + c[_i] = c[_i] ^ _ctx->key->exp_data[_i]; + +#ifdef OAES_DEBUG + if( _ctx->step_cb ) + { + _ctx->step_cb( _ctx->key->exp_data, "ik_sch", 1, NULL ); + _ctx->step_cb( c, "ioutput", 1, NULL ); + } +#endif // OAES_DEBUG + + return OAES_RET_SUCCESS; +} + +OAES_RET oaes_encrypt(OAES_CTX * ctx, const uint8_t * m, size_t m_len, uint8_t * c, size_t * c_len) +{ + size_t _i, _j, _c_len_in, _c_data_len; + size_t _pad_len = m_len % OAES_BLOCK_SIZE == 0 ? + 0 : OAES_BLOCK_SIZE - m_len % OAES_BLOCK_SIZE; + oaes_ctx * _ctx = (oaes_ctx *) ctx; + OAES_RET _rc = OAES_RET_SUCCESS; + uint8_t _flags = _pad_len ? OAES_FLAG_PAD : 0; + + if( NULL == _ctx ) + return OAES_RET_ARG1; + + if( NULL == m ) + return OAES_RET_ARG2; + + if( NULL == c_len ) + return OAES_RET_ARG5; + + _c_len_in = *c_len; + // data + pad + _c_data_len = m_len + _pad_len; + // header + iv + data + pad + *c_len = 2 * OAES_BLOCK_SIZE + m_len + _pad_len; + + if( NULL == c ) + return OAES_RET_SUCCESS; + + if( _c_len_in < *c_len ) + return OAES_RET_BUF; + + if( NULL == _ctx->key ) + return OAES_RET_NOKEY; + + // header + memcpy(c, oaes_header, OAES_BLOCK_SIZE ); + memcpy(c + 6, &_ctx->options, sizeof(_ctx->options)); + memcpy(c + 8, &_flags, sizeof(_flags)); + // iv + memcpy(c + OAES_BLOCK_SIZE, _ctx->iv, OAES_BLOCK_SIZE ); + // data + memcpy(c + 2 * OAES_BLOCK_SIZE, m, m_len ); + + for( _i = 0; _i < _c_data_len; _i += OAES_BLOCK_SIZE ) + { + uint8_t _block[OAES_BLOCK_SIZE]; + size_t _block_size = min( m_len - _i, OAES_BLOCK_SIZE ); + + memcpy( _block, c + 2 * OAES_BLOCK_SIZE + _i, _block_size ); + + // insert pad + for( _j = 0; _j < OAES_BLOCK_SIZE - _block_size; _j++ ) + _block[ _block_size + _j ] = (uint8_t)_j + 1; + + // CBC + if( _ctx->options & OAES_OPTION_CBC ) + { + for( _j = 0; _j < OAES_BLOCK_SIZE; _j++ ) + _block[_j] = _block[_j] ^ _ctx->iv[_j]; + } + + _rc = _rc ? 
_rc : + oaes_encrypt_block( ctx, _block, OAES_BLOCK_SIZE ); + memcpy( c + 2 * OAES_BLOCK_SIZE + _i, _block, OAES_BLOCK_SIZE ); + + if( _ctx->options & OAES_OPTION_CBC ) + memcpy( _ctx->iv, _block, OAES_BLOCK_SIZE ); + } + + return _rc; +} + +OAES_RET oaes_decrypt(OAES_CTX * ctx, const uint8_t * c, size_t c_len, uint8_t * m, size_t * m_len) +{ + size_t _i, _j, _m_len_in; + oaes_ctx * _ctx = (oaes_ctx *) ctx; + OAES_RET _rc = OAES_RET_SUCCESS; + uint8_t _iv[OAES_BLOCK_SIZE]; + uint8_t _flags; + OAES_OPTION _options; + + if( NULL == ctx ) + return OAES_RET_ARG1; + + if( NULL == c ) + return OAES_RET_ARG2; + + if( c_len % OAES_BLOCK_SIZE ) + return OAES_RET_ARG3; + + if( NULL == m_len ) + return OAES_RET_ARG5; + + _m_len_in = *m_len; + *m_len = c_len - 2 * OAES_BLOCK_SIZE; + + if( NULL == m ) + return OAES_RET_SUCCESS; + + if( _m_len_in < *m_len ) + return OAES_RET_BUF; + + if( NULL == _ctx->key ) + return OAES_RET_NOKEY; + + // header + if( 0 != memcmp( c, oaes_header, 4 ) ) + return OAES_RET_HEADER; + + // header version + switch( c[4] ) + { + case 0x01: + break; + default: + return OAES_RET_HEADER; + } + + // header type + switch( c[5] ) + { + case 0x02: + break; + default: + return OAES_RET_HEADER; + } + + // options + memcpy(&_options, c + 6, sizeof(_options)); + // validate that all options are valid + if( _options & ~( + OAES_OPTION_ECB + | OAES_OPTION_CBC +#ifdef OAES_DEBUG + | OAES_OPTION_STEP_ON + | OAES_OPTION_STEP_OFF +#endif // OAES_DEBUG + ) ) + return OAES_RET_HEADER; + if( ( _options & OAES_OPTION_ECB ) && + ( _options & OAES_OPTION_CBC ) ) + return OAES_RET_HEADER; + if( _options == OAES_OPTION_NONE ) + return OAES_RET_HEADER; + + // flags + memcpy(&_flags, c + 8, sizeof(_flags)); + // validate that all flags are valid + if( _flags & ~( + OAES_FLAG_PAD + ) ) + return OAES_RET_HEADER; + + // iv + memcpy( _iv, c + OAES_BLOCK_SIZE, OAES_BLOCK_SIZE); + // data + pad + memcpy( m, c + 2 * OAES_BLOCK_SIZE, *m_len ); + + for( _i = 0; _i < *m_len; _i += OAES_BLOCK_SIZE ) + { + if( ( _options & OAES_OPTION_CBC ) && _i > 0 ) + memcpy( _iv, c + OAES_BLOCK_SIZE + _i, OAES_BLOCK_SIZE ); + + _rc = _rc ? 
_rc : + oaes_decrypt_block( ctx, m + _i, min( *m_len - _i, OAES_BLOCK_SIZE ) ); + + // CBC + if( _options & OAES_OPTION_CBC ) + { + for( _j = 0; _j < OAES_BLOCK_SIZE; _j++ ) + m[ _i + _j ] = m[ _i + _j ] ^ _iv[_j]; + } + } + + // remove pad + if( _flags & OAES_FLAG_PAD ) + { + int _is_pad = 1; + size_t _temp = (size_t) m[*m_len - 1]; + + if( _temp <= 0x00 || _temp > 0x0f ) + return OAES_RET_HEADER; + for( _i = 0; _i < _temp; _i++ ) + if( m[*m_len - 1 - _i] != _temp - _i ) + _is_pad = 0; + if( _is_pad ) + { + memset( m + *m_len - _temp, 0, _temp ); + *m_len -= _temp; + } + else + return OAES_RET_HEADER; + } + + return OAES_RET_SUCCESS; +} + +OAES_API OAES_RET oaes_encryption_round(const uint8_t * key, uint8_t * c) +{ + size_t _i; + + if(!key) + return OAES_RET_ARG1; + + if(!c) + return OAES_RET_ARG2; + + // SubBytes(state) + for(_i = 0; _i < OAES_BLOCK_SIZE; _i++) + oaes_sub_byte( c + _i ); + + // ShiftRows(state) + oaes_shift_rows( c ); + + // MixColumns(state) + oaes_mix_cols( c ); + oaes_mix_cols( c + 4 ); + oaes_mix_cols( c + 8 ); + oaes_mix_cols( c + 12 ); + + // AddRoundKey(State, key) + for( _i = 0; _i < OAES_BLOCK_SIZE; _i++ ) + c[_i] ^= key[_i]; + + return OAES_RET_SUCCESS; +} + +OAES_API OAES_RET oaes_pseudo_encrypt_ecb(OAES_CTX * ctx, uint8_t * c) +{ + oaes_ctx * _ctx = (oaes_ctx *) ctx; + size_t _i; + + if(!_ctx) + return OAES_RET_ARG1; + + if(!c) + return OAES_RET_ARG2; + + if(!_ctx->key) + return OAES_RET_NOKEY; + + for (_i = 0; _i < 10; _i++) + oaes_encryption_round(&_ctx->key->exp_data[_i * OAES_RKEY_LEN * OAES_COL_LEN], c); + + return OAES_RET_SUCCESS; +} diff --git a/crypto/oaes_lib.h b/crypto/oaes_lib.h new file mode 100644 index 0000000000..c48bad4659 --- /dev/null +++ b/crypto/oaes_lib.h @@ -0,0 +1,214 @@ +/* + * --------------------------------------------------------------------------- + * OpenAES License + * --------------------------------------------------------------------------- + * Copyright (c) 2012, Nabil S. Al Ramli, www.nalramli.com + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * --------------------------------------------------------------------------- + */ + +#ifndef _OAES_LIB_H +#define _OAES_LIB_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 +# ifdef OAES_SHARED +# ifdef oaes_lib_EXPORTS +# define OAES_API __declspec(dllexport) +# else +# define OAES_API __declspec(dllimport) +# endif +# else +# define OAES_API +# endif +#else +# define OAES_API +#endif // WIN32 + +#define OAES_VERSION "0.8.1" +#define OAES_BLOCK_SIZE 16 + +typedef void OAES_CTX; + +typedef enum +{ + OAES_RET_FIRST = 0, + OAES_RET_SUCCESS = 0, + OAES_RET_UNKNOWN, + OAES_RET_ARG1, + OAES_RET_ARG2, + OAES_RET_ARG3, + OAES_RET_ARG4, + OAES_RET_ARG5, + OAES_RET_NOKEY, + OAES_RET_MEM, + OAES_RET_BUF, + OAES_RET_HEADER, + OAES_RET_COUNT +} OAES_RET; + +/* + * oaes_set_option() takes one of these values for its [option] parameter + * some options accept either an optional or a required [value] parameter + */ +// no option +#define OAES_OPTION_NONE 0 +// enable ECB mode, disable CBC mode +#define OAES_OPTION_ECB 1 +// enable CBC mode, disable ECB mode +// value is optional, may pass uint8_t iv[OAES_BLOCK_SIZE] to specify +// the value of the initialization vector, iv +#define OAES_OPTION_CBC 2 + +#ifdef OAES_DEBUG +typedef int ( * oaes_step_cb ) ( + const uint8_t state[OAES_BLOCK_SIZE], + const char * step_name, + int step_count, + void * user_data ); +// enable state stepping mode +// value is required, must pass oaes_step_cb to receive the state at each step +#define OAES_OPTION_STEP_ON 4 +// disable state stepping mode +#define OAES_OPTION_STEP_OFF 8 +#endif // OAES_DEBUG + +typedef uint16_t OAES_OPTION; + +typedef struct _oaes_key +{ + size_t data_len; + uint8_t *data; + size_t exp_data_len; + uint8_t *exp_data; + size_t num_keys; + size_t key_base; +} oaes_key; + +typedef struct _oaes_ctx +{ +#ifdef OAES_HAVE_ISAAC + randctx * rctx; +#endif // OAES_HAVE_ISAAC + +#ifdef OAES_DEBUG + oaes_step_cb step_cb; +#endif // OAES_DEBUG + + oaes_key * key; + OAES_OPTION options; + uint8_t iv[OAES_BLOCK_SIZE]; +} oaes_ctx; +/* + * // usage: + * + * OAES_CTX * ctx = oaes_alloc(); + * . + * . + * . + * { + * oaes_gen_key_xxx( ctx ); + * { + * oaes_key_export( ctx, _buf, &_buf_len ); + * // or + * oaes_key_export_data( ctx, _buf, &_buf_len );\ + * } + * } + * // or + * { + * oaes_key_import( ctx, _buf, _buf_len ); + * // or + * oaes_key_import_data( ctx, _buf, _buf_len ); + * } + * . + * . + * . + * oaes_encrypt( ctx, m, m_len, c, &c_len ); + * . + * . + * . + * oaes_decrypt( ctx, c, c_len, m, &m_len ); + * . + * . + * . 
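+ * // hint: oaes_encrypt()/oaes_decrypt() (and the key export calls) accept a
+ * // NULL output buffer to report the required c_len / m_len first, e.g.
+ * // oaes_encrypt( ctx, m, m_len, NULL, &c_len ); then allocate c and call again.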
+ * oaes_free( &ctx ); + */ + +OAES_API OAES_CTX * oaes_alloc(void); + +OAES_API OAES_RET oaes_free( OAES_CTX ** ctx ); + +OAES_API OAES_RET oaes_set_option( OAES_CTX * ctx, + OAES_OPTION option, const void * value ); + +OAES_API OAES_RET oaes_key_gen_128( OAES_CTX * ctx ); + +OAES_API OAES_RET oaes_key_gen_192( OAES_CTX * ctx ); + +OAES_API OAES_RET oaes_key_gen_256( OAES_CTX * ctx ); + +// export key with header information +// set data == NULL to get the required data_len +OAES_API OAES_RET oaes_key_export( OAES_CTX * ctx, + uint8_t * data, size_t * data_len ); + +// directly export the data from key +// set data == NULL to get the required data_len +OAES_API OAES_RET oaes_key_export_data( OAES_CTX * ctx, + uint8_t * data, size_t * data_len ); + +// import key with header information +OAES_API OAES_RET oaes_key_import( OAES_CTX * ctx, + const uint8_t * data, size_t data_len ); + +// directly import data into key +OAES_API OAES_RET oaes_key_import_data( OAES_CTX * ctx, + const uint8_t * data, size_t data_len ); + +// set c == NULL to get the required c_len +OAES_API OAES_RET oaes_encrypt( OAES_CTX * ctx, + const uint8_t * m, size_t m_len, uint8_t * c, size_t * c_len ); + +// set m == NULL to get the required m_len +OAES_API OAES_RET oaes_decrypt( OAES_CTX * ctx, + const uint8_t * c, size_t c_len, uint8_t * m, size_t * m_len ); + +// set buf == NULL to get the required buf_len +OAES_API OAES_RET oaes_sprintf( + char * buf, size_t * buf_len, const uint8_t * data, size_t data_len ); + +OAES_API OAES_RET oaes_encryption_round( const uint8_t * key, uint8_t * c ); + +OAES_API OAES_RET oaes_pseudo_encrypt_ecb( OAES_CTX * ctx, uint8_t * c ); + +#ifdef __cplusplus +} +#endif + +#endif // _OAES_LIB_H diff --git a/crypto/wildkeccak-cpu.cpp b/crypto/wildkeccak-cpu.cpp new file mode 100644 index 0000000000..64686b061f --- /dev/null +++ b/crypto/wildkeccak-cpu.cpp @@ -0,0 +1,376 @@ +// Memory-hard extension of keccak for PoW +// Copyright (c) 2012-2013 The Cryptonote developers +// Copyright (c) 2014 The Boolberry developers + +// Distributed under the MIT/X11 software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +// Modified for CPUminer by Lucas Jones +// Adapted for ccminer by Tanguy Pruvot - 2016 + +#include +#include +#include +#include + +#ifdef _MSC_VER +#include +#include +#if OPENSSL_VERSION_NUMBER < 0x10100000L +#include "compat/bignum_ssl10.hpp" +#else +#include "bignum.hpp" +#endif +#include "int128_c.h" +#else +#include +#endif + +#include + +#include "xmr-rpc.h" + +extern uint64_t* pscratchpad_buff; + +struct reciprocal_value64 { + uint64_t m; + uint8_t sh1, sh2; +}; + +static inline int fls64(uint64_t x) +{ +#if defined(_WIN64) + unsigned long bitpos = 0; + _BitScanReverse64(&bitpos, x); + return (int) (bitpos + 1); +#elif defined(WIN32) + unsigned long hipos = 0, bitpos = 0; + uint32_t hi = x >> 32; + _BitScanReverse(&hipos, hi); + if (!hipos) { + _BitScanReverse(&bitpos, (uint32_t) x); + } + return (int) hipos ? hipos + 33 : bitpos + 1; +#else + /* + * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the + * dest reg is undefined if x==0, but their CPU architect says its + * value is written to set it to the same as before. 
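+ * (bitpos below is pre-initialized to -1, so fls64(0) returns 0 either way.)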
+ */ + register long bitpos = -1; + asm("bsrq %1,%0" : "+r" (bitpos) : "rm" (x)); + return bitpos + 1; +#endif +} + +static inline struct reciprocal_value64 reciprocal_val64(uint64_t d) +{ + struct reciprocal_value64 R; + int l; + + l = fls64(d - 1); + +#ifdef _MSC_VER + uint128 v1; + v1.Lo = (1ULL << l) - d;v1.Hi=0; + uint128 v2; + v2.Hi = 1; v2.Lo = 0; + + uint128 v; + mult128(v1,v2,&v); + divmod128by64(v.Hi,v.Lo,d,&v.Hi,&v.Lo); + Increment(&v); + R.m = (uint64_t)v.Hi; +#else + __uint128_t m; + m = (((__uint128_t)1 << 64) * ((1ULL << l) - d)); + m /= d; + ++m; + R.m = (uint64_t)m; +#endif + + R.sh1 = min(l, 1); + R.sh2 = max(l - 1, 0); + + return R; +} + +static inline uint64_t reciprocal_divide64(uint64_t a, struct reciprocal_value64 R) +{ +#ifdef _MSC_VER + uint128 v; + mult64to128(a,R.m,&v.Hi,&v.Lo); + uint64_t t = v.Hi; +#else + uint64_t t = (uint64_t)(((__uint128_t)a * R.m) >> 64); +#endif + return (t + ((a - t) >> R.sh1)) >> R.sh2; +} + +static inline uint64_t reciprocal_remainder64(uint64_t A, uint64_t B, struct reciprocal_value64 R) +{ + uint64_t div, mod; + + div = reciprocal_divide64(A, R); + mod = A - (uint64_t) (div * B); + if (mod >= B) mod -= B; + return mod; +} + +//#define UNROLL_SCR_MIX + +static inline uint64_t rotl641(uint64_t x) { return((x << 1) | (x >> 63)); } +static inline uint64_t rotl64_1(uint64_t x, uint64_t y) { return((x << y) | (x >> (64 - y))); } +static inline uint64_t rotl64_2(uint64_t x, uint64_t y) { return(rotl64_1((x >> 32) | (x << 32), y)); } +static inline uint64_t bitselect(uint64_t a, uint64_t b, uint64_t c) { return(a ^ (c & (b ^ a))); } + +static inline void keccakf_mul(uint64_t *s) +{ + uint64_t bc[5], t[5]; + uint64_t tmp1, tmp2; + int i; + + for(i = 0; i < 5; i++) + t[i] = s[i + 0] ^ s[i + 5] ^ s[i + 10] * s[i + 15] * s[i + 20]; + + bc[0] = t[0] ^ rotl641(t[2]); + bc[1] = t[1] ^ rotl641(t[3]); + bc[2] = t[2] ^ rotl641(t[4]); + bc[3] = t[3] ^ rotl641(t[0]); + bc[4] = t[4] ^ rotl641(t[1]); + + tmp1 = s[1] ^ bc[0]; + + s[ 0] ^= bc[4]; + s[ 1] = rotl64_1(s[ 6] ^ bc[0], 44); + s[ 6] = rotl64_1(s[ 9] ^ bc[3], 20); + s[ 9] = rotl64_1(s[22] ^ bc[1], 61); + s[22] = rotl64_1(s[14] ^ bc[3], 39); + s[14] = rotl64_1(s[20] ^ bc[4], 18); + s[20] = rotl64_1(s[ 2] ^ bc[1], 62); + s[ 2] = rotl64_1(s[12] ^ bc[1], 43); + s[12] = rotl64_1(s[13] ^ bc[2], 25); + s[13] = rotl64_1(s[19] ^ bc[3], 8); + s[19] = rotl64_1(s[23] ^ bc[2], 56); + s[23] = rotl64_1(s[15] ^ bc[4], 41); + s[15] = rotl64_1(s[ 4] ^ bc[3], 27); + s[ 4] = rotl64_1(s[24] ^ bc[3], 14); + s[24] = rotl64_1(s[21] ^ bc[0], 2); + s[21] = rotl64_1(s[ 8] ^ bc[2], 55); + s[ 8] = rotl64_1(s[16] ^ bc[0], 45); + s[16] = rotl64_1(s[ 5] ^ bc[4], 36); + s[ 5] = rotl64_1(s[ 3] ^ bc[2], 28); + s[ 3] = rotl64_1(s[18] ^ bc[2], 21); + s[18] = rotl64_1(s[17] ^ bc[1], 15); + s[17] = rotl64_1(s[11] ^ bc[0], 10); + s[11] = rotl64_1(s[ 7] ^ bc[1], 6); + s[ 7] = rotl64_1(s[10] ^ bc[4], 3); + s[10] = rotl64_1(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); 
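+ // (bitselect(p ^ r, p, q) == p ^ (~q & r), so these rows implement the keccak chi step)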
s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + + s[0] ^= 0x0000000000000001ULL; +} + +static inline void keccakf_mul_last(uint64_t *s) +{ + uint64_t bc[5], xormul[5]; + uint64_t tmp1, tmp2; + int i; + + for(i = 0; i < 5; i++) + xormul[i] = s[i + 0] ^ s[i + 5] ^ s[i + 10] * s[i + 15] * s[i + 20]; + + bc[0] = xormul[0] ^ rotl641(xormul[2]); + bc[1] = xormul[1] ^ rotl641(xormul[3]); + bc[2] = xormul[2] ^ rotl641(xormul[4]); + bc[3] = xormul[3] ^ rotl641(xormul[0]); + bc[4] = xormul[4] ^ rotl641(xormul[1]); + + s[0] ^= bc[4]; + s[1] = rotl64_2(s[6] ^ bc[0], 12); + s[2] = rotl64_2(s[12] ^ bc[1], 11); + s[4] = rotl64_1(s[24] ^ bc[3], 14); + s[3] = rotl64_1(s[18] ^ bc[2], 21); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); + s[0] ^= 0x0000000000000001ULL; +} + +struct reciprocal_value64 cached_recip; +static uint64_t cached_scr_size = 0; + +static inline void scr_mix(uint64_t *st, uint64_t scr_size, struct reciprocal_value64 recip) +{ +#define KK_MIXIN_SIZE 24 + uint64_t _ALIGN(128) idx[KK_MIXIN_SIZE]; + +#ifdef _MSC_VER + #define pscr pscratchpad_buff + int x; + + // non-optimized 64bit operations + for (x = 0; x < KK_MIXIN_SIZE; x++) { + idx[x] = reciprocal_remainder64(st[x], scr_size, recip) << 2; + } + if (idx[7] > scr_size*4) { + applog(LOG_WARNING, "Wrong remainder64 returned by the cpu hash %016llx > %016llx", + (unsigned long long) idx[7], (unsigned long long) scr_size*4); + return; + } + for(x = 0; x < KK_MIXIN_SIZE; x += 4) { + st[x + 0] ^= pscr[idx[x] + 0] ^ pscr[idx[x + 1] + 0] ^ pscr[idx[x + 2] + 0] ^ pscr[idx[x + 3] + 0]; + st[x + 1] ^= pscr[idx[x] + 1] ^ pscr[idx[x + 1] + 1] ^ pscr[idx[x + 2] + 1] ^ pscr[idx[x + 3] + 1]; + st[x + 2] ^= pscr[idx[x] + 2] ^ pscr[idx[x + 1] + 2] ^ pscr[idx[x + 2] + 2] ^ pscr[idx[x + 3] + 2]; + st[x + 3] ^= pscr[idx[x] + 3] ^ pscr[idx[x + 1] + 3] ^ pscr[idx[x + 2] + 3] ^ pscr[idx[x + 3] + 3]; + } + return; + +#elif !defined(UNROLL_SCR_MIX) + + #pragma GCC ivdep + for(int x = 0; x < 3; ++x) + { + __m128i *st0, *st1, *st2, *st3; + + idx[0] = reciprocal_remainder64(st[(x << 3) + 0], scr_size, recip) << 2; + idx[1] = reciprocal_remainder64(st[(x << 3) + 1], scr_size, recip) << 2; + idx[2] = reciprocal_remainder64(st[(x << 3) + 2], scr_size, recip) << 2; + idx[3] = reciprocal_remainder64(st[(x << 3) + 3], scr_size, recip) << 2; + idx[4] = reciprocal_remainder64(st[(x << 3) + 4], scr_size, recip) << 2; + idx[5] = reciprocal_remainder64(st[(x << 3) + 5], scr_size, recip) << 2; + idx[6] = reciprocal_remainder64(st[(x << 3) + 6], scr_size, recip) << 2; + idx[7] = reciprocal_remainder64(st[(x << 3) + 7], scr_size, recip) << 2; + + for(int y = 0; y < 8; y++) _mm_prefetch((const char*) (&pscratchpad_buff[idx[y]]), _MM_HINT_T1); + + st0 = (__m128i *)&st[(x << 3) + 0]; + st1 = (__m128i *)&st[(x << 
3) + 2]; + st2 = (__m128i *)&st[(x << 3) + 4]; + st3 = (__m128i *)&st[(x << 3) + 6]; + + *st0 = _mm_xor_si128(*st0, *((__m128i *)&pscratchpad_buff[idx[0]])); + *st0 = _mm_xor_si128(*st0, *((__m128i *)&pscratchpad_buff[idx[1]])); + *st0 = _mm_xor_si128(*st0, *((__m128i *)&pscratchpad_buff[idx[2]])); + *st0 = _mm_xor_si128(*st0, *((__m128i *)&pscratchpad_buff[idx[3]])); + + *st1 = _mm_xor_si128(*st1, *((__m128i *)&pscratchpad_buff[idx[0] + 2])); + *st1 = _mm_xor_si128(*st1, *((__m128i *)&pscratchpad_buff[idx[1] + 2])); + *st1 = _mm_xor_si128(*st1, *((__m128i *)&pscratchpad_buff[idx[2] + 2])); + *st1 = _mm_xor_si128(*st1, *((__m128i *)&pscratchpad_buff[idx[3] + 2])); + + *st2 = _mm_xor_si128(*st2, *((__m128i *)&pscratchpad_buff[idx[4]])); + *st2 = _mm_xor_si128(*st2, *((__m128i *)&pscratchpad_buff[idx[5]])); + *st2 = _mm_xor_si128(*st2, *((__m128i *)&pscratchpad_buff[idx[6]])); + *st2 = _mm_xor_si128(*st2, *((__m128i *)&pscratchpad_buff[idx[7]])); + + *st3 = _mm_xor_si128(*st3, *((__m128i *)&pscratchpad_buff[idx[4] + 2])); + *st3 = _mm_xor_si128(*st3, *((__m128i *)&pscratchpad_buff[idx[5] + 2])); + *st3 = _mm_xor_si128(*st3, *((__m128i *)&pscratchpad_buff[idx[6] + 2])); + *st3 = _mm_xor_si128(*st3, *((__m128i *)&pscratchpad_buff[idx[7] + 2])); + } + +#else + #warning using AVX2 optimizations + + idx[ 0] = reciprocal_remainder64(st[0], scr_size, recip) << 2; + idx[ 1] = reciprocal_remainder64(st[1], scr_size, recip) << 2; + idx[ 2] = reciprocal_remainder64(st[2], scr_size, recip) << 2; + idx[ 3] = reciprocal_remainder64(st[3], scr_size, recip) << 2; + idx[ 4] = reciprocal_remainder64(st[4], scr_size, recip) << 2; + idx[ 5] = reciprocal_remainder64(st[5], scr_size, recip) << 2; + idx[ 6] = reciprocal_remainder64(st[6], scr_size, recip) << 2; + idx[ 7] = reciprocal_remainder64(st[7], scr_size, recip) << 2; + + for(int y = 0; y < 8; y++) _mm_prefetch(&pscratchpad_buff[idx[y]], _MM_HINT_T1); + + idx[ 8] = reciprocal_remainder64(st[8], scr_size, recip) << 2; + idx[ 9] = reciprocal_remainder64(st[9], scr_size, recip) << 2; + idx[10] = reciprocal_remainder64(st[10], scr_size, recip) << 2; + idx[11] = reciprocal_remainder64(st[11], scr_size, recip) << 2; + idx[12] = reciprocal_remainder64(st[12], scr_size, recip) << 2; + idx[13] = reciprocal_remainder64(st[13], scr_size, recip) << 2; + idx[14] = reciprocal_remainder64(st[14], scr_size, recip) << 2; + idx[15] = reciprocal_remainder64(st[15], scr_size, recip) << 2; + + for(int y = 8; y < 16; ++y) _mm_prefetch(&pscratchpad_buff[idx[y]], _MM_HINT_T1); + + idx[16] = reciprocal_remainder64(st[16], scr_size, recip) << 2; + idx[17] = reciprocal_remainder64(st[17], scr_size, recip) << 2; + idx[18] = reciprocal_remainder64(st[18], scr_size, recip) << 2; + idx[19] = reciprocal_remainder64(st[19], scr_size, recip) << 2; + idx[20] = reciprocal_remainder64(st[20], scr_size, recip) << 2; + idx[21] = reciprocal_remainder64(st[21], scr_size, recip) << 2; + idx[22] = reciprocal_remainder64(st[22], scr_size, recip) << 2; + idx[23] = reciprocal_remainder64(st[23], scr_size, recip) << 2; + + for(int y = 16; y < 24; ++y) _mm_prefetch(&pscratchpad_buff[idx[y]], _MM_HINT_T1); + + __m256i *st0 = (__m256i *)&st[0]; + + for(int x = 0; x < 6; ++x) + { + *st0 = _mm256_xor_si256(*st0, *((__m256i *)&pscratchpad_buff[idx[(x << 2) + 0]])); + *st0 = _mm256_xor_si256(*st0, *((__m256i *)&pscratchpad_buff[idx[(x << 2) + 1]])); + *st0 = _mm256_xor_si256(*st0, *((__m256i *)&pscratchpad_buff[idx[(x << 2) + 2]])); + *st0 = _mm256_xor_si256(*st0, *((__m256i *)&pscratchpad_buff[idx[(x << 2) + 
3]])); + ++st0; + } + +#endif + return; +} + +static void wild_keccak_hash_dbl(uint8_t * __restrict md, const uint8_t * __restrict in) +{ + uint64_t _ALIGN(32) st[25]; + uint64_t scr_size, i; + struct reciprocal_value64 recip; + + scr_size = scratchpad_size >> 2; + if (scr_size == cached_scr_size) + recip = cached_recip; + else { + cached_recip = recip = reciprocal_val64(scr_size); + cached_scr_size = scr_size; + } + + // Wild Keccak #1 + memcpy(st, in, 88); + st[10] = (st[10] & 0x00000000000000FFULL) | 0x0000000000000100ULL; + memset(&st[11], 0, 112); + st[16] |= 0x8000000000000000ULL; + + for(i = 0; i < 23; i++) { + keccakf_mul(st); + scr_mix(st, scr_size, recip); + } + + keccakf_mul_last(st); + + // Wild Keccak #2 + memset(&st[4], 0x00, 168); + st[ 4] = 0x0000000000000001ULL; + st[16] = 0x8000000000000000ULL; + + for(i = 0; i < 23; i++) { + keccakf_mul(st); + scr_mix(st, scr_size, recip); + } + + keccakf_mul_last(st); + + memcpy(md, st, 32); + return; +} + +void wildkeccak_hash(void* output, const void* input, uint64_t* scratchpad, uint64_t ssize) +{ + if (scratchpad) pscratchpad_buff = scratchpad; + if (!scratchpad_size) scratchpad_size = ssize; + wild_keccak_hash_dbl((uint8_t*)output, (uint8_t*)input); +} diff --git a/crypto/wildkeccak.cu b/crypto/wildkeccak.cu new file mode 100644 index 0000000000..99657611a1 --- /dev/null +++ b/crypto/wildkeccak.cu @@ -0,0 +1,381 @@ +// WildKeccak CUDA Kernel, Code based on Linux Wolf0 bbr-miner implementation from 2014 +// Adapted to ccminer 2.0 - tpruvot 2016-2017 +// +// NOTE FOR SP: this ccminer version is licensed under GPLv3 Licence + +extern "C" { +#include +#include +#include +} + +#include +#include +#include // todo + +#include "wildkeccak.h" + +extern char *device_config[MAX_GPUS]; // -l +extern uint64_t* pscratchpad_buff; + +static uint64_t* d_input[MAX_GPUS]; +static uint32_t* d_retnonce[MAX_GPUS]; +static ulonglong4* d_scratchpad[MAX_GPUS]; + +static uint64_t* h_scratchpad[MAX_GPUS] = { 0 }; +static cudaStream_t bufpad_stream[MAX_GPUS] = { 0 }; +static cudaStream_t kernel_stream[MAX_GPUS] = { 0 }; + +uint64_t scratchpad_size = 0; + +uint32_t WK_CUDABlocks = 64; +uint32_t WK_CUDAThreads = 256; + +#define st0 vst0.x +#define st1 vst0.y +#define st2 vst0.z +#define st3 vst0.w + +#define st4 vst4.x +#define st5 vst4.y +#define st6 vst4.z +#define st7 vst4.w + +#define st8 vst8.x +#define st9 vst8.y +#define st10 vst8.z +#define st11 vst8.w + +#define st12 vst12.x +#define st13 vst12.y +#define st14 vst12.z +#define st15 vst12.w + +#define st16 vst16.x +#define st17 vst16.y +#define st18 vst16.z +#define st19 vst16.w + +#define st20 vst20.x +#define st21 vst20.y +#define st22 vst20.z +#define st23 vst20.w + +#if __CUDA_ARCH__ >= 320 + +__device__ __forceinline__ uint64_t cuda_rotl641(const uint64_t value) +{ + uint2 result; + asm("shf.l.wrap.b32 %0, %1, %2, 1U;" : "=r"(result.x) + : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value)))); + asm("shf.l.wrap.b32 %0, %1, %2, 1U;" : "=r"(result.y) + : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value)))); + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} + +#else +__noinline__ __device__ uint64_t cuda_rotl641(const uint64_t x) { return((x << 1) | (x >> 63)); } +#endif + +__noinline__ __device__ uint64_t bitselect(const uint64_t a, const uint64_t b, const uint64_t c) { return(a ^ (c & (b ^ a))); } + +#define ROTL641(x) (cuda_rotl641(x)) + +#define RND() \ + bc[0] = st0 ^ st5 ^ st10 * 
st15 * st20 ^ ROTL641(st2 ^ st7 ^ st12 * st17 * st22); \ + bc[1] = st1 ^ st6 ^ st11 * st16 * st21 ^ ROTL641(st3 ^ st8 ^ st13 * st18 * st23); \ + bc[2] = st2 ^ st7 ^ st12 * st17 * st22 ^ ROTL641(st4 ^ st9 ^ st14 * st19 * st24); \ + bc[3] = st3 ^ st8 ^ st13 * st18 * st23 ^ ROTL641(st0 ^ st5 ^ st10 * st15 * st20); \ + bc[4] = st4 ^ st9 ^ st14 * st19 * st24 ^ ROTL641(st1 ^ st6 ^ st11 * st16 * st21); \ + tmp1 = st1 ^ bc[0]; \ + \ + st0 ^= bc[4]; \ + st1 = ROTL64(st6 ^ bc[0], 44); \ + st6 = ROTL64(st9 ^ bc[3], 20); \ + st9 = ROTL64(st22 ^ bc[1], 61); \ + st22 = ROTL64(st14 ^ bc[3], 39); \ + st14 = ROTL64(st20 ^ bc[4], 18); \ + st20 = ROTL64(st2 ^ bc[1], 62); \ + st2 = ROTL64(st12 ^ bc[1], 43); \ + st12 = ROTL64(st13 ^ bc[2], 25); \ + st13 = ROTL64(st19 ^ bc[3], 8); \ + st19 = ROTL64(st23 ^ bc[2], 56); \ + st23 = ROTL64(st15 ^ bc[4], 41); \ + st15 = ROTL64(st4 ^ bc[3], 27); \ + st4 = ROTL64(st24 ^ bc[3], 14); \ + st24 = ROTL64(st21 ^ bc[0], 2); \ + st21 = ROTL64(st8 ^ bc[2], 55); \ + st8 = ROTL64(st16 ^ bc[0], 45); \ + st16 = ROTL64(st5 ^ bc[4], 36); \ + st5 = ROTL64(st3 ^ bc[2], 28); \ + st3 = ROTL64(st18 ^ bc[2], 21); \ + st18 = ROTL64(st17 ^ bc[1], 15); \ + st17 = ROTL64(st11 ^ bc[0], 10); \ + st11 = ROTL64(st7 ^ bc[1], 6); \ + st7 = ROTL64(st10 ^ bc[4], 3); \ + st10 = ROTL641(tmp1); \ + \ + tmp1 = st0; tmp2 = st1; st0 = bitselect(st0 ^ st2, st0, st1); st1 = bitselect(st1 ^ st3, st1, st2); \ + st2 = bitselect(st2 ^ st4, st2, st3); st3 = bitselect(st3 ^ tmp1, st3, st4); st4 = bitselect(st4 ^ tmp2, st4, tmp1); \ + tmp1 = st5; tmp2 = st6; st5 = bitselect(st5 ^ st7, st5, st6); st6 = bitselect(st6 ^ st8, st6, st7); \ + st7 = bitselect(st7 ^ st9, st7, st8); st8 = bitselect(st8 ^ tmp1, st8, st9); st9 = bitselect(st9 ^ tmp2, st9, tmp1); \ + tmp1 = st10; tmp2 = st11; st10 = bitselect(st10 ^ st12, st10, st11); st11 = bitselect(st11 ^ st13, st11, st12); \ + st12 = bitselect(st12 ^ st14, st12, st13); st13 = bitselect(st13 ^ tmp1, st13, st14); st14 = bitselect(st14 ^ tmp2, st14, tmp1); \ + tmp1 = st15; tmp2 = st16; st15 = bitselect(st15 ^ st17, st15, st16); st16 = bitselect(st16 ^ st18, st16, st17); \ + st17 = bitselect(st17 ^ st19, st17, st18); st18 = bitselect(st18 ^ tmp1, st18, st19); st19 = bitselect(st19 ^ tmp2, st19, tmp1); \ + tmp1 = st20; tmp2 = st21; st20 = bitselect(st20 ^ st22, st20, st21); st21 = bitselect(st21 ^ st23, st21, st22); \ + st22 = bitselect(st22 ^ st24, st22, st23); st23 = bitselect(st23 ^ tmp1, st23, st24); st24 = bitselect(st24 ^ tmp2, st24, tmp1); \ + st0 ^= 1; + +#define LASTRND1() \ + bc[0] = st0 ^ st5 ^ st10 * st15 * st20 ^ ROTL64(st2 ^ st7 ^ st12 * st17 * st22, 1); \ + bc[1] = st1 ^ st6 ^ st11 * st16 * st21 ^ ROTL64(st3 ^ st8 ^ st13 * st18 * st23, 1); \ + bc[2] = st2 ^ st7 ^ st12 * st17 * st22 ^ ROTL64(st4 ^ st9 ^ st14 * st19 * st24, 1); \ + bc[3] = st3 ^ st8 ^ st13 * st18 * st23 ^ ROTL64(st0 ^ st5 ^ st10 * st15 * st20, 1); \ + bc[4] = st4 ^ st9 ^ st14 * st19 * st24 ^ ROTL64(st1 ^ st6 ^ st11 * st16 * st21, 1); \ + \ + st0 ^= bc[4]; \ + st1 = ROTL64(st6 ^ bc[0], 44); \ + st2 = ROTL64(st12 ^ bc[1], 43); \ + st4 = ROTL64(st24 ^ bc[3], 14); \ + st3 = ROTL64(st18 ^ bc[2], 21); \ + \ + tmp1 = st0; st0 = bitselect(st0 ^ st2, st0, st1); st1 = bitselect(st1 ^ st3, st1, st2); st2 = bitselect(st2 ^ st4, st2, st3); st3 = bitselect(st3 ^ tmp1, st3, st4); \ + st0 ^= 1; + +#define LASTRND2() \ + bc[2] = st2 ^ st7 ^ st12 * st17 * st22 ^ ROTL64(st4 ^ st9 ^ st14 * st19 * st24, 1); \ + bc[3] = st3 ^ st8 ^ st13 * st18 * st23 ^ ROTL64(st0 ^ st5 ^ st10 * st15 * st20, 1); \ + bc[4] = st4 ^ st9 ^ 
st14 * st19 * st24 ^ ROTL64(st1 ^ st6 ^ st11 * st16 * st21, 1); \ + \ + st0 ^= bc[4]; \ + st4 = ROTL64(st24 ^ bc[3], 14); \ + st3 = ROTL64(st18 ^ bc[2], 21); \ + st3 = bitselect(st3 ^ st0, st3, st4); + +__device__ ulonglong4 operator^(const ulonglong4 &a, const ulonglong4 &b) +{ + return(make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w)); +} + +#define MIX(vst) vst = vst ^ scratchpad[vst.x % scr_size] ^ scratchpad[vst.y % scr_size] ^ scratchpad[vst.z % scr_size] ^ scratchpad[vst.w % scr_size]; + +#define MIX_ALL MIX(vst0); MIX(vst4); MIX(vst8); MIX(vst12); MIX(vst16); MIX(vst20); + +__global__ +void wk(uint32_t* __restrict__ retnonce, const uint64_t* __restrict__ input, const ulonglong4* __restrict__ scratchpad, + const uint32_t scr_size, const uint32_t target, uint64_t startNonce) +{ + ulonglong4 vst0, vst4, vst8, vst12, vst16, vst20; + uint64_t bc[5]; + uint64_t st24, tmp1, tmp2; + + const uint64_t nonce = startNonce + (blockDim.x * blockIdx.x) + threadIdx.x; + vst0 = make_ulonglong4((nonce << 8) + (input[0] & 0xFF), input[1] & 0xFFFFFFFFFFFFFF00ULL, input[2], input[3]); + vst4 = make_ulonglong4(input[4], input[5], input[6], input[7]); + vst8 = make_ulonglong4(input[8], input[9], (input[10] & 0xFF) | 0x100, 0); + vst12 = make_ulonglong4(0, 0, 0, 0); + vst16 = make_ulonglong4(0x8000000000000000ULL, 0, 0, 0); + vst20 = make_ulonglong4(0, 0, 0, 0); + st24 = 0; + + RND(); + MIX_ALL; + + for(int i = 0; i < 22; i++) { + RND(); + MIX_ALL; + } + + LASTRND1(); + + vst4 = make_ulonglong4(1, 0, 0, 0); + vst8 = make_ulonglong4(0, 0, 0, 0); + vst12 = make_ulonglong4(0, 0, 0, 0); + vst16 = make_ulonglong4(0x8000000000000000ULL, 0, 0, 0); + vst20 = make_ulonglong4(0, 0, 0, 0); + st24 = 0; + + RND(); + MIX_ALL; + + #pragma unroll + for(int i = 0; i < 22; i++) { + RND(); + MIX_ALL; + } + + LASTRND2(); + + if((st3 >> 32) <= target) { + retnonce[0] = (uint32_t) nonce; + retnonce[1] = retnonce[0]; + } +} + +__host__ +void wildkeccak_kernel(const int thr_id, const uint32_t threads, const uint32_t startNounce, const uint2 target, uint32_t *resNonces) +{ + CUDA_SAFE_CALL(cudaMemsetAsync(d_retnonce[thr_id], 0xff, 2 * sizeof(uint32_t), kernel_stream[thr_id])); + + const uint32_t threadsperblock = WK_CUDAThreads; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + wk <<>> (d_retnonce[thr_id], d_input[thr_id], d_scratchpad[thr_id], + (uint32_t)(scratchpad_size >> 2), target.y, startNounce); + + cudaMemcpyAsync(resNonces, d_retnonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost, kernel_stream[thr_id]); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_wildkeccak(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *ptarget = work->target; + uint32_t throughput = 0; + uint64_t n, nonce, first; + uint8_t *pdata = (uint8_t*) work->data; + memcpy(&first, &pdata[1], 8); + n = nonce = first; + + if (!scratchpad_size || !h_scratchpad[thr_id]) { + if (h_scratchpad[thr_id]) + applog(LOG_ERR, "Scratchpad size is not set!"); + work->data[0] = 0; // invalidate + sleep(1); + return -EBUSY; + } + + if (!init[thr_id]) { + + if (device_config[thr_id]) { + sscanf(device_config[thr_id], "%ux%u", &WK_CUDABlocks, &WK_CUDAThreads); + gpulog(LOG_INFO, thr_id, "Using %u x %u kernel launch config, %u threads", + WK_CUDABlocks, WK_CUDAThreads, throughput); + } else { + throughput = cuda_default_throughput(thr_id, WK_CUDABlocks*WK_CUDAThreads); + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", 
throughput2intensity(throughput), throughput); + } + + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_LOG_ERROR(); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_input[thr_id], 88)); + CUDA_SAFE_CALL(cudaMalloc(&d_retnonce[thr_id], 2*sizeof(uint32_t))); + + int status = (int) cudaMalloc(&d_scratchpad[thr_id], WILD_KECCAK_SCRATCHPAD_BUFFSIZE); + if (status != cudaSuccess) { + gpulog(LOG_ERR, thr_id, "Unable to allocate device memory, %u MB, err %d", + (uint32_t) (WILD_KECCAK_SCRATCHPAD_BUFFSIZE/(1024*1024)), status); + exit(-ENOMEM); + } + + cudaStreamCreate(&bufpad_stream[thr_id]); + cudaStreamCreate(&kernel_stream[thr_id]); + + CUDA_SAFE_CALL(cudaMemcpyAsync(d_scratchpad[thr_id], h_scratchpad[thr_id], scratchpad_size << 3, cudaMemcpyHostToDevice, bufpad_stream[thr_id])); + + init[thr_id] = true; + } + + throughput = WK_CUDABlocks * WK_CUDAThreads; + + cudaMemcpy(d_input[thr_id], pdata, 88, cudaMemcpyHostToDevice); +// cudaMemset(d_retnonce[thr_id], 0xFF, 2*sizeof(uint32_t)); + + if (h_scratchpad[thr_id]) { + cudaStreamSynchronize(bufpad_stream[thr_id]); + } + + do { +// const uint32_t blocks = WK_CUDABlocks, threads = WK_CUDAThreads; +// const dim3 block(blocks); +// const dim3 thread(threads); + uint32_t h_retnonce[2] = { UINT32_MAX, UINT32_MAX }; + uint2 target = make_uint2(ptarget[6], ptarget[7]); + + wildkeccak_kernel(thr_id, throughput, (uint32_t) nonce, target, h_retnonce); + /* + wk <<>> (d_retnonce[thr_id], d_input[thr_id], d_scratchpad[thr_id], + (uint32_t)(scratchpad_size >> 2), nonce, ptarget[7]); + */ + + *hashes_done = (unsigned long) (n - first + throughput); + + cudaStreamSynchronize(kernel_stream[thr_id]); + if(h_retnonce[0] != UINT32_MAX) { + uint8_t _ALIGN(64) cpuhash[32]; + uint32_t* vhash = (uint32_t*) cpuhash; + uint64_t nonce64; + memcpy(&pdata[1], &h_retnonce[0], sizeof(uint32_t)); + memcpy(&nonce64, &pdata[1], 8); + wildkeccak_hash(cpuhash, pdata, pscratchpad_buff, scratchpad_size); + if (!cpuhash[31] && vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work_set_target_ratio(work, vhash); + //applog_hex(pdata, 84); + //applog_hex(cpuhash, 32); + //applog_hex(ptarget, 32); + memcpy(work->nonces, &nonce64, 8); + if (n + throughput > max_nonce) { + *hashes_done = (unsigned long) (max_nonce - first); + } + work->valid_nonces = 1; + return 1; + } else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", h_retnonce[0]); + } + } + + if (n + throughput >= max_nonce) { + n = max_nonce; + break; + } + + n += throughput; + nonce += throughput; + + } while(!work_restart[thr_id].restart); + + *hashes_done = (unsigned long) (n - first + 1); + return 0; +} + +void wildkeccak_scratchpad_need_update(uint64_t* pscratchpad_buff) +{ + for(int i = 0; i < opt_n_threads; i++) { + h_scratchpad[i] = pscratchpad_buff; + if (init[i]) { + gpulog(LOG_DEBUG, i, "Starting scratchpad update..."); + cudaMemcpyAsync(d_scratchpad[i], h_scratchpad[i], scratchpad_size << 3, cudaMemcpyHostToDevice, bufpad_stream[i]); + work_restart[i].restart = true; + } + } +} + +void free_wildkeccak(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_scratchpad[thr_id]); + cudaFree(d_input[thr_id]); + cudaFree(d_retnonce[thr_id]); + + cudaStreamDestroy(bufpad_stream[thr_id]); + 
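+	// also release the stream used for the wk kernel launches and the
+	// asynchronous copy of the result nonces back to the host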
cudaStreamDestroy(kernel_stream[thr_id]); + + cudaDeviceSynchronize(); + + init[thr_id] = false; +} diff --git a/crypto/wildkeccak.h b/crypto/wildkeccak.h new file mode 100644 index 0000000000..287ed15323 --- /dev/null +++ b/crypto/wildkeccak.h @@ -0,0 +1,10 @@ + +#define WILD_KECCAK_SCRATCHPAD_BUFFSIZE 1ULL << 29 +#define WILD_KECCAK_ADDENDUMS_ARRAY_SIZE 10 + +extern uint64_t scratchpad_size; + +extern uint32_t WK_CUDABlocks, WK_CUDAThreads; + +void wildkeccak_scratchpad_need_update(uint64_t* pscratchpad_buff); + diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp new file mode 100644 index 0000000000..433caa7d90 --- /dev/null +++ b/crypto/xmr-rpc.cpp @@ -0,0 +1,1334 @@ +/** + * XMR RPC 2.0 Stratum and BBR Scratchpad + * tpruvot@github - October 2016 - Under GPLv3 Licence + */ + +#include +#include +#include +#include +#include // mkdir + +#include + +#ifdef _MSC_VER +#include "mman.h" // mmap +#include // _mkdir +#define chdir(x) _chdir(x) +#define mkdir(x) _mkdir(x) +#define getcwd(d,sz) _getcwd(d,sz) +#define unlink(x) _unlink(x) +#define PATH_MAX MAX_PATH +#else +#include // mmap +#endif + +#if defined(__APPLE__) && !defined(MAP_HUGETLB) +#define MAP_ANONYMOUS MAP_ANON +#define MAP_HUGETLB 0 +#define MAP_POPULATE 0 +#define MADV_HUGEPAGE 0 +#endif + +#ifndef MADV_HUGEPAGE +#define MADV_HUGEPAGE 0 +#endif + +#ifndef PRIu64 +#define PRIu64 "I64u" +#endif + +#include +#include "xmr-rpc.h" +#include "wildkeccak.h" + +double target_to_diff_rpc2(uint32_t* target) +{ + // unlike other algos, xmr diff is very low + if (opt_algo == ALGO_CRYPTONIGHT && target[7]) { + // simplified to get 1.0 for 1000 + return (double) (UINT32_MAX / target[7]) / 1000; + } + else if (opt_algo == ALGO_CRYPTOLIGHT && target[7]) { + return (double) (UINT32_MAX / target[7]) / 1000; + } + else if (opt_algo == ALGO_WILDKECCAK) { + return target_to_diff(target) * 1000; + } + return target_to_diff(target); // util.cpp +} + +extern struct stratum_ctx stratum; + +bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen); + +pthread_mutex_t rpc2_job_lock; +pthread_mutex_t rpc2_work_lock; +pthread_mutex_t rpc2_login_lock; +//pthread_mutex_t rpc2_getscratchpad_lock; + +char* opt_scratchpad_url = NULL; +uint64_t* pscratchpad_buff = NULL; + +// hide addendums flood on start +static bool opt_quiet_start = true; + +static const char * pscratchpad_local_cache = NULL; +static const char cachedir_suffix[] = "boolberry"; /* scratchpad cache saved as ~/.cache/boolberry/scratchpad.bin */ +static char scratchpad_file[PATH_MAX]; +static time_t prev_save = 0; +static struct scratchpad_hi current_scratchpad_hi; +static struct addendums_array_entry add_arr[WILD_KECCAK_ADDENDUMS_ARRAY_SIZE]; + +static char *rpc2_job_id = NULL; +static char *rpc2_blob = NULL; +static uint32_t rpc2_target = 0; +static size_t rpc2_bloblen = 0; +static struct work rpc2_work; + +static char rpc2_id[64] = { 0 }; +static uint64_t last_found_nonce = 0; + +static const char* get_json_string_param(const json_t *val, const char* param_name) +{ + json_t *tmp; + tmp = json_object_get(val, param_name); + if(!tmp) { + return NULL; + } + return json_string_value(tmp); +} + +static size_t hex2bin_len(unsigned char *p, const char *hexstr, size_t len) +{ + char hex_byte[3]; + char *ep; + size_t count = 0; + + hex_byte[2] = '\0'; + + while (*hexstr && len) { + if (!hexstr[1]) { + applog(LOG_ERR, "hex2bin str truncated"); + return 0; + } + hex_byte[0] = hexstr[0]; + hex_byte[1] = hexstr[1]; + *p = (unsigned char) strtol(hex_byte, &ep, 16); + if (*ep) { + 
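+			// strtol() stopped before consuming both digits: non-hex character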
applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); + return 0; + } + count++; + p++; + hexstr += 2; + len--; + } + + return (/*len == 0 &&*/ *hexstr == 0) ? count : 0; +} + +static bool parse_height_info(const json_t *hi_section, struct scratchpad_hi* phi) +{ + unsigned char prevhash[32] = { 0 }; + const char* block_id; + uint64_t hi_h; + size_t len; + + if(!phi || !hi_section) { + applog(LOG_ERR, "parse_height_info: wrong params"); + return false; + } + + json_t *height = json_object_get(hi_section, "height"); + if(!height) { + applog(LOG_ERR, "JSON inval hi, no height param"); + goto err_out; + } + + if(!json_is_integer(height)) { + applog(LOG_ERR, "JSON inval hi: height is not integer "); + goto err_out; + } + + hi_h = (uint64_t)json_integer_value(height); + if(!hi_h) { + applog(LOG_ERR, "JSON inval hi: height is 0"); + goto err_out; + } + + block_id = get_json_string_param(hi_section, "block_id"); + if(!block_id) { + applog(LOG_ERR, "JSON inval hi: block_id not found "); + goto err_out; + } + + len = hex2bin_len(prevhash, block_id, 32); + if(len != 32) { + applog(LOG_ERR, "JSON inval hi: block_id wrong len %d", len); + goto err_out; + } + + phi->height = hi_h; + memcpy(phi->prevhash, prevhash, 32); + + return true; +err_out: + return false; +} + +static void reset_scratchpad(void) +{ + current_scratchpad_hi.height = 0; + scratchpad_size = 0; + //unlink(scratchpad_file); +} + +static bool patch_scratchpad_with_addendum(uint64_t global_add_startpoint, uint64_t* padd_buff, size_t count/*uint64 units*/) +{ + for(size_t i = 0; i < count; i += 4) { + uint64_t global_offset = (padd_buff[i]%(global_add_startpoint/4))*4; + for(size_t j = 0; j != 4; j++) + pscratchpad_buff[global_offset + j] ^= padd_buff[i + j]; + } + return true; +} + +static bool apply_addendum(uint64_t* padd_buff, size_t count/*uint64 units*/) +{ + if(WILD_KECCAK_SCRATCHPAD_BUFFSIZE <= (scratchpad_size + count)*8 ) { + applog(LOG_ERR, "!!!!!!! WILD_KECCAK_SCRATCHPAD_BUFFSIZE overflowed !!!!!!!! please increase this constant! 
"); + return false; + } + + if(!patch_scratchpad_with_addendum(scratchpad_size, padd_buff, count)) { + applog(LOG_ERR, "patch_scratchpad_with_addendum is broken, resetting scratchpad"); + reset_scratchpad(); + return false; + } + for(int k = 0; k != count; k++) + pscratchpad_buff[scratchpad_size+k] = padd_buff[k]; + + scratchpad_size += count; + + return true; +} + +static bool pop_addendum(struct addendums_array_entry* entry) +{ + if(!entry) + return false; + + if(!entry->add_size || !entry->prev_hi.height) { + applog(LOG_ERR, "wrong parameters"); + return false; + } + patch_scratchpad_with_addendum(scratchpad_size - entry->add_size, &pscratchpad_buff[scratchpad_size - entry->add_size], (size_t) entry->add_size); + scratchpad_size = scratchpad_size - entry->add_size; + memcpy(¤t_scratchpad_hi, &entry->prev_hi, sizeof(entry->prev_hi)); + + memset(entry, 0, sizeof(struct addendums_array_entry)); + return true; +} + +// playback scratchpad addendums for whole add_arr +static bool revert_scratchpad() +{ + size_t p = 0; + size_t i = 0; + size_t arr_size = ARRAY_SIZE(add_arr); + + for(p=0; p != arr_size; p++) { + i = arr_size-(p+1); + if(!add_arr[i].prev_hi.height) + continue; + pop_addendum(&add_arr[i]); + } + return true; +} + +static bool push_addendum_info(struct scratchpad_hi* pprev_hi, uint64_t size /* uint64 units count*/) +{ + size_t i = 0; + size_t arr_size = ARRAY_SIZE(add_arr); + + // Find last free entry + for(i=0; i != arr_size; i++) { + if(!add_arr[i].prev_hi.height) + break; + } + + if(i >= arr_size) { + // Shift array + memmove(&add_arr[0], &add_arr[1], (arr_size-1)*sizeof(add_arr[0])); + i = arr_size - 1; + } + add_arr[i].prev_hi = *pprev_hi; + add_arr[i].add_size = size; + + return true; +} + +static bool addendum_decode(const json_t *addm) +{ + struct scratchpad_hi hi; + unsigned char prevhash[32]; + uint64_t* padd_buff; + uint64_t old_height; + + json_t* hi_section = json_object_get(addm, "hi"); + if (!hi_section) { + //applog(LOG_ERR, "JSON addms field not found"); + //return false; + return true; + } + + if(!parse_height_info(hi_section, &hi)) { + return false; + } + + const char* prev_id_str = get_json_string_param(addm, "prev_id"); + if(!prev_id_str) { + applog(LOG_ERR, "JSON prev_id is not a string"); + return false; + } + if(!hex2bin(prevhash, prev_id_str, 32)) { + applog(LOG_ERR, "JSON prev_id is not valid hex string"); + return false; + } + + if(current_scratchpad_hi.height != hi.height -1) + { + if(current_scratchpad_hi.height > hi.height -1) { + //skip low scratchpad + applog(LOG_ERR, "addendum with hi.height=%lld skiped since current_scratchpad_hi.height=%lld", hi.height, current_scratchpad_hi.height); + return true; + } + + //TODO: ADD SPLIT HANDLING HERE + applog(LOG_ERR, "JSON height in addendum-1 (%lld-1) missmatched with current_scratchpad_hi.height(%lld), reverting scratchpad and re-login", + hi.height, current_scratchpad_hi.height); + revert_scratchpad(); + //init re-login + strcpy(rpc2_id, ""); + return false; + } + + if(memcmp(prevhash, current_scratchpad_hi.prevhash, 32)) { + //TODO: ADD SPLIT HANDLING HERE + applog(LOG_ERR, "JSON prev_id in addendum missmatched with current_scratchpad_hi.prevhash"); + return false; + } + + const char* addm_hexstr = get_json_string_param(addm, "addm"); + if(!addm_hexstr) { + applog(LOG_ERR, "JSON prev_id in addendum missmatched with current_scratchpad_hi.prevhash"); + return false; + } + size_t add_len = strlen(addm_hexstr); + if(add_len%64) { + applog(LOG_ERR, "JSON wrong addm hex str len"); + return false; + } + 
padd_buff = (uint64_t*) calloc(1, add_len/2); + if (!padd_buff) { + applog(LOG_ERR, "out of memory, wanted %zu", add_len/2); + return false; + } + + if(!hex2bin((unsigned char*)padd_buff, addm_hexstr, add_len/2)) { + applog(LOG_ERR, "JSON wrong addm hex str len"); + goto err_out; + } + + if(!apply_addendum(padd_buff, add_len/16)) { + applog(LOG_ERR, "JSON Failed to apply_addendum!"); + goto err_out; + } + free(padd_buff); + + push_addendum_info(¤t_scratchpad_hi, add_len/16); + old_height = current_scratchpad_hi.height; + current_scratchpad_hi = hi; + + if (!opt_quiet && !opt_quiet_start) + applog(LOG_BLUE, "ADDENDUM APPLIED: Block %lld", (long long) current_scratchpad_hi.height); + + return true; +err_out: + free(padd_buff); + return false; +} + +static bool addendums_decode(const json_t *job) +{ + json_t* paddms = json_object_get(job, "addms"); + if (!paddms) { + //applog(LOG_ERR, "JSON addms field not found"); + //return false; + return true; + } + + if(!json_is_array(paddms)) { + applog(LOG_ERR, "JSON addms field is not array"); + return false; + } + + size_t add_sz = json_array_size(paddms); + for (size_t i = 0; i < add_sz; i++) + { + json_t *addm = json_array_get(paddms, i); + if (!addm) { + applog(LOG_ERR, "Internal error: failed to get addm"); + return false; + } + if(!addendum_decode(addm)) + return false; + } + + return true; +} + +bool rpc2_job_decode(const json_t *job, struct work *work) +{ + json_t *tmp; + size_t blobLen; + const char *job_id; + const char *hexblob; + + tmp = json_object_get(job, "job_id"); + if (!tmp) { + applog(LOG_ERR, "JSON inval job id"); + goto err_out; + } + + if(opt_algo == ALGO_WILDKECCAK && !addendums_decode(job)) { + applog(LOG_ERR, "JSON failed to process addendums"); + goto err_out; + } + // now allow ADDENDUM notices (after the init) + opt_quiet_start = false; + + job_id = json_string_value(tmp); + tmp = json_object_get(job, "blob"); + if (!tmp) { + applog(LOG_ERR, "JSON inval blob"); + goto err_out; + } + hexblob = json_string_value(tmp); + blobLen = strlen(hexblob); + if (blobLen % 2 != 0 || ((blobLen / 2) < 40 && blobLen != 0) || (blobLen / 2) > 128) + { + applog(LOG_ERR, "JSON invalid blob length"); + goto err_out; + } + + if (blobLen != 0) + { + pthread_mutex_lock(&rpc2_job_lock); + char *blob = (char*) calloc(1, blobLen / 2); + if (!hex2bin(blob, hexblob, blobLen / 2)) + { + applog(LOG_ERR, "JSON inval blob"); + pthread_mutex_unlock(&rpc2_job_lock); + goto err_out; + } + if (rpc2_blob) { + free(rpc2_blob); + } + rpc2_bloblen = blobLen / 2; + rpc2_blob = (char*) malloc(rpc2_bloblen); + memcpy(rpc2_blob, blob, blobLen / 2); + + free(blob); + + uint32_t target; + jobj_binary(job, "target", &target, 4); + if(rpc2_target != target) { + double difficulty = (((double) UINT32_MAX) / target); + stratum.job.diff = difficulty; + rpc2_target = target; + } + + if (rpc2_job_id) { + // reset job share counter + if (strcmp(rpc2_job_id, job_id)) stratum.job.shares_count = 0; + free(rpc2_job_id); + } + rpc2_job_id = strdup(job_id); + pthread_mutex_unlock(&rpc2_job_lock); + } + + if(work) + { + if (!rpc2_blob) { + applog(LOG_ERR, "Requested work before work was received"); + goto err_out; + } + memcpy(work->data, rpc2_blob, rpc2_bloblen); + memset(work->target, 0xff, sizeof(work->target)); + work->target[7] = rpc2_target; + work->targetdiff = target_to_diff_rpc2(work->target); + + snprintf(work->job_id, sizeof(work->job_id), "%s", rpc2_job_id); + } + + if (opt_algo == ALGO_WILDKECCAK) + wildkeccak_scratchpad_need_update(pscratchpad_buff); + return true; + 
+err_out: + return false; +} + +extern struct work _ALIGN(64) g_work; +extern volatile time_t g_work_time; +extern bool submit_old; + +bool rpc2_stratum_job(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + bool ret = false; + pthread_mutex_lock(&rpc2_work_lock); + ret = rpc2_job_decode(params, &rpc2_work); + // update miner threads work + ret = ret && rpc2_stratum_gen_work(sctx, &g_work); + restart_threads(); + pthread_mutex_unlock(&rpc2_work_lock); + return ret; +} + +bool rpc2_stratum_gen_work(struct stratum_ctx *sctx, struct work *work) +{ +// pthread_mutex_lock(&rpc2_work_lock); + memcpy(work, &rpc2_work, sizeof(struct work)); + if (stratum_diff != sctx->job.diff) { + char sdiff[32] = { 0 }; + stratum_diff = sctx->job.diff; + if (opt_showdiff && work->targetdiff != stratum_diff) + snprintf(sdiff, 32, " (%g)", work->targetdiff); + if (stratum_diff >= 1e6) + applog(LOG_WARNING, "Stratum difficulty set to %.1f M%s", stratum_diff/1e6, sdiff); + else + applog(LOG_WARNING, "Stratum difficulty set to %.0f%s", stratum_diff, sdiff); + } + if (work->target[7] != rpc2_target) { + work->target[7] = rpc2_target; + work->targetdiff = target_to_diff_rpc2(work->target); + g_work_time = 0; + restart_threads(); + } +// pthread_mutex_unlock(&rpc2_work_lock); + return (work->data[0] != 0); +} + +#define JSON_SUBMIT_BUF_LEN 512 +// called by submit_upstream_work() +bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work) +{ + char _ALIGN(64) s[JSON_SUBMIT_BUF_LEN]; + uint8_t _ALIGN(64) hash[32]; + uint8_t _ALIGN(64) data[88]; + char *noncestr, *hashhex; + int idnonce = work->submit_nonce_id; + + memcpy(&data[0], work->data, 88); + + if (opt_algo == ALGO_WILDKECCAK) { + // 64 bits nonce + memcpy(&data[1], work->nonces, 8); + // pass if the previous hash is not the current previous hash + if(!submit_old && memcmp(&work->data[3], &g_work.data[3], 28)) { + if (opt_debug) applog(LOG_DEBUG, "stale work detected"); + pool->stales_count++; + return false; + } + noncestr = bin2hex((unsigned char*) &data[1], 8); + // "nonce":"5794ec8000000000" => 0x0000000080ec9457 + memcpy(&last_found_nonce, work->nonces, 8); + wildkeccak_hash(hash, data, NULL, 0); + work_set_target_ratio(work, (uint32_t*) hash); + } + + else if (opt_algo == ALGO_CRYPTOLIGHT) { + int variant = 1; + uint32_t nonce = work->nonces[idnonce]; + noncestr = bin2hex((unsigned char*) &nonce, 4); + last_found_nonce = nonce; + //if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) + // variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1; + cryptolight_hash_variant(hash, data, 76, variant); + work_set_target_ratio(work, (uint32_t*) hash); + } + + else if (opt_algo == ALGO_CRYPTONIGHT) { + int variant = 0; + uint32_t nonce = work->nonces[idnonce]; + noncestr = bin2hex((unsigned char*) &nonce, 4); + last_found_nonce = nonce; + if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) + variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1; + cryptonight_hash_variant(hash, data, 76, variant); + work_set_target_ratio(work, (uint32_t*) hash); + } + + if (hash[31] != 0) + return false; // prevent bad hashes + hashhex = bin2hex((unsigned char*)hash, 32); + + snprintf(s, sizeof(s), "{\"method\":\"submit\",\"params\":" + "{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"}, \"id\":%u}", + rpc2_id, work->job_id, noncestr, hashhex, stratum.job.shares_count + 10); + + free(hashhex); + free(noncestr); + + gettimeofday(&stratum.tv_submit, NULL); + + 
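+	// the submit line sent below looks like this (illustrative values only):
+	//   {"method":"submit","params":{"id":"<session id>","job_id":"878",
+	//    "nonce":"5794ec8000000000","result":"<32-byte hash hex>"}, "id":11}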
if(!stratum_send_line(&stratum, s)) { + applog(LOG_ERR, "%s stratum_send_line failed", __func__); + return false; + } + + //stratum.sharediff = target_to_diff_rpc2((uint32_t*)hash); + stratum.sharediff = work->sharediff[idnonce]; + + return true; +} + +bool rpc2_login_decode(const json_t *val) +{ + const char *id; + const char *s; + json_t *res = json_object_get(val, "result"); + if(!res) { + applog(LOG_ERR, "JSON invalid result"); + goto err_out; + } + + json_t *tmp; + tmp = json_object_get(res, "id"); + if(!tmp) { + applog(LOG_ERR, "JSON inval id"); + goto err_out; + } + id = json_string_value(tmp); + if(!id) { + applog(LOG_ERR, "JSON id is not a string"); + goto err_out; + } + + strncpy(rpc2_id, id, sizeof(rpc2_id)-1); + + if(opt_debug) + applog(LOG_DEBUG, "Auth id: %s", id); + + tmp = json_object_get(res, "status"); + if(!tmp) { + applog(LOG_ERR, "JSON inval status"); + goto err_out; + } + s = json_string_value(tmp); + if(!s) { + applog(LOG_ERR, "JSON status is not a string"); + goto err_out; + } + if(strcmp(s, "OK")) { + applog(LOG_ERR, "JSON returned status \"%s\"", s); + goto err_out; + } + + return true; + +err_out: + return false; +} + +bool store_scratchpad_to_file(bool do_fsync) +{ + char file_name_buff[PATH_MAX] = { 0 }; + FILE *fp; + int ret; + + if(opt_algo != ALGO_WILDKECCAK) return true; + if(!scratchpad_size || !pscratchpad_buff) return true; + + snprintf(file_name_buff, sizeof(file_name_buff), "%s.tmp", pscratchpad_local_cache); + unlink(file_name_buff); + fp = fopen(file_name_buff, "wbx"); + if (!fp) { + applog(LOG_ERR, "failed to create file %s: %s", file_name_buff, strerror(errno)); + return false; + } + + struct scratchpad_file_header sf = { 0 }; + memcpy(sf.add_arr, add_arr, sizeof(sf.add_arr)); + sf.current_hi = current_scratchpad_hi; + sf.scratchpad_size = scratchpad_size; + + if ((fwrite(&sf, sizeof(sf), 1, fp) != 1) || + (fwrite(pscratchpad_buff, 8, (size_t) scratchpad_size, fp) != scratchpad_size)) { + applog(LOG_ERR, "failed to write file %s: %s", file_name_buff, strerror(errno)); + fclose(fp); + unlink(file_name_buff); + return false; + } + fflush(fp); + /*if (do_fsync) { + if (fsync(fileno(fp)) == -1) { + applog(LOG_ERR, "failed to fsync file %s: %s", file_name_buff, strerror(errno)); + fclose(fp); + unlink(file_name_buff); + return false; + } + }*/ + if (fclose(fp) == EOF) { + applog(LOG_ERR, "failed to write file %s: %s", file_name_buff, strerror(errno)); + unlink(file_name_buff); + return false; + } + ret = rename(file_name_buff, pscratchpad_local_cache); + if (ret == -1) { + applog(LOG_ERR, "failed to rename %s to %s: %s", + file_name_buff, pscratchpad_local_cache, strerror(errno)); + unlink(file_name_buff); + return false; + } + applog(LOG_DEBUG, "saved scratchpad to %s (%zu+%zu bytes)", pscratchpad_local_cache, + sizeof(struct scratchpad_file_header), (size_t)scratchpad_size * 8); + return true; +} + +/* TODO: repetitive error+log spam handling */ +bool load_scratchpad_from_file(const char *fname) +{ + FILE *fp; + long flen; + + if(opt_algo != ALGO_WILDKECCAK) return true; + + fp = fopen(fname, "rb"); + if (!fp) { + if (errno != ENOENT) { + applog(LOG_ERR, "failed to load %s: %s", fname, strerror(errno)); + } + return false; + } + + struct scratchpad_file_header fh = { 0 }; + if ((fread(&fh, sizeof(fh), 1, fp) != 1)) { + applog(LOG_ERR, "read error from %s: %s", fname, strerror(errno)); + fclose(fp); + return false; + } + + if ((fh.scratchpad_size*8 > (WILD_KECCAK_SCRATCHPAD_BUFFSIZE)) ||(fh.scratchpad_size%4)) { + applog(LOG_ERR, "file %s size invalid 
(%" PRIu64 "), max=%zu", + fname, fh.scratchpad_size*8, WILD_KECCAK_SCRATCHPAD_BUFFSIZE); + fclose(fp); + return false; + } + + if (fread(pscratchpad_buff, 8, (size_t) fh.scratchpad_size, fp) != fh.scratchpad_size) { + applog(LOG_ERR, "read error from %s: %s", fname, strerror(errno)); + fclose(fp); + return false; + } + + scratchpad_size = fh.scratchpad_size; + current_scratchpad_hi = fh.current_hi; + memcpy(&add_arr[0], &fh.add_arr[0], sizeof(fh.add_arr)); + flen = (long)scratchpad_size*8; + + if (!opt_quiet) { + applog(LOG_INFO, "Scratchpad size %ld kB at block %" PRIu64, flen/1024, current_scratchpad_hi.height); + } + + fclose(fp); + prev_save = time(NULL); + + return true; +} + +bool dump_scratchpad_to_file_debug() +{ + char file_name_buff[1024] = { 0 }; + if(opt_algo != ALGO_WILDKECCAK) return true; + + snprintf(file_name_buff, sizeof(file_name_buff), "scratchpad_%" PRIu64 "_%llx.scr", + current_scratchpad_hi.height, (long long) last_found_nonce); + + /* do not bother rewriting if it exists already */ + + FILE *fp = fopen(file_name_buff, "w"); + if(!fp) { + applog(LOG_WARNING, "failed to open file %s: %s", file_name_buff, strerror(errno)); + return false; + } + if (fwrite(pscratchpad_buff, 8, (size_t) scratchpad_size, fp) != scratchpad_size) { + applog(LOG_ERR, "failed to write file %s: %s", file_name_buff, strerror(errno)); + fclose(fp); + return false; + } + if (fclose(fp) == EOF) { + applog(LOG_ERR, "failed to write file %s: %s", file_name_buff, strerror(errno)); + return false; + } + + fclose(fp); + return true; +} + +static bool try_mkdir_chdir(const char *dirn) +{ + if (chdir(dirn) == -1) { + if (errno == ENOENT) { +#ifdef WIN32 + if (mkdir(dirn) == -1) { +#else + if (mkdir(dirn, 0700) == -1) { +#endif + applog(LOG_ERR, "mkdir failed: %s", strerror(errno)); + return false; + } + if (chdir(dirn) == -1) { + applog(LOG_ERR, "chdir failed: %s", strerror(errno)); + return false; + } + } else { + applog(LOG_ERR, "chdir failed: %s", strerror(errno)); + return false; + } + } + return true; +} + +static size_t curl_write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + size_t written = fwrite(ptr, size, nmemb, stream); + return written; +} + +static bool download_inital_scratchpad(const char* path_to, const char* url) +{ + CURL *curl; + CURLcode res; + char curl_error_buff[CURL_ERROR_SIZE] = { 0 }; + FILE *fp = fopen(path_to,"wb"); + if (!fp) { + applog(LOG_ERR, "Failed to create file %s error %d", path_to, errno); + return false; + } + + applog(LOG_INFO, "Downloading scratchpad...."); + + curl_global_cleanup(); + res = curl_global_init(CURL_GLOBAL_ALL); + if (res != CURLE_OK) { + applog(LOG_WARNING, "curl curl_global_init error: %d", (int) res); + } + + curl = curl_easy_init(); + if (!curl) { + applog(LOG_INFO, "Failed to curl_easy_init."); + fclose(fp); + unlink(path_to); + return false; + } + + if (opt_protocol && opt_debug) { + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + } + if (opt_proxy) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 300); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_error_buff); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write_data); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp); + //curl_easy_setopt(curl, CURLOPT_SSLVERSION, CURL_SSLVERSION_TLSv1_2); + 
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0); + if (opt_cert) { + curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); + } else { + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0); + } + + res = curl_easy_perform(curl); + if (res != CURLE_OK) { + if (res == CURLE_OUT_OF_MEMORY) { + applog(LOG_ERR, "Failed to download file, not enough memory!"); + applog(LOG_ERR, "curl error: %s", curl_error_buff); + } else { + applog(LOG_ERR, "Failed to download file, error: %s", curl_error_buff); + } + } else { + applog(LOG_INFO, "Scratchpad downloaded."); + } + /* always cleanup */ + curl_easy_cleanup(curl); + + fflush(fp); + fclose(fp); + + if (res != CURLE_OK) { + unlink(path_to); + return false; + } + return true; +} + +#ifndef WIN32 + +void GetScratchpad() +{ + const char *phome_var_name = "HOME"; + size_t sz = WILD_KECCAK_SCRATCHPAD_BUFFSIZE; + char cachedir[PATH_MAX]; + + if(!getenv(phome_var_name)) { + applog(LOG_ERR, "$%s not set", phome_var_name); + exit(1); + } + else if(!try_mkdir_chdir(getenv(phome_var_name))) { + exit(1); + } + + if(!try_mkdir_chdir(".cache")) exit(1); + + if(!try_mkdir_chdir(cachedir_suffix)) exit(1); + + if(getcwd(cachedir, sizeof(cachedir) - 22) == NULL) { + applog(LOG_ERR, "getcwd failed: %s", strerror(errno)); + exit(1); + } + + snprintf(scratchpad_file, sizeof(scratchpad_file), "%s/scratchpad.bin", cachedir); + pscratchpad_local_cache = scratchpad_file; + + if (!opt_quiet) + applog(LOG_INFO, "Scratchpad file %s", pscratchpad_local_cache); + + pscratchpad_buff = (uint64_t*) mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0); + if(pscratchpad_buff == MAP_FAILED) + { + if(opt_debug) applog(LOG_DEBUG, "hugetlb not available"); + pscratchpad_buff = (uint64_t*) malloc(sz); + if(!pscratchpad_buff) { + applog(LOG_ERR, "Scratchpad allocation failed"); + exit(1); + } + } else { + if(opt_debug) applog(LOG_DEBUG, "using hugetlb"); + } + madvise(pscratchpad_buff, sz, MADV_RANDOM | MADV_WILLNEED | MADV_HUGEPAGE); + mlock(pscratchpad_buff, sz); + + if(!load_scratchpad_from_file(pscratchpad_local_cache)) + { + if(!opt_scratchpad_url) { + applog(LOG_ERR, "Scratchpad URL not set. Please specify correct scratchpad url by -k or --scratchpad option"); + exit(1); + } + if(!download_inital_scratchpad(pscratchpad_local_cache, opt_scratchpad_url)) { + applog(LOG_ERR, "Scratchpad not found and not downloaded. 
Please specify correct scratchpad url by -k or --scratchpad option"); + exit(1); + } + if(!load_scratchpad_from_file(pscratchpad_local_cache)) { + applog(LOG_ERR, "Failed to load scratchpad data after downloading, probably broken scratchpad link, please restart miner with correct inital scratcpad link(-k or --scratchpad )"); + unlink(pscratchpad_local_cache); + exit(1); + } + } +} + +#else /* Windows */ + +void GetScratchpad() +{ + bool scratchpad_need_update = false; + size_t sz = WILD_KECCAK_SCRATCHPAD_BUFFSIZE; + const char* phome_var_name = "LOCALAPPDATA"; + char cachedir[PATH_MAX]; + + if(!getenv(phome_var_name)) { + applog(LOG_ERR, "%s env var is not set", phome_var_name); + exit(1); + } + else if(!try_mkdir_chdir(getenv(phome_var_name))) { + exit(1); + } + + if(!try_mkdir_chdir(".cache")) + exit(1); + + if(!try_mkdir_chdir(cachedir_suffix)) + exit(1); + + if(getcwd(cachedir, sizeof(cachedir) - 22) == NULL) { + applog(LOG_ERR, "getcwd failed: %s", strerror(errno)); + exit(1); + } + + snprintf(scratchpad_file, sizeof(scratchpad_file), "%s\\scratchpad.bin", cachedir); + pscratchpad_local_cache = scratchpad_file; + + if (!opt_quiet) + applog(LOG_INFO, "Scratchpad file %s", pscratchpad_local_cache); + + if (pscratchpad_buff) { + reset_scratchpad(); + wildkeccak_scratchpad_need_update(NULL); + scratchpad_need_update = true; + free(pscratchpad_buff); + pscratchpad_buff = NULL; + } + + pscratchpad_buff = (uint64_t*) malloc(sz); + if(!pscratchpad_buff) { + applog(LOG_ERR, "Scratchpad allocation failed"); + exit(1); + } + + if(!load_scratchpad_from_file(pscratchpad_local_cache)) + { + if(!opt_scratchpad_url) { + applog(LOG_ERR, "Scratchpad URL not set. Please specify correct scratchpad url by -k or --scratchpad option"); + exit(1); + } + free(pscratchpad_buff); + pscratchpad_buff = NULL; + if(!download_inital_scratchpad(pscratchpad_local_cache, opt_scratchpad_url)) { + applog(LOG_ERR, "Scratchpad not found and not downloaded. 
Please specify correct scratchpad url by -k or --scratchpad option"); + exit(1); + } + pscratchpad_buff = (uint64_t*) malloc(sz); + if(!pscratchpad_buff) { + applog(LOG_ERR, "Scratchpad allocation failed"); + exit(1); + } + if(!load_scratchpad_from_file(pscratchpad_local_cache)) { + applog(LOG_ERR, "Failed to load scratchpad data after downloading, probably broken scratchpad link, please restart miner with correct inital scratcpad link(-k or --scratchpad )"); + unlink(pscratchpad_local_cache); + exit(1); + } + } + + if (scratchpad_need_update) + wildkeccak_scratchpad_need_update(pscratchpad_buff); +} + +#endif /* GetScratchpad() linux */ + +static bool rpc2_getfullscratchpad_decode(const json_t *val) +{ + const char* status; + const char* scratch_hex; + size_t len; + json_t *hi; + json_t *res = json_object_get(val, "result"); + if(!res) { + applog(LOG_ERR, "JSON invalid result in rpc2_getfullscratchpad_decode"); + goto err_out; + } + + //check status + status = get_json_string_param(res, "status"); + if (!status ) { + applog(LOG_ERR, "JSON status is not a string"); + goto err_out; + } + + if(strcmp(status, "OK")) { + applog(LOG_ERR, "JSON returned status \"%s\"", status); + goto err_out; + } + + //parse scratchpad + scratch_hex = get_json_string_param(res, "scratchpad_hex"); + if (!scratch_hex) { + applog(LOG_ERR, "JSON scratch_hex is not a string"); + goto err_out; + } + + len = hex2bin_len((unsigned char*)pscratchpad_buff, scratch_hex, WILD_KECCAK_SCRATCHPAD_BUFFSIZE); + if (!len) { + applog(LOG_ERR, "JSON scratch_hex is not valid hex"); + goto err_out; + } + + if (len%8 || len%32) { + applog(LOG_ERR, "JSON scratch_hex is not valid size=%d bytes", len); + goto err_out; + } + + //parse hi + hi = json_object_get(res, "hi"); + if(!hi) { + applog(LOG_ERR, "JSON inval hi"); + goto err_out; + } + + if(!parse_height_info(hi, ¤t_scratchpad_hi)) + { + applog(LOG_ERR, "JSON inval hi, failed to parse"); + goto err_out; + } + + applog(LOG_INFO, "Fetched scratchpad size %d bytes", len); + scratchpad_size = len/8; + + return true; + +err_out: return false; +} + +static bool rpc2_stratum_getscratchpad(struct stratum_ctx *sctx) +{ + bool ret = false; + json_t *val = NULL; + json_error_t err; + char *s, *sret; + if(opt_algo != ALGO_WILDKECCAK) return true; + + s = (char*) calloc(1, 1024); + if (!s) + goto out; + sprintf(s, "{\"method\": \"getfullscratchpad\", \"params\": {\"id\": \"%s\", \"agent\": \"" USER_AGENT "\"}, \"id\": 1}", rpc2_id); + + applog(LOG_INFO, "Getting full scratchpad...."); + if (!stratum_send_line(sctx, s)) + goto out; + + //sret = stratum_recv_line_timeout(sctx, 920); + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + applog(LOG_DEBUG, "Getting full scratchpad received line"); + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON decode rpc2_getscratchpad response failed(%d): %s", err.line, err.text); + goto out; + } + + applog(LOG_DEBUG, "Getting full scratchpad parsed line"); + + ret = rpc2_getfullscratchpad_decode(val); + +out: + free(s); + if (val) + json_decref(val); + + return ret; +} + +bool rpc2_stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) +{ + bool ret = false; + json_t *val = NULL, *res_val, *err_val, *job_val = NULL; + json_error_t err; + char *sret; + char *s = (char*) calloc(1, 320 + strlen(user) + strlen(pass)); + + if (opt_algo == ALGO_WILDKECCAK) { + char *prevhash = bin2hex((const unsigned char*)current_scratchpad_hi.prevhash, 32); + sprintf(s, 
"{\"method\":\"login\",\"params\":{\"login\":\"%s\",\"pass\":\"%s\"," + "\"hi\":{\"height\":%" PRIu64 ",\"block_id\":\"%s\"}," + "\"agent\":\"" USER_AGENT "\"},\"id\":2}", + user, pass, current_scratchpad_hi.height, prevhash); + free(prevhash); + } else { + sprintf(s, "{\"method\":\"login\",\"params\":{\"login\":\"%s\",\"pass\":\"%s\"," + "\"agent\":\"" USER_AGENT "\"},\"id\":2}", + user, pass); + } + + if (!stratum_send_line(sctx, s)) + goto out; + + while (1) { + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + if (!stratum_handle_method(sctx, sret)) + break; + free(sret); + } + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_false(res_val) || + (err_val && !json_is_null(err_val))) { + applog(LOG_ERR, "Stratum authentication failed"); + if (err_val) { + const char *msg = json_string_value(json_object_get(err_val,"message")); + if (msg && strlen(msg)) { + if (strstr(msg, "scratchpad too old") && pscratchpad_local_cache) { + if (unlink(pscratchpad_local_cache) == 0) { + applog(LOG_INFO, "Outdated scratchpad, deleted...", pscratchpad_local_cache); + GetScratchpad(); + goto out; + } + } + applog(LOG_NOTICE, "%s", msg); + } + } + goto out; + } + + rpc2_login_decode(val); + job_val = json_object_get(res_val, "job"); + + pthread_mutex_lock(&rpc2_work_lock); + if(job_val) rpc2_job_decode(job_val, &rpc2_work); + pthread_mutex_unlock(&rpc2_work_lock); + + ret = true; + +out: + free(s); + if (val) + json_decref(val); + + return ret; +} + +bool rpc2_stratum_request_job(struct stratum_ctx *sctx) +{ + json_t *val = NULL, *res_val, *err_val; + json_error_t err; + bool ret = false; + char *sret; + char *s = (char*) calloc(1, 10*2048); + if (!s) { + applog(LOG_ERR, "Stratum job OOM!"); + return ret; + } + + if (opt_algo == ALGO_WILDKECCAK) { + char* prevhash = bin2hex((const unsigned char*)current_scratchpad_hi.prevhash, 32); + sprintf(s, "{\"method\":\"getjob\",\"params\": {" + "\"id\":\"%s\", \"hi\": {\"height\": %" PRIu64 ",\"block_id\":\"%s\" }, \"agent\": \"" USER_AGENT "\"}," + "\"id\":1}", + rpc2_id, current_scratchpad_hi.height, prevhash); + free(prevhash); + } else { + sprintf(s, "{\"method\":\"getjob\",\"params\":{\"id\":\"%s\"},\"id\":1}", rpc2_id); + } + + if(!stratum_send_line(sctx, s)) { + applog(LOG_ERR, "Stratum failed to send getjob line"); + goto out; + } + + sret = stratum_recv_line(sctx); + if (!sret) { + applog(LOG_ERR, "Stratum failed to recv getjob line"); + goto out; + } + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON getwork decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_false(res_val) || + (err_val && !json_is_null(err_val))) { + applog(LOG_ERR, "Stratum getjob failed"); + goto out; + } + + pthread_mutex_lock(&rpc2_work_lock); + rpc2_job_decode(res_val, &rpc2_work); + pthread_mutex_unlock(&rpc2_work_lock); + + ret = true; +out: + if (val) + json_decref(val); + + return ret; +} + +int rpc2_stratum_thread_stuff(struct pool_infos* pool) +{ + int opt_fail_pause = 10; + + if(!strcmp(rpc2_id, "")) { + if (!opt_quiet) + applog(LOG_DEBUG, "disconnecting..."); + stratum_disconnect(&stratum); + //not logged in, try to relogin + if (!opt_quiet) + applog(LOG_DEBUG, "Re-connect and relogin..."); + 
if(!stratum_connect(&stratum, stratum.url) || !stratum_authorize(&stratum, pool->user, pool->pass)) { + stratum_disconnect(&stratum); + applog(LOG_ERR, "Failed...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } + } + + if(!scratchpad_size && opt_algo == ALGO_WILDKECCAK) { + if(!rpc2_stratum_getscratchpad(&stratum)) { + stratum_disconnect(&stratum); + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } + store_scratchpad_to_file(false); + prev_save = time(NULL); + + if(!rpc2_stratum_request_job(&stratum)) { + stratum_disconnect(&stratum); + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } + } + + /* save every 12 hours */ + if ((time(NULL) - prev_save) > 12*3600) { + store_scratchpad_to_file(false); + prev_save = time(NULL); + } + + if (rpc2_work.job_id && (!g_work_time || strcmp(rpc2_work.job_id, g_work.job_id))) { + pthread_mutex_lock(&rpc2_work_lock); + rpc2_stratum_gen_work(&stratum, &g_work); + g_work_time = time(NULL); + pthread_mutex_unlock(&rpc2_work_lock); + + if (opt_debug) applog(LOG_DEBUG, "Stratum detected new block"); + restart_threads(); + } + + return 0; +} + +void rpc2_init() +{ + memset(¤t_scratchpad_hi, 0, sizeof(struct scratchpad_hi)); + memset(&rpc2_work, 0, sizeof(struct work)); + + pthread_mutex_init(&rpc2_job_lock, NULL); + pthread_mutex_init(&rpc2_work_lock, NULL); + pthread_mutex_init(&rpc2_login_lock, NULL); + //pthread_mutex_init(&rpc2_getscratchpad_lock, NULL); +} diff --git a/crypto/xmr-rpc.h b/crypto/xmr-rpc.h new file mode 100644 index 0000000000..6e6002f661 --- /dev/null +++ b/crypto/xmr-rpc.h @@ -0,0 +1,41 @@ + +#include + +#include "wildkeccak.h" + +#ifdef WIN32 +#define _PACKED _ALIGN(4) +#else +#define _PACKED __attribute__((__packed__)) +#endif + +struct _PACKED scratchpad_hi { + unsigned char prevhash[32]; + uint64_t height; +}; + +struct _PACKED addendums_array_entry { + struct scratchpad_hi prev_hi; + uint64_t add_size; +}; + + +struct _PACKED scratchpad_file_header { + struct scratchpad_hi current_hi; + struct addendums_array_entry add_arr[WILD_KECCAK_ADDENDUMS_ARRAY_SIZE]; + uint64_t scratchpad_size; +}; + + +bool rpc2_job_decode(const json_t *job, struct work *work); +bool rpc2_stratum_job(struct stratum_ctx *sctx, json_t *id, json_t *params); +bool rpc2_stratum_gen_work(struct stratum_ctx *sctx, struct work *work); +bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work); + +int rpc2_stratum_thread_stuff(struct pool_infos* pool); + +bool rpc2_login_decode(const json_t *val); + +void rpc2_init(); + +void GetScratchpad(); diff --git a/cuda.cpp b/cuda.cpp new file mode 100644 index 0000000000..9dde388eb8 --- /dev/null +++ b/cuda.cpp @@ -0,0 +1,310 @@ +#include +#include +#include +#include +#include + +// include thrust +#ifndef __cplusplus +#include +#include +#include +#include +#else +#include +#endif + +#include "miner.h" +#include "nvml.h" + +#include "cuda_runtime.h" + +#ifdef __cplusplus +/* miner.h functions are declared in C type, not C++ */ +extern "C" { +#endif + +// CUDA Devices on the System +int cuda_num_devices() +{ + int version = 0, GPU_N = 0; + cudaError_t err = cudaDriverGetVersion(&version); + if (err != cudaSuccess) { + applog(LOG_ERR, "Unable to query CUDA driver version! 
Is an nVidia driver installed?"); + exit(1); + } + + if (version < CUDART_VERSION) { + applog(LOG_ERR, "Your system does not support CUDA %d.%d API!", + CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10); + exit(1); + } + + err = cudaGetDeviceCount(&GPU_N); + if (err != cudaSuccess) { + applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?"); + exit(1); + } + return GPU_N; +} + +int cuda_version() +{ + return (int) CUDART_VERSION; +} + +void cuda_devicenames() +{ + cudaError_t err; + int GPU_N; + err = cudaGetDeviceCount(&GPU_N); + if (err != cudaSuccess) + { + applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?"); + exit(1); + } + + if (opt_n_threads) + GPU_N = min(MAX_GPUS, opt_n_threads); + for (int i=0; i < GPU_N; i++) + { + char vendorname[32] = { 0 }; + int dev_id = device_map[i]; + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + + device_sm[dev_id] = (props.major * 100 + props.minor * 10); + device_mpcount[dev_id] = (short) props.multiProcessorCount; + + if (device_name[dev_id]) { + free(device_name[dev_id]); + device_name[dev_id] = NULL; + } +#ifdef USE_WRAPNVML + if (gpu_vendor((uint8_t)props.pciBusID, vendorname) > 0 && strlen(vendorname)) { + device_name[dev_id] = (char*) calloc(1, strlen(vendorname) + strlen(props.name) + 2); + if (!strncmp(props.name, "GeForce ", 8)) + sprintf(device_name[dev_id], "%s %s", vendorname, &props.name[8]); + else + sprintf(device_name[dev_id], "%s %s", vendorname, props.name); + } else +#endif + device_name[dev_id] = strdup(props.name); + } +} + +void cuda_print_devices() +{ + int ngpus = cuda_num_devices(); + cuda_devicenames(); + for (int n=0; n < ngpus; n++) { + int dev_id = device_map[n % MAX_GPUS]; + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + if (!opt_n_threads || n < opt_n_threads) { + fprintf(stderr, "GPU #%d: SM %d.%d %s @ %.0f MHz (MEM %.0f)\n", dev_id, + props.major, props.minor, device_name[dev_id], + (double) props.clockRate/1000, + (double) props.memoryClockRate/1000); +#ifdef USE_WRAPNVML + if (opt_debug) nvml_print_device_info(dev_id); +#ifdef WIN32 + if (opt_debug) { + unsigned int devNum = nvapi_devnum(dev_id); + nvapi_pstateinfo(devNum); + } +#endif +#endif + } + } +} + +void cuda_shutdown() +{ + // require gpu init first + //if (thr_info != NULL) + // cudaDeviceSynchronize(); + cudaDeviceReset(); +} + +static bool substringsearch(const char *haystack, const char *needle, int &match) +{ + int hlen = (int) strlen(haystack); + int nlen = (int) strlen(needle); + for (int i=0; i < hlen; ++i) + { + if (haystack[i] == ' ') continue; + int j=0, x = 0; + while(j < nlen) + { + if (haystack[i+x] == ' ') {++x; continue;} + if (needle[j] == ' ') {++j; continue;} + if (needle[j] == '#') return ++match == needle[j+1]-'0'; + if (tolower(haystack[i+x]) != tolower(needle[j])) break; + ++j; ++x; + } + if (j == nlen) return true; + } + return false; +} + +// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1) +int cuda_finddevice(char *name) +{ + int num = cuda_num_devices(); + int match = 0; + for (int i=0; i < num; ++i) + { + cudaDeviceProp props; + if (cudaGetDeviceProperties(&props, i) == cudaSuccess) + if (substringsearch(props.name, name, match)) return i; + } + return -1; +} + +// since 1.7 +uint32_t cuda_default_throughput(int thr_id, uint32_t defcount) +{ + //int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t throughput = gpus_intensity[thr_id] ? 
gpus_intensity[thr_id] : defcount; + if (gpu_threads > 1 && throughput == defcount) throughput /= (gpu_threads-1); + if (api_thr_id != -1) api_set_throughput(thr_id, throughput); + //gpulog(LOG_INFO, thr_id, "throughput %u", throughput); + return throughput; +} + +// since 1.8.3 +double throughput2intensity(uint32_t throughput) +{ + double intensity = 0.; + uint32_t ws = throughput; + uint8_t i = 0; + while (ws > 1 && i++ < 32) + ws = ws >> 1; + intensity = (double) i; + if (i && ((1U << i) < throughput)) { + intensity += ((double) (throughput-(1U << i)) / (1U << i)); + } + return intensity; +} + +// if we use 2 threads on the same gpu, we need to reinit the threads +void cuda_reset_device(int thr_id, bool *init) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + cudaSetDevice(dev_id); + if (init != NULL) { + // with init array, its meant to be used in algo's scan code... + for (int i=0; i < MAX_GPUS; i++) { + if (device_map[i] == dev_id) { + init[i] = false; + } + } + // force exit from algo's scan loops/function + restart_threads(); + cudaDeviceSynchronize(); + while (cudaStreamQuery(NULL) == cudaErrorNotReady) + usleep(1000); + } + cudaDeviceReset(); + if (opt_cudaschedule >= 0) { + cudaSetDeviceFlags((unsigned)(opt_cudaschedule & cudaDeviceScheduleMask)); + } else { + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + cudaDeviceSynchronize(); +} + +// return free memory in megabytes +int cuda_available_memory(int thr_id) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; +#if defined(_WIN32) && defined(USE_WRAPNVML) + uint64_t tot64 = 0, free64 = 0; + // cuda (6.5) one can crash on pascal and dont handle 8GB + nvapiMemGetInfo(dev_id, &free64, &tot64); + return (int) (free64 / (1024)); +#else + size_t mtotal = 0, mfree = 0; + cudaSetDevice(dev_id); + cudaDeviceSynchronize(); + cudaMemGetInfo(&mfree, &mtotal); + return (int) (mfree / (1024 * 1024)); +#endif +} + +// Check (and reset) last cuda error, and report it in logs +void cuda_log_lasterror(int thr_id, const char* func, int line) +{ + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess && !opt_quiet) + gpulog(LOG_WARNING, thr_id, "%s:%d %s", func, line, cudaGetErrorString(err)); +} + +// Clear any cuda error in non-cuda unit (.c/.cpp) +void cuda_clear_lasterror() +{ + cudaGetLastError(); +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +int cuda_gpu_info(struct cgpu_info *gpu) +{ + cudaDeviceProp props; + if (cudaGetDeviceProperties(&props, gpu->gpu_id) == cudaSuccess) { + gpu->gpu_clock = (uint32_t) props.clockRate; + gpu->gpu_memclock = (uint32_t) props.memoryClockRate; + gpu->gpu_mem = (uint64_t) (props.totalGlobalMem / 1024); // kB +#if defined(_WIN32) && defined(USE_WRAPNVML) + // required to get mem size > 4GB (size_t too small for bytes on 32bit) + nvapiMemGetInfo(gpu->gpu_id, &gpu->gpu_memfree, &gpu->gpu_mem); // kB +#endif + gpu->gpu_mem = gpu->gpu_mem / 1024; // MB + return 0; + } + return -1; +} + +// Zeitsynchronisations-Routine von cudaminer mit CPU sleep +// Note: if you disable all of these calls, CPU usage will hit 100% +typedef struct { double value[8]; } tsumarray; +cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id) +{ + cudaError_t result = cudaSuccess; + if (abort_flag) + return result; + if (situation >= 0) + { + static std::map tsum; + + double a = 0.95, b = 0.05; + if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence + + double tsync = 0.0; + double tsleep = 0.95 * tsum[situation].value[thr_id]; + if 
(cudaStreamQuery(stream) == cudaErrorNotReady) + { + usleep((useconds_t)(1e6*tsleep)); + struct timeval tv_start, tv_end; + gettimeofday(&tv_start, NULL); + result = cudaStreamSynchronize(stream); + gettimeofday(&tv_end, NULL); + tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec); + } + if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync); + } + else + result = cudaStreamSynchronize(stream); + return result; +} + +void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func) +{ + struct cgpu_info *gpu = &thr_info[thr_id].gpu; + gpu->hw_errors++; + gpulog(LOG_ERR, thr_id, "%s %s", func, cudaGetErrorString(err)); + sleep(1); +} diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu new file mode 100644 index 0000000000..76a94f6623 --- /dev/null +++ b/cuda_checkhash.cu @@ -0,0 +1,298 @@ +/** + * This code compares final hash against target + */ +#include +#include + +#include "miner.h" + +#include "cuda_helper.h" + +__constant__ uint32_t pTarget[8]; // 32 bytes + +// store MAX_GPUS device arrays of 8 nonces +static uint32_t* h_resNonces[MAX_GPUS] = { NULL }; +static uint32_t* d_resNonces[MAX_GPUS] = { NULL }; +static __thread bool init_done = false; + +__host__ +void cuda_check_cpu_init(int thr_id, uint32_t threads) +{ + CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 32)); + CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 32)); + init_done = true; +} + +__host__ +void cuda_check_cpu_free(int thr_id) +{ + if (!init_done) return; + cudaFree(d_resNonces[thr_id]); + cudaFreeHost(h_resNonces[thr_id]); + d_resNonces[thr_id] = NULL; + h_resNonces[thr_id] = NULL; + init_done = false; +} + +// Target Difficulty +__host__ +void cuda_check_cpu_setTarget(const void *ptarget) +{ + CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice)); +} + +/* --------------------------------------------------------------------------------------------- */ + +__device__ __forceinline__ +static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target) +{ + if (hash[7] > target[7]) + return false; + if (hash[7] < target[7]) + return true; + if (hash[6] > target[6]) + return false; + if (hash[6] < target[6]) + return true; + + if (hash[5] > target[5]) + return false; + if (hash[5] < target[5]) + return true; + if (hash[4] > target[4]) + return false; + if (hash[4] < target[4]) + return true; + + if (hash[3] > target[3]) + return false; + if (hash[3] < target[3]) + return true; + if (hash[2] > target[2]) + return false; + if (hash[2] < target[2]) + return true; + + if (hash[1] > target[1]) + return false; + if (hash[1] < target[1]) + return true; + if (hash[0] > target[0]) + return false; + + return true; +} + +__global__ __launch_bounds__(512, 4) +void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // shl 4 = *16 x 4 (uint32) = 64 bytes + // todo: use only 32 bytes * threads if possible + uint32_t *inpHash = &hash[thread << 4]; + + if (resNonces[0] == UINT32_MAX) { + if (hashbelowtarget(inpHash, pTarget)) + resNonces[0] = (startNounce + thread); + } + } +} + +__global__ __launch_bounds__(512, 4) +void cuda_checkhash_32(uint32_t threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *inpHash = 
&hash[thread << 3]; + + if (resNonces[0] == UINT32_MAX) { + if (hashbelowtarget(inpHash, pTarget)) + resNonces[0] = (startNounce + thread); + } + } +} + +__host__ +uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash) +{ + cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); + + const uint32_t threadsperblock = 512; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + if (bench_algo >= 0) // dont interrupt the global benchmark + return UINT32_MAX; + + if (!init_done) { + applog(LOG_ERR, "missing call to cuda_check_cpu_init"); + return UINT32_MAX; + } + + cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); + cudaThreadSynchronize(); + + cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + return h_resNonces[thr_id][0]; +} + +__host__ +uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash) +{ + cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); + + const uint32_t threadsperblock = 512; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + if (bench_algo >= 0) // dont interrupt the global benchmark + return UINT32_MAX; + + if (!init_done) { + applog(LOG_ERR, "missing call to cuda_check_cpu_init"); + return UINT32_MAX; + } + + cuda_checkhash_32 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); + cudaThreadSynchronize(); + + cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + return h_resNonces[thr_id][0]; +} + +/* --------------------------------------------------------------------------------------------- */ + +__global__ __launch_bounds__(512, 4) +void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + uint32_t *inpHash = &hash[thread << 4]; + + if (hashbelowtarget(inpHash, pTarget)) { + int resNum = ++resNonces[0]; + __threadfence(); + if (resNum < 8) + resNonces[resNum] = (startNounce + thread); + } +} + +__host__ +uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce) +{ + uint32_t rescnt, result = 0; + + const uint32_t threadsperblock = 512; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + if (!init_done) { + applog(LOG_ERR, "missing call to cuda_check_cpu_init"); + return 0; + } + + // first element stores the count of found nonces + cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t)); + + cuda_checkhash_64_suppl <<>> (startNounce, d_inputHash, d_resNonces[thr_id]); + cudaThreadSynchronize(); + + cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 32, cudaMemcpyDeviceToHost); + rescnt = h_resNonces[thr_id][0]; + if (rescnt > numNonce) { + if (numNonce <= rescnt) { + result = h_resNonces[thr_id][numNonce+1]; + } + if (opt_debug) + applog(LOG_WARNING, "Found %d nonces: %x + %x", rescnt, h_resNonces[thr_id][1], result); + } + + return result; +} + +/* --------------------------------------------------------------------------------------------- */ + +__global__ +void cuda_check_hash_branch_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = g_nonceVector[thread]; + uint32_t 
hashPosition = (nounce - startNounce) << 4; + uint32_t *inpHash = &g_hash[hashPosition]; + + for (int i = 7; i >= 0; i--) { + if (inpHash[i] > pTarget[i]) { + return; + } + if (inpHash[i] < pTarget[i]) { + break; + } + } + if (resNounce[0] > nounce) + resNounce[0] = nounce; + } +} + +__host__ +uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) +{ + const uint32_t threadsperblock = 256; + + uint32_t result = UINT32_MAX; + + if (bench_algo >= 0) // dont interrupt the global benchmark + return result; + + if (!init_done) { + applog(LOG_ERR, "missing call to cuda_check_cpu_init"); + return result; + } + + cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + cuda_check_hash_branch_64 <<>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]); + + MyStreamSynchronize(NULL, order, thr_id); + + cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + cudaThreadSynchronize(); + result = *h_resNonces[thr_id]; + + return result; +} + +/* Function to get the compiled Shader Model version */ +int cuda_arch[MAX_GPUS] = { 0 }; +__global__ void nvcc_get_arch(int *d_version) +{ + *d_version = 0; +#ifdef __CUDA_ARCH__ + *d_version = __CUDA_ARCH__; +#endif +} + +__host__ +int cuda_get_arch(int thr_id) +{ + int *d_version; + int dev_id = device_map[thr_id]; + if (cuda_arch[dev_id] == 0) { + // only do it once... + cudaMalloc(&d_version, sizeof(int)); + nvcc_get_arch <<< 1, 1 >>> (d_version); + cudaMemcpy(&cuda_arch[dev_id], d_version, sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_version); + } + return cuda_arch[dev_id]; +} diff --git a/cuda_debug.cuh b/cuda_debug.cuh new file mode 100644 index 0000000000..363dc1aa6b --- /dev/null +++ b/cuda_debug.cuh @@ -0,0 +1,47 @@ +/** + * Helper to trace gpu computed data with --cputest + * + * Sample usage in an algo scan cuda unit : + * + * #define _DEBUG + * #define _DEBUG_PREFIX "x11-" + * #include "cuda_debug.cuh" + * + * TRACE64("luffa", d_hash); + * or + * TRACE("luffa") + * + * Dont forget to link the scan function in util.cpp (do_gpu_tests) + * + */ + +#include +//#include "cuda_helper.h" + +#ifndef _DEBUG_PREFIX +#define _DEBUG_PREFIX "" +#endif + +#ifdef _DEBUG +#define TRACE64(algo, d_buf) { \ + if (max_nonce == 1 && pdata[19] <= 1 && !opt_benchmark) { \ + uint32_t oft = 0; \ + uint32_t* debugbuf = NULL; \ + cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \ + cudaMemcpy(debugbuf, d_buf[thr_id] + oft, 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \ + printf(_DEBUG_PREFIX "%s %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", \ + algo, \ + swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3]), \ + swab32(debugbuf[4]), swab32(debugbuf[5]), swab32(debugbuf[6]), swab32(debugbuf[7]), \ + swab32(debugbuf[8]), swab32(debugbuf[9]), swab32(debugbuf[10]),swab32(debugbuf[11]), \ + swab32(debugbuf[12]),swab32(debugbuf[13]),swab32(debugbuf[14]),swab32(debugbuf[15])); \ + cudaFreeHost(debugbuf); \ + } \ +} +#else +#define TRACE64(algo, d_buf) {} +#endif + +// simplified default +#define TRACE(algo) TRACE64(algo, d_hash) + diff --git a/cuda_fugue256.h b/cuda_fugue256.h index c9fac913bc..44f3fd024f 100644 --- a/cuda_fugue256.h +++ b/cuda_fugue256.h @@ -1,8 +1,9 @@ -#ifndef _CUDA_FUGUE512_H -#define _CUDA_FUGUE512_H +#ifndef _CUDA_FUGUE256_H +#define 
_CUDA_FUGUE256_H -void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce); +void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn); -void fugue256_cpu_init(int thr_id, int threads); +void fugue256_cpu_init(int thr_id, uint32_t threads); +void fugue256_cpu_free(int thr_id); #endif diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index 095d3260d7..5c8fcf9c12 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -1,166 +1,163 @@ // Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// aus cpu-miner.c -extern int device_map[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +#include "cuda_helper.h" -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 500 +#define __byte_perm(x,y,n) x +#endif -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; +#include "miner.h" -// globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU -extern uint32_t *d_resultNonce[8]; - __constant__ uint32_t groestlcoin_gpu_msg[32]; -// 64 Register Variante für Compute 3.0 -#include "groestl_functions_quad.cu" -#include "bitslice_transformations_quad.cu" +static uint32_t *d_resultNonce[MAX_GPUS]; -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) +#if __CUDA_ARCH__ >= 300 +// 64 Registers Variant for Compute 3.0+ +#include "quark/groestl_functions_quad.h" +#include "quark/groestl_transf_quad.h" +#endif -__global__ void __launch_bounds__(256, 4) - groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce) +#define SWAB32(x) cuda_swab32(x) + +__global__ __launch_bounds__(256, 4) +void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) { - // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen - int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; - if (thread < threads) - { - // GROESTL - uint32_t paddedInput[8]; -#pragma unroll 8 - for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4]; - - uint32_t nounce = startNounce + thread; - if ((threadIdx.x % 4) == 3) - paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19 - - uint32_t msgBitsliced[8]; - to_bitslice_quad(paddedInput, msgBitsliced); - - uint32_t state[8]; - for (int round=0; round<2; round++) - { - groestl512_progressMessage_quad(state, msgBitsliced); - - if (round < 1) - { - // Verkettung zweier Runden inclusive Padding. 
- msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000); - msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); - msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); - msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); - msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); - msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); - msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); - msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010); - } - } - - // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash - uint32_t out_state[16]; - from_bitslice_quad(state, out_state); - - if (threadIdx.x % 4 == 0) - { - int i, position = -1; - bool rc = true; - - #pragma unroll 8 - for (i = 7; i >= 0; i--) { - if (out_state[i] > pTarget[i]) { - if(position < i) { - position = i; - rc = false; - } - } - if (out_state[i] < pTarget[i]) { - if(position < i) { - position = i; - rc = true; - } - } - } - - if(rc == true) - if(resNounce[0] > nounce) - resNounce[0] = nounce; - } - } +#if __CUDA_ARCH__ >= 300 + // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; + if (thread < threads) + { + // GROESTL + uint32_t paddedInput[8]; + + #pragma unroll 8 + for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4]; + + uint32_t nounce = startNounce + thread; + if ((threadIdx.x % 4) == 3) + paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19 + + uint32_t msgBitsliced[8]; + to_bitslice_quad(paddedInput, msgBitsliced); + + uint32_t state[8]; + for (int round=0; round<2; round++) + { + groestl512_progressMessage_quad(state, msgBitsliced); + + if (round < 1) + { + // Verkettung zweier Runden inclusive Padding. 
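+				// (i.e. chain the two rounds: the first round's state is fed back in as the message, with the padding merged in)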
+ msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000); + msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); + msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); + msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); + msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); + msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); + msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); + msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010); + } + } + + // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash + uint32_t out_state[16]; + from_bitslice_quad(state, out_state); + + if (threadIdx.x % 4 == 0) + { + int i, position = -1; + bool rc = true; + + #pragma unroll 8 + for (i = 7; i >= 0; i--) { + if (out_state[i] > pTarget[i]) { + if(position < i) { + position = i; + rc = false; + } + } + if (out_state[i] < pTarget[i]) { + if(position < i) { + position = i; + rc = true; + } + } + } + + if(rc && resNounce[0] > nounce) + resNounce[0] = nounce; + } + } +#endif } -// Setup-Funktionen -__host__ void groestlcoin_cpu_init(int thr_id, int threads) +__host__ +void groestlcoin_cpu_init(int thr_id, uint32_t threads) { - cudaSetDevice(device_map[thr_id]); + // to check if the binary supports SM3+ + cuda_get_arch(thr_id); - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); + CUDA_SAFE_CALL(cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t))); +} - // Speicher für Gewinner-Nonce belegen - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); +__host__ +void groestlcoin_cpu_free(int thr_id) +{ + cudaFree(d_resultNonce[thr_id]); } -__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +__host__ +void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) { - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; + uint32_t msgBlock[32] = { 0 }; - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 80); + memcpy(&msgBlock[0], data, 80); - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 80 Byte - msgBlock[20] = 0x80; - msgBlock[31] = 0x01000000; + // Erweitere die Nachricht auf den Nachrichtenblock (padding) + // Unsere Nachricht hat 80 Byte + msgBlock[20] = 0x80; + msgBlock[31] = 0x01000000; - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) + // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird + // auf der GPU ausgeführt) - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( groestlcoin_gpu_msg, - msgBlock, - 128); + // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) + cudaMemcpyToSymbol(groestlcoin_gpu_msg, msgBlock, 128); - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); + cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); + cudaMemcpyToSymbol(pTarget, pTargetIn, 32); } -__host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) +__host__ +void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce) { - int threadsperblock = 256; + uint32_t threadsperblock = 256; - // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle - // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der 
Blockzahl - int factor = 4; + // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle + // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl + int factor = 4; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); - dim3 block(threadsperblock); + // berechne wie viele Thread Blocks wir brauchen + dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); + dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { + gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)"); + proper_exit(EXIT_CODE_CUDA_ERROR); + } - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - groestlcoin_gpu_hash_quad<<>>(threads, startNounce, d_resultNonce[thr_id]); + cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); + groestlcoin_gpu_hash_quad <<>> (threads, startNounce, d_resultNonce[thr_id]); - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, 0, thr_id); + // Strategisches Sleep Kommando zur Senkung der CPU Last + // MyStreamSynchronize(NULL, 0, thr_id); - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(resNonce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); } diff --git a/cuda_groestlcoin.h b/cuda_groestlcoin.h index 8ad7dabbaa..e38f086f06 100644 --- a/cuda_groestlcoin.h +++ b/cuda_groestlcoin.h @@ -1,8 +1,9 @@ #ifndef _CUDA_GROESTLCOIN_H #define _CUDA_GROESTLCOIN_H -void groestlcoin_cpu_init(int thr_id, int threads); +void groestlcoin_cpu_init(int thr_id, uint32_t threads); +void groestlcoin_cpu_free(int thr_id); void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn); -void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); +void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonce); #endif \ No newline at end of file diff --git a/cuda_helper.h b/cuda_helper.h index 8b0b3f6e90..c51a325332 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -1,17 +1,132 @@ #ifndef CUDA_HELPER_H #define CUDA_HELPER_H -static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI) +#include +#include + +#ifdef __INTELLISENSE__ +/* reduce vstudio warnings (__byteperm, blockIdx...) 
*/ +#include +#include +#define __launch_bounds__(max_tpb, min_blocks) +#endif + +#include +#include + +#ifndef UINT32_MAX +/* slackware need that */ +#define UINT32_MAX UINT_MAX +#endif + +#ifndef MAX_GPUS +#define MAX_GPUS 16 +#endif + +extern "C" short device_map[MAX_GPUS]; +extern "C" long device_sm[MAX_GPUS]; +extern "C" short device_mpcount[MAX_GPUS]; +extern int cuda_arch[MAX_GPUS]; + +// common functions +extern int cuda_get_arch(int thr_id); +extern void cuda_check_cpu_init(int thr_id, uint32_t threads); +extern void cuda_check_cpu_free(int thr_id); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); +extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce); +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); +extern __device__ __device_builtin__ void __syncthreads(void); +extern __device__ __device_builtin__ void __threadfence(void); + +#ifndef __CUDA_ARCH__ +// define blockDim and threadIdx for host +extern const dim3 blockDim; +extern const uint3 threadIdx; +#endif + +#ifndef SPH_C32 +#define SPH_C32(x) (x) +// #define SPH_C32(x) ((uint32_t)(x ## U)) +#endif + +#ifndef SPH_C64 +#define SPH_C64(x) (x) +// #define SPH_C64(x) ((uint64_t)(x ## ULL)) +#endif + +#ifndef SPH_T32 +#define SPH_T32(x) (x) +// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#endif + +#ifndef SPH_T64 +#define SPH_T64(x) (x) +// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#endif + +#if __CUDA_ARCH__ < 320 +// Host and Compute 3.0 +#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define __ldg(x) (*(x)) +#else +// Compute 3.2+ +#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) +#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#endif + +#define AS_U32(addr) *((uint32_t*)(addr)) +#define AS_U64(addr) *((uint64_t*)(addr)) +#define AS_UINT2(addr) *((uint2*)(addr)) +#define AS_UINT4(addr) *((uint4*)(addr)) +#define AS_UL2(addr) *((ulonglong2*)(addr)) + +__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI) { #if __CUDA_ARCH__ >= 130 - return __double_as_longlong(__hiloint2double(HI, LO)); + return __double_as_longlong(__hiloint2double(HI, LO)); +#else + return (uint64_t)LO | (((uint64_t)HI) << 32); +#endif +} + +// das Hi Word in einem 64 Bit Typen ersetzen +__device__ __forceinline__ uint64_t REPLACE_HIDWORD(const uint64_t &x, const uint32_t &y) { + return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32U); +} + +// das Lo Word in einem 64 Bit Typen ersetzen +__device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uint32_t &y) { + return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); +} + +// Endian Drehung für 32 Bit Typen +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x) +{ + /* device */ + return __byte_perm(x, x, 0x0123); +} +#else + /* host */ + #define cuda_swab32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#endif + +// das Lo Word aus einem 64 Bit Typen extrahieren +__device__ __forceinline__ uint32_t _LODWORD(const uint64_t &x) { +#if __CUDA_ARCH__ >= 130 + return (uint32_t)__double2loint(__longlong_as_double(x)); #else - return (unsigned long 
long)LO | (((unsigned long long)HI) << 32); + return (uint32_t)(x & 0xFFFFFFFFULL); #endif } // das Hi Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t HIWORD(const uint64_t &x) { +__device__ __forceinline__ uint32_t _HIDWORD(const uint64_t &x) { #if __CUDA_ARCH__ >= 130 return (uint32_t)__double2hiint(__longlong_as_double(x)); #else @@ -19,68 +134,549 @@ static __device__ uint32_t HIWORD(const uint64_t &x) { #endif } -// das Hi Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL); +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) +{ + // Input: 77665544 33221100 + // Output: 00112233 44556677 + uint64_t result; + //result = __byte_perm((uint32_t) x, 0, 0x0123); + //return (result << 32) + __byte_perm(_HIDWORD(x), 0, 0x0123); + asm("{ .reg .b32 x, y; // swab64\n\t" + "mov.b64 {x,y}, %1;\n\t" + "prmt.b32 x, x, 0, 0x0123;\n\t" + "prmt.b32 y, y, 0, 0x0123;\n\t" + "mov.b64 %0, {y,x};\n\t" + "}\n" : "=l"(result): "l"(x)); + return result; } +#else + /* host */ + #define cuda_swab64(x) \ + ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) +#endif -// das Lo Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t LOWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2loint(__longlong_as_double(x)); +// swap two uint32_t without extra registers +__device__ __host__ __forceinline__ void xchg(uint32_t &x, uint32_t &y) { + x ^= y; y = x ^ y; x ^= y; +} +// for other types... 
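+// (XOR swap for arbitrary integer types; x and y must be distinct lvalues, XCHG(a, a) would zero a)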
+#define XCHG(x, y) { x ^= y; y = x ^ y; x ^= y; } + +/*********************************************************************/ +// Macros to catch CUDA errors in CUDA runtime calls + +#define CUDA_SAFE_CALL(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET(call) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return; \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET_X(call, ret) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return ret; \ + } \ +} while (0) + +/*********************************************************************/ +#if !defined(__CUDA_ARCH__) || defined(_WIN64) +#define USE_XOR_ASM_OPTS 0 #else - return (uint32_t)(x & 0xFFFFFFFFULL); +#define USE_XOR_ASM_OPTS 1 #endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor1(uint64_t a, uint64_t b) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2; // xor1" : "=l"(result) : "l"(a), "l"(b)); + return result; } +#else +#define xor1(a,b) (a ^ b) +#endif -// das Lo Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; + asm("xor.b64 %0, %2, %3; // xor3\n\t" + "xor.b64 %0, %0, %1;\n\t" + /* output : input registers */ + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; } +#else +#define xor3(a,b,c) (a ^ b ^ c) +#endif -// Endian Drehung für 32 Bit Typen -static __device__ uint32_t cuda_swab32(uint32_t x) +#if USE_XOR_ASM_OPTS +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h) { - return __byte_perm(x, x, 0x0123); + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} +#else +#define xor8(a,b,c,d,e,f,g,h) ((a^b)^(c^d)^(e^f)^(g^h)) +#endif + +// device asm for x17 +__device__ __forceinline__ +uint64_t xandx(uint64_t a, uint64_t b, uint64_t c) +{ +#ifdef __CUDA_ARCH__ + uint64_t result; + asm("{ // xandx \n\t" + ".reg .u64 n;\n\t" + "xor.b64 %0, %2, %3;\n\t" + "and.b64 n, %0, %1;\n\t" + "xor.b64 %0, n, %3;\n\t" + "}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +#else + return ((b^c) & a) ^ c; +#endif +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t andor(uint64_t a, uint64_t b, uint64_t c) +{ +#ifdef __CUDA_ARCH__ + uint64_t result; + asm("{ // andor\n\t" + ".reg .u64 m,n;\n\t" + "and.b64 m, %1, %2;\n\t" + " or.b64 n, %1, %2;\n\t" + "and.b64 %0, n, %3;\n\t" + " or.b64 %0, %0, m;\n\t" + "}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +#else + return ((a | b) & c) | (a & b); +#endif +} + +// device asm for x17 +__device__ __forceinline__ 
+uint64_t shr_t64(uint64_t x, uint32_t n) +{ +#ifdef __CUDA_ARCH__ + uint64_t result; + asm("shr.b64 %0,%1,%2;\n\t" + : "=l"(result) : "l"(x), "r"(n)); + return result; +#else + return x >> n; +#endif +} + +__device__ __forceinline__ +uint64_t shl_t64(uint64_t x, uint32_t n) +{ +#ifdef __CUDA_ARCH__ + uint64_t result; + asm("shl.b64 %0,%1,%2;\n\t" + : "=l"(result) : "l"(x), "r"(n)); + return result; +#else + return x << n; +#endif +} + +__device__ __forceinline__ +uint32_t shr_t32(uint32_t x,uint32_t n) { +#ifdef __CUDA_ARCH__ + uint32_t result; + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +#else + return x >> n; +#endif +} + +__device__ __forceinline__ +uint32_t shl_t32(uint32_t x,uint32_t n) { +#ifdef __CUDA_ARCH__ + uint32_t result; + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +#else + return x << n; +#endif +} + +#ifndef USE_ROT_ASM_OPT +#define USE_ROT_ASM_OPT 1 +#endif + +// 64-bit ROTATE RIGHT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t value, const int offset) { + uint2 result; + if(offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{ // ROTR64 \n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shr.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shl.b64 %0, %1, roff;\n\t" + "add.u64 %0, %0, lhs;\n\t" + "}\n" : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#else +/* host */ +#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif + +// 64-bit ROTATE LEFT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +__device__ 
__forceinline__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{ // ROTL64 \n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shl.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shr.b64 %0, %1, roff;\n\t" + "add.u64 %0, lhs, %0;\n\t" + "}\n" : "=l"(result) : "l"(x), "r"(offset)); + return result; +} +#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3 +__device__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t res; + asm("{ // ROTL64 \n\t" + ".reg .u32 tl,th,vl,vh;\n\t" + ".reg .pred p;\n\t" + "mov.b64 {tl,th}, %1;\n\t" + "shf.l.wrap.b32 vl, tl, th, %2;\n\t" + "shf.l.wrap.b32 vh, th, tl, %2;\n\t" + "setp.lt.u32 p, %2, 32;\n\t" + "@!p mov.b64 %0, {vl,vh};\n\t" + "@p mov.b64 %0, {vh,vl};\n\t" + "}\n" : "=l"(res) : "l"(x) , "r"(offset) + ); + return res; +} +#else +/* host */ +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + +__device__ __forceinline__ +uint64_t SWAPDWORDS(uint64_t value) +{ +#if __CUDA_ARCH__ >= 320 + uint2 temp; + asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); + asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x)); + return value; +#else + return ROTL64(value, 32); +#endif +} + +/* lyra2/bmw - uint2 vector's operators */ + +__device__ __forceinline__ +void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) { +#ifdef __CUDA_ARCH__ + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(lo), "=r"(hi) : "l"(x)); +#else + lo = (uint32_t)(x); + hi = (uint32_t)(x >> 32); +#endif +} + +static __host__ __device__ __forceinline__ uint2 vectorize(uint64_t v) { + uint2 result; +#ifdef __CUDA_ARCH__ + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.x), "=r"(result.y) : "l"(v)); +#else + result.x = (uint32_t)(v); + result.y = (uint32_t)(v >> 32); +#endif + return result; } -// Endian Drehung für 64 Bit Typen -static __device__ uint64_t cuda_swab64(uint64_t x) { - return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x))); +static __host__ __device__ __forceinline__ uint64_t devectorize(uint2 v) { +#ifdef __CUDA_ARCH__ + return MAKE_ULONGLONG(v.x, v.y); +#else + return (((uint64_t)v.y) << 32) + v.x; +#endif } -// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt -#if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) { - uint2 result; - if(offset < 32) { - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); +/** + * uint2 direct ops by c++ operator definitions + */ +static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } +static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } +static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return 
make_uint2(a.x | b.x, a.y | b.y); } +static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } +static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } + +static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) { + return vectorize(devectorize(a) + devectorize(b)); } +static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; } + +static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) { + return vectorize(devectorize(a) - devectorize(b)); +} +static __device__ __forceinline__ void operator-= (uint2 &a, uint2 b) { a = a - b; } + +/** + * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) + * (what does uint64 "*" operator) + */ +static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) +{ +#ifdef __CUDA_ARCH__ + uint2 result; + asm("{ // uint2 a*b \n\t" + "mul.lo.u32 %0, %2, %4; \n\t" + "mul.hi.u32 %1, %2, %4; \n\t" + "mad.lo.cc.u32 %1, %3, %4, %1; \n\t" + "madc.lo.u32 %1, %3, %5, %1; \n\t" + "}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; #else -#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) + // incorrect but unused host equiv + return make_uint2(a.x * b.x, a.y * b.y); #endif +} -// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt -#if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { - uint2 result; - if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); +// uint2 ROR/ROL methods +__device__ __forceinline__ +uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; +#if __CUDA_ARCH__ > 300 + if (offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } else /* if (offset < 64) */ { + /* offset SHOULD BE < 64 ! 
*/ + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } +#else + if (!offset) + result = a; + else if (offset < 32) { + result.y = ((a.y >> offset) | (a.x << (32 - offset))); + result.x = ((a.x >> offset) | (a.y << (32 - offset))); + } else if (offset == 32) { + result.y = a.x; + result.x = a.y; + } else { + result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); + result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset))); + } +#endif + return result; } + +__device__ __forceinline__ +uint2 ROL2(const uint2 a, const int offset) +{ + uint2 result; +#if __CUDA_ARCH__ > 300 + if (offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } #else -#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) + if (!offset) + result = a; + else + result = ROR2(a, 64 - offset); +#endif + return result; +} + +__device__ __forceinline__ +uint2 SWAPUINT2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +/* Byte aligned Rotations (lyra2) */ +#ifdef __CUDA_ARCH__ +__device__ __inline__ uint2 ROL8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x2107); + return result; +} + +__device__ __inline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + return result; +} + +__device__ __inline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + return result; +} +#else +#define ROL8(u) ROL2(u, 8) +#define ROR16(u) ROR2(u,16) +#define ROR24(u) ROR2(u,24) +#endif + +/* uint2 for bmw512 - to double check later */ + +__device__ __forceinline__ +static uint2 SHL2(uint2 a, int offset) +{ +#if __CUDA_ARCH__ > 300 + uint2 result; + if (offset < 32) { + asm("{ // SHL2 (l) \n\t" + "shf.l.clamp.b32 %1, %2, %3, %4; \n\t" + "shl.b32 %0, %2, %4; \n\t" + "}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } else { + asm("{ // SHL2 (h) \n\t" + "shf.l.clamp.b32 %1, %2, %3, %4; \n\t" + "shl.b32 %0, %2, %4; \n\t" + "}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + return result; +#else + if (offset <= 32) { + a.y = (a.y << offset) | (a.x >> (32 - offset)); + a.x = (a.x << offset); + } else { + a.y = (a.x << (offset-32)); + a.x = 0; + } + return a; +#endif +} + +__device__ __forceinline__ +static uint2 SHR2(uint2 a, int offset) +{ +#if __CUDA_ARCH__ > 300 + uint2 result; + if (offset<32) { + asm("{\n\t" + "shf.r.clamp.b32 %0,%2,%3,%4; \n\t" + "shr.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } else { + asm("{\n\t" + "shf.l.clamp.b32 %0,%2,%3,%4; \n\t" + "shl.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + return result; +#else + if (offset <= 32) { + a.x = (a.x >> offset) | (a.y << (32 - offset)); + a.y = (a.y >> offset); + } else { + a.x = (a.y >> (offset - 32)); + a.y = 0; + } + return a; +#endif +} + +// CUDA 9+ deprecated functions warnings 
(new mask param) +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#undef __shfl +#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width) +#undef __shfl_up +#define __shfl_up(var, delta, width) __shfl_up_sync(0xFFFFFFFF, var, delta, width) +#undef __any +#define __any(p) __any_sync(0xFFFFFFFFu, p) #endif #endif // #ifndef CUDA_HELPER_H diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu index 3a992e0302..4edc142d6c 100644 --- a/cuda_myriadgroestl.cu +++ b/cuda_myriadgroestl.cu @@ -1,394 +1,360 @@ // Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// aus cpu-miner.c -extern int device_map[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +#include "cuda_helper.h" -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 500 +#define __funnelshift_r(x,y,n) (x >> n) +#define atomicExch(p,x) x +#endif -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; +#if __CUDA_ARCH__ >= 300 +// 64 Registers Variant for Compute 3.0 +#include "quark/groestl_functions_quad.h" +#include "quark/groestl_transf_quad.h" +#endif -// globaler Speicher für alle HeftyHashes aller Threads -__constant__ uint32_t pTarget[8]; // Single GPU -uint32_t *d_outputHashes[8]; -extern uint32_t *d_resultNonce[8]; +// globaler Speicher für alle HeftyHashes aller Threads +static uint32_t *d_outputHashes[MAX_GPUS]; +static uint32_t *d_resultNonces[MAX_GPUS]; +__constant__ uint32_t pTarget[2]; // Same for all GPU __constant__ uint32_t myriadgroestl_gpu_msg[32]; // muss expandiert werden __constant__ uint32_t myr_sha256_gpu_constantTable[64]; __constant__ uint32_t myr_sha256_gpu_constantTable2[64]; -__constant__ uint32_t myr_sha256_gpu_hashTable[8]; - -uint32_t myr_sha256_cpu_hashTable[] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; -uint32_t myr_sha256_cpu_constantTable[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, + +const uint32_t myr_sha256_cpu_constantTable[] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 
0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; -uint32_t myr_sha256_cpu_w2Table[] = { - 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200, - 0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020, - 0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549, - 0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91, - 0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7, - 0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d, - 0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 }; - -// 64 Register Variante für Compute 3.0 -#include "groestl_functions_quad.cu" -#include "bitslice_transformations_quad.cu" - -#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) - -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) - #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +const uint32_t myr_sha256_cpu_w2Table[] = { + 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200, + 0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020, + 0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549, + 0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91, + 0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7, + 0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d, + 0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 +}; + +#define SWAB32(x) cuda_swab32(x) + +#if __CUDA_ARCH__ < 320 + // Kepler (Compute 3.0) + #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #else - // Kepler (Compute 3.5) - #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) + // Kepler (Compute 3.5) + #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) #endif -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) + +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) -#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) -#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) -#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) +#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) -__device__ void myriadgroestl_gpu_sha256(uint32_t *message) +__device__ __forceinline__ +void myriadgroestl_gpu_sha256(uint32_t *message) { - uint32_t W1[16]; - uint32_t W2[16]; - - // Initialisiere die register a bis h mit der Hash-Tabelle - uint32_t 
regs[8]; - uint32_t hash[8]; - - // pre -#pragma unroll 8 - for (int k=0; k < 8; k++) - { - regs[k] = myr_sha256_gpu_hashTable[k]; - hash[k] = regs[k]; - } - -#pragma unroll 16 - for(int k=0;k<16;k++) - W1[k] = SWAB32(message[k]); - -// Progress W1 -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -// Progress W2...W3 -////// PART 1 -#pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; -#pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - -#pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Rundenfunktion -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -////// PART 2 -#pragma unroll 2 - for(int j=0;j<2;j++) - W1[j] = s1(W2[14+j]) + W2[9+j] + s0(W2[1+j]) + W2[j]; -#pragma unroll 5 - for(int j=2;j<7;j++) - W1[j] = s1(W1[j-2]) + W2[9+j] + s0(W2[1+j]) + W2[j]; - -#pragma unroll 8 - for(int j=7;j<15;j++) - W1[j] = s1(W1[j-2]) + W1[j-7] + s0(W2[1+j]) + W2[j]; - - W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; - - // Rundenfunktion -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 32] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -////// PART 3 -#pragma unroll 2 - for(int j=0;j<2;j++) - W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; -#pragma unroll 5 - for(int j=2;j<7;j++) - W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - -#pragma unroll 8 - for(int j=7;j<15;j++) - W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Rundenfunktion -#pragma unroll 16 - for(int j=0;j<16;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 48] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - ///// - ///// Zweite Runde (wegen Msg-Padding) - ///// -#pragma unroll 8 - for(int k=0;k<8;k++) - regs[k] = hash[k]; - -// Progress W1 -#pragma unroll 64 - for(int j=0;j<64;j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - - #pragma unroll 7 - for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -#pragma unroll 8 - for(int k=0;k<8;k++) - hash[k] += regs[k]; - - //// FERTIG - -#pragma unroll 8 - for(int k=0;k<8;k++) - message[k] = SWAB32(hash[k]); + uint32_t W1[16]; + #pragma unroll + for(int k=0; k<16; k++) + W1[k] = 
SWAB32(message[k]); + + uint32_t regs[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + }; + + // Progress W1 + #pragma unroll + for(int j=0; j<16; j++) + { + uint32_t T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; + uint32_t T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + // Progress W2...W3 + uint32_t W2[16]; + + ////// PART 1 + #pragma unroll + for(int j=0; j<2; j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + + #pragma unroll 5 + for(int j=2; j<7;j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + + #pragma unroll + for(int j=7; j<15; j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function + #pragma unroll + for(int j=0; j<16; j++) + { + uint32_t T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16] + W2[j]; + uint32_t T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + ////// PART 2 + #pragma unroll + for(int j=0; j<2; j++) + W1[j] = s1(W2[14+j]) + W2[9+j] + s0(W2[1+j]) + W2[j]; + #pragma unroll 5 + for(int j=2; j<7; j++) + W1[j] = s1(W1[j-2]) + W2[9+j] + s0(W2[1+j]) + W2[j]; + + #pragma unroll + for(int j=7; j<15; j++) + W1[j] = s1(W1[j-2]) + W1[j-7] + s0(W2[1+j]) + W2[j]; + + W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; + + // Round function + #pragma unroll + for(int j=0; j<16; j++) + { + uint32_t T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 32] + W1[j]; + uint32_t T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + ////// PART 3 + #pragma unroll + for(int j=0; j<2; j++) + W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + + #pragma unroll 5 + for(int j=2; j<7; j++) + W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; + + #pragma unroll + for(int j=7; j<15; j++) + W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function + #pragma unroll + for(int j=0; j<16; j++) + { + uint32_t T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 48] + W2[j]; + uint32_t T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + uint32_t hash[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + }; + + #pragma unroll 8 + for(int k=0; k<8; k++) + hash[k] += regs[k]; + + ///// + ///// 2nd Round (wegen Msg-Padding) + ///// + #pragma unroll + for(int k=0; k<8; k++) + regs[k] = hash[k]; + + // Progress W1 + #pragma unroll + for(int j=0; j<64; j++) + { + uint32_t T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j]; + uint32_t T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + +#if 0 + // Full sha hash + #pragma unroll + for(int k=0; k<8; k++) + hash[k] += regs[k]; + + #pragma unroll + for(int k=0; k<8; k++) + message[k] 
= SWAB32(hash[k]); +#else + message[6] = SWAB32(hash[6] + regs[6]); + message[7] = SWAB32(hash[7] + regs[7]); +#endif } -__global__ void __launch_bounds__(256, 4) - myriadgroestl_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *hashBuffer) +__global__ +//__launch_bounds__(256, 6) // we want <= 40 regs +void myriadgroestl_gpu_hash_sha(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer, uint32_t *resNonces) { - // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen - int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; - if (thread < threads) - { - // GROESTL - uint32_t paddedInput[8]; -#pragma unroll 8 - for(int k=0;k<8;k++) paddedInput[k] = myriadgroestl_gpu_msg[4*k+threadIdx.x%4]; - - uint32_t nounce = startNounce + thread; - if ((threadIdx.x % 4) == 3) - paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19 - - uint32_t msgBitsliced[8]; - to_bitslice_quad(paddedInput, msgBitsliced); - - uint32_t state[8]; - - groestl512_progressMessage_quad(state, msgBitsliced); - - uint32_t out_state[16]; - from_bitslice_quad(state, out_state); - - if ((threadIdx.x & 0x03) == 0) - { - uint32_t *outpHash = &hashBuffer[16 * thread]; -#pragma unroll 16 - for(int k=0;k<16;k++) outpHash[k] = out_state[k]; - } - } +#if __CUDA_ARCH__ >= 300 + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNounce + thread; + + uint32_t out_state[16]; + uint32_t *inpHash = &hashBuffer[16 * thread]; + + #pragma unroll 16 + for (int i=0; i < 16; i++) + out_state[i] = inpHash[i]; + + myriadgroestl_gpu_sha256(out_state); + + if (out_state[7] <= pTarget[1] && out_state[6] <= pTarget[0]) + { + uint32_t tmp = atomicExch(&resNonces[0], nonce); + if (tmp != UINT32_MAX) + resNonces[1] = tmp; + } + } +#endif } -__global__ void - myriadgroestl_gpu_hash_quad2(int threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer) +__global__ +__launch_bounds__(256, 4) +void myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = startNounce + thread; - - uint32_t out_state[16]; - uint32_t *inpHash = &hashBuffer[16 * thread]; -#pragma unroll 16 - for (int i=0; i < 16; i++) - out_state[i] = inpHash[i]; - - myriadgroestl_gpu_sha256(out_state); - - int i, position = -1; - bool rc = true; - -#pragma unroll 8 - for (i = 7; i >= 0; i--) { - if (out_state[i] > pTarget[i]) { - if(position < i) { - position = i; - rc = false; - } - } - if (out_state[i] < pTarget[i]) { - if(position < i) { - position = i; - rc = true; - } - } - } - - if(rc == true) - if(resNounce[0] > nounce) - resNounce[0] = nounce; - } +#if __CUDA_ARCH__ >= 300 + // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; + if (thread < threads) + { + // GROESTL + uint32_t paddedInput[8]; + #pragma unroll 8 + for(int k=0; k<8; k++) + paddedInput[k] = myriadgroestl_gpu_msg[4*k+threadIdx.x%4]; + + uint32_t nounce = startNounce + thread; + if ((threadIdx.x % 4) == 3) + paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19 + + uint32_t msgBitsliced[8]; + to_bitslice_quad(paddedInput, msgBitsliced); + + uint32_t state[8]; + groestl512_progressMessage_quad(state, msgBitsliced); + + uint32_t out_state[16]; + from_bitslice_quad(state, out_state); + + if ((threadIdx.x & 0x03) == 0) + { + uint32_t *outpHash = &hashBuffer[16 * thread]; + #pragma unroll 16 + for(int k=0; 
k<16; k++) outpHash[k] = out_state[k]; + } + } +#endif } -// Setup-Funktionen -__host__ void myriadgroestl_cpu_init(int thr_id, int threads) +// Setup Function +__host__ +void myriadgroestl_cpu_init(int thr_id, uint32_t threads) { - cudaSetDevice(device_map[thr_id]); - - cudaMemcpyToSymbol( myr_sha256_gpu_hashTable, - myr_sha256_cpu_hashTable, - sizeof(uint32_t) * 8 ); - - cudaMemcpyToSymbol( myr_sha256_gpu_constantTable, - myr_sha256_cpu_constantTable, - sizeof(uint32_t) * 64 ); + uint32_t temp[64]; + for(int i=0; i<64; i++) + temp[i] = myr_sha256_cpu_w2Table[i] + myr_sha256_cpu_constantTable[i]; - // zweite CPU-Tabelle bauen und auf die GPU laden - uint32_t temp[64]; - for(int i=0;i<64;i++) - temp[i] = myr_sha256_cpu_w2Table[i] + myr_sha256_cpu_constantTable[i]; + cudaMemcpyToSymbol( myr_sha256_gpu_constantTable2, temp, sizeof(uint32_t) * 64 ); - cudaMemcpyToSymbol( myr_sha256_gpu_constantTable2, - temp, - sizeof(uint32_t) * 64 ); + cudaMemcpyToSymbol( myr_sha256_gpu_constantTable, + myr_sha256_cpu_constantTable, + sizeof(uint32_t) * 64 ); - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); + // to check if the binary supports SM3+ + cuda_get_arch(thr_id); - // Speicher für Gewinner-Nonce belegen - cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); - - // Speicher für temporäreHashes - cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads); + cudaMalloc(&d_outputHashes[thr_id], (size_t) 64 * threads); + cudaMalloc(&d_resultNonces[thr_id], 2 * sizeof(uint32_t)); } -__host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +__host__ +void myriadgroestl_cpu_free(int thr_id) { - // Nachricht expandieren und setzen - uint32_t msgBlock[32]; - - memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 80); - - // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 80 Byte - msgBlock[20] = 0x80; - msgBlock[31] = 0x01000000; - - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) + cudaFree(d_outputHashes[thr_id]); + cudaFree(d_resultNonces[thr_id]); +} - // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( myriadgroestl_gpu_msg, - msgBlock, - 128); +__host__ +void myriadgroestl_cpu_setBlock(int thr_id, void *data, uint32_t *pTargetIn) +{ + uint32_t msgBlock[32] = { 0 }; + memcpy(&msgBlock[0], data, 80); + msgBlock[20] = 0x80; + msgBlock[31] = 0x01000000; - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); + cudaMemcpyToSymbol(myriadgroestl_gpu_msg, msgBlock, 128); + cudaMemcpyToSymbol(pTarget, &pTargetIn[6], 2 * sizeof(uint32_t)); } -__host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) +__host__ +void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNounce) { - int threadsperblock = 256; + uint32_t threadsperblock = 256; + + cudaMemset(d_resultNonces[thr_id], 0xFF, 2 * sizeof(uint32_t)); - // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle - // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl - const int factor=4; + // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle + // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl + const int factor = 4; - // Größe des dynamischen Shared Memory 
Bereichs - size_t shared_size = 0; + dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); + dim3 block(threadsperblock); - cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - // berechne wie viele Thread Blocks wir brauchen - dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); - dim3 block(threadsperblock); + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { + printf("Sorry, This algo is not supported by this GPU arch (SM 3.0 required)"); + return; + } - myriadgroestl_gpu_hash_quad<<>>(threads, startNounce, d_outputHashes[thr_id]); - dim3 grid2((threads + threadsperblock-1)/threadsperblock); - myriadgroestl_gpu_hash_quad2<<>>(threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]); + myriadgroestl_gpu_hash_quad <<< grid, block >>> (threads, startNounce, d_outputHashes[thr_id]); - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, 0, thr_id); + dim3 grid2((threads + threadsperblock-1)/threadsperblock); + myriadgroestl_gpu_hash_sha <<< grid2, block >>> (threads, startNounce, d_outputHashes[thr_id], d_resultNonces[thr_id]); - cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(resNounce, d_resultNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); } diff --git a/cuda_nist5.cu b/cuda_nist5.cu deleted file mode 100644 index 4e37f6968c..0000000000 --- a/cuda_nist5.cu +++ /dev/null @@ -1,160 +0,0 @@ - -extern "C" -{ -#include "sph/sph_blake.h" -#include "sph/sph_groestl.h" -#include "sph/sph_skein.h" -#include "sph/sph_jh.h" -#include "sph/sph_keccak.h" -#include "miner.h" -} - -#include - -// aus cpu-miner.c -extern int device_map[8]; - -// Speicher für Input/Output der verketteten Hashfunktionen -static uint32_t *d_hash[8]; - -extern void quark_blake512_cpu_init(int thr_id, int threads); -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); - -extern void quark_groestl512_cpu_init(int thr_id, int threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_jh512_cpu_init(int thr_id, int threads); -extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_keccak512_cpu_init(int thr_id, int threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_skein512_cpu_init(int thr_id, int threads); -extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); - -// Original nist5hash Funktion aus einem miner Quelltext -inline void nist5hash(void *state, const void *input) -{ - sph_blake512_context ctx_blake; - sph_groestl512_context ctx_groestl; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - - unsigned char hash[64]; - - sph_blake512_init(&ctx_blake); - // ZBLAKE; - sph_blake512 
(&ctx_blake, input, 80); - sph_blake512_close(&ctx_blake, (void*) hash); - - sph_groestl512_init(&ctx_groestl); - // ZGROESTL; - sph_groestl512 (&ctx_groestl, (const void*) hash, 64); - sph_groestl512_close(&ctx_groestl, (void*) hash); - - sph_jh512_init(&ctx_jh); - // ZJH; - sph_jh512 (&ctx_jh, (const void*) hash, 64); - sph_jh512_close(&ctx_jh, (void*) hash); - - sph_keccak512_init(&ctx_keccak); - // ZKECCAK; - sph_keccak512 (&ctx_keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx_keccak, (void*) hash); - - sph_skein512_init(&ctx_skein); - // ZSKEIN; - sph_skein512 (&ctx_skein, (const void*) hash, 64); - sph_skein512_close(&ctx_skein, (void*) hash); - - memcpy(state, hash, 32); -} - - -extern bool opt_benchmark; - -extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) -{ - const uint32_t first_nonce = pdata[19]; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - const uint32_t Htarg = ptarget[7]; - - const int throughput = 256*4096; // 100; - - static bool init[8] = {0,0,0,0,0,0,0,0}; - if (!init[thr_id]) - { - cudaSetDevice(device_map[thr_id]); - - // Konstanten kopieren, Speicher belegen - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); - quark_blake512_cpu_init(thr_id, throughput); - quark_groestl512_cpu_init(thr_id, throughput); - quark_jh512_cpu_init(thr_id, throughput); - quark_keccak512_cpu_init(thr_id, throughput); - quark_skein512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); - init[thr_id] = true; - } - - uint32_t endiandata[20]; - for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - - quark_blake512_cpu_setBlock_80((void*)endiandata); - quark_check_cpu_setTarget(ptarget); - - do { - int order = 0; - - // erstes Blake512 Hash mit CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Groestl512 - quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Skein512 - quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) - { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - nist5hash(vhash64, endiandata); - - if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { - - pdata[19] = foundNonce; - *hashes_done = foundNonce - first_nonce + 1; - return 1; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); - } - } - - pdata[19] += throughput; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} diff --git a/cuda_skeincoin.cu b/cuda_skeincoin.cu new file mode 100644 index 0000000000..8c91c5efaf --- /dev/null +++ b/cuda_skeincoin.cu @@ -0,0 +1,747 @@ +/* Merged skein512 80 + sha256 64 (in a single kernel) for SM 5+ + * based on sp and klaus work, adapted by tpruvot to keep skein2 compat + */ + +#include +#include 
+#include + +#include "cuda_helper.h" + +/* try 1024 for 970+ */ +#define TPB 512 + +static __constant__ uint64_t c_message16[2]; +static __constant__ uint2 precalcvalues[9]; + +static uint32_t *d_found[MAX_GPUS]; + +static __device__ __forceinline__ uint2 vectorizelow(uint32_t v) { + uint2 result; + result.x = v; + result.y = 0; + return result; +} + +static __device__ __forceinline__ uint2 vectorizehigh(uint32_t v) { + uint2 result; + result.x = 0; + result.y = v; + return result; +} + +/* + * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). + */ + +#define M9_0_0 0 +#define M9_0_1 1 +#define M9_0_2 2 +#define M9_0_3 3 +#define M9_0_4 4 +#define M9_0_5 5 +#define M9_0_6 6 +#define M9_0_7 7 + +#define M9_1_0 1 +#define M9_1_1 2 +#define M9_1_2 3 +#define M9_1_3 4 +#define M9_1_4 5 +#define M9_1_5 6 +#define M9_1_6 7 +#define M9_1_7 8 + +#define M9_2_0 2 +#define M9_2_1 3 +#define M9_2_2 4 +#define M9_2_3 5 +#define M9_2_4 6 +#define M9_2_5 7 +#define M9_2_6 8 +#define M9_2_7 0 + +#define M9_3_0 3 +#define M9_3_1 4 +#define M9_3_2 5 +#define M9_3_3 6 +#define M9_3_4 7 +#define M9_3_5 8 +#define M9_3_6 0 +#define M9_3_7 1 + +#define M9_4_0 4 +#define M9_4_1 5 +#define M9_4_2 6 +#define M9_4_3 7 +#define M9_4_4 8 +#define M9_4_5 0 +#define M9_4_6 1 +#define M9_4_7 2 + +#define M9_5_0 5 +#define M9_5_1 6 +#define M9_5_2 7 +#define M9_5_3 8 +#define M9_5_4 0 +#define M9_5_5 1 +#define M9_5_6 2 +#define M9_5_7 3 + +#define M9_6_0 6 +#define M9_6_1 7 +#define M9_6_2 8 +#define M9_6_3 0 +#define M9_6_4 1 +#define M9_6_5 2 +#define M9_6_6 3 +#define M9_6_7 4 + +#define M9_7_0 7 +#define M9_7_1 8 +#define M9_7_2 0 +#define M9_7_3 1 +#define M9_7_4 2 +#define M9_7_5 3 +#define M9_7_6 4 +#define M9_7_7 5 + +#define M9_8_0 8 +#define M9_8_1 0 +#define M9_8_2 1 +#define M9_8_3 2 +#define M9_8_4 3 +#define M9_8_5 4 +#define M9_8_6 5 +#define M9_8_7 6 + +#define M9_9_0 0 +#define M9_9_1 1 +#define M9_9_2 2 +#define M9_9_3 3 +#define M9_9_4 4 +#define M9_9_5 5 +#define M9_9_6 6 +#define M9_9_7 7 + +#define M9_10_0 1 +#define M9_10_1 2 +#define M9_10_2 3 +#define M9_10_3 4 +#define M9_10_4 5 +#define M9_10_5 6 +#define M9_10_6 7 +#define M9_10_7 8 + +#define M9_11_0 2 +#define M9_11_1 3 +#define M9_11_2 4 +#define M9_11_3 5 +#define M9_11_4 6 +#define M9_11_5 7 +#define M9_11_6 8 +#define M9_11_7 0 + +#define M9_12_0 3 +#define M9_12_1 4 +#define M9_12_2 5 +#define M9_12_3 6 +#define M9_12_4 7 +#define M9_12_5 8 +#define M9_12_6 0 +#define M9_12_7 1 + +#define M9_13_0 4 +#define M9_13_1 5 +#define M9_13_2 6 +#define M9_13_3 7 +#define M9_13_4 8 +#define M9_13_5 0 +#define M9_13_6 1 +#define M9_13_7 2 + +#define M9_14_0 5 +#define M9_14_1 6 +#define M9_14_2 7 +#define M9_14_3 8 +#define M9_14_4 0 +#define M9_14_5 1 +#define M9_14_6 2 +#define M9_14_7 3 + +#define M9_15_0 6 +#define M9_15_1 7 +#define M9_15_2 8 +#define M9_15_3 0 +#define M9_15_4 1 +#define M9_15_5 2 +#define M9_15_6 3 +#define M9_15_7 4 + +#define M9_16_0 7 +#define M9_16_1 8 +#define M9_16_2 0 +#define M9_16_3 1 +#define M9_16_4 2 +#define M9_16_5 3 +#define M9_16_6 4 +#define M9_16_7 5 + +#define M9_17_0 8 +#define M9_17_1 0 +#define M9_17_2 1 +#define M9_17_3 2 +#define M9_17_4 3 +#define M9_17_5 4 +#define M9_17_6 5 +#define M9_17_7 6 + +#define M9_18_0 0 +#define M9_18_1 1 +#define M9_18_2 2 +#define M9_18_3 3 +#define M9_18_4 4 +#define M9_18_5 5 +#define M9_18_6 6 +#define M9_18_7 7 + +/* + * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). 
+ */ + +#define M3_0_0 0 +#define M3_0_1 1 +#define M3_1_0 1 +#define M3_1_1 2 +#define M3_2_0 2 +#define M3_2_1 0 +#define M3_3_0 0 +#define M3_3_1 1 +#define M3_4_0 1 +#define M3_4_1 2 +#define M3_5_0 2 +#define M3_5_1 0 +#define M3_6_0 0 +#define M3_6_1 1 +#define M3_7_0 1 +#define M3_7_1 2 +#define M3_8_0 2 +#define M3_8_1 0 +#define M3_9_0 0 +#define M3_9_1 1 +#define M3_10_0 1 +#define M3_10_1 2 +#define M3_11_0 2 +#define M3_11_1 0 +#define M3_12_0 0 +#define M3_12_1 1 +#define M3_13_0 1 +#define M3_13_1 2 +#define M3_14_0 2 +#define M3_14_1 0 +#define M3_15_0 0 +#define M3_15_1 1 +#define M3_16_0 1 +#define M3_16_1 2 +#define M3_17_0 2 +#define M3_17_1 0 +#define M3_18_0 0 +#define M3_18_1 1 + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) +#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) + +#define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ + k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ + ^ vectorize(SPH_C64(0x1BD11BDAA9FC1A22)); \ + t2 = t0 ^ t1; \ +} + +#define TFBIG_ADDKEY_UI2(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + vectorize(s)); \ +} + +#define TFBIG_MIX_UI2(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROL2(x1, rc) ^ x0; \ +} + +#define TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_UI2(w0, w1, rc0); \ + TFBIG_MIX_UI2(w2, w3, rc1); \ + TFBIG_MIX_UI2(w4, w5, rc2); \ + TFBIG_MIX_UI2(w6, w7, rc3); \ +} + +#define TFBIG_4e_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ +} + +#define TFBIG_4o_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ +} + +/* precalc */ + +#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + (s)); \ +} + +#define TFBIG_MIX_PRE(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROTL64(x1, rc) ^ x0; \ +} + +#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_PRE(w0, w1, rc0); \ + TFBIG_MIX_PRE(w2, w3, rc1); \ + TFBIG_MIX_PRE(w4, w5, rc2); \ + TFBIG_MIX_PRE(w6, w7, rc3); \ +} + +#define TFBIG_4e_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + 
TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ +} + +#define TFBIG_4o_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ +} + +/* Elementary defines for SHA256 */ + +#define SWAB32(x) cuda_swab32(x) + +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) +#define s1(x) (ROTR32(x,17) ^ ROTR32(x, 19) ^ R(x, 10)) + +static __device__ __constant__ uint32_t sha256_hashTable[] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +// precomputed table +static __constant__ uint32_t sha256_endingTable[64] = { + 0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374, + 0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254, 0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa, + 0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7, 0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0, + 0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd, 0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16, + 0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537, 0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37, + 0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7, 0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890, + 0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c, 0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76 +}; + +static __constant__ uint32_t sha256_constantTable[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +__global__ __launch_bounds__(TPB) +void skeincoin_gpu_hash_50(uint32_t threads, uint32_t startNounce, uint32_t* d_found, uint64_t target64, int swap) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8; + uint2 t0, t1, t2; + uint2 p[8]; + + h0 = precalcvalues[0]; + h1 = precalcvalues[1]; + h2 = precalcvalues[2]; + h3 = precalcvalues[3]; + h4 = precalcvalues[4]; + h5 = precalcvalues[5]; + h6 = precalcvalues[6]; + h7 = precalcvalues[7]; + t2 = 
precalcvalues[8]; + + const uint32_t nonce = startNounce + thread; + const uint2 nonce2 = make_uint2(_LODWORD(c_message16[1]), swap ? cuda_swab32(nonce) : nonce); + + // skein_big_close -> etype = 0x160, ptr = 16, bcount = 1, extra = 16 + p[0] = vectorize(c_message16[0]); + p[1] = nonce2; + + #pragma unroll + for (int i = 2; i < 8; i++) + p[i] = make_uint2(0, 0); + + t0 = vectorizelow(0x50ull); // SPH_T64(bcount << 6) + (sph_u64)(extra); + t1 = vectorizehigh(0xB0000000ul); // (bcount >> 58) + ((sph_u64)(etype) << 55); + TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); + TFBIG_4e_UI2(0); + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + + t0 = vectorizelow(8); // extra + t1 = vectorizehigh(0xFF000000ul); // etype + + h0 = vectorize(c_message16[0]) ^ p[0]; + h1 = nonce2 ^ p[1]; + h2 = p[2]; + h3 = p[3]; + h4 = p[4]; + h5 = p[5]; + h6 = p[6]; + h7 = p[7]; + + h8 = h0 ^ h1 ^ p[2] ^ p[3] ^ p[4] ^ p[5] ^ p[6] ^ p[7] ^ vectorize(0x1BD11BDAA9FC1A22); + t2 = vectorize(0xFF00000000000008ull); + + // p[8] = { 0 }; + #pragma unroll 8 + for (int i = 0; i<8; i++) + p[i] = make_uint2(0, 0); + + TFBIG_4e_UI2(0); + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + uint32_t *message = (uint32_t *)p; + + uint32_t regs[8]; + uint32_t hash[8]; + + // Init with Hash-Table + #pragma unroll 8 + for (int k = 0; k < 8; k++) { + hash[k] = regs[k] = sha256_hashTable[k]; + } + + uint32_t W1[16]; + uint32_t W2[16]; + + #pragma unroll 16 + for (int k = 0; k<16; k++) + W1[k] = SWAB32(message[k]); + + // Progress W1 + #pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + // Progress W2...W3 + + ////// PART 1 + #pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + #pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + + #pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function + #pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 16] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + ////// PART 2 + #pragma unroll 2 + for (int j = 0; j<2; j++) + W1[j] = s1(W2[14 + j]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + + #pragma unroll 5 + for (int j = 2; j<7; j++) + 
W1[j] = s1(W1[j - 2]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + + #pragma unroll 8 + for (int j = 7; j<15; j++) + W1[j] = s1(W1[j - 2]) + W1[j - 7] + s0(W2[1 + j]) + W2[j]; + + W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; + + // Round function + #pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 32] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + ////// PART 3 + #pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + + #pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + + #pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function + #pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 48] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + #pragma unroll 8 + for (int k = 0; k<8; k++) + hash[k] += regs[k]; + + ///// + ///// Second Pass (ending) + ///// + #pragma unroll 8 + for (int k = 0; k<8; k++) + regs[k] = hash[k]; + + // Progress W1 + uint32_t T1, T2; + #pragma unroll 1 + for (int j = 0; j<56; j++)//62 + { + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_endingTable[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k = 6; k >= 0; k--) + regs[k + 1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6])+sha256_endingTable[56]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + regs[7] = T1 + T2; + regs[3] += T1; + + T1 = regs[6] + S1(regs[3]) + Ch(regs[3], regs[4], regs[5]) + sha256_endingTable[57]; + T2 = S0(regs[7]) + Maj(regs[7], regs[0], regs[1]); + regs[6] = T1 + T2; + regs[2] += T1; + //************ + regs[1] += regs[5] + S1(regs[2]) + Ch(regs[2], regs[3], regs[4]) + sha256_endingTable[58]; + regs[0] += regs[4] + S1(regs[1]) + Ch(regs[1], regs[2], regs[3]) + sha256_endingTable[59]; + regs[7] += regs[3] + S1(regs[0]) + Ch(regs[0], regs[1], regs[2]) + sha256_endingTable[60]; + regs[6] += regs[2] + S1(regs[7]) + Ch(regs[7], regs[0], regs[1]) + sha256_endingTable[61]; + + uint64_t test = SWAB32(hash[7] + regs[7]); + test <<= 32; + test|= SWAB32(hash[6] + regs[6]); + if (test <= target64) + { + uint32_t tmp = atomicExch(&(d_found[0]), startNounce + thread); + if (tmp != UINT32_MAX) + d_found[1] = tmp; + } + } +} + +__host__ +static void precalc(uint64_t* message) +{ + uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8; + uint64_t t0, t1, t2; + + h0 = 0x4903ADFF749C51CEull; + h1 = 0x0D95DE399746DF03ull; + h2 = 0x8FD1934127C79BCEull; + h3 = 0x9A255629FF352CB1ull; + h4 = 0x5DB62599DF6CA7B0ull; + h5 = 0xEABE394CA9D5C3F4ull; + h6 = 0x991112C71A75B523ull; + h7 = 0xAE18A40B660FCC33ull; + //h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ SPH_C64(0x1BD11BDAA9FC1A22); + h8 = 0xcab2076d98173ec4ULL; + + t0 = 64; // ptr + t1 = 0x7000000000000000ull; + t2 = 0x7000000000000040ull; + + uint64_t p[8]; + for (int i = 0; i<8; i++) + p[i] = message[i]; + + TFBIG_4e_PRE(0); + TFBIG_4o_PRE(1); + 
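
Aside from the patch hunks themselves: both new kernels in this patch (myriadgroestl_gpu_hash_sha and skeincoin_gpu_hash_50) report candidates through the same two-slot nonce buffer. The first hit is stored with atomicExch into slot 0, and any nonce that was already there is pushed into slot 1, so a single launch can return up to two candidates; the host resets both slots to 0xFF and copies two uint32_t back. Below is a minimal self-contained sketch of that convention, outside the patch; the names report_nonce and d_res are illustrative only, not part of the source tree.

#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>

// Two-slot result buffer: [0] = most recent hit, [1] = an older hit, both initialized to 0xFFFFFFFF.
__global__ void report_nonce(uint32_t *resNonces, uint32_t nonce, bool hit)
{
    if (hit) {
        // Store the new candidate; if a previous one was already present, keep it in slot 1.
        uint32_t prev = atomicExch(&resNonces[0], nonce);
        if (prev != UINT32_MAX)
            resNonces[1] = prev;
    }
}

int main()
{
    uint32_t *d_res, h_res[2];
    cudaMalloc(&d_res, 2 * sizeof(uint32_t));
    cudaMemset(d_res, 0xFF, 2 * sizeof(uint32_t));      // same 0xFF reset the host-side code uses
    report_nonce<<<1, 1>>>(d_res, 0x1234u, true);
    report_nonce<<<1, 1>>>(d_res, 0x5678u, true);
    cudaMemcpy(h_res, d_res, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
    printf("nonces: %08x %08x\n", h_res[0], h_res[1]);  // expect 00005678 and 00001234
    cudaFree(d_res);
    return 0;
}
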
TFBIG_4e_PRE(2); + TFBIG_4o_PRE(3); + TFBIG_4e_PRE(4); + TFBIG_4o_PRE(5); + TFBIG_4e_PRE(6); + TFBIG_4o_PRE(7); + TFBIG_4e_PRE(8); + TFBIG_4o_PRE(9); + TFBIG_4e_PRE(10); + TFBIG_4o_PRE(11); + TFBIG_4e_PRE(12); + TFBIG_4o_PRE(13); + TFBIG_4e_PRE(14); + TFBIG_4o_PRE(15); + TFBIG_4e_PRE(16); + TFBIG_4o_PRE(17); + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + uint64_t buffer[9]; + buffer[0] = message[0] ^ p[0]; + buffer[1] = message[1] ^ p[1]; + buffer[2] = message[2] ^ p[2]; + buffer[3] = message[3] ^ p[3]; + buffer[4] = message[4] ^ p[4]; + buffer[5] = message[5] ^ p[5]; + buffer[6] = message[6] ^ p[6]; + buffer[7] = message[7] ^ p[7]; + buffer[8] = t2; + CUDA_SAFE_CALL(cudaMemcpyToSymbol(precalcvalues, buffer, sizeof(buffer), 0, cudaMemcpyHostToDevice)); +} + +__host__ +void skeincoin_init(int thr_id) +{ + cuda_get_arch(thr_id); + CUDA_SAFE_CALL(cudaMalloc(&d_found[thr_id], 2 * sizeof(uint32_t))); +} + +__host__ +void skeincoin_free(int thr_id) { + cudaFree(d_found[thr_id]); +} + +__host__ +void skeincoin_setBlock_80(int thr_id, void *pdata) +{ + uint64_t message[16]; + memcpy(&message[0], pdata, 80); + + cudaMemcpyToSymbol(c_message16, &message[8], 16, 0, cudaMemcpyHostToDevice); + + precalc(message); +} + +__host__ +uint32_t skeincoin_hash_sm5(int thr_id, uint32_t threads, uint32_t startNounce, int swap, uint64_t target64, uint32_t *secNonce) +{ + uint32_t h_found[2]; + uint32_t threadsperblock = TPB; + dim3 block(threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + + memset(h_found, 0xff, sizeof(h_found)); + cudaMemset(d_found[thr_id], 0xff, 2 * sizeof(uint32_t)); + + skeincoin_gpu_hash_50 <<< grid, block >>> (threads, startNounce, d_found[thr_id], target64, swap); + + cudaMemcpy(h_found, d_found[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + if (h_found[1] && h_found[1] != UINT32_MAX && h_found[1] != h_found[0]) + *secNonce = h_found[1]; + return h_found[0]; +} diff --git a/cuda_vector_uint2x4.h b/cuda_vector_uint2x4.h new file mode 100644 index 0000000000..9cff51440f --- /dev/null +++ b/cuda_vector_uint2x4.h @@ -0,0 +1,62 @@ +#ifndef CUDA_VECTOR_UINT2x4_H +#define CUDA_VECTOR_UINT2x4_H + +/////////////////////////////////////////////////////////////////////////////////// +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +typedef struct __align__(16) uint2x4 { + uint2 x, y, z, w; +} uint2x4; + + +static __inline__ __device__ uint2x4 make_uint2x4(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint2x4 t; + t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + +static __forceinline__ __device__ uint2x4 operator^ (const uint2x4 &a, const uint2x4 &b) { + return make_uint2x4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +static __forceinline__ __device__ uint2x4 operator+ (const uint2x4 &a, const uint2x4 &b) { + return make_uint2x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +///////////////////////// + +static __forceinline__ __device__ void operator^= (uint2x4 &a, const uint2x4 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (uint2x4 &a, const uint2x4 &b) { a = a + b; } + +#if __CUDA_ARCH__ >= 320 + +static __device__ __inline__ uint2x4 __ldg4(const uint2x4 *ptr) +{ + uint2x4 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), 
"=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ void ldg4(const uint2x4 *ptr, uint2x4 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} +#elif !defined(__ldg4) +#define __ldg4(x) (*(x)) +#define ldg4(ptr, ret) { *(ret) = (*(ptr)); } +#endif + +#endif // H diff --git a/cuda_vectors.h b/cuda_vectors.h new file mode 100644 index 0000000000..8399fcfd5d --- /dev/null +++ b/cuda_vectors.h @@ -0,0 +1,47 @@ +#include "cuda_helper.h" + +/* Macros for uint2 operations (used by skein) */ + +__device__ __forceinline__ +uint2 ROR8(const uint2 a) { + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x4321); + return result; +} + +__device__ __forceinline__ +uint2 ROL24(const uint2 a) { + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x0765); + return result; +} + +static __device__ __forceinline__ uint2 operator+ (const uint2 a, const uint32_t b) +{ +#if 0 && defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 + uint2 result; + asm( + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +#else + return vectorize(devectorize(a) + b); +#endif +} + +/* whirlpool ones */ +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ +uint2 ROL16(const uint2 a) { + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x1076); + return result; +} +#else +#define ROL16(a) make_uint2(a.x, a.y) /* bad, just to define it */ +#endif + diff --git a/depcomp b/depcomp deleted file mode 100644 index bd0ac08958..0000000000 --- a/depcomp +++ /dev/null @@ -1,688 +0,0 @@ -#! /bin/sh -# depcomp - compile a program generating dependencies as side-effects - -scriptversion=2011-12-04.11; # UTC - -# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007, 2009, 2010, -# 2011 Free Software Foundation, Inc. - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
- -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# Originally written by Alexandre Oliva . - -case $1 in - '') - echo "$0: No command. Try \`$0 --help' for more information." 1>&2 - exit 1; - ;; - -h | --h*) - cat <<\EOF -Usage: depcomp [--help] [--version] PROGRAM [ARGS] - -Run PROGRAMS ARGS to compile a file, generating dependencies -as side-effects. - -Environment variables: - depmode Dependency tracking mode. - source Source file read by `PROGRAMS ARGS'. - object Object file output by `PROGRAMS ARGS'. - DEPDIR directory where to store dependencies. - depfile Dependency file to output. - tmpdepfile Temporary file to use when outputting dependencies. - libtool Whether libtool is used (yes/no). - -Report bugs to . -EOF - exit $? - ;; - -v | --v*) - echo "depcomp $scriptversion" - exit $? - ;; -esac - -if test -z "$depmode" || test -z "$source" || test -z "$object"; then - echo "depcomp: Variables source, object and depmode must be set" 1>&2 - exit 1 -fi - -# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po. -depfile=${depfile-`echo "$object" | - sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`} -tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`} - -rm -f "$tmpdepfile" - -# Some modes work just like other modes, but use different flags. We -# parameterize here, but still list the modes in the big case below, -# to make depend.m4 easier to write. Note that we *cannot* use a case -# here, because this file can only contain one case statement. -if test "$depmode" = hp; then - # HP compiler uses -M and no extra arg. - gccflag=-M - depmode=gcc -fi - -if test "$depmode" = dashXmstdout; then - # This is just like dashmstdout with a different argument. - dashmflag=-xM - depmode=dashmstdout -fi - -cygpath_u="cygpath -u -f -" -if test "$depmode" = msvcmsys; then - # This is just like msvisualcpp but w/o cygpath translation. - # Just convert the backslash-escaped backslashes to single forward - # slashes to satisfy depend.m4 - cygpath_u='sed s,\\\\,/,g' - depmode=msvisualcpp -fi - -if test "$depmode" = msvc7msys; then - # This is just like msvc7 but w/o cygpath translation. - # Just convert the backslash-escaped backslashes to single forward - # slashes to satisfy depend.m4 - cygpath_u='sed s,\\\\,/,g' - depmode=msvc7 -fi - -case "$depmode" in -gcc3) -## gcc 3 implements dependency tracking that does exactly what -## we want. Yay! Note: for some reason libtool 1.4 doesn't like -## it if -MD -MP comes after the -MF stuff. Hmm. -## Unfortunately, FreeBSD c89 acceptance of flags depends upon -## the command line argument order; so add the flags where they -## appear in depend2.am. Note that the slowdown incurred here -## affects only configure: in makefiles, %FASTDEP% shortcuts this. - for arg - do - case $arg in - -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;; - *) set fnord "$@" "$arg" ;; - esac - shift # fnord - shift # $arg - done - "$@" - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile" - exit $stat - fi - mv "$tmpdepfile" "$depfile" - ;; - -gcc) -## There are various ways to get dependency output from gcc. Here's -## why we pick this rather obscure method: -## - Don't want to use -MD because we'd like the dependencies to end -## up in a subdir. 
Having to rename by hand is ugly. -## (We might end up doing this anyway to support other compilers.) -## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like -## -MM, not -M (despite what the docs say). -## - Using -M directly means running the compiler twice (even worse -## than renaming). - if test -z "$gccflag"; then - gccflag=-MD, - fi - "$@" -Wp,"$gccflag$tmpdepfile" - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - echo "$object : \\" > "$depfile" - alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz -## The second -e expression handles DOS-style file names with drive letters. - sed -e 's/^[^:]*: / /' \ - -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile" -## This next piece of magic avoids the `deleted header file' problem. -## The problem is that when a header file which appears in a .P file -## is deleted, the dependency causes make to die (because there is -## typically no way to rebuild the header). We avoid this by adding -## dummy dependencies for each header file. Too bad gcc doesn't do -## this for us directly. - tr ' ' ' -' < "$tmpdepfile" | -## Some versions of gcc put a space before the `:'. On the theory -## that the space means something, we add a space to the output as -## well. hp depmode also adds that space, but also prefixes the VPATH -## to the object. Take care to not repeat it in the output. -## Some versions of the HPUX 10.20 sed can't process this invocation -## correctly. Breaking it into two sed invocations is a workaround. - sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \ - | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -hp) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -sgi) - if test "$libtool" = yes; then - "$@" "-Wp,-MDupdate,$tmpdepfile" - else - "$@" -MDupdate "$tmpdepfile" - fi - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - - if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files - echo "$object : \\" > "$depfile" - - # Clip off the initial element (the dependent). Don't try to be - # clever and replace this with sed code, as IRIX sed won't handle - # lines with more than a fixed number of characters (4096 in - # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines; - # the IRIX cc adds comments like `#:fec' to the end of the - # dependency line. - tr ' ' ' -' < "$tmpdepfile" \ - | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \ - tr ' -' ' ' >> "$depfile" - echo >> "$depfile" - - # The second pass generates a dummy entry for each header file. - tr ' ' ' -' < "$tmpdepfile" \ - | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \ - >> "$depfile" - else - # The sourcefile does not contain any dependencies, so just - # store a dummy comment line, to avoid errors with the Makefile - # "include basename.Plo" scheme. - echo "#dummy" > "$depfile" - fi - rm -f "$tmpdepfile" - ;; - -aix) - # The C for AIX Compiler uses -M and outputs the dependencies - # in a .u file. In older versions, this file always lives in the - # current directory. Also, the AIX compiler puts `$object:' at the - # start of each line; $object doesn't have directory information. - # Version 6 uses the directory in both cases. 
- dir=`echo "$object" | sed -e 's|/[^/]*$|/|'` - test "x$dir" = "x$object" && dir= - base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'` - if test "$libtool" = yes; then - tmpdepfile1=$dir$base.u - tmpdepfile2=$base.u - tmpdepfile3=$dir.libs/$base.u - "$@" -Wc,-M - else - tmpdepfile1=$dir$base.u - tmpdepfile2=$dir$base.u - tmpdepfile3=$dir$base.u - "$@" -M - fi - stat=$? - - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" - do - test -f "$tmpdepfile" && break - done - if test -f "$tmpdepfile"; then - # Each line is of the form `foo.o: dependent.h'. - # Do two passes, one to just change these to - # `$object: dependent.h' and one to simply `dependent.h:'. - sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile" - # That's a tab and a space in the []. - sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile" - else - # The sourcefile does not contain any dependencies, so just - # store a dummy comment line, to avoid errors with the Makefile - # "include basename.Plo" scheme. - echo "#dummy" > "$depfile" - fi - rm -f "$tmpdepfile" - ;; - -icc) - # Intel's C compiler understands `-MD -MF file'. However on - # icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c - # ICC 7.0 will fill foo.d with something like - # foo.o: sub/foo.c - # foo.o: sub/foo.h - # which is wrong. We want: - # sub/foo.o: sub/foo.c - # sub/foo.o: sub/foo.h - # sub/foo.c: - # sub/foo.h: - # ICC 7.1 will output - # foo.o: sub/foo.c sub/foo.h - # and will wrap long lines using \ : - # foo.o: sub/foo.c ... \ - # sub/foo.h ... \ - # ... - - "$@" -MD -MF "$tmpdepfile" - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - # Each line is of the form `foo.o: dependent.h', - # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'. - # Do two passes, one to just change these to - # `$object: dependent.h' and one to simply `dependent.h:'. - sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process this invocation - # correctly. Breaking it into two sed invocations is a workaround. - sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" | - sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -hp2) - # The "hp" stanza above does not work with aCC (C++) and HP's ia64 - # compilers, which have integrated preprocessors. The correct option - # to use with these is +Maked; it writes dependencies to a file named - # 'foo.d', which lands next to the object file, wherever that - # happens to be. - # Much of this is similar to the tru64 case; see comments there. - dir=`echo "$object" | sed -e 's|/[^/]*$|/|'` - test "x$dir" = "x$object" && dir= - base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'` - if test "$libtool" = yes; then - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir.libs/$base.d - "$@" -Wc,+Maked - else - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir$base.d - "$@" +Maked - fi - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile1" "$tmpdepfile2" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" - do - test -f "$tmpdepfile" && break - done - if test -f "$tmpdepfile"; then - sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile" - # Add `dependent.h:' lines. 
- sed -ne '2,${ - s/^ *// - s/ \\*$// - s/$/:/ - p - }' "$tmpdepfile" >> "$depfile" - else - echo "#dummy" > "$depfile" - fi - rm -f "$tmpdepfile" "$tmpdepfile2" - ;; - -tru64) - # The Tru64 compiler uses -MD to generate dependencies as a side - # effect. `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'. - # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put - # dependencies in `foo.d' instead, so we check for that too. - # Subdirectories are respected. - dir=`echo "$object" | sed -e 's|/[^/]*$|/|'` - test "x$dir" = "x$object" && dir= - base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'` - - if test "$libtool" = yes; then - # With Tru64 cc, shared objects can also be used to make a - # static library. This mechanism is used in libtool 1.4 series to - # handle both shared and static libraries in a single compilation. - # With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d. - # - # With libtool 1.5 this exception was removed, and libtool now - # generates 2 separate objects for the 2 libraries. These two - # compilations output dependencies in $dir.libs/$base.o.d and - # in $dir$base.o.d. We have to check for both files, because - # one of the two compilations can be disabled. We should prefer - # $dir$base.o.d over $dir.libs/$base.o.d because the latter is - # automatically cleaned when .libs/ is deleted, while ignoring - # the former would cause a distcleancheck panic. - tmpdepfile1=$dir.libs/$base.lo.d # libtool 1.4 - tmpdepfile2=$dir$base.o.d # libtool 1.5 - tmpdepfile3=$dir.libs/$base.o.d # libtool 1.5 - tmpdepfile4=$dir.libs/$base.d # Compaq CCC V6.2-504 - "$@" -Wc,-MD - else - tmpdepfile1=$dir$base.o.d - tmpdepfile2=$dir$base.d - tmpdepfile3=$dir$base.d - tmpdepfile4=$dir$base.d - "$@" -MD - fi - - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4" - do - test -f "$tmpdepfile" && break - done - if test -f "$tmpdepfile"; then - sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile" - # That's a tab and a space in the []. - sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile" - else - echo "#dummy" > "$depfile" - fi - rm -f "$tmpdepfile" - ;; - -msvc7) - if test "$libtool" = yes; then - showIncludes=-Wc,-showIncludes - else - showIncludes=-showIncludes - fi - "$@" $showIncludes > "$tmpdepfile" - stat=$? - grep -v '^Note: including file: ' "$tmpdepfile" - if test "$stat" = 0; then : - else - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - echo "$object : \\" > "$depfile" - # The first sed program below extracts the file names and escapes - # backslashes for cygpath. The second sed program outputs the file - # name when reading, but also accumulates all include files in the - # hold buffer in order to output them again at the end. This only - # works with sed implementations that can handle large buffers. - sed < "$tmpdepfile" -n ' -/^Note: including file: *\(.*\)/ { - s//\1/ - s/\\/\\\\/g - p -}' | $cygpath_u | sort -u | sed -n ' -s/ /\\ /g -s/\(.*\)/ \1 \\/p -s/.\(.*\) \\/\1:/ -H -$ { - s/.*/ / - G - p -}' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -msvc7msys) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. 
- exit 1 - ;; - -#nosideeffect) - # This comment above is used by automake to tell side-effect - # dependency tracking mechanisms from slower ones. - -dashmstdout) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout, regardless of -o. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - # Remove `-o $object'. - IFS=" " - for arg - do - case $arg in - -o) - shift - ;; - $object) - shift - ;; - *) - set fnord "$@" "$arg" - shift # fnord - shift # $arg - ;; - esac - done - - test -z "$dashmflag" && dashmflag=-M - # Require at least two characters before searching for `:' - # in the target name. This is to cope with DOS-style filenames: - # a dependency such as `c:/foo/bar' could be seen as target `c' otherwise. - "$@" $dashmflag | - sed 's:^[ ]*[^: ][^:][^:]*\:[ ]*:'"$object"'\: :' > "$tmpdepfile" - rm -f "$depfile" - cat < "$tmpdepfile" > "$depfile" - tr ' ' ' -' < "$tmpdepfile" | \ -## Some versions of the HPUX 10.20 sed can't process this invocation -## correctly. Breaking it into two sed invocations is a workaround. - sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -dashXmstdout) - # This case only exists to satisfy depend.m4. It is never actually - # run, as this mode is specially recognized in the preamble. - exit 1 - ;; - -makedepend) - "$@" || exit $? - # Remove any Libtool call - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - # X makedepend - shift - cleared=no eat=no - for arg - do - case $cleared in - no) - set ""; shift - cleared=yes ;; - esac - if test $eat = yes; then - eat=no - continue - fi - case "$arg" in - -D*|-I*) - set fnord "$@" "$arg"; shift ;; - # Strip any option that makedepend may not understand. Remove - # the object too, otherwise makedepend will parse it as a source file. - -arch) - eat=yes ;; - -*|$object) - ;; - *) - set fnord "$@" "$arg"; shift ;; - esac - done - obj_suffix=`echo "$object" | sed 's/^.*\././'` - touch "$tmpdepfile" - ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@" - rm -f "$depfile" - # makedepend may prepend the VPATH from the source file name to the object. - # No need to regex-escape $object, excess matching of '.' is harmless. - sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile" - sed '1,2d' "$tmpdepfile" | tr ' ' ' -' | \ -## Some versions of the HPUX 10.20 sed can't process this invocation -## correctly. Breaking it into two sed invocations is a workaround. - sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" "$tmpdepfile".bak - ;; - -cpp) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - # Remove `-o $object'. 
- IFS=" " - for arg - do - case $arg in - -o) - shift - ;; - $object) - shift - ;; - *) - set fnord "$@" "$arg" - shift # fnord - shift # $arg - ;; - esac - done - - "$@" -E | - sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ - -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' | - sed '$ s: \\$::' > "$tmpdepfile" - rm -f "$depfile" - echo "$object : \\" > "$depfile" - cat < "$tmpdepfile" >> "$depfile" - sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -msvisualcpp) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test "X$1" != 'X--mode=compile'; do - shift - done - shift - fi - - IFS=" " - for arg - do - case "$arg" in - -o) - shift - ;; - $object) - shift - ;; - "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI") - set fnord "$@" - shift - shift - ;; - *) - set fnord "$@" "$arg" - shift - shift - ;; - esac - done - "$@" -E 2>/dev/null | - sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile" - rm -f "$depfile" - echo "$object : \\" > "$depfile" - sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s:: \1 \\:p' >> "$depfile" - echo " " >> "$depfile" - sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -msvcmsys) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -none) - exec "$@" - ;; - -*) - echo "Unknown depmode $depmode" 1>&2 - exit 1 - ;; -esac - -exit 0 - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" -# time-stamp-end: "; # UTC" -# End: diff --git a/equi/blake2/blake2-config.h b/equi/blake2/blake2-config.h new file mode 100644 index 0000000000..2a8f9ed6e6 --- /dev/null +++ b/equi/blake2/blake2-config.h @@ -0,0 +1,81 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2_CONFIG_H__ +#define __BLAKE2_CONFIG_H__ + +// These don't work everywhere +#if (defined(__SSE2__) || defined(_M_AMD_64) || defined(_M_X64)) +#define HAVE_SSE2 +#endif + +#if defined(__SSSE3__) +#define HAVE_SSSE3 +#endif + +#if defined(__SSE4_1__) +#define HAVE_SSE41 +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#endif + +#if defined(__XOP__) +#define HAVE_XOP +#endif + + +#ifdef HAVE_AVX2 +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_XOP +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_AVX +#ifndef HAVE_SSE41 +#define HAVE_SSE41 +#endif +#endif + +#ifdef HAVE_SSE41 +#ifndef HAVE_SSSE3 +#define HAVE_SSSE3 +#endif +#endif + +#ifdef HAVE_SSSE3 +#define HAVE_SSE2 +#endif + +#if !defined(HAVE_SSE2) + +#ifdef _MSC_VER +// enforce required stuff for now +#define HAVE_SSE2 +//#define HAVE_SSSE3 +#define HAVE_SSE41 +#else +# error "This code requires at least SSE 4.1" +#endif + +#endif + +#endif + diff --git a/equi/blake2/blake2-impl.h b/equi/blake2/blake2-impl.h new file mode 100644 index 0000000000..16219dbcb7 --- /dev/null +++ b/equi/blake2/blake2-impl.h @@ -0,0 +1,136 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2_IMPL_H__ +#define __BLAKE2_IMPL_H__ + +#include + +static inline uint32_t load32( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + uint32_t w = *p++; + w |= ( uint32_t )( *p++ ) << 8; + w |= ( uint32_t )( *p++ ) << 16; + w |= ( uint32_t )( *p++ ) << 24; + return w; +#endif +} + +static inline uint64_t load64( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + uint64_t w = *p++; + w |= ( uint64_t )( *p++ ) << 8; + w |= ( uint64_t )( *p++ ) << 16; + w |= ( uint64_t )( *p++ ) << 24; + w |= ( uint64_t )( *p++ ) << 32; + w |= ( uint64_t )( *p++ ) << 40; + w |= ( uint64_t )( *p++ ) << 48; + w |= ( uint64_t )( *p++ ) << 56; + return w; +#endif +} + +static inline void store32( void *dst, uint32_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static inline void store64( void *dst, uint64_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static inline uint64_t load48( const void *src ) +{ + const uint8_t *p = ( const uint8_t * )src; + uint64_t w = *p++; + w |= ( uint64_t )( *p++ ) << 8; + w |= ( uint64_t )( *p++ ) << 16; + w |= ( uint64_t )( *p++ ) << 24; + w |= ( uint64_t )( *p++ ) << 32; + w |= ( uint64_t )( *p++ ) << 40; + return w; +} + +static 
inline void store48( void *dst, uint64_t w ) +{ + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +} + +static inline uint32_t rotl32( const uint32_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 32 - c ) ); +} + +static inline uint64_t rotl64( const uint64_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 64 - c ) ); +} + +static inline uint32_t rotr32( const uint32_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 32 - c ) ); +} + +static inline uint64_t rotr64( const uint64_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/* prevents compiler optimizing out memset() */ +static inline void secure_zero_memory( void *v, size_t n ) +{ + volatile uint8_t *p = ( volatile uint8_t * )v; + while( n-- ) *p++ = 0; +} + +#endif + diff --git a/equi/blake2/blake2-round.h b/equi/blake2/blake2-round.h new file mode 100644 index 0000000000..400ed20340 --- /dev/null +++ b/equi/blake2/blake2-round.h @@ -0,0 +1,85 @@ +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + row1l = _mm_add_epi64(row1l, row2l); \ + row1h = _mm_add_epi64(row1h, row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + row1l = _mm_add_epi64(row1l, row2l); \ + row1h = _mm_add_epi64(row1h, row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); \ + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; + +#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + 
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); diff --git a/equi/blake2/blake2.h b/equi/blake2/blake2.h new file mode 100644 index 0000000000..320657640c --- /dev/null +++ b/equi/blake2/blake2.h @@ -0,0 +1,85 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2_H__ +#define __BLAKE2_H__ + +#include +#include + +#if defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__ ((__aligned__(x))) +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + + enum blake2b_constant + { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 + }; + +#pragma pack(push, 1) + typedef struct __blake2b_param + { + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint64_t node_offset; // 16 + uint8_t node_depth; // 17 + uint8_t inner_length; // 18 + uint8_t reserved[14]; // 32 + uint8_t salt[BLAKE2B_SALTBYTES]; // 48 + uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64 + } blake2b_param; + + ALIGN( 64 ) typedef struct __blake2b_state + { + uint64_t h[8]; + uint8_t buf[BLAKE2B_BLOCKBYTES]; + uint16_t counter; + uint8_t buflen; + uint8_t lastblock; + } blake2b_state; + +#pragma pack(pop) + + int eq_blake2b_init( blake2b_state *S, const uint8_t outlen ); + int eq_blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int eq_blake2b_init_param( blake2b_state *S, const blake2b_param *P ); + int eq_blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ); + int eq_blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ); + + // Simple API + int eq_blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + + static inline int eq_blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) + { + return eq_blake2b( out, in, key, outlen, inlen, keylen ); + } + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/equi/blake2/blake2b-load-sse2.h b/equi/blake2/blake2b-load-sse2.h new file mode 100644 index 0000000000..1ba153c87d --- /dev/null +++ b/equi/blake2/blake2b-load-sse2.h @@ -0,0 +1,68 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
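   For reference, the eq_blake2b_* API declared in blake2.h above follows the
   usual init/update/final hashing pattern. A minimal host-side sketch of a
   caller (the buffer contents and the 50-byte output length here are
   illustrative assumptions, not taken from this patch):

   #include <stdint.h>
   #include <stdio.h>
   #include "blake2.h"   // eq_blake2b_init / eq_blake2b_update / eq_blake2b_final

   int main(void)
   {
       uint8_t out[50];                  // any length up to BLAKE2B_OUTBYTES (64)
       const uint8_t input[140] = { 0 }; // e.g. a block header; contents illustrative

       blake2b_state S;
       eq_blake2b_init(&S, (uint8_t) sizeof(out));        // unkeyed, sequential mode
       eq_blake2b_update(&S, input, sizeof(input));
       eq_blake2b_final(&S, out, (uint8_t) sizeof(out));

       for (int i = 0; i < (int) sizeof(out); i++)
           printf("%02x", out[i]);
       printf("\n");
       return 0;
   }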
+*/ +#pragma once +#ifndef __BLAKE2B_LOAD_SSE2_H__ +#define __BLAKE2B_LOAD_SSE2_H__ + +#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) +#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5) +#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2) +#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7) +#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1) +#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13) +#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2) +#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6) +#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8) +#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11) +#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15) +#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14) +#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14) +#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13) +#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9) +#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2) +#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12) +#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1) +#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8) +#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6) +#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11) +#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3) +#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1) +#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4) +#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7) +#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6) +#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3) +#define 
LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12) +#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) + + +#endif + diff --git a/equi/blake2/blake2b-load-sse41.h b/equi/blake2/blake2b-load-sse41.h new file mode 100644 index 0000000000..f6c1bc8393 --- /dev/null +++ b/equi/blake2/blake2b-load-sse41.h @@ -0,0 +1,402 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2B_LOAD_SSE41_H__ +#define __BLAKE2B_LOAD_SSE41_H__ + +#define LOAD_MSG_0_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_0_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_1_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_1_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_1_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_1_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#define LOAD_MSG_2_1(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m5, 8); \ +b1 = _mm_unpackhi_epi64(m2, m7); \ +} while(0) + + +#define LOAD_MSG_2_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m0); \ +b1 = _mm_blend_epi16(m1, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_2_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m5, m1, 0xF0); \ +b1 = _mm_unpackhi_epi64(m3, m4); \ +} while(0) + + +#define LOAD_MSG_2_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m3); \ +b1 = _mm_alignr_epi8(m2, m0, 8); \ +} while(0) + + +#define LOAD_MSG_3_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_unpackhi_epi64(m6, m5); \ +} while(0) + + +#define LOAD_MSG_3_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m0); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_3_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m2, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define 
LOAD_MSG_3_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m5); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_4_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m2); \ +b1 = _mm_unpacklo_epi64(m1, m5); \ +} while(0) + + +#define LOAD_MSG_4_2(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m0, m3, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m7, m5, 0xF0); \ +b1 = _mm_blend_epi16(m3, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m0, 8); \ +b1 = _mm_blend_epi16(m4, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_5_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m3); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_5_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m5); \ +b1 = _mm_unpackhi_epi64(m5, m1); \ +} while(0) + + +#define LOAD_MSG_5_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m2, m3, 0xF0); \ +b1 = _mm_unpackhi_epi64(m7, m0); \ +} while(0) + + +#define LOAD_MSG_5_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m2); \ +b1 = _mm_blend_epi16(m7, m4, 0xF0); \ +} while(0) + + +#define LOAD_MSG_6_1(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m6, m0, 0xF0); \ +b1 = _mm_unpacklo_epi64(m7, m2); \ +} while(0) + + +#define LOAD_MSG_6_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_alignr_epi8(m5, m6, 8); \ +} while(0) + + +#define LOAD_MSG_6_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m3); \ +b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ +} while(0) + + +#define LOAD_MSG_6_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_blend_epi16(m1, m5, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m3); \ +b1 = _mm_blend_epi16(m6, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_2(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpackhi_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_7_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_unpacklo_epi64(m4, m1); \ +} while(0) + + +#define LOAD_MSG_7_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m2); \ +b1 = _mm_unpacklo_epi64(m3, m5); \ +} while(0) + + +#define LOAD_MSG_8_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m7); \ +b1 = _mm_alignr_epi8(m0, m5, 8); \ +} while(0) + + +#define LOAD_MSG_8_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_alignr_epi8(m4, m1, 8); \ +} while(0) + + +#define LOAD_MSG_8_3(b0, b1) \ +do \ +{ \ +b0 = m6; \ +b1 = _mm_alignr_epi8(m5, m0, 8); \ +} while(0) + + +#define LOAD_MSG_8_4(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m3, 0xF0); \ +b1 = m2; \ +} while(0) + + +#define LOAD_MSG_9_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_unpackhi_epi64(m3, m0); \ +} while(0) + + +#define LOAD_MSG_9_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m2); \ +b1 = _mm_blend_epi16(m3, m2, 0xF0); \ +} while(0) + + +#define LOAD_MSG_9_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_unpackhi_epi64(m1, m6); \ +} while(0) + + +#define LOAD_MSG_9_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpacklo_epi64(m6, m0); \ +} while(0) + + +#define LOAD_MSG_10_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define 
LOAD_MSG_10_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_10_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_11_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_11_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_11_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_11_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#endif + diff --git a/equi/blake2/blake2b-round.h b/equi/blake2/blake2b-round.h new file mode 100644 index 0000000000..b39106b8f3 --- /dev/null +++ b/equi/blake2/blake2b-round.h @@ -0,0 +1,170 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2B_ROUND_H__ +#define __BLAKE2B_ROUND_H__ + +#define LOAD(p) _mm_load_si128( (const __m128i *)(p) ) +#define STORE(p,r) _mm_store_si128((__m128i *)(p), r) + +#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) + +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +#define LIKELY(x) __builtin_expect((x),1) + + +/* Microarchitecture-specific macros */ +#ifndef HAVE_XOP +#ifdef HAVE_SSSE3 +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) +#else +#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) )) +#endif +#else +/* ... 
*/ +#endif + + + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, (-32)); \ + row4h = _mm_roti_epi64(row4h, (-32)); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, (-24)); \ + row2h = _mm_roti_epi64(row2h, (-24)); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, (-16)); \ + row4h = _mm_roti_epi64(row4h, (-16)); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, (-63)); \ + row2h = _mm_roti_epi64(row2h, (-63)); \ + +#if defined(HAVE_SSSE3) +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; +#else + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row4l;\ + t1 = row2l;\ + row4l = row3l;\ + row3l = row3h;\ + row3h = row4l;\ + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row3l;\ + row3l = row3h;\ + row3h = t0;\ + t0 = row2l;\ + t1 = row4l;\ + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) + +#endif + +#if defined(HAVE_SSE41) +#include "blake2b-load-sse41.h" +#else +#include "blake2b-load-sse2.h" +#endif + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + 
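// For orientation: the G1/G2 macro pair above evaluates two BLAKE2b G functions
// at a time, one in each 64-bit lane of the __m128i registers (G1 is the 32/24
// rotation half, G2 the 16/63 half), and DIAGONALIZE/UNDIAGONALIZE only
// re-shuffle the registers so the diagonal steps can reuse the column-wise code.
// The same mixing step written out as plain scalar C, as a reference sketch
// (not code from this patch):

#include <stdint.h>

static inline uint64_t rotr64_ref(uint64_t w, unsigned c)
{
    return (w >> c) | (w << (64 - c));
}

// One BLAKE2b G: mixes two message words x, y into four state words.
static void blake2b_g_ref(uint64_t v[16], int a, int b, int c, int d,
                          uint64_t x, uint64_t y)
{
    v[a] = v[a] + v[b] + x;  v[d] = rotr64_ref(v[d] ^ v[a], 32);
    v[c] = v[c] + v[d];      v[b] = rotr64_ref(v[b] ^ v[c], 24);
    v[a] = v[a] + v[b] + y;  v[d] = rotr64_ref(v[d] ^ v[a], 16);
    v[c] = v[c] + v[d];      v[b] = rotr64_ref(v[b] ^ v[c], 63);
}

// One full round: four column steps, then four diagonal steps. s[] is the
// round's sigma permutation, which the LOAD_MSG_r_* macros hard-code above.
static void blake2b_round_ref(uint64_t v[16], const uint64_t m[16], const uint8_t s[16])
{
    blake2b_g_ref(v, 0, 4,  8, 12, m[s[ 0]], m[s[ 1]]);
    blake2b_g_ref(v, 1, 5,  9, 13, m[s[ 2]], m[s[ 3]]);
    blake2b_g_ref(v, 2, 6, 10, 14, m[s[ 4]], m[s[ 5]]);
    blake2b_g_ref(v, 3, 7, 11, 15, m[s[ 6]], m[s[ 7]]);
    blake2b_g_ref(v, 0, 5, 10, 15, m[s[ 8]], m[s[ 9]]);
    blake2b_g_ref(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
    blake2b_g_ref(v, 2, 7,  8, 13, m[s[12]], m[s[13]]);
    blake2b_g_ref(v, 3, 4,  9, 14, m[s[14]], m[s[15]]);
}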
+#endif + +#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); diff --git a/equi/blake2/blake2bx.cpp b/equi/blake2/blake2bx.cpp new file mode 100644 index 0000000000..6f5c04175a --- /dev/null +++ b/equi/blake2/blake2bx.cpp @@ -0,0 +1,262 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ + +#include +#include +#include + + +#include "blake2.h" +#include "blake2-impl.h" + +#include "blake2-config.h" + +#ifdef WIN32 +#include +#endif + +#include +#if defined(HAVE_SSSE3) +#include +#endif +#if defined(HAVE_SSE41) +#include +#endif +#if defined(HAVE_AVX) +#include +#endif +#if defined(HAVE_XOP) +#include +#endif + +#include "blake2b-round.h" + +ALIGN(64) static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/* init xors IV with input parameter block */ +int eq_blake2b_init_param(blake2b_state *S, const blake2b_param *P) +{ + //blake2b_init0( S ); + const uint8_t * v = (const uint8_t *)(blake2b_IV); + const uint8_t * p = (const uint8_t *)(P); + uint8_t * h = (uint8_t *)(S->h); + /* IV XOR ParamBlock */ + memset(S, 0, sizeof(blake2b_state)); + + for (int i = 0; i < BLAKE2B_OUTBYTES; ++i) h[i] = v[i] ^ p[i]; + + return 0; +} + +/* Some sort of default parameter block initialization, for sequential blake2b */ +int eq_blake2b_init(blake2b_state *S, const uint8_t outlen) +{ + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1; + + const blake2b_param P = + { + outlen, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + { 0 }, + { 0 }, + { 0 } + }; + return eq_blake2b_init_param(S, &P); +} + +int eq_blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen) +{ + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1; + + if ((!keylen) || keylen > BLAKE2B_KEYBYTES) return -1; + + const blake2b_param P = + { + outlen, + keylen, + 1, + 1, + 0, + 0, + 0, + 0, + { 0 }, + { 0 }, + { 0 } + }; + + if (eq_blake2b_init_param(S, &P) < 0) + return 0; + + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset(block, 0, BLAKE2B_BLOCKBYTES); + memcpy(block, key, keylen); + eq_blake2b_update(S, block, BLAKE2B_BLOCKBYTES); + secure_zero_memory(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */ + } + return 0; +} + +static inline int blake2b_compress(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES]) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; +#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) + const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i 
r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); +#endif +#if defined(HAVE_SSE41) + const __m128i m0 = LOADU(block + 00); + const __m128i m1 = LOADU(block + 16); + const __m128i m2 = LOADU(block + 32); + const __m128i m3 = LOADU(block + 48); + const __m128i m4 = LOADU(block + 64); + const __m128i m5 = LOADU(block + 80); + const __m128i m6 = LOADU(block + 96); + const __m128i m7 = LOADU(block + 112); +#else + const uint64_t m0 = ( ( uint64_t * )block )[ 0]; + const uint64_t m1 = ( ( uint64_t * )block )[ 1]; + const uint64_t m2 = ( ( uint64_t * )block )[ 2]; + const uint64_t m3 = ( ( uint64_t * )block )[ 3]; + const uint64_t m4 = ( ( uint64_t * )block )[ 4]; + const uint64_t m5 = ( ( uint64_t * )block )[ 5]; + const uint64_t m6 = ( ( uint64_t * )block )[ 6]; + const uint64_t m7 = ( ( uint64_t * )block )[ 7]; + const uint64_t m8 = ( ( uint64_t * )block )[ 8]; + const uint64_t m9 = ( ( uint64_t * )block )[ 9]; + const uint64_t m10 = ( ( uint64_t * )block )[10]; + const uint64_t m11 = ( ( uint64_t * )block )[11]; + const uint64_t m12 = ( ( uint64_t * )block )[12]; + const uint64_t m13 = ( ( uint64_t * )block )[13]; + const uint64_t m14 = ( ( uint64_t * )block )[14]; + const uint64_t m15 = ( ( uint64_t * )block )[15]; +#endif + row1l = LOADU(&S->h[0]); + row1h = LOADU(&S->h[2]); + row2l = LOADU(&S->h[4]); + row2h = LOADU(&S->h[6]); + row3l = LOADU(&blake2b_IV[0]); + row3h = LOADU(&blake2b_IV[2]); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0, S->counter)); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L - S->lastblock, 0L - S->lastblock)); + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); + STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); + STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); + return 0; +} + +int eq_blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen) +{ + while (inlen > 0) + { + size_t left = S->buflen; + size_t fill = BLAKE2B_BLOCKBYTES - left; + + if (inlen > fill) + { + memcpy(S->buf + left, in, fill); // Fill buffer + in += fill; + inlen -= fill; + S->counter += BLAKE2B_BLOCKBYTES; + blake2b_compress(S, S->buf); // Compress + S->buflen = 0; + } + else // inlen <= fill + { + memcpy(S->buf + left, in, inlen); + S->buflen += (uint8_t) inlen; // not enough to compress + in += inlen; + inlen = 0; + } + } + + return 0; +} + +int eq_blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen) +{ + if (outlen > BLAKE2B_OUTBYTES) + return -1; + + if (S->buflen > BLAKE2B_BLOCKBYTES) + { + S->counter += BLAKE2B_BLOCKBYTES; + blake2b_compress(S, S->buf); + S->buflen -= BLAKE2B_BLOCKBYTES; + memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen); + } + + S->counter += S->buflen; + S->lastblock = 1; + memset(S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + memcpy(out, &S->h[0], outlen); + S->lastblock = 0; + return 0; +} + +int eq_blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen) +{ + blake2b_state S[1]; + + /* Verify parameters */ + if (!in || !out) return -1; + if (NULL == key) keylen = 0; + + if (keylen) + 
{ + if (eq_blake2b_init_key(S, outlen, key, keylen) < 0) return -1; + } + else + { + if (eq_blake2b_init(S, outlen) < 0) return -1; + } + + eq_blake2b_update(S, (const uint8_t *)in, inlen); + eq_blake2b_final(S, out, outlen); + return 0; +} diff --git a/equi/cuda_equi.cu b/equi/cuda_equi.cu new file mode 100644 index 0000000000..a3b76a7733 --- /dev/null +++ b/equi/cuda_equi.cu @@ -0,0 +1,2130 @@ +/* + * Equihash solver created by djeZo (l33tsoftw@gmail.com) for NiceHash + * Adapted to be more compatible with older C++ compilers + * + * cuda_djezo solver was released by NiceHash (www.nicehash.com) under + * GPL 3.0 license. If you don't have a copy, you can obtain one from + * https://www.gnu.org/licenses/gpl-3.0.txt + * + * Based on CUDA solver by John Tromp released under MIT license. + * Some helper functions taken out of OpenCL solver by Marc Bevand + * released under MIT license. + * + * Copyright (c) 2016 John Tromp, Marc Bevand + * Copyright (c) 2017 djeZo, Tanguy Pruvot (GPL v3) + */ + +#ifdef WIN32 +#include +#endif + +#include +#include +//#include + +#include "equihash.h" +#include "eqcuda.hpp" // eq_cuda_context + +#include "blake2/blake2.h" + +//#define WN 200 +//#define WK 9 +#ifndef MAX_GPUS +#define MAX_GPUS 16 +#endif + +#define NDIGITS (WK+1) +#define DIGITBITS (WN/(NDIGITS)) +#define PROOFSIZE (1< +#include +#define __launch_bounds__(max_tpb, min_blocks) +#define __CUDA_ARCH__ 520 +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl2(uint32_t x, uint32_t y); +uint32_t __shfl_sync(uint32_t mask, uint32_t x, uint32_t y); +uint32_t atomicExch(uint32_t *x, uint32_t y); +uint32_t atomicAdd(uint32_t *x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +void __threadfence_block(void); +uint32_t __ldg(const uint32_t* address); +uint64_t __ldg(const uint64_t* address); +uint4 __ldca(const uint4 *ptr); +u32 __ldca(const u32 *ptr); +u32 umin(const u32, const u32); +u32 umax(const u32, const u32); +#endif + +#define OPT_SYNC_ALL + +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane) +#undef __any +#define __any(p) __any_sync(0xFFFFFFFFu, p) +#else +#define __shfl2 __shfl +#endif + +typedef u32 proof[PROOFSIZE]; + +struct __align__(32) slot { + u32 hash[8]; +}; + +struct __align__(16) slotsmall { + u32 hash[4]; +}; + +struct __align__(8) slottiny { + u32 hash[2]; +}; + +template +struct equi +{ + slot round0trees[4096][RB8_NSLOTS]; + slot trees[1][NBUCKETS][NSLOTS]; + struct { + slotsmall treessmall[NSLOTS]; + slottiny treestiny[NSLOTS]; + } round2trees[NBUCKETS]; + struct { + slotsmall treessmall[NSLOTS]; + slottiny treestiny[NSLOTS]; + } round3trees[NBUCKETS]; + slotsmall treessmall[4][NBUCKETS][NSLOTS]; + slottiny treestiny[1][4096][RB8_NSLOTS_LD]; + u32 round4bidandsids[NBUCKETS][NSLOTS]; + union { + u64 blake_h[8]; + u32 blake_h32[16]; + }; + struct { + u32 nslots8[4096]; + u32 nslots0[4096]; + u32 nslots[9][NBUCKETS]; + scontainerreal srealcont; + } edata; +}; + +// todo: use cuda_helper.h and/or cuda_vector.h +__device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) +{ + return make_uint2(a.x ^ b.x, a.y ^ b.y); +} + +__device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b) +{ + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +// for ROR 63 (or ROL 1); this func only support (32 <= offset < 64) +__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; 
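	// On sm_30+, each shf.r.wrap.b32 below funnel-shifts the concatenation of the
	// two 32-bit halves right by offset (wrap mode reduces it mod 32); using the
	// halves in swapped order for the two instructions gives the low and high words
	// of the 64-bit right-rotation. The fallback computes the same result with plain
	// shifts. Both paths rely on 32 <= offset < 64, as noted above.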
+#if __CUDA_ARCH__ > 300 + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } +#else + result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); + result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset))); +#endif + return result; +} + + +__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +__device__ __forceinline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + return result; +} + +__device__ __forceinline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + return result; +} + +__device__ __forceinline__ void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) +{ + a = a + b + x; + ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]); + a = a + b + y; + ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +// untested.. +struct packer_default +{ + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (((bucketid << SLOTBITS) | s0) << SLOTBITS) | s1; + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + // BUCKMASK-ed to prevent illegal memory accesses in case of memory errors + return (bid >> (2 * SLOTBITS)) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return bid & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> SLOTBITS) & SLOTMASK; + } +}; + + +struct packer_cantor +{ + __device__ __forceinline__ static u32 cantor(const u32 s0, const u32 s1) + { + u32 a = umax(s0, s1); + u32 b = umin(s0, s1); + return a * (a + 1) / 2 + b; + } + + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (bucketid << CANTORBITS) | cantor(s0, s1); + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> CANTORBITS) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return ((bid & CANTORMASK) - cantor(0, s1)) & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + u32 k, q, sqr = 8 * (bid & CANTORMASK) + 1; + // this k=sqrt(sqr) computing loop averages 3.4 iterations out of maximum 9 + for (k = CANTORMAXSQRT; (q = sqr / k) < k; k = (k + q) / 2); + return ((k - 1) / 2) & SLOTMASK; + } +}; + +__device__ __constant__ const u64 blake_iv[] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +#if CUDART_VERSION < 8000 || !defined(__ldca) +#define __ldca(ptr) *(ptr) +#endif + +template +__global__ void digit_first(equi* eq, u32 nonce) +{ + const u32 block = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ u64 hash_h[8]; + u32* hash_h32 = 
(u32*)hash_h; + + if (threadIdx.x < 16) + hash_h32[threadIdx.x] = __ldca(&eq->blake_h32[threadIdx.x]); + + __syncthreads(); + + u64 m = (u64)block << 32 | (u64)nonce; + + union + { + u64 v[16]; + u32 v32[32]; + uint4 v128[8]; + }; + + v[0] = hash_h[0]; + v[1] = hash_h[1]; + v[2] = hash_h[2]; + v[3] = hash_h[3]; + v[4] = hash_h[4]; + v[5] = hash_h[5]; + v[6] = hash_h[6]; + v[7] = hash_h[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4] ^ (128 + 16); + v[13] = blake_iv[5]; + v[14] = blake_iv[6] ^ 0xffffffffffffffff; + v[15] = blake_iv[7]; + + // mix 1 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 2 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 3 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, m); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 4 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, m); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 5 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, m); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 6 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], m, 0); + + // mix 7 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], m, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 8 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, m); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 9 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], m, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 10 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + 
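	// m = (block << 32 | nonce) is the only non-zero word of the final 16-byte
	// message block (the counter was set to 128 + 16 above), so every other x/y
	// argument passed to G2 is 0. It is injected at the positions the BLAKE2b sigma
	// schedule assigns to message word 1; in this mix that is the fourth column
	// step, just below.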
G2(v[3], v[7], v[11], v[15], m, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 11 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 12 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + v[0] ^= hash_h[0] ^ v[8]; + v[1] ^= hash_h[1] ^ v[9]; + v[2] ^= hash_h[2] ^ v[10]; + v[3] ^= hash_h[3] ^ v[11]; + v[4] ^= hash_h[4] ^ v[12]; + v[5] ^= hash_h[5] ^ v[13]; + v32[12] ^= hash_h32[12] ^ v32[28]; + + u32 bexor = __byte_perm(v32[0], 0, 0x4012); // first 20 bits + u32 bucketid; + asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor)); + u32 slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1); + if (slotp < RB8_NSLOTS) + { + slot* s = &eq->round0trees[bucketid][slotp]; + + uint4 tt; + tt.x = __byte_perm(v32[0], v32[1], 0x1234); + tt.y = __byte_perm(v32[1], v32[2], 0x1234); + tt.z = __byte_perm(v32[2], v32[3], 0x1234); + tt.w = __byte_perm(v32[3], v32[4], 0x1234); + *(uint4*)(&s->hash[0]) = tt; + + tt.x = __byte_perm(v32[4], v32[5], 0x1234); + tt.y = __byte_perm(v32[5], v32[6], 0x1234); + tt.z = 0; + tt.w = block << 1; + *(uint4*)(&s->hash[4]) = tt; + } + + bexor = __byte_perm(v32[6], 0, 0x0123); + asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor)); + slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1); + if (slotp < RB8_NSLOTS) + { + slot* s = &eq->round0trees[bucketid][slotp]; + + uint4 tt; + tt.x = __byte_perm(v32[6], v32[7], 0x2345); + tt.y = __byte_perm(v32[7], v32[8], 0x2345); + tt.z = __byte_perm(v32[8], v32[9], 0x2345); + tt.w = __byte_perm(v32[9], v32[10], 0x2345); + *(uint4*)(&s->hash[0]) = tt; + + tt.x = __byte_perm(v32[10], v32[11], 0x2345); + tt.y = __byte_perm(v32[11], v32[12], 0x2345); + tt.z = 0; + tt.w = (block << 1) + 1; + *(uint4*)(&s->hash[4]) = tt; + } +} + +/* + Functions digit_1 to digit_8 works by the same principle; + Each thread does 2-3 slot loads (loads are coalesced). + Xorwork of slots is loaded into shared memory and is kept in registers (except for digit_1). + At the same time, restbits (8 or 9 bits) in xorwork are used for collisions. + Restbits determine position in ht. + Following next is pair creation. First one (or two) pairs' xorworks are put into global memory + as soon as possible, the rest pairs are saved in shared memory (one u32 per pair - 16 bit indices). + In most cases, all threads have one (or two) pairs so with this trick, we offload memory writes a bit in last step. + In last step we save xorwork of pairs in memory. 
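	As an illustration only (plain host-side C, not code from these kernels), the
	per-bucket collision step described above reduces to the following pattern,
	with the shared-memory tables replaced by ordinary arrays, the bit-field
	extraction replaced by a mask, and the sizes chosen arbitrarily:

	#include <stdint.h>
	#include <string.h>

	#define NRESTS_ILL   512     // illustrative sizes, not the template parameters above
	#define SSM_ILL      12
	#define MAXPAIRS_ILL 8192

	// Bin the slots of one bucket by their restbits, then pair each slot with the
	// earlier slots that landed in the same bin; those pairs are what the next
	// digit round XORs together.
	static size_t make_pairs(const uint32_t *restbits, uint32_t nslots,
	                         uint32_t pairs[MAXPAIRS_ILL])
	{
	    uint16_t ht[NRESTS_ILL][SSM_ILL - 1];
	    uint16_t ht_len[NRESTS_ILL];
	    size_t npairs = 0;

	    memset(ht_len, 0, sizeof(ht_len));

	    for (uint32_t s = 0; s < nslots; s++) {
	        uint32_t hr  = restbits[s] & (NRESTS_ILL - 1);
	        uint16_t pos = ht_len[hr]++;
	        if (pos >= SSM_ILL)            // overfull bins drop later slots, as in the kernels
	            continue;
	        for (uint16_t k = 0; k < pos && npairs < MAXPAIRS_ILL; k++)
	            pairs[npairs++] = (s << 16) | ht[hr][k];   // two 16-bit slot ids per u32
	        if (pos < SSM_ILL - 1)
	            ht[hr][pos] = (uint16_t)s;
	    }
	    return npairs;
	}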
+*/ +template +__global__ void digit_1(equi* eq) +{ + __shared__ u16 ht[256][SSM - 1]; + __shared__ uint2 lastword1[RB8_NSLOTS]; + __shared__ uint4 lastword2[RB8_NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < 256) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots0[bucketid], RB8_NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + uint2 ta[2]; + uint4 tb[2]; + + u32 si[2]; + +#ifdef OPT_SYNC_ALL + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + __syncthreads(); +#endif + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slot* pslot1 = eq->round0trees[bucketid] + si[i]; + + // get xhash + uint4 a1 = *(uint4*)(&pslot1->hash[0]); + uint2 a2 = *(uint2*)(&pslot1->hash[4]); + ta[i].x = a1.x; + ta[i].y = a1.y; + lastword1[si[i]] = ta[i]; + tb[i].x = a1.z; + tb[i].y = a1.w; + tb[i].z = a2.x; + tb[i].w = a2.y; + lastword2[si[i]] = tb[i]; + + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(hr[i]) : "r"(ta[i].x)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + int* pairs = ht_len; + + u32 xors[6]; + u32 xorbucketid, xorslot; + + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = ta[i] ^ lastword1[p]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[si[i]] ^ lastword2[p]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, si[i], p, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[i] ^ lastword2[k]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, i, k, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + } +} + + +template +__global__ void digit_2(equi* eq) +{ + __shared__ u16 ht[NRESTS][SSM - 1]; + __shared__ u32 lastword1[NSLOTS]; + __shared__ uint4 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + 
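	// Each entry of pairs[] packs the two colliding 16-bit slot indices into a
	// single u32 (__byte_perm with selector 0x1054 below); the pair-processing
	// loop at the end of this kernel unpacks them with selectors 0x4510/0x4532.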
__shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slot* buck = eq->trees[0][bucketid]; + u32 bsize = umin(eq->edata.nslots[1][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 ta[2]; + uint4 tt[2]; + + u32 si[2]; +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + // get slot + const slot* pslot1 = buck + si[i]; + + uint4 ttx = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = ta[i] = ttx.x; + uint2 tty = *(uint2*)(&pslot1->hash[4]); + tt[i].x = ttx.y; + tt[i].y = ttx.z; + tt[i].z = ttx.w; + tt[i].w = tty.x; + lastword2[si[i]] = tt[i]; + + hr[i] = tty.y & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[0] = ta[i] ^ lastword1[p]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = tt[i] ^ lastword2[p]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[0] = lastword1[i] ^ lastword1[k]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = lastword2[i] ^ lastword2[k]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } +} + + +template +__global__ void digit_3(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[2][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + u32 
ta[2]; + +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round2trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round2trees[bucketid].treestiny[si[i]]; + + tt[i] = *(uint4*)(&xs.hash[0]); + lastword1[si[i]] = tt[i]; + ta[i] = xst.hash[0]; + lastword2[si[i]] = ta[i]; + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[4] = ta[i] ^ lastword2[p]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = tt[i] ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[4] = lastword2[i] ^ lastword2[k]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + } +} + + +template +__global__ void digit_4(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[3][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round3trees[bucketid].treessmall[si[i]]; + slottiny &xst = 
eq->round3trees[bucketid].treestiny[si[i]]; + + // get xhash + tt[i] = *(uint4*)(&xs.hash[0]); + lastword[si[i]] = tt[i]; + hr[i] = xst.hash[0] & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + } + } + } +} + + +template +__global__ void digit_5(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slotsmall* buck = eq->treessmall[3][bucketid]; + u32 bsize = umin(eq->edata.nslots[4][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; +#ifdef OPT_SYNC_ALL + __syncthreads(); +#endif + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword[si[i]] = tt[i]; + asm("bfe.u32 %0, %1, 4, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = 
eq->treessmall[2][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = xors[3]; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[1]; + tt.y = xors[2]; + tt.z = xors[3]; + tt.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = tt; + } + } + } +} + + +template +__global__ void digit_6(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint2 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[5][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[2][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = *(uint2*)(&tt[i].x); + lastword2[si[i]] = tt[i].z; + asm("bfe.u32 %0, %1, 16, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + // doing this to save shared memory + int* pairs = ht_len; + __syncthreads(); + + u32 xors[3]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = 
atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + u32 pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + xors[2] = lastword2[i] ^ lastword2[k]; + if (xors[2] == 0) + continue; + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } +} + + +template +__global__ void digit_7(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[6][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[0][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint4*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].z), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = 
atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if (xors[1] == 0) + continue; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[0]; + tt.y = xors[1]; + tt.z = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + tt.w = 0; + *(uint4*)(&xs.hash[0]) = tt; + } +} + + +template +__global__ void digit_8(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + next_pair = 0; + pairs_len = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[7][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[1][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint2 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint2*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 8, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = 
eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if (xors[1] == 0) + continue; + + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot >= RB8_NSLOTS_LD) continue; + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } +} + +/* + Last round function is similar to previous ones but has different ending. + We use warps to process final candidates. Each warp process one candidate. + First two bidandsids (u32 of stored bucketid and two slotids) are retreived by + lane 0 and lane 16, next four bidandsids by lane 0, 8, 16 and 24, ... until + all lanes in warp have bidandsids from round 4. Next, each thread retreives + 16 indices. While doing so, indices are put into comparison using atomicExch + to determine if there are duplicates (tromp's method). At the end, if no + duplicates are found, candidate solution is saved (all indices). Note that this + dup check method is not exact so CPU dup checking is needed after. 
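+
+  The test itself (see the CHECK_DUP macro below) can be pictured as:
+
+      cell = ind & ((1 << DUPBITS) - 1);              // low bits pick a cell in 'susp'
+      tag  = ind >> DUPBITS;                          // remaining high bits of the index
+      dup  = (atomicExch(&susp[cell], tag) == tag);   // same index stored there before?
+
+  The result is combined warp-wide with __any(). Two different indices that share
+  the low DUPBITS bits overwrite each other's tag, so a true duplicate can
+  occasionally be missed; that is why solutions are re-checked on the CPU with duped().
+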
+*/ +template +__global__ void digit_last_wdc(equi* eq) +{ + __shared__ u8 shared_data[8192]; + int* ht_len = (int*)(&shared_data[0]); + int* pairs = ht_len; + u32* lastword = (u32*)(&shared_data[256 * 4]); + u16* ht = (u16*)(&shared_data[256 * 4 + RB8_NSLOTS_LD * 4]); + u32* pairs_len = (u32*)(&shared_data[8188]); + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + #pragma unroll + for (u32 i = 0; i < FCT; i++) + ht_len[(i * (256 / FCT)) + threadid] = 0; + + if (threadid == ((256 / FCT) - 1)) + *pairs_len = 0; + + slottiny* buck = eq->treestiny[0][bucketid]; + u32 bsize = umin(eq->edata.nslots8[bucketid], RB8_NSLOTS_LD); + + u32 si[3 * FCT]; + u32 hr[3 * FCT]; + int pos[3 * FCT]; + u32 lw[3 * FCT]; + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + pos[i] = SSM; + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + si[i] = i * (256 / FCT) + threadid; + if (si[i] >= bsize) break; + + const slottiny* pslot1 = buck + si[i]; + + // get xhash + uint2 tt = *(uint2*)(&pslot1->hash[0]); + lw[i] = tt.x; + lastword[si[i]] = lw[i]; + + u32 a; + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(a) : "r"(lw[i])); + hr[i] = a; + + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) + ht[hr[i] * (SSM - 1) + pos[i]] = si[i]; + } + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + if (pos[i] >= SSM) continue; + + for (int k = 0; k != pos[i]; ++k) + { + u16 prev = ht[hr[i] * (SSM - 1) + k]; + if (lw[i] != lastword[prev]) continue; + u32 pindex = atomicAdd(pairs_len, 1); + if (pindex >= MAXPAIRS) break; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + + __syncthreads(); + u32 plen = umin(*pairs_len, 64); + +#define CALC_LEVEL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + u32* bucks = eq->round4bidandsids[PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1]; \ + levels[c] = bucks[slot0]; \ + } + +#define CALC_LEVEL_SMALL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + slotsmall* bucks = eq->treessmall[a][PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1].hash[d]; \ + levels[c] = bucks[slot0].hash[d]; \ + } + + u32 lane = threadIdx.x & 0x1f; + u32 par = threadIdx.x >> 5; + + u32* levels = (u32*)&pairs[MAXPAIRS + (par << DUPBITS)]; + u32* susp = levels; + + while (par < plen) + { + int pair = pairs[par]; + par += W; + + if (lane % 16 == 0) + { + u32 plvl; + if (lane == 0) plvl = buck[__byte_perm(pair, 0, 0x4510)].hash[1]; + else plvl = buck[__byte_perm(pair, 0, 0x4532)].hash[1]; + slotsmall* bucks = eq->treessmall[1][PACKER::get_bucketid(plvl, RB, SM)]; + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); + levels[lane] = bucks[slot1].hash[2]; + levels[lane + 8] = bucks[slot0].hash[2]; + } + + if (lane % 8 == 0) + CALC_LEVEL_SMALL(0, lane, lane + 4, 3); + + if (lane % 4 == 0) + CALC_LEVEL_SMALL(2, lane, lane + 2, 3); + + if (lane % 2 == 0) + CALC_LEVEL(0, lane, lane + 1, 4); + + u32 ind[16]; + + u32 f1 = levels[lane]; + const slottiny* buck_v4 = &eq->round3trees[PACKER::get_bucketid(f1, RB, SM)].treestiny[0]; + const u32 slot1_v4 = PACKER::get_slot1(f1, RB, SM); + const u32 slot0_v4 = PACKER::get_slot0(f1, slot1_v4, RB, SM); + + susp[lane] = 0xffffffff; + susp[32 + lane] = 0xffffffff; + +#define 
CHECK_DUP(a) \ + __any(atomicExch(&susp[(ind[a] & ((1 << DUPBITS) - 1))], (ind[a] >> DUPBITS)) == (ind[a] >> DUPBITS)) + + u32 f2 = buck_v4[slot1_v4].hash[1]; + const slottiny* buck_v3_1 = &eq->round2trees[PACKER::get_bucketid(f2, RB, SM)].treestiny[0]; + const u32 slot1_v3_1 = PACKER::get_slot1(f2, RB, SM); + const u32 slot0_v3_1 = PACKER::get_slot0(f2, slot1_v3_1, RB, SM); + + susp[64 + lane] = 0xffffffff; + susp[96 + lane] = 0xffffffff; + + u32 f0 = buck_v3_1[slot1_v3_1].hash[1]; + const slot* buck_v2_1 = eq->trees[0][PACKER::get_bucketid(f0, RB, SM)]; + const u32 slot1_v2_1 = PACKER::get_slot1(f0, RB, SM); + const u32 slot0_v2_1 = PACKER::get_slot0(f0, slot1_v2_1, RB, SM); + + susp[128 + lane] = 0xffffffff; + susp[160 + lane] = 0xffffffff; + + u32 f3 = buck_v2_1[slot1_v2_1].hash[6]; + const slot* buck_fin_1 = eq->round0trees[packer_default::get_bucketid(f3, 8, RB8_NSLOTS)]; + const u32 slot1_fin_1 = packer_default::get_slot1(f3, 8, RB8_NSLOTS); + const u32 slot0_fin_1 = packer_default::get_slot0(f3, slot1_fin_1, 8, RB8_NSLOTS); + + susp[192 + lane] = 0xffffffff; + susp[224 + lane] = 0xffffffff; + + ind[0] = buck_fin_1[slot1_fin_1].hash[7]; + if (CHECK_DUP(0)) continue; + ind[1] = buck_fin_1[slot0_fin_1].hash[7]; + if (CHECK_DUP(1)) continue; + + u32 f4 = buck_v2_1[slot0_v2_1].hash[6]; + const slot* buck_fin_2 = eq->round0trees[packer_default::get_bucketid(f4, 8, RB8_NSLOTS)]; + const u32 slot1_fin_2 = packer_default::get_slot1(f4, 8, RB8_NSLOTS); + const u32 slot0_fin_2 = packer_default::get_slot0(f4, slot1_fin_2, 8, RB8_NSLOTS); + + ind[2] = buck_fin_2[slot1_fin_2].hash[7]; + if (CHECK_DUP(2)) continue; + ind[3] = buck_fin_2[slot0_fin_2].hash[7]; + if (CHECK_DUP(3)) continue; + + u32 f5 = buck_v3_1[slot0_v3_1].hash[1]; + const slot* buck_v2_2 = eq->trees[0][PACKER::get_bucketid(f5, RB, SM)]; + const u32 slot1_v2_2 = PACKER::get_slot1(f5, RB, SM); + const u32 slot0_v2_2 = PACKER::get_slot0(f5, slot1_v2_2, RB, SM); + + u32 f6 = buck_v2_2[slot1_v2_2].hash[6]; + const slot* buck_fin_3 = eq->round0trees[packer_default::get_bucketid(f6, 8, RB8_NSLOTS)]; + const u32 slot1_fin_3 = packer_default::get_slot1(f6, 8, RB8_NSLOTS); + const u32 slot0_fin_3 = packer_default::get_slot0(f6, slot1_fin_3, 8, RB8_NSLOTS); + + ind[4] = buck_fin_3[slot1_fin_3].hash[7]; + if (CHECK_DUP(4)) continue; + ind[5] = buck_fin_3[slot0_fin_3].hash[7]; + if (CHECK_DUP(5)) continue; + + u32 f7 = buck_v2_2[slot0_v2_2].hash[6]; + const slot* buck_fin_4 = eq->round0trees[packer_default::get_bucketid(f7, 8, RB8_NSLOTS)]; + const u32 slot1_fin_4 = packer_default::get_slot1(f7, 8, RB8_NSLOTS); + const u32 slot0_fin_4 = packer_default::get_slot0(f7, slot1_fin_4, 8, RB8_NSLOTS); + + ind[6] = buck_fin_4[slot1_fin_4].hash[7]; + if (CHECK_DUP(6)) continue; + ind[7] = buck_fin_4[slot0_fin_4].hash[7]; + if (CHECK_DUP(7)) continue; + + u32 f8 = buck_v4[slot0_v4].hash[1]; + const slottiny* buck_v3_2 = &eq->round2trees[PACKER::get_bucketid(f8, RB, SM)].treestiny[0]; + const u32 slot1_v3_2 = PACKER::get_slot1(f8, RB, SM); + const u32 slot0_v3_2 = PACKER::get_slot0(f8, slot1_v3_2, RB, SM); + + u32 f9 = buck_v3_2[slot1_v3_2].hash[1]; + const slot* buck_v2_3 = eq->trees[0][PACKER::get_bucketid(f9, RB, SM)]; + const u32 slot1_v2_3 = PACKER::get_slot1(f9, RB, SM); + const u32 slot0_v2_3 = PACKER::get_slot0(f9, slot1_v2_3, RB, SM); + + u32 f10 = buck_v2_3[slot1_v2_3].hash[6]; + const slot* buck_fin_5 = eq->round0trees[packer_default::get_bucketid(f10, 8, RB8_NSLOTS)]; + const u32 slot1_fin_5 = packer_default::get_slot1(f10, 8, RB8_NSLOTS); + 
const u32 slot0_fin_5 = packer_default::get_slot0(f10, slot1_fin_5, 8, RB8_NSLOTS);
+
+		ind[8] = buck_fin_5[slot1_fin_5].hash[7];
+		if (CHECK_DUP(8)) continue;
+		ind[9] = buck_fin_5[slot0_fin_5].hash[7];
+		if (CHECK_DUP(9)) continue;
+
+		u32 f11 = buck_v2_3[slot0_v2_3].hash[6];
+		const slot* buck_fin_6 = eq->round0trees[packer_default::get_bucketid(f11, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_6 = packer_default::get_slot1(f11, 8, RB8_NSLOTS);
+		const u32 slot0_fin_6 = packer_default::get_slot0(f11, slot1_fin_6, 8, RB8_NSLOTS);
+
+		ind[10] = buck_fin_6[slot1_fin_6].hash[7];
+		if (CHECK_DUP(10)) continue;
+		ind[11] = buck_fin_6[slot0_fin_6].hash[7];
+		if (CHECK_DUP(11)) continue;
+
+		u32 f12 = buck_v3_2[slot0_v3_2].hash[1];
+		const slot* buck_v2_4 = eq->trees[0][PACKER::get_bucketid(f12, RB, SM)];
+		const u32 slot1_v2_4 = PACKER::get_slot1(f12, RB, SM);
+		const u32 slot0_v2_4 = PACKER::get_slot0(f12, slot1_v2_4, RB, SM);
+
+		u32 f13 = buck_v2_4[slot1_v2_4].hash[6];
+		const slot* buck_fin_7 = eq->round0trees[packer_default::get_bucketid(f13, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_7 = packer_default::get_slot1(f13, 8, RB8_NSLOTS);
+		const u32 slot0_fin_7 = packer_default::get_slot0(f13, slot1_fin_7, 8, RB8_NSLOTS);
+
+		ind[12] = buck_fin_7[slot1_fin_7].hash[7];
+		if (CHECK_DUP(12)) continue;
+		ind[13] = buck_fin_7[slot0_fin_7].hash[7];
+		if (CHECK_DUP(13)) continue;
+
+		u32 f14 = buck_v2_4[slot0_v2_4].hash[6];
+		const slot* buck_fin_8 = eq->round0trees[packer_default::get_bucketid(f14, 8, RB8_NSLOTS)];
+		const u32 slot1_fin_8 = packer_default::get_slot1(f14, 8, RB8_NSLOTS);
+		const u32 slot0_fin_8 = packer_default::get_slot0(f14, slot1_fin_8, 8, RB8_NSLOTS);
+
+		ind[14] = buck_fin_8[slot1_fin_8].hash[7];
+		if (CHECK_DUP(14)) continue;
+		ind[15] = buck_fin_8[slot0_fin_8].hash[7];
+		if (CHECK_DUP(15)) continue;
+
+		u32 soli;
+		if (lane == 0) {
+			soli = atomicAdd(&eq->edata.srealcont.nsols, 1);
+		}
+#if __CUDA_ARCH__ >= 300
+		// all threads get the value from lane 0
+		soli = __shfl2(soli, 0);
+#else
+		__syncthreads();
+		soli = eq->edata.srealcont.nsols;
+#endif
+		if (soli < MAXREALSOLS)
+		{
+			u32 pos = lane << 4;
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos ]) = *(uint4*)(&ind[ 0]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 4]) = *(uint4*)(&ind[ 4]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 8]) = *(uint4*)(&ind[ 8]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 12]) = *(uint4*)(&ind[12]);
+		}
+	}
+}
+
+//std::mutex dev_init;
+int dev_init_done[MAX_GPUS] = { 0 };
+
+__host__
+static int compu32(const void *pa, const void *pb)
+{
+	uint32_t a = *(uint32_t *)pa, b = *(uint32_t *)pb;
+	return a < b ? -1 : a == b ? 0 : +1;
+}
+
+__host__
+static bool duped(uint32_t* prf)
+{
+	uint32_t sortprf[512];
+	memcpy(sortprf, prf, sizeof(uint32_t) * 512);
+	qsort(sortprf, 512, sizeof(uint32_t), &compu32);
+	for (uint32_t i = 1; i < 512; i++) {
+		if (sortprf[i] <= sortprf[i - 1])
+			return true;
+	}
+	return false;
+}
+
+__host__
+static void sort_pair(uint32_t *a, uint32_t len)
+{
+	uint32_t *b = a + len;
+	uint32_t tmp, need_sorting = 0;
+	for (uint32_t i = 0; i < len; i++)
+	{
+		if (need_sorting || a[i] > b[i])
+		{
+			need_sorting = 1;
+			tmp = a[i];
+			a[i] = b[i];
+			b[i] = tmp;
+		}
+		else if (a[i] < b[i])
+			return;
+	}
+}
+
+__host__
+static void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen)
+{
+	uint32_t le_N = WN;
+	uint32_t le_K = WK;
+	uchar personal[] = "ZcashPoW01230123";
+	memcpy(personal + 8, &le_N, 4);
+	memcpy(personal + 12, &le_K, 4);
+	blake2b_param P[1];
+	P->digest_length = HASHOUT;
+	P->key_length = 0;
+	P->fanout = 1;
+	P->depth = 1;
+	P->leaf_length = 0;
+	P->node_offset = 0;
+	P->node_depth = 0;
+	P->inner_length = 0;
+	memset(P->reserved, 0, sizeof(P->reserved));
+	memset(P->salt, 0, sizeof(P->salt));
+	memcpy(P->personal, (const uint8_t *)personal, 16);
+	eq_blake2b_init_param(ctx, P);
+	eq_blake2b_update(ctx, (const uchar *)header, headerLen);
+	if (nonceLen) eq_blake2b_update(ctx, (const uchar *)nce,
nonceLen); +} + +#ifdef WIN32 +typedef CUresult(CUDAAPI *dec_cuDeviceGet)(CUdevice*, int); +typedef CUresult(CUDAAPI *dec_cuCtxCreate)(CUcontext*, unsigned int, CUdevice); +typedef CUresult(CUDAAPI *dec_cuCtxPushCurrent)(CUcontext); +typedef CUresult(CUDAAPI *dec_cuCtxDestroy)(CUcontext); + +dec_cuDeviceGet _cuDeviceGet = nullptr; +dec_cuCtxCreate _cuCtxCreate = nullptr; +dec_cuCtxPushCurrent _cuCtxPushCurrent = nullptr; +dec_cuCtxDestroy _cuCtxDestroy = nullptr; +#endif + +template +__host__ eq_cuda_context::eq_cuda_context(int thr_id, int dev_id) +{ + thread_id = thr_id; + device_id = dev_id; + solutions = nullptr; + equi_mem_sz = sizeof(equi); + throughput = NBLOCKS; + totalblocks = NBLOCKS/FD_THREADS; + threadsperblock = FD_THREADS; + threadsperblock_digits = THREADS; + + //dev_init.lock(); + if (!dev_init_done[device_id]) + { + // only first thread shall init device + checkCudaErrors(cudaSetDevice(device_id)); + checkCudaErrors(cudaDeviceReset()); + checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + + pctx = nullptr; + } + else + { + // create new context + CUdevice dev; + +#ifdef WIN32 + if (_cuDeviceGet == nullptr) + { + HMODULE hmod = LoadLibraryA("nvcuda.dll"); + if (hmod == NULL) + throw std::runtime_error("Failed to load nvcuda.dll"); + _cuDeviceGet = (dec_cuDeviceGet)GetProcAddress(hmod, "cuDeviceGet"); + if (_cuDeviceGet == nullptr) + throw std::runtime_error("Failed to get cuDeviceGet address"); + _cuCtxCreate = (dec_cuCtxCreate)GetProcAddress(hmod, "cuCtxCreate_v2"); + if (_cuCtxCreate == nullptr) + throw std::runtime_error("Failed to get cuCtxCreate address"); + _cuCtxPushCurrent = (dec_cuCtxPushCurrent)GetProcAddress(hmod, "cuCtxPushCurrent_v2"); + if (_cuCtxPushCurrent == nullptr) + throw std::runtime_error("Failed to get cuCtxPushCurrent address"); + _cuCtxDestroy = (dec_cuCtxDestroy)GetProcAddress(hmod, "cuCtxDestroy_v2"); + if (_cuCtxDestroy == nullptr) + throw std::runtime_error("Failed to get cuCtxDestroy address"); + } + + checkCudaDriverErrors(_cuDeviceGet(&dev, device_id)); + checkCudaDriverErrors(_cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev)); + checkCudaDriverErrors(_cuCtxPushCurrent(pctx)); +#else + checkCudaDriverErrors(cuDeviceGet(&dev, device_id)); + checkCudaDriverErrors(cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev)); + checkCudaDriverErrors(cuCtxPushCurrent(pctx)); +#endif + } + ++dev_init_done[device_id]; + //dev_init.unlock(); + + if (cudaMalloc((void**)&device_eq, equi_mem_sz) != cudaSuccess) + throw std::runtime_error("CUDA: failed to alloc memory"); + + solutions = (scontainerreal*) malloc(sizeof(scontainerreal)); + if (!solutions) + throw std::runtime_error("EOM: failed to alloc solutions memory"); +} + +template +__host__ void eq_cuda_context::solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + fn_cancel cancelf, + fn_solution solutionf, + fn_hashdone hashdonef) +{ + blake2b_state blake_ctx; + + int blocks = NBUCKETS; + + setheader(&blake_ctx, tequihash_header, tequihash_header_len, nonce, nonce_len); + + // todo: improve + // djezo solver allows last 4 bytes of nonce to be iterrated + // this can be used to create internal loop - calc initial blake hash only once, then load 8*8 bytes on device (blake state h) + // then just iterate nn++ + // less CPU load, 1 cudaMemcpy less -> faster + //u32 nn = *(u32*)&nonce[28]; + u32 nn = 0; + + checkCudaErrors(cudaMemcpy(&device_eq->blake_h, &blake_ctx.h, sizeof(u64) * 8, cudaMemcpyHostToDevice)); + + 
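+	/*
+	 * Sketch of the internal loop described in the todo above (not implemented
+	 * here; 'iterations' is hypothetical): with the blake2b midstate already
+	 * copied to the device, only the last 4 nonce bytes would need to change
+	 * per pass:
+	 *
+	 *   for (u32 nn = 0; nn < iterations && !cancelf(thread_id); nn++) {
+	 *       // re-run digit_first .. digit_last_wdc with this nn,
+	 *       // then read back and submit the solutions of this pass
+	 *   }
+	 */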
checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata))); + + digit_first <<>>(device_eq, nn); + + digit_1 <<<4096, 512 >>>(device_eq); + digit_2 <<>>(device_eq); + digit_3 <<>>(device_eq); + + if (cancelf(thread_id)) return; + + digit_4 <<>>(device_eq); + digit_5 <<>>(device_eq); + + digit_6 <<>>(device_eq); + digit_7 <<>>(device_eq); + digit_8 <<>>(device_eq); + + digit_last_wdc <<<4096, 256 / 2 >>>(device_eq); + + checkCudaErrors(cudaMemcpy(solutions, &device_eq->edata.srealcont, (MAXREALSOLS * (512 * 4)) + 4, cudaMemcpyDeviceToHost)); + + //printf("T%d nsols: %u\n", thread_id, solutions->nsols); + //if (solutions->nsols > 9) + // printf("missing sol, total: %u\n", solutions->nsols); + + for (u32 s = 0; (s < solutions->nsols) && (s < MAXREALSOLS); s++) + { + // remove dups on CPU (dup removal on GPU is not fully exact and can pass on some invalid solutions) + if (duped(solutions->sols[s])) continue; + + // perform sort of pairs + for (uint32_t level = 0; level < 9; level++) + for (uint32_t i = 0; i < (1 << 9); i += (2 << level)) + sort_pair(&solutions->sols[s][i], 1 << level); + + std::vector index_vector(PROOFSIZE); + for (u32 i = 0; i < PROOFSIZE; i++) { + index_vector[i] = solutions->sols[s][i]; + } + + solutionf(thread_id, index_vector, DIGITBITS, nullptr); + } + + // ccminer: only use hashdonef if no solutions... + if (!solutions->nsols) + hashdonef(thread_id); +} + +// destructor +template +__host__ +void eq_cuda_context::freemem() +{ + if (solutions) + free(solutions); + + if (device_eq) { + cudaFree(device_eq); + device_eq = NULL; + } + + if (pctx) { + // non primary thread, destroy context +#ifdef WIN32 + checkCudaDriverErrors(_cuCtxDestroy(pctx)); +#else + checkCudaDriverErrors(cuCtxDestroy(pctx)); +#endif + } else { + checkCudaErrors(cudaDeviceReset()); + dev_init_done[device_id] = 0; + } +} + +template +__host__ +eq_cuda_context::~eq_cuda_context() +{ + freemem(); +} + +#ifdef CONFIG_MODE_1 +template class eq_cuda_context; +#endif + +#ifdef CONFIG_MODE_2 +template class eq_cuda_context; +#endif + +#ifdef CONFIG_MODE_3 +template class eq_cuda_context; +#endif diff --git a/equi/eqcuda.hpp b/equi/eqcuda.hpp new file mode 100644 index 0000000000..fa5abb11fe --- /dev/null +++ b/equi/eqcuda.hpp @@ -0,0 +1,136 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef WIN32 +#define _SNPRINTF _snprintf +#else +#define _SNPRINTF snprintf +#endif + +#ifndef nullptr +#define nullptr NULL +#endif + +#ifdef WIN32 +#define rt_error std::runtime_error +#else +class rt_error : public std::runtime_error +{ +public: + explicit rt_error(const std::string& str) : std::runtime_error(str) {} +}; +#endif + +#define checkCudaErrors(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + char errorBuff[512]; \ + _SNPRINTF(errorBuff, sizeof(errorBuff) - 1, \ + "CUDA error '%s' in func '%s' line %d", \ + cudaGetErrorString(err), __FUNCTION__, __LINE__); \ + throw rt_error(errorBuff); \ + } \ +} while (0) + +#define checkCudaDriverErrors(call) \ +do { \ + CUresult err = call; \ + if (CUDA_SUCCESS != err) { \ + char errorBuff[512]; \ + _SNPRINTF(errorBuff, sizeof(errorBuff) - 1, \ + "CUDA error DRIVER: '%d' in func '%s' line %d", \ + err, __FUNCTION__, __LINE__); \ + throw rt_error(errorBuff); \ + } \ +} while (0) + +typedef uint64_t u64; +typedef uint32_t u32; +typedef uint16_t u16; +typedef uint8_t u8; +typedef unsigned char uchar; + +struct packer_default; +struct packer_cantor; + +#define 
MAXREALSOLS 9 + +struct scontainerreal { + u32 sols[MAXREALSOLS][512]; + u32 nsols; +}; + +#if 0 +#include +#define fn_solution std::function&, size_t, const unsigned char*)> +#define fn_hashdone std::function +#define fn_cancel std::function +#else +typedef void (*fn_solution)(int thr_id, const std::vector&, size_t, const unsigned char*); +typedef void (*fn_hashdone)(int thr_id); +typedef bool (*fn_cancel)(int thr_id); +#endif + +template struct equi; + +// --------------------------------------------------------------------------------------------------- + +struct eq_cuda_context_interface +{ + //virtual ~eq_cuda_context_interface(); + + virtual void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + fn_cancel cancelf, + fn_solution solutionf, + fn_hashdone hashdonef); + +public: + int thread_id; + int device_id; + int throughput; + int totalblocks; + int threadsperblock; + int threadsperblock_digits; + size_t equi_mem_sz; +}; + +// --------------------------------------------------------------------------------------------------- + +template +class eq_cuda_context : public eq_cuda_context_interface +{ + equi* device_eq; + scontainerreal* solutions; + CUcontext pctx; + + void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + fn_cancel cancelf, + fn_solution solutionf, + fn_hashdone hashdonef); +public: + eq_cuda_context(int thr_id, int dev_id); + void freemem(); + ~eq_cuda_context(); +}; + +// RB, SM, SSM, TPB, PACKER... but any change only here will fail.. +#define CONFIG_MODE_1 9, 1248, 12, 640, packer_cantor +//#define CONFIG_MODE_2 8, 640, 12, 512, packer_default diff --git a/equi/equi-stratum.cpp b/equi/equi-stratum.cpp new file mode 100644 index 0000000000..26433ccc3a --- /dev/null +++ b/equi/equi-stratum.cpp @@ -0,0 +1,275 @@ +/** + * Equihash specific stratum protocol + * tpruvot@github - 2017 - Part under GPLv3 Licence + */ + +#include +#include +#include +#include + +#include "equihash.h" + +extern struct stratum_ctx stratum; +extern pthread_mutex_t stratum_work_lock; + +// ZEC uses a different scale to compute diff... 
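+// In target_to_diff_equi() below the scale is diff = 0xffff0000 / m, where m is
+// read from target bytes 27..30 (the target is stored least significant byte first).
+// Worked example for the first sample below: bytes 27..30 are 00 f8 ff 07, so
+// m = 0x07fff800 and diff = 0xffff0000 / 0x07fff800 = 32.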
+// sample targets to diff (stored in the reverse byte order in work->target) +// 0007fff800000000000000000000000000000000000000000000000000000000 is stratum diff 32 +// 003fffc000000000000000000000000000000000000000000000000000000000 is stratum diff 4 +// 00ffff0000000000000000000000000000000000000000000000000000000000 is stratum diff 1 +double target_to_diff_equi(uint32_t* target) +{ + uchar* tgt = (uchar*) target; + uint64_t m = + (uint64_t)tgt[30] << 24 | + (uint64_t)tgt[29] << 16 | + (uint64_t)tgt[28] << 8 | + (uint64_t)tgt[27] << 0; + + if (!m) + return 0.; + else + return (double)0xffff0000UL/m; +} + +void diff_to_target_equi(uint32_t *target, double diff) +{ + uint64_t m; + int k; + + for (k = 6; k > 0 && diff > 1.0; k--) + diff /= 4294967296.0; + m = (uint64_t)(4294901760.0 / diff); + if (m == 0 && k == 6) + memset(target, 0xff, 32); + else { + memset(target, 0, 32); + target[k + 1] = (uint32_t)(m >> 8); + target[k + 2] = (uint32_t)(m >> 40); + //memset(target, 0xff, 6*sizeof(uint32_t)); + for (k = 0; k < 28 && ((uint8_t*)target)[k] == 0; k++) + ((uint8_t*)target)[k] = 0xff; + } +} + +/* compute nbits to get the network diff */ +double equi_network_diff(struct work *work) +{ + //KMD bits: "1e 015971", + //KMD target: "00 00 015971000000000000000000000000000000000000000000000000000000", + //KMD bits: "1d 686aaf", + //KMD target: "00 0000 686aaf0000000000000000000000000000000000000000000000000000", + uint32_t nbits = work->data[26]; + uint32_t bits = (nbits & 0xffffff); + int16_t shift = (swab32(nbits) & 0xff); + shift = (31 - shift) * 8; // 8 bits shift for 0x1e, 16 for 0x1d + uint64_t tgt64 = swab32(bits); + tgt64 = tgt64 << shift; + // applog_hex(&tgt64, 8); + uint8_t net_target[32] = { 0 }; + for (int b=0; b<8; b++) + net_target[31-b] = ((uint8_t*)&tgt64)[b]; + // applog_hex(net_target, 32); + double d = target_to_diff_equi((uint32_t*)net_target); + return d; +} + +void equi_work_set_target(struct work* work, double diff) +{ + // target is given as data by the equihash stratum + // memcpy(work->target, stratum.job.claim, 32); // claim field is only used for lbry + diff_to_target_equi(work->target, diff); + //applog(LOG_BLUE, "diff %f to target :", diff); + //applog_hex(work->target, 32); + work->targetdiff = diff; +} + +bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params) +{ + uint8_t target_bin[32], target_be[32]; + + const char *target_hex = json_string_value(json_array_get(params, 0)); + if (!target_hex || strlen(target_hex) == 0) + return false; + + hex2bin(target_bin, target_hex, 32); + memset(target_be, 0xff, 32); + int filled = 0; + for (int i=0; i<32; i++) { + if (filled == 3) break; + target_be[31-i] = target_bin[i]; + if (target_bin[i]) filled++; + } + memcpy(sctx->job.extra, target_be, 32); + + pthread_mutex_lock(&stratum_work_lock); + sctx->next_diff = target_to_diff_equi((uint32_t*) &target_be); + pthread_mutex_unlock(&stratum_work_lock); + + //applog(LOG_BLUE, "low diff %f", sctx->next_diff); + //applog_hex(target_be, 32); + + return true; +} + +bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params) +{ + const char *job_id, *version, *prevhash, *coinb1, *coinb2, *nbits, *stime; + size_t coinb1_size, coinb2_size; + bool clean, ret = false; + int ntime, i, p=0; + job_id = json_string_value(json_array_get(params, p++)); + version = json_string_value(json_array_get(params, p++)); + prevhash = json_string_value(json_array_get(params, p++)); + coinb1 = json_string_value(json_array_get(params, p++)); //merkle + coinb2 = 
json_string_value(json_array_get(params, p++)); //blank (reserved) + stime = json_string_value(json_array_get(params, p++)); + nbits = json_string_value(json_array_get(params, p++)); + clean = json_is_true(json_array_get(params, p)); p++; + + if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || + strlen(prevhash) != 64 || strlen(version) != 8 || + strlen(coinb1) != 64 || strlen(coinb2) != 64 || + strlen(nbits) != 8 || strlen(stime) != 8) { + applog(LOG_ERR, "Stratum notify: invalid parameters"); + goto out; + } + + /* store stratum server time diff */ + hex2bin((uchar *)&ntime, stime, 4); + ntime = ntime - (int) time(0); + if (ntime > sctx->srvtime_diff) { + sctx->srvtime_diff = ntime; + if (opt_protocol && ntime > 20) + applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime); + } + + pthread_mutex_lock(&stratum_work_lock); + hex2bin(sctx->job.version, version, 4); + hex2bin(sctx->job.prevhash, prevhash, 32); + + coinb1_size = strlen(coinb1) / 2; + coinb2_size = strlen(coinb2) / 2; + sctx->job.coinbase_size = coinb1_size + coinb2_size + // merkle + reserved + sctx->xnonce1_size + sctx->xnonce2_size; // extranonce and... + + sctx->job.coinbase = (uchar*) realloc(sctx->job.coinbase, sctx->job.coinbase_size); + hex2bin(sctx->job.coinbase, coinb1, coinb1_size); + hex2bin(sctx->job.coinbase + coinb1_size, coinb2, coinb2_size); + + sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + coinb2_size + sctx->xnonce1_size; + if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) + memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); + memcpy(sctx->job.coinbase + coinb1_size + coinb2_size, sctx->xnonce1, sctx->xnonce1_size); + + for (i = 0; i < sctx->job.merkle_count; i++) + free(sctx->job.merkle[i]); + free(sctx->job.merkle); + sctx->job.merkle = NULL; + sctx->job.merkle_count = 0; + + free(sctx->job.job_id); + sctx->job.job_id = strdup(job_id); + + hex2bin(sctx->job.nbits, nbits, 4); + hex2bin(sctx->job.ntime, stime, 4); + sctx->job.clean = clean; + + sctx->job.diff = sctx->next_diff; + pthread_mutex_unlock(&stratum_work_lock); + + ret = true; + +out: + return ret; +} + +// equihash stratum protocol is not standard, use client.show_message to pass block height +bool equi_stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char *s; + json_t *val; + bool ret; + + val = json_array_get(params, 0); + if (val) { + const char* data = json_string_value(val); + if (data && strlen(data)) { + char symbol[32] = { 0 }; + uint32_t height = 0; + int ss = sscanf(data, "equihash %s block %u", symbol, &height); + if (height && ss > 1) sctx->job.height = height; + if (opt_debug && ss > 1) applog(LOG_DEBUG, "%s", data); + } + } + + if (!id || json_is_null(id)) + return true; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_true()); + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +void equi_store_work_solution(struct work* work, uint32_t* hash, void* sol_data) +{ + int nonce = work->valid_nonces-1; + memcpy(work->extra, sol_data, 1347); + bn_store_hash_target_ratio(hash, work->target, work, nonce); + //work->sharediff[nonce] = target_to_diff_equi(hash); +} + +#define JSON_SUBMIT_BUF_LEN (4*1024) +// called by submit_upstream_work() +bool equi_stratum_submit(struct pool_infos *pool, struct work *work) +{ + char _ALIGN(64) s[JSON_SUBMIT_BUF_LEN]; + char _ALIGN(64) timehex[16] = { 0 }; 
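+	/*
+	 * Submitted params (built below) are roughly:
+	 *   ["user.worker", job_id + 8, ntime (byte-swapped, hex),
+	 *    nonce without the pool's xnonce1 prefix,
+	 *    solution hex starting with its fd4005 size prefix]
+	 * The solution blob is 1347 bytes: a 3-byte compact size (1344) plus the
+	 * 1344-byte Equihash solution stored in work->extra.
+	 */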
+ char *jobid, *noncestr, *solhex; + int idnonce = work->submit_nonce_id; + + // scanned nonce + work->data[EQNONCE_OFFSET] = work->nonces[idnonce]; + unsigned char * nonce = (unsigned char*) (&work->data[27]); + size_t nonce_len = 32 - stratum.xnonce1_size; + // long nonce without pool prefix (extranonce) + noncestr = bin2hex(&nonce[stratum.xnonce1_size], nonce_len); + + solhex = (char*) calloc(1, 1344*2 + 64); + if (!solhex || !noncestr) { + applog(LOG_ERR, "unable to alloc share memory"); + return false; + } + cbin2hex(solhex, (const char*) work->extra, 1347); + + jobid = work->job_id + 8; + sprintf(timehex, "%08x", swab32(work->data[25])); + + snprintf(s, sizeof(s), "{\"method\":\"mining.submit\",\"params\":" + "[\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"], \"id\":%u}", + pool->user, jobid, timehex, noncestr, solhex, + stratum.job.shares_count + 10); + + free(solhex); + free(noncestr); + + gettimeofday(&stratum.tv_submit, NULL); + + if(!stratum_send_line(&stratum, s)) { + applog(LOG_ERR, "%s stratum_send_line failed", __func__); + return false; + } + + stratum.sharediff = work->sharediff[idnonce]; + stratum.job.shares_count++; + + return true; +} diff --git a/equi/equi.cpp b/equi/equi.cpp new file mode 100644 index 0000000000..0a1d919bff --- /dev/null +++ b/equi/equi.cpp @@ -0,0 +1,171 @@ +/* + * Port to Generic C of C++ implementation of the Equihash Proof-of-Work + * algorithm from zcashd. + * + * Copyright (c) 2016 abc at openwall dot com + * Copyright (c) 2016 Jack Grigg + * Copyright (c) 2016 The Zcash developers + * Copyright (c) 2017 tpruvot + * + * Distributed under the MIT software license, see the accompanying + * file COPYING or http://www.opensource.org/licenses/mit-license.php. + */ + +#include +#include +#include +#include + +#include "equihash.h" + +//#define USE_LIBSODIUM + +#ifdef USE_LIBSODIUM +#include "sodium.h" +#define blake2b_state crypto_generichash_blake2b_state +#else +#include "blake2/blake2.h" +#define be32toh(x) swab32(x) +#define htole32(x) (x) +#define HASHOUT 50 +#endif + +#include + +static void digestInit(blake2b_state *S, const uint32_t n, const uint32_t k) +{ + uint32_t le_N = htole32(n); + uint32_t le_K = htole32(k); +#ifdef USE_LIBSODIUM + uint8_t personalization[crypto_generichash_blake2b_PERSONALBYTES] = { 0 }; + + memcpy(personalization, "ZcashPoW", 8); + memcpy(personalization + 8, &le_N, 4); + memcpy(personalization + 12, &le_K, 4); + + crypto_generichash_blake2b_init_salt_personal(S, + NULL, 0, (512 / n) * n / 8, NULL, personalization); +#else + unsigned char personal[] = "ZcashPoW01230123"; + memcpy(personal + 8, &le_N, 4); + memcpy(personal + 12, &le_K, 4); + blake2b_param P[1]; + P->digest_length = HASHOUT; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + P->leaf_length = 0; + P->node_offset = 0; + P->node_depth = 0; + P->inner_length = 0; + memset(P->reserved, 0, sizeof(P->reserved)); + memset(P->salt, 0, sizeof(P->salt)); + memcpy(P->personal, (const uint8_t *)personal, 16); + eq_blake2b_init_param(S, P); +#endif +} + +static void expandArray(const unsigned char *in, const uint32_t in_len, + unsigned char *out, const uint32_t out_len, + const uint32_t bit_len, const uint32_t byte_pad) +{ + assert(bit_len >= 8); + assert(8 * sizeof(uint32_t) >= 7 + bit_len); + + const uint32_t out_width = (bit_len + 7) / 8 + byte_pad; + assert(out_len == 8 * out_width * in_len / bit_len); + + const uint32_t bit_len_mask = ((uint32_t)1 << bit_len) - 1; + + // The acc_bits least-significant bits of acc_value represent a bit sequence + // in big-endian order. 
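+	// Worked example for the solution expansion in equi_verify() below (n=200, k=9):
+	// in_len = 1344 bytes, bit_len = 21 (collision bits + 1), byte_pad = 1, so
+	// out_width = 4 and out_len = 2048: the 512 packed 21-bit indices are unpacked
+	// into 512 big-endian 32-bit values.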
+ uint32_t acc_bits = 0; + uint32_t acc_value = 0; + uint32_t j = 0; + + for (uint32_t i = 0; i < in_len; i++) + { + acc_value = (acc_value << 8) | in[i]; + acc_bits += 8; + + // When we have bit_len or more bits in the accumulator, write the next + // output element. + if (acc_bits >= bit_len) { + acc_bits -= bit_len; + for (uint32_t x = 0; x < byte_pad; x++) { + out[j + x] = 0; + } + for (uint32_t x = byte_pad; x < out_width; x++) { + out[j + x] = ( + // Big-endian + acc_value >> (acc_bits + (8 * (out_width - x - 1))) + ) & ( + // Apply bit_len_mask across byte boundaries + (bit_len_mask >> (8 * (out_width - x - 1))) & 0xFF + ); + } + j += out_width; + } + } +} + +static void generateHash(blake2b_state *S, const uint32_t g, uint8_t *hash, const size_t hashLen) +{ + const uint32_t le_g = htole32(g); + blake2b_state digest = *S; /* copy */ +#ifdef USE_LIBSODIUM + crypto_generichash_blake2b_update(&digest, (uint8_t *)&le_g, sizeof(le_g)); + crypto_generichash_blake2b_final(&digest, hash, hashLen); +#else + eq_blake2b_update(&digest, (const uint8_t*) &le_g, sizeof(le_g)); + eq_blake2b_final(&digest, hash, (uint8_t) (hashLen & 0xFF)); +#endif +} + +static int isZero(const uint8_t *hash, size_t len) +{ + // This doesn't need to be constant time. + for (size_t i = 0; i < len; i++) { + if (hash[i] != 0) return 0; + } + return 1; +} + +// hdr -> header including nonce (140 bytes) +// soln -> equihash solution (excluding 3 bytes with size, so 1344 bytes length) +bool equi_verify(uint8_t* const hdr, uint8_t* const soln) +{ + const uint32_t n = WN; // 200 + const uint32_t k = WK; // 9 + const uint32_t collisionBitLength = n / (k + 1); + const uint32_t collisionByteLength = (collisionBitLength + 7) / 8; + const uint32_t hashLength = (k + 1) * collisionByteLength; + const uint32_t indicesPerHashOutput = 512 / n; + const uint32_t hashOutput = indicesPerHashOutput * n / 8; + const uint32_t equihashSolutionSize = (1 << k) * (n / (k + 1) + 1) / 8; + const uint32_t solnr = 1 << k; + + uint32_t indices[512] = { 0 }; + uint8_t vHash[hashLength] = { 0 }; + + blake2b_state state; + digestInit(&state, n, k); +#ifdef USE_LIBSODIUM + crypto_generichash_blake2b_update(&state, hdr, 140); +#else + eq_blake2b_update(&state, hdr, 140); +#endif + + expandArray(soln, equihashSolutionSize, (uint8_t*) &indices, sizeof(indices), collisionBitLength + 1, 1); + + for (uint32_t j = 0; j < solnr; j++) { + uint8_t tmpHash[hashOutput]; + uint8_t hash[hashLength]; + uint32_t i = be32toh(indices[j]); + generateHash(&state, i / indicesPerHashOutput, tmpHash, hashOutput); + expandArray(tmpHash + (i % indicesPerHashOutput * n / 8), n / 8, hash, hashLength, collisionBitLength, 0); + for (uint32_t k = 0; k < hashLength; k++) + vHash[k] ^= hash[k]; + } + return isZero(vHash, sizeof(vHash)); +} diff --git a/equi/equihash.cpp b/equi/equihash.cpp new file mode 100644 index 0000000000..32095464e3 --- /dev/null +++ b/equi/equihash.cpp @@ -0,0 +1,296 @@ +/** + * Equihash solver interface for ccminer (compatible with linux and windows) + * Solver taken from nheqminer, by djeZo (and NiceHash) + * tpruvot - 2017 (GPL v3) + */ +#include +#include +#include + +#include +#include + +#include + +#include "eqcuda.hpp" +#include "equihash.h" // equi_verify() + +#include + +// All solutions (BLOCK_HEADER_LEN + SOLSIZE_LEN + SOL_LEN) sha256d should be under the target +extern "C" void equi_hash(const void* input, void* output, int len) +{ + uint8_t _ALIGN(64) hash0[32], hash1[32]; + + sph_sha256_context ctx_sha256; + + sph_sha256_init(&ctx_sha256); + 
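+	// sha256d over header (140) + 3-byte solution size + solution (1344 bytes),
+	// i.e. len is 1487 when called from scanhash_equihash(). The second pass
+	// below reuses ctx_sha256, assuming sph_sha256_close() leaves the context
+	// re-initialized (sph library behaviour).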
sph_sha256(&ctx_sha256, input, len); + sph_sha256_close(&ctx_sha256, hash0); + sph_sha256(&ctx_sha256, hash0, 32); + sph_sha256_close(&ctx_sha256, hash1); + + memcpy(output, hash1, 32); +} + +// input here is 140 for the header and 1344 for the solution (equi.cpp) +extern "C" int equi_verify_sol(void * const hdr, void * const sol) +{ + bool res = equi_verify((uint8_t*) hdr, (uint8_t*) sol); + + //applog_hex((void*)hdr, 140); + //applog_hex((void*)sol, 1344); + + return res ? 1 : 0; +} + +#include + +//#define EQNONCE_OFFSET 30 /* 27:34 */ +#define NONCE_OFT EQNONCE_OFFSET + +static bool init[MAX_GPUS] = { 0 }; +static int valid_sols[MAX_GPUS] = { 0 }; +static uint8_t _ALIGN(64) data_sols[MAX_GPUS][MAXREALSOLS][1536] = { 0 }; // 140+3+1344 required +static eq_cuda_context_interface* solvers[MAX_GPUS] = { NULL }; + +static void CompressArray(const unsigned char* in, size_t in_len, + unsigned char* out, size_t out_len, size_t bit_len, size_t byte_pad) +{ + assert(bit_len >= 8); + assert(8 * sizeof(uint32_t) >= 7 + bit_len); + + size_t in_width = (bit_len + 7) / 8 + byte_pad; + assert(out_len == bit_len*in_len / (8 * in_width)); + + uint32_t bit_len_mask = (1UL << bit_len) - 1; + + // The acc_bits least-significant bits of acc_value represent a bit sequence + // in big-endian order. + size_t acc_bits = 0; + uint32_t acc_value = 0; + + size_t j = 0; + for (size_t i = 0; i < out_len; i++) { + // When we have fewer than 8 bits left in the accumulator, read the next + // input element. + if (acc_bits < 8) { + acc_value = acc_value << bit_len; + for (size_t x = byte_pad; x < in_width; x++) { + acc_value = acc_value | ( + ( + // Apply bit_len_mask across byte boundaries + in[j + x] & ((bit_len_mask >> (8 * (in_width - x - 1))) & 0xFF) + ) << (8 * (in_width - x - 1))); // Big-endian + } + j += in_width; + acc_bits += bit_len; + } + + acc_bits -= 8; + out[i] = (acc_value >> acc_bits) & 0xFF; + } +} + +#ifndef htobe32 +#define htobe32(x) swab32(x) +#endif + +static void EhIndexToArray(const u32 i, unsigned char* arr) +{ + u32 bei = htobe32(i); + memcpy(arr, &bei, sizeof(u32)); +} + +static std::vector GetMinimalFromIndices(std::vector indices, size_t cBitLen) +{ + assert(((cBitLen + 1) + 7) / 8 <= sizeof(u32)); + size_t lenIndices = indices.size()*sizeof(u32); + size_t minLen = (cBitLen + 1)*lenIndices / (8 * sizeof(u32)); + size_t bytePad = sizeof(u32) - ((cBitLen + 1) + 7) / 8; + std::vector array(lenIndices); + for (size_t i = 0; i < indices.size(); i++) { + EhIndexToArray(indices[i], array.data() + (i*sizeof(u32))); + } + std::vector ret(minLen); + CompressArray(array.data(), lenIndices, ret.data(), minLen, cBitLen + 1, bytePad); + return ret; +} + +// solver callbacks +static void cb_solution(int thr_id, const std::vector& solutions, size_t cbitlen, const unsigned char *compressed_sol) +{ + std::vector nSolution; + if (!compressed_sol) { + nSolution = GetMinimalFromIndices(solutions, cbitlen); + } else { + gpulog(LOG_INFO, thr_id, "compressed_sol"); + nSolution = std::vector(1344); + for (size_t i = 0; i < cbitlen; i++) + nSolution[i] = compressed_sol[i]; + } + int nsol = valid_sols[thr_id]; + if (nsol < 0) nsol = 0; + if(nSolution.size() == 1344) { + // todo, only store solution data here... 
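		/* Note: 0x000540fd stored little-endian yields the bytes fd 40 05 00;
		   the first three form the Bitcoin-style compact-size prefix for a
		   1344-byte solution (0xfd marker + uint16 LE 1344), and the fourth
		   byte is overwritten by the solution copied to offset 143 below. */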
+ le32enc(&data_sols[thr_id][nsol][140], 0x000540fd); // sol sz header + memcpy(&data_sols[thr_id][nsol][143], nSolution.data(), 1344); + valid_sols[thr_id] = nsol + 1; + } +} +static void cb_hashdone(int thr_id) { + if (!valid_sols[thr_id]) valid_sols[thr_id] = -1; +} +static bool cb_cancel(int thr_id) { + if (work_restart[thr_id].restart) + valid_sols[thr_id] = -1; + return work_restart[thr_id].restart; +} + +extern "C" int scanhash_equihash(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[35]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[NONCE_OFT]; + uint32_t nonce_increment = rand() & 0xFF; // nonce randomizer + struct timeval tv_start, tv_end, diff; + double secs, solps; + uint32_t soluce_count = 0; + + if (opt_benchmark) + ptarget[7] = 0xfffff; + + if (!init[thr_id]) { + try { + int mode = 1; + switch (mode) { + case 1: + solvers[thr_id] = new eq_cuda_context(thr_id, device_map[thr_id]); + break; +#ifdef CONFIG_MODE_2 + case 2: + solvers[thr_id] = new eq_cuda_context(thr_id, device_map[thr_id]); + break; +#endif +#ifdef CONFIG_MODE_3 + case 3: + solvers[thr_id] = new eq_cuda_context(thr_id, device_map[thr_id]); + break; +#endif + default: + proper_exit(EXIT_CODE_SW_INIT_ERROR); + return -1; + } + size_t memSz = solvers[thr_id]->equi_mem_sz / (1024*1024); + api_set_throughput(thr_id, (uint32_t) solvers[thr_id]->throughput); + gpulog(LOG_DEBUG, thr_id, "Allocated %u MB of context memory", (u32) memSz); + cuda_get_arch(thr_id); + init[thr_id] = true; + } catch (const std::exception & e) { + CUDA_LOG_ERROR(); + gpulog(LOG_ERR, thr_id, "init: %s", e.what()); + proper_exit(EXIT_CODE_CUDA_ERROR); + } + } + + gettimeofday(&tv_start, NULL); + memcpy(endiandata, pdata, 140); + work->valid_nonces = 0; + + do { + + try { + + valid_sols[thr_id] = 0; + solvers[thr_id]->solve( + (const char *) endiandata, (unsigned int) (140 - 32), + (const char *) &endiandata[27], (unsigned int) 32, + &cb_cancel, &cb_solution, &cb_hashdone + ); + + *hashes_done = soluce_count; + + } catch (const std::exception & e) { + gpulog(LOG_WARNING, thr_id, "solver: %s", e.what()); + free_equihash(thr_id); + sleep(1); + return -1; + } + + if (valid_sols[thr_id] > 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + uint8_t _ALIGN(64) full_data[140+3+1344] = { 0 }; + uint8_t* sol_data = &full_data[140]; + + soluce_count += valid_sols[thr_id]; + + for (int nsol=0; nsol < valid_sols[thr_id]; nsol++) + { + memcpy(full_data, endiandata, 140); + memcpy(sol_data, &data_sols[thr_id][nsol][140], 1347); + equi_hash(full_data, vhash, 140+3+1344); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + bool valid = equi_verify_sol(endiandata, &sol_data[3]); + if (valid && work->valid_nonces < MAX_NONCES) { + work->valid_nonces++; + memcpy(work->data, endiandata, 140); + equi_store_work_solution(work, vhash, sol_data); + work->nonces[work->valid_nonces-1] = endiandata[NONCE_OFT]; + pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1; + //applog_hex(vhash, 32); + //applog_hex(&work->data[27], 32); + goto out; // second solution storage not handled.. 
+ } + } + if (work->valid_nonces == MAX_NONCES) goto out; + } + if (work->valid_nonces) + goto out; + + valid_sols[thr_id] = 0; + } + + endiandata[NONCE_OFT] += nonce_increment; + + } while (!work_restart[thr_id].restart); + +out: + gettimeofday(&tv_end, NULL); + timeval_subtract(&diff, &tv_end, &tv_start); + secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec); + solps = (double)soluce_count / secs; + gpulog(LOG_DEBUG, thr_id, "%d solutions in %.2f s (%.2f Sol/s)", soluce_count, secs, solps); + + // H/s + *hashes_done = soluce_count; + + pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1; + + return work->valid_nonces; +} + +// cleanup +void free_equihash(int thr_id) +{ + if (!init[thr_id]) + return; + + // assume config 1 was used... interface destructor seems bad + eq_cuda_context* ptr = dynamic_cast*>(solvers[thr_id]); + ptr->freemem(); + ptr = NULL; + + solvers[thr_id] = NULL; + + init[thr_id] = false; +} + +// mmm... viva c++ junk +void eq_cuda_context_interface::solve(const char *tequihash_header, unsigned int tequihash_header_len, + const char* nonce, unsigned int nonce_len, + fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef) { } diff --git a/equi/equihash.h b/equi/equihash.h new file mode 100644 index 0000000000..cdf47fa566 --- /dev/null +++ b/equi/equihash.h @@ -0,0 +1,19 @@ +#ifndef EQUIHASH_H +#define EQUIHASH_H + +#include + +// miner nonce "cursor" unique for each thread +#define EQNONCE_OFFSET 30 /* 27:34 */ + +#define WK 9 +#define WN 200 +//#define CONFIG_MODE_1 9, 1248, 12, 640, packer_cantor /* eqcuda.hpp */ + +extern "C" { + void equi_hash(const void* input, void* output, int len); + int equi_verify_sol(void* const hdr, void* const soln); + bool equi_verify(uint8_t* const hdr, uint8_t* const soln); +} + +#endif diff --git a/files.txt b/files.txt deleted file mode 100644 index 5391bba406..0000000000 --- a/files.txt +++ /dev/null @@ -1,30 +0,0 @@ -blake512.cu -blake.c -combine.cu -compat.h -cpu-miner.c -cpuminer-config.h -cuda_blake512.h -cuda_combine.h -cuda_groestl512.h -cuda_hefty1.h -cuda_keccak512.h -cuda_sha256.h -elist.h -groestl512.cu -groestl.c -heavy.c -hefty1.c -hefty1.cu -hefty1.h -keccak512.cu -keccak.c -miner.h -scrypt.c -sha256.cu -sha2.c -sph_blake.h -sph_groestl.h -sph_keccak.h -sph_types.h -util.c diff --git a/fuguecoin.cpp b/fuguecoin.cpp index 9d58a55518..1e36e672f7 100644 --- a/fuguecoin.cpp +++ b/fuguecoin.cpp @@ -1,86 +1,114 @@ -#include "uint256.h" -#include "sph/sph_fugue.h" - -#include "cpuminer-config.h" -#include "miner.h" - #include #include -#include +#include -extern "C" void my_fugue256_init(void *cc); -extern "C" void my_fugue256(void *cc, const void *data, size_t len); -extern "C" void my_fugue256_close(void *cc, void *dst); -extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst); +#include "sph/sph_fugue.h" -// vorbereitete Kontexte nach den ersten 80 Bytes -sph_fugue256_context ctx_fugue_const[8]; +#include "miner.h" + +#include "cuda_fugue256.h" #define SWAP32(x) \ ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) -extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ +void fugue256_hash(unsigned char* output, const unsigned char* input, int len) +{ + sph_fugue256_context ctx; + + sph_fugue256_init(&ctx); + sph_fugue256(&ctx, input, len); + sph_fugue256_close(&ctx, (void *)output); +} + +static bool init[MAX_GPUS] = { 0 }; + +int 
scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t start_nonce = pdata[19]++; - const uint32_t Htarg = ptarget[7]; - const uint32_t throughPut = 4096 * 128; + int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce); + + if (opt_benchmark) + ptarget[7] = 0xf; // init - static bool init[8] = { false, false, false, false, false, false, false, false }; if(!init[thr_id]) { - fugue256_cpu_init(thr_id, throughPut); + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + fugue256_cpu_init(thr_id, throughput); init[thr_id] = true; } - - // Endian Drehung ist notwendig - uint32_t endiandata[20]; + + // Endian for (int kk=0; kk < 20; kk++) be32enc(&endiandata[kk], pdata[kk]); - // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget); do { // GPU - uint32_t foundNounce = 0xFFFFFFFF; - fugue256_cpu_hash(thr_id, throughPut, pdata[19], NULL, &foundNounce); + uint32_t foundNounce = UINT32_MAX; + fugue256_cpu_hash(thr_id, throughput, pdata[19], NULL, &foundNounce); - if(foundNounce < 0xffffffff) + *hashes_done = pdata[19] - start_nonce + throughput; + + if (foundNounce < UINT32_MAX && bench_algo < 0) { - uint32_t hash[8]; - endiandata[19] = SWAP32(foundNounce); + uint32_t vhash[8]; sph_fugue256_context ctx_fugue; + endiandata[19] = SWAP32(foundNounce); + sph_fugue256_init(&ctx_fugue); sph_fugue256 (&ctx_fugue, endiandata, 80); - sph_fugue256_close(&ctx_fugue, &hash); + sph_fugue256_close(&ctx_fugue, &vhash); - if (hash[7] <= Htarg && fulltest(hash, ptarget)) + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work_set_target_ratio(work, vhash); pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce; return 1; } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce); } } - if (pdata[19] + throughPut < pdata[19]) + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; - else pdata[19] += throughPut; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = pdata[19] - start_nonce; return 0; } -void fugue256_hash(unsigned char* output, const unsigned char* input, int len) +// cleanup +void free_fugue256(int thr_id) { - sph_fugue256_context ctx; - sph_fugue256_init(&ctx); - sph_fugue256(&ctx, input, len); - sph_fugue256_close(&ctx, (void *)output); + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + fugue256_cpu_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); } diff --git a/groestl_functions_quad.cu b/groestl_functions_quad.cu deleted file mode 100644 index 745a843d54..0000000000 --- a/groestl_functions_quad.cu +++ /dev/null @@ -1,315 +0,0 @@ - -__device__ __forceinline__ void G256_Mul2(uint32_t *regs) -{ - 
uint32_t tmp = regs[7]; - regs[7] = regs[6]; - regs[6] = regs[5]; - regs[5] = regs[4]; - regs[4] = regs[3] ^ tmp; - regs[3] = regs[2] ^ tmp; - regs[2] = regs[1]; - regs[1] = regs[0] ^ tmp; - regs[0] = tmp; -} - -__device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round) -{ - x0 = ~x0; - x1 = ~x1; - x2 = ~x2; - x3 = ~x3; - x4 = ~x4; - x5 = ~x5; - x6 = ~x6; - x7 = ~x7; - - if ((threadIdx.x & 0x03) == 3) { - x0 ^= ((- (round & 0x01) ) & 0xFFFF0000); - x1 ^= ((-((round & 0x02)>>1)) & 0xFFFF0000); - x2 ^= ((-((round & 0x04)>>2)) & 0xFFFF0000); - x3 ^= ((-((round & 0x08)>>3)) & 0xFFFF0000); - x4 ^= 0xAAAA0000; - x5 ^= 0xCCCC0000; - x6 ^= 0xF0F00000; - x7 ^= 0xFF000000; - } -} - -__device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round) -{ - if ((threadIdx.x & 0x03) == 0) - { - x4 ^= 0xAAAA; - x5 ^= 0xCCCC; - x6 ^= 0xF0F0; - x7 ^= 0xFF00; - - x0 ^= ((- (round & 0x01) ) & 0xFFFF); - x1 ^= ((-((round & 0x02)>>1)) & 0xFFFF); - x2 ^= ((-((round & 0x04)>>2)) & 0xFFFF); - x3 ^= ((-((round & 0x08)>>3)) & 0xFFFF); - } -} - -__device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, - uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0) -{ - uint32_t t0,t1,t2; - - t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1)); - t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0; - t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1; - - t0 = (x2^x3) & (y2^y3); - x3 = (x3 & y3) ^ t0 ^ t1; - x2 = (x2 & y2) ^ t0 ^ t2; - - t0 = (x0^x1) & (y0^y1); - x1 = (x1 & y1) ^ t0 ^ t1; - x0 = (x0 & y0) ^ t0 ^ t2; -} - -__device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0) -{ - uint32_t t0,t1,t2,t3,t4,t5,t6,a,b; - - t3 = x7; - t2 = x6; - t1 = x5; - t0 = x4; - - G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0); - - a = (x4 ^ x0); - t0 ^= a; - t2 ^= (x7 ^ x3) ^ (x5 ^ x1); - t1 ^= (x5 ^ x1) ^ a; - t3 ^= (x6 ^ x2) ^ a; - - b = t0 ^ t1; - t4 = (t2 ^ t3) & b; - a = t4 ^ t3 ^ t1; - t5 = (t3 & t1) ^ a; - t6 = (t2 & t0) ^ a ^ (t2 ^ t0); - - t4 = (t5 ^ t6) & b; - t1 = (t6 & t1) ^ t4; - t0 = (t5 & t0) ^ t4; - - t4 = (t5 ^ t6) & (t2^t3); - t3 = (t6 & t3) ^ t4; - t2 = (t5 & t2) ^ t4; - - G16mul_quad(x3, x2, x1, x0, t1, t0, t3, t2); - - G16mul_quad(x7, x6, x5, x4, t1, t0, t3, t2); -} - -__device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7) -{ - uint32_t t0, t1; - t0 = x0 ^ x1 ^ x2; - t1 = x5 ^ x6; - x2 = t0 ^ t1 ^ x7; - x6 = t0 ^ x3 ^ x6; - x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7; - x4 = x0 ^ x4 ^ t1; - x2 = t0 ^ t1 ^ x7; - x1 = x0 ^ x1 ^ t1; - x7 = x0 ^ t1 ^ x7; - x5 = x0 ^ t1; -} - -__device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7) -{ - uint32_t t0,t2,t3,t5; - - x1 ^= x4; - t0 = x1 ^ x6; - x1 ^= x5; - - t2 = x0 ^ x2; - x2 = x3 ^ x5; - t2 ^= x2 ^ x6; - x2 ^= x7; - t3 = x4 ^ x2 ^ x6; - - t5 = x0 ^ x6; - x4 = x3 ^ x7; - x0 = x3 ^ x5; - - x6 = t0; - x3 = t2; - x7 = t3; - x5 = t5; -} - -__device__ __forceinline__ void sbox_quad(uint32_t *r) -{ - transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]); - - G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]); - - 
transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]); - - r[0] = ~r[0]; - r[1] = ~r[1]; - r[5] = ~r[5]; - r[6] = ~r[6]; -} - -__device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0) -{ - uint32_t t0,t1; - - int tpos = threadIdx.x & 0x03; - int shift1 = tpos << 1; - int shift2 = shift1+1 + ((tpos == 3)<<2); - - t0 = __byte_perm(x0, 0, 0x1010)>>shift1; - t1 = __byte_perm(x0, 0, 0x3232)>>shift2; - x0 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x1, 0, 0x1010)>>shift1; - t1 = __byte_perm(x1, 0, 0x3232)>>shift2; - x1 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x2, 0, 0x1010)>>shift1; - t1 = __byte_perm(x2, 0, 0x3232)>>shift2; - x2 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x3, 0, 0x1010)>>shift1; - t1 = __byte_perm(x3, 0, 0x3232)>>shift2; - x3 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x4, 0, 0x1010)>>shift1; - t1 = __byte_perm(x4, 0, 0x3232)>>shift2; - x4 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x5, 0, 0x1010)>>shift1; - t1 = __byte_perm(x5, 0, 0x3232)>>shift2; - x5 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x6, 0, 0x1010)>>shift1; - t1 = __byte_perm(x6, 0, 0x3232)>>shift2; - x6 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x7, 0, 0x1010)>>shift1; - t1 = __byte_perm(x7, 0, 0x3232)>>shift2; - x7 = __byte_perm(t0, t1, 0x5410); -} - -__device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0) -{ - uint32_t t0,t1; - - int tpos = threadIdx.x & 0x03; - int shift1 = (1-(tpos>>1)) + ((tpos & 0x01)<<2); - int shift2 = shift1+2 + ((tpos == 1)<<2); - - t0 = __byte_perm(x0, 0, 0x1010)>>shift1; - t1 = __byte_perm(x0, 0, 0x3232)>>shift2; - x0 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x1, 0, 0x1010)>>shift1; - t1 = __byte_perm(x1, 0, 0x3232)>>shift2; - x1 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x2, 0, 0x1010)>>shift1; - t1 = __byte_perm(x2, 0, 0x3232)>>shift2; - x2 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x3, 0, 0x1010)>>shift1; - t1 = __byte_perm(x3, 0, 0x3232)>>shift2; - x3 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x4, 0, 0x1010)>>shift1; - t1 = __byte_perm(x4, 0, 0x3232)>>shift2; - x4 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x5, 0, 0x1010)>>shift1; - t1 = __byte_perm(x5, 0, 0x3232)>>shift2; - x5 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x6, 0, 0x1010)>>shift1; - t1 = __byte_perm(x6, 0, 0x3232)>>shift2; - x6 = __byte_perm(t0, t1, 0x5410); - - t0 = __byte_perm(x7, 0, 0x1010)>>shift1; - t1 = __byte_perm(x7, 0, 0x3232)>>shift2; - x7 = __byte_perm(t0, t1, 0x5410); -} - -__device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r) -{ -#define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432) -#define A(v, u) __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4) -#define S(idx, l) SHIFT64_16( A(idx, (l+1)), A(idx, l) ) - -#define DOUBLE_ODD(i, bc) ( S(i, (bc)) ^ A(i, (bc) + 1) ) -#define DOUBLE_EVEN(i, bc) ( S(i, (bc)) ^ A(i, (bc) ) ) - -#define SINGLE_ODD(i, bc) ( S(i, (bc)) ) -#define SINGLE_EVEN(i, bc) ( A(i, (bc)) ) - uint32_t b[8]; - -#pragma unroll 8 - for(int i=0;i<8;i++) - b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3); - - G256_Mul2(b); -#pragma unroll 8 - for(int i=0;i<8;i++) - b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6); - - G256_Mul2(b); -#pragma unroll 8 - for(int i=0;i<8;i++) - r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ 
SINGLE_EVEN(i, 5); - -#undef S -#undef A -#undef SHIFT64_16 -#undef t -#undef X -} - -__device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *r) -{ - for(int round=0;round<14;round++) - { - G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); - sbox_quad(r); - G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); - G256_MixFunction_quad(r); - } -} - -__device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *r) -{ - for(int round=0;round<14;round++) - { - G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); - sbox_quad(r); - G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); - G256_MixFunction_quad(r); - } -} - -__device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message) -{ -#pragma unroll 8 - for(int u=0;u<8;u++) state[u] = message[u]; - - if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000; - groestl512_perm_P_quad(state); - if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000; - groestl512_perm_Q_quad(message); -#pragma unroll 8 - for(int u=0;u<8;u++) state[u] ^= message[u]; -#pragma unroll 8 - for(int u=0;u<8;u++) message[u] = state[u]; - groestl512_perm_P_quad(message); -#pragma unroll 8 - for(int u=0;u<8;u++) state[u] ^= message[u]; -} diff --git a/groestlcoin.cpp b/groestlcoin.cpp index dc3b0feabe..453bd4c1e3 100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -1,132 +1,113 @@ -#include "uint256.h" -#include "sph/sph_groestl.h" - -#include "cpuminer-config.h" -#include "miner.h" - #include #include -#include "cuda_groestlcoin.h" +#include #include -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#include "sph/sph_groestl.h" +#include "cuda_groestlcoin.h" -void sha256func(unsigned char *hash, const unsigned char *data, int len) -{ - uint32_t S[16], T[16]; - int i, r; - - sha256_init(S); - for (r = len; r > -9; r -= 64) { - if (r < 64) - memset(T, 0, 64); - memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 
0 : r)); - if (r >= 0 && r < 64) - ((unsigned char *)T)[r] = 0x80; - for (i = 0; i < 16; i++) - T[i] = be32dec(T + i); - if (r < 56) - T[15] = 8 * len; - sha256_transform(S, T, 0); - } - /* - memcpy(S + 8, sha256d_hash1 + 8, 32); - sha256_init(T); - sha256_transform(T, S, 0); - */ - for (i = 0; i < 8; i++) - be32enc((uint32_t *)hash + i, T[i]); -} +#include "miner.h" -static void groestlhash(void *state, const void *input) +// CPU hash +void groestlhash(void *state, const void *input) { - // Tryout GPU-groestl + uint32_t _ALIGN(64) hash[16]; + sph_groestl512_context ctx_groestl; + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, input, 80); + sph_groestl512_close(&ctx_groestl, hash); - sph_groestl512_context ctx_groestl[2]; - static unsigned char pblank[1]; - uint32_t mask = 8; - uint32_t zero = 0; + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, hash, 64); + sph_groestl512_close(&ctx_groestl, hash); + memcpy(state, hash, 32); +} - //these uint512 in the c++ source of the client are backed by an array of uint32 - uint32_t hashA[16], hashB[16]; +static bool init[MAX_GPUS] = { 0 }; +int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[32]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t start_nonce = pdata[19]; + uint32_t throughput = cuda_default_throughput(thr_id, 1 << 19); // 256*256*8 + if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce); + + if (opt_benchmark) + ptarget[7] = 0x001f; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_LOG_ERROR(); + groestlcoin_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget); + + do { + memset(work->nonces, 0xff, sizeof(work->nonces)); + + *hashes_done = pdata[19] - start_nonce + throughput; + + // GPU hash + groestlcoin_cpu_hash(thr_id, throughput, pdata[19], &work->nonces[0]); + + if (work->nonces[0] < UINT32_MAX && bench_algo < 0) + { + uint32_t _ALIGN(64) vhash[8]; + endiandata[19] = swab32(work->nonces[0]); + groestlhash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + pdata[19] = work->nonces[0] + 1; // cursor + return work->valid_nonces; + } else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - start_nonce; + return 0; +} - sph_groestl512_init(&ctx_groestl[0]); - sph_groestl512 (&ctx_groestl[0], input, 80); //6 - sph_groestl512_close(&ctx_groestl[0], hashA); //7 +// cleanup +void free_groestlcoin(int thr_id) +{ + if (!init[thr_id]) + return; - sph_groestl512_init(&ctx_groestl[1]); - sph_groestl512 (&ctx_groestl[1], hashA, 64); //6 - sph_groestl512_close(&ctx_groestl[1], hashB); //7 + cudaThreadSynchronize(); - 
memcpy(state, hashB, 32); -} + groestlcoin_cpu_free(thr_id); + init[thr_id] = false; -extern bool opt_benchmark; - -extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000000ff; - - uint32_t start_nonce = pdata[19]++; - const uint32_t Htarg = ptarget[7]; - const uint32_t throughPut = 4096 * 128; - //const uint32_t throughPut = 1; - uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); - - // init - static bool init[8] = { false, false, false, false, false, false, false, false }; - if(!init[thr_id]) - { - groestlcoin_cpu_init(thr_id, throughPut); - init[thr_id] = true; - } - - // Endian Drehung ist notwendig - uint32_t endiandata[32]; - for (int kk=0; kk < 32; kk++) - be32enc(&endiandata[kk], pdata[kk]); - - // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) - groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget); - - do { - // GPU - uint32_t foundNounce = 0xFFFFFFFF; - - groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce); - - if(foundNounce < 0xffffffff) - { - uint32_t tmpHash[8]; - endiandata[19] = SWAP32(foundNounce); - groestlhash(tmpHash, endiandata); - if (tmpHash[7] <= Htarg && - fulltest(tmpHash, ptarget)) { - pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce; - free(outputHash); - return true; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); - } - - foundNounce = 0xffffffff; - } - - if (pdata[19] + throughPut < pdata[19]) - pdata[19] = max_nonce; - else pdata[19] += throughPut; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = pdata[19] - start_nonce; - free(outputHash); - return 0; + cudaDeviceSynchronize(); } - diff --git a/hashlog.cpp b/hashlog.cpp new file mode 100644 index 0000000000..ce1319119c --- /dev/null +++ b/hashlog.cpp @@ -0,0 +1,310 @@ +/** + * Hash log of submitted job nonces + * Prevent duplicate shares and remember shares diff + * + * (to be merged later with stats) + * + * tpruvot@github 2014 - 2017 + */ +#include +#include +#include + +#include "miner.h" + +#define HI_DWORD(u64) ((uint32_t) (u64 >> 32)) +#define LO_DWORD(u64) ((uint32_t) u64) +#define MK_HI64(u32) (0x100000000ULL * u32) + +/* from miner.h +struct hashlog_data { + uint8_t npool; + uint8_t pool_type; + uint8_t nonce_id; + uint8_t job_nonce_id; + uint32_t height; + double sharediff; + uint32_t njobid; + uint32_t nonce; + uint32_t scanned_from; + uint32_t scanned_to; + uint32_t last_from; + uint32_t tm_add; + uint32_t tm_upd; + uint32_t tm_sent; +}; +*/ + +static std::map tlastshares; + +#define LOG_PURGE_TIMEOUT 5*60 + +extern struct stratum_ctx stratum; + +/** + * str hex to uint32 + */ +static uint64_t hextouint(char* jobid) +{ + char *ptr; + /* dont use strtoull(), only since VS2013 */ + return (uint64_t) strtoul(jobid, &ptr, 16); +} + +/** + * @return time of a job/nonce submission (or last nonce if nonce is 0) + */ +uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce) +{ + uint32_t ret = 0; + uint64_t njobid = hextouint(jobid); + uint64_t key = (njobid << 32) + nonce; + + if (nonce == 0) { + // search last submitted nonce for job + ret = hashlog_get_last_sent(jobid); + } else if (tlastshares.find(key) != tlastshares.end()) { + hashlog_data data = tlastshares[key]; + ret = data.tm_sent; + } + return ret; +} +/** + * Store submitted 
nonces of a job + */ +void hashlog_remember_submit(struct work* work, uint32_t nonce) +{ + uint64_t njobid = hextouint(work->job_id); + uint64_t key = (njobid << 32) + nonce; + hashlog_data data; + + memset(&data, 0, sizeof(data)); + data.nonce_id = work->submit_nonce_id; + data.scanned_from = work->scanned_from; + data.scanned_to = work->scanned_to; + data.sharediff = work->sharediff[data.nonce_id]; + data.height = work->height; + data.njobid = (uint32_t) njobid; + data.tm_add = data.tm_upd = data.tm_sent = (uint32_t) time(NULL); + data.npool = (uint8_t) cur_pooln; + data.pool_type = pools[cur_pooln].type; + data.job_nonce_id = (uint8_t) stratum.job.shares_count; + tlastshares[key] = data; +} + +/** + * Update job scanned range + */ +void hashlog_remember_scan_range(struct work* work) +{ + uint64_t njobid = hextouint(work->job_id); + uint64_t key = (njobid << 32); + uint64_t range = hashlog_get_scan_range(work->job_id); + hashlog_data data; + + // global scan range of a job + data = tlastshares[key]; + if (range == 0) { + memset(&data, 0, sizeof(data)); + data.njobid = (uint32_t) njobid; + } else { + // get min and max from all sent records + data.scanned_from = LO_DWORD(range); + data.scanned_to = HI_DWORD(range); + } + + if (data.tm_add == 0) + data.tm_add = (uint32_t) time(NULL); + + data.last_from = work->scanned_from; + + if (work->scanned_from < work->scanned_to) { + if (data.scanned_to == 0 || work->scanned_from == data.scanned_to + 1) + data.scanned_to = work->scanned_to; + if (data.scanned_from == 0) + data.scanned_from = work->scanned_from ? work->scanned_from : 1; // min 1 + else if (work->scanned_from < data.scanned_from || work->scanned_to == (data.scanned_from - 1)) + data.scanned_from = work->scanned_from; + } + + data.tm_upd = (uint32_t) time(NULL); + + tlastshares[key] = data; +/* applog(LOG_BLUE, "job %s range : %x %x -> %x %x", jobid, + scanned_from, scanned_to, data.scanned_from, data.scanned_to); */ +} + +/** + * Returns the range of a job + * @return uint64_t to|from + */ +uint64_t hashlog_get_scan_range(char* jobid) +{ + uint64_t ret = 0; + uint64_t njobid = hextouint(jobid); + uint64_t keypfx = (njobid << 32); + uint64_t keymsk = (0xffffffffULL << 32); + hashlog_data data; + + data.scanned_from = 0; + data.scanned_to = 0; + std::map::iterator i = tlastshares.begin(); + while (i != tlastshares.end()) { + if ((keymsk & i->first) == keypfx && i->second.scanned_to > ret) { + if (i->second.scanned_to > data.scanned_to) + data.scanned_to = i->second.scanned_to; + if (i->second.scanned_from < data.scanned_from || data.scanned_from == 0) + data.scanned_from = i->second.scanned_from; + } + i++; + } + ret = data.scanned_from; + ret += MK_HI64(data.scanned_to); + return ret; +} + +/** + * Search last submitted nonce for a job + * @return max nonce + */ +uint32_t hashlog_get_last_sent(char* jobid) +{ + uint32_t nonce = 0; + uint64_t njobid = jobid ? hextouint(jobid) : UINT32_MAX; + uint64_t keypfx = (njobid << 32); + std::map::reverse_iterator i = tlastshares.rbegin(); + while (i != tlastshares.rend()) { + if ((keypfx & i->first) == keypfx && i->second.tm_sent) { + nonce = LO_DWORD(i->first); + break; + } + i++; + } + return nonce; +} + +/** + * To display correcly second nonce(s) share diff (on pool accept) + */ +double hashlog_get_sharediff(char* jobid, int job_nonceid, double defvalue) +{ + double diff = defvalue; + const uint64_t njobid = jobid ? 
hextouint(jobid) : UINT32_MAX; + const uint64_t keypfx = (njobid << 32); + const uint64_t keymsk = (0xffffffffULL << 32); + + std::map::reverse_iterator it = tlastshares.rbegin(); + while (it != tlastshares.rend()) { + if ((keymsk & it->first) == keypfx) { + if ((int) it->second.job_nonce_id == job_nonceid && it->second.tm_sent) { + diff = it->second.sharediff; + // applog(LOG_BLUE, "sharediff nonce %x:%d (%d) match %g", + // njobid, (int) it->second.nonce_id, job_nonceid, diff); + break; + } + } + ++it; + } + return diff; +} + +/** + * Export data for api calls + */ +int hashlog_get_history(struct hashlog_data *data, int max_records) +{ + int records = 0; + + std::map::reverse_iterator it = tlastshares.rbegin(); + while (it != tlastshares.rend() && records < max_records) { + memcpy(&data[records], &(it->second), sizeof(struct hashlog_data)); + data[records].nonce = LO_DWORD(it->first); + data[records].njobid = (uint32_t) HI_DWORD(it->first); + records++; + ++it; + } + return records; +} + +/** + * Remove entries of a job... + */ +void hashlog_purge_job(char* jobid) +{ + int deleted = 0; + uint64_t njobid = hextouint(jobid); + uint64_t keypfx = (njobid << 32); + uint32_t sz = (uint32_t) tlastshares.size(); + std::map::iterator i = tlastshares.begin(); + while (i != tlastshares.end()) { + if ((keypfx & i->first) == keypfx) { + deleted++; + tlastshares.erase(i++); + } + else ++i; + } + if (opt_debug && deleted) { + applog(LOG_DEBUG, "hashlog: purge job %s, del %d/%d", jobid, deleted, sz); + } +} + +/** + * Remove old entries to reduce memory usage + */ +void hashlog_purge_old(void) +{ + int deleted = 0; + uint32_t now = (uint32_t) time(NULL); + uint32_t sz = (uint32_t) tlastshares.size(); + std::map::iterator i = tlastshares.begin(); + while (i != tlastshares.end()) { + if ((now - i->second.tm_sent) > LOG_PURGE_TIMEOUT) { + deleted++; + tlastshares.erase(i++); + } + else ++i; + } + if (opt_debug && deleted) { + applog(LOG_DEBUG, "hashlog: %d/%d purged", deleted, sz); + } +} + +/** + * Reset the submitted nonces cache + */ +void hashlog_purge_all(void) +{ + tlastshares.clear(); +} + +/** + * API meminfo + */ +void hashlog_getmeminfo(uint64_t *mem, uint32_t *records) +{ + (*records) = (uint32_t) tlastshares.size(); + (*mem) = (*records) * sizeof(hashlog_data); +} + +/** + * Used to debug ranges... 
+ */ +void hashlog_dump_job(char* jobid) +{ + if (opt_debug) { + uint64_t njobid = hextouint(jobid); + uint64_t keypfx = (njobid << 32); + // uint32_t sz = tlastshares.size(); + std::map::iterator i = tlastshares.begin(); + while (i != tlastshares.end()) { + if ((keypfx & i->first) == keypfx) { + if (i->first != keypfx) + applog(LOG_DEBUG, CL_YLW "job %s, found %08x ", jobid, LO_DWORD(i->first)); + else + applog(LOG_DEBUG, CL_YLW "job %s(%u) range done: %08x-%08x", jobid, + i->second.height, i->second.scanned_from, i->second.scanned_to); + } + i++; + } + } +} diff --git a/heavy/bastion.cu b/heavy/bastion.cu new file mode 100644 index 0000000000..ec9ba8d755 --- /dev/null +++ b/heavy/bastion.cu @@ -0,0 +1,331 @@ +/** + * bastion cuda implemention tpruvot@github 2017 + */ + +#include +#include +//#include +#include +#include +#include + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t* d_hash_br1[MAX_GPUS]; +static uint32_t* d_hash_br2[MAX_GPUS]; + +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x11_luffa512_cpu_init(int thr_id, uint32_t threads); +extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void bastion_init(const int thr_id, const uint32_t threads); +extern void bastion_free(const int thr_id); + +extern uint32_t bastion_filter2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_hash1, uint32_t* d_hash2); +extern void bastion_merge2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_hash1, uint32_t* d_hash2); + +extern void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce); +extern void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len); +extern void hefty_cpu_init(int thr_id, uint32_t threads); +extern void hefty_cpu_free(int thr_id); +extern void hefty_copy_hashes(int thr_id, uint32_t threads, uint32_t* d_outputhash); + +#define TRACE(algo) {} + +static bool init[MAX_GPUS] = { 0 }; + +int scanhash_bastion(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const 
uint32_t first_nonce = pdata[19]; + // CUDA will process thousands of threads. + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash_br1[thr_id], (size_t) 64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash_br2[thr_id], (size_t) 64 * throughput)); + + bastion_init(thr_id, throughput); + hefty_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); + + quark_skein512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x11_echo512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + hefty_cpu_setBlock(thr_id, throughput, endiandata, 80); + + cuda_check_cpu_setTarget(ptarget); + + do { + uint32_t branchNonces; + int order = 0; + + // hefty + hefty_cpu_hash(thr_id, throughput, pdata[19]); + hefty_copy_hashes(thr_id, throughput, d_hash[thr_id]); + TRACE("hefty :"); + + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + + // fugue or skein + branchNonces = bastion_filter2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + x13_fugue512_cpu_hash_64(thr_id, branchNonces, pdata[19], NULL, d_hash_br1[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput-branchNonces, pdata[19], NULL, d_hash_br2[thr_id], order++); + bastion_merge2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + TRACE("perm1 :"); + + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("whirl :"); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + // echo or luffa + branchNonces = bastion_filter2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + x11_echo512_cpu_hash_64(thr_id, branchNonces, pdata[19], NULL, d_hash_br1[thr_id], order++); + x11_luffa512_cpu_hash_64(thr_id, throughput-branchNonces, pdata[19], NULL, d_hash_br2[thr_id], order++); + bastion_merge2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + TRACE("perm2 :"); + + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + // shabal or whirlpool + branchNonces = bastion_filter2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + x14_shabal512_cpu_hash_64(thr_id, branchNonces, pdata[19], NULL, d_hash_br1[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput-branchNonces, pdata[19], NULL, d_hash_br2[thr_id], order++); + bastion_merge2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + TRACE("perm3 
:"); + + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + // hamsi or luffa + branchNonces = bastion_filter2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, branchNonces, pdata[19], NULL, d_hash_br1[thr_id], order++); + x11_luffa512_cpu_hash_64(thr_id, throughput-branchNonces, pdata[19], NULL, d_hash_br2[thr_id], order++); + bastion_merge2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + TRACE("perm4 :"); + + *hashes_done = pdata[19] - first_nonce + throughput; + + CUDA_LOG_ERROR(); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + endiandata[19] = work->nonces[0]; + bastionhash(vhash, (uchar*) endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[0] = swab32(work->nonces[0]); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + endiandata[19] = work->nonces[1]; + bastionhash(vhash, (uchar*) endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + work->nonces[1] = swab32(work->nonces[1]); + pdata[19] = max(work->nonces[0], work->nonces[1])+1; + } else { + pdata[19] = work->nonces[0]+1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) gpulog(LOG_WARNING, thr_id, + "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = swab32(work->nonces[0]) + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + CUDA_LOG_ERROR(); + + return 0; +} + +// cleanup +extern "C" void free_bastion(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_hash_br1[thr_id]); + cudaFree(d_hash_br2[thr_id]); + + hefty_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); + + bastion_free(thr_id); + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} + +#undef SPH_C32 +#undef SPH_T32 +#undef SPH_C64 +#undef SPH_T64 +extern "C" { +#include "hefty1.h" +#include "sph/sph_luffa.h" +#include "sph/sph_fugue.h" +#include "sph/sph_skein.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_shabal.h" +#include "sph/sph_echo.h" +#include "sph/sph_hamsi.h" +} + +__host__ +void bastionhash(void* output, const uchar* input) +{ + unsigned char _ALIGN(128) hash[64] = { 0 }; + + sph_echo512_context ctx_echo; + sph_luffa512_context ctx_luffa; + sph_fugue512_context ctx_fugue; + sph_whirlpool_context ctx_whirlpool; + sph_shabal512_context ctx_shabal; + sph_skein512_context ctx_skein; + sph_hamsi512_context ctx_hamsi; + + HEFTY1(input, 80, hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, hash, 64); + sph_luffa512_close(&ctx_luffa, hash); + + if (hash[0] & 0x8) + { + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, hash, 64); + sph_fugue512_close(&ctx_fugue, hash); + } else { + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hash); + } + + sph_whirlpool_init(&ctx_whirlpool); + 
sph_whirlpool(&ctx_whirlpool, hash, 64); + sph_whirlpool_close(&ctx_whirlpool, hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, hash, 64); + sph_fugue512_close(&ctx_fugue, hash); + + if (hash[0] & 0x8) + { + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, hash, 64); + sph_echo512_close(&ctx_echo, hash); + } else { + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, hash, 64); + sph_luffa512_close(&ctx_luffa, hash); + } + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, hash, 64); + sph_shabal512_close(&ctx_shabal, hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hash); + + if (hash[0] & 0x8) + { + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, hash, 64); + sph_shabal512_close(&ctx_shabal, hash); + } else { + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, hash, 64); + sph_whirlpool_close(&ctx_whirlpool, hash); + } + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, hash, 64); + sph_shabal512_close(&ctx_shabal, hash); + + if (hash[0] & 0x8) + { + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, hash, 64); + sph_hamsi512_close(&ctx_hamsi, hash); + } else { + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, hash, 64); + sph_luffa512_close(&ctx_luffa, hash); + } + + memcpy(output, hash, 32); +} diff --git a/heavy/cuda_bastion.cu b/heavy/cuda_bastion.cu new file mode 100644 index 0000000000..44e301a7ce --- /dev/null +++ b/heavy/cuda_bastion.cu @@ -0,0 +1,103 @@ +#include + +#include "cuda_helper.h" + +static uint32_t *d_offsets1[MAX_GPUS] = { 0 }; +static uint32_t *d_offsets2[MAX_GPUS] = { 0 }; + +static uint32_t *d_brcount1[MAX_GPUS] = { 0 }; +static uint32_t *d_brcount2[MAX_GPUS] = { 0 }; + +__global__ __launch_bounds__(128, 6) +void bastion_filter2_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_hash1, uint32_t* d_hash2, uint32_t* d_br_ofts1, uint32_t* d_count1, uint32_t* d_br_ofts2, uint32_t* d_count2) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_hash[offset]); + uint4 *pdst; + d_br_ofts1[thread] = 0; + d_br_ofts2[thread] = 0; + if (((uint8_t*)psrc)[0] & 0x8) { + // uint4 = 4x uint32_t = 16 bytes + uint32_t oft = atomicAdd(d_count1, 1U) * 16U; + d_br_ofts1[thread] = oft + 16U; + pdst = (uint4*) (&d_hash1[oft]); + } else { + uint32_t oft = atomicAdd(d_count2, 1U) * 16U; + d_br_ofts2[thread] = oft + 16U; + pdst = (uint4*) (&d_hash2[oft]); + } + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__global__ __launch_bounds__(128, 6) +void bastion_merge2_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_hash1, uint32_t* d_hash2, uint32_t* d_br_ofts1, uint32_t* d_br_ofts2) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; + uint4 *pdst = (uint4*) (&d_hash[offset]); + uint4 *psrc; + if (d_br_ofts1[thread]) { + const uint32_t oft = d_br_ofts1[thread] - 16U; + psrc = (uint4*) (&d_hash1[oft]); + } else { + const uint32_t oft = d_br_ofts2[thread] - 16U; + psrc = (uint4*) (&d_hash2[oft]); + } + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + + +__host__ +void bastion_init(const int thr_id, const uint32_t threads) +{ + CUDA_SAFE_CALL(cudaMalloc(&d_offsets1[thr_id], sizeof(uint32_t) * 
threads)); + CUDA_SAFE_CALL(cudaMalloc(&d_offsets2[thr_id], sizeof(uint32_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&d_brcount1[thr_id], sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_brcount2[thr_id], sizeof(uint32_t))); +} + +__host__ +void bastion_free(const int thr_id) +{ + cudaFree(d_offsets1[thr_id]); + cudaFree(d_offsets2[thr_id]); + cudaFree(d_brcount1[thr_id]); + cudaFree(d_brcount2[thr_id]); +} + +__host__ +uint32_t bastion_filter2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_hash1, uint32_t* d_hash2) +{ + uint32_t num = 0; + cudaMemset(d_brcount1[thr_id], 0, 4); + cudaMemset(d_brcount2[thr_id], 0, 4); + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + bastion_filter2_gpu <<>> (threads, inpHashes, d_hash1, d_hash2, d_offsets1[thr_id], d_brcount1[thr_id], d_offsets2[thr_id], d_brcount2[thr_id]); + cudaMemcpy(&num, d_brcount1[thr_id], 4, cudaMemcpyDeviceToHost); + return num; +} + +__host__ +void bastion_merge2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_hash1, uint32_t* d_hash2) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // put back branch hashes to the common buffer d_hash + bastion_merge2_gpu <<>> (threads, outpHashes, d_hash1, d_hash2, d_offsets1[thr_id], d_offsets2[thr_id]); +} diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu index dfa4240547..f3d22fa166 100644 --- a/heavy/cuda_blake512.cu +++ b/heavy/cuda_blake512.cu @@ -1,26 +1,18 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#include "cuda_helper.h" -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *d_heftyHashes[8]; -extern uint32_t *d_nonceVector[8]; +// globaler Speicher für alle HeftyHashes aller Threads +extern uint32_t *heavy_heftyHashes[MAX_GPUS]; +extern uint32_t *heavy_nonceVector[MAX_GPUS]; -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash5output[8]; +// globaler Speicher für unsere Ergebnisse +uint32_t *d_hash5output[MAX_GPUS]; // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU __constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding) -#include "cuda_helper.h" // ---------------------------- BEGIN CUDA blake512 functions ------------------------------------ @@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] = { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } }; -// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam. -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) - -// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam. 
-#define SWAP64(x) \ - ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ - (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ - (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ - (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ - (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ - (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ - (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ - (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) +/* in cuda_helper */ +#define SWAP32(x) cuda_swab32(x) +#define SWAP64(x) cuda_swab64(x) __constant__ uint64_t c_SecondRound[15]; @@ -73,20 +53,20 @@ __constant__ uint64_t c_u512[16]; const uint64_t host_u512[16] = { - 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, - 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, - 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, - 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL }; #define G(a,b,c,d,e) \ v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ - v[d] = ROTR64( v[d] ^ v[a],32); \ + v[d] = SWAPDWORDS( v[d] ^ v[a]); \ v[c] += v[d]; \ v[b] = ROTR64( v[b] ^ v[c],25); \ v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ @@ -138,12 +118,12 @@ template __device__ void blake512_compress( uint64_t *h, const u for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; } -template __global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +template __global__ void blake512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - // bestimme den aktuellen Zähler + // bestimme den aktuellen Zähler //uint32_t nounce = startNounce + thread; uint32_t nounce = nonceVector[thread]; @@ -161,28 +141,28 @@ template __global__ void blake512_gpu_hash(int threads, uint32_t h[6] = 0x1f83d9abfb41bd6bULL; h[7] = 0x5be0cd19137e2179ULL; - // 128 Byte für die Message + // 128 Byte für die Message uint64_t buf[16]; - // Message für die erste Runde in Register holen + // Message für die erste Runde in Register holen #pragma unroll 16 for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i]; // die Nounce durch die thread-spezifische ersetzen - buf[9] = REPLACE_HIWORD(buf[9], nounce); + buf[9] = REPLACE_HIDWORD(buf[9], nounce); uint32_t *hefty = heftyHashes + 8 * hashPosition; if (BLOCKSIZE == 84) { // den thread-spezifischen Hefty1 hash einsetzen - // aufwändig, weil das nicht mit uint64_t Wörtern aligned ist. - buf[10] = REPLACE_HIWORD(buf[10], hefty[0]); - buf[11] = REPLACE_LOWORD(buf[11], hefty[1]); - buf[11] = REPLACE_HIWORD(buf[11], hefty[2]); - buf[12] = REPLACE_LOWORD(buf[12], hefty[3]); - buf[12] = REPLACE_HIWORD(buf[12], hefty[4]); - buf[13] = REPLACE_LOWORD(buf[13], hefty[5]); - buf[13] = REPLACE_HIWORD(buf[13], hefty[6]); - buf[14] = REPLACE_LOWORD(buf[14], hefty[7]); + // aufwändig, weil das nicht mit uint64_t Wörtern aligned ist. 
+ buf[10] = REPLACE_HIDWORD(buf[10], hefty[0]); + buf[11] = REPLACE_LODWORD(buf[11], hefty[1]); + buf[11] = REPLACE_HIDWORD(buf[11], hefty[2]); + buf[12] = REPLACE_LODWORD(buf[12], hefty[3]); + buf[12] = REPLACE_HIDWORD(buf[12], hefty[4]); + buf[13] = REPLACE_LODWORD(buf[13], hefty[5]); + buf[13] = REPLACE_HIDWORD(buf[13], hefty[6]); + buf[14] = REPLACE_LODWORD(buf[14], hefty[7]); } else if (BLOCKSIZE == 80) { buf[10] = MAKE_ULONGLONG(hefty[0], hefty[1]); @@ -193,14 +173,14 @@ template __global__ void blake512_gpu_hash(int threads, uint32_t // erste Runde blake512_compress( h, buf, 0, c_sigma, c_u512 ); - - + + // zweite Runde #pragma unroll 15 for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i]; buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen blake512_compress( h, buf, 1, c_sigma, c_u512 ); - + // Hash rauslassen uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition; #pragma unroll 8 @@ -211,8 +191,9 @@ template __global__ void blake512_gpu_hash(int threads, uint32_t // ---------------------------- END CUDA blake512 functions ------------------------------------ -// Setup-Funktionen -__host__ void blake512_cpu_init(int thr_id, int threads) +// Setup Function +__host__ +void blake512_cpu_init(int thr_id, uint32_t threads) { // Kopiere die Hash-Tabellen in den GPU-Speicher cudaMemcpyToSymbol( c_sigma, @@ -230,26 +211,33 @@ __host__ void blake512_cpu_init(int thr_id, int threads) sizeof(host_SecondRound), 0, cudaMemcpyHostToDevice); - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads); + // Speicher für alle Ergebnisse belegen + CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], (size_t) 64 * threads)); +} + +__host__ +void blake512_cpu_free(int thr_id) +{ + cudaFree(d_hash5output[thr_id]); } static int BLOCKSIZE = 84; -__host__ void blake512_cpu_setBlock(void *pdata, int len) +__host__ +void blake512_cpu_setBlock(void *pdata, int len) // data muss 84-Byte haben! 
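// (i.e. pdata carries the 80- or 84-byte header; the 32-byte Hefty1 hash area
//  after it is zero-filled here and patched in per thread by the GPU kernel)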
// heftyHash hat 32-Byte { unsigned char PaddedMessage[128]; if (len == 84) { - // Message mit Padding für erste Runde bereitstellen + // Message mit Padding für erste Runde bereitstellen memcpy(PaddedMessage, pdata, 84); - memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen + memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen memset(PaddedMessage+116, 0, 12); PaddedMessage[116] = 0x80; } else if (len == 80) { memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen + memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen memset(PaddedMessage+112, 0, 16); PaddedMessage[112] = 0x80; } @@ -258,19 +246,19 @@ __host__ void blake512_cpu_setBlock(void *pdata, int len) BLOCKSIZE = len; } -__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce) +__host__ void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce) { - const int threadsperblock = 256; + const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs + // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; if (BLOCKSIZE == 80) - blake512_gpu_hash<80><<>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + blake512_gpu_hash<80><<>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); else if (BLOCKSIZE == 84) - blake512_gpu_hash<84><<>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + blake512_gpu_hash<84><<>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); } diff --git a/heavy/cuda_blake512.h b/heavy/cuda_blake512.h deleted file mode 100644 index 7e24973348..0000000000 --- a/heavy/cuda_blake512.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _CUDA_BLAKE512_H -#define _CUDA_BLAKE512_H - -void blake512_cpu_init(int thr_id, int threads); -void blake512_cpu_setBlock(void *pdata, int len); -void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce); -#endif diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu index e2a8b721a2..64a9561771 100644 --- a/heavy/cuda_combine.cu +++ b/heavy/cuda_combine.cu @@ -1,21 +1,19 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" +#include -// Folgende Definitionen später durch header ersetzen -typedef unsigned int uint32_t; +#include "cuda_helper.h" -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hashoutput[8]; +// globaler Speicher für unsere Ergebnisse +static uint32_t *d_hashoutput[MAX_GPUS]; +extern uint32_t *d_hash2output[MAX_GPUS]; +extern uint32_t *d_hash3output[MAX_GPUS]; +extern uint32_t *d_hash4output[MAX_GPUS]; +extern uint32_t *d_hash5output[MAX_GPUS]; -extern uint32_t *d_hash2output[8]; -extern uint32_t *d_hash3output[8]; -extern uint32_t *d_hash4output[8]; -extern uint32_t *d_hash5output[8]; -extern uint32_t *d_nonceVector[8]; +extern uint32_t *heavy_nonceVector[MAX_GPUS]; /* Combines top 64-bits from each hash into a single hash */ -static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) +__device__ +static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) { uint32_t lout[8]; // Combining in Registern machen @@ -103,9 +101,10 @@ static void __device__ 
combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t * out[i] = lout[i]; } -__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector) +__global__ +void combine_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint32_t nounce = nonceVector[thread]; @@ -121,28 +120,32 @@ __global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *ou } } -// Setup-Funktionen -__host__ void combine_cpu_init(int thr_id, int threads) +__host__ +void combine_cpu_init(int thr_id, uint32_t threads) { - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); + // Speicher für alle Ergebnisse belegen + CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], (size_t) 32 * threads)); } -void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash) +__host__ +void combine_cpu_free(int thr_id) +{ + cudaFree(d_hashoutput[thr_id]); +} + +__host__ +void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash) { // diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten - const int threadsperblock = 128; + const uint32_t threadsperblock = 128; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - combine_gpu_hash<<>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]); + combine_gpu_hash <<>> (threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], heavy_nonceVector[thr_id]); - // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden - cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost); + // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden + CUDA_SAFE_CALL(cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost)); } diff --git a/heavy/cuda_combine.h b/heavy/cuda_combine.h deleted file mode 100644 index 5bb5832d19..0000000000 --- a/heavy/cuda_combine.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _CUDA_COMBINE_H -#define _CUDA_COMBINE_H - -void combine_cpu_init(int thr_id, int threads); -void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash); - -#endif diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu index 5b1b26745a..ef4c2c1404 100644 --- a/heavy/cuda_groestl512.cu +++ b/heavy/cuda_groestl512.cu @@ -1,27 +1,22 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#define SPH_C32(x) ((uint32_t)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#include "cuda_helper.h" -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t 
*d_heftyHashes[8]; -extern uint32_t *d_nonceVector[8]; +// globaler Speicher für alle HeftyHashes aller Threads +extern uint32_t *heavy_heftyHashes[MAX_GPUS]; +extern uint32_t *heavy_nonceVector[MAX_GPUS]; +static unsigned int *d_textures[MAX_GPUS][8]; -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash4output[8]; +// globaler Speicher für unsere Ergebnisse +uint32_t *d_hash4output[MAX_GPUS]; __constant__ uint32_t groestl_gpu_state[32]; __constant__ uint32_t groestl_gpu_msg[32]; -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - #define PC32up(j, r) ((uint32_t)((j) + (r))) #define PC32dn(j, r) 0 #define QC32up(j, r) 0xFFFFFFFF @@ -32,7 +27,6 @@ __constant__ uint32_t groestl_gpu_msg[32]; #define B32_2(x) (((x) >> 16) & 0xFF) #define B32_3(x) ((x) >> 24) -#define SPH_C32(x) ((uint32_t)(x ## U)) #define C32e(x) ((SPH_C32(x) >> 24) \ | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ @@ -610,22 +604,22 @@ __device__ void groestl512_perm_P(uint32_t *a) #pragma unroll 16 for(int k=0;k<32;k+=2) { - t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^ - T1up( B32_1(a[(k + 2) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 4) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 6) & 0x1f]) ) ^ - T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^ + t[k + 0] = T0up( B32_0(a[k & 0x1f]) ) ^ + T1up( B32_1(a[(k + 2) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 4) & 0x1f]) ) ^ + T3up( B32_3(a[(k + 6) & 0x1f]) ) ^ + T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^ T3dn( B32_3(a[(k + 23) & 0x1f]) ); - t[k + 1] = T0dn( B32_0(a[k & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^ - T0up( B32_0(a[(k + 9) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 11) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 13) & 0x1f]) ) ^ + t[k + 1] = T0dn( B32_0(a[k & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^ + T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^ + T0up( B32_0(a[(k + 9) & 0x1f]) ) ^ + T1up( B32_1(a[(k + 11) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 13) & 0x1f]) ) ^ T3up( B32_3(a[(k + 23) & 0x1f]) ); } #pragma unroll 32 @@ -652,22 +646,22 @@ __device__ void groestl512_perm_Q(uint32_t *a) #pragma unroll 16 for(int k=0;k<32;k+=2) { - t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 6) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 10) & 0x1f]) ) ^ - T3up( B32_3(a[(k + 22) & 0x1f]) ) ^ - T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^ + t[k + 0] = T0up( B32_0(a[(k + 2) & 0x1f]) ) ^ + T1up( B32_1(a[(k + 6) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 10) & 0x1f]) ) ^ + T3up( B32_3(a[(k + 22) & 0x1f]) ) ^ + T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^ T3dn( B32_3(a[(k + 13) & 0x1f]) ); - t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^ - T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^ - T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^ - T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^ - T0up( B32_0(a[(k + 1) & 0x1f]) ) ^ - T1up( B32_1(a[(k + 5) & 0x1f]) ) ^ - T2up( B32_2(a[(k + 9) & 0x1f]) ) ^ + t[k + 1] = T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^ + T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^ + T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^ + T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^ + T0up( B32_0(a[(k + 1) & 0x1f]) ) ^ + T1up( B32_1(a[(k + 5) & 0x1f]) ) ^ + T2up( B32_2(a[(k + 9) & 0x1f]) ) ^ T3up( B32_3(a[(k + 13) & 0x1f]) ); } #pragma unroll 32 @@ -676,15 +670,15 @@ __device__ void 
groestl512_perm_Q(uint32_t *a) } } -template __global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +template __global__ void groestl512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint32_t message[32]; uint32_t state[32]; - // lese message ein & verknüpfe diese mit dem hash1 von hefty1 + // lese message ein & verknüpfe diese mit dem hash1 von hefty1 // lese den state ein #pragma unroll 32 @@ -707,7 +701,7 @@ template __global__ void groestl512_gpu_hash(int threads, uint32 #pragma unroll 8 for (int k=0; k<8; ++k) message[BLOCKSIZE/4+k] = heftyHash[k]; - + uint32_t g[32]; #pragma unroll 32 for(int u=0;u<32;u++) @@ -716,7 +710,7 @@ template __global__ void groestl512_gpu_hash(int threads, uint32 // Perm groestl512_perm_P(g); groestl512_perm_Q(message); - + #pragma unroll 32 for(int u=0;u<32;u++) { @@ -737,36 +731,50 @@ template __global__ void groestl512_gpu_hash(int threads, uint32 } } -#define texDef(texname, texmem, texsource, texsize) \ +#define texDef(id, texname, texmem, texsource, texsize) { \ unsigned int *texmem; \ cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} -// Setup-Funktionen -__host__ void groestl512_cpu_init(int thr_id, int threads) +// Setup Function +__host__ +void groestl512_cpu_init(int thr_id, uint32_t threads) { // Texturen mit obigem Makro initialisieren - texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); - - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads); + texDef(0, t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256); + texDef(1, t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); + texDef(2, t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256); + texDef(3, t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); + texDef(4, t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256); + texDef(5, t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); + texDef(6, t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256); + texDef(7, t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); + + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_hash4output[thr_id], (size_t) 64 * threads); +} + +__host__ +void groestl512_cpu_free(int thr_id) +{ + for (int i=0; i <8; i++) + cudaFree(d_textures[thr_id][i]); + + cudaFree(d_hash4output[thr_id]); } static int BLOCKSIZE = 84; -__host__ void groestl512_cpu_setBlock(void *data, int len) +__host__ +void groestl512_cpu_setBlock(void *data, int len) // data muss 80/84-Byte haben! 
// heftyHash hat 32-Byte { @@ -785,47 +793,42 @@ __host__ void groestl512_cpu_setBlock(void *data, int len) msgBlock[28] = 0x80; msgBlock[31] = 0x01000000; } - // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird - // auf der GPU ausgeführt) + // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird + // auf der GPU ausgeführt) - // setze register + // setze register uint32_t groestl_state_init[32]; memset(groestl_state_init, 0, sizeof(uint32_t) * 32); groestl_state_init[31] = 0x20000; // state speichern - cudaMemcpyToSymbol( groestl_gpu_state, - groestl_state_init, - 128); + cudaMemcpyToSymbol(groestl_gpu_state, groestl_state_init, 128); // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch) - cudaMemcpyToSymbol( groestl_gpu_msg, - msgBlock, - 128); - + cudaMemcpyToSymbol(groestl_gpu_msg, msgBlock, 128); BLOCKSIZE = len; } -__host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) +__host__ void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy) { // Hefty1 Hashes kopieren (eigentlich nur zum debuggen) - if (copy) - cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); + if (copy) + CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice)); } -__host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) +__host__ void groestl512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce) { - const int threadsperblock = 128; + const uint32_t threadsperblock = 128; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs + // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; if (BLOCKSIZE == 84) - groestl512_gpu_hash<84><<>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + groestl512_gpu_hash<84><<>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); else if (BLOCKSIZE == 80) - groestl512_gpu_hash<80><<>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + groestl512_gpu_hash<80><<>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); } diff --git a/heavy/cuda_groestl512.h b/heavy/cuda_groestl512.h deleted file mode 100644 index 0cdc13b809..0000000000 --- a/heavy/cuda_groestl512.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _CUDA_GROESTL512_H -#define _CUDA_GROESTL512_H - -void groestl512_cpu_init(int thr_id, int threads); -void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -void groestl512_cpu_setBlock(void *data, int len); -void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce); - -#endif \ No newline at end of file diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu index 4b60818c15..7700acc0d1 100644 --- a/heavy/cuda_hefty1.cu +++ b/heavy/cuda_hefty1.cu @@ -1,28 +1,14 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -#define USE_SHARED 1 - -// aus cpu-miner.c -extern int device_map[8]; +#include "miner.h" -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +#include "cuda_helper.h" -// Folgende Definitionen später durch header ersetzen -typedef 
unsigned int uint32_t; -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; +#define USE_SHARED 1 -// globaler Speicher für alle HeftyHashes aller Threads -uint32_t *d_heftyHashes[8]; +// globaler Speicher für alle HeftyHashes aller Threads +uint32_t *heavy_heftyHashes[MAX_GPUS]; /* Hash-Tabellen */ __constant__ uint32_t hefty_gpu_constantTable[64]; @@ -45,8 +31,9 @@ uint32_t hefty_cpu_hashTable[] = { 0x510e527fUL, 0x9b05688cUL, 0x1f83d9abUL, - 0x5be0cd19UL }; - + 0x5be0cd19UL +}; + uint32_t hefty_cpu_constantTable[] = { 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, @@ -66,11 +53,16 @@ uint32_t hefty_cpu_constantTable[] = { 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL }; -//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -static __host__ __device__ uint32_t S(uint32_t x, int n) +#if 0 +#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#else +__host__ __device__ +static uint32_t S(uint32_t x, int n) { return (((x) >> (n)) | ((x) << (32 - (n)))); } +#endif + #define R(x, n) ((x) >> (n)) #define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Maj(x, y, z) ((x & (y | z)) | (y & z)) @@ -83,7 +75,9 @@ static __host__ __device__ uint32_t S(uint32_t x, int n) // uint8_t #define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) ) -__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x) + +__host__ __forceinline__ __device__ +uint8_t smoosh2(uint32_t x) { uint16_t w = (x >> 16) ^ (x & 0xffff); uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) ); @@ -93,13 +87,14 @@ __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x) #define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F ) #define getByte(x,y) ( ((x) >> (y)) & 0xFF ) -__host__ __forceinline__ __device__ void Mangle(uint32_t *inp) +__host__ __forceinline__ __device__ +void Mangle(uint32_t *inp) { uint32_t r = smoosh4Quad(inp[0]); uint32_t inp0org; uint32_t tmp0Mask, tmp1Mask; uint32_t in1, in2, isAddition; - uint32_t tmp; + int32_t tmp; uint8_t b; inp[1] = inp[1] ^ S(inp[0], getByte(r, 24)); @@ -108,24 +103,24 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp) tmp = smoosh2(inp[1]); b = getByte(r,tmp); inp0org = S(inp[0], b); - tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0 - tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0 - - in1 = (inp[2] & ~inp0org) | + tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0 + tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0 + + in1 = (inp[2] & ~inp0org) | (tmp1Mask & ~inp[2] & inp0org) | (~tmp0Mask & ~inp[2] & inp0org); in2 = inp[2] += ~inp0org; isAddition = ~tmp0Mask & tmp1Mask; inp[2] = isAddition ? 
in2 : in1; - + r += 0x01010101; tmp = smoosh2(inp[1] ^ inp[2]); b = getByte(r,tmp); inp0org = S(inp[0], b); - tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0 - tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0 + tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0 + tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0 - in1 = (inp[3] & ~inp0org) | + in1 = (inp[3] & ~inp0org) | (tmp1Mask & ~inp[3] & inp0org) | (~tmp0Mask & ~inp[3] & inp0org); in2 = inp[3] += ~inp0org; @@ -135,20 +130,23 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp) inp[0] ^= (inp[1] ^ inp[2]) + inp[3]; } -__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x) +__host__ __forceinline__ __device__ +void Absorb(uint32_t *inp, uint32_t x) { inp[0] ^= x; Mangle(inp); } -__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp) +__host__ __forceinline__ __device__ +uint32_t Squeeze(uint32_t *inp) { uint32_t y = inp[0]; Mangle(inp); return y; } -__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x) +__host__ __forceinline__ __device__ +uint32_t Br(uint32_t *sponge, uint32_t x) { uint32_t r = Squeeze(sponge); uint32_t t = ((r >> 8) & 0x1F); @@ -162,11 +160,12 @@ __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x) return retVal; } -__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) +__device__ __forceinline__ +void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) { uint32_t tmpBr; - uint32_t brG = Br(sponge, regs[6]); + uint32_t brG = Br(sponge, regs[6]); uint32_t brF = Br(sponge, regs[5]); uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; uint32_t brE = Br(sponge, regs[4]); @@ -185,11 +184,12 @@ __forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint regs[4] += tmpBr; } -__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) +__host__ +void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) { uint32_t tmpBr; - uint32_t brG = Br(sponge, regs[6]); + uint32_t brG = Br(sponge, regs[6]); uint32_t brF = Br(sponge, regs[5]); uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; uint32_t brE = Br(sponge, regs[4]); @@ -207,11 +207,11 @@ __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t * regs[4] += tmpBr; } -// Die Hash-Funktion -__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash) +__global__ +void hefty_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) { - #if USE_SHARED - extern __shared__ char heftytab[]; +#if USE_SHARED + extern __shared__ unsigned char heftytab[]; if(threadIdx.x < 64) { *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x]; @@ -220,12 +220,12 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa __syncthreads(); #endif - int thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - // bestimme den aktuellen Zähler + // bestimme den aktuellen Zähler uint32_t nounce = startNounce + thread; - + // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory // reduktion von 256 byte auf 128 byte uint32_t W1[16]; @@ -235,7 +235,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa uint32_t regs[8]; uint32_t hash[8]; uint32_t sponge[4]; - + #pragma unroll 4 for(int k=0; k < 4; 
k++) sponge[k] = hefty_gpu_sponge[k]; @@ -247,7 +247,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa regs[k] = hefty_gpu_register[k]; hash[k] = regs[k]; } - + //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding #pragma unroll 16 for(int k=0;k<16;k++) @@ -268,34 +268,34 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa } // Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ... - -#pragma unroll 3 + + for(int k=0;k<3;k++) { - #pragma unroll 2 + for(int j=0;j<2;j++) W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - #pragma unroll 5 + for(int j=2;j<7;j++) W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; - #pragma unroll 8 + for(int j=7;j<15;j++) W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - #pragma unroll 16 + for(int j=0;j<16;j++) { Absorb(sponge, regs[3] + regs[7]); hefty_gpu_round(regs, W2[j], heftyLookUp(j + ((k+1)<<4)), sponge); } - #pragma unroll 16 + for(int j=0;j<16;j++) W1[j] = W2[j]; } - + #pragma unroll 8 for(int k=0;k<8;k++) hash[k] += regs[k]; @@ -306,29 +306,34 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa } } -// Setup-Funktionen -__host__ void hefty_cpu_init(int thr_id, int threads) +__host__ +void hefty_cpu_init(int thr_id, uint32_t threads) { cudaSetDevice(device_map[thr_id]); - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( hefty_gpu_constantTable, + cudaMemcpyToSymbol( hefty_gpu_constantTable, hefty_cpu_constantTable, sizeof(uint32_t) * 64 ); - // Speicher für alle Hefty1 hashes belegen - cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads); + // Speicher für alle Hefty1 hashes belegen + CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], (size_t) 32 * threads)); +} + +__host__ +void hefty_cpu_free(int thr_id) +{ + cudaFree(heavy_heftyHashes[thr_id]); } -__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len) +__host__ +void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len) // data muss 80/84-Byte haben! { // Nachricht expandieren und setzen uint32_t msgBlock[32]; - memset(msgBlock, 0, sizeof(uint32_t) * 32); + memset(msgBlock, 0, sizeof(msgBlock)); memcpy(&msgBlock[0], data, len); if (len == 84) { msgBlock[21] |= 0x80; @@ -337,17 +342,17 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len) msgBlock[20] |= 0x80; msgBlock[31] = 640; // bitlen } - + for(int i=0;i<31;i++) // Byteorder drehen msgBlock[i] = SWAB32(msgBlock[i]); - // die erste Runde wird auf der CPU durchgeführt, da diese für + // die erste Runde wird auf der CPU durchgeführt, da diese für // alle Threads gleich ist. Der Hash wird dann an die Threads - // übergeben + // übergeben // Erstelle expandierten Block W - uint32_t W[64]; - memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); + uint32_t W[64]; + memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); for(int j=16;j<64;j++) W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; @@ -362,7 +367,7 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len) { regs[k] = hefty_cpu_hashTable[k]; hash[k] = regs[k]; - } + } // 1. 
Runde for(int j=0;j<16;j++) @@ -384,40 +389,59 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len) hash[k] += regs[k]; // sponge speichern - - cudaMemcpyToSymbol( hefty_gpu_sponge, - sponge, - sizeof(uint32_t) * 4 ); + cudaMemcpyToSymbol(hefty_gpu_sponge, sponge, 16); // hash speichern - cudaMemcpyToSymbol( hefty_gpu_register, - hash, - sizeof(uint32_t) * 8 ); - + cudaMemcpyToSymbol(hefty_gpu_register, hash, 32); // Blockheader setzen (korrekte Nonce fehlt da drin noch) - cudaMemcpyToSymbol( hefty_gpu_blockHeader, - &msgBlock[16], - 64); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(hefty_gpu_blockHeader, &msgBlock[16], 64)); } -__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) +__host__ +void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce) { - // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, - // alle anderen mit 512 Threads. - int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512; + uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - #if USE_SHARED - size_t shared_size = 8 * 64 * sizeof(uint32_t); + // Größe des dynamischen Shared Memory Bereichs +#if USE_SHARED + int shared_size = 8 * 64 * sizeof(uint32_t); #else - size_t shared_size = 0; + int shared_size = 0; #endif - hefty_gpu_hash<<>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); + hefty_gpu_hash <<< grid, block, shared_size >>> (threads, startNounce, heavy_heftyHashes[thr_id]); // Strategisches Sleep Kommando zur Senkung der CPU Last MyStreamSynchronize(NULL, 0, thr_id); } + +__global__ +__launch_bounds__(128, 8) +void hefty_gpu_copy(const uint32_t threads, uint32_t* d_heftyhash, uint64_t* d_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 8U; // 32 / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_heftyhash[offset]); + uint4 *pdst = (uint4*) (&d_hash[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = make_uint4(0,0,0,0); + pdst[3] = make_uint4(0,0,0,0); + } +} + +__host__ +void hefty_copy_hashes(int thr_id, uint32_t threads, uint32_t* d_outputhash) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + hefty_gpu_copy <<< grid, block >>> (threads, heavy_heftyHashes[thr_id], (uint64_t*) d_outputhash); + cudaStreamSynchronize(NULL); +} + diff --git a/heavy/cuda_hefty1.h b/heavy/cuda_hefty1.h deleted file mode 100644 index 17b196c836..0000000000 --- a/heavy/cuda_hefty1.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _CUDA_HEFTY1_H -#define _CUDA_HEFTY1_H - -void hefty_cpu_hash(int thr_id, int threads, int startNounce); -void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len); -void hefty_cpu_init(int thr_id, int threads); - -#endif \ No newline at end of file diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu index eb69e3bc3b..3e94f16220 100644 --- a/heavy/cuda_keccak512.cu +++ b/heavy/cuda_keccak512.cu @@ -1,158 +1,148 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#include "cuda_helper.h" -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t 
*d_heftyHashes[8]; -extern uint32_t *d_nonceVector[8]; +// globaler Speicher für alle HeftyHashes aller Threads +extern uint32_t *heavy_heftyHashes[MAX_GPUS]; +extern uint32_t *heavy_nonceVector[MAX_GPUS]; -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash3output[8]; -extern uint32_t *d_hash4output[8]; -extern uint32_t *d_hash5output[8]; +// globaler Speicher für unsere Ergebnisse +uint32_t *d_hash3output[MAX_GPUS]; +extern uint32_t *d_hash4output[MAX_GPUS]; +extern uint32_t *d_hash5output[MAX_GPUS]; // der Keccak512 State nach der ersten Runde (72 Bytes) __constant__ uint64_t c_State[25]; -// die Message (72 Bytes) für die zweite Runde auf der GPU +// die Message (72 Bytes) für die zweite Runde auf der GPU __constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding // ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------ -#include "cuda_helper.h" - #define U32TO64_LE(p) \ - (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) #define U64TO32_LE(p, v) \ - *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); static __device__ void mycpy72(uint32_t *d, const uint32_t *s) { -#pragma unroll 18 - for (int k=0; k < 18; ++k) d[k] = s[k]; + #pragma unroll 18 + for (int k=0; k < 18; ++k) d[k] = s[k]; } static __device__ void mycpy32(uint32_t *d, const uint32_t *s) { -#pragma unroll 8 - for (int k=0; k < 8; ++k) d[k] = s[k]; + #pragma unroll 8 + for (int k=0; k < 8; ++k) d[k] = s[k]; } typedef struct keccak_hash_state_t { - uint64_t state[25]; // 25*2 - uint32_t buffer[72/4]; // 72 + uint64_t state[25]; // 25*2 + uint32_t buffer[72/4]; // 72 } keccak_hash_state; __device__ void statecopy(uint64_t *d, uint64_t *s) { -#pragma unroll 25 - for (int i=0; i < 25; ++i) - d[i] = s[i]; + #pragma unroll 25 + for (int i=0; i < 25; ++i) + d[i] = s[i]; } static const uint64_t host_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull }; __constant__ uint64_t c_keccak_round_constants[24]; __host__ __device__ void keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { - size_t i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ -#pragma unroll 9 - for (i = 0; i < 72 / 8; i++, in += 2) - s[i] ^= U32TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] 
^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ + for (i = 0; i < 9 /* 72/8 */; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } } // Die Hash-Funktion -template __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +template __global__ void keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - // bestimme den aktuellen Zähler + // bestimme den aktuellen Zähler //uint32_t nounce = startNounce + thread; uint32_t nounce = nonceVector[thread]; @@ -164,7 +154,7 @@ template __global__ void keccak512_gpu_hash(int threads, uint32_ #pragma unroll 25 for (int i=0; i < 25; ++i) keccak_gpu_state[i] = c_State[i]; - + // Message2 in den Puffer holen uint32_t msgBlock[18]; mycpy72(msgBlock, c_PaddedMessage2); @@ -175,7 +165,7 @@ template __global__ void keccak512_gpu_hash(int threads, uint32_ // den individuellen Hefty1 Hash einsetzen mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]); - // den Block einmal gut durchschütteln + // den Block einmal gut durchschütteln keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants); // das Hash erzeugen @@ -195,8 +185,8 @@ template __global__ void keccak512_gpu_hash(int threads, uint32_ // ---------------------------- END CUDA keccak512 functions ------------------------------------ -// Setup-Funktionen -__host__ void keccak512_cpu_init(int thr_id, int threads) +__host__ +void keccak512_cpu_init(int thr_id, uint32_t 
threads) { // Kopiere die Hash-Tabellen in den GPU-Speicher cudaMemcpyToSymbol( c_keccak_round_constants, @@ -204,8 +194,14 @@ __host__ void keccak512_cpu_init(int thr_id, int threads) sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice); - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads); + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_hash3output[thr_id], (size_t) 64 * threads); +} + +__host__ +void keccak512_cpu_free(int thr_id) +{ + cudaFree(d_hash3output[thr_id]); } // ----------------BEGIN keccak512 CPU version from scrypt-jane code -------------------- @@ -220,23 +216,24 @@ __host__ void keccak512_cpu_init(int thr_id, int threads) static int BLOCKSIZE = 84; -__host__ void keccak512_cpu_setBlock(void *data, int len) +__host__ +void keccak512_cpu_setBlock(void *data, int len) // data muss 80 oder 84-Byte haben! // heftyHash hat 32-Byte { // CH - // state init + // state init uint64_t keccak_cpu_state[25]; memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state)); - // erste Runde + // erste Runde keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants); // state kopieren cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); - // keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke - // zu jeweils + // keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke + // zu jeweils uint32_t msgBlock[18]; memset(msgBlock, 0, 18 * sizeof(uint32_t)); @@ -246,41 +243,43 @@ __host__ void keccak512_cpu_setBlock(void *data, int len) else if (len == 80) memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8); - // Nachricht abschließen + // Nachricht abschließen if (len == 84) msgBlock[11] = 0x01; else if (len == 80) msgBlock[10] = 0x01; msgBlock[17] = 0x80000000; - - // Message 2 ins Constant Memory kopieren (die variable Nonce und + + // Message 2 ins Constant Memory kopieren (die variable Nonce und // der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden) cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice ); BLOCKSIZE = len; } - -__host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) +__host__ +void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy) { // Hefty1 Hashes kopieren - if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); + if (copy) + CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice)); //else cudaThreadSynchronize(); } -__host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) +__host__ +void keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce) { - const int threadsperblock = 128; + const uint32_t threadsperblock = 128; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs + // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; if (BLOCKSIZE==84) - keccak512_gpu_hash<84><<>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + keccak512_gpu_hash<84><<>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); else if (BLOCKSIZE==80) - keccak512_gpu_hash<80><<>>(threads, startNounce, 
d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + keccak512_gpu_hash<80><<>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); } diff --git a/heavy/cuda_keccak512.h b/heavy/cuda_keccak512.h deleted file mode 100644 index 1182447573..0000000000 --- a/heavy/cuda_keccak512.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _CUDA_KECCAK512_H -#define _CUDA_KECCAK512_H - -void keccak512_cpu_init(int thr_id, int threads); -void keccak512_cpu_setBlock(void *data, int len); -void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce); - -#endif diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu index f520778b51..7441621b2b 100644 --- a/heavy/cuda_sha256.cu +++ b/heavy/cuda_sha256.cu @@ -1,19 +1,14 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned int uint32_t; +#include "cuda_helper.h" -// globaler Speicher für alle HeftyHashes aller Threads -extern uint32_t *d_heftyHashes[8]; -extern uint32_t *d_nonceVector[8]; +// globaler Speicher für alle HeftyHashes aller Threads +extern uint32_t *heavy_heftyHashes[MAX_GPUS]; +extern uint32_t *heavy_nonceVector[MAX_GPUS]; -// globaler Speicher für unsere Ergebnisse -uint32_t *d_hash2output[8]; +// globaler Speicher für unsere Ergebnisse +uint32_t *d_hash2output[MAX_GPUS]; /* Hash-Tabellen */ @@ -47,15 +42,15 @@ uint32_t sha256_cpu_constantTable[] = { #define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) // Die Hash-Funktion -template __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +template __global__ void sha256_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - // bestimme den aktuellen Zähler + // bestimme den aktuellen Zähler uint32_t nounce = startNounce + thread; nonceVector[thread] = nounce; - + // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory uint32_t W1[16]; uint32_t W2[16]; @@ -71,10 +66,10 @@ template __global__ void sha256_gpu_hash(int threads, uint32_t s regs[k] = sha256_gpu_register[k]; hash[k] = regs[k]; } - + // 2. 
Runde - //memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke - //memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen + //memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke + //memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen #pragma unroll 16 for(int k=0;k<16;k++) W1[k] = sha256_gpu_blockHeader[k]; @@ -95,7 +90,7 @@ template __global__ void sha256_gpu_hash(int threads, uint32_t s uint32_t T1, T2; T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j]; T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - + #pragma unroll 7 for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; regs[0] = T1 + T2; @@ -126,7 +121,7 @@ template __global__ void sha256_gpu_hash(int threads, uint32_t s uint32_t T1, T2; T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - + #pragma unroll 7 for (int l=6; l >= 0; l--) regs[l+1] = regs[l]; regs[0] = T1 + T2; @@ -141,14 +136,14 @@ template __global__ void sha256_gpu_hash(int threads, uint32_t s /* for(int j=16;j<64;j++) W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; - + #pragma unroll 64 for(int j=0;j<64;j++) { uint32_t T1, T2; T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j]; T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - + #pragma unroll 7 for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; regs[0] = T1 + T2; @@ -165,16 +160,23 @@ template __global__ void sha256_gpu_hash(int threads, uint32_t s } } -// Setup-Funktionen -__host__ void sha256_cpu_init(int thr_id, int threads) +// Setup Function +__host__ +void sha256_cpu_init(int thr_id, uint32_t threads) { // Kopiere die Hash-Tabellen in den GPU-Speicher cudaMemcpyToSymbol( sha256_gpu_constantTable, sha256_cpu_constantTable, sizeof(uint32_t) * 64 ); - // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads); + // Speicher für alle Ergebnisse belegen + cudaMalloc(&d_hash2output[thr_id], (size_t) 8 * sizeof(uint32_t) * threads); +} + +__host__ +void sha256_cpu_free(int thr_id) +{ + cudaFree(d_hash2output[thr_id]); } static int BLOCKSIZE = 84; @@ -189,25 +191,25 @@ __host__ void sha256_cpu_setBlock(void *data, int len) memset(msgBlock, 0, sizeof(uint32_t) * 32); memcpy(&msgBlock[0], data, len); if (len == 84) { - memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen + memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen msgBlock[29] |= 0x80; msgBlock[31] = 928; // bitlen } else if (len == 80) { - memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen + memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen msgBlock[28] |= 0x80; msgBlock[31] = 896; // bitlen } - + for(int i=0;i<31;i++) // Byteorder drehen msgBlock[i] = SWAB32(msgBlock[i]); - // die erste Runde wird auf der CPU durchgeführt, da diese für + // die erste Runde wird auf der CPU durchgeführt, da diese für // alle Threads gleich ist. 
Der Hash wird dann an die Threads - // übergeben + // übergeben uint32_t W[64]; // Erstelle expandierten Block W - memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); + memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); for(int j=16;j<64;j++) W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; @@ -228,7 +230,7 @@ __host__ void sha256_cpu_setBlock(void *data, int len) uint32_t T1, T2; T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j]; T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - + //#pragma unroll 7 for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; // sollte mal noch durch memmov ersetzt werden! @@ -253,27 +255,28 @@ __host__ void sha256_cpu_setBlock(void *data, int len) BLOCKSIZE = len; } -__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) +__host__ void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy) { // Hefty1 Hashes kopieren - if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice ); + if (copy) + CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice)); //else cudaThreadSynchronize(); } -__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce) +__host__ void sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce) { - const int threadsperblock = 256; + const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs + // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; if (BLOCKSIZE == 84) - sha256_gpu_hash<84><<>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + sha256_gpu_hash<84><<>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); else if (BLOCKSIZE == 80) { - sha256_gpu_hash<80><<>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + sha256_gpu_hash<80><<>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]); } } diff --git a/heavy/cuda_sha256.h b/heavy/cuda_sha256.h deleted file mode 100644 index 03385d125a..0000000000 --- a/heavy/cuda_sha256.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _CUDA_SHA256_H -#define _CUDA_SHA256_H - -void sha256_cpu_init(int thr_id, int threads); -void sha256_cpu_setBlock(void *data, int len); -void sha256_cpu_hash(int thr_id, int threads, int startNounce); -void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -#endif diff --git a/heavy/heavy.cu b/heavy/heavy.cu index 98728dc222..a5349c26de 100644 --- a/heavy/heavy.cu +++ b/heavy/heavy.cu @@ -1,484 +1,422 @@ -#include +#include #include #include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" #include -#ifndef _WIN32 +#ifndef WITH_HEAVY_ALGO #include +#include "miner.h" +// nonce array also used in other algos +uint32_t *heavy_nonceVector[MAX_GPUS]; +int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen) +{ + applog(LOG_ERR, "heavy algo not included in this build!"); + sleep(3); + return -1; +} +void free_heavy(int thr_id) {} + +#else + +// include thrust if possible +#if defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ >= 2 && CUDA_VERSION < 7000 +#warning 
"Heavy: incompatible GCC version!" +#define USE_THRUST 0 +#else +#define USE_THRUST 1 #endif -// include thrust -#include +#if USE_THRUST #include #include -#include +#endif #include "miner.h" +#include "cuda_helper.h" -#include "hefty1.h" -#include "sph/sph_keccak.h" -#include "sph/sph_blake.h" -#include "sph/sph_groestl.h" - -#include "heavy/cuda_hefty1.h" -#include "heavy/cuda_sha256.h" -#include "heavy/cuda_keccak512.h" -#include "heavy/cuda_groestl512.h" -#include "heavy/cuda_blake512.h" -#include "heavy/cuda_combine.h" - -extern uint32_t *d_hash2output[8]; -extern uint32_t *d_hash3output[8]; -extern uint32_t *d_hash4output[8]; -extern uint32_t *d_hash5output[8]; +// nonce array also used in other algos +uint32_t *heavy_nonceVector[MAX_GPUS]; -#define HEAVYCOIN_BLKHDR_SZ 84 -#define MNR_BLKHDR_SZ 80 +extern uint32_t *d_hash2output[MAX_GPUS]; +extern uint32_t *d_hash3output[MAX_GPUS]; +extern uint32_t *d_hash4output[MAX_GPUS]; +extern uint32_t *d_hash5output[MAX_GPUS]; -// nonce-array für die threads -uint32_t *d_nonceVector[8]; +#define HEAVYCOIN_BLKHDR_SZ 84 +#define MNR_BLKHDR_SZ 80 -/* Combines top 64-bits from each hash into a single hash */ -static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4) -{ - const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; - int bits; - unsigned int i; - uint32_t mask; - unsigned int k; - - /* Transpose first 64 bits of each hash into out */ - memset(out, 0, 32); - bits = 0; - for (i = 7; i >= 6; i--) { - for (mask = 0x80000000; mask; mask >>= 1) { - for (k = 0; k < 4; k++) { - out[(255 - bits)/32] <<= 1; - if ((hash[k][i] & mask) != 0) - out[(255 - bits)/32] |= 1; - bits++; - } - } - } -} +extern uint32_t *heavy_heftyHashes[MAX_GPUS]; #ifdef _MSC_VER #include static uint32_t __inline bitsset( uint32_t x ) { - DWORD r = 0; - _BitScanReverse(&r, x); - return r; + DWORD r = 0; + _BitScanReverse(&r, x); + return r; } #else static uint32_t bitsset( uint32_t x ) { - return 31-__builtin_clz(x); + return 31-__builtin_clz(x); } #endif // Finde das high bit in einem Multiword-Integer. static int findhighbit(const uint32_t *ptarget, int words) { - int i; - int highbit = 0; - for (i=words-1; i >= 0; --i) - { - if (ptarget[i] != 0) { - highbit = i*32 + bitsset(ptarget[i])+1; - break; - } - } - return highbit; + int i; + int highbit = 0; + for (i=words-1; i >= 0; --i) + { + if (ptarget[i] != 0) { + highbit = i*32 + bitsset(ptarget[i])+1; + break; + } + } + return highbit; } // Generiere ein Multiword-Integer das die Zahl -// (2 << highbit) - 1 repräsentiert. +// (2 << highbit) - 1 repräsentiert. static void genmask(uint32_t *ptarget, int words, int highbit) { - int i; - for (i=words-1; i >= 0; --i) - { - if ((i+1)*32 <= highbit) - ptarget[i] = 0xffffffff; - else if (i*32 > highbit) - ptarget[i] = 0x00000000; - else - ptarget[i] = (1 << (highbit-i*32)) - 1; - } + int i; + for (i=words-1; i >= 0; --i) + { + if ((i+1)*32 <= highbit) + ptarget[i] = UINT32_MAX; + else if (i*32 > highbit) + ptarget[i] = 0x00000000; + else + ptarget[i] = (1 << (highbit-i*32)) - 1; + } } struct check_nonce_for_remove -{ - check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : - m_target(target), - m_hashes(hashes), - m_hashlen(hashlen), - m_startNonce(startNonce) { } - - __device__ - bool operator()(const uint32_t x) - { - // Position im Hash Buffer - uint32_t hashIndex = x - m_startNonce; - // Wert des Hashes (als uint64_t) auslesen. - // Steht im 6. und 7. 
Wort des Hashes (jeder dieser Hashes hat 512 Bits) - uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); - // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. - return (hashValue & m_target) != hashValue; - } - - uint64_t m_target; - uint32_t *m_hashes; - uint32_t m_hashlen; - uint32_t m_startNonce; +{ + check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : + m_target(target), + m_hashes(hashes), + m_hashlen(hashlen), + m_startNonce(startNonce) { } + + uint64_t m_target; + uint32_t *m_hashes; + uint32_t m_hashlen; + uint32_t m_startNonce; + + __device__ + bool operator()(const uint32_t x) + { + // Position im Hash Buffer + uint32_t hashIndex = x - m_startNonce; + // Wert des Hashes (als uint64_t) auslesen. + // Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits) + uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); + bool res = (hashValue & m_target) != hashValue; + //printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target); + // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. + return res; + } }; -// Zahl der CUDA Devices im System bestimmen -extern "C" int cuda_num_devices() +static bool init[MAX_GPUS] = { 0 }; + +__host__ +int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen) { - int version; - cudaError_t err = cudaDriverGetVersion(&version); - if (err != cudaSuccess) - { - applog(LOG_ERR, "Unable to query CUDA driver version! Is an nVidia driver installed?"); - exit(1); - } - - int maj = version / 1000, min = version % 100; // same as in deviceQuery sample - if (maj < 5 || (maj == 5 && min < 5)) - { - applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", 5, 5); - exit(1); - } - - int GPU_N; - err = cudaGetDeviceCount(&GPU_N); - if (err != cudaSuccess) - { - applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?"); - exit(1); - } - return GPU_N; -} + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + // CUDA will process thousands of threads. + uint32_t throughput = cuda_default_throughput(thr_id, (1U << 19) - 256); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + int rc = 0; + uint32_t *hash = NULL; + uint32_t *cpu_nonceVector = NULL; + + int nrmCalls[6]; + memset(nrmCalls, 0, sizeof(int) * 6); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + // für jeden Hash ein individuelles Target erstellen basierend + // auf dem höchsten Bit, das in ptarget gesetzt ist. 
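
The code below builds one relaxed target per hash stage: findhighbit() returns the position of the highest set bit in the 256-bit share target, genmask() turns a bit position into a word array with all lower bits set, and check_nonce_for_remove() compares that mask against a 64-bit excerpt of each intermediate hash (words 6 and 7 of the 512-bit value, as the comment above notes). A minimal host-side sketch of that filter, collapsed to a single 64-bit word; mask64(), would_remove() and the sample values are illustrative only, not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* single-word analogue of genmask(): all bits below `highbit` set */
static uint64_t mask64(int highbit)
{
	return (highbit >= 64) ? UINT64_MAX : ((1ULL << highbit) - 1);
}

/* same test as check_nonce_for_remove(): drop the nonce if any bit outside the mask is set */
static int would_remove(uint64_t hashValue, uint64_t target)
{
	return (hashValue & target) != hashValue;
}

int main(void)
{
	uint64_t t = mask64(40);                                  /* illustrative intermediate target */
	printf("%d\n", would_remove(0x0000004512345678ULL, t));   /* 0 -> nonce survives  */
	printf("%d\n", would_remove(0x0100004512345678ULL, t));   /* 1 -> nonce discarded */
	return 0;
}

The highbit/4 arithmetic below scales the 256-bit target position down to the 64-bit excerpt each stage inspects; candidates that survive all four pre-filters are still verified against the full ptarget on the CPU with fulltest() and heavycoin_hash().
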
+ int highbit = findhighbit(ptarget, 8); + uint32_t target2[2], target3[2], target4[2], target5[2]; + genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 + genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 + genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 + genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + hefty_cpu_init(thr_id, throughput); + sha256_cpu_init(thr_id, throughput); + keccak512_cpu_init(thr_id, throughput); + groestl512_cpu_init(thr_id, throughput); + blake512_cpu_init(thr_id, throughput); + combine_cpu_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput)); + + init[thr_id] = true; + } + + // weird but require at least one cudaSetDevice first + CUDA_SAFE_CALL(cudaMallocHost(&hash, (size_t) 32 * throughput)); + CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, sizeof(uint32_t) * throughput)); + + if (blocklen == HEAVYCOIN_BLKHDR_SZ) + { + uint16_t *ext = (uint16_t*) &pdata[20]; + + if (opt_vote > maxvote && !opt_benchmark) { + applog(LOG_WARNING, "Your block reward vote (%hu) exceeds the maxvote reported by the pool (%hu).", + opt_vote, maxvote); + } + + if (opt_trust_pool && opt_vote > maxvote) { + applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool."); + ext[0] = maxvote; + } + else + ext[0] = opt_vote; + } + + // Setze die Blockdaten + hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen); + sha256_cpu_setBlock(pdata, blocklen); + keccak512_cpu_setBlock(pdata, blocklen); + groestl512_cpu_setBlock(pdata, blocklen); + blake512_cpu_setBlock(pdata, blocklen); + + do { + uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; + + ////// Compaction init + + hefty_cpu_hash(thr_id, throughput, pdata[19]); + sha256_cpu_hash(thr_id, throughput, pdata[19]); + + // Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier. 
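
The MyStreamSynchronize() call right after this comment is the cudaminer-style "sleep before you block" synchronization; the implementation being removed from this file further down shows the idea (the new code still calls it, presumably from a shared CUDA helper). It keeps an exponentially smoothed estimate of how long the GPU usually needs at this point, sleeps for roughly 95% of that with usleep(), and only then issues the blocking cudaStreamSynchronize(), so the CPU does not spin through the longest wait phase. A stripped-down sketch of the idea, reduced to a single timing slot instead of one estimate per situation and thread; sleepy_stream_sync is an illustrative name:

#include <cuda_runtime.h>
#include <sys/time.h>
#include <unistd.h>

static double ema = 0.0; /* smoothed wait time in seconds (0 at start: no sleep on the first call) */

static cudaError_t sleepy_stream_sync(cudaStream_t stream)
{
	cudaError_t result = cudaSuccess;
	double tsync = 0.0;
	double tsleep = 0.95 * ema;
	if (cudaStreamQuery(stream) == cudaErrorNotReady) {
		usleep((useconds_t)(1e6 * tsleep));        /* sleep instead of busy-waiting */
		struct timeval t0, t1;
		gettimeofday(&t0, NULL);
		result = cudaStreamSynchronize(stream);    /* wait out the remainder */
		gettimeofday(&t1, NULL);
		tsync = (t1.tv_sec - t0.tv_sec) + 1e-6 * (t1.tv_usec - t0.tv_usec);
	}
	ema = 0.95 * ema + 0.05 * (tsleep + tsync);    /* update the estimate for next time */
	return result;
}

int main()
{
	sleepy_stream_sync(0); /* default stream: returns immediately when the GPU is idle */
	return 0;
}

The 0.95/0.05 smoothing factors mirror the original routine; only the per-situation std::map bookkeeping is omitted here.
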
+ MyStreamSynchronize(NULL, 1, thr_id); + +#if USE_THRUST + thrust::device_ptr devNoncePtr(heavy_nonceVector[thr_id]); + thrust::device_ptr devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput); + + ////// Compaction + uint64_t *t = (uint64_t*) target2; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + + ////// Compaction + t = (uint64_t*) target3; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + + ////// Compaction + t = (uint64_t*) target5; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + + ////// Compaction + t = (uint64_t*) target4; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); +#else + // todo (nvlabs cub ?) + actualNumberOfValuesInNonceVectorGPU = 0; +#endif + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + // combine + combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); + + if (opt_tracegpu) { + applog(LOG_BLUE, "heavy GPU hash:"); + applog_hash((uchar*)hash); + } + + // Ergebnisse kopieren + if(actualNumberOfValuesInNonceVectorGPU > 0) + { + size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU; + cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost); + + for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++) + { + uint32_t nonce = cpu_nonceVector[i]; + uint32_t *foundhash = &hash[8*i]; + if (foundhash[7] <= ptarget[7] && fulltest(foundhash, ptarget)) { + uint32_t vhash[8]; + pdata[19] += nonce - pdata[19]; + heavycoin_hash((uchar*)vhash, (uchar*)pdata, blocklen); + if (memcmp(vhash, foundhash, 32)) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", nonce); + } else { + work_set_target_ratio(work, vhash); + rc = 1; + goto exit; + } + } + } + } -// Gerätenamen holen -extern char *device_name[8]; -extern int device_map[8]; +emptyNonceVector: + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; -extern "C" void cuda_devicenames() -{ - cudaError_t err; - int GPU_N; - err = cudaGetDeviceCount(&GPU_N); - if (err != cudaSuccess) - { - applog(LOG_ERR, "Unable to query number of CUDA devices! 
Is an nVidia driver installed?"); - exit(1); - } - - for (int i=0; i < GPU_N; i++) - { - cudaDeviceProp props; - cudaGetDeviceProperties(&props, device_map[i]); - - device_name[i] = strdup(props.name); - } -} + } while (!work_restart[thr_id].restart); -static bool substringsearch(const char *haystack, const char *needle, int &match) -{ - int hlen = strlen(haystack); - int nlen = strlen(needle); - for (int i=0; i < hlen; ++i) - { - if (haystack[i] == ' ') continue; - int j=0, x = 0; - while(j < nlen) - { - if (haystack[i+x] == ' ') {++x; continue;} - if (needle[j] == ' ') {++j; continue;} - if (needle[j] == '#') return ++match == needle[j+1]-'0'; - if (tolower(haystack[i+x]) != tolower(needle[j])) break; - ++j; ++x; - } - if (j == nlen) return true; - } - return false; -} +exit: + *hashes_done = pdata[19] - first_nonce; -// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1) -extern "C" int cuda_finddevice(char *name) -{ - int num = cuda_num_devices(); - int match = 0; - for (int i=0; i < num; ++i) - { - cudaDeviceProp props; - if (cudaGetDeviceProperties(&props, i) == cudaSuccess) - if (substringsearch(props.name, name, match)) return i; - } - return -1; + cudaFreeHost(cpu_nonceVector); + cudaFreeHost(hash); + CUDA_LOG_ERROR(); + + return rc; } -// Zeitsynchronisations-Routine von cudaminer mit CPU sleep -typedef struct { double value[8]; } tsumarray; -cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id) +// cleanup +extern "C" void free_heavy(int thr_id) { - cudaError_t result = cudaSuccess; - if (situation >= 0) - { - static std::map tsum; - - double a = 0.95, b = 0.05; - if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence - - double tsync = 0.0; - double tsleep = 0.95 * tsum[situation].value[thr_id]; - if (cudaStreamQuery(stream) == cudaErrorNotReady) - { - usleep((useconds_t)(1e6*tsleep)); - struct timeval tv_start, tv_end; - gettimeofday(&tv_start, NULL); - result = cudaStreamSynchronize(stream); - gettimeofday(&tv_end, NULL); - tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec); - } - if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync); - } - else - result = cudaStreamSynchronize(stream); - return result; -} + if (!init[thr_id]) + return; -int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote, int blocklen); + cudaThreadSynchronize(); -extern "C" -int scanhash_heavy(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote, int blocklen) -{ - return scanhash_heavy_cpp(thr_id, pdata, - ptarget, max_nonce, hashes_done, maxvote, blocklen); -} + cudaFree(heavy_nonceVector[thr_id]); -extern bool opt_benchmark; + blake512_cpu_free(thr_id); + groestl512_cpu_free(thr_id); + hefty_cpu_free(thr_id); + keccak512_cpu_free(thr_id); + sha256_cpu_free(thr_id); + combine_cpu_free(thr_id); -int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote, int blocklen) -{ - // CUDA will process thousands of threads. 
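
For context on the throughput change: the removed code below fixes the scan size at 4096 * 128 = 524288 nonces per call, while the new scanhash_heavy() asks cuda_default_throughput(thr_id, (1U << 19) - 256) and logs the result via throughput2intensity(). Assuming the usual ccminer convention that intensity is roughly log2 of the thread count, the two defaults are nearly identical; a tiny sketch of that arithmetic (the log2 mapping is an assumption of this note, not something the patch defines):

#include <math.h>
#include <stdio.h>

int main(void)
{
	unsigned int old_throughput = 4096 * 128;        /* fixed value in the removed code */
	unsigned int new_default    = (1U << 19) - 256;  /* default requested in the new code */

	/* assumed relation: intensity ~= log2(thread count) */
	printf("old: %u threads ~ intensity %.2f\n", old_throughput, log2((double) old_throughput));
	printf("new: %u threads ~ intensity %.2f\n", new_default, log2((double) new_default));
	return 0;
}
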
- const int throughput = 4096 * 128; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000000ff; - - int rc = 0; - uint32_t *hash = NULL; - cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)); - uint32_t *cpu_nonceVector = NULL; - cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t)); - - int nrmCalls[6]; - memset(nrmCalls, 0, sizeof(int) * 6); - - uint32_t start_nonce = pdata[19]; - - // für jeden Hash ein individuelles Target erstellen basierend - // auf dem höchsten Bit, das in ptarget gesetzt ist. - int highbit = findhighbit(ptarget, 8); - uint32_t target2[2], target3[2], target4[2], target5[2]; - genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 - genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 - genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 - genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 - - static bool init[8] = {0,0,0,0,0,0,0,0}; - if (!init[thr_id]) - { - hefty_cpu_init(thr_id, throughput); - sha256_cpu_init(thr_id, throughput); - keccak512_cpu_init(thr_id, throughput); - groestl512_cpu_init(thr_id, throughput); - blake512_cpu_init(thr_id, throughput); - combine_cpu_init(thr_id, throughput); - init[thr_id] = true; - cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput); - } - - if (blocklen == HEAVYCOIN_BLKHDR_SZ) - { - uint16_t *ext = (uint16_t *)&pdata[20]; - - if (opt_vote > maxvote) { - printf("Warning: Your block reward vote (%hu) exceeds " - "the maxvote reported by the pool (%hu).\n", - opt_vote, maxvote); - } - - if (opt_trust_pool && opt_vote > maxvote) { - printf("Warning: Capping block reward vote to maxvote reported by pool.\n"); - ext[0] = maxvote; - } - else - ext[0] = opt_vote; - } - - // Setze die Blockdaten - hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen); - sha256_cpu_setBlock(pdata, blocklen); - keccak512_cpu_setBlock(pdata, blocklen); - groestl512_cpu_setBlock(pdata, blocklen); - blake512_cpu_setBlock(pdata, blocklen); - - do { - int i; - - ////// Compaction init - thrust::device_ptr devNoncePtr(d_nonceVector[thr_id]); - thrust::device_ptr devNoncePtrEnd((d_nonceVector[thr_id]) + throughput); - uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; - - hefty_cpu_hash(thr_id, throughput, pdata[19]); - //cudaThreadSynchronize(); - sha256_cpu_hash(thr_id, throughput, pdata[19]); - //cudaThreadSynchronize(); - - // Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier. 
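
The compaction passes that follow in this removed block, like their counterparts in the new code above, all use the same pattern: thrust::remove_if runs the check_nonce_for_remove predicate on the device and packs the surviving nonces to the front of the nonce vector, so each successive, more expensive kernel only processes nonces whose intermediate hash still fits the per-stage mask. A self-contained sketch of that stream-compaction pattern with a toy predicate; the threshold and vector size are illustrative only:

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/remove.h>
#include <cstdint>
#include <cstdio>

struct above_threshold
{
	uint32_t limit;
	above_threshold(uint32_t l) : limit(l) {}
	/* device-side test, like check_nonce_for_remove::operator() */
	__device__ bool operator()(const uint32_t x) const { return x > limit; }
};

int main()
{
	thrust::device_vector<uint32_t> nonces(1024);
	thrust::sequence(nonces.begin(), nonces.end());   /* pretend nonces 0..1023 */

	/* drop everything the (toy) predicate rejects; survivors stay packed at the front */
	thrust::device_vector<uint32_t>::iterator new_end =
		thrust::remove_if(nonces.begin(), nonces.end(), above_threshold(100));

	printf("%u nonces survive\n", (unsigned)(new_end - nonces.begin()));  /* prints 101 */
	return 0;
}

Keeping the compaction on the GPU means only the usually short list of survivors is copied back to the host (cpu_nonceVector) for the final fulltest() check.
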
- MyStreamSynchronize(NULL, 1, thr_id); - - ////// Compaction - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target2), d_hash2output[thr_id], 8, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaThreadSynchronize(); - - ////// Compaction - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target3), d_hash3output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaThreadSynchronize(); - - ////// Compaction - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target5), d_hash5output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - //cudaThreadSynchronize(); - - ////// Compaction - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target4), d_hash4output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - // combine - combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); - - // Ergebnisse kopieren - if(actualNumberOfValuesInNonceVectorGPU > 0) - { - cudaMemcpy(cpu_nonceVector, d_nonceVector[thr_id], sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU, cudaMemcpyDeviceToHost); - - for (i=0; i= 6; i--) { + for (mask = 0x80000000; mask; mask >>= 1) { + for (k = 0; k < 4; k++) { + out[(255 - bits) / 32] <<= 1; + if ((hash[k][i] & mask) != 0) + out[(255 - bits) / 32] |= 1; + bits++; + } + } + } } -void heavycoin_hash(unsigned char* output, const unsigned char* input, int len) +// CPU hash function +__host__ +void heavycoin_hash(uchar* output, const uchar* input, int len) { - unsigned char hash1[32]; - unsigned char hash2[32]; - uint32_t hash3[16]; - uint32_t hash4[16]; - uint32_t hash5[16]; - uint32_t *final; - SHA256_CTX ctx; - sph_keccak512_context keccakCtx; - sph_groestl512_context groestlCtx; - sph_blake512_context blakeCtx; - - HEFTY1(input, len, hash1); - - /* HEFTY1 is new, so take an extra security measure to eliminate - * the possiblity of collisions: - * - * Hash(x) = SHA256(x + HEFTY1(x)) - * - * N.B. '+' is concatenation. - */ - SHA256_Init(&ctx); - SHA256_Update(&ctx, input, len); - SHA256_Update(&ctx, hash1, sizeof(hash1)); - SHA256_Final(hash2, &ctx); - - /* Additional security: Do not rely on a single cryptographic hash - * function. Instead, combine the outputs of 4 of the most secure - * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512 - * and BLAKE512. 
- */ - - sph_keccak512_init(&keccakCtx); - sph_keccak512(&keccakCtx, input, len); - sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); - sph_keccak512_close(&keccakCtx, (void *)&hash3); - - sph_groestl512_init(&groestlCtx); - sph_groestl512(&groestlCtx, input, len); - sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); - sph_groestl512_close(&groestlCtx, (void *)&hash4); - - sph_blake512_init(&blakeCtx); - sph_blake512(&blakeCtx, input, len); - sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); - sph_blake512_close(&blakeCtx, (void *)&hash5); - - final = (uint32_t *)output; - combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); + unsigned char hash1[32]; + unsigned char hash2[32]; + uint32_t hash3[16]; + uint32_t hash4[16]; + uint32_t hash5[16]; + uint32_t *final; + SHA256_CTX ctx; + sph_keccak512_context keccakCtx; + sph_groestl512_context groestlCtx; + sph_blake512_context blakeCtx; + + HEFTY1(input, len, hash1); + + /* HEFTY1 is new, so take an extra security measure to eliminate + * the possiblity of collisions: + * + * Hash(x) = SHA256(x + HEFTY1(x)) + * + * N.B. '+' is concatenation. + */ + SHA256_Init(&ctx); + SHA256_Update(&ctx, input, len); + SHA256_Update(&ctx, hash1, sizeof(hash1)); + SHA256_Final(hash2, &ctx); + + /* Additional security: Do not rely on a single cryptographic hash + * function. Instead, combine the outputs of 4 of the most secure + * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512 + * and BLAKE512. + */ + + sph_keccak512_init(&keccakCtx); + sph_keccak512(&keccakCtx, input, len); + sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); + sph_keccak512_close(&keccakCtx, (void *)&hash3); + + sph_groestl512_init(&groestlCtx); + sph_groestl512(&groestlCtx, input, len); + sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); + sph_groestl512_close(&groestlCtx, (void *)&hash4); + + sph_blake512_init(&blakeCtx); + sph_blake512(&blakeCtx, input, len); + sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); + sph_blake512_close(&blakeCtx, (void *)&hash5); + + final = (uint32_t *)output; + combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); } diff --git a/heavy/heavy.h b/heavy/heavy.h new file mode 100644 index 0000000000..50affc408c --- /dev/null +++ b/heavy/heavy.h @@ -0,0 +1,36 @@ +#ifndef _CUDA_HEAVY_H +#define _CUDA_HEAVY_H + +void blake512_cpu_init(int thr_id, uint32_t threads); +void blake512_cpu_setBlock(void *pdata, int len); +void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); +void blake512_cpu_free(int thr_id); + +void groestl512_cpu_init(int thr_id, uint32_t threads); +void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); +void groestl512_cpu_setBlock(void *data, int len); +void groestl512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); +void groestl512_cpu_free(int thr_id); + +void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce); +void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len); +void hefty_cpu_init(int thr_id, uint32_t threads); +void hefty_cpu_free(int thr_id); + +void keccak512_cpu_init(int thr_id, uint32_t threads); +void keccak512_cpu_setBlock(void *data, int len); +void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); +void keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); +void keccak512_cpu_free(int thr_id); + +void sha256_cpu_init(int thr_id, uint32_t threads); +void sha256_cpu_setBlock(void *data, int len); +void 
sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce); +void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); +void sha256_cpu_free(int thr_id); + +void combine_cpu_init(int thr_id, uint32_t threads); +void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash); +void combine_cpu_free(int thr_id); + +#endif diff --git a/install-sh b/install-sh old mode 100644 new mode 100755 index 9c04de25c4..377bb8687f --- a/install-sh +++ b/install-sh @@ -1,527 +1,527 @@ -#!/bin/sh -# install - install a program, script, or datafile - -scriptversion=2011-01-19.21; # UTC - -# This originates from X11R5 (mit/util/scripts/install.sh), which was -# later released in X11R6 (xc/config/util/install.sh) with the -# following copyright and license. -# -# Copyright (C) 1994 X Consortium -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN -# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- -# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# Except as contained in this notice, the name of the X Consortium shall not -# be used in advertising or otherwise to promote the sale, use or other deal- -# ings in this Software without prior written authorization from the X Consor- -# tium. -# -# -# FSF changes to this file are in the public domain. -# -# Calling this script install-sh is preferred over install.sh, to prevent -# `make' implicit rules from creating a file called install from it -# when there is no Makefile. -# -# This script is compatible with the BSD install script, but was written -# from scratch. - -nl=' -' -IFS=" "" $nl" - -# set DOITPROG to echo to test this script - -# Don't use :- since 4.3BSD and earlier shells don't like it. -doit=${DOITPROG-} -if test -z "$doit"; then - doit_exec=exec -else - doit_exec=$doit -fi - -# Put in absolute file names if you don't have them in your path; -# or use environment vars. - -chgrpprog=${CHGRPPROG-chgrp} -chmodprog=${CHMODPROG-chmod} -chownprog=${CHOWNPROG-chown} -cmpprog=${CMPPROG-cmp} -cpprog=${CPPROG-cp} -mkdirprog=${MKDIRPROG-mkdir} -mvprog=${MVPROG-mv} -rmprog=${RMPROG-rm} -stripprog=${STRIPPROG-strip} - -posix_glob='?' -initialize_posix_glob=' - test "$posix_glob" != "?" || { - if (set -f) 2>/dev/null; then - posix_glob= - else - posix_glob=: - fi - } -' - -posix_mkdir= - -# Desired mode of installed file. -mode=0755 - -chgrpcmd= -chmodcmd=$chmodprog -chowncmd= -mvcmd=$mvprog -rmcmd="$rmprog -f" -stripcmd= - -src= -dst= -dir_arg= -dst_arg= - -copy_on_change=false -no_target_directory= - -usage="\ -Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE - or: $0 [OPTION]... SRCFILES... 
DIRECTORY - or: $0 [OPTION]... -t DIRECTORY SRCFILES... - or: $0 [OPTION]... -d DIRECTORIES... - -In the 1st form, copy SRCFILE to DSTFILE. -In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. -In the 4th, create DIRECTORIES. - -Options: - --help display this help and exit. - --version display version info and exit. - - -c (ignored) - -C install only if different (preserve the last data modification time) - -d create directories instead of installing files. - -g GROUP $chgrpprog installed files to GROUP. - -m MODE $chmodprog installed files to MODE. - -o USER $chownprog installed files to USER. - -s $stripprog installed files. - -t DIRECTORY install into DIRECTORY. - -T report an error if DSTFILE is a directory. - -Environment variables override the default commands: - CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG - RMPROG STRIPPROG -" - -while test $# -ne 0; do - case $1 in - -c) ;; - - -C) copy_on_change=true;; - - -d) dir_arg=true;; - - -g) chgrpcmd="$chgrpprog $2" - shift;; - - --help) echo "$usage"; exit $?;; - - -m) mode=$2 - case $mode in - *' '* | *' '* | *' -'* | *'*'* | *'?'* | *'['*) - echo "$0: invalid mode: $mode" >&2 - exit 1;; - esac - shift;; - - -o) chowncmd="$chownprog $2" - shift;; - - -s) stripcmd=$stripprog;; - - -t) dst_arg=$2 - # Protect names problematic for `test' and other utilities. - case $dst_arg in - -* | [=\(\)!]) dst_arg=./$dst_arg;; - esac - shift;; - - -T) no_target_directory=true;; - - --version) echo "$0 $scriptversion"; exit $?;; - - --) shift - break;; - - -*) echo "$0: invalid option: $1" >&2 - exit 1;; - - *) break;; - esac - shift -done - -if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then - # When -d is used, all remaining arguments are directories to create. - # When -t is used, the destination is already specified. - # Otherwise, the last argument is the destination. Remove it from $@. - for arg - do - if test -n "$dst_arg"; then - # $@ is not empty: it contains at least $arg. - set fnord "$@" "$dst_arg" - shift # fnord - fi - shift # arg - dst_arg=$arg - # Protect names problematic for `test' and other utilities. - case $dst_arg in - -* | [=\(\)!]) dst_arg=./$dst_arg;; - esac - done -fi - -if test $# -eq 0; then - if test -z "$dir_arg"; then - echo "$0: no input file specified." >&2 - exit 1 - fi - # It's OK to call `install-sh -d' without argument. - # This can happen when creating conditional directories. - exit 0 -fi - -if test -z "$dir_arg"; then - do_exit='(exit $ret); exit $ret' - trap "ret=129; $do_exit" 1 - trap "ret=130; $do_exit" 2 - trap "ret=141; $do_exit" 13 - trap "ret=143; $do_exit" 15 - - # Set umask so as not to create temps with too-generous modes. - # However, 'strip' requires both read and write access to temps. - case $mode in - # Optimize common cases. - *644) cp_umask=133;; - *755) cp_umask=22;; - - *[0-7]) - if test -z "$stripcmd"; then - u_plus_rw= - else - u_plus_rw='% 200' - fi - cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; - *) - if test -z "$stripcmd"; then - u_plus_rw= - else - u_plus_rw=,u+rw - fi - cp_umask=$mode$u_plus_rw;; - esac -fi - -for src -do - # Protect names problematic for `test' and other utilities. - case $src in - -* | [=\(\)!]) src=./$src;; - esac - - if test -n "$dir_arg"; then - dst=$src - dstdir=$dst - test -d "$dstdir" - dstdir_status=$? - else - - # Waiting for this to be detected by the "$cpprog $src $dsttmp" command - # might cause directories to be created, which would be especially bad - # if $src (and thus $dsttmp) contains '*'. - if test ! -f "$src" && test ! 
-d "$src"; then - echo "$0: $src does not exist." >&2 - exit 1 - fi - - if test -z "$dst_arg"; then - echo "$0: no destination specified." >&2 - exit 1 - fi - dst=$dst_arg - - # If destination is a directory, append the input filename; won't work - # if double slashes aren't ignored. - if test -d "$dst"; then - if test -n "$no_target_directory"; then - echo "$0: $dst_arg: Is a directory" >&2 - exit 1 - fi - dstdir=$dst - dst=$dstdir/`basename "$src"` - dstdir_status=0 - else - # Prefer dirname, but fall back on a substitute if dirname fails. - dstdir=` - (dirname "$dst") 2>/dev/null || - expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$dst" : 'X\(//\)[^/]' \| \ - X"$dst" : 'X\(//\)$' \| \ - X"$dst" : 'X\(/\)' \| . 2>/dev/null || - echo X"$dst" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q' - ` - - test -d "$dstdir" - dstdir_status=$? - fi - fi - - obsolete_mkdir_used=false - - if test $dstdir_status != 0; then - case $posix_mkdir in - '') - # Create intermediate dirs using mode 755 as modified by the umask. - # This is like FreeBSD 'install' as of 1997-10-28. - umask=`umask` - case $stripcmd.$umask in - # Optimize common cases. - *[2367][2367]) mkdir_umask=$umask;; - .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; - - *[0-7]) - mkdir_umask=`expr $umask + 22 \ - - $umask % 100 % 40 + $umask % 20 \ - - $umask % 10 % 4 + $umask % 2 - `;; - *) mkdir_umask=$umask,go-w;; - esac - - # With -d, create the new directory with the user-specified mode. - # Otherwise, rely on $mkdir_umask. - if test -n "$dir_arg"; then - mkdir_mode=-m$mode - else - mkdir_mode= - fi - - posix_mkdir=false - case $umask in - *[123567][0-7][0-7]) - # POSIX mkdir -p sets u+wx bits regardless of umask, which - # is incompatible with FreeBSD 'install' when (umask & 300) != 0. - ;; - *) - tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ - trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 - - if (umask $mkdir_umask && - exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 - then - if test -z "$dir_arg" || { - # Check for POSIX incompatibilities with -m. - # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or - # other-writeable bit of parent directory when it shouldn't. - # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. - ls_ld_tmpdir=`ls -ld "$tmpdir"` - case $ls_ld_tmpdir in - d????-?r-*) different_mode=700;; - d????-?--*) different_mode=755;; - *) false;; - esac && - $mkdirprog -m$different_mode -p -- "$tmpdir" && { - ls_ld_tmpdir_1=`ls -ld "$tmpdir"` - test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" - } - } - then posix_mkdir=: - fi - rmdir "$tmpdir/d" "$tmpdir" - else - # Remove any dirs left behind by ancient mkdir implementations. - rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null - fi - trap '' 0;; - esac;; - esac - - if - $posix_mkdir && ( - umask $mkdir_umask && - $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" - ) - then : - else - - # The umask is ridiculous, or mkdir does not conform to POSIX, - # or it failed possibly due to a race condition. Create the - # directory the slow way, step by step, checking for races as we go. 
- - case $dstdir in - /*) prefix='/';; - [-=\(\)!]*) prefix='./';; - *) prefix='';; - esac - - eval "$initialize_posix_glob" - - oIFS=$IFS - IFS=/ - $posix_glob set -f - set fnord $dstdir - shift - $posix_glob set +f - IFS=$oIFS - - prefixes= - - for d - do - test X"$d" = X && continue - - prefix=$prefix$d - if test -d "$prefix"; then - prefixes= - else - if $posix_mkdir; then - (umask=$mkdir_umask && - $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break - # Don't fail if two instances are running concurrently. - test -d "$prefix" || exit 1 - else - case $prefix in - *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; - *) qprefix=$prefix;; - esac - prefixes="$prefixes '$qprefix'" - fi - fi - prefix=$prefix/ - done - - if test -n "$prefixes"; then - # Don't fail if two instances are running concurrently. - (umask $mkdir_umask && - eval "\$doit_exec \$mkdirprog $prefixes") || - test -d "$dstdir" || exit 1 - obsolete_mkdir_used=true - fi - fi - fi - - if test -n "$dir_arg"; then - { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && - { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && - { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || - test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 - else - - # Make a couple of temp file names in the proper directory. - dsttmp=$dstdir/_inst.$$_ - rmtmp=$dstdir/_rm.$$_ - - # Trap to clean up those temp files at exit. - trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 - - # Copy the file name to the temp name. - (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && - - # and set any options; do chmod last to preserve setuid bits. - # - # If any of these fail, we abort the whole thing. If we want to - # ignore errors from any of these, just make sure not to ignore - # errors from the above "$doit $cpprog $src $dsttmp" command. - # - { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && - { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && - { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && - { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && - - # If -C, don't bother to copy if it wouldn't change the file. - if $copy_on_change && - old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && - new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && - - eval "$initialize_posix_glob" && - $posix_glob set -f && - set X $old && old=:$2:$4:$5:$6 && - set X $new && new=:$2:$4:$5:$6 && - $posix_glob set +f && - - test "$old" = "$new" && - $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 - then - rm -f "$dsttmp" - else - # Rename the file to the real destination. - $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || - - # The rename failed, perhaps because mv can't rename something else - # to itself, or perhaps because mv is so ancient that it does not - # support -f. - { - # Now remove or move aside any old file at destination location. - # We try this two ways since rm can't unlink itself on some - # systems and the destination file might be busy for other - # reasons. In this case, the final cleanup might fail but the new - # file should still install successfully. - { - test ! -f "$dst" || - $doit $rmcmd -f "$dst" 2>/dev/null || - { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && - { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } - } || - { echo "$0: cannot unlink or rename $dst" >&2 - (exit 1); exit 1 - } - } && - - # Now rename the file to the real destination. 
- $doit $mvcmd "$dsttmp" "$dst" - } - fi || exit 1 - - trap '' 0 - fi -done - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-time-zone: "UTC" -# time-stamp-end: "; # UTC" -# End: +#!/bin/sh +# install - install a program, script, or datafile + +scriptversion=2011-11-20.07; # UTC + +# This originates from X11R5 (mit/util/scripts/install.sh), which was +# later released in X11R6 (xc/config/util/install.sh) with the +# following copyright and license. +# +# Copyright (C) 1994 X Consortium +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- +# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# Except as contained in this notice, the name of the X Consortium shall not +# be used in advertising or otherwise to promote the sale, use or other deal- +# ings in this Software without prior written authorization from the X Consor- +# tium. +# +# +# FSF changes to this file are in the public domain. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# 'make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. + +nl=' +' +IFS=" "" $nl" + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit=${DOITPROG-} +if test -z "$doit"; then + doit_exec=exec +else + doit_exec=$doit +fi + +# Put in absolute file names if you don't have them in your path; +# or use environment vars. + +chgrpprog=${CHGRPPROG-chgrp} +chmodprog=${CHMODPROG-chmod} +chownprog=${CHOWNPROG-chown} +cmpprog=${CMPPROG-cmp} +cpprog=${CPPROG-cp} +mkdirprog=${MKDIRPROG-mkdir} +mvprog=${MVPROG-mv} +rmprog=${RMPROG-rm} +stripprog=${STRIPPROG-strip} + +posix_glob='?' +initialize_posix_glob=' + test "$posix_glob" != "?" || { + if (set -f) 2>/dev/null; then + posix_glob= + else + posix_glob=: + fi + } +' + +posix_mkdir= + +# Desired mode of installed file. +mode=0755 + +chgrpcmd= +chmodcmd=$chmodprog +chowncmd= +mvcmd=$mvprog +rmcmd="$rmprog -f" +stripcmd= + +src= +dst= +dir_arg= +dst_arg= + +copy_on_change=false +no_target_directory= + +usage="\ +Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE + or: $0 [OPTION]... SRCFILES... DIRECTORY + or: $0 [OPTION]... -t DIRECTORY SRCFILES... + or: $0 [OPTION]... -d DIRECTORIES... + +In the 1st form, copy SRCFILE to DSTFILE. +In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. +In the 4th, create DIRECTORIES. 
+ +Options: + --help display this help and exit. + --version display version info and exit. + + -c (ignored) + -C install only if different (preserve the last data modification time) + -d create directories instead of installing files. + -g GROUP $chgrpprog installed files to GROUP. + -m MODE $chmodprog installed files to MODE. + -o USER $chownprog installed files to USER. + -s $stripprog installed files. + -t DIRECTORY install into DIRECTORY. + -T report an error if DSTFILE is a directory. + +Environment variables override the default commands: + CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG + RMPROG STRIPPROG +" + +while test $# -ne 0; do + case $1 in + -c) ;; + + -C) copy_on_change=true;; + + -d) dir_arg=true;; + + -g) chgrpcmd="$chgrpprog $2" + shift;; + + --help) echo "$usage"; exit $?;; + + -m) mode=$2 + case $mode in + *' '* | *' '* | *' +'* | *'*'* | *'?'* | *'['*) + echo "$0: invalid mode: $mode" >&2 + exit 1;; + esac + shift;; + + -o) chowncmd="$chownprog $2" + shift;; + + -s) stripcmd=$stripprog;; + + -t) dst_arg=$2 + # Protect names problematic for 'test' and other utilities. + case $dst_arg in + -* | [=\(\)!]) dst_arg=./$dst_arg;; + esac + shift;; + + -T) no_target_directory=true;; + + --version) echo "$0 $scriptversion"; exit $?;; + + --) shift + break;; + + -*) echo "$0: invalid option: $1" >&2 + exit 1;; + + *) break;; + esac + shift +done + +if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then + # When -d is used, all remaining arguments are directories to create. + # When -t is used, the destination is already specified. + # Otherwise, the last argument is the destination. Remove it from $@. + for arg + do + if test -n "$dst_arg"; then + # $@ is not empty: it contains at least $arg. + set fnord "$@" "$dst_arg" + shift # fnord + fi + shift # arg + dst_arg=$arg + # Protect names problematic for 'test' and other utilities. + case $dst_arg in + -* | [=\(\)!]) dst_arg=./$dst_arg;; + esac + done +fi + +if test $# -eq 0; then + if test -z "$dir_arg"; then + echo "$0: no input file specified." >&2 + exit 1 + fi + # It's OK to call 'install-sh -d' without argument. + # This can happen when creating conditional directories. + exit 0 +fi + +if test -z "$dir_arg"; then + do_exit='(exit $ret); exit $ret' + trap "ret=129; $do_exit" 1 + trap "ret=130; $do_exit" 2 + trap "ret=141; $do_exit" 13 + trap "ret=143; $do_exit" 15 + + # Set umask so as not to create temps with too-generous modes. + # However, 'strip' requires both read and write access to temps. + case $mode in + # Optimize common cases. + *644) cp_umask=133;; + *755) cp_umask=22;; + + *[0-7]) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw='% 200' + fi + cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; + *) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw=,u+rw + fi + cp_umask=$mode$u_plus_rw;; + esac +fi + +for src +do + # Protect names problematic for 'test' and other utilities. + case $src in + -* | [=\(\)!]) src=./$src;; + esac + + if test -n "$dir_arg"; then + dst=$src + dstdir=$dst + test -d "$dstdir" + dstdir_status=$? + else + + # Waiting for this to be detected by the "$cpprog $src $dsttmp" command + # might cause directories to be created, which would be especially bad + # if $src (and thus $dsttmp) contains '*'. + if test ! -f "$src" && test ! -d "$src"; then + echo "$0: $src does not exist." >&2 + exit 1 + fi + + if test -z "$dst_arg"; then + echo "$0: no destination specified." 
>&2 + exit 1 + fi + dst=$dst_arg + + # If destination is a directory, append the input filename; won't work + # if double slashes aren't ignored. + if test -d "$dst"; then + if test -n "$no_target_directory"; then + echo "$0: $dst_arg: Is a directory" >&2 + exit 1 + fi + dstdir=$dst + dst=$dstdir/`basename "$src"` + dstdir_status=0 + else + # Prefer dirname, but fall back on a substitute if dirname fails. + dstdir=` + (dirname "$dst") 2>/dev/null || + expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$dst" : 'X\(//\)[^/]' \| \ + X"$dst" : 'X\(//\)$' \| \ + X"$dst" : 'X\(/\)' \| . 2>/dev/null || + echo X"$dst" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q' + ` + + test -d "$dstdir" + dstdir_status=$? + fi + fi + + obsolete_mkdir_used=false + + if test $dstdir_status != 0; then + case $posix_mkdir in + '') + # Create intermediate dirs using mode 755 as modified by the umask. + # This is like FreeBSD 'install' as of 1997-10-28. + umask=`umask` + case $stripcmd.$umask in + # Optimize common cases. + *[2367][2367]) mkdir_umask=$umask;; + .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; + + *[0-7]) + mkdir_umask=`expr $umask + 22 \ + - $umask % 100 % 40 + $umask % 20 \ + - $umask % 10 % 4 + $umask % 2 + `;; + *) mkdir_umask=$umask,go-w;; + esac + + # With -d, create the new directory with the user-specified mode. + # Otherwise, rely on $mkdir_umask. + if test -n "$dir_arg"; then + mkdir_mode=-m$mode + else + mkdir_mode= + fi + + posix_mkdir=false + case $umask in + *[123567][0-7][0-7]) + # POSIX mkdir -p sets u+wx bits regardless of umask, which + # is incompatible with FreeBSD 'install' when (umask & 300) != 0. + ;; + *) + tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ + trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 + + if (umask $mkdir_umask && + exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 + then + if test -z "$dir_arg" || { + # Check for POSIX incompatibilities with -m. + # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or + # other-writable bit of parent directory when it shouldn't. + # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. + ls_ld_tmpdir=`ls -ld "$tmpdir"` + case $ls_ld_tmpdir in + d????-?r-*) different_mode=700;; + d????-?--*) different_mode=755;; + *) false;; + esac && + $mkdirprog -m$different_mode -p -- "$tmpdir" && { + ls_ld_tmpdir_1=`ls -ld "$tmpdir"` + test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" + } + } + then posix_mkdir=: + fi + rmdir "$tmpdir/d" "$tmpdir" + else + # Remove any dirs left behind by ancient mkdir implementations. + rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null + fi + trap '' 0;; + esac;; + esac + + if + $posix_mkdir && ( + umask $mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" + ) + then : + else + + # The umask is ridiculous, or mkdir does not conform to POSIX, + # or it failed possibly due to a race condition. Create the + # directory the slow way, step by step, checking for races as we go. 
+ + case $dstdir in + /*) prefix='/';; + [-=\(\)!]*) prefix='./';; + *) prefix='';; + esac + + eval "$initialize_posix_glob" + + oIFS=$IFS + IFS=/ + $posix_glob set -f + set fnord $dstdir + shift + $posix_glob set +f + IFS=$oIFS + + prefixes= + + for d + do + test X"$d" = X && continue + + prefix=$prefix$d + if test -d "$prefix"; then + prefixes= + else + if $posix_mkdir; then + (umask=$mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break + # Don't fail if two instances are running concurrently. + test -d "$prefix" || exit 1 + else + case $prefix in + *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; + *) qprefix=$prefix;; + esac + prefixes="$prefixes '$qprefix'" + fi + fi + prefix=$prefix/ + done + + if test -n "$prefixes"; then + # Don't fail if two instances are running concurrently. + (umask $mkdir_umask && + eval "\$doit_exec \$mkdirprog $prefixes") || + test -d "$dstdir" || exit 1 + obsolete_mkdir_used=true + fi + fi + fi + + if test -n "$dir_arg"; then + { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && + { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || + test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 + else + + # Make a couple of temp file names in the proper directory. + dsttmp=$dstdir/_inst.$$_ + rmtmp=$dstdir/_rm.$$_ + + # Trap to clean up those temp files at exit. + trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 + + # Copy the file name to the temp name. + (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && + + # and set any options; do chmod last to preserve setuid bits. + # + # If any of these fail, we abort the whole thing. If we want to + # ignore errors from any of these, just make sure not to ignore + # errors from the above "$doit $cpprog $src $dsttmp" command. + # + { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && + { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && + { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && + + # If -C, don't bother to copy if it wouldn't change the file. + if $copy_on_change && + old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && + new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && + + eval "$initialize_posix_glob" && + $posix_glob set -f && + set X $old && old=:$2:$4:$5:$6 && + set X $new && new=:$2:$4:$5:$6 && + $posix_glob set +f && + + test "$old" = "$new" && + $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 + then + rm -f "$dsttmp" + else + # Rename the file to the real destination. + $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || + + # The rename failed, perhaps because mv can't rename something else + # to itself, or perhaps because mv is so ancient that it does not + # support -f. + { + # Now remove or move aside any old file at destination location. + # We try this two ways since rm can't unlink itself on some + # systems and the destination file might be busy for other + # reasons. In this case, the final cleanup might fail but the new + # file should still install successfully. + { + test ! -f "$dst" || + $doit $rmcmd -f "$dst" 2>/dev/null || + { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && + { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } + } || + { echo "$0: cannot unlink or rename $dst" >&2 + (exit 1); exit 1 + } + } && + + # Now rename the file to the real destination. 
+ $doit $mvcmd "$dsttmp" "$dst" + } + fi || exit 1 + + trap '' 0 + fi +done + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/lbry/cuda_lbry_merged.cu b/lbry/cuda_lbry_merged.cu new file mode 100644 index 0000000000..c140350b50 --- /dev/null +++ b/lbry/cuda_lbry_merged.cu @@ -0,0 +1,1062 @@ +/* + * LBRY merged kernel CUDA implementation. + * For compute 5.2 and beyond gpus + * tpruvot and Provos Alexis - Sep 2016 + * Sponsored by LBRY.IO team + */ + +#include +#include +#include + +#include +#include + +#include + +__constant__ static uint32_t _ALIGN(16) c_midstate112[8]; +__constant__ static uint32_t _ALIGN(16) c_midbuffer112[8]; +__constant__ static uint32_t _ALIGN(16) c_dataEnd112[12]; + +__constant__ static const uint32_t c_H256[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; +__constant__ static uint32_t _ALIGN(8) c_K[64] = { + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + +#ifdef __INTELLISENSE__ +#define atomicExch(p,y) y +#define __byte_perm(x,y,z) x +#endif + +// ------------------------------------------------------------------------------------------------ + +static const uint32_t cpu_H256[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint32_t cpu_K[64] = { + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + +__host__ +static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, uint32_t in, const uint32_t Kshared) +{ + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + uint32_t t1 = h + bsg21 + vxandx + 
Kshared + in; + uint32_t t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ROTR32(inx1, 17) ^ ROTR32(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); + uint32_t ssg20 = ROTR32(inx3, 7) ^ ROTR32(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + uint32_t t1,t2; + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha256_step1_host(a,b,c,d,e,f,g,h,in[0], Kshared[0]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[1], Kshared[1]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[2], Kshared[2]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[3], Kshared[3]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[4], Kshared[4]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[5], Kshared[5]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[6], Kshared[6]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[7], Kshared[7]); + sha256_step1_host(a,b,c,d,e,f,g,h,in[8], Kshared[8]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[9], Kshared[9]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]); + + for (int i=0; i<3; i++) + { + sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +__host__ +void lbry_sha256_setBlock_112_merged(uint32_t 
*pdata) +{ + uint32_t in[16], buf[8], end[16]; + for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]); + for (int i=0; i<8;i++) buf[i] = cpu_H256[i]; + for (int i=0;i<11;i++) end[i] = cuda_swab32(pdata[16+i]); + sha256_round_body_host(in, buf, cpu_K); + + cudaMemcpyToSymbol(c_midstate112, buf, 32, 0, cudaMemcpyHostToDevice); + + uint32_t a = buf[0]; + uint32_t b = buf[1]; + uint32_t c = buf[2]; + uint32_t d = buf[3]; + uint32_t e = buf[4]; + uint32_t f = buf[5]; + uint32_t g = buf[6]; + uint32_t h = buf[7]; + + sha256_step1_host(a,b,c,d,e,f,g,h,end[0], cpu_K[0]); + sha256_step1_host(h,a,b,c,d,e,f,g,end[1], cpu_K[1]); + sha256_step1_host(g,h,a,b,c,d,e,f,end[2], cpu_K[2]); + sha256_step1_host(f,g,h,a,b,c,d,e,end[3], cpu_K[3]); + sha256_step1_host(e,f,g,h,a,b,c,d,end[4], cpu_K[4]); + sha256_step1_host(d,e,f,g,h,a,b,c,end[5], cpu_K[5]); + sha256_step1_host(c,d,e,f,g,h,a,b,end[6], cpu_K[6]); + sha256_step1_host(b,c,d,e,f,g,h,a,end[7], cpu_K[7]); + sha256_step1_host(a,b,c,d,e,f,g,h,end[8], cpu_K[8]); + sha256_step1_host(h,a,b,c,d,e,f,g,end[9], cpu_K[9]); + sha256_step1_host(g,h,a,b,c,d,e,f,end[10],cpu_K[10]); + sha256_step1_host(f, g, h, a, b, c, d, e, 0, cpu_K[11]); + + buf[0] = a; + buf[1] = b; + buf[2] = c; + buf[3] = d; + buf[4] = e; + buf[5] = f; + buf[6] = g; + buf[7] = h; + + cudaMemcpyToSymbol(c_midbuffer112, buf, 32, 0, cudaMemcpyHostToDevice); + + end[12] = 0x80000000; + end[13] = 0; + end[14] = 0; + end[15] = 0x380; + uint32_t x2_0,x2_1; + + x2_0 = ROTR32(end[1], 7) ^ ROTR32(end[1], 18) ^ SPH_T32(end[1] >> 3); //ssg2_0(inx3);//ssg2_0(end[1]); +// x2_1 = ROTR32(end[14], 17) ^ ROTR32(end[14], 19) ^ SPH_T32(end[14] >> 10) + x2_0; //ssg2_1(inx1); ssg2_1(end[14]) + x2_0; + end[0] = end[0] + end[9] + x2_0; + + x2_0 = ROTR32(end[2], 7) ^ ROTR32(end[2], 18) ^ SPH_T32(end[2] >> 3); + x2_1 = (ROTR32(end[15], 17) ^ ROTR32(end[15], 19) ^ SPH_T32(end[15] >> 10)) + x2_0; + end[1] = end[1] + end[10] + x2_1; + + x2_0 = ROTR32(end[3], 7) ^ ROTR32(end[3], 18) ^ SPH_T32(end[3] >> 3);//ssg2_0(end[3]); + x2_1 = (ROTR32(end[0], 17) ^ ROTR32(end[0], 19) ^ SPH_T32(end[0] >> 10)) + x2_0; + end[2]+= x2_1; + + x2_0 = ROTR32(end[4], 7) ^ ROTR32(end[4], 18) ^ SPH_T32(end[4] >> 3);//ssg2_0(end[4]); + x2_1 = (ROTR32(end[1], 17) ^ ROTR32(end[1], 19) ^ SPH_T32(end[1] >> 10)) + x2_0; + end[3] = end[3] + end[12] + x2_1; + + x2_0 = ROTR32(end[5], 7) ^ ROTR32(end[5], 18) ^ SPH_T32(end[5] >> 3);//ssg2_0(end[4]); + end[4] = end[4] + end[13] + x2_0; + + x2_0 = ROTR32(end[6], 7) ^ ROTR32(end[6], 18) ^ SPH_T32(end[6] >> 3);//ssg2_0(end[6]); + x2_1 = (ROTR32(end[3], 17) ^ ROTR32(end[3], 19) ^ SPH_T32(end[3] >> 10)) + x2_0; + end[5] = end[5] + end[14] + x2_1; + + x2_0 = ROTR32(end[7], 7) ^ ROTR32(end[7], 18) ^ SPH_T32(end[7] >> 3);//ssg2_0(end[7]); + end[6] = end[6] + end[15] + x2_0; + + x2_0 = ROTR32(end[8], 7) ^ ROTR32(end[8], 18) ^ SPH_T32(end[8] >> 3);//ssg2_0(end[8]); + x2_1 = (ROTR32(end[5], 17) ^ ROTR32(end[5], 19) ^ SPH_T32(end[5] >> 10)) + x2_0; + end[7] = end[7] + end[0] + x2_1; + + x2_0 = ROTR32(end[9], 7) ^ ROTR32(end[9], 18) ^ SPH_T32(end[9] >> 3);//ssg2_0(end[9]); + end[8] = end[8] + end[1] + x2_0; + + x2_0 = ROTR32(end[10], 7) ^ ROTR32(end[10], 18) ^ SPH_T32(end[10] >> 3);//ssg2_0(end[10]); + x2_1 = (ROTR32(end[7], 17) ^ ROTR32(end[7], 19) ^ SPH_T32(end[7] >> 10)) + x2_0; + end[9] = end[9] + x2_1; + + cudaMemcpyToSymbol(c_dataEnd112, end, sizeof(end), 0, cudaMemcpyHostToDevice); +} + +//END OF HOST FUNCTIONS ------------------------------------------------------------------- + +//SHA256 MACROS 
--------------------------------------------------------------------------- + +#define xor3b(a,b,c) (a ^ b ^ c) + +__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x){ + return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22)); +} + +__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x){ + return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25)); +} + +__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x){ + return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3)); +} + +__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x){ + return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10)); +} + +__device__ __forceinline__ uint64_t vectorizeswap(const uint64_t v){ + uint2 result; + asm volatile ("mov.b64 {%0,%1},%2;" : "=r"(result.y), "=r"(result.x) : "l"(v)); + return devectorize(result); +} + +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define Ch(a, b, c) (((b^c) & a) ^ c) + +__device__ +static void sha2_step(const uint32_t a, const uint32_t b,const uint32_t c, uint32_t &d, + const uint32_t e,const uint32_t f,const uint32_t g, uint32_t &h, + const uint32_t in, const uint32_t Kshared) +{ + const uint32_t t1 = h + bsg2_1(e) + Ch(e, f, g) + Kshared + in; + h = t1 + Maj(a, b, c) + bsg2_0(a); + d+= t1; +} + +__device__ +static void sha256_round_first(uint32_t *in, uint32_t *buf, + const uint32_t *state, const uint32_t* __restrict__ Kshared) +{ + uint32_t a = buf[0] + in[11]; + uint32_t b = buf[1]; + uint32_t c = buf[2]; + uint32_t d = buf[3]; + uint32_t e = buf[4] + in[11]; + uint32_t f = buf[5]; + uint32_t g = buf[6]; + uint32_t h = buf[7]; + + // 10 first steps made on host + //sha2_step(f,g,h,a,b,c,d,e,in[11],Kshared[11]); + + sha2_step(e,f,g,h,a,b,c,d,in[12],Kshared[12]); + sha2_step(d,e,f,g,h,a,b,c,in[13],Kshared[13]); + sha2_step(c,d,e,f,g,h,a,b,in[14],Kshared[14]); + sha2_step(b,c,d,e,f,g,h,a,in[15],Kshared[15]); + + //in is partially precomputed on host + in[2]+= in[11]; + in[4]+= ssg2_1(in[2]); + in[6]+= ssg2_1(in[4]); + in[8]+= ssg2_1(in[6]); + in[9]+= in[ 2]; + + sha2_step(a,b,c,d,e,f,g,h,in[0], Kshared[16]); + sha2_step(h,a,b,c,d,e,f,g,in[1], Kshared[17]); + sha2_step(g,h,a,b,c,d,e,f,in[2], Kshared[18]); + sha2_step(f,g,h,a,b,c,d,e,in[3], Kshared[19]); + sha2_step(e,f,g,h,a,b,c,d,in[4], Kshared[20]); + sha2_step(d,e,f,g,h,a,b,c,in[5], Kshared[21]); + sha2_step(c,d,e,f,g,h,a,b,in[6], Kshared[22]); + sha2_step(b,c,d,e,f,g,h,a,in[7], Kshared[23]); + sha2_step(a,b,c,d,e,f,g,h,in[8], Kshared[24]); + sha2_step(h,a,b,c,d,e,f,g,in[9], Kshared[25]); + + #pragma unroll 6 + for (uint32_t j = 10; j < 16; j++) { + const uint32_t x2_0 = ssg2_0(in[(j + 1) & 15]); + const uint32_t x2_1 = ssg2_1(in[(j + 14) & 15]) + x2_0; + in[j] = in[j] + in[(j + 9) & 15] + x2_1; + } + + sha2_step(g,h,a,b,c,d,e,f,in[10],Kshared[26]); + sha2_step(f,g,h,a,b,c,d,e,in[11],Kshared[27]); + sha2_step(e,f,g,h,a,b,c,d,in[12],Kshared[28]); + sha2_step(d,e,f,g,h,a,b,c,in[13],Kshared[29]); + sha2_step(c,d,e,f,g,h,a,b,in[14],Kshared[30]); + sha2_step(b,c,d,e,f,g,h,a,in[15],Kshared[31]); + + #pragma unroll 16 + for (uint32_t j = 0; j < 16; j++) { + const uint32_t x2_0 = ssg2_0(in[(j + 1) & 15]); + const uint32_t x2_1 = ssg2_1(in[(j + 14) & 15]) + x2_0; + in[j] = in[j] + in[(j + 9) & 15] + x2_1; + } + + sha2_step(a,b,c,d,e,f,g,h,in[0], Kshared[16+16]); + sha2_step(h,a,b,c,d,e,f,g,in[1], Kshared[17+16]); + sha2_step(g,h,a,b,c,d,e,f,in[2], Kshared[18+16]); + sha2_step(f,g,h,a,b,c,d,e,in[3], Kshared[19+16]); + sha2_step(e,f,g,h,a,b,c,d,in[4], Kshared[20+16]); + sha2_step(d,e,f,g,h,a,b,c,in[5], Kshared[21+16]); + 
sha2_step(c,d,e,f,g,h,a,b,in[6], Kshared[22+16]); + sha2_step(b,c,d,e,f,g,h,a,in[7], Kshared[23+16]); + sha2_step(a,b,c,d,e,f,g,h,in[8], Kshared[24+16]); + sha2_step(h,a,b,c,d,e,f,g,in[9], Kshared[25+16]); + sha2_step(g,h,a,b,c,d,e,f,in[10],Kshared[26+16]); + sha2_step(f,g,h,a,b,c,d,e,in[11],Kshared[27+16]); + sha2_step(e,f,g,h,a,b,c,d,in[12],Kshared[28+16]); + sha2_step(d,e,f,g,h,a,b,c,in[13],Kshared[29+16]); + sha2_step(c,d,e,f,g,h,a,b,in[14],Kshared[30+16]); + sha2_step(b,c,d,e,f,g,h,a,in[15],Kshared[31+16]); + + #pragma unroll 16 + for (uint32_t j = 0; j < 16; j++) { + const uint32_t x2_0 = ssg2_0(in[(j + 1) & 15]); + const uint32_t x2_1 = ssg2_1(in[(j + 14) & 15]) + x2_0; + in[j] = in[j] + in[(j + 9) & 15] + x2_1; + } + + sha2_step(a,b,c,d,e,f,g,h,in[0], Kshared[16+16*2]); + sha2_step(h,a,b,c,d,e,f,g,in[1], Kshared[17+16*2]); + sha2_step(g,h,a,b,c,d,e,f,in[2], Kshared[18+16*2]); + sha2_step(f,g,h,a,b,c,d,e,in[3], Kshared[19+16*2]); + sha2_step(e,f,g,h,a,b,c,d,in[4], Kshared[20+16*2]); + sha2_step(d,e,f,g,h,a,b,c,in[5], Kshared[21+16*2]); + sha2_step(c,d,e,f,g,h,a,b,in[6], Kshared[22+16*2]); + sha2_step(b,c,d,e,f,g,h,a,in[7], Kshared[23+16*2]); + sha2_step(a,b,c,d,e,f,g,h,in[8], Kshared[24+16*2]); + sha2_step(h,a,b,c,d,e,f,g,in[9], Kshared[25+16*2]); + sha2_step(g,h,a,b,c,d,e,f,in[10],Kshared[26+16*2]); + sha2_step(f,g,h,a,b,c,d,e,in[11],Kshared[27+16*2]); + sha2_step(e,f,g,h,a,b,c,d,in[12],Kshared[28+16*2]); + sha2_step(d,e,f,g,h,a,b,c,in[13],Kshared[29+16*2]); + sha2_step(c,d,e,f,g,h,a,b,in[14],Kshared[30+16*2]); + sha2_step(b,c,d,e,f,g,h,a,in[15],Kshared[31+16*2]); + + buf[ 0] = state[0] + a; + buf[ 1] = state[1] + b; + buf[ 2] = state[2] + c; + buf[ 3] = state[3] + d; + buf[ 4] = state[4] + e; + buf[ 5] = state[5] + f; + buf[ 6] = state[6] + g; + buf[ 7] = state[7] + h; +} + +__device__ +static void sha256_round_body(uint32_t *in, uint32_t *state,const uint32_t* Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha2_step(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha2_step(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha2_step(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha2_step(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha2_step(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha2_step(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha2_step(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha2_step(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha2_step(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha2_step(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha2_step(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha2_step(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha2_step(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha2_step(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha2_step(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + #pragma unroll 3 + for (int i=0; i<3; i++) + { + #pragma unroll 16 + for (uint32_t j = 0; j < 16; j++) { + const uint32_t x2_0 = ssg2_0(in[(j + 1) & 15]); + const uint32_t x2_1 = ssg2_1(in[(j + 14) & 15]) + x2_0; + in[j] = in[j] + in[(j + 9) & 15] + x2_1; + } + sha2_step(a, b, c, d, e, f, g, h, in[ 0], Kshared[16 + 16 * i]); + sha2_step(h, a, b, c, d, e, f, g, in[ 1], Kshared[17 + 16 * i]); + sha2_step(g, h, a, b, c, d, e, f, in[ 2], Kshared[18 + 16 * i]); + sha2_step(f, g, h, a, b, c, d, e, in[ 3], Kshared[19 + 16 * i]); + sha2_step(e, f, g, h, a, b, c, d, in[ 4], Kshared[20 + 16 * i]); + sha2_step(d, e, f, g, h, a, b, c, 
in[ 5], Kshared[21 + 16 * i]); + sha2_step(c, d, e, f, g, h, a, b, in[ 6], Kshared[22 + 16 * i]); + sha2_step(b, c, d, e, f, g, h, a, in[ 7], Kshared[23 + 16 * i]); + sha2_step(a, b, c, d, e, f, g, h, in[ 8], Kshared[24 + 16 * i]); + sha2_step(h, a, b, c, d, e, f, g, in[ 9], Kshared[25 + 16 * i]); + sha2_step(g, h, a, b, c, d, e, f, in[10], Kshared[26 + 16 * i]); + sha2_step(f, g, h, a, b, c, d, e, in[11], Kshared[27 + 16 * i]); + sha2_step(e, f, g, h, a, b, c, d, in[12], Kshared[28 + 16 * i]); + sha2_step(d, e, f, g, h, a, b, c, in[13], Kshared[29 + 16 * i]); + sha2_step(c, d, e, f, g, h, a, b, in[14], Kshared[30 + 16 * i]); + sha2_step(b, c, d, e, f, g, h, a, in[15], Kshared[31 + 16 * i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +__device__ +static void sha256_round_body_final(uint32_t *in, uint32_t *state, const uint32_t *Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step(a,b,c,d,e,f,g,h,in[ 0], Kshared[0]); + sha2_step(h,a,b,c,d,e,f,g,in[ 1], Kshared[1]); + sha2_step(g,h,a,b,c,d,e,f,in[ 2], Kshared[2]); + sha2_step(f,g,h,a,b,c,d,e,in[ 3], Kshared[3]); + sha2_step(e,f,g,h,a,b,c,d,in[ 4], Kshared[4]); + sha2_step(d,e,f,g,h,a,b,c,in[ 5], Kshared[5]); + sha2_step(c,d,e,f,g,h,a,b,in[ 6], Kshared[6]); + sha2_step(b,c,d,e,f,g,h,a,in[ 7], Kshared[7]); + sha2_step(a,b,c,d,e,f,g,h,in[ 8], Kshared[8]); + sha2_step(h,a,b,c,d,e,f,g,in[ 9], Kshared[9]); + sha2_step(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha2_step(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha2_step(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha2_step(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha2_step(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha2_step(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + #pragma unroll 2 + for (int i=0; i<2; i++) + { + #pragma unroll 16 + for (uint32_t j = 0; j < 16; j++) { + const uint32_t x2_0 = ssg2_0(in[(j + 1) & 15]); + const uint32_t x2_1 = ssg2_1(in[(j + 14) & 15]) + x2_0; + in[j] = in[j] + in[(j + 9) & 15] + x2_1; + } + sha2_step(a, b, c, d, e, f, g, h, in[ 0], Kshared[16 + 16 * i]); + sha2_step(h, a, b, c, d, e, f, g, in[ 1], Kshared[17 + 16 * i]); + sha2_step(g, h, a, b, c, d, e, f, in[ 2], Kshared[18 + 16 * i]); + sha2_step(f, g, h, a, b, c, d, e, in[ 3], Kshared[19 + 16 * i]); + sha2_step(e, f, g, h, a, b, c, d, in[ 4], Kshared[20 + 16 * i]); + sha2_step(d, e, f, g, h, a, b, c, in[ 5], Kshared[21 + 16 * i]); + sha2_step(c, d, e, f, g, h, a, b, in[ 6], Kshared[22 + 16 * i]); + sha2_step(b, c, d, e, f, g, h, a, in[ 7], Kshared[23 + 16 * i]); + sha2_step(a, b, c, d, e, f, g, h, in[ 8], Kshared[24 + 16 * i]); + sha2_step(h, a, b, c, d, e, f, g, in[ 9], Kshared[25 + 16 * i]); + sha2_step(g, h, a, b, c, d, e, f, in[10], Kshared[26 + 16 * i]); + sha2_step(f, g, h, a, b, c, d, e, in[11], Kshared[27 + 16 * i]); + sha2_step(e, f, g, h, a, b, c, d, in[12], Kshared[28 + 16 * i]); + sha2_step(d, e, f, g, h, a, b, c, in[13], Kshared[29 + 16 * i]); + sha2_step(c, d, e, f, g, h, a, b, in[14], Kshared[30 + 16 * i]); + sha2_step(b, c, d, e, f, g, h, a, in[15], Kshared[31 + 16 * i]); + } + #pragma unroll 16 + for (uint32_t j = 0; j < 16; j++) { + const uint32_t x2_0 = ssg2_0(in[(j + 1) & 15]); + const uint32_t x2_1 = ssg2_1(in[(j + 14) & 15]) + x2_0; + in[j] = in[j] + in[(j + 9) & 15] + x2_1; + } + sha2_step(a, b, c, d, e, f, g, h, in[ 0], 
Kshared[16 + 16 * 2]); + sha2_step(h, a, b, c, d, e, f, g, in[ 1], Kshared[17 + 16 * 2]); + sha2_step(g, h, a, b, c, d, e, f, in[ 2], Kshared[18 + 16 * 2]); + sha2_step(f, g, h, a, b, c, d, e, in[ 3], Kshared[19 + 16 * 2]); + sha2_step(e, f, g, h, a, b, c, d, in[ 4], Kshared[20 + 16 * 2]); + sha2_step(d, e, f, g, h, a, b, c, in[ 5], Kshared[21 + 16 * 2]); + sha2_step(c, d, e, f, g, h, a, b, in[ 6], Kshared[22 + 16 * 2]); + sha2_step(b, c, d, e, f, g, h, a, in[ 7], Kshared[23 + 16 * 2]); + sha2_step(a, b, c, d, e, f, g, h, in[ 8], Kshared[24 + 16 * 2]); + sha2_step(h, a, b, c, d, e, f, g, in[ 9], Kshared[25 + 16 * 2]); + sha2_step(g, h, a, b, c, d, e, f, in[10], Kshared[26 + 16 * 2]); + sha2_step(f, g, h, a, b, c, d, e, in[11], Kshared[27 + 16 * 2]); + sha2_step(e, f, g, h, a, b, c, d, in[12], Kshared[28 + 16 * 2]); + sha2_step(d, e, f, g, h, a, b, c, in[13], Kshared[29 + 16 * 2]); + + state[6] += g; + state[7] += h; +} + +//END OF SHA256 MACROS -------------------------------------------------------------------- + +//SHA512 MACROS --------------------------------------------------------------------------- +static __constant__ _ALIGN(8) uint64_t K_512[80] = { + 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, + 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, + 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, + 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694, + 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, + 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, + 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, + 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70, + 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, + 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B, + 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, + 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, + 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, + 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, + 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC, + 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, + 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, + 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B, + 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, + 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 +}; + +#undef xor3 +#define xor3(a,b,c) (a^b^c) + +#define bsg5_0(x) xor3(ROTR64(x,28),ROTR64(x,34),ROTR64(x,39)) +#define bsg5_1(x) xor3(ROTR64(x,14),ROTR64(x,18),ROTR64(x,41)) +#define ssg5_0(x) xor3(ROTR64(x, 1),ROTR64(x, 8),x>>7) +#define ssg5_1(x) xor3(ROTR64(x,19),ROTR64(x,61),x>>6) + +#define andor64(a,b,c) ((a & (b | c)) | (b & c)) +#define xandx64(e,f,g) (g ^ (e & (g ^ f))) + +__device__ __forceinline__ +uint64_t cuda_swab64ll(const uint32_t x, const uint32_t y) +{ + uint64_t r; + asm("prmt.b32 %1, %1, 0, 0x0123; // swab64ll\n\t" + "prmt.b32 %2, %2, 0, 0x0123;\n\t" + "mov.b64 %0, {%1,%2};\n\t" + : "=l"(r): "r"(x), "r"(y) ); + return r; +} + +// RIPEMD 
MACROS----------------------------------------------------------------------------- +static __constant__ const uint32_t c_IV[5] = { 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u }; +static __constant__ const uint32_t c_K1[5] = { 0, 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xA953FD4E }; +static __constant__ const uint32_t c_K2[5] = { 0x50A28BE6, 0x5C4DD124, 0x6D703EF3, 0x7A6D76E9, 0 }; + +__device__ __forceinline__ +static uint32_t ROTATE(const uint32_t x,const uint32_t r) { + if(r==8) + return __byte_perm(x, 0, 0x2103); + else + return ROTL32(x,r); +} + +/* + * Round functions for RIPEMD-160. + */ +//#define F1(x, y, z) xor3x(x, y, z) +__device__ __forceinline__ +uint32_t F1(const uint32_t a, const uint32_t b, const uint32_t c) { + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm volatile ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); + #else + result = a^b^c; + #endif + return result; +} +//#define F2(x, y, z) ((x & (y ^ z)) ^ z) +__device__ __forceinline__ +uint32_t F2(const uint32_t a, const uint32_t b, const uint32_t c) { + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm volatile ("lop3.b32 %0, %1, %2, %3, 0xCA;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0xCA=((F0∧(CC⊻AA))⊻AA) + #else + result = ((a & (b ^ c)) ^ c); + #endif + return result; +} +//#define F3(x, y, z) ((x | ~y) ^ z) +__device__ __forceinline__ +uint32_t F3(const uint32_t x, const uint32_t y, const uint32_t z) { + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm volatile ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r"(result) : "r"(x), "r"(y),"r"(z)); //0x59=((F0∨(¬CC))⊻AA) + #else + result = ((x | ~y) ^ z); + #endif + return result; +} +//#define F4(x, y, z) (y ^ ((x ^ y) & z)) +__device__ __forceinline__ +uint32_t F4(const uint32_t x, const uint32_t y, const uint32_t z) { + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm volatile ("lop3.b32 %0, %1, %2, %3, 0xE4;" : "=r"(result) : "r"(x), "r"(y),"r"(z)); //0xE4=(CC⊻((F0⊻CC)∧AA)) + #else + result = (y ^ ((x ^ y) & z)); + #endif + return result; +} +//#define F5(x, y, z) (x ^ (y | ~z)) +__device__ __forceinline__ +uint32_t F5(const uint32_t x, const uint32_t y, const uint32_t z) { + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm volatile ("lop3.b32 %0, %1, %2, %3, 0x2D;" : "=r"(result) : "r"(x), "r"(y),"r"(z)); //0x2D=(F0⊻(CC∨(¬AA))) + #else + result = (x ^ (y | ~z)); + #endif + return result; +} + +/* + * Round constants for RIPEMD-160. 
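+ *
+ * The RR macro below performs one RIPEMD-160 step on a five-word line:
+ *   a = rotl32(a + f(b,c,d) + msg + K, s) + e;  c = rotl32(c, 10);
+ * ROUND1 applies it with the c_K1 constants (left line), ROUND2 with c_K2 (right line);
+ * RIPEMD160_ROUND_BODY runs both 80-step lines and folds the results back into h[].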
+ */ +#define RR(a, b, c, d, e, f, s, r, k) { \ + a = e + ROTATE((a + k + r + f(b, c, d)), s); \ + c = ROTL32(c, 10); \ +} + +#define ROUND1(a, b, c, d, e, f, s, r, k) \ + RR(a[0], b[0], c[0], d[0], e[0], f, s, r, c_K1[k]) + +#define ROUND2(a, b, c, d, e, f, s, r, k) \ + RR(a[1], b[1], c[1], d[1], e[1], f, s, r, c_K2[k]) + +#define RIPEMD160_ROUND_BODY(in, h) { \ + uint32_t A[2], B[2], C[2], D[2], E[2]; \ + uint32_t tmp; \ +\ + A[0] = A[1] = h[0]; \ + B[0] = B[1] = h[1]; \ + C[0] = C[1] = h[2]; \ + D[0] = D[1] = h[3]; \ + E[0] = E[1] = h[4]; \ +\ + ROUND1(A, B, C, D, E, F1, 11, in[ 0], 0); \ + ROUND1(E, A, B, C, D, F1, 14, in[ 1], 0); \ + ROUND1(D, E, A, B, C, F1, 15, in[ 2], 0); \ + ROUND1(C, D, E, A, B, F1, 12, in[ 3], 0); \ + ROUND1(B, C, D, E, A, F1, 5, in[ 4], 0); \ + ROUND1(A, B, C, D, E, F1, 8, in[ 5], 0); \ + ROUND1(E, A, B, C, D, F1, 7, in[ 6], 0); \ + ROUND1(D, E, A, B, C, F1, 9, in[ 7], 0); \ + ROUND1(C, D, E, A, B, F1, 11, in[ 8], 0); \ + ROUND1(B, C, D, E, A, F1, 13, in[ 9], 0); \ + ROUND1(A, B, C, D, E, F1, 14, in[10], 0); \ + ROUND1(E, A, B, C, D, F1, 15, in[11], 0); \ + ROUND1(D, E, A, B, C, F1, 6, in[12], 0); \ + ROUND1(C, D, E, A, B, F1, 7, in[13], 0); \ + ROUND1(B, C, D, E, A, F1, 9, in[14], 0); \ + ROUND1(A, B, C, D, E, F1, 8, in[15], 0); \ +\ + ROUND1(E, A, B, C, D, F2, 7, in[ 7], 1); \ + ROUND1(D, E, A, B, C, F2, 6, in[ 4], 1); \ + ROUND1(C, D, E, A, B, F2, 8, in[13], 1); \ + ROUND1(B, C, D, E, A, F2, 13, in[ 1], 1); \ + ROUND1(A, B, C, D, E, F2, 11, in[10], 1); \ + ROUND1(E, A, B, C, D, F2, 9, in[ 6], 1); \ + ROUND1(D, E, A, B, C, F2, 7, in[15], 1); \ + ROUND1(C, D, E, A, B, F2, 15, in[ 3], 1); \ + ROUND1(B, C, D, E, A, F2, 7, in[12], 1); \ + ROUND1(A, B, C, D, E, F2, 12, in[ 0], 1); \ + ROUND1(E, A, B, C, D, F2, 15, in[ 9], 1); \ + ROUND1(D, E, A, B, C, F2, 9, in[ 5], 1); \ + ROUND1(C, D, E, A, B, F2, 11, in[ 2], 1); \ + ROUND1(B, C, D, E, A, F2, 7, in[14], 1); \ + ROUND1(A, B, C, D, E, F2, 13, in[11], 1); \ + ROUND1(E, A, B, C, D, F2, 12, in[ 8], 1); \ +\ + ROUND1(D, E, A, B, C, F3, 11, in[ 3], 2); \ + ROUND1(C, D, E, A, B, F3, 13, in[10], 2); \ + ROUND1(B, C, D, E, A, F3, 6, in[14], 2); \ + ROUND1(A, B, C, D, E, F3, 7, in[ 4], 2); \ + ROUND1(E, A, B, C, D, F3, 14, in[ 9], 2); \ + ROUND1(D, E, A, B, C, F3, 9, in[15], 2); \ + ROUND1(C, D, E, A, B, F3, 13, in[ 8], 2); \ + ROUND1(B, C, D, E, A, F3, 15, in[ 1], 2); \ + ROUND1(A, B, C, D, E, F3, 14, in[ 2], 2); \ + ROUND1(E, A, B, C, D, F3, 8, in[ 7], 2); \ + ROUND1(D, E, A, B, C, F3, 13, in[ 0], 2); \ + ROUND1(C, D, E, A, B, F3, 6, in[ 6], 2); \ + ROUND1(B, C, D, E, A, F3, 5, in[13], 2); \ + ROUND1(A, B, C, D, E, F3, 12, in[11], 2); \ + ROUND1(E, A, B, C, D, F3, 7, in[ 5], 2); \ + ROUND1(D, E, A, B, C, F3, 5, in[12], 2); \ +\ + ROUND1(C, D, E, A, B, F4, 11, in[ 1], 3); \ + ROUND1(B, C, D, E, A, F4, 12, in[ 9], 3); \ + ROUND1(A, B, C, D, E, F4, 14, in[11], 3); \ + ROUND1(E, A, B, C, D, F4, 15, in[10], 3); \ + ROUND1(D, E, A, B, C, F4, 14, in[ 0], 3); \ + ROUND1(C, D, E, A, B, F4, 15, in[ 8], 3); \ + ROUND1(B, C, D, E, A, F4, 9, in[12], 3); \ + ROUND1(A, B, C, D, E, F4, 8, in[ 4], 3); \ + ROUND1(E, A, B, C, D, F4, 9, in[13], 3); \ + ROUND1(D, E, A, B, C, F4, 14, in[ 3], 3); \ + ROUND1(C, D, E, A, B, F4, 5, in[ 7], 3); \ + ROUND1(B, C, D, E, A, F4, 6, in[15], 3); \ + ROUND1(A, B, C, D, E, F4, 8, in[14], 3); \ + ROUND1(E, A, B, C, D, F4, 6, in[ 5], 3); \ + ROUND1(D, E, A, B, C, F4, 5, in[ 6], 3); \ + ROUND1(C, D, E, A, B, F4, 12, in[ 2], 3); \ +\ + ROUND1(B, C, D, E, A, F5, 9, in[ 4], 4); \ + ROUND1(A, B, C, D, E, F5, 15, 
in[ 0], 4); \ + ROUND1(E, A, B, C, D, F5, 5, in[ 5], 4); \ + ROUND1(D, E, A, B, C, F5, 11, in[ 9], 4); \ + ROUND1(C, D, E, A, B, F5, 6, in[ 7], 4); \ + ROUND1(B, C, D, E, A, F5, 8, in[12], 4); \ + ROUND1(A, B, C, D, E, F5, 13, in[ 2], 4); \ + ROUND1(E, A, B, C, D, F5, 12, in[10], 4); \ + ROUND1(D, E, A, B, C, F5, 5, in[14], 4); \ + ROUND1(C, D, E, A, B, F5, 12, in[ 1], 4); \ + ROUND1(B, C, D, E, A, F5, 13, in[ 3], 4); \ + ROUND1(A, B, C, D, E, F5, 14, in[ 8], 4); \ + ROUND1(E, A, B, C, D, F5, 11, in[11], 4); \ + ROUND1(D, E, A, B, C, F5, 8, in[ 6], 4); \ + ROUND1(C, D, E, A, B, F5, 5, in[15], 4); \ + ROUND1(B, C, D, E, A, F5, 6, in[13], 4); \ +\ + ROUND2(A, B, C, D, E, F5, 8, in[ 5], 0); \ + ROUND2(E, A, B, C, D, F5, 9, in[14], 0); \ + ROUND2(D, E, A, B, C, F5, 9, in[ 7], 0); \ + ROUND2(C, D, E, A, B, F5, 11, in[ 0], 0); \ + ROUND2(B, C, D, E, A, F5, 13, in[ 9], 0); \ + ROUND2(A, B, C, D, E, F5, 15, in[ 2], 0); \ + ROUND2(E, A, B, C, D, F5, 15, in[11], 0); \ + ROUND2(D, E, A, B, C, F5, 5, in[ 4], 0); \ + ROUND2(C, D, E, A, B, F5, 7, in[13], 0); \ + ROUND2(B, C, D, E, A, F5, 7, in[ 6], 0); \ + ROUND2(A, B, C, D, E, F5, 8, in[15], 0); \ + ROUND2(E, A, B, C, D, F5, 11, in[ 8], 0); \ + ROUND2(D, E, A, B, C, F5, 14, in[ 1], 0); \ + ROUND2(C, D, E, A, B, F5, 14, in[10], 0); \ + ROUND2(B, C, D, E, A, F5, 12, in[ 3], 0); \ + ROUND2(A, B, C, D, E, F5, 6, in[12], 0); \ +\ + ROUND2(E, A, B, C, D, F4, 9, in[ 6], 1); \ + ROUND2(D, E, A, B, C, F4, 13, in[11], 1); \ + ROUND2(C, D, E, A, B, F4, 15, in[ 3], 1); \ + ROUND2(B, C, D, E, A, F4, 7, in[ 7], 1); \ + ROUND2(A, B, C, D, E, F4, 12, in[ 0], 1); \ + ROUND2(E, A, B, C, D, F4, 8, in[13], 1); \ + ROUND2(D, E, A, B, C, F4, 9, in[ 5], 1); \ + ROUND2(C, D, E, A, B, F4, 11, in[10], 1); \ + ROUND2(B, C, D, E, A, F4, 7, in[14], 1); \ + ROUND2(A, B, C, D, E, F4, 7, in[15], 1); \ + ROUND2(E, A, B, C, D, F4, 12, in[ 8], 1); \ + ROUND2(D, E, A, B, C, F4, 7, in[12], 1); \ + ROUND2(C, D, E, A, B, F4, 6, in[ 4], 1); \ + ROUND2(B, C, D, E, A, F4, 15, in[ 9], 1); \ + ROUND2(A, B, C, D, E, F4, 13, in[ 1], 1); \ + ROUND2(E, A, B, C, D, F4, 11, in[ 2], 1); \ +\ + ROUND2(D, E, A, B, C, F3, 9, in[15], 2); \ + ROUND2(C, D, E, A, B, F3, 7, in[ 5], 2); \ + ROUND2(B, C, D, E, A, F3, 15, in[ 1], 2); \ + ROUND2(A, B, C, D, E, F3, 11, in[ 3], 2); \ + ROUND2(E, A, B, C, D, F3, 8, in[ 7], 2); \ + ROUND2(D, E, A, B, C, F3, 6, in[14], 2); \ + ROUND2(C, D, E, A, B, F3, 6, in[ 6], 2); \ + ROUND2(B, C, D, E, A, F3, 14, in[ 9], 2); \ + ROUND2(A, B, C, D, E, F3, 12, in[11], 2); \ + ROUND2(E, A, B, C, D, F3, 13, in[ 8], 2); \ + ROUND2(D, E, A, B, C, F3, 5, in[12], 2); \ + ROUND2(C, D, E, A, B, F3, 14, in[ 2], 2); \ + ROUND2(B, C, D, E, A, F3, 13, in[10], 2); \ + ROUND2(A, B, C, D, E, F3, 13, in[ 0], 2); \ + ROUND2(E, A, B, C, D, F3, 7, in[ 4], 2); \ + ROUND2(D, E, A, B, C, F3, 5, in[13], 2); \ +\ + ROUND2(C, D, E, A, B, F2, 15, in[ 8], 3); \ + ROUND2(B, C, D, E, A, F2, 5, in[ 6], 3); \ + ROUND2(A, B, C, D, E, F2, 8, in[ 4], 3); \ + ROUND2(E, A, B, C, D, F2, 11, in[ 1], 3); \ + ROUND2(D, E, A, B, C, F2, 14, in[ 3], 3); \ + ROUND2(C, D, E, A, B, F2, 14, in[11], 3); \ + ROUND2(B, C, D, E, A, F2, 6, in[15], 3); \ + ROUND2(A, B, C, D, E, F2, 14, in[ 0], 3); \ + ROUND2(E, A, B, C, D, F2, 6, in[ 5], 3); \ + ROUND2(D, E, A, B, C, F2, 9, in[12], 3); \ + ROUND2(C, D, E, A, B, F2, 12, in[ 2], 3); \ + ROUND2(B, C, D, E, A, F2, 9, in[13], 3); \ + ROUND2(A, B, C, D, E, F2, 12, in[ 9], 3); \ + ROUND2(E, A, B, C, D, F2, 5, in[ 7], 3); \ + ROUND2(D, E, A, B, C, F2, 15, in[10], 3); \ + ROUND2(C, D, E, A, B, 
F2, 8, in[14], 3); \ +\ + ROUND2(B, C, D, E, A, F1, 8, in[12], 4); \ + ROUND2(A, B, C, D, E, F1, 5, in[15], 4); \ + ROUND2(E, A, B, C, D, F1, 12, in[10], 4); \ + ROUND2(D, E, A, B, C, F1, 9, in[ 4], 4); \ + ROUND2(C, D, E, A, B, F1, 12, in[ 1], 4); \ + ROUND2(B, C, D, E, A, F1, 5, in[ 5], 4); \ + ROUND2(A, B, C, D, E, F1, 14, in[ 8], 4); \ + ROUND2(E, A, B, C, D, F1, 6, in[ 7], 4); \ + ROUND2(D, E, A, B, C, F1, 8, in[ 6], 4); \ + ROUND2(C, D, E, A, B, F1, 13, in[ 2], 4); \ + ROUND2(B, C, D, E, A, F1, 6, in[13], 4); \ + ROUND2(A, B, C, D, E, F1, 5, in[14], 4); \ + ROUND2(E, A, B, C, D, F1, 15, in[ 0], 4); \ + ROUND2(D, E, A, B, C, F1, 13, in[ 3], 4); \ + ROUND2(C, D, E, A, B, F1, 11, in[ 9], 4); \ + ROUND2(B, C, D, E, A, F1, 11, in[11], 4); \ +\ + tmp = h[1] + C[0] + D[1]; \ + h[1] = h[2] + D[0] + E[1]; \ + h[2] = h[3] + E[0] + A[1]; \ + h[3] = h[4] + A[0] + B[1]; \ + h[4] = h[0] + B[0] + C[1]; \ + h[0] = tmp; \ +} +// END OF RIPEMD MACROS---------------------------------------------------------------------- + +__global__ +__launch_bounds__(768,1) /* will force 64 regs max on SM 3+ */ +void gpu_lbry_merged(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces, const uint64_t target64) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + const uint64_t IV512[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 + }; + + if (thread < threads) + { + uint64_t r[8]; + uint64_t W[16]; + uint32_t dat[16]; + uint32_t buf[8], state[8]; + uint32_t h[5]; + + *(uint2x4*)&dat[0] = *(uint2x4*)&c_dataEnd112[0]; + dat[ 8] = c_dataEnd112[ 8]; + dat[ 9] = c_dataEnd112[ 9]; + dat[10] = c_dataEnd112[10]; + dat[11] = startNonce + thread; + dat[12] = 0x80000000; + dat[13] = 0; + dat[14] = 0; + dat[15] = 0x380; + + *(uint2x4*)&state[0] = *(uint2x4*)&c_midstate112[0]; + *(uint2x4*)&buf[0] = *(uint2x4*)&c_midbuffer112[0]; + + sha256_round_first(dat, buf, state, c_K); + + // second sha256 + + #pragma unroll 8 + for(int i=0; i<8; i++){ + dat[i] = buf[i]; + } + dat[8] = 0x80000000; + + #pragma unroll 6 + for(int i=9; i<15; i++) dat[i] = 0; + dat[15] = 0x100; + + #pragma unroll 8 + for(int i=0; i<8; i++) + buf[i] = c_H256[i]; + + sha256_round_body(dat, buf, c_K); + +// SHA512------------------------------------------------------------------------------------- + + #pragma unroll 8 + for(int i=0; i<8; i++) + r[i] = IV512[i]; + + W[0] = vectorizeswap(((uint64_t*)buf)[0]); + W[1] = vectorizeswap(((uint64_t*)buf)[1]); + W[2] = vectorizeswap(((uint64_t*)buf)[2]); + W[3] = vectorizeswap(((uint64_t*)buf)[3]); + W[4] = 0x8000000000000000; // end tag + + #pragma unroll 10 + for (int i = 5; i < 15; i++) + W[i] = 0; + + W[15] = 0x100; // 256 bits + + #pragma unroll 16 + for (uint32_t i = 0; i < 16; i++) + { +// sha512_step2(r, W[ i], K_512[ i], i&7); + const uint32_t ord = i&7; + const uint64_t T1 = r[(15-ord) & 7] + K_512[ i] + W[ i] + bsg5_1(r[(12-ord) & 7]) + + xandx64(r[(12-ord) & 7], r[(13-ord) & 7], r[(14-ord) & 7]); + + r[(15-ord)& 7] = andor64(r[( 8-ord) & 7], r[( 9-ord) & 7], r[(10-ord) & 7]) + bsg5_0(r[( 8-ord) & 7]) + T1; + r[(11-ord)& 7] = r[(11-ord)& 7] + T1; + } + + #pragma unroll 5 + for (uint32_t i = 16; i < 80; i+=16) + { + #pragma unroll 16 + for (uint32_t j = 0; j<16; j++) + W[(i + j) & 15] = W[((i + j) - 7) & 15] + W[(i + j) & 15] + ssg5_0(W[((i + j) - 15) & 15]) + ssg5_1(W[((i + j) - 2) & 15]); + + #pragma unroll 16 + for (uint32_t j = 0; j<16; j++) { + + const 
uint32_t ord = (i+j)&7; + const uint64_t T1 = K_512[i+j] + W[ j] + r[(15-ord) & 7] + bsg5_1(r[(12-ord) & 7]) + + xandx64(r[(12-ord) & 7], r[(13-ord) & 7], r[(14-ord) & 7]); + + r[(15-ord)& 7] = andor64(r[( 8-ord) & 7], r[( 9-ord) & 7], r[(10-ord) & 7]) + bsg5_0(r[( 8-ord) & 7]) + T1; + r[(11-ord)& 7] = r[(11-ord)& 7] + T1; + } + } + +//END OF SHA512------------------------------------------------------------------------------ + #pragma unroll 4 + for (int i = 0; i < 4; i++) + *(uint64_t*)&dat[i*2] = cuda_swab64(r[i] + IV512[i]); + dat[8] = 0x80; + + #pragma unroll 7 + for (int i=9; i<16; i++) dat[i] = 0; + + dat[14] = 0x100; // size in bits + + #pragma unroll 5 + for (int i=0; i<5; i++) + h[i] = c_IV[i]; + + RIPEMD160_ROUND_BODY(dat, h); + + #pragma unroll 5 + for (int i=0; i<5; i++) + buf[i] = h[i]; + + // second 32 bytes block hash + #pragma unroll 4 + for (int i=0; i < 4; i++) + *(uint64_t*)&dat[i*2] = cuda_swab64(r[i+4] + IV512[i+4]); + + dat[8] = 0x80; + + #pragma unroll 7 + for (int i=9; i<16; i++) dat[i] = 0; + + dat[14] = 0x100; // size in bits + + #pragma unroll 5 + for (int i=0; i<5; i++) + h[i] = c_IV[i]; + + RIPEMD160_ROUND_BODY(dat, h); + + // first final sha256 + + #pragma unroll 5 + for (int i=0; i<5; i++) dat[i] = cuda_swab32(buf[i]); + #pragma unroll 5 + for (int i=0; i<5; i++) dat[i+5] = cuda_swab32(h[i]); + dat[10] = 0x80000000; + #pragma unroll 4 + for (int i=11; i<15; i++) dat[i] = 0; + + dat[15] = 0x140; + + #pragma unroll 8 + for(int i=0; i<8; i++) + buf[i] = c_H256[i]; + + sha256_round_body(dat, buf, c_K); + + // second sha256 + + #pragma unroll 8 + for(int i=0; i<8; i++) { + dat[i] = buf[i]; + } + dat[8] = 0x80000000; + + #pragma unroll 8 + for(int i=0; i<8; i++) + buf[i] = c_H256[i]; + + #pragma unroll 6 + for (int i=9; i<15; i++) dat[i] = 0; + dat[15] = 0x100; + + sha256_round_body_final(dat, buf, c_K); + + // valid nonces + if (cuda_swab64ll(buf[ 6],buf[ 7]) <= target64) { + uint32_t tmp = atomicExch(&resNonces[0], thread); + if (tmp != UINT32_MAX) + resNonces[1] = tmp; + } + } +} + +__host__ +void lbry_merged(int thr_id, uint32_t startNonce, uint32_t threads, uint32_t *d_resNonce, const uint64_t target64) +{ + uint32_t threadsperblock = 768; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + gpu_lbry_merged <<>> (threads,startNonce, d_resNonce, target64); +} diff --git a/lbry/cuda_sha256_lbry.cu b/lbry/cuda_sha256_lbry.cu new file mode 100644 index 0000000000..ed66994cc1 --- /dev/null +++ b/lbry/cuda_sha256_lbry.cu @@ -0,0 +1,960 @@ +/* + * sha256 + ripemd CUDA implementation for SM 5.0 GPUS (and lower) + * Tanguy Pruvot and Provos Alexis - Jul / Sep 2016 + * Sponsored by LBRY.IO team + */ + +#include +#include +#include + +#include +#include + +#include + +__constant__ static uint32_t _ALIGN(16) c_midstate112[8]; +__constant__ static uint32_t _ALIGN(16) c_midbuffer112[8]; +__constant__ static uint32_t _ALIGN(16) c_dataEnd112[12]; + +__constant__ const uint32_t c_H256[8] = { + 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, + 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U +}; +__constant__ static uint32_t _ALIGN(16) c_K[64] = { + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 
0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + +#ifdef __INTELLISENSE__ +#define atomicExch(p,y) y +#define __byte_perm(x,y,c) x +#endif + +// ------------------------------------------------------------------------------------------------ + +static const uint32_t cpu_H256[8] = { + 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, + 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U +}; + +static const uint32_t cpu_K[64] = { + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + +__host__ +static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, uint32_t in, const uint32_t Kshared) +{ + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + uint32_t t1 = h + bsg21 + vxandx + Kshared + in; + uint32_t t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ROTR32(inx1, 17) ^ ROTR32(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); + uint32_t ssg20 = ROTR32(inx3, 7) ^ ROTR32(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR32(e, 6) ^ ROTR32(e, 11) ^ ROTR32(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR32(a, 2) ^ ROTR32(a, 13) ^ ROTR32(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + uint32_t t1,t2; + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + 
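+	// 64 rounds: the first 16 consume the message words directly (step1),
+	// the remaining 48 expand the schedule in place via step2 (in[] is reused as W[0..15])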
sha256_step1_host(a,b,c,d,e,f,g,h,in[0], Kshared[0]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[1], Kshared[1]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[2], Kshared[2]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[3], Kshared[3]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[4], Kshared[4]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[5], Kshared[5]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[6], Kshared[6]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[7], Kshared[7]); + sha256_step1_host(a,b,c,d,e,f,g,h,in[8], Kshared[8]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[9], Kshared[9]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]); + + for (int i=0; i<3; i++) + { + sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +#define xor3b(a,b,c) (a ^ b ^ c) + +__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22)); +} + +__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25)); +} + +__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3)); +} + +__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10)); +} + +__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) +{ + uint2 result; + asm("mov.b64 {%0,%1},%2; // vectorizeswap \n\t" + : "=r"(result.y), "=r"(result.x) : "l"(v)); + return result; +} + +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define Ch(a, b, c) (((b^c) & a) ^ c) + +__device__ __forceinline__ +static void sha2_step(const uint32_t a,const uint32_t b,const uint32_t c, uint32_t &d,const uint32_t e,const uint32_t f,const uint32_t g, uint32_t &h,const uint32_t in, const uint32_t Kshared) +{ + const uint32_t t1 = h + bsg2_1(e) + Ch(e, f, g) + Kshared + in; + h = t1 + bsg2_0(a) + Maj(a, b, c); + d+= t1; + +} + +__device__ __forceinline__ +static void sha256_round_first(uint32_t *const __restrict__ in,uint32_t *const __restrict__ buf,const uint32_t *const __restrict__ state,const uint32_t* __restrict__ Kshared) +{ + uint32_t a = buf[0] + in[11]; + uint32_t b = buf[1]; + uint32_t c = buf[2]; + uint32_t d = buf[3]; + uint32_t e = buf[4] + in[11]; + 
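+	// buf[] holds the host-precomputed midbuffer: rounds 0-10 plus a round 11 done with a
+	// zero message word; adding in[11] (the nonce word) to a and e here completes round 11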
uint32_t f = buf[5]; + uint32_t g = buf[6]; + uint32_t h = buf[7]; + + // 10 first steps made on host + //sha2_step(f,g,h,a,b,c,d,e,in[11],Kshared[11]); + + sha2_step(e,f,g,h,a,b,c,d,in[12],Kshared[12]); + sha2_step(d,e,f,g,h,a,b,c,in[13],Kshared[13]); + sha2_step(c,d,e,f,g,h,a,b,in[14],Kshared[14]); + sha2_step(b,c,d,e,f,g,h,a,in[15],Kshared[15]); + + //in is partially precomputed on host + in[2]+= in[11]; + in[4]+= ssg2_1(in[2]); + in[6]+= ssg2_1(in[4]); + in[8]+= ssg2_1(in[6]); + in[9]+= in[ 2]; + + sha2_step(a,b,c,d,e,f,g,h,in[0], Kshared[16]); + sha2_step(h,a,b,c,d,e,f,g,in[1], Kshared[17]); + sha2_step(g,h,a,b,c,d,e,f,in[2], Kshared[18]); + sha2_step(f,g,h,a,b,c,d,e,in[3], Kshared[19]); + sha2_step(e,f,g,h,a,b,c,d,in[4], Kshared[20]); + sha2_step(d,e,f,g,h,a,b,c,in[5], Kshared[21]); + sha2_step(c,d,e,f,g,h,a,b,in[6], Kshared[22]); + sha2_step(b,c,d,e,f,g,h,a,in[7], Kshared[23]); + sha2_step(a,b,c,d,e,f,g,h,in[8], Kshared[24]); + sha2_step(h,a,b,c,d,e,f,g,in[9], Kshared[25]); + + #pragma unroll 6 + for (uint32_t j = 10; j < 16; j++){ + in[j] = in[j] + in[(j + 9) & 15] + ssg2_0(in[(j + 1) & 15]) + ssg2_1(in[(j + 14) & 15]); + } + + sha2_step(g,h,a,b,c,d,e,f,in[10],Kshared[26]); + sha2_step(f,g,h,a,b,c,d,e,in[11],Kshared[27]); + sha2_step(e,f,g,h,a,b,c,d,in[12],Kshared[28]); + sha2_step(d,e,f,g,h,a,b,c,in[13],Kshared[29]); + sha2_step(c,d,e,f,g,h,a,b,in[14],Kshared[30]); + sha2_step(b,c,d,e,f,g,h,a,in[15],Kshared[31]); + + #pragma unroll 16 + for (uint32_t j = 0; j < 16; j++){ + in[j] = in[j] + in[(j + 9) & 15] + ssg2_0(in[(j + 1) & 15]) + ssg2_1(in[(j + 14) & 15]); + } + + sha2_step(a,b,c,d,e,f,g,h,in[0], Kshared[16+16]); + sha2_step(h,a,b,c,d,e,f,g,in[1], Kshared[17+16]); + sha2_step(g,h,a,b,c,d,e,f,in[2], Kshared[18+16]); + sha2_step(f,g,h,a,b,c,d,e,in[3], Kshared[19+16]); + sha2_step(e,f,g,h,a,b,c,d,in[4], Kshared[20+16]); + sha2_step(d,e,f,g,h,a,b,c,in[5], Kshared[21+16]); + sha2_step(c,d,e,f,g,h,a,b,in[6], Kshared[22+16]); + sha2_step(b,c,d,e,f,g,h,a,in[7], Kshared[23+16]); + sha2_step(a,b,c,d,e,f,g,h,in[8], Kshared[24+16]); + sha2_step(h,a,b,c,d,e,f,g,in[9], Kshared[25+16]); + sha2_step(g,h,a,b,c,d,e,f,in[10],Kshared[26+16]); + sha2_step(f,g,h,a,b,c,d,e,in[11],Kshared[27+16]); + sha2_step(e,f,g,h,a,b,c,d,in[12],Kshared[28+16]); + sha2_step(d,e,f,g,h,a,b,c,in[13],Kshared[29+16]); + sha2_step(c,d,e,f,g,h,a,b,in[14],Kshared[30+16]); + sha2_step(b,c,d,e,f,g,h,a,in[15],Kshared[31+16]); + + #pragma unroll 16 + for (uint32_t j = 0; j < 16; j++){ + in[j] = in[j] + in[(j + 9) & 15] + ssg2_0(in[(j + 1) & 15]) + ssg2_1(in[(j + 14) & 15]); + } + + sha2_step(a,b,c,d,e,f,g,h,in[0], Kshared[16+16*2]); + sha2_step(h,a,b,c,d,e,f,g,in[1], Kshared[17+16*2]); + sha2_step(g,h,a,b,c,d,e,f,in[2], Kshared[18+16*2]); + sha2_step(f,g,h,a,b,c,d,e,in[3], Kshared[19+16*2]); + sha2_step(e,f,g,h,a,b,c,d,in[4], Kshared[20+16*2]); + sha2_step(d,e,f,g,h,a,b,c,in[5], Kshared[21+16*2]); + sha2_step(c,d,e,f,g,h,a,b,in[6], Kshared[22+16*2]); + sha2_step(b,c,d,e,f,g,h,a,in[7], Kshared[23+16*2]); + sha2_step(a,b,c,d,e,f,g,h,in[8], Kshared[24+16*2]); + sha2_step(h,a,b,c,d,e,f,g,in[9], Kshared[25+16*2]); + sha2_step(g,h,a,b,c,d,e,f,in[10],Kshared[26+16*2]); + sha2_step(f,g,h,a,b,c,d,e,in[11],Kshared[27+16*2]); + sha2_step(e,f,g,h,a,b,c,d,in[12],Kshared[28+16*2]); + sha2_step(d,e,f,g,h,a,b,c,in[13],Kshared[29+16*2]); + sha2_step(c,d,e,f,g,h,a,b,in[14],Kshared[30+16*2]); + sha2_step(b,c,d,e,f,g,h,a,in[15],Kshared[31+16*2]); + + buf[ 0] = state[0] + a; + buf[ 1] = state[1] + b; + buf[ 2] = state[2] + c; + buf[ 3] = state[3] 
+ d; + buf[ 4] = state[4] + e; + buf[ 5] = state[5] + f; + buf[ 6] = state[6] + g; + buf[ 7] = state[7] + h; +} + +__device__ __forceinline__ +static void sha256_round_body(uint32_t *const __restrict__ in, uint32_t *const __restrict__ state,const uint32_t *const __restrict__ Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step(a,b,c,d,e,f,g,h,in[0], Kshared[0]); + sha2_step(h,a,b,c,d,e,f,g,in[1], Kshared[1]); + sha2_step(g,h,a,b,c,d,e,f,in[2], Kshared[2]); + sha2_step(f,g,h,a,b,c,d,e,in[3], Kshared[3]); + sha2_step(e,f,g,h,a,b,c,d,in[4], Kshared[4]); + sha2_step(d,e,f,g,h,a,b,c,in[5], Kshared[5]); + sha2_step(c,d,e,f,g,h,a,b,in[6], Kshared[6]); + sha2_step(b,c,d,e,f,g,h,a,in[7], Kshared[7]); + sha2_step(a,b,c,d,e,f,g,h,in[8], Kshared[8]); + sha2_step(h,a,b,c,d,e,f,g,in[9], Kshared[9]); + sha2_step(g,h,a,b,c,d,e,f,in[10],Kshared[10]); + sha2_step(f,g,h,a,b,c,d,e,in[11],Kshared[11]); + sha2_step(e,f,g,h,a,b,c,d,in[12],Kshared[12]); + sha2_step(d,e,f,g,h,a,b,c,in[13],Kshared[13]); + sha2_step(c,d,e,f,g,h,a,b,in[14],Kshared[14]); + sha2_step(b,c,d,e,f,g,h,a,in[15],Kshared[15]); + + #pragma unroll 3 + for (uint32_t i=0; i<3; i++) + { + #pragma unroll 16 + for (uint32_t j = 0; j < 16; j++){ + in[j] = in[j] + in[(j + 9) & 15] + ssg2_0(in[(j + 1) & 15]) + ssg2_1(in[(j + 14) & 15]); + } + sha2_step(a, b, c, d, e, f, g, h, in[0], Kshared[16 + 16 * i]); + sha2_step(h, a, b, c, d, e, f, g, in[1], Kshared[17 + 16 * i]); + sha2_step(g, h, a, b, c, d, e, f, in[2], Kshared[18 + 16 * i]); + sha2_step(f, g, h, a, b, c, d, e, in[3], Kshared[19 + 16 * i]); + sha2_step(e, f, g, h, a, b, c, d, in[4], Kshared[20 + 16 * i]); + sha2_step(d, e, f, g, h, a, b, c, in[5], Kshared[21 + 16 * i]); + sha2_step(c, d, e, f, g, h, a, b, in[6], Kshared[22 + 16 * i]); + sha2_step(b, c, d, e, f, g, h, a, in[7], Kshared[23 + 16 * i]); + sha2_step(a, b, c, d, e, f, g, h, in[8], Kshared[24 + 16 * i]); + sha2_step(h, a, b, c, d, e, f, g, in[9], Kshared[25 + 16 * i]); + sha2_step(g, h, a, b, c, d, e, f, in[10], Kshared[26 + 16 * i]); + sha2_step(f, g, h, a, b, c, d, e, in[11], Kshared[27 + 16 * i]); + sha2_step(e, f, g, h, a, b, c, d, in[12], Kshared[28 + 16 * i]); + sha2_step(d, e, f, g, h, a, b, c, in[13], Kshared[29 + 16 * i]); + sha2_step(c, d, e, f, g, h, a, b, in[14], Kshared[30 + 16 * i]); + sha2_step(b, c, d, e, f, g, h, a, in[15], Kshared[31 + 16 * i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +__device__ __forceinline__ +static void sha256_round_body_final(uint32_t *const __restrict__ in, uint32_t *const __restrict__ state,const uint32_t *const __restrict__ Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step(a,b,c,d,e,f,g,h,in[0], Kshared[0]); + sha2_step(h,a,b,c,d,e,f,g,in[1], Kshared[1]); + sha2_step(g,h,a,b,c,d,e,f,in[2], Kshared[2]); + sha2_step(f,g,h,a,b,c,d,e,in[3], Kshared[3]); + sha2_step(e,f,g,h,a,b,c,d,in[4], Kshared[4]); + sha2_step(d,e,f,g,h,a,b,c,in[5], Kshared[5]); + sha2_step(c,d,e,f,g,h,a,b,in[6], Kshared[6]); + sha2_step(b,c,d,e,f,g,h,a,in[7], Kshared[7]); + sha2_step(a,b,c,d,e,f,g,h,in[8], Kshared[8]); + sha2_step(h,a,b,c,d,e,f,g,in[9], Kshared[9]); + 
sha2_step(g,h,a,b,c,d,e,f,in[10],Kshared[10]); + sha2_step(f,g,h,a,b,c,d,e,in[11],Kshared[11]); + sha2_step(e,f,g,h,a,b,c,d,in[12],Kshared[12]); + sha2_step(d,e,f,g,h,a,b,c,in[13],Kshared[13]); + sha2_step(c,d,e,f,g,h,a,b,in[14],Kshared[14]); + sha2_step(b,c,d,e,f,g,h,a,in[15],Kshared[15]); + + #pragma unroll 2 + for (uint32_t i=0; i<2; i++) + { + #pragma unroll 16 + for (uint32_t j = 0; j < 16; j++){ + in[j] = in[j] + in[(j + 9) & 15] + ssg2_0(in[(j + 1) & 15]) + ssg2_1(in[(j + 14) & 15]); + } + sha2_step(a, b, c, d, e, f, g, h, in[0], Kshared[16 + 16 * i]); + sha2_step(h, a, b, c, d, e, f, g, in[1], Kshared[17 + 16 * i]); + sha2_step(g, h, a, b, c, d, e, f, in[2], Kshared[18 + 16 * i]); + sha2_step(f, g, h, a, b, c, d, e, in[3], Kshared[19 + 16 * i]); + sha2_step(e, f, g, h, a, b, c, d, in[4], Kshared[20 + 16 * i]); + sha2_step(d, e, f, g, h, a, b, c, in[5], Kshared[21 + 16 * i]); + sha2_step(c, d, e, f, g, h, a, b, in[6], Kshared[22 + 16 * i]); + sha2_step(b, c, d, e, f, g, h, a, in[7], Kshared[23 + 16 * i]); + sha2_step(a, b, c, d, e, f, g, h, in[8], Kshared[24 + 16 * i]); + sha2_step(h, a, b, c, d, e, f, g, in[9], Kshared[25 + 16 * i]); + sha2_step(g, h, a, b, c, d, e, f, in[10], Kshared[26 + 16 * i]); + sha2_step(f, g, h, a, b, c, d, e, in[11], Kshared[27 + 16 * i]); + sha2_step(e, f, g, h, a, b, c, d, in[12], Kshared[28 + 16 * i]); + sha2_step(d, e, f, g, h, a, b, c, in[13], Kshared[29 + 16 * i]); + sha2_step(c, d, e, f, g, h, a, b, in[14], Kshared[30 + 16 * i]); + sha2_step(b, c, d, e, f, g, h, a, in[15], Kshared[31 + 16 * i]); + } + #pragma unroll 16 + for (uint32_t j = 0; j < 16; j++){ + in[j] = in[j] + in[(j + 9) & 15] + ssg2_0(in[(j + 1) & 15]) + ssg2_1(in[(j + 14) & 15]); + } + sha2_step(a, b, c, d, e, f, g, h, in[0], Kshared[16 + 16 * 2]); + sha2_step(h, a, b, c, d, e, f, g, in[1], Kshared[17 + 16 * 2]); + sha2_step(g, h, a, b, c, d, e, f, in[2], Kshared[18 + 16 * 2]); + sha2_step(f, g, h, a, b, c, d, e, in[3], Kshared[19 + 16 * 2]); + sha2_step(e, f, g, h, a, b, c, d, in[4], Kshared[20 + 16 * 2]); + sha2_step(d, e, f, g, h, a, b, c, in[5], Kshared[21 + 16 * 2]); + sha2_step(c, d, e, f, g, h, a, b, in[6], Kshared[22 + 16 * 2]); + sha2_step(b, c, d, e, f, g, h, a, in[7], Kshared[23 + 16 * 2]); + sha2_step(a, b, c, d, e, f, g, h, in[8], Kshared[24 + 16 * 2]); + sha2_step(h, a, b, c, d, e, f, g, in[9], Kshared[25 + 16 * 2]); + sha2_step(g, h, a, b, c, d, e, f, in[10], Kshared[26 + 16 * 2]); + sha2_step(f, g, h, a, b, c, d, e, in[11], Kshared[27 + 16 * 2]); + sha2_step(e, f, g, h, a, b, c, d, in[12], Kshared[28 + 16 * 2]); + sha2_step(d, e, f, g, h, a, b, c, in[13], Kshared[29 + 16 * 2]); + + state[6] += g; + state[7] += h; +} + + __device__ __forceinline__ +uint64_t cuda_swab64ll(const uint32_t x, const uint32_t y) { + uint64_t r; + asm("prmt.b32 %1, %1, 0, 0x0123; // swab64ll\n\t" + "prmt.b32 %2, %2, 0, 0x0123;\n\t" + "mov.b64 %0, {%1,%2};\n\t" + : "=l"(r): "r"(x), "r"(y) ); + return r; +} + +__global__ +#if CUDA_VERSION > 6050 +__launch_bounds__(768,2) /* to force 32 regs */ +#endif +void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, uint64_t *outputHash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t buf[8], state[8]; + if (thread < threads) + { + uint32_t dat[16]; + + #pragma unroll 11 + for (uint32_t i=0; i<11; i++) dat[i] = c_dataEnd112[i]; + dat[11] = startNonce + thread; + dat[12] = 0x80000000; + dat[13] = 0; + dat[14] = 0; + dat[15] = 0x380; + + *(uint2x4*)&state[0] = *(uint2x4*)&c_midstate112[0]; + 
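+		// c_midstate112: SHA-256 state after the first 64-byte block of the 112-byte input;
+		// c_midbuffer112: round registers for the second block, pre-advanced on the host in lbry_sha256_setBlock_112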
*(uint2x4*)&buf[0] = *(uint2x4*)&c_midbuffer112[0]; + + sha256_round_first(dat, buf, state, c_K); // no shared mem here + + // second sha256 + + *(uint2x4*)&dat[0] = *(uint2x4*)&buf[0]; + + dat[8] = 0x80000000; + + #pragma unroll 6 + for (uint32_t i=9; i<15; i++) dat[i] = 0; + dat[15] = 0x100; + + *(uint2x4*)&buf[0] = *(uint2x4*)&c_H256[0]; + + sha256_round_body(dat, buf, c_K); //no shared mem at all + + // output + *(uint2*)&buf[0] = vectorizeswap(((uint64_t*)buf)[0]); + *(uint2*)&buf[2] = vectorizeswap(((uint64_t*)buf)[1]); + *(uint2*)&buf[4] = vectorizeswap(((uint64_t*)buf)[2]); + *(uint2*)&buf[6] = vectorizeswap(((uint64_t*)buf)[3]); + + *(uint2x4*)&outputHash[thread<<3] = *(uint2x4*)&buf[0]; + } +} + +__host__ +void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash){ + + const int threadsperblock = 768; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + lbry_sha256d_gpu_hash_112 <<>> (threads, startNonce, (uint64_t*) d_outputHash); +} + +__host__ +void lbry_sha256_init(int thr_id) +{ + cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice); +} + +__host__ +void lbry_sha256_setBlock_112(uint32_t *pdata){ + + uint32_t in[16], buf[8], end[16]; + for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]); + for (int i=0; i<8;i++) buf[i] = cpu_H256[i]; + for (int i=0;i<11;i++) end[i] = cuda_swab32(pdata[16+i]); + sha256_round_body_host(in, buf, cpu_K); + + cudaMemcpyToSymbol(c_midstate112, buf, 32, 0, cudaMemcpyHostToDevice); + + uint32_t a = buf[0]; + uint32_t b = buf[1]; + uint32_t c = buf[2]; + uint32_t d = buf[3]; + uint32_t e = buf[4]; + uint32_t f = buf[5]; + uint32_t g = buf[6]; + uint32_t h = buf[7]; + + sha256_step1_host(a,b,c,d,e,f,g,h,end[0], cpu_K[0]); + sha256_step1_host(h,a,b,c,d,e,f,g,end[1], cpu_K[1]); + sha256_step1_host(g,h,a,b,c,d,e,f,end[2], cpu_K[2]); + sha256_step1_host(f,g,h,a,b,c,d,e,end[3], cpu_K[3]); + sha256_step1_host(e,f,g,h,a,b,c,d,end[4], cpu_K[4]); + sha256_step1_host(d,e,f,g,h,a,b,c,end[5], cpu_K[5]); + sha256_step1_host(c,d,e,f,g,h,a,b,end[6], cpu_K[6]); + sha256_step1_host(b,c,d,e,f,g,h,a,end[7], cpu_K[7]); + sha256_step1_host(a,b,c,d,e,f,g,h,end[8], cpu_K[8]); + sha256_step1_host(h,a,b,c,d,e,f,g,end[9], cpu_K[9]); + sha256_step1_host(g,h,a,b,c,d,e,f,end[10],cpu_K[10]); + sha256_step1_host(f, g, h, a, b, c, d, e, 0, cpu_K[11]); + + buf[0] = a; + buf[1] = b; + buf[2] = c; + buf[3] = d; + buf[4] = e; + buf[5] = f; + buf[6] = g; + buf[7] = h; + + cudaMemcpyToSymbol(c_midbuffer112, buf, 32, 0, cudaMemcpyHostToDevice); + + end[12] = 0x80000000; + end[13] = 0; + end[14] = 0; + end[15] = 0x380; + uint32_t x2_0,x2_1; + + x2_0 = ROTR32(end[1], 7) ^ ROTR32(end[1], 18) ^ SPH_T32(end[1] >> 3); //ssg2_0(end[1]); +// x2_1 = ROTR32(end[14], 17) ^ ROTR32(end[14], 19) ^ SPH_T32(end[14] >> 10) + x2_0; //ssg2_1(end[14]) + x2_0; + end[0] = end[0] + end[9] + x2_0; + + x2_0 = ROTR32(end[2], 7) ^ ROTR32(end[2], 18) ^ SPH_T32(end[2] >> 3); + x2_1 = (ROTR32(end[15], 17) ^ ROTR32(end[15], 19) ^ SPH_T32(end[15] >> 10)) + x2_0; + end[1] = end[1] + end[10] + x2_1; + + x2_0 = ROTR32(end[3], 7) ^ ROTR32(end[3], 18) ^ SPH_T32(end[3] >> 3);//ssg2_0(end[3]); + x2_1 = (ROTR32(end[0], 17) ^ ROTR32(end[0], 19) ^ SPH_T32(end[0] >> 10)) + x2_0; + end[2]+= x2_1; + + x2_0 = ROTR32(end[4], 7) ^ ROTR32(end[4], 18) ^ SPH_T32(end[4] >> 3);//ssg2_0(end[4]); + x2_1 = (ROTR32(end[1], 17) ^ ROTR32(end[1], 19) ^ SPH_T32(end[1] >> 10)) + x2_0; + end[3] = end[3] + end[12] + x2_1; + + x2_0 = 
ROTR32(end[5], 7) ^ ROTR32(end[5], 18) ^ SPH_T32(end[5] >> 3);//ssg2_0(end[4]); + end[4] = end[4] + end[13] + x2_0; + + x2_0 = ROTR32(end[6], 7) ^ ROTR32(end[6], 18) ^ SPH_T32(end[6] >> 3);//ssg2_0(end[6]); + x2_1 = (ROTR32(end[3], 17) ^ ROTR32(end[3], 19) ^ SPH_T32(end[3] >> 10)) + x2_0; + end[5] = end[5] + end[14] + x2_1; + + x2_0 = ROTR32(end[7], 7) ^ ROTR32(end[7], 18) ^ SPH_T32(end[7] >> 3);//ssg2_0(end[7]); + end[6] = end[6] + end[15] + x2_0; + + x2_0 = ROTR32(end[8], 7) ^ ROTR32(end[8], 18) ^ SPH_T32(end[8] >> 3);//ssg2_0(end[8]); + x2_1 = (ROTR32(end[5], 17) ^ ROTR32(end[5], 19) ^ SPH_T32(end[5] >> 10)) + x2_0; + end[7] = end[7] + end[0] + x2_1; + + x2_0 = ROTR32(end[9], 7) ^ ROTR32(end[9], 18) ^ SPH_T32(end[9] >> 3);//ssg2_0(end[9]); + end[8] = end[8] + end[1] + x2_0; + + x2_0 = ROTR32(end[10], 7) ^ ROTR32(end[10], 18) ^ SPH_T32(end[10] >> 3);//ssg2_0(end[10]); + x2_1 = (ROTR32(end[7], 17) ^ ROTR32(end[7], 19) ^ SPH_T32(end[7] >> 10)) + x2_0; + end[9] = end[9] + x2_1; + + cudaMemcpyToSymbol(c_dataEnd112, end, 12*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); +} + +// ------------------------------------------------------------------------------------------ + +static __constant__ const uint32_t c_IV[5] = { 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u }; + +static __constant__ const uint32_t c_K1[5] = { 0, 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xA953FD4E }; + +static __constant__ const uint32_t c_K2[5] = { 0x50A28BE6, 0x5C4DD124, 0x6D703EF3, 0x7A6D76E9, 0 }; + +__device__ __forceinline__ +static uint32_t ROTATE(const uint32_t x,const uint32_t r){ + if(r==8) + return __byte_perm(x, 0, 0x2103); + else + return ROTL32(x,r); +} + +/* + * Round functions for RIPEMD-160. + */ +#define F1(x, y, z) (x ^ y ^ z) +#define F2(x, y, z) ((x & (y ^ z)) ^ z) +#define F3(x, y, z) ((x | ~y) ^ z) +#define F4(x, y, z) (y ^ ((x ^ y) & z)) +#define F5(x, y, z) (x ^ (y | ~z)) + +/* + * Round constants for RIPEMD-160. 
+ */ +#define RR(a, b, c, d, e, f, s, r, k) { \ + a = e + ROTATE((a + r + k + f(b, c, d)), s); \ + c = ROTL32(c, 10); \ +} + +#define ROUND1(a, b, c, d, e, f, s, r, k) \ + RR(a[0], b[0], c[0], d[0], e[0], f, s, r, c_K1[k]) + +#define ROUND2(a, b, c, d, e, f, s, r, k) \ + RR(a[1], b[1], c[1], d[1], e[1], f, s, r, c_K2[k]) + +#define RIPEMD160_ROUND_BODY(in, h) { \ + uint32_t A[2], B[2], C[2], D[2], E[2]; \ + uint32_t tmp; \ +\ + A[0] = A[1] = h[0]; \ + B[0] = B[1] = h[1]; \ + C[0] = C[1] = h[2]; \ + D[0] = D[1] = h[3]; \ + E[0] = E[1] = h[4]; \ +\ + ROUND1(A, B, C, D, E, F1, 11, in[ 0], 0); \ + ROUND1(E, A, B, C, D, F1, 14, in[ 1], 0); \ + ROUND1(D, E, A, B, C, F1, 15, in[ 2], 0); \ + ROUND1(C, D, E, A, B, F1, 12, in[ 3], 0); \ + ROUND1(B, C, D, E, A, F1, 5, in[ 4], 0); \ + ROUND1(A, B, C, D, E, F1, 8, in[ 5], 0); \ + ROUND1(E, A, B, C, D, F1, 7, in[ 6], 0); \ + ROUND1(D, E, A, B, C, F1, 9, in[ 7], 0); \ + ROUND1(C, D, E, A, B, F1, 11, in[ 8], 0); \ + ROUND1(B, C, D, E, A, F1, 13, in[ 9], 0); \ + ROUND1(A, B, C, D, E, F1, 14, in[10], 0); \ + ROUND1(E, A, B, C, D, F1, 15, in[11], 0); \ + ROUND1(D, E, A, B, C, F1, 6, in[12], 0); \ + ROUND1(C, D, E, A, B, F1, 7, in[13], 0); \ + ROUND1(B, C, D, E, A, F1, 9, in[14], 0); \ + ROUND1(A, B, C, D, E, F1, 8, in[15], 0); \ +\ + ROUND1(E, A, B, C, D, F2, 7, in[ 7], 1); \ + ROUND1(D, E, A, B, C, F2, 6, in[ 4], 1); \ + ROUND1(C, D, E, A, B, F2, 8, in[13], 1); \ + ROUND1(B, C, D, E, A, F2, 13, in[ 1], 1); \ + ROUND1(A, B, C, D, E, F2, 11, in[10], 1); \ + ROUND1(E, A, B, C, D, F2, 9, in[ 6], 1); \ + ROUND1(D, E, A, B, C, F2, 7, in[15], 1); \ + ROUND1(C, D, E, A, B, F2, 15, in[ 3], 1); \ + ROUND1(B, C, D, E, A, F2, 7, in[12], 1); \ + ROUND1(A, B, C, D, E, F2, 12, in[ 0], 1); \ + ROUND1(E, A, B, C, D, F2, 15, in[ 9], 1); \ + ROUND1(D, E, A, B, C, F2, 9, in[ 5], 1); \ + ROUND1(C, D, E, A, B, F2, 11, in[ 2], 1); \ + ROUND1(B, C, D, E, A, F2, 7, in[14], 1); \ + ROUND1(A, B, C, D, E, F2, 13, in[11], 1); \ + ROUND1(E, A, B, C, D, F2, 12, in[ 8], 1); \ +\ + ROUND1(D, E, A, B, C, F3, 11, in[ 3], 2); \ + ROUND1(C, D, E, A, B, F3, 13, in[10], 2); \ + ROUND1(B, C, D, E, A, F3, 6, in[14], 2); \ + ROUND1(A, B, C, D, E, F3, 7, in[ 4], 2); \ + ROUND1(E, A, B, C, D, F3, 14, in[ 9], 2); \ + ROUND1(D, E, A, B, C, F3, 9, in[15], 2); \ + ROUND1(C, D, E, A, B, F3, 13, in[ 8], 2); \ + ROUND1(B, C, D, E, A, F3, 15, in[ 1], 2); \ + ROUND1(A, B, C, D, E, F3, 14, in[ 2], 2); \ + ROUND1(E, A, B, C, D, F3, 8, in[ 7], 2); \ + ROUND1(D, E, A, B, C, F3, 13, in[ 0], 2); \ + ROUND1(C, D, E, A, B, F3, 6, in[ 6], 2); \ + ROUND1(B, C, D, E, A, F3, 5, in[13], 2); \ + ROUND1(A, B, C, D, E, F3, 12, in[11], 2); \ + ROUND1(E, A, B, C, D, F3, 7, in[ 5], 2); \ + ROUND1(D, E, A, B, C, F3, 5, in[12], 2); \ +\ + ROUND1(C, D, E, A, B, F4, 11, in[ 1], 3); \ + ROUND1(B, C, D, E, A, F4, 12, in[ 9], 3); \ + ROUND1(A, B, C, D, E, F4, 14, in[11], 3); \ + ROUND1(E, A, B, C, D, F4, 15, in[10], 3); \ + ROUND1(D, E, A, B, C, F4, 14, in[ 0], 3); \ + ROUND1(C, D, E, A, B, F4, 15, in[ 8], 3); \ + ROUND1(B, C, D, E, A, F4, 9, in[12], 3); \ + ROUND1(A, B, C, D, E, F4, 8, in[ 4], 3); \ + ROUND1(E, A, B, C, D, F4, 9, in[13], 3); \ + ROUND1(D, E, A, B, C, F4, 14, in[ 3], 3); \ + ROUND1(C, D, E, A, B, F4, 5, in[ 7], 3); \ + ROUND1(B, C, D, E, A, F4, 6, in[15], 3); \ + ROUND1(A, B, C, D, E, F4, 8, in[14], 3); \ + ROUND1(E, A, B, C, D, F4, 6, in[ 5], 3); \ + ROUND1(D, E, A, B, C, F4, 5, in[ 6], 3); \ + ROUND1(C, D, E, A, B, F4, 12, in[ 2], 3); \ +\ + ROUND1(B, C, D, E, A, F5, 9, in[ 4], 4); \ + ROUND1(A, B, C, D, E, F5, 15, 
in[ 0], 4); \ + ROUND1(E, A, B, C, D, F5, 5, in[ 5], 4); \ + ROUND1(D, E, A, B, C, F5, 11, in[ 9], 4); \ + ROUND1(C, D, E, A, B, F5, 6, in[ 7], 4); \ + ROUND1(B, C, D, E, A, F5, 8, in[12], 4); \ + ROUND1(A, B, C, D, E, F5, 13, in[ 2], 4); \ + ROUND1(E, A, B, C, D, F5, 12, in[10], 4); \ + ROUND1(D, E, A, B, C, F5, 5, in[14], 4); \ + ROUND1(C, D, E, A, B, F5, 12, in[ 1], 4); \ + ROUND1(B, C, D, E, A, F5, 13, in[ 3], 4); \ + ROUND1(A, B, C, D, E, F5, 14, in[ 8], 4); \ + ROUND1(E, A, B, C, D, F5, 11, in[11], 4); \ + ROUND1(D, E, A, B, C, F5, 8, in[ 6], 4); \ + ROUND1(C, D, E, A, B, F5, 5, in[15], 4); \ + ROUND1(B, C, D, E, A, F5, 6, in[13], 4); \ +\ + ROUND2(A, B, C, D, E, F5, 8, in[ 5], 0); \ + ROUND2(E, A, B, C, D, F5, 9, in[14], 0); \ + ROUND2(D, E, A, B, C, F5, 9, in[ 7], 0); \ + ROUND2(C, D, E, A, B, F5, 11, in[ 0], 0); \ + ROUND2(B, C, D, E, A, F5, 13, in[ 9], 0); \ + ROUND2(A, B, C, D, E, F5, 15, in[ 2], 0); \ + ROUND2(E, A, B, C, D, F5, 15, in[11], 0); \ + ROUND2(D, E, A, B, C, F5, 5, in[ 4], 0); \ + ROUND2(C, D, E, A, B, F5, 7, in[13], 0); \ + ROUND2(B, C, D, E, A, F5, 7, in[ 6], 0); \ + ROUND2(A, B, C, D, E, F5, 8, in[15], 0); \ + ROUND2(E, A, B, C, D, F5, 11, in[ 8], 0); \ + ROUND2(D, E, A, B, C, F5, 14, in[ 1], 0); \ + ROUND2(C, D, E, A, B, F5, 14, in[10], 0); \ + ROUND2(B, C, D, E, A, F5, 12, in[ 3], 0); \ + ROUND2(A, B, C, D, E, F5, 6, in[12], 0); \ +\ + ROUND2(E, A, B, C, D, F4, 9, in[ 6], 1); \ + ROUND2(D, E, A, B, C, F4, 13, in[11], 1); \ + ROUND2(C, D, E, A, B, F4, 15, in[ 3], 1); \ + ROUND2(B, C, D, E, A, F4, 7, in[ 7], 1); \ + ROUND2(A, B, C, D, E, F4, 12, in[ 0], 1); \ + ROUND2(E, A, B, C, D, F4, 8, in[13], 1); \ + ROUND2(D, E, A, B, C, F4, 9, in[ 5], 1); \ + ROUND2(C, D, E, A, B, F4, 11, in[10], 1); \ + ROUND2(B, C, D, E, A, F4, 7, in[14], 1); \ + ROUND2(A, B, C, D, E, F4, 7, in[15], 1); \ + ROUND2(E, A, B, C, D, F4, 12, in[ 8], 1); \ + ROUND2(D, E, A, B, C, F4, 7, in[12], 1); \ + ROUND2(C, D, E, A, B, F4, 6, in[ 4], 1); \ + ROUND2(B, C, D, E, A, F4, 15, in[ 9], 1); \ + ROUND2(A, B, C, D, E, F4, 13, in[ 1], 1); \ + ROUND2(E, A, B, C, D, F4, 11, in[ 2], 1); \ +\ + ROUND2(D, E, A, B, C, F3, 9, in[15], 2); \ + ROUND2(C, D, E, A, B, F3, 7, in[ 5], 2); \ + ROUND2(B, C, D, E, A, F3, 15, in[ 1], 2); \ + ROUND2(A, B, C, D, E, F3, 11, in[ 3], 2); \ + ROUND2(E, A, B, C, D, F3, 8, in[ 7], 2); \ + ROUND2(D, E, A, B, C, F3, 6, in[14], 2); \ + ROUND2(C, D, E, A, B, F3, 6, in[ 6], 2); \ + ROUND2(B, C, D, E, A, F3, 14, in[ 9], 2); \ + ROUND2(A, B, C, D, E, F3, 12, in[11], 2); \ + ROUND2(E, A, B, C, D, F3, 13, in[ 8], 2); \ + ROUND2(D, E, A, B, C, F3, 5, in[12], 2); \ + ROUND2(C, D, E, A, B, F3, 14, in[ 2], 2); \ + ROUND2(B, C, D, E, A, F3, 13, in[10], 2); \ + ROUND2(A, B, C, D, E, F3, 13, in[ 0], 2); \ + ROUND2(E, A, B, C, D, F3, 7, in[ 4], 2); \ + ROUND2(D, E, A, B, C, F3, 5, in[13], 2); \ +\ + ROUND2(C, D, E, A, B, F2, 15, in[ 8], 3); \ + ROUND2(B, C, D, E, A, F2, 5, in[ 6], 3); \ + ROUND2(A, B, C, D, E, F2, 8, in[ 4], 3); \ + ROUND2(E, A, B, C, D, F2, 11, in[ 1], 3); \ + ROUND2(D, E, A, B, C, F2, 14, in[ 3], 3); \ + ROUND2(C, D, E, A, B, F2, 14, in[11], 3); \ + ROUND2(B, C, D, E, A, F2, 6, in[15], 3); \ + ROUND2(A, B, C, D, E, F2, 14, in[ 0], 3); \ + ROUND2(E, A, B, C, D, F2, 6, in[ 5], 3); \ + ROUND2(D, E, A, B, C, F2, 9, in[12], 3); \ + ROUND2(C, D, E, A, B, F2, 12, in[ 2], 3); \ + ROUND2(B, C, D, E, A, F2, 9, in[13], 3); \ + ROUND2(A, B, C, D, E, F2, 12, in[ 9], 3); \ + ROUND2(E, A, B, C, D, F2, 5, in[ 7], 3); \ + ROUND2(D, E, A, B, C, F2, 15, in[10], 3); \ + ROUND2(C, D, E, A, B, 
F2, 8, in[14], 3); \
+\
+	ROUND2(B, C, D, E, A, F1, 8, in[12], 4); \
+	ROUND2(A, B, C, D, E, F1, 5, in[15], 4); \
+	ROUND2(E, A, B, C, D, F1, 12, in[10], 4); \
+	ROUND2(D, E, A, B, C, F1, 9, in[ 4], 4); \
+	ROUND2(C, D, E, A, B, F1, 12, in[ 1], 4); \
+	ROUND2(B, C, D, E, A, F1, 5, in[ 5], 4); \
+	ROUND2(A, B, C, D, E, F1, 14, in[ 8], 4); \
+	ROUND2(E, A, B, C, D, F1, 6, in[ 7], 4); \
+	ROUND2(D, E, A, B, C, F1, 8, in[ 6], 4); \
+	ROUND2(C, D, E, A, B, F1, 13, in[ 2], 4); \
+	ROUND2(B, C, D, E, A, F1, 6, in[13], 4); \
+	ROUND2(A, B, C, D, E, F1, 5, in[14], 4); \
+	ROUND2(E, A, B, C, D, F1, 15, in[ 0], 4); \
+	ROUND2(D, E, A, B, C, F1, 13, in[ 3], 4); \
+	ROUND2(C, D, E, A, B, F1, 11, in[ 9], 4); \
+	ROUND2(B, C, D, E, A, F1, 11, in[11], 4); \
+\
+	tmp = h[1] + C[0] + D[1]; \
+	h[1] = h[2] + D[0] + E[1]; \
+	h[2] = h[3] + E[0] + A[1]; \
+	h[3] = h[4] + A[0] + B[1]; \
+	h[4] = h[0] + B[0] + C[1]; \
+	h[0] = tmp; \
+}
+
+__global__
+#if CUDA_VERSION > 6050
+__launch_bounds__(1024,2) /* to force 32 regs */
+#endif
+void lbry_ripemd(const uint32_t threads, uint64_t *Hash512){
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t dat[16];
+	uint32_t h[5];
+	uint32_t buf[8]; // align for vectorize
+	if (thread < threads)
+	{
+		// ... RIPEMD-160 of both 32-byte halves of the sha-512 hash,
+		// results written back to Hash512 for the final sha-256d kernel
+	}
+}
+
+__global__
+#if CUDA_VERSION > 6050
+__launch_bounds__(768,2) /* to force 32 regs */
+#endif
+void lbry_sha256d_gpu_hash_final(const uint32_t threads, uint64_t *Hash512, uint32_t *resNonces, const uint64_t target64)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t dat[16];
+	uint32_t buf[8]; // align for vectorize
+	if (thread < threads)
+	{
+		// first final sha256
+
+		uint32_t* input = (uint32_t*) (&Hash512[thread<<3]);
+
+		*(uint2x4*)&dat[0] = __ldg4((uint2x4*)&input[0]);
+		*(uint2*)&dat[8] = __ldg((uint2*)&input[8]);
+		dat[10] = 0x80000000;
+		#pragma unroll 4
+		for (int i=11; i<15; i++) dat[i] = 0;
+		dat[15] = 0x140;
+
+		*(uint2x4*)&buf[0] = *(uint2x4*)&c_H256[0];
+
+		sha256_round_body(dat, buf, c_K); // s_K uses too many regs
+
+		// second sha256
+
+		*(uint2x4*)&dat[0] = *(uint2x4*)&buf[0];
+		*(uint2x4*)&buf[0] = *(uint2x4*)&c_H256[0];
+
+		dat[8] = 0x80000000;
+
+		#pragma unroll 6
+		for (int i=9; i<15; i++) dat[i] = 0;
+		dat[15] = 0x100;
+
+		sha256_round_body_final(dat, buf, c_K);
+
+		// valid nonces
+		if (cuda_swab64ll(buf[ 6],buf[ 7]) <= target64) {
+			uint32_t tmp = atomicExch(&resNonces[0], thread);
+			if (tmp != UINT32_MAX)
+				resNonces[1] = tmp;
+		}
+	}
+}
+
+__host__
+void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce, const uint64_t target64)
+{
+	int threadsperblock;
+
+	threadsperblock = 1024;
+	dim3 grid1((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block1(threadsperblock);
+
+	threadsperblock = 768;
+	dim3 grid2((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block2(threadsperblock);
+	lbry_ripemd <<<grid1, block1>>> (threads, (uint64_t*) d_inputHash);
+	lbry_sha256d_gpu_hash_final <<<grid2, block2>>> (threads, (uint64_t*) d_inputHash, d_resNonce, target64);
+}
diff --git a/lbry/cuda_sha512_lbry.cu b/lbry/cuda_sha512_lbry.cu
new file mode 100644
index 0000000000..fea598bda1
--- /dev/null
+++ b/lbry/cuda_sha512_lbry.cu
@@ -0,0 +1,140 @@
+/**
+ * sha-512 CUDA implementation.
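The final kernel above only compares the top 64 bits of the second sha-256 digest against target64 and leaves the full 256-bit comparison to the CPU (fulltest in scanhash_lbry further down). A minimal host-side sketch of that same filter follows, assuming the usual ccminer layout in which ptarget[7] is the most significant 32-bit word; the helper name is illustrative, not part of the patch.

```c
#include <stdint.h>
#include <stdbool.h>

/* Sketch: the "top 64 bits" pre-filter done by the GPU, on the host.
 * hash and target are 8 little-endian 32-bit words, word 7 most significant,
 * which is the layout scanhash_lbry assumes when it builds AS_U64(&ptarget[6]). */
static bool top64_filter(const uint32_t hash[8], const uint32_t target[8])
{
	uint64_t h64 = ((uint64_t)hash[7] << 32) | hash[6];
	uint64_t t64 = ((uint64_t)target[7] << 32) | target[6];
	return h64 <= t64; /* candidates still need the full 256-bit fulltest() */
}
```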
+ * Tanguy Pruvot and Provos Alexis - Jul / Sep 2016 + * Sponsored by LBRY.IO team + */ + +//#define USE_ROT_ASM_OPT 0 +#include +#include + +#include + +static __constant__ _ALIGN(8) uint64_t K_512[80] = { + 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, + 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, + 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, + 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694, + 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, + 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, + 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, + 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70, + 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, + 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B, + 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, + 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, + 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, + 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, + 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC, + 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, + 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, + 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B, + 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, + 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 +}; + +#undef xor3 +#define xor3(a,b,c) (a^b^c) + +//#define ROR64_8(x) ROTR64(x,8) +__device__ __inline__ +uint64_t ROR64_8(const uint64_t u64) { + const uint2 a = vectorize(u64); + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x4321); + return devectorize(result); +} + +#define bsg5_0(x) xor3(ROTR64(x,28),ROTR64(x,34),ROTR64(x,39)) +#define bsg5_1(x) xor3(ROTR64(x,14),ROTR64(x,18),ROTR64(x,41)) +#define ssg5_0(x) xor3(ROTR64(x,1), ROR64_8(x), x>>7) +#define ssg5_1(x) xor3(ROTR64(x,19),ROTR64(x,61), x>>6) + +#define andor64(a,b,c) ((a & (b | c)) | (b & c)) +#define xandx64(e,f,g) (g ^ (e & (g ^ f))) + +__device__ __forceinline__ +static void sha512_step2(uint64_t *const r,const uint64_t W,const uint64_t K, const int ord) +{ + const uint64_t T1 = r[(15-ord) & 7] + K + W + bsg5_1(r[(12-ord) & 7]) + xandx64(r[(12-ord) & 7],r[(13-ord) & 7],r[(14-ord) & 7]); + r[(15-ord) & 7] = andor64(r[(8-ord) & 7],r[(9-ord) & 7],r[(10-ord) & 7]) + bsg5_0(r[(8-ord) & 7]) + T1; + r[(11-ord) & 7]+= T1; +} + +/**************************************************************************************************/ + +__global__ +#if CUDA_VERSION > 6050 +__launch_bounds__(512,2) +#endif +void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint64_t IV512[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 + }; + uint64_t r[8]; + uint64_t W[16]; + if (thread < threads) + { + uint64_t *pHash = &g_hash[thread<<3]; + + *(uint2x4*)&r[0] = 
*(uint2x4*)&IV512[0];
+	*(uint2x4*)&r[4] = *(uint2x4*)&IV512[4];
+
+	*(uint2x4*)&W[0] = __ldg4((uint2x4*)pHash);
+
+	W[4] = 0x8000000000000000; // end tag
+
+	#pragma unroll
+	for (uint32_t i = 5; i < 15; i++) W[i] = 0;
+
+	W[15] = 0x100; // 256 bits
+
+	#pragma unroll 16
+	for (int i = 0; i < 16; i ++){
+		sha512_step2(r, W[i], K_512[i], i&7);
+	}
+
+	#pragma unroll 5
+	for (uint32_t i = 16; i < 80; i+=16){
+		#pragma unroll
+		for (uint32_t j = 0; j<16; j++){
+			W[(i + j) & 15] += W[((i + j) - 7) & 15] + ssg5_0(W[((i + j) - 15) & 15]) + ssg5_1(W[((i + j) - 2) & 15]);
+		}
+		#pragma unroll
+		for (uint32_t j = 0; j<16; j++){
+			sha512_step2(r, W[j], K_512[i+j], (i+j)&7);
+		}
+	}
+
+	#pragma unroll 8
+	for (uint32_t i = 0; i < 8; i++)
+		r[i] = cuda_swab64(r[i] + IV512[i]);
+
+	*(uint2x4*)&pHash[0] = *(uint2x4*)&r[0];
+	*(uint2x4*)&pHash[4] = *(uint2x4*)&r[4];
+
+	}
+}
+
+__host__
+void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	lbry_sha512_gpu_hash_32 <<<grid, block>>> (threads, (uint64_t*)d_hash);
+}
+
+/**************************************************************************************************/
+
+__host__
+void lbry_sha512_init(int thr_id)
+{
+//	cudaMemcpyToSymbol(K_512, K512, 80*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+}
diff --git a/lbry/lbry.cu b/lbry/lbry.cu
new file mode 100644
index 0000000000..89f67cd64c
--- /dev/null
+++ b/lbry/lbry.cu
@@ -0,0 +1,242 @@
+/**
+ * Lbry Algo (sha-256 / sha-512 / ripemd)
+ *
+ * tpruvot and Provos Alexis - Jan 2017
+ *
+ * Sponsored by LBRY.IO team
+ */
+
+#include <stdio.h>
+#include <memory.h>
+
+extern "C" {
+#include <sph/sph_sha2.h>
+#include <sph/sph_ripemd.h>
+}
+
+#include <miner.h>
+#include <cuda_helper.h>
+
+#define A 64
+#define debug_cpu 0
+
+extern "C" void lbry_hash(void* output, const void* input)
+{
+	uint32_t _ALIGN(A) hashA[16];
+	uint32_t _ALIGN(A) hashB[8];
+	uint32_t _ALIGN(A) hashC[8];
+
+	sph_sha256_context ctx_sha256;
+	sph_sha512_context ctx_sha512;
+	sph_ripemd160_context ctx_ripemd;
+
+	sph_sha256_init(&ctx_sha256);
+	sph_sha256(&ctx_sha256, input, 112);
+	sph_sha256_close(&ctx_sha256, hashA);
+
+	sph_sha256(&ctx_sha256, hashA, 32);
+	sph_sha256_close(&ctx_sha256, hashA);
+
+	sph_sha512_init(&ctx_sha512);
+	sph_sha512(&ctx_sha512, hashA, 32);
+	sph_sha512_close(&ctx_sha512, hashA);
+
+	sph_ripemd160_init(&ctx_ripemd);
+	sph_ripemd160(&ctx_ripemd, hashA, 32); // sha512 low
+	sph_ripemd160_close(&ctx_ripemd, hashB);
+	if (debug_cpu) applog_hex(hashB, 20);
+
+	sph_ripemd160(&ctx_ripemd, &hashA[8], 32); // sha512 high
+	sph_ripemd160_close(&ctx_ripemd, hashC);
+	if (debug_cpu) applog_hex(hashC, 20);
+
+	sph_sha256(&ctx_sha256, hashB, 20);
+	sph_sha256(&ctx_sha256, hashC, 20);
+	sph_sha256_close(&ctx_sha256, hashA);
+	if (debug_cpu) applog_hex(hashA,32);
+
+	sph_sha256(&ctx_sha256, hashA, 32);
+	sph_sha256_close(&ctx_sha256, hashA);
+
+	memcpy(output, hashA, 32);
+}
+
+/* ############################################################################################################################### */
+
+extern void lbry_sha256_init(int thr_id);
+extern void lbry_sha256_free(int thr_id);
+extern void lbry_sha256_setBlock_112(uint32_t *pdata);
+extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash);
+extern void lbry_sha512_init(int thr_id);
+extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash);
+extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t 
*d_inputHash, uint32_t *d_resNonce, const uint64_t target64); + +extern void lbry_sha256_setBlock_112_merged(uint32_t *pdata); +extern void lbry_merged(int thr_id,uint32_t startNonce, uint32_t threads, uint32_t *d_resNonce, const uint64_t target64); + +static __inline uint32_t swab32_if(uint32_t val, bool iftrue) { + return iftrue ? swab32(val) : val; +} + +static bool init[MAX_GPUS] = { 0 }; + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; +// nonce position is different +#define LBC_NONCE_OFT32 27 + +extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(A) endiandata[28]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[LBC_NONCE_OFT32]; + const int swap = 0; // to toggle nonce endian (need kernel change) + + const int dev_id = device_map[thr_id]; + const bool merged_kernel = (device_sm[dev_id] > 500); + + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 22 : 20; + if (device_sm[dev_id] >= 600) intensity = 23; + if (device_sm[dev_id] < 350) intensity = 18; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) { + ptarget[7] = 0xf; + } + + if (!init[thr_id]){ + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + + if (CUDART_VERSION == 6050) { + applog(LOG_ERR, "This lbry kernel is not compatible with CUDA 6.5!"); + proper_exit(EXIT_FAILURE); + } + + if (!merged_kernel) + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput)); + + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_LOG_ERROR(); + + init[thr_id] = true; + } + + for (int i=0; i < LBC_NONCE_OFT32; i++) { + be32enc(&endiandata[i], pdata[i]); + } + + if (merged_kernel) + lbry_sha256_setBlock_112_merged(endiandata); + else + lbry_sha256_setBlock_112(endiandata); + + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + + do { + uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; + + // Hash with CUDA + if (merged_kernel) { + lbry_merged(thr_id, pdata[LBC_NONCE_OFT32], throughput, d_resNonce[thr_id], AS_U64(&ptarget[6])); + } else { + lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]); + lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]); + lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6])); + } + + *hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput; + + cudaMemcpy(resNonces, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + + if (resNonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(A) vhash[8]; + const uint32_t Htarg = ptarget[7]; + const uint32_t startNonce = pdata[LBC_NONCE_OFT32]; + resNonces[0] += startNonce; + + endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[0], !swap); + lbry_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + work->nonces[0] = swab32_if(resNonces[0], swap); + work_set_target_ratio(work, vhash); + work->valid_nonces = 1; + + if (resNonces[1] != UINT32_MAX) + { + resNonces[1] += 
startNonce; + endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[1], !swap); + lbry_hash(vhash, endiandata); + work->nonces[1] = swab32_if(resNonces[1], swap); + + if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) { + // best first + xchg(work->nonces[1], work->nonces[0]); + work->sharediff[1] = work->sharediff[0]; + work->shareratio[1] = work->shareratio[0]; + work_set_target_ratio(work, vhash); + } else { + bn_set_target_ratio(work, vhash, 1); + } + work->valid_nonces++; + } + + pdata[LBC_NONCE_OFT32] = max(work->nonces[0], work->nonces[1]); // next scan start + + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", resNonces[0]); + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + } + } + + if ((uint64_t) throughput + pdata[LBC_NONCE_OFT32] >= max_nonce) { + pdata[LBC_NONCE_OFT32] = max_nonce; + break; + } + + pdata[LBC_NONCE_OFT32] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce; + + return 0; +} + +// cleanup +void free_lbry(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + if(device_sm[device_map[thr_id]] <= 500) + cudaFree(d_hash[thr_id]); + + cudaFree(d_resNonce[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/lyra2/Lyra2.c b/lyra2/Lyra2.c new file mode 100644 index 0000000000..256af78ebe --- /dev/null +++ b/lyra2/Lyra2.c @@ -0,0 +1,387 @@ +/** + * Implementation of the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include + +#include "Lyra2.h" +#include "Sponge.h" + +/** + * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords + * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits, + * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all + * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value + * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols). 
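As a usage sketch (not part of this patch): Lyra2REv2-style chains call LYRA2 with a 32-byte key, the previous 32-byte hash doubling as password and salt, timeCost 1 and a 4x4 matrix; the exact parameters should be checked against the coin in question.

```c
#include <stdint.h>
#include "Lyra2.h"

/* Illustrative only: parameter set used by Lyra2REv2-style chains
 * (32-byte key, previous 32-byte hash as both pwd and salt, T=1, R=4, C=4). */
int lyra2v2_step(uint8_t out[32], const uint8_t prev_hash[32])
{
	return LYRA2(out, 32, prev_hash, 32, prev_hash, 32, 1, 4, 4);
}
```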
+ * + * @param K The derived key to be output by the algorithm + * @param kLen Desired key length + * @param pwd User password + * @param pwdlen Password length + * @param salt Salt + * @param saltlen Salt length + * @param timeCost Parameter to determine the processing time (T) + * @param nRows Number or rows of the memory matrix (R) + * @param nCols Number of columns of the memory matrix (C) + * + * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) + */ +int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) +{ + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? 
BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; + + size_t sz = (size_t)ROW_LEN_BYTES * nRows; + uint64_t *wholeMatrix = malloc(sz); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, sz); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); + if (memMatrix == NULL) { + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + + byte *ptrByte = (byte*) wholeMatrix; + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen))); + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the password + ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block + //==========================================================================/ + + //======================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + uint64_t state[16]; + initState(state); + //==========================================================================/ + + //================================ Setup Phase =============================// + //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits + ptrWord = wholeMatrix; + for (i = 0; i < nBlocksInput; i++) { + absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) + } + + //Initializes M[0] and M[1] + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); + + do { + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + + 
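The comment above summarizes the recurrence computed by the Setup loop that follows. The standalone sketch below (not part of the patch) simply replays the row / row* / window schedule for a small power-of-two nRows, which can be handy when validating a GPU port against this reference.

```c
#include <stdio.h>
#include <stdint.h>

/* Replays the Setup-phase visitation order used by LYRA2 for a power-of-two nRows. */
int main(void)
{
	const int64_t nRows = 16;
	int64_t row = 2, prev = 1, rowa = 0, step = 1, window = 2, gap = 1;

	do {
		printf("row=%2lld  prev=%2lld  rowa=%2lld\n",
		       (long long)row, (long long)prev, (long long)rowa);
		rowa = (rowa + step) & (window - 1); /* deterministic row* during Setup */
		prev = row;
		row++;
		if (rowa == 0) {                     /* whole window visited */
			step = window + gap;         /* approximately doubles the step */
			window *= 2;                 /* doubles the re-visitation window */
			gap = -gap;
		}
	} while (row < nRows);
	return 0;
}
```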
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + //update prev: it now points to the last row ever computed + prev = row; + //updates row: goes to the next row to be computed + row++; + + //Checks if all rows in the window where visited. + if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; + do { + //Selects a pseudorandom index row* + //------------------------------------------------------------------------------------------ + rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlock(state, memMatrix[rowa]); + + //Squeezes the key + squeeze(state, K, (unsigned int) kLen); + + //========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + return 0; +} + +int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) +{ + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy + uint64_t instance = 0; + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it 
=============// + //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; + + size_t sz = (size_t)ROW_LEN_BYTES * nRows; + uint64_t *wholeMatrix = malloc(sz); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, sz); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); + if (memMatrix == NULL) { + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + + byte *ptrByte = (byte*) wholeMatrix; + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen))); + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the password + ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block + //==========================================================================/ + + //======================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + uint64_t state[16]; + initState(state); + //==========================================================================/ + + //================================ Setup Phase =============================// + //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits + ptrWord = wholeMatrix; + for (i = 0; i < nBlocksInput; i++) { + absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) + } + + //Initializes 
M[0] and M[1] + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); + + do { + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + //update prev: it now points to the last row ever computed + prev = row; + //updates row: goes to the next row to be computed + row++; + + //Checks if all rows in the window where visited. + if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1; + do { + //Selects a pseudorandom index row* (the only change in REv3) + //------------------------------------------------------------------------------------------ + instance = state[instance & 0xF]; + rowa = state[instance & 0xF] & (unsigned int)(nRows-1); + + //rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlock(state, memMatrix[rowa]); + + //Squeezes the key + squeeze(state, K, (unsigned int) kLen); + + //========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + return 0; +} diff --git a/lyra2/Lyra2.h b/lyra2/Lyra2.h new file mode 100644 index 0000000000..f866462a9a --- /dev/null +++ b/lyra2/Lyra2.h @@ -0,0 +1,43 @@ +/** + * Header file for the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef LYRA2_H_ +#define LYRA2_H_ + +#include + +typedef unsigned char byte; + +//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED) +#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t) +#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes + + +#ifdef BLOCK_LEN_BITS + #define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length: 768 bits (=96 bytes, =12 uint64_t) + #define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes +#else //default block lenght: 768 bits + #define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t) + #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes +#endif + +int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); +int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); + +#endif /* LYRA2_H_ */ diff --git a/lyra2/Lyra2Z.c b/lyra2/Lyra2Z.c new file mode 100644 index 0000000000..edf463b1d1 --- /dev/null +++ b/lyra2/Lyra2Z.c @@ -0,0 +1,215 @@ +/** + * Implementation of the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include + +#include "Lyra2Z.h" +#include "Sponge.h" + +/** + * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords + * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits, + * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all + * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value + * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols). 
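As a usage sketch (illustrative, not from this patch): lyra2z-style chains typically call LYRA2Z with the 32-byte blake256 hash as both password and salt, timeCost 8 and an 8x8 matrix; verify the exact parameters against the chain's specification.

```c
#include <stdint.h>
#include "Lyra2Z.h"

/* Illustrative only: the parameter set commonly used by lyra2z
 * (32-byte key, 32-byte blake256 hash as pwd and salt, T=8, R=8, C=8). */
int lyra2z_step(uint8_t out[32], const uint8_t blake_hash[32])
{
	return LYRA2Z(out, 32, blake_hash, 32, blake_hash, 32, 8, 8, 8);
}
```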
+ * + * @param K The derived key to be output by the algorithm + * @param kLen Desired key length + * @param pwd User password + * @param pwdlen Password length + * @param salt Salt + * @param saltlen Salt length + * @param timeCost Parameter to determine the processing time (T) + * @param nRows Number or rows of the memory matrix (R) + * @param nCols Number of columns of the memory matrix (C) + * + * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) + */ +int LYRA2Z(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) +{ + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64; + + size_t sz = (size_t)ROW_LEN_BYTES * nRows; + uint64_t *wholeMatrix = malloc(sz); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, sz); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); + if (memMatrix == NULL) { + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + + byte *ptrByte = (byte*) wholeMatrix; + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen))); + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + 
memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the password + ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block + //==========================================================================/ + + //======================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + uint64_t state[16]; + initState(state); + //==========================================================================/ + + //================================ Setup Phase =============================// + //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits + ptrWord = wholeMatrix; + for (i = 0; i < nBlocksInput; i++) { + absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) + } + + //Initializes M[0] and M[1] + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); + + do { + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + //update prev: it now points to the last row ever computed + prev = row; + //updates row: goes to the next row to be computed + row++; + + //Checks if all rows in the window where visited. + if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; + do { + //Selects a pseudorandom index row* + //------------------------------------------------------------------------------------------ + rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlock(state, memMatrix[rowa]); + + //Squeezes the key + squeeze(state, K, (unsigned int) kLen); + + //========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + return 0; +} + diff --git a/lyra2/Lyra2Z.h b/lyra2/Lyra2Z.h new file mode 100644 index 0000000000..aaade365d0 --- /dev/null +++ b/lyra2/Lyra2Z.h @@ -0,0 +1,42 @@ +/** + * Header file for the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef LYRA2Z_H_ +#define LYRA2Z_H_ + +#include + +typedef unsigned char byte; + +//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED) +#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t) +#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes + + +#ifdef BLOCK_LEN_BITS + #define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length: 768 bits (=96 bytes, =12 uint64_t) + #define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes +#else //default block lenght: 768 bits + #define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t) + #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes +#endif + +int LYRA2Z(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); + +#endif /* LYRA2_H_ */ diff --git a/lyra2/Sponge.c b/lyra2/Sponge.c new file mode 100644 index 0000000000..a698229dfa --- /dev/null +++ b/lyra2/Sponge.c @@ -0,0 +1,410 @@ +/** + * A simple implementation of Blake2b's internal permutation + * in the form of a sponge. + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include "Sponge.h" +#include "Lyra2.h" + + +/** + * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder + * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges + * typically have their internal state initialized with zeros, Blake2b's G function + * has a fixed point: if the internal state and message are both filled with zeros. the + * resulting permutation will always be a block filled with zeros; this happens because + * Blake2b does not use the constants originally employed in Blake2 inside its G function, + * relying on the IV for avoiding possible fixed points. + * + * @param state The 1024-bit array to be initialized + */ +void initState(uint64_t state[/*16*/]) { + //First 512 bis are zeros + memset(state, 0, 64); + //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV + state[8] = blake2b_IV[0]; + state[9] = blake2b_IV[1]; + state[10] = blake2b_IV[2]; + state[11] = blake2b_IV[3]; + state[12] = blake2b_IV[4]; + state[13] = blake2b_IV[5]; + state[14] = blake2b_IV[6]; + state[15] = blake2b_IV[7]; +} + +/** + * Execute Blake2b's G function, with all 12 rounds. 
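ROUND_LYRA is defined in Sponge.h and applies Blake2b's G function, without message injection, to the four columns and then the four diagonals of the 16-word state. A plain-C sketch of that quarter-round with the standard Blake2b rotation constants (32, 24, 16, 63):

```c
#include <stdint.h>

static inline uint64_t rotr64(uint64_t w, unsigned c)
{
	return (w >> c) | (w << (64 - c));
}

/* Blake2b's G without message words, as used by the Lyra2 sponge:
 * one ROUND_LYRA applies this to the four columns, then the four diagonals. */
static inline void g_lyra(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d)
{
	*a += *b; *d = rotr64(*d ^ *a, 32);
	*c += *d; *b = rotr64(*b ^ *c, 24);
	*a += *b; *d = rotr64(*d ^ *a, 16);
	*c += *d; *b = rotr64(*b ^ *c, 63);
}
```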
+ * + * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function + */ +__inline static void blake2bLyra(uint64_t *v) { + ROUND_LYRA(0); + ROUND_LYRA(1); + ROUND_LYRA(2); + ROUND_LYRA(3); + ROUND_LYRA(4); + ROUND_LYRA(5); + ROUND_LYRA(6); + ROUND_LYRA(7); + ROUND_LYRA(8); + ROUND_LYRA(9); + ROUND_LYRA(10); + ROUND_LYRA(11); +} + +/** + * Executes a reduced version of Blake2b's G function with only one round + * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function + */ +__inline static void reducedBlake2bLyra(uint64_t *v) { + ROUND_LYRA(0); +} + +/** + * Performs a squeeze operation, using Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param out Array that will receive the data squeezed + * @param len The number of bytes to be squeezed into the "out" array + */ +void squeeze(uint64_t *state, byte *out, unsigned int len) +{ + int fullBlocks = len / BLOCK_LEN_BYTES; + byte *ptr = out; + int i; + //Squeezes full blocks + for (i = 0; i < fullBlocks; i++) { + memcpy(ptr, state, BLOCK_LEN_BYTES); + blake2bLyra(state); + ptr += BLOCK_LEN_BYTES; + } + + //Squeezes remaining bytes + memcpy(ptr, state, (len % BLOCK_LEN_BYTES)); +} + +/** + * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words + * of type uint64_t), using Blake2b's G function as the internal permutation + * + * @param state The current state of the sponge + * @param in The block to be absorbed (BLOCK_LEN_INT64 words) + */ +void absorbBlock(uint64_t *state, const uint64_t *in) +{ + //XORs the first BLOCK_LEN_INT64 words of "in" with the current state + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + state[8] ^= in[8]; + state[9] ^= in[9]; + state[10] ^= in[10]; + state[11] ^= in[11]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); +} + +/** + * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64 + * words of type uint64_t), using Blake2b's G function as the internal permutation + * + * @param state The current state of the sponge + * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) + */ +void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) +{ + //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state + + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); +} + +/** + * Performs a reduced squeeze operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowOut Row to receive the data squeezed + */ +void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, const uint32_t nCols) +{ + uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + unsigned int i; + //M[row][C-1-col] = H.reduced_squeeze() + for (i = 0; i < nCols; i++) { + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + 
ptrWord[11] = state[11]; + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + } +} + +/** + * Performs a reduced duplex operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowIn Row to feed the sponge + * @param rowOut Row to receive the sponge's output + */ +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) { + + //Absorbing "M[prev][col]" + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= (ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][C-1-col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + //Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left and N_COLS is a system parameter. 
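In this reference code rotW amounts to rotating rand by one full 64-bit word before XORing it into M[rowInOut]: the unrolled statements below give word i the sponge's rate word (i + 11) mod 12. A compact equivalent, written as a sketch under the assumption BLOCK_LEN_INT64 = 12:

```c
#include <stdint.h>

#define BLOCK_LEN_INT64 12

/* Equivalent of the unrolled "M[row*][col] ^= rotW(rand)" updates:
 * word i of the column gets word (i + 11) mod 12 of the sponge's rate. */
static void xor_rotw(uint64_t *col_inout, const uint64_t *state)
{
	for (int i = 0; i < BLOCK_LEN_INT64; i++)
		col_inout[i] ^= state[(i + BLOCK_LEN_INT64 - 1) % BLOCK_LEN_INT64];
}
```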
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) { + + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][col] = M[rowOut][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
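+ * (Unlike the setup variant above, this phase walks the output row from the
+ * lowest to the highest column index and XORs "rand" into the existing
+ * contents of M[rowOut] rather than overwriting them.)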
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols) +{ + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + unsigned int i; + + for (i = 0; i < nCols; i++) { + + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } +} + +/** + * Prints an array of unsigned chars + */ +void printArray(unsigned char *array, unsigned int size, char *name) +{ + unsigned int i; + printf("%s: ", name); + for (i = 0; i < size; i++) { + printf("%2x|", array[i]); + } + printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/lyra2/Sponge.h b/lyra2/Sponge.h new file mode 100644 index 0000000000..7fcd09342e --- /dev/null +++ b/lyra2/Sponge.h @@ -0,0 +1,88 @@ +/** + * Header file for Blake2b's internal permutation in the form of a sponge. + * This code is based on the original Blake2b's implementation provided by + * Samuel Neves (https://blake2.net/) + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPONGE_H_ +#define SPONGE_H_ + +#include + +/* Blake2b IV Array */ +static const uint64_t blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/* Blake2b's rotation */ +static __inline uint64_t rotr64(const uint64_t w, const unsigned c) { +#ifdef _MSC_VER + return _rotr64(w, c); +#else + return ( w >> c ) | ( w << ( 64 - c ) ); +#endif +} + +/* Blake2b's G function */ +#define G(r,i,a,b,c,d) do { \ + a = a + b; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while(0) + + +/*One Round of the Blake2b's compression function*/ +#define ROUND_LYRA(r) \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); + +//---- Housekeeping +void initState(uint64_t state[/*16*/]); + +//---- Squeezes +void squeeze(uint64_t *state, unsigned char *out, unsigned int len); +void reducedSqueezeRow0(uint64_t* state, uint64_t* row, const uint32_t nCols); + +//---- Absorbs +void absorbBlock(uint64_t *state, const uint64_t *in); +void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); + +//---- Duplexes +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, const uint32_t nCols); +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols); +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols); + +//---- Misc +void printArray(unsigned char *array, unsigned int size, char *name); + +#endif /* SPONGE_H_ */ diff --git a/lyra2/allium.cu b/lyra2/allium.cu new file mode 100644 index 0000000000..65dbbe3a77 --- /dev/null +++ b/lyra2/allium.cu @@ -0,0 +1,217 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_keccak.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_skein.h" +#include "sph/sph_groestl.h" +#include "lyra2/Lyra2.h" +} + +#include +#include + +static uint64_t* d_hash[MAX_GPUS]; +static uint64_t* d_matrix[MAX_GPUS]; + +extern void blake256_cpu_init(int thr_id, uint32_t threads); +extern void blake256_cpu_setBlock_80(uint32_t *pdata); +//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +//extern void keccak256_sm3_init(int thr_id, uint32_t threads); +//extern void keccak256_sm3_free(int thr_id); + +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, 
uint64_t *Hash, int order); + +extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); + +extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); + +extern void skein256_cpu_init(int thr_id, uint32_t threads); + +extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti); + +extern void groestl256_cpu_init(int thr_id, uint32_t threads); +extern void groestl256_cpu_free(int thr_id); +extern void groestl256_setTarget(const void *ptarget); +extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order); +extern uint32_t groestl256_getSecNonce(int thr_id, int num); + + +extern "C" void allium_hash(void *state, const void *input) +{ + uint32_t hashA[8], hashB[8]; + + sph_blake256_context ctx_blake; + sph_keccak256_context ctx_keccak; + sph_cubehash256_context ctx_cube; + sph_skein256_context ctx_skein; + sph_groestl256_context ctx_groestl; + + sph_blake256_set_rounds(14); + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + sph_keccak256_init(&ctx_keccak); + sph_keccak256(&ctx_keccak, hashA, 32); + sph_keccak256_close(&ctx_keccak, hashB); + + LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashA, 32); + sph_cubehash256_close(&ctx_cube, hashB); + + LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + + sph_skein256_init(&ctx_skein); + sph_skein256(&ctx_skein, hashA, 32); + sph_skein256_close(&ctx_skein, hashB); + + sph_groestl256_init(&ctx_groestl); + sph_groestl256(&ctx_groestl, hashB, 32); + sph_groestl256_close(&ctx_groestl, hashA); + + memcpy(state, hashA, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static __thread uint32_t throughput = 0; + +extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + if (opt_benchmark) + ptarget[7] = 0x00ff; + + static __thread bool gtx750ti; + if (!init[thr_id]) + { + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; + if (device_sm[device_map[thr_id]] == 500) intensity = 15; + throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + + if (strstr(props.name, "750 Ti")) gtx750ti = true; + else gtx750ti = false; + + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + blake256_cpu_init(thr_id, throughput); + //keccak256_sm3_init(thr_id, throughput); + skein256_cpu_init(thr_id, throughput); + groestl256_cpu_init(thr_id, throughput); + + //cuda_get_arch(thr_id); + if (device_sm[dev_id] >= 500) + { + size_t matrix_sz = device_sm[dev_id] > 500 ? 
sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + init[thr_id] = true; + } + + uint32_t _ALIGN(128) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256_cpu_setBlock_80(pdata); + groestl256_setTarget(ptarget); + + do { + int order = 0; + + //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); + cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + allium_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = groestl256_getSecNonce(thr_id, 1); + if (work->nonces[1] != UINT32_MAX) { + be32enc(&endiandata[19], work->nonces[1]); + allium_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_allium(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_matrix[thr_id]); + + //keccak256_sm3_free(thr_id); + groestl256_cpu_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu new file mode 100644 index 0000000000..5cdb6ee3a3 --- /dev/null +++ b/lyra2/cuda_lyra2.cu @@ -0,0 +1,662 @@ +/** + * Lyra2 (v1) cuda implementation based on djm34 work + * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) + * tpruvot@github 2018 for phi2 double lyra2-32 support + */ + +#include +#include + +#define TPB52 32 + +#include "cuda_lyra2_sm2.cuh" +#include "cuda_lyra2_sm5.cuh" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 520 +#endif + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ > 500 + +#include "cuda_lyra2_vectors.h" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); +#endif + +#define Nrow 8 +#define Ncol 8 +#define memshift 3 + +#define BUF_COUNT 0 + +__device__ uint2 *DMatrix; + +__device__ 
__forceinline__ void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); +#elif BUF_COUNT == 0 + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +#else + if (row < BUF_COUNT) + { + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); + } + else + { + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; + } +#endif +} + +__device__ __forceinline__ void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + +#elif BUF_COUNT == 0 + #pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; + +#else + if (row < BUF_COUNT) + { + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + } + else + { + #pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; + } +#endif +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + 
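+	// Shared-memory emulation of three __shfl() exchanges for devices without
+	// warp shuffle: each thread publishes one value, then reads the slot of
+	// the requested source lane within its own c-wide group, i.e. index
+	// (thread & ~(c-1)) + (bN & (c-1)). The __threadfence_block() calls order
+	// the shared-memory accesses between the successive exchanges.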
extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif + +#if __CUDA_ARCH__ > 500 || !defined(__CUDA_ARCH) +static __device__ __forceinline__ +void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; uint2 tmp = d; d.y = a.x ^ tmp.x; d.x = a.y ^ tmp.y; + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); + c += d; b ^= c; b = ROR2(b, 63); +} +#endif + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + +static __device__ __forceinline__ +void round_lyra(uint2x4* s) +{ + Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); + Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads) +{ + uint2 state1[3]; + + #pragma unroll + for (int i = 0; i < Nrow; i++) + { + ST4S(0, Ncol - i - 1, state, thread, threads); + + round_lyra(state); + } + + #pragma unroll 4 + for (int i = 0; i < Nrow; i++) + { + LD4S(state1, 0, i, thread, threads); + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + ST4S(1, Ncol - i - 1, state1, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3]; + + #pragma unroll 1 + for (int i = 0; i < Nrow; i++) + { + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + ST4S(rowOut, Ncol - i - 1, state1, thread, threads); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + ST4S(rowInOut, i, state2, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + for (int i = 0; i < Nrow; i++) + { + uint2 state1[3], state2[3]; + + LD4S(state1, rowIn, i, 
thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + ST4S(rowInOut, i, state2, thread, threads); + + LD4S(state1, rowOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + ST4S(rowOut, i, state1, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3], last[3]; + + LD4S(state1, 2, 0, thread, threads); + LD4S(last, rowInOut, 0, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + last[j]; + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == 5) + { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < Nrow; i++) + { + LD4S(state1, 2, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__constant__ uint2x4 blake2b_IV[2] = { + 0xf3bcc908lu, 0x6a09e667lu, + 0x84caa73blu, 0xbb67ae85lu, + 0xfe94f82blu, 0x3c6ef372lu, + 0x5f1d36f1lu, 0xa54ff53alu, + 0xade682d1lu, 0x510e527flu, + 0x2b3e6c1flu, 0x9b05688clu, + 0xfb41bd6blu, 0x1f83d9ablu, + 0x137e2179lu, 0x5be0cd19lu +}; + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<24; i++) + round_lyra(state); //because 12 is not enough + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB52, 1) +void lyra2_gpu_hash_32_2(const uint32_t threads, uint64_t *g_hash) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + if (thread < threads) + { + uint2 state[4]; + state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + 
threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]); + + reduceDuplex(state, thread, threads); + reduceDuplexRowSetup(1, 0, 2, state, thread, threads); + reduceDuplexRowSetup(2, 1, 3, state, thread, threads); + reduceDuplexRowSetup(3, 0, 4, state, thread, threads); + reduceDuplexRowSetup(4, 3, 5, state, thread, threads); + reduceDuplexRowSetup(5, 2, 6, state, thread, threads); + reduceDuplexRowSetup(6, 1, 7, state, thread, threads); + + uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(7, rowa, 0, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(0, rowa, 3, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(3, rowa, 6, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(6, rowa, 1, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(1, rowa, 4, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(4, rowa, 7, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(7, rowa, 2, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt_8(rowa, state, thread, threads); + + DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + g_hash[thread + threads * 0] = state[0].x; + g_hash[thread + threads * 1] = state[0].y; + g_hash[thread + threads * 2] = state[0].z; + g_hash[thread + threads * 3] = state[0].w; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *psrc = (uint2*)(&d_hash_512[offset]); + state[0].x = state[1].x = __ldg(&psrc[0]); + state[0].y = state[1].y = __ldg(&psrc[1]); + state[0].z = state[1].z = __ldg(&psrc[2]); + state[0].w = state[1].w = __ldg(&psrc[3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<24; i++) + round_lyra(state); + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) +{ + // This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 rounds + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < 
threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *pdst = (uint2*)(&d_hash_512[offset]); + pdst[0] = state[0].x; + pdst[1] = state[0].y; + pdst[2] = state[0].z; + pdst[3] = state[0].w; + } +} +#else +#if __CUDA_ARCH__ < 500 + +/* for unsupported SM arch */ +__device__ void* DMatrix; +#endif +__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {} +__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {} +__global__ void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {} +#endif + +__host__ +void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) +{ + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); +} + +__host__ +void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx750ti) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + + uint32_t tpb = TPB52; + + if (cuda_arch[dev_id] >= 520) tpb = TPB52; + else if (cuda_arch[dev_id] >= 500) tpb = TPB50; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + + dim3 grid1((threads * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); + + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + dim3 grid3((threads + tpb - 1) / tpb); + dim3 block3(tpb); + + if (cuda_arch[dev_id] >= 520) + { + lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash); + lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash); + lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash); + } + else if (cuda_arch[dev_id] >= 500) + { + size_t shared_mem = 0; + + if (gtx750ti) + // suitable amount to adjust for 8warp + shared_mem = 8192; + else + // suitable amount to adjust for 10warp + shared_mem = 6144; + + lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash); + lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); + } + else + lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash); +} + +__host__ +void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; + if (cuda_arch[dev_id] >= 520) tpb = TPB52; + else if (cuda_arch[dev_id] >= 500) tpb = TPB50; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + + dim3 grid1((size_t(threads) * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); + + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + if (cuda_arch[dev_id] >= 520) + { + const size_t shared_mem = sizeof(uint2) * tpb * 192; // 49152; + lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256); + lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + + lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, 
(uint2*)d_hash_512, 1); + lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256); + lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + } + else if (cuda_arch[dev_id] >= 500) + { + size_t shared_mem = gtx750ti ? 8192 : 6144; // 8 or 10 warps + lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256); + lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + + lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256); + lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + } + else { + // alternative method for SM 3.x + hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0); + lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti); + hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0); + hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1); + lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti); + hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1); + } +} diff --git a/lyra2/cuda_lyra2Z.cu b/lyra2/cuda_lyra2Z.cu new file mode 100644 index 0000000000..3a7e565fd7 --- /dev/null +++ b/lyra2/cuda_lyra2Z.cu @@ -0,0 +1,973 @@ +/** + * Lyra2 (v1) cuda implementation based on djm34 work + * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) + * Lyra2Z implentation for Zcoin based on all the previous + * djm34 2017 + **/ + +#include +#include + +#define TPB52 32 +#define TPB30 160 +#define TPB20 160 + +#include "cuda_lyra2Z_sm5.cuh" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); +#define atomicMin() +#define __CUDA_ARCH__ 520 +#endif + +static uint32_t *h_GNonces[16]; // this need to get fixed as the rest of that routine +static uint32_t *d_GNonces[16]; + +#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \ + for (int i = 0; i < 8; i++) { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ + round_lyra_sm2(state); \ + for (int j = 0; j < 12; j++) \ + Matrix[j + 12 * i][rowOut] ^= state[j]; \ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ + Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ + } \ + } + +#define absorbblock(in) { \ + state[0] ^= Matrix[0][in]; \ + state[1] ^= Matrix[1][in]; \ + state[2] ^= Matrix[2][in]; \ + state[3] ^= Matrix[3][in]; \ + state[4] ^= Matrix[4][in]; \ + state[5] ^= Matrix[5][in]; \ + state[6] ^= Matrix[6][in]; \ + state[7] ^= Matrix[7][in]; \ + state[8] ^= Matrix[8][in]; \ + state[9] ^= Matrix[9][in]; \ + state[10] ^= Matrix[10][in]; \ + state[11] ^= Matrix[11][in]; \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + 
round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + } + +__device__ __forceinline__ +static void round_lyra_sm2(uint2 *s) +{ + Gfunc(s[0], s[4], s[8], s[12]); + Gfunc(s[1], s[5], s[9], s[13]); + Gfunc(s[2], s[6], s[10], s[14]); + Gfunc(s[3], s[7], s[11], s[15]); + Gfunc(s[0], s[5], s[10], s[15]); + Gfunc(s[1], s[6], s[11], s[12]); + Gfunc(s[2], s[7], s[8], s[13]); + Gfunc(s[3], s[4], s[9], s[14]); +} + +__device__ __forceinline__ +void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8]) +{ +#if __CUDA_ARCH__ > 500 +#pragma unroll +#endif + for (int i = 0; i < 8; i++) + { + #pragma unroll + for (int j = 0; j < 12; j++) + state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; + + round_lyra_sm2(state); + + #pragma unroll + for (int j = 0; j < 12; j++) + Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; + + Matrix[0 + 12 * i][rowInOut] ^= state[11]; + Matrix[1 + 12 * i][rowInOut] ^= state[0]; + Matrix[2 + 12 * i][rowInOut] ^= state[1]; + Matrix[3 + 12 * i][rowInOut] ^= state[2]; + Matrix[4 + 12 * i][rowInOut] ^= state[3]; + Matrix[5 + 12 * i][rowInOut] ^= state[4]; + Matrix[6 + 12 * i][rowInOut] ^= state[5]; + Matrix[7 + 12 * i][rowInOut] ^= state[6]; + Matrix[8 + 12 * i][rowInOut] ^= state[7]; + Matrix[9 + 12 * i][rowInOut] ^= state[8]; + Matrix[10 + 12 * i][rowInOut] ^= state[9]; + Matrix[11 + 12 * i][rowInOut] ^= state[10]; + } +} + +#if __CUDA_ARCH__ < 350 + +__constant__ static uint2 blake2b_IV_sm2[8] = { + { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, + { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }, + { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, + { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } +}; + +__global__ __launch_bounds__(TPB30, 1) +void lyra2Z_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNonces) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint2 Mask[8] = { + { 0x00000020, 0x00000000 },{ 0x00000020, 0x00000000 }, + { 0x00000020, 0x00000000 },{ 0x00000008, 0x00000000 }, + { 0x00000008, 0x00000000 },{ 0x00000008, 0x00000000 }, + { 0x00000080, 0x00000000 },{ 0x00000000, 0x01000000 } + }; + if (thread < threads) + { + uint2 state[16]; + + #pragma unroll + for (int i = 0; i<4; i++) { + LOHI(state[i].x, state[i].y, g_hash[threads*i + thread]); + } //password + + #pragma unroll + for (int i = 0; i<4; i++) { + state[i + 4] = state[i]; + } //salt + + #pragma unroll + for (int i = 0; i<8; i++) { + state[i + 8] = blake2b_IV_sm2[i]; + } + + // blake2blyra x2 + //#pragma unroll 24 + for (int i = 0; i<12; i++) { + round_lyra_sm2(state); + } + + for (int i = 0; i<8; i++) + state[i] ^= Mask[i]; + + for (int i = 0; i<12; i++) { + round_lyra_sm2(state); + } + + uint2 Matrix[96][8]; // not cool + + // reducedSqueezeRow0 + #pragma unroll 8 + for (int i = 0; i < 8; i++) + { + #pragma unroll 12 + for (int j = 0; j<12; j++) { + Matrix[j + 84 - 12 * i][0] = state[j]; + } + round_lyra_sm2(state); + } + + // reducedSqueezeRow1 + #pragma unroll 8 + for (int i = 0; i < 8; i++) + { + #pragma unroll 12 + for (int j = 0; j<12; j++) { + state[j] ^= Matrix[j + 12 * i][0]; + } + round_lyra_sm2(state); + #pragma unroll 12 + for (int j = 0; j<12; j++) { + Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; + } + } + + reduceDuplexRowSetup(1, 0, 2, state, Matrix); + reduceDuplexRowSetup(2, 1, 3, state, Matrix); + reduceDuplexRowSetup(3, 0, 4, state, Matrix); + 
reduceDuplexRowSetup(4, 3, 5, state, Matrix); + reduceDuplexRowSetup(5, 2, 6, state, Matrix); + reduceDuplexRowSetup(6, 1, 7, state, Matrix); + + uint32_t rowa; + uint32_t prev = 7; + uint32_t iterator = 0; + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + absorbblock(rowa); + uint32_t nonce = startNounce + thread; + if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) { + atomicMin(&resNonces[1], resNonces[0]); + atomicMin(&resNonces[0], nonce); + } + } //thread +} +#else +__global__ void lyra2Z_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNonces) {} +#endif + +#if __CUDA_ARCH__ > 500 + +#include "cuda_lyra2_vectors.h" +//#include "cuda_vector_uint2x4.h" + +#define Nrow 8 +#define Ncol 8 +#define memshift 3 + +#define BUF_COUNT 0 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ +void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); +#elif BUF_COUNT == 0 + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +#else + if (row < BUF_COUNT) { + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); + } else { + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; + } +#endif +} + +__device__ __forceinline__ +void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + +#elif BUF_COUNT == 0 + #pragma unroll + for (int j = 0; j < 
3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; +#else + if (row < BUF_COUNT) { + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + } else { + #pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; + } +#endif +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ +void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} +#endif + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + +static __device__ __forceinline__ +void round_lyra(uint2x4* s) +{ + Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); + Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads) +{ + uint2 state1[3]; + +#if __CUDA_ARCH__ > 500 +#pragma unroll +#endif + 
for (int i = 0; i < Nrow; i++) + { + ST4S(0, Ncol - i - 1, state, thread, threads); + + round_lyra(state); + } + + #pragma unroll 4 + for (int i = 0; i < Nrow; i++) + { + LD4S(state1, 0, i, thread, threads); + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + ST4S(1, Ncol - i - 1, state1, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3]; + + #pragma unroll 1 + for (int i = 0; i < Nrow; i++) + { + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + ST4S(rowOut, Ncol - i - 1, state1, thread, threads); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + ST4S(rowInOut, i, state2, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + for (int i = 0; i < Nrow; i++) + { + uint2 state1[3], state2[3]; + + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + ST4S(rowInOut, i, state2, thread, threads); + + LD4S(state1, rowOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + ST4S(rowOut, i, state1, thread, threads); + } +} + +#if 0 +static __device__ __forceinline__ +void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3], last[3]; + + LD4S(state1, 2, 0, thread, threads); + LD4S(last, rowInOut, 0, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + last[j]; + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == 5) + { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < Nrow; i++) + { + LD4S(state1, 2, i, 
thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} +#endif + +static __device__ __forceinline__ +void reduceDuplexRowt_8_v2(const int rowIn, const int rowOut, const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3], last[3]; + + LD4S(state1, rowIn, 0, thread, threads); + LD4S(last, rowInOut, 0, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + last[j]; + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == rowOut) { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < Nrow; i++) + { + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__global__ +__launch_bounds__(64, 1) +void lyra2Z_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint2x4 Mask[2] = { + 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000008UL, 0x00000000UL, + 0x00000008UL, 0x00000000UL, 0x00000008UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + const uint2x4 blake2b_IV[2] = { + 0xf3bcc908lu, 0x6a09e667lu, + 0x84caa73blu, 0xbb67ae85lu, + 0xfe94f82blu, 0x3c6ef372lu, + 0x5f1d36f1lu, 0xa54ff53alu, + 0xade682d1lu, 0x510e527flu, + 0x2b3e6c1flu, 0x9b05688clu, + 0xfb41bd6blu, 0x1f83d9ablu, + 0x137e2179lu, 0x5be0cd19lu + }; + if (thread < threads) + { + uint2x4 state[4]; + + state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<12; i++) + round_lyra(state); + + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for (int i = 0; i<12; i++) + round_lyra(state); //because 12 is not enough + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB52, 1) +void lyra2Z_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + + if (thread < threads) + { + uint2 state[4]; + state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]); + state[3] = 
__ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]); + + reduceDuplex(state, thread, threads); + reduceDuplexRowSetup(1, 0, 2, state, thread, threads); + reduceDuplexRowSetup(2, 1, 3, state, thread, threads); + reduceDuplexRowSetup(3, 0, 4, state, thread, threads); + reduceDuplexRowSetup(4, 3, 5, state, thread, threads); + reduceDuplexRowSetup(5, 2, 6, state, thread, threads); + reduceDuplexRowSetup(6, 1, 7, state, thread, threads); + + uint32_t rowa; // = WarpShuffle(state[0].x, 0, 4) & 7; + uint32_t prev = 7; + uint32_t iterator = 0; + + //for (uint32_t j=0;j<4;j++) { + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + + for (uint32_t i = 0; i<7; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + //} + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt_8_v2(prev,iterator,rowa, state, thread, threads); + + DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ +__launch_bounds__(64, 1) +void lyra2Z_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint28 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + uint32_t nonce = startNounce + thread; + if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) { + atomicMin(&resNonces[1], resNonces[0]); + atomicMin(&resNonces[0], nonce); + } +/* + g_hash[thread + threads * 0] = state[0].x; + g_hash[thread + threads * 1] = state[0].y; + g_hash[thread + threads * 2] = state[0].z; + g_hash[thread + 
threads * 3] = state[0].w; +*/ + } +} +#else +#if __CUDA_ARCH__ < 350 +__device__ void* DMatrix; +#endif +__global__ void lyra2Z_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2Z_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} +__global__ void lyra2Z_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) {} +#endif + +__host__ +void lyra2Z_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) +{ + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); + cudaMalloc(&d_GNonces[thr_id], 2 * sizeof(uint32_t)); + cudaMallocHost(&h_GNonces[thr_id], 2 * sizeof(uint32_t)); +} + +__host__ +void lyra2Z_cpu_init_sm2(int thr_id, uint32_t threads) +{ + // just assign the device pointer allocated in main loop + cudaMalloc(&d_GNonces[thr_id], 2 * sizeof(uint32_t)); + cudaMallocHost(&h_GNonces[thr_id], 2 * sizeof(uint32_t)); +} + +__host__ +void lyra2Z_cpu_free(int thr_id) +{ + cudaFree(d_GNonces[thr_id]); + cudaFreeHost(h_GNonces[thr_id]); +} + +__host__ +uint32_t lyra2Z_getSecNonce(int thr_id, int num) +{ + uint32_t results[2]; + memset(results, 0xFF, sizeof(results)); + cudaMemcpy(results, d_GNonces[thr_id], sizeof(results), cudaMemcpyDeviceToHost); + if (results[1] == results[0]) + return UINT32_MAX; + return results[num]; +} + +__host__ +void lyra2Z_setTarget(const void *pTargetIn) +{ + cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice); +} + +__host__ +uint32_t lyra2Z_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti) +{ + uint32_t result = UINT32_MAX; + cudaMemset(d_GNonces[thr_id], 0xff, 2 * sizeof(uint32_t)); + int dev_id = device_map[thr_id % MAX_GPUS]; + + uint32_t tpb = TPB52; + + if (device_sm[dev_id] == 500) + tpb = TPB50; + if (device_sm[dev_id] == 200) + tpb = TPB20; + + dim3 grid1((threads * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); + + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + dim3 grid3((threads + tpb - 1) / tpb); + dim3 block3(tpb); + + if (device_sm[dev_id] >= 520) + { + lyra2Z_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + + lyra2Z_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash); + + lyra2Z_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash, d_GNonces[thr_id]); + } + else if (device_sm[dev_id] == 500 || device_sm[dev_id] == 350) + { + size_t shared_mem = 0; + + if (gtx750ti) + // suitable amount to adjust for 8warp + shared_mem = 8192; + else + // suitable amount to adjust for 10warp + shared_mem = 6144; + + lyra2Z_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + + lyra2Z_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash); + + lyra2Z_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash, d_GNonces[thr_id]); + } + else + lyra2Z_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash, d_GNonces[thr_id]); + + // get first found nonce + cudaMemcpy(h_GNonces[thr_id], d_GNonces[thr_id], 1 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + result = *h_GNonces[thr_id]; + + return result; +} diff --git a/lyra2/cuda_lyra2Z_sm5.cuh b/lyra2/cuda_lyra2Z_sm5.cuh new file mode 100644 index 0000000000..1d8ae68930 --- /dev/null +++ b/lyra2/cuda_lyra2Z_sm5.cuh @@ -0,0 +1,819 @@ +#include + +#ifdef 
__INTELLISENSE__ +/* just for vstudio code colors */ +//#define __CUDA_ARCH__ 500 +#define __threadfence_block() +#define __ldg(x) *(x) +#define atomicMin(p,y) y +#endif + +#include "cuda_helper.h" + +#define TPB50 32 + +__constant__ uint32_t pTarget[8]; + +static __device__ __forceinline__ +void Gfunc(uint2 & a, uint2 &b, uint2 &c, uint2 &d) +{ +#if __CUDA_ARCH__ > 500 + a += b; uint2 tmp = d; d.y = a.x ^ tmp.x; d.x = a.y ^ tmp.y; + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); + c += d; b ^= c; b = ROR2(b, 63); +#else + a += b; d ^= a; d = SWAPUINT2(d); + c += d; b ^= c; b = ROR2(b, 24); + a += b; d ^= a; d = ROR2(d, 16); + c += d; b ^= c; b = ROR2(b, 63); +#endif +} + +#if __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 350 +#include "cuda_lyra2_vectors.h" + +#define Nrow 8 +#define Ncol 8 +#define memshift 3 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +#if __CUDA_ARCH__ == 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} +#else // != 300 + +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif // != 300 + 
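+// Layout note: each hash is handled by a 4-lane slice (shuffle width 4); every
+// lane keeps one quarter of the 16-word sponge state in s[0..3]. The first Gfunc
+// below is the column step, WarpShuffle3 then rotates rows 1..3 across the slice
+// so the same Gfunc covers the diagonals, and the reverse shuffle restores the
+// original lane layout.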
+__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + +static __device__ __forceinline__ +void round_lyra(uint2x4* s) +{ + Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); + Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3]; + + const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x; + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift; + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state[j]); + round_lyra(state); + } + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j]; + } + + // 1, 0, 2 + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state2[j]); + } + + // 2, 1, 3 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + i * memshift* 
threads*blockDim.x; + const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s2 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } + + // 3, 0, 4 + for (int i = 0; i < 8; i++) + { + const uint32_t ls0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s3 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(ls0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s0 + j*threads*blockDim.x) = state2[j]; + } + + // 4, 3, 5 + for (int i = 0; i < 8; i++) + { + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s4 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s3 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state2[j]; + } + + // 5, 2, 6 + for (int i = 0; i < 8; i++) + { + 
const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s5 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + } + + // 6, 1, 7 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x; + const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s6 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x; + + #pragma unroll 1 + for (int i = 0; i < 8; i++) + { + uint2 state1[3], state2[3]; + + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x; + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); + } + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += state2[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = 
state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + { + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + *(DMatrix + s3 + j*threads*blockDim.x) ^= state[j]; + } + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + // const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x; + + uint2 state1[3], last[3]; + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x); + last[j] = *(DMatrix + ps2 + j*threads*blockDim.x); + } + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += last[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == 5) + { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < 8; i++) + { + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x); + + round_lyra(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +static __device__ __forceinline__ +void reduceDuplexRowV50_8_v2(const int rowIn, const int rowOut,const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * rowIn * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + // const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x; + + uint2 state1[3], last[3]; + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x); + last[j] = *(DMatrix + ps2 + j*threads*blockDim.x); + } + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += last[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == rowOut) + { +#pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < 8; i++) + { + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const 
uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x); + + round_lyra(state); + } + + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; + +} + + +__global__ __launch_bounds__(64, 1) +void lyra2Z_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + const uint2x4 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; + const uint2x4 Mask[2] = { + 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000008UL, 0x00000000UL, + 0x00000008UL, 0x00000000UL, 0x00000008UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + if (thread < threads) + { + uint2x4 state[4]; + + ((uint2*)state)[0] = __ldg(&g_hash[thread]); + ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); + ((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]); + ((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]); + + state[1] = state[0]; + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i < 12; i++) + round_lyra(state); //because 12 is not enough + + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for (int i = 0; i < 12; i++) + round_lyra(state); //because 12 is not enough + + + ((uint2x4*)DMatrix)[0 * threads + thread] = state[0]; + ((uint2x4*)DMatrix)[1 * threads + thread] = state[1]; + ((uint2x4*)DMatrix)[2 * threads + thread] = state[2]; + ((uint2x4*)DMatrix)[3 * threads + thread] = state[3]; + } +} + +__global__ __launch_bounds__(TPB50, 1) +void lyra2Z_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + + if (thread < threads) + { + uint2 state[4]; + + state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x]); + + reduceDuplexV5(state, thread, threads); + + uint32_t rowa; // = WarpShuffle(state[0].x, 0, 4) & 7; + uint32_t prev = 7; + uint32_t iterator = 0; + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 
3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<7; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50_8_v2(prev,iterator,rowa, state, thread, threads); + + DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2Z_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + uint2x4 state[4]; + + state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + uint32_t nonce = startNounce + thread; + if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) { + atomicMin(&resNonces[1], resNonces[0]); + atomicMin(&resNonces[0], nonce); + } + } +} + +#else +/* if __CUDA_ARCH__ != 500 .. 
host */ +__global__ void lyra2Z_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2Z_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2Z_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) {} +#endif diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh new file mode 100644 index 0000000000..cc0bd82d76 --- /dev/null +++ b/lyra2/cuda_lyra2_sm2.cuh @@ -0,0 +1,291 @@ +#include + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors, only uncomment that temporary, dont commit it */ +//#undef __CUDA_ARCH__ +//#define __CUDA_ARCH__ 300 +#endif + +#include "cuda_helper.h" + +#define TPB30 160 +#define TPB20 160 + +#if (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350) || !defined(__CUDA_ARCH__) +__constant__ static uint2 blake2b_IV_sm2[8] = { + { 0xf3bcc908, 0x6a09e667 }, + { 0x84caa73b, 0xbb67ae85 }, + { 0xfe94f82b, 0x3c6ef372 }, + { 0x5f1d36f1, 0xa54ff53a }, + { 0xade682d1, 0x510e527f }, + { 0x2b3e6c1f, 0x9b05688c }, + { 0xfb41bd6b, 0x1f83d9ab }, + { 0x137e2179, 0x5be0cd19 } +}; +#endif + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350 + +#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \ + for (int i = 0; i < 8; i++) { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ + round_lyra(state); \ + for (int j = 0; j < 12; j++) \ + Matrix[j + 12 * i][rowOut] ^= state[j]; \ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ + Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ + } \ + } + +#define absorbblock(in) { \ + state[0] ^= Matrix[0][in]; \ + state[1] ^= Matrix[1][in]; \ + state[2] ^= Matrix[2][in]; \ + state[3] ^= Matrix[3][in]; \ + state[4] ^= Matrix[4][in]; \ + state[5] ^= Matrix[5][in]; \ + state[6] ^= Matrix[6][in]; \ + state[7] ^= Matrix[7][in]; \ + state[8] ^= Matrix[8][in]; \ + state[9] ^= Matrix[9][in]; \ + state[10] ^= Matrix[10][in]; \ + state[11] ^= Matrix[11][in]; \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + } + +static __device__ __forceinline__ +void Gfunc(uint2 & a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d ^= a; d = SWAPUINT2(d); + c += d; b ^= c; b = ROR2(b, 24); + a += b; d ^= a; d = ROR2(d, 16); + c += d; b ^= c; b = ROR2(b, 63); +} + +__device__ __forceinline__ +static void round_lyra(uint2 *s) +{ + Gfunc(s[0], s[4], s[8], s[12]); + Gfunc(s[1], s[5], s[9], s[13]); + Gfunc(s[2], s[6], s[10], s[14]); + Gfunc(s[3], s[7], s[11], s[15]); + Gfunc(s[0], s[5], s[10], s[15]); + Gfunc(s[1], s[6], s[11], s[12]); + Gfunc(s[2], s[7], s[8], s[13]); + Gfunc(s[3], s[4], s[9], s[14]); +} + +__device__ __forceinline__ +void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8]) +{ +#if __CUDA_ARCH__ > 500 + #pragma unroll +#endif + for (int i = 0; i < 
8; i++) + { + #pragma unroll + for (int j = 0; j < 12; j++) + state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 12; j++) + Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; + + Matrix[0 + 12 * i][rowInOut] ^= state[11]; + Matrix[1 + 12 * i][rowInOut] ^= state[0]; + Matrix[2 + 12 * i][rowInOut] ^= state[1]; + Matrix[3 + 12 * i][rowInOut] ^= state[2]; + Matrix[4 + 12 * i][rowInOut] ^= state[3]; + Matrix[5 + 12 * i][rowInOut] ^= state[4]; + Matrix[6 + 12 * i][rowInOut] ^= state[5]; + Matrix[7 + 12 * i][rowInOut] ^= state[6]; + Matrix[8 + 12 * i][rowInOut] ^= state[7]; + Matrix[9 + 12 * i][rowInOut] ^= state[8]; + Matrix[10 + 12 * i][rowInOut] ^= state[9]; + Matrix[11 + 12 * i][rowInOut] ^= state[10]; + } +} + +__global__ __launch_bounds__(TPB30, 1) +void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2 state[16]; + + #pragma unroll + for (int i = 0; i<4; i++) { + LOHI(state[i].x, state[i].y, g_hash[threads*i + thread]); + } //password + + #pragma unroll + for (int i = 0; i<4; i++) { + state[i + 4] = state[i]; + } //salt + + #pragma unroll + for (int i = 0; i<8; i++) { + state[i + 8] = blake2b_IV_sm2[i]; + } + + // blake2blyra x2 + //#pragma unroll 24 + for (int i = 0; i<24; i++) { + round_lyra(state); + } //because 12 is not enough + + uint2 Matrix[96][8]; // not cool + + // reducedSqueezeRow0 + #pragma unroll 8 + for (int i = 0; i < 8; i++) + { + #pragma unroll 12 + for (int j = 0; j<12; j++) { + Matrix[j + 84 - 12 * i][0] = state[j]; + } + round_lyra(state); + } + + // reducedSqueezeRow1 + #pragma unroll 8 + for (int i = 0; i < 8; i++) + { + #pragma unroll 12 + for (int j = 0; j<12; j++) { + state[j] ^= Matrix[j + 12 * i][0]; + } + round_lyra(state); + #pragma unroll 12 + for (int j = 0; j<12; j++) { + Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; + } + } + + reduceDuplexRowSetup(1, 0, 2, state, Matrix); + reduceDuplexRowSetup(2, 1, 3, state, Matrix); + reduceDuplexRowSetup(3, 0, 4, state, Matrix); + reduceDuplexRowSetup(4, 3, 5, state, Matrix); + reduceDuplexRowSetup(5, 2, 6, state, Matrix); + reduceDuplexRowSetup(6, 1, 7, state, Matrix); + + uint32_t rowa; + rowa = state[0].x & 7; + reduceDuplexRow(7, rowa, 0); + rowa = state[0].x & 7; + reduceDuplexRow(0, rowa, 3); + rowa = state[0].x & 7; + reduceDuplexRow(3, rowa, 6); + rowa = state[0].x & 7; + reduceDuplexRow(6, rowa, 1); + rowa = state[0].x & 7; + reduceDuplexRow(1, rowa, 4); + rowa = state[0].x & 7; + reduceDuplexRow(4, rowa, 7); + rowa = state[0].x & 7; + reduceDuplexRow(7, rowa, 2); + rowa = state[0].x & 7; + reduceDuplexRow(2, rowa, 5); + + absorbblock(rowa); + + #pragma unroll + for (int i = 0; i<4; i++) { + g_hash[threads*i + thread] = devectorize(state[i]); + } + + } //thread +} + +#else +/* if __CUDA_ARCH__ < 200 .. host */ +__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {} +#endif + +// ------------------------------------------------------------------------------------------------------------------------- + +// lyra2 cant be used as-is in 512-bits hash chains, tx to djm for these weird offsets since first lyra2 algo... 
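+// Note on the offsets used below: each 512-bit chain hash spans 16 uint32_t words,
+// so thread i addresses d_hash64 starting at word 16*i, and the round parameter
+// selects the low or high 256-bit half via the extra round*8 words (4 uint2).
+// d_hash_lyra keeps its 4 uint2 per hash strided by threads, matching the layout
+// expected by the lyra2 kernels.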
+ +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350 + +__global__ __launch_bounds__(128, 8) +void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const size_t offset = (size_t) 16 * thread + (round * 8U); + uint2 *psrc = (uint2*) (&d_hash64[offset]); + uint2 *pdst = (uint2*) (&d_hash_lyra[thread]); + pdst[threads*0] = __ldg(&psrc[0]); + pdst[threads*1] = __ldg(&psrc[1]); + pdst[threads*2] = __ldg(&psrc[2]); + pdst[threads*3] = __ldg(&psrc[3]); + } +} + +__global__ __launch_bounds__(128, 8) +void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const size_t offset = (size_t) 16 * thread + (round * 8U); + uint2 *psrc = (uint2*) (&d_hash_lyra[thread]); + uint2 *pdst = (uint2*) (&d_hash64[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[threads*1]; + pdst[2] = psrc[threads*2]; + pdst[3] = psrc[threads*3]; + } +} +#else +/* if __CUDA_ARCH__ < 200 .. host */ +__global__ void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {} +__global__ void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {} +#endif + +__host__ +void hash64_to_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + hash64_to_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round); +} + +__host__ +void hash64_from_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + hash64_from_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round); +} diff --git a/lyra2/cuda_lyra2_sm5.cuh b/lyra2/cuda_lyra2_sm5.cuh new file mode 100644 index 0000000000..85adfd91fc --- /dev/null +++ b/lyra2/cuda_lyra2_sm5.cuh @@ -0,0 +1,749 @@ +#include + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#undef __CUDA_ARCH__ +#define __CUDA_ARCH__ 500 +#endif + +#include "cuda_helper.h" + +#define TPB50 32 + +#if __CUDA_ARCH__ == 500 +#include "cuda_lyra2_vectors.h" + +#define Nrow 8 +#define Ncol 8 +#define memshift 3 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +#if __CUDA_ARCH__ == 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 =
WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif + +#if __CUDA_ARCH__ >= 300 +static __device__ __forceinline__ +void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d ^= a; d = SWAPUINT2(d); + c += d; b ^= c; b = ROR24(b); //ROR2(b, 24); + a += b; d ^= a; d = ROR16(d); + c += d; b ^= c; b = ROR2(b, 63); +} +#endif + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + +static __device__ __forceinline__ +void round_lyra(uint2x4* s) +{ + Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); + Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3]; + + const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x 
+ threadIdx.x; + const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x; + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift; + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state[j]); + round_lyra(state); + } + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j]; + } + + // 1, 0, 2 + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state2[j]); + } + + // 2, 1, 3 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s2 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } + + // 3, 0, 4 + for (int i = 0; i < 8; i++) + { + const uint32_t ls0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s3 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 
3; j++) + state2[j] = LD4S(ls0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s0 + j*threads*blockDim.x) = state2[j]; + } + + // 4, 3, 5 + for (int i = 0; i < 8; i++) + { + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s4 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s3 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state2[j]; + } + + // 5, 2, 6 + for (int i = 0; i < 8; i++) + { + const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s5 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + } + + // 6, 1, 7 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x; + const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s6 + 
j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x; + + #pragma unroll 1 + for (int i = 0; i < 8; i++) + { + uint2 state1[3], state2[3]; + + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x; + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); + } + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += state2[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + { + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + *(DMatrix + s3 + j*threads*blockDim.x) ^= state[j]; + } + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + // const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x; + + uint2 state1[3], last[3]; + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x); + last[j] = *(DMatrix + ps2 + j*threads*blockDim.x); + } + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += last[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) 
+ { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == 5) + { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < 8; i++) + { + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x); + + round_lyra(state); + } + + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; + +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) +{ + const uint2x4 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + + ((uint2*)state)[0] = __ldg(&g_hash[thread]); + ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); + ((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]); + ((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]); + + state[1] = state[0]; + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i < 24; i++) + round_lyra(state); //because 12 is not enough + + ((uint2x4*)DMatrix)[0 * threads + thread] = state[0]; + ((uint2x4*)DMatrix)[1 * threads + thread] = state[1]; + ((uint2x4*)DMatrix)[2 * threads + thread] = state[2]; + ((uint2x4*)DMatrix)[3 * threads + thread] = state[3]; + } +} + +__global__ __launch_bounds__(TPB50, 1) +void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + + if (thread < threads) + { + uint2 state[4]; + state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x]); + + reduceDuplexV5(state, thread, threads); + + uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(7, rowa, 0, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(0, rowa, 3, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(3, rowa, 6, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(6, rowa, 1, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(1, rowa, 4, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(4, rowa, 7, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(7, rowa, 2, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50_8(rowa, state, thread, threads); + + DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_3_sm5(uint32_t threads, 
uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + g_hash[thread] = ((uint2*)state)[0]; + g_hash[thread + threads] = ((uint2*)state)[1]; + g_hash[thread + threads * 2] = ((uint2*)state)[2]; + g_hash[thread + threads * 3] = ((uint2*)state)[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) +{ + const uint2x4 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; + // This kernel loads 2x 256-bits hashes from 512-bits chain offsets in 2 steps + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *psrc = (uint2*)(&d_hash_512[offset]); + state[0].x = state[1].x = __ldg(&psrc[0]); + state[0].y = state[1].y = __ldg(&psrc[1]); + state[0].z = state[1].z = __ldg(&psrc[2]); + state[0].w = state[1].w = __ldg(&psrc[3]); + + state[1] = state[0]; + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<24; i++) + round_lyra(state); + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) +{ + // This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 steps + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *pdst = (uint2*)(&d_hash_512[offset]); + pdst[0] = state[0].x; + pdst[1] = state[0].y; + pdst[2] = state[0].z; + pdst[3] = state[0].w; + } +} +#else +/* if __CUDA_ARCH__ != 500 .. 
host */ +__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {} +__global__ void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {} +#endif diff --git a/lyra2/cuda_lyra2_vectors.h b/lyra2/cuda_lyra2_vectors.h new file mode 100644 index 0000000000..6bb11d3c50 --- /dev/null +++ b/lyra2/cuda_lyra2_vectors.h @@ -0,0 +1,601 @@ +/* DJM CRAP to strip (again) made for SM 3.2+ */ + +#ifndef CUDA_LYRA_VECTOR_H +#define CUDA_LYRA_VECTOR_H + +/////////////////////////////////////////////////////////////////////////////////// +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +#if __CUDA_ARCH__ < 300 +#define __shfl(x, y, z) (x) +#endif + +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane) +#else +#define __shfl2 __shfl +#endif + +#if __CUDA_ARCH__ < 320 && !defined(__ldg4) +#define __ldg4(x) (*(x)) +#endif + +typedef struct __align__(32) uint8 { + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) uint2_8 { + uint2 s0, s1, s2, s3, s4, s5, s6, s7; +} uint2_8; + +typedef struct __align__(64) ulonglong2to8 { + ulonglong2 l0,l1,l2,l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16 { + ulonglong2to8 lo, hi; +} ulonglong8to16; + +typedef struct __align__(128) ulonglong16to32{ + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(128) ulonglong32to64{ + ulonglong16to32 lo, hi; +} ulonglong32to64; + +typedef struct __align__(128) ulonglonglong { + ulonglong2 s0,s1,s2,s3,s4,s5,s6,s7; +} ulonglonglong; + +typedef struct __align__(64) uint16 { + union { + struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;}; + uint8 lo; + }; + union { + struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;}; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint2_16 { + union { + struct { uint2 s0, s1, s2, s3, s4, s5, s6, s7; }; + uint2_8 lo; + }; + union { + struct { uint2 s8, s9, sa, sb, sc, sd, se, sf; }; + uint2_8 hi; + }; +} uint2_16; + +typedef struct __align__(128) uint32 { + uint16 lo,hi; +} uint32; + +struct __align__(128) ulong8 { + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + +typedef struct __align__(128) ulonglong16{ + ulonglong4 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglong16; + +typedef struct __align__(16) uint28 { + uint2 x, y, z, w; +} uint2x4; +typedef uint2x4 uint28; /* name deprecated */ + +typedef struct __builtin_align__(32) uint48 { + uint4 s0,s1; +} uint48; + +typedef struct __builtin_align__(128) uint4x16{ + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ + ulonglong2to8 t; t.l0=s0; t.l1=s1; t.l2=s2; t.l3=s3; + return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + 
ulonglong16to32 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong2 &s0, const ulonglong2 &s1, const ulonglong2 &s2, const ulonglong2 &s3, + const ulonglong2 &s4, const ulonglong2 &s5) +{ + ulonglonglong t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; + return t; +} + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uint28 make_uint28(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint28 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + +static __inline__ __device__ uint2_16 make_uint2_16( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7, + uint2 s8, uint2 s9, uint2 sa, uint2 sb, uint2 sc, uint2 sd, uint2 se, uint2 sf) +{ + uint2_16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo=a; t.hi=b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ uint2_8 make_uint2_8( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7) +{ + uint2_8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong4 &s0, const ulonglong4 &s1, + const ulonglong4 &s2, const ulonglong4 &s3, const ulonglong4 &s4, const ulonglong4 &s5, const ulonglong4 &s6, const ulonglong4 &s7) +{ + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ ulong8 make_ulong8( + 
ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + +static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); } +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x + b.x, a.y + b.y); } + +static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} + +static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + +static __forceinline__ __device__ uint2_8 operator^ (const uint2_8 &a, const uint2_8 &b) { return make_uint2_8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ uint2_8 operator+ (const uint2_8 &a, const uint2_8 &b) { return make_uint2_8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +////////////// mess++ ////// + +static __forceinline__ __device__ uint28 operator^ (const uint28 &a, const uint28 &b) { + return make_uint28(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +static __forceinline__ __device__ uint28 operator+ (const uint28 &a, const uint28 &b) { + return make_uint28(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static __forceinline__ __device__ uint48 operator+ (const uint48 &a, const uint48 &b) { + return make_uint48(a.s0 + b.s0, a.s1 + b.s1); +} + +///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, 
a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint2_16 operator^ (const uint2_16 &a, const uint2_16 &b) { + return make_uint2_16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ uint2_16 operator+ (const uint2_16 &a, const uint2_16 &b) { + return make_uint2_16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (uint28 &a, const uint28 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (uint28 &a, const uint28 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (uint2_8 &a, const uint2_8 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (uint2_8 &a, const uint2_8 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (uint32 &a, const uint32 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; } + +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (ulonglong4 &a, const ulonglong4 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) { a = a + b; } + +static __forceinline__ __device__ +ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3); +} +static __forceinline__ __device__ +ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3); +} + +static __forceinline__ __device__ +ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi); +} 
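The overloaded ^ and + defined in this header let the Lyra2 kernels treat an entire 32-byte row slice (uint2x4 / uint28) as a single value. A minimal usage sketch, assuming this header and ccminer's cuda_helper.h are on the include path; xor_rows is a hypothetical test kernel for illustration only and is not part of this patch:

#include <stdint.h>
#include "cuda_lyra2_vectors.h"

// XOR two arrays of 32-byte rows component-wise, the same pattern the Lyra2
// kernels apply to their sponge state (hypothetical example, not in the patch).
__global__ void xor_rows(const uint2x4 *a, const uint2x4 *b, uint2x4 *out, uint32_t n)
{
	const uint32_t i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i < n)
		out[i] = __ldg4(&a[i]) ^ __ldg4(&b[i]); // read-only vector loads, per-component XOR via the uint28 operator^ above
}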
+
+static __forceinline__ __device__
+ulonglong8to16 operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b)
+{
+	return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b)
+{
+	return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong16to32 operator+ (const ulonglong16to32 &a, const ulonglong16to32 &b)
+{
+	return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b)
+{
+	return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
+static __forceinline__ __device__
+ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b)
+{
+	return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi);
+}
+
+static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) {
+	return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5);
+}
+
+static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) {
+	return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5);
+}
+
+static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; }
+static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
+static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a ^ b; }
+
+static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) { a = a + b; }
+static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) { a = a ^ b; }
+
+#if __CUDA_ARCH__ < 320
+
+#define rotate ROTL32
+#define rotateR ROTR32
+
+#else
+
+static __forceinline__ __device__ uint4 rotate4(uint4 vec4, uint32_t shift)
+{
+	uint4 ret;
+	asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift));
+	asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift));
+	asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) :
"r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint4 rotate4R(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr) +{ + ulonglong4 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.z), "=l"(ret.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ void ldg4(const ulonglong4 *ptr,ulonglong4 *ret) +{ + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret[0].x), "=l"(ret[0].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret[0].z), "=l"(ret[0].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret[1].x), "=l"(ret[1].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret[1].z), "=l"(ret[1].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret[2].x), "=l"(ret[2].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret[2].z), "=l"(ret[2].w) : __LDG_PTR(ptr)); +} + +static __device__ __inline__ uint28 __ldg4(const uint28 *ptr) +{ + uint28 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ uint48 __ldg4(const uint48 *ptr) +{ + uint48 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x), "=r"(ret.s0.y), "=r"(ret.s0.z), "=r"(ret.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s1.x), "=r"(ret.s1.y), "=r"(ret.s1.z), "=r"(ret.s1.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ void ldg4(const uint28 *ptr, uint28 *ret) +{ + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr)); + 
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr)); +} + +#endif /* __CUDA_ARCH__ < 320 */ + + +static __forceinline__ __device__ uint8 swapvec(const uint8 &buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + return vec; +} + +static __forceinline__ __device__ uint8 swapvec(const uint8 *buf) +{ + uint8 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 *buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf[0].s0); + vec.s1 = cuda_swab32(buf[0].s1); + vec.s2 = cuda_swab32(buf[0].s2); + vec.s3 = cuda_swab32(buf[0].s3); + vec.s4 = cuda_swab32(buf[0].s4); + vec.s5 = cuda_swab32(buf[0].s5); + vec.s6 = cuda_swab32(buf[0].s6); + vec.s7 = cuda_swab32(buf[0].s7); + vec.s8 = cuda_swab32(buf[0].s8); + vec.s9 = cuda_swab32(buf[0].s9); + vec.sa = cuda_swab32(buf[0].sa); + vec.sb = cuda_swab32(buf[0].sb); + vec.sc = cuda_swab32(buf[0].sc); + vec.sd = cuda_swab32(buf[0].sd); + vec.se = cuda_swab32(buf[0].se); + vec.sf = cuda_swab32(buf[0].sf); + return vec; +} + +static __forceinline__ __device__ uint16 swapvec(const uint16 &buf) +{ + uint16 vec; + vec.s0 = cuda_swab32(buf.s0); + vec.s1 = cuda_swab32(buf.s1); + vec.s2 = cuda_swab32(buf.s2); + vec.s3 = cuda_swab32(buf.s3); + vec.s4 = cuda_swab32(buf.s4); + vec.s5 = cuda_swab32(buf.s5); + vec.s6 = cuda_swab32(buf.s6); + vec.s7 = cuda_swab32(buf.s7); + vec.s8 = cuda_swab32(buf.s8); + vec.s9 = cuda_swab32(buf.s9); + vec.sa = cuda_swab32(buf.sa); + vec.sb = cuda_swab32(buf.sb); + vec.sc = cuda_swab32(buf.sc); + vec.sd = cuda_swab32(buf.sd); + vec.se = cuda_swab32(buf.se); + vec.sf = cuda_swab32(buf.sf); + return vec; +} + +static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane) +{ +#if __CUDA_ARCH__ >= 300 + uint28 res; + res.x.x = __shfl2(var.x.x, lane); + res.x.y = __shfl2(var.x.y, lane); + res.y.x = __shfl2(var.y.x, lane); + res.y.y = __shfl2(var.y.y, lane); + res.z.x = __shfl2(var.z.x, lane); + res.z.y = __shfl2(var.z.y, lane); + res.w.x = __shfl2(var.w.x, lane); + res.w.y = __shfl2(var.w.y, lane); + return res; +#else + return var; +#endif +} + +static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane) +{ +#if __CUDA_ARCH__ >= 300 + ulonglong4 res; + uint2 temp; + temp = vectorize(var.x); + temp.x = __shfl2(temp.x, lane); + temp.y = __shfl2(temp.y, lane); + res.x = devectorize(temp); + temp = vectorize(var.y); + temp.x = __shfl2(temp.x, lane); + temp.y = __shfl2(temp.y, lane); + res.y = devectorize(temp); + temp = vectorize(var.z); + temp.x = __shfl2(temp.x, lane); + temp.y = __shfl2(temp.y, lane); + res.z = devectorize(temp); + temp = vectorize(var.w); + temp.x = __shfl2(temp.x, lane); + temp.y = __shfl2(temp.y, lane); + res.w = devectorize(temp); + return res; +#else + return var; +#endif +} + +#endif // #ifndef CUDA_LYRA_VECTOR_H diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu new file mode 100644 index 0000000000..df3291c1fc --- /dev/null +++ b/lyra2/cuda_lyra2v2.cu @@ -0,0 
+1,469 @@ +/** + * Lyra2 (v2) CUDA Implementation + * + * Based on djm34/VTC sources and incredible 2x boost by Nanashi Meiyo-Meijin (May 2016) + */ +#include +#include +#include + +#include "cuda_lyra2v2_sm3.cuh" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif + +#define TPB 32 + +#if __CUDA_ARCH__ >= 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 +#define memshift 3 + +__device__ uint2x4 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ +void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d ^= a; d = SWAPUINT2(d); + c += d; b ^= c; b = ROR2(b, 24); + a += b; d ^= a; d = ROR2(d, 16); + c += d; b ^= c; b = ROR2(b, 63); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2x4 s[4]) +{ + Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2 s[4]) +{ + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 1, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 3, 4); + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 3, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 1, 4); +} + +__device__ __forceinline__ +void reduceDuplexRowSetup2(uint2 state[4]) +{ + uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; + int i, j; + + #pragma unroll + for (int i = 0; i < Ncol; i++) + { + #pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] = state[j]; + round_lyra_v5(state); + } + + //#pragma unroll 4 + for (i = 0; i < Ncol; i++) + { + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state0[i][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] = state0[i][j]; + + #pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] ^= state[j]; + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[i][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state2[j] = state1[i][j]; + + #pragma unroll + for (j = 0; j < 3; j++) + state2[j] ^= state[j]; + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state0[i][0] ^= Data2; + state0[i][1] ^= Data0; + state0[i][2] ^= Data1; + } else { + state0[i][0] ^= Data0; + state0[i][1] ^= Data1; + 
state0[i][2] ^= Data2; + } + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s0 + j, state0[i][j]); + + #pragma unroll + for (j = 0; j < 3; j++) + state0[i][j] = state2[j]; + + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s1 = memshift * Ncol * 1 + i*memshift; + const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[Ncol - i - 1][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] ^= state[j]; + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s3 + j, state0[Ncol - i - 1][j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state1[i][0] ^= Data2; + state1[i][1] ^= Data0; + state1[i][2] ^= Data1; + } else { + state1[i][0] ^= Data0; + state1[i][1] ^= Data1; + state1[i][2] ^= Data2; + } + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s1 + j, state1[i][j]); + } +} + +__device__ +void reduceDuplexRowt2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4]) +{ + uint2 state1[3], state2[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; + + for (int i = 0; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + const uint32_t s3 = ps3 + i*memshift; + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s1 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s2 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s3 + j, LD4S(s3 + j) ^ state[j]); + } +} + +__device__ +void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4]) +{ + const int rowIn = 2; + const int rowOut = 3; + + int i, j; + uint2 last[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] = LD4S(ps2 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= LD4S(ps1 + j) + last[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == rowOut) + { + #pragma unroll + for (j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (i = 1; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= LD4S(s1 + j) + LD4S(s2 + j); + + round_lyra_v5(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__global__ +__launch_bounds__(TPB, 1) 
+void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + const uint2x4 blake2b_IV[2] = { + 0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL, + 0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL, + 0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL, + 0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL + }; + + const uint2x4 Mask[2] = { + 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL, + 0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + + uint2x4 state[4]; + + if (thread < threads) + { + state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<12; i++) + round_lyra_v5(state); + + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for (int i = 0; i<12; i++) + round_lyra_v5(state); + + DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0]; + DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1]; + DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2]; + DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v2_gpu_hash_32_2(uint32_t threads) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + + if (thread < threads) + { + uint2 state[4]; + state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + + reduceDuplexRowSetup2(state); + + uint32_t rowa; + int prev = 3; + + for (int i = 0; i < 3; i++) + { + rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2(prev, rowa, i, state); + prev = i; + } + + rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2x4(rowa, state); + + ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; + ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; + ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; + ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint2x4 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]); + state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]); + state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]); + state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra_v5(state); + + outputHash[thread + threads * 0] = state[0].x; + outputHash[thread + threads * 1] = state[0].y; + outputHash[thread + threads * 2] = state[0].z; + outputHash[thread + threads * 3] = state[0].w; + } +} + +#else +#include "cuda_helper.h" +#if __CUDA_ARCH__ < 200 
+__device__ void* DMatrix; +#endif +__global__ void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) {} +__global__ void lyra2v2_gpu_hash_32_2(uint32_t threads) {} +__global__ void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) {} +#endif + + +__host__ +void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) +{ + cuda_get_arch(thr_id); + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); +} + +__host__ +void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + + if (device_sm[dev_id] >= 500) { + + const uint32_t tpb = TPB; + + dim3 grid2((threads + tpb - 1) / tpb); + dim3 block2(tpb); + dim3 grid4((threads * 4 + tpb - 1) / tpb); + dim3 block4(4, tpb / 4); + + lyra2v2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + lyra2v2_gpu_hash_32_2 <<< grid4, block4, 48 * sizeof(uint2) * tpb >>> (threads); + lyra2v2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + + } else { + + uint32_t tpb = 16; + if (cuda_arch[dev_id] >= 350) tpb = TPB35; + else if (cuda_arch[dev_id] >= 300) tpb = TPB30; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + lyra2v2_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash); + + } +} diff --git a/lyra2/cuda_lyra2v2_sm3.cuh b/lyra2/cuda_lyra2v2_sm3.cuh new file mode 100644 index 0000000000..0999a2651b --- /dev/null +++ b/lyra2/cuda_lyra2v2_sm3.cuh @@ -0,0 +1,338 @@ +/* SM 2/3/3.5 Variant for lyra2REv2 */ + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors, only uncomment that temporary, dont commit it */ +//#undef __CUDA_ARCH__ +//#define __CUDA_ARCH__ 500 +#endif + +#define TPB20 64 +#define TPB30 64 +#define TPB35 64 + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 + +#define vectype ulonglong4 +#define memshift 4 + +__device__ vectype *DMatrix; + +static __device__ __forceinline__ +void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) +{ + a += b; d ^= a; d = ROTR64(d, 32); + c += d; b ^= c; b = ROTR64(b, 24); + a += b; d ^= a; d = ROTR64(d, 16); + c += d; b ^= c; b = ROTR64(b, 63); +} + +static __device__ __forceinline__ +void round_lyra_v35(vectype* s) +{ + Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV3(vectype state[4], uint32_t thread) +{ + vectype state1[3]; + uint32_t ps1 = (Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); + + #pragma unroll 4 + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i *memshift; + uint32_t s2 = ps2 - Nrow * i *memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static 
__device__ __forceinline__ +void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3], state1[3]; + + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); + + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow*i*memshift; + uint32_t s2 = ps2 + Nrow*i*memshift; + uint32_t s3 = ps3 - Nrow*i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1 )[j]); + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2 )[j]); + for (int j = 0; j < 3; j++) { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3], state2[3]; + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); + + #pragma nounroll + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i*memshift; + uint32_t s2 = ps2 + Nrow * i*memshift; + uint32_t s3 = ps3 + Nrow * i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for (int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if (rowInOut != rowOut) { + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } else { + + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } + } +} + +#if __CUDA_ARCH__ >= 300 +__global__ __launch_bounds__(TPB35, 1) +void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + if (threadIdx.x == 0) { + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0, + 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 + ); + } + + if (thread < threads) + { + ((uint2*)state)[0] = __ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + + state[1] = state[0]; + state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); + state[3] = 
shuffle4(((vectype*)blake2b_IV)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= shuffle4(((vectype*)padding)[0], 0); + state[1] ^= shuffle4(((vectype*)padding)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint32_t rowa; + int prev = 3; + for (int i = 0; i < 4; i++) + { + rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#elif __CUDA_ARCH__ >= 200 +__global__ __launch_bounds__(TPB20, 1) +void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, + 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 + ); + + if (thread < threads) + { + + ((uint2*)state)[0] = outputHash[thread]; + ((uint2*)state)[1] = outputHash[thread + threads]; + ((uint2*)state)[2] = outputHash[thread + 2 * threads]; + ((uint2*)state)[3] = outputHash[thread + 3 * threads]; + + state[1] = state[0]; + state[2] = ((vectype*)blake2b_IV)[0]; + state[3] = ((vectype*)blake2b_IV)[1]; + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= ((vectype*)padding)[0]; + state[1] ^= ((vectype*)padding)[1]; + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint32_t rowa; + int prev = 3; + for (int i = 0; i < 4; i++) + { + rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#endif + +#else +/* host & sm5+ */ +__global__ void lyra2v2_gpu_hash_32_v3(uint32_t threads, 
uint32_t startNounce, uint2 *outputHash) {} +#endif diff --git a/lyra2/cuda_lyra2v3.cu b/lyra2/cuda_lyra2v3.cu new file mode 100644 index 0000000000..0278cabc78 --- /dev/null +++ b/lyra2/cuda_lyra2v3.cu @@ -0,0 +1,481 @@ +/** + * Lyra2 (v3) CUDA Implementation + * + * Based on VTC sources + */ +#include +#include +#include +#include "cuda_helper.h" + +#include "cuda_lyra2v3_sm3.cuh" + + + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif + +#define TPB 32 + +#if __CUDA_ARCH__ >= 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 +#define memshift 3 + + +__device__ uint2x4 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ +void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d ^= a; d = SWAPUINT2(d); + c += d; b ^= c; b = ROR2(b, 24); + a += b; d ^= a; d = ROR2(d, 16); + c += d; b ^= c; b = ROR2(b, 63); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2x4 s[4]) +{ + Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2 s[4]) +{ + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 1, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 3, 4); + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 3, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 1, 4); +} + +__device__ __forceinline__ +void reduceDuplexRowSetup2(uint2 state[4]) +{ + uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; + int i, j; + + #pragma unroll + for (int i = 0; i < Ncol; i++) + { + #pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] = state[j]; + round_lyra_v5(state); + } + + //#pragma unroll 4 + for (i = 0; i < Ncol; i++) + { + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state0[i][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] = state0[i][j]; + + #pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] ^= state[j]; + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[i][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state2[j] = state1[i][j]; + + #pragma unroll + for (j = 0; j < 3; j++) + state2[j] ^= state[j]; + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], 
threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state0[i][0] ^= Data2; + state0[i][1] ^= Data0; + state0[i][2] ^= Data1; + } else { + state0[i][0] ^= Data0; + state0[i][1] ^= Data1; + state0[i][2] ^= Data2; + } + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s0 + j, state0[i][j]); + + #pragma unroll + for (j = 0; j < 3; j++) + state0[i][j] = state2[j]; + + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s1 = memshift * Ncol * 1 + i*memshift; + const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[Ncol - i - 1][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] ^= state[j]; + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s3 + j, state0[Ncol - i - 1][j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state1[i][0] ^= Data2; + state1[i][1] ^= Data0; + state1[i][2] ^= Data1; + } else { + state1[i][0] ^= Data0; + state1[i][1] ^= Data1; + state1[i][2] ^= Data2; + } + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s1 + j, state1[i][j]); + } +} + +__device__ +void reduceDuplexRowt2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4]) +{ + uint2 state1[3], state2[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; + + for (int i = 0; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + const uint32_t s3 = ps3 + i*memshift; + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s1 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s2 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s3 + j, LD4S(s3 + j) ^ state[j]); + } +} + +__device__ +void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4]) +{ + const int rowIn = 2; + const int rowOut = 3; + + int i, j; + uint2 last[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] = LD4S(ps2 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= LD4S(ps1 + j) + last[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == rowOut) + { + #pragma unroll + for (j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (i = 1; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + 
state[j] ^= LD4S(s1 + j) + LD4S(s2 + j); + + round_lyra_v5(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + const uint2x4 blake2b_IV[2] = { + 0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL, + 0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL, + 0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL, + 0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL + }; + + const uint2x4 Mask[2] = { + 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL, + 0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + + uint2x4 state[4]; + + if (thread < threads) + { + state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<12; i++) + round_lyra_v5(state); + + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for (int i = 0; i<12; i++) + round_lyra_v5(state); + + DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0]; + DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1]; + DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2]; + DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_2(uint32_t threads) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + + if (thread < threads) + { + uint2 state[4]; + state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + + reduceDuplexRowSetup2(state); + + uint32_t rowa; + int prev = 3; + unsigned int instance = 0; + for (int i = 0; i < 3; i++) + { + instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4); + rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3; + + //rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2(prev, rowa, i, state); + prev = i; + } + + instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4); + rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3; + + //rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2x4(rowa, state); + + ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; + ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; + ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; + ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint2x4 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]); + state[1] = 
__ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]); + state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]); + state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra_v5(state); + + outputHash[thread + threads * 0] = state[0].x; + outputHash[thread + threads * 1] = state[0].y; + outputHash[thread + threads * 2] = state[0].z; + outputHash[thread + threads * 3] = state[0].w; + } +} + +#else +#include "cuda_helper.h" +#if __CUDA_ARCH__ < 200 +__device__ void* DMatrix; +#endif +__global__ void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) {} +__global__ void lyra2v3_gpu_hash_32_2(uint32_t threads) {} +__global__ void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) {} +#endif + + +__host__ +void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) +{ + cuda_get_arch(thr_id); + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); +} + +__host__ +void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + + if (device_sm[dev_id] >= 500) { + + const uint32_t tpb = TPB; + + dim3 grid2((threads + tpb - 1) / tpb); + dim3 block2(tpb); + dim3 grid4((threads * 4 + tpb - 1) / tpb); + dim3 block4(4, tpb / 4); + + lyra2v3_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + lyra2v3_gpu_hash_32_2 <<< grid4, block4, 48 * sizeof(uint2) * tpb >>> (threads); + lyra2v3_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + + } else { + + uint32_t tpb = 16; + if (cuda_arch[dev_id] >= 350) tpb = TPB35; + else if (cuda_arch[dev_id] >= 300) tpb = TPB30; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + lyra2v3_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash); + + } +} + + diff --git a/lyra2/cuda_lyra2v3_sm3.cuh b/lyra2/cuda_lyra2v3_sm3.cuh new file mode 100644 index 0000000000..f84521c869 --- /dev/null +++ b/lyra2/cuda_lyra2v3_sm3.cuh @@ -0,0 +1,348 @@ +/* SM 2/3/3.5 Variant for lyra2REv2 */ + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors, only uncomment that temporary, dont commit it */ +//#undef __CUDA_ARCH__ +//#define __CUDA_ARCH__ 500 +#endif + +#define TPB20 64 +#define TPB30 64 +#define TPB35 64 + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 + +#define vectype ulonglong4 +#define memshift 4 + +__device__ vectype *DMatrix; + +static __device__ __forceinline__ +void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) +{ + a += b; d ^= a; d = ROTR64(d, 32); + c += d; b ^= c; b = ROTR64(b, 24); + a += b; d ^= a; d = ROTR64(d, 16); + c += d; b ^= c; b = ROTR64(b, 63); +} + +static __device__ __forceinline__ +void round_lyra_v35(vectype* s) +{ + Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV3(vectype state[4], uint32_t thread) +{ + vectype state1[3]; + uint32_t ps1 = (Nrow * Ncol * memshift * thread); + uint32_t ps2 = 
(memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); + + #pragma unroll 4 + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i *memshift; + uint32_t s2 = ps2 - Nrow * i *memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3], state1[3]; + + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); + + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow*i*memshift; + uint32_t s2 = ps2 + Nrow*i*memshift; + uint32_t s3 = ps3 - Nrow*i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1 )[j]); + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2 )[j]); + for (int j = 0; j < 3; j++) { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3], state2[3]; + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); + + #pragma nounroll + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i*memshift; + uint32_t s2 = ps2 + Nrow * i*memshift; + uint32_t s3 = ps3 + Nrow * i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for (int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if (rowInOut != rowOut) { + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } else { + + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } + } +} + +#if __CUDA_ARCH__ >= 300 +__global__ __launch_bounds__(TPB35, 1) +void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + if (threadIdx.x == 0) { + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 + ); + 
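+		// The constants above are the eight 64-bit BLAKE2b IV words stored as (lo, hi)
+		// 32-bit halves. The block below is the first absorbed Lyra2 input block:
+		// kLen=32, pwdLen=32, saltLen=32, timeCost=1, nRows=4, nCols=4 (matching the
+		// LYRA2_3(..., 32, ..., 32, ..., 32, 1, 4, 4) host calls), followed by the
+		// sponge padding markers (0x80 ... 0x01 in the top byte of the last word).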
((uint16*)padding)[0] = make_uint16( + 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0, + 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 + ); + } + + if (thread < threads) + { + ((uint2*)state)[0] = __ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + + state[1] = state[0]; + state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); + state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= shuffle4(((vectype*)padding)[0], 0); + state[1] ^= shuffle4(((vectype*)padding)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + unsigned int instance = 0; + uint32_t rowa; + int prev = 3; + for (int i = 0; i < 4; i++) + { + //rowa = ((uint2*)state)[0].x & 3; + + instance = ((uint2*)state)[instance & 0xf].x; + rowa = ((uint2*)state)[instance & 0xf].x & 0x3; + reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#elif __CUDA_ARCH__ >= 200 +__global__ __launch_bounds__(TPB20, 1) +void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, + 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 + ); + + if (thread < threads) + { + + ((uint2*)state)[0] = outputHash[thread]; + ((uint2*)state)[1] = outputHash[thread + threads]; + ((uint2*)state)[2] = outputHash[thread + 2 * threads]; + ((uint2*)state)[3] = outputHash[thread + 3 * threads]; + + state[1] = state[0]; + state[2] = ((vectype*)blake2b_IV)[0]; + state[3] = ((vectype*)blake2b_IV)[1]; + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= ((vectype*)padding)[0]; + state[1] ^= ((vectype*)padding)[1]; + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint instance = 0; + uint32_t rowa; + int prev = 3; 
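+		// Wandering phase: the low 32-bit word of a pseudo-randomly selected state
+		// lane picks the row to duplex (rowa), mirroring the SM 3.x kernel above.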
+	for (int i = 0; i < 4; i++)
+	{
+		// rowa = ((uint2*)state)[0].x & 3;
+
+		instance = ((uint2*)state)[instance & 0xf].x;
+		rowa = ((uint2*)state)[instance & 0xf].x & 0x3;
+		reduceDuplexRowtV3(prev, rowa, i, state, thread);
+		prev = i;
+	}
+
+	uint32_t shift = (memshift * rowa + 16 * memshift * thread);
+
+	for (int j = 0; j < 3; j++)
+		state[j] ^= __ldg4(&(DMatrix + shift)[j]);
+
+	for (int i = 0; i < 12; i++)
+		round_lyra_v35(state);
+
+	outputHash[thread] = ((uint2*)state)[0];
+	outputHash[thread + threads] = ((uint2*)state)[1];
+	outputHash[thread + 2 * threads] = ((uint2*)state)[2];
+	outputHash[thread + 3 * threads] = ((uint2*)state)[3];
+
+	} //thread
+}
+#endif
+
+#else
+/* host & sm5+ */
+__global__ void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
+#endif
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
new file mode 100644
index 0000000000..b43537122b
--- /dev/null
+++ b/lyra2/lyra2RE.cu
@@ -0,0 +1,200 @@
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_keccak.h"
+#include "lyra2/Lyra2.h"
+}
+
+#include
+#include
+
+static uint64_t* d_hash[MAX_GPUS];
+static uint64_t* d_matrix[MAX_GPUS];
+
+extern void blake256_cpu_init(int thr_id, uint32_t threads);
+extern void blake256_cpu_setBlock_80(uint32_t *pdata);
+//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
+
+//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
+//extern void keccak256_sm3_init(int thr_id, uint32_t threads);
+//extern void keccak256_sm3_free(int thr_id);
+
+extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
+
+extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
+extern void skein256_cpu_init(int thr_id, uint32_t threads);
+
+extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
+extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti);
+
+extern void groestl256_cpu_init(int thr_id, uint32_t threads);
+extern void groestl256_cpu_free(int thr_id);
+extern void groestl256_setTarget(const void *ptarget);
+extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
+extern uint32_t groestl256_getSecNonce(int thr_id, int num);
+
+
+extern "C" void lyra2re_hash(void *state, const void *input)
+{
+	uint32_t hashA[8], hashB[8];
+
+	sph_blake256_context ctx_blake;
+	sph_keccak256_context ctx_keccak;
+	sph_skein256_context ctx_skein;
+	sph_groestl256_context ctx_groestl;
+
+	sph_blake256_set_rounds(14);
+
+	sph_blake256_init(&ctx_blake);
+	sph_blake256(&ctx_blake, input, 80);
+	sph_blake256_close(&ctx_blake, hashA);
+
+	sph_keccak256_init(&ctx_keccak);
+	sph_keccak256(&ctx_keccak, hashA, 32);
+	sph_keccak256_close(&ctx_keccak, hashB);
+
+	LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
+
+	sph_skein256_init(&ctx_skein);
+	sph_skein256(&ctx_skein, hashA, 32);
+	sph_skein256_close(&ctx_skein, hashB);
+
+	sph_groestl256_init(&ctx_groestl);
+	sph_groestl256(&ctx_groestl, hashB, 32);
+	sph_groestl256_close(&ctx_groestl, hashA);
+
+	memcpy(state, hashA, 32);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+static __thread uint32_t throughput = 0;
+
+extern "C" int scanhash_lyra2(int thr_id, struct 
work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + if (opt_benchmark) + ptarget[7] = 0x00ff; + + static __thread bool gtx750ti; + if (!init[thr_id]) + { + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); + CUDA_LOG_ERROR(); + + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; + if (device_sm[device_map[thr_id]] == 500) intensity = 15; + throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + + if (strstr(props.name, "750 Ti")) gtx750ti = true; + else gtx750ti = false; + + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + blake256_cpu_init(thr_id, throughput); + //keccak256_sm3_init(thr_id, throughput); + skein256_cpu_init(thr_id, throughput); + groestl256_cpu_init(thr_id, throughput); + + //cuda_get_arch(thr_id); + if (device_sm[dev_id] >= 500) + { + size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + init[thr_id] = true; + } + + uint32_t _ALIGN(128) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256_cpu_setBlock_80(pdata); + groestl256_setTarget(ptarget); + + do { + int order = 0; + + //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + lyra2re_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = groestl256_getSecNonce(thr_id, 1); + if (work->nonces[1] != UINT32_MAX) { + be32enc(&endiandata[19], work->nonces[1]); + lyra2re_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_lyra2(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + 
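+	// d_matrix is only allocated on SM 5.0+ devices (see scanhash_lyra2);
+	// cudaFree(NULL) is a no-op, so freeing it unconditionally here is safe.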
cudaFree(d_hash[thr_id]); + cudaFree(d_matrix[thr_id]); + + //keccak256_sm3_free(thr_id); + groestl256_cpu_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu new file mode 100644 index 0000000000..715f311e74 --- /dev/null +++ b/lyra2/lyra2REv2.cu @@ -0,0 +1,209 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_skein.h" +#include "sph/sph_keccak.h" +#include "sph/sph_cubehash.h" +#include "lyra2/Lyra2.h" +} + +#include +#include + +static uint64_t *d_hash[MAX_GPUS]; +static uint64_t* d_matrix[MAX_GPUS]; + +extern void blake256_cpu_init(int thr_id, uint32_t threads); +extern void blake256_cpu_setBlock_80(uint32_t *pdata); +//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +//extern void keccak256_sm3_init(int thr_id, uint32_t threads); +//extern void keccak256_sm3_free(int thr_id); + +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void skein256_cpu_init(int thr_id, uint32_t threads); +extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); + +extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix); + +extern void bmw256_setTarget(const void *ptarget); +extern void bmw256_cpu_init(int thr_id, uint32_t threads); +extern void bmw256_cpu_free(int thr_id); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); + +void lyra2v2_hash(void *state, const void *input) +{ + uint32_t hashA[8], hashB[8]; + + sph_blake256_context ctx_blake; + sph_keccak256_context ctx_keccak; + sph_skein256_context ctx_skein; + sph_bmw256_context ctx_bmw; + sph_cubehash256_context ctx_cube; + + sph_blake256_set_rounds(14); + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + sph_keccak256_init(&ctx_keccak); + sph_keccak256(&ctx_keccak, hashA, 32); + sph_keccak256_close(&ctx_keccak, hashB); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashB, 32); + sph_cubehash256_close(&ctx_cube, hashA); + + LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_skein256_init(&ctx_skein); + sph_skein256(&ctx_skein, hashB, 32); + sph_skein256_close(&ctx_skein, hashA); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashA, 32); + sph_cubehash256_close(&ctx_cube, hashB); + + sph_bmw256_init(&ctx_bmw); + sph_bmw256(&ctx_bmw, hashB, 32); + sph_bmw256_close(&ctx_bmw, hashA); + + memcpy(state, hashA, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 
19 : 20; + if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; + uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + if (!init[thr_id]) + { + size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3; + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + blake256_cpu_init(thr_id, throughput); + //keccak256_sm3_init(thr_id,throughput); + skein256_cpu_init(thr_id, throughput); + bmw256_cpu_init(thr_id, throughput); + + cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256 + + // SM 3 implentation requires a bit more memory + if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500) + matrix_sz = 16 * sizeof(uint64_t) * 4 * 4; + + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + api_set_throughput(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256_cpu_setBlock_80(pdata); + bmw256_setTarget(ptarget); + + do { + int order = 0; + + //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++); + + memset(work->nonces, 0, sizeof(work->nonces)); + bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + lyra2v2_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + lyra2v2_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && !abort_flag); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_lyra2v2(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_matrix[thr_id]); 
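+	// bmw256_cpu_free() releases the buffers set up by bmw256_cpu_init(); the
+	// standalone keccak256 stage is not used here (the fused blakeKeccak256
+	// kernel covers it), which is why keccak256_sm3_free() stays commented out.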
+ + bmw256_cpu_free(thr_id); + //keccak256_sm3_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/lyra2/lyra2REv3.cu b/lyra2/lyra2REv3.cu new file mode 100644 index 0000000000..7e1b4a7046 --- /dev/null +++ b/lyra2/lyra2REv3.cu @@ -0,0 +1,183 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_cubehash.h" +#include "lyra2/Lyra2.h" +} + +#include +#include + +static uint64_t *d_hash[MAX_GPUS]; +static uint64_t* d_matrix[MAX_GPUS]; + +extern void blake256_cpu_init(int thr_id, uint32_t threads); +extern void blake256_cpu_setBlock_80(uint32_t *pdata); +extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); + +extern void lyra2v3_setTarget(const void *pTargetIn); +extern void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix); +extern void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); + +extern void lyra2v3_cpu_hash_32_targ(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); + +extern void bmw256_setTarget(const void *ptarget); +extern void bmw256_cpu_init(int thr_id, uint32_t threads); +extern void bmw256_cpu_free(int thr_id); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); + +extern "C" void lyra2v3_hash(void *state, const void *input) +{ + uint32_t hashA[8], hashB[8]; + + sph_blake256_context ctx_blake; + sph_cubehash256_context ctx_cube; + sph_bmw256_context ctx_bmw; + + sph_blake256_set_rounds(14); + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashB, 32); + sph_cubehash256_close(&ctx_cube, hashA); + + LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_bmw256_init(&ctx_bmw); + sph_bmw256(&ctx_bmw, hashB, 32); + sph_bmw256_close(&ctx_bmw, hashA); + + memcpy(state, hashA, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 
19 : 20; + if (strstr(device_name[dev_id], "GTX 1")) intensity = 20; + if (strstr(device_name[dev_id], "RTX 20")) intensity = 20; + uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + + if (!init[thr_id]) + { + size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3; + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + blake256_cpu_init(thr_id, throughput); + bmw256_cpu_init(thr_id, throughput); + + cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256 + + // SM 3 implentation requires a bit more memory + if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500) + matrix_sz = 16 * sizeof(uint64_t) * 4 * 4; + + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2v3_cpu_init(thr_id, throughput, d_matrix[thr_id]); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + api_set_throughput(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256_cpu_setBlock_80(pdata); + bmw256_setTarget(ptarget); + + do { + int order = 0; + + blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + memset(work->nonces, 0, sizeof(work->nonces)); + bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + lyra2v3_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + lyra2v3_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && !abort_flag); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_lyra2v3(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_matrix[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/lyra2/lyra2Z.cu b/lyra2/lyra2Z.cu new file mode 100644 index 0000000000..7d84b3c6d9 --- /dev/null +++ b/lyra2/lyra2Z.cu @@ -0,0 +1,168 @@ +extern "C" { +#include +#include "Lyra2Z.h" +} + +#include +#include 
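+/*
+ * Lyra2Z: a single 14-round blake256 pass feeds LYRA2Z with the heavier
+ * timeCost=8, nRows=8, nCols=8 parameters; the candidate nonce is returned
+ * by lyra2Z_cpu_hash_32() after a GPU-side target check (lyra2Z_setTarget).
+ */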
+ +static uint64_t* d_hash[MAX_GPUS]; +static uint64_t* d_matrix[MAX_GPUS]; + +extern void blake256_cpu_init(int thr_id, uint32_t threads); +extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); +extern void blake256_cpu_setBlock_80(uint32_t *pdata); + +extern void lyra2Z_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2Z_cpu_init_sm2(int thr_id, uint32_t threads); +extern void lyra2Z_cpu_free(int thr_id); +extern uint32_t lyra2Z_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); + +extern void lyra2Z_setTarget(const void *ptarget); +extern uint32_t lyra2Z_getSecNonce(int thr_id, int num); + +extern "C" void lyra2Z_hash(void *state, const void *input) +{ + uint32_t _ALIGN(64) hashA[8], hashB[8]; + sph_blake256_context ctx_blake; + + sph_blake256_set_rounds(14); + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + LYRA2Z(hashB, 32, hashA, 32, hashA, 32, 8, 8, 8); + + memcpy(state, hashB, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static __thread uint32_t throughput = 0; +static __thread bool gtx750ti = false; + +extern "C" int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t _ALIGN(64) endiandata[20]; + const uint32_t first_nonce = pdata[19]; + int dev_id = device_map[thr_id]; + + if (opt_benchmark) + ptarget[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + + cuda_get_arch(thr_id); + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16; + if (device_sm[dev_id] <= 500) intensity = 15; + throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + gtx750ti = (strstr(props.name, "750 Ti") != NULL); + + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + blake256_cpu_init(thr_id, throughput); + + if (device_sm[dev_id] >= 350) + { + size_t matrix_sz = device_sm[dev_id] > 500 ? 
sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2Z_cpu_init(thr_id, throughput, d_matrix[thr_id]); + } + else + lyra2Z_cpu_init_sm2(thr_id, throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256_cpu_setBlock_80(pdata); + lyra2Z_setTarget(ptarget); + + do { + int order = 0; + + blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = lyra2Z_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + lyra2Z_hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = lyra2Z_getSecNonce(thr_id, 1); + work_set_target_ratio(work, vhash); + pdata[19] = work->nonces[0] + 1; + if (work->nonces[1] != UINT32_MAX) + { + be32enc(&endiandata[19], work->nonces[1]); + lyra2Z_hash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + } + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) gpulog(LOG_WARNING, thr_id, + "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0]; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_lyra2Z(int thr_id) +{ + int dev_id = device_map[thr_id]; + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + if (device_sm[dev_id] >= 350) + cudaFree(d_matrix[thr_id]); + lyra2Z_cpu_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/miner.h b/miner.h index 0e205188fa..1d75855b9b 100644 --- a/miner.h +++ b/miner.h @@ -5,7 +5,7 @@ extern "C" { #endif -#include "cpuminer-config.h" +#include #include #include @@ -14,12 +14,9 @@ extern "C" { #include #include -#ifdef WIN32 -#define snprintf(...) _snprintf(__VA_ARGS__) -#define strdup(x) _strdup(x) -#define strncasecmp(x,y,z) _strnicmp(x,y,z) -#define strcasecmp(x,y) _stricmp(x,y) -typedef int ssize_t; +#ifdef _MSC_VER +#undef HAVE_ALLOCA_H +#undef HAVE_SYSLOG_H #endif #ifdef STDC_HEADERS @@ -30,6 +27,7 @@ typedef int ssize_t; # include # endif #endif + #ifdef HAVE_ALLOCA_H # include #elif !defined alloca @@ -41,15 +39,36 @@ typedef int ssize_t; # include # define alloca _alloca # elif !defined HAVE_ALLOCA -# ifdef __cplusplus -extern "C" -# endif void *alloca (size_t); # endif #endif +#include "compat.h" + +#ifdef __INTELLISENSE__ +/* should be in stdint.h but... 
*/ +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int16 int8_t; +typedef unsigned __int16 uint8_t; + +typedef unsigned __int32 time_t; +typedef char * va_list; +#endif + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0 +# undef _ALIGN +# define _ALIGN(x) __align__(x) +#endif + #ifdef HAVE_SYSLOG_H #include +#define LOG_BLUE 0x10 +#define LOG_RAW 0x99 #else enum { LOG_ERR, @@ -57,9 +76,14 @@ enum { LOG_NOTICE, LOG_INFO, LOG_DEBUG, + /* custom notices */ + LOG_BLUE = 0x10, + LOG_RAW = 0x99 }; #endif +typedef unsigned char uchar; + #undef unlikely #undef likely #if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__) @@ -74,11 +98,43 @@ enum { #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif +#ifndef max +# define max(a, b) ((a) > (b) ? (a) : (b)) +#endif +#ifndef min +# define min(a, b) ((a) < (b) ? (a) : (b)) +#endif + +#ifndef UINT32_MAX +/* for gcc 4.4 */ +#define UINT32_MAX UINT_MAX +#endif + +static inline bool is_windows(void) { +#ifdef WIN32 + return 1; +#else + return 0; +#endif +} + +static inline bool is_x64(void) { +#if defined(__x86_64__) || defined(_WIN64) || defined(__aarch64__) + return 1; +#elif defined(__amd64__) || defined(__amd64) || defined(_M_X64) || defined(_M_IA64) + return 1; +#else + return 0; +#endif +} + #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) #define WANT_BUILTIN_BSWAP #else #define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#define bswap_64(x) (((uint64_t) bswap_32((uint32_t)((x) & 0xffffffffu)) << 32) \ + | (uint64_t) bswap_32((uint32_t)((x) >> 32))) #endif static inline uint32_t swab32(uint32_t v) @@ -90,6 +146,30 @@ static inline uint32_t swab32(uint32_t v) #endif } +static inline uint64_t swab64(uint64_t v) +{ +#ifdef WANT_BUILTIN_BSWAP + return __builtin_bswap64(v); +#else + return bswap_64(v); +#endif +} + +static inline void swab256(void *dest_p, const void *src_p) +{ + uint32_t *dest = (uint32_t *) dest_p; + const uint32_t *src = (const uint32_t *) src_p; + + dest[0] = swab32(src[7]); + dest[1] = swab32(src[6]); + dest[2] = swab32(src[5]); + dest[3] = swab32(src[4]); + dest[4] = swab32(src[3]); + dest[5] = swab32(src[2]); + dest[6] = swab32(src[1]); + dest[7] = swab32(src[0]); +} + #ifdef HAVE_SYS_ENDIAN_H #include #endif @@ -168,124 +248,427 @@ static inline void le16enc(void *pp, uint16_t x) } #endif +/* used for struct work */ +void *aligned_calloc(int size); +void aligned_free(void *ptr); + #if JANSSON_MAJOR_VERSION >= 2 #define JSON_LOADS(str, err_ptr) json_loads((str), 0, (err_ptr)) +#define JSON_LOADF(str, err_ptr) json_load_file((str), 0, (err_ptr)) #else #define JSON_LOADS(str, err_ptr) json_loads((str), (err_ptr)) +#define JSON_LOADF(str, err_ptr) json_load_file((str), (err_ptr)) #endif +json_t * json_load_url(char* cfg_url, json_error_t *err); + #define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION void sha256_init(uint32_t *state); void sha256_transform(uint32_t *state, const uint32_t *block, int swap); void sha256d(unsigned char *hash, const unsigned char *data, int len); -#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) #define HAVE_SHA256_4WAY 0 -int sha256_use_4way(); -void sha256_init_4way(uint32_t *state); -void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); -#endif - -#if defined(__x86_64__) && defined(USE_AVX2) 
#define HAVE_SHA256_8WAY 0 -int sha256_use_8way(); -void sha256_init_8way(uint32_t *state); -void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); -#endif - -extern int scanhash_sha256d(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); - -extern unsigned char *scrypt_buffer_alloc(); - -extern int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done); - -extern int scanhash_heavy(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote, int blocklen); -extern int scanhash_fugue256(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); - -extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); +struct work; + +extern int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_bastion(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds); +extern int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant); +extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant); +extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_hmq17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen); +extern int scanhash_hsr(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_jha(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_jackpot(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); // quark method +extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned 
long *hashes_done); +extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_polytimos(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sha256t(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sha256q(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_skunk(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_tribus(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blake_rounds); +extern int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_wildkeccak(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x11(int thr_id, struct work* 
work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); + +extern int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, + unsigned char *scratchbuf, struct timeval *tv_start, struct timeval *tv_end); +extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, + unsigned char *scratchbuf, struct timeval *tv_start, struct timeval *tv_end); + +/* free device allocated memory per algo */ +void algo_free_all(int thr_id); + +extern void free_allium(int thr_id); +extern void free_bastion(int thr_id); +extern void free_bitcore(int thr_id); +extern void free_blake256(int thr_id); +extern void free_blake2b(int thr_id); +extern void free_blake2s(int thr_id); +extern void free_bmw(int thr_id); +extern void free_c11(int thr_id); +extern void free_cryptolight(int thr_id); +extern void free_cryptonight(int thr_id); +extern void free_decred(int thr_id); +extern void free_deep(int thr_id); +extern void free_equihash(int thr_id); +extern void free_exosis(int thr_id); +extern void free_keccak256(int thr_id); +extern void free_fresh(int thr_id); +extern void free_fugue256(int thr_id); +extern void free_groestlcoin(int thr_id); +extern void free_heavy(int thr_id); +extern void free_hmq17(int thr_id); +extern void free_hsr(int thr_id); +extern void free_jackpot(int thr_id); +extern void free_jha(int thr_id); +extern void free_lbry(int thr_id); +extern void free_luffa(int thr_id); +extern void free_lyra2(int thr_id); +extern void free_lyra2v2(int thr_id); +extern void free_lyra2v3(int thr_id); +extern void free_lyra2Z(int thr_id); +extern void free_myriad(int thr_id); +extern void free_neoscrypt(int thr_id); +extern void free_nist5(int thr_id); +extern void free_pentablake(int thr_id); +extern void free_phi(int thr_id); +extern void free_phi2(int thr_id); +extern void free_polytimos(int thr_id); +extern void free_quark(int thr_id); +extern void free_qubit(int thr_id); +extern void free_sha256d(int thr_id); +extern void free_sha256t(int thr_id); +extern void free_sha256q(int thr_id); +extern void free_sia(int thr_id); +extern void free_sib(int thr_id); +extern void free_skeincoin(int thr_id); +extern void free_skein2(int thr_id); +extern void free_skunk(int thr_id); +extern void free_s3(int thr_id); +extern void free_sonoa(int thr_id); +extern void free_timetravel(int thr_id); +extern void free_tribus(int thr_id); +extern void free_bitcore(int thr_id); +extern void free_vanilla(int thr_id); +extern void free_veltor(int thr_id); +extern void free_whirl(int thr_id); +extern void free_wildkeccak(int thr_id); +extern void free_x11evo(int thr_id); +extern void free_x11(int thr_id); +extern void 
free_x12(int thr_id); +extern void free_x13(int thr_id); +extern void free_x14(int thr_id); +extern void free_x15(int thr_id); +extern void free_x16r(int thr_id); +extern void free_x16s(int thr_id); +extern void free_x17(int thr_id); +extern void free_zr5(int thr_id); +//extern void free_sha256d(int thr_id); +extern void free_scrypt(int thr_id); +extern void free_scrypt_jane(int thr_id); + +/* api related */ +void *api_thread(void *userdata); +void api_set_throughput(int thr_id, uint32_t throughput); +void gpu_increment_reject(int thr_id); + +struct monitor_info { + uint32_t gpu_temp; + uint32_t gpu_fan; + uint32_t gpu_clock; + uint32_t gpu_memclock; + uint32_t gpu_power; + + pthread_mutex_t lock; + pthread_cond_t sampling_signal; + volatile bool sampling_flag; + uint32_t tm_displayed; +}; -extern int scanhash_myriad(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); +struct cgpu_info { + uint8_t gpu_id; + uint8_t thr_id; + uint16_t hw_errors; + unsigned accepted; + uint32_t rejected; + double khashes; + int has_monitoring; + float gpu_temp; + uint16_t gpu_fan; + uint16_t gpu_fan_rpm; + uint16_t gpu_arch; + uint32_t gpu_clock; + uint32_t gpu_memclock; + uint64_t gpu_mem; + uint64_t gpu_memfree; + uint32_t gpu_power; + uint32_t gpu_plimit; + double gpu_vddc; + int16_t gpu_pstate; + int16_t gpu_bus; + uint16_t gpu_vid; + uint16_t gpu_pid; + + int8_t nvml_id; + int8_t nvapi_id; + + char gpu_sn[64]; + char gpu_desc[64]; + double intensity; + uint32_t throughput; + + struct monitor_info monitor; +}; -extern int scanhash_jackpot(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); +struct thr_api { + int id; + pthread_t pth; + struct thread_q *q; +}; -extern int scanhash_quark(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); +struct stats_data { + uint32_t uid; + uint32_t tm_stat; + uint32_t hashcount; + uint32_t height; -extern int scanhash_anime(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + double difficulty; + double hashrate; -extern int scanhash_nist5(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint8_t thr_id; + uint8_t gpu_id; + uint8_t hashfound; + uint8_t ignored; -extern int scanhash_x11(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + uint8_t npool; + uint8_t pool_type; + uint16_t align; +}; -extern int scanhash_x13(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); +struct hashlog_data { + uint8_t npool; + uint8_t pool_type; + uint8_t nonce_id; + uint8_t job_nonce_id; + + uint32_t height; + double sharediff; + + uint32_t njobid; + uint32_t nonce; + uint32_t scanned_from; + uint32_t scanned_to; + uint32_t last_from; + uint32_t tm_add; + uint32_t tm_upd; + uint32_t tm_sent; +}; -extern void fugue256_hash(unsigned char* output, const unsigned char* input, int len); -extern void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); -extern void groestlcoin_hash(unsigned char* output, const unsigned char* input, int len); +/* end of api */ struct thr_info { int id; pthread_t pth; struct thread_q *q; + struct cgpu_info gpu; }; struct work_restart { - volatile unsigned long restart; - char padding[128 - sizeof(unsigned long)]; + /* volatile to modify accross threads 
(vstudio thing) */ + volatile uint32_t restart; + char padding[128 - sizeof(uint32_t)]; +}; + +#ifdef HAVE_GETOPT_LONG +#include +#else +struct option { + const char *name; + int has_arg; + int *flag; + int val; }; +#endif +extern int options_count(); +extern bool opt_benchmark; extern bool opt_debug; +extern bool opt_quiet; extern bool opt_protocol; +extern bool opt_showdiff; +extern bool opt_tracegpu; +extern int opt_n_threads; +extern int active_gpus; +extern int gpu_threads; extern int opt_timeout; extern bool want_longpoll; extern bool have_longpoll; extern bool want_stratum; extern bool have_stratum; +extern bool opt_stratum_stats; extern char *opt_cert; extern char *opt_proxy; extern long opt_proxy_type; extern bool use_syslog; +extern bool use_colors; +extern int use_pok; extern pthread_mutex_t applog_lock; extern struct thr_info *thr_info; extern int longpoll_thr_id; extern int stratum_thr_id; +extern int api_thr_id; +extern volatile bool abort_flag; extern struct work_restart *work_restart; extern bool opt_trust_pool; extern uint16_t opt_vote; +extern uint64_t global_hashrate; +extern uint64_t net_hashrate; +extern double net_diff; +extern double stratum_diff; + +#define MAX_GPUS 16 +//#define MAX_THREADS 32 todo +extern char* device_name[MAX_GPUS]; +extern short device_map[MAX_GPUS]; +extern short device_mpcount[MAX_GPUS]; +extern long device_sm[MAX_GPUS]; +extern uint32_t device_plimit[MAX_GPUS]; +extern uint32_t gpus_intensity[MAX_GPUS]; +extern int opt_cudaschedule; + +extern int cryptonight_fork; + +// cuda.cpp +int cuda_num_devices(); +void cuda_devicenames(); +void cuda_reset_device(int thr_id, bool *init); +void cuda_shutdown(); +int cuda_finddevice(char *name); +int cuda_version(); +void cuda_print_devices(); +int cuda_gpu_info(struct cgpu_info *gpu); +int cuda_available_memory(int thr_id); + +uint32_t cuda_default_throughput(int thr_id, uint32_t defcount); +#define device_intensity(t,f,d) cuda_default_throughput(t,d) +double throughput2intensity(uint32_t throughput); + +void cuda_log_lasterror(int thr_id, const char* func, int line); +void cuda_clear_lasterror(); +#define CUDA_LOG_ERROR() cuda_log_lasterror(thr_id, __func__, __LINE__) + +#define CL_N "\x1B[0m" +#define CL_RED "\x1B[31m" +#define CL_GRN "\x1B[32m" +#define CL_YLW "\x1B[33m" +#define CL_BLU "\x1B[34m" +#define CL_MAG "\x1B[35m" +#define CL_CYN "\x1B[36m" + +#define CL_BLK "\x1B[22;30m" /* black */ +#define CL_RD2 "\x1B[22;31m" /* red */ +#define CL_GR2 "\x1B[22;32m" /* green */ +#define CL_YL2 "\x1B[22;33m" /* dark yellow */ +#define CL_BL2 "\x1B[22;34m" /* blue */ +#define CL_MA2 "\x1B[22;35m" /* magenta */ +#define CL_CY2 "\x1B[22;36m" /* cyan */ +#define CL_SIL "\x1B[22;37m" /* gray */ + +#ifdef WIN32 +#define CL_GRY "\x1B[01;30m" /* dark gray */ +#else +#define CL_GRY "\x1B[90m" /* dark gray selectable in putty */ +#endif +#define CL_LRD "\x1B[01;31m" /* light red */ +#define CL_LGR "\x1B[01;32m" /* light green */ +#define CL_LYL "\x1B[01;33m" /* tooltips */ +#define CL_LBL "\x1B[01;34m" /* light blue */ +#define CL_LMA "\x1B[01;35m" /* light magenta */ +#define CL_LCY "\x1B[01;36m" /* light cyan */ + +#define CL_WHT "\x1B[01;37m" /* white */ + +extern void format_hashrate(double hashrate, char *output); +extern void format_hashrate_unit(double hashrate, char *output, const char* unit); extern void applog(int prio, const char *fmt, ...); -extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, - const char *rpc_req, bool, bool, int *); -extern char *bin2hex(const unsigned char 
*p, size_t len); -extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); +extern void gpulog(int prio, int thr_id, const char *fmt, ...); + +void get_defconfig_path(char *out, size_t bufsize, char *argv0); +extern void cbin2hex(char *out, const char *in, size_t len); +extern char *bin2hex(const unsigned char *in, size_t len); +extern bool hex2bin(void *output, const char *hexstr, size_t len); extern int timeval_subtract(struct timeval *result, struct timeval *x, struct timeval *y); extern bool fulltest(const uint32_t *hash, const uint32_t *target); -extern void diff_to_target(uint32_t *target, double diff); +void diff_to_target(uint32_t* target, double diff); +void work_set_target(struct work* work, double diff); +double target_to_diff(uint32_t* target); +extern void get_currentalgo(char* buf, int sz); + +// bignum +double bn_convert_nbits(const uint32_t nbits); +void bn_nbits_to_uchar(const uint32_t nBits, uchar *target); +double bn_hash_target_ratio(uint32_t* hash, uint32_t* target); +void bn_store_hash_target_ratio(uint32_t* hash, uint32_t* target, struct work* work, int nonce); +void bn_set_target_ratio(struct work* work, uint32_t* hash, int nonce); +void work_set_target_ratio(struct work* work, uint32_t* hash); + +// bench +extern int bench_algo; +void bench_init(int threads); +void bench_free(); +bool bench_algo_switch_next(int thr_id); +void bench_set_throughput(int thr_id, uint32_t throughput); +void bench_display_results(); struct stratum_job { char *job_id; @@ -298,8 +681,11 @@ struct stratum_job { unsigned char version[4]; unsigned char nbits[4]; unsigned char ntime[4]; + unsigned char extra[64]; // like lbry claimtrie bool clean; unsigned char nreward[2]; + uint32_t height; + uint32_t shares_count; double diff; }; @@ -312,18 +698,138 @@ struct stratum_ctx { curl_socket_t sock; size_t sockbuf_size; char *sockbuf; - pthread_mutex_t sock_lock; double next_diff; + double sharediff; char *session_id; size_t xnonce1_size; unsigned char *xnonce1; size_t xnonce2_size; struct stratum_job job; - pthread_mutex_t work_lock; + + struct timeval tv_submit; + uint32_t answer_msec; + int pooln; + time_t tm_connected; + + int rpc2; + int is_equihash; + int srvtime_diff; }; +#define POK_MAX_TXS 4 +#define POK_MAX_TX_SZ 16384U +struct tx { + uint8_t data[POK_MAX_TX_SZ]; + uint32_t len; +}; + +#define MAX_NONCES 2 +struct work { + uint32_t data[48]; + uint32_t target[8]; + uint32_t maxvote; + + char job_id[128]; + size_t xnonce2_len; + uchar xnonce2[32]; + + union { + uint32_t u32[2]; + uint64_t u64[1]; + } noncerange; + + uint8_t pooln; + uint8_t valid_nonces; + uint8_t submit_nonce_id; + uint8_t job_nonce_id; + + uint32_t nonces[MAX_NONCES]; + double sharediff[MAX_NONCES]; + double shareratio[MAX_NONCES]; + double targetdiff; + + uint32_t height; + + uint32_t scanned_from; + uint32_t scanned_to; + + /* pok getwork txs */ + uint32_t tx_count; + struct tx txs[POK_MAX_TXS]; + // zec solution + uint8_t extra[1388]; +}; + +#define POK_BOOL_MASK 0x00008000 +#define POK_DATA_MASK 0xFFFF0000 + +#define MAX_POOLS 8 +struct pool_infos { + uint8_t id; +#define POOL_UNUSED 0 +#define POOL_GETWORK 1 +#define POOL_STRATUM 2 +#define POOL_LONGPOLL 4 + uint8_t type; +#define POOL_ST_DEFINED 1 +#define POOL_ST_VALID 2 +#define POOL_ST_DISABLED 4 +#define POOL_ST_REMOVED 8 + uint16_t status; + int algo; + char name[64]; + // credentials + char url[512]; + char short_url[64]; + char user[192]; + char pass[384]; + // config options + double max_diff; + double max_rate; + int shares_limit; + int 
time_limit; + int scantime; + // connection + struct stratum_ctx stratum; + uint8_t allow_gbt; + uint8_t allow_mininginfo; + uint16_t check_dups; // 16_t for align + int retries; + int fail_pause; + int timeout; + // stats + uint32_t work_time; + uint32_t wait_time; + uint32_t accepted_count; + uint32_t rejected_count; + uint32_t solved_count; + uint32_t stales_count; + time_t last_share_time; + double best_share; + uint32_t disconnects; +}; + +extern struct pool_infos pools[MAX_POOLS]; +extern int num_pools; +extern volatile int cur_pooln; + +void pool_init_defaults(void); +void pool_set_creds(int pooln); +void pool_set_attr(int pooln, const char* key, char* arg); +bool pool_switch_url(char *params); +bool pool_switch(int thr_id, int pooln); +bool pool_switch_next(int thr_id); +int pool_get_first_valid(int startfrom); +bool parse_pool_array(json_t *obj); +void pool_dump_infos(void); + +json_t * json_rpc_call_pool(CURL *curl, struct pool_infos*, + const char *req, bool lp_scan, bool lp, int *err); +json_t * json_rpc_longpoll(CURL *curl, char *lp_url, struct pool_infos*, + const char *req, int *err); + bool stratum_socket_full(struct stratum_ctx *sctx, int timeout); bool stratum_send_line(struct stratum_ctx *sctx, char *s); char *stratum_recv_line(struct stratum_ctx *sctx); @@ -332,6 +838,39 @@ void stratum_disconnect(struct stratum_ctx *sctx); bool stratum_subscribe(struct stratum_ctx *sctx); bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); +void stratum_free_job(struct stratum_ctx *sctx); + +bool rpc2_stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); + +bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params); +bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params); +bool equi_stratum_submit(struct pool_infos *pool, struct work *work); +bool equi_stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params); +void equi_work_set_target(struct work* work, double diff); +void equi_store_work_solution(struct work* work, uint32_t* hash, void* sol_data); +int equi_verify_sol(void * const hdr, void * const sol); +double equi_network_diff(struct work *work); + +void hashlog_remember_submit(struct work* work, uint32_t nonce); +void hashlog_remember_scan_range(struct work* work); +double hashlog_get_sharediff(char* jobid, int idnonce, double defvalue); +uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce); +uint32_t hashlog_get_last_sent(char* jobid); +uint64_t hashlog_get_scan_range(char* jobid); +int hashlog_get_history(struct hashlog_data *data, int max_records); +void hashlog_purge_old(void); +void hashlog_purge_job(char* jobid); +void hashlog_purge_all(void); +void hashlog_dump_job(char* jobid); +void hashlog_getmeminfo(uint64_t *mem, uint32_t *records); + +void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8_t found, uint32_t height); +double stats_get_speed(int thr_id, double def_speed); +double stats_get_gpu_speed(int gpu_id); +int stats_get_history(int thr_id, struct stats_data *data, int max_records); +void stats_purge_old(void); +void stats_purge_all(void); +void stats_getmeminfo(uint64_t *mem, uint32_t *records); struct thread_q; @@ -342,6 +881,98 @@ extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime); extern void tq_freeze(struct thread_q *tq); extern void tq_thaw(struct thread_q *tq); +#define EXIT_CODE_OK 0 +#define EXIT_CODE_USAGE 1 +#define 
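/*
 * Usage sketch for the hashlog API declared above (assumes miner.h is
 * included; the helper name is hypothetical). It shows the intended call
 * order: check for a duplicate first, then record the submit. The real
 * share-submission path in ccminer.cpp also tracks scan ranges and timing,
 * which is omitted here.
 */
static bool submit_once_sketch(struct work *work, int idnonce)
{
	uint32_t nonce = work->nonces[idnonce];

	/* a non-zero return means this (job, nonce) pair was already sent */
	if (hashlog_already_submittted(work->job_id, nonce)) {
		applog(LOG_WARNING, "job %s: nonce %08x already submitted", work->job_id, nonce);
		return false;
	}
	/* ... build and send the share to the pool here ... */
	hashlog_remember_submit(work, nonce);
	return true;
}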
EXIT_CODE_POOL_TIMEOUT 2 +#define EXIT_CODE_SW_INIT_ERROR 3 +#define EXIT_CODE_CUDA_NODEVICE 4 +#define EXIT_CODE_CUDA_ERROR 5 +#define EXIT_CODE_TIME_LIMIT 0 +#define EXIT_CODE_KILLED 7 + +void parse_arg(int key, char *arg); +void proper_exit(int reason); +void restart_threads(void); + +size_t time2str(char* buf, time_t timer); +char* atime2str(time_t timer); + +void applog_hex(void *data, int len); +void applog_hash(void *hash); +void applog_hash64(void *hash); +void applog_compare_hash(void *hash, void *hash_ref); + +void print_hash_tests(void); +void allium_hash(void *state, const void *input); +void bastionhash(void* output, const unsigned char* input); +void blake256hash(void *output, const void *input, int8_t rounds); +void blake2b_hash(void *output, const void *input); +void blake2s_hash(void *output, const void *input); +void bmw_hash(void *state, const void *input); +void c11hash(void *output, const void *input); +void cryptolight_hash_variant(void* output, const void* input, int len, int variant); +void cryptolight_hash(void* output, const void* input); +void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant); +void cryptonight_hash(void* output, const void* input); +void monero_hash(void* output, const void* input); +void stellite_hash(void* output, const void* input); +void decred_hash(void *state, const void *input); +void deephash(void *state, const void *input); +void luffa_hash(void *state, const void *input); +void fresh_hash(void *state, const void *input); +void fugue256_hash(unsigned char* output, const unsigned char* input, int len); +void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); +void hmq17hash(void *output, const void *input); +void hsr_hash(void *output, const void *input); +void keccak256_hash(void *state, const void *input); +void jackpothash(void *state, const void *input); +void groestlhash(void *state, const void *input); +void jha_hash(void *output, const void *input); +void lbry_hash(void *output, const void *input); +void lyra2re_hash(void *state, const void *input); +void lyra2v2_hash(void *state, const void *input); +void lyra2v3_hash(void *state, const void *input); +void lyra2Z_hash(void *state, const void *input); +void myriadhash(void *state, const void *input); +void neoscrypt(uchar *output, const uchar *input, uint32_t profile); +void nist5hash(void *state, const void *input); +void pentablakehash(void *output, const void *input); +void phi_hash(void *output, const void *input); +void phi2_hash(void *output, const void *input); +void polytimos_hash(void *output, const void *input); +void quarkhash(void *state, const void *input); +void qubithash(void *state, const void *input); +void scrypthash(void* output, const void* input); +void scryptjane_hash(void* output, const void* input); +void sha256d_hash(void *output, const void *input); +void sha256t_hash(void *output, const void *input); +void sha256q_hash(void *output, const void *input); +void sia_blake2b_hash(void *output, const void *input); +void sibhash(void *output, const void *input); +void skeincoinhash(void *output, const void *input); +void skein2hash(void *output, const void *input); +void skunk_hash(void *state, const void *input); +void s3hash(void *output, const void *input); +void timetravel_hash(void *output, const void *input); +void bitcore_hash(void *output, const void *input); +void exosis_hash(void *output, const void *input); +void tribus_hash(void *output, const void *input); +void veltorhash(void *output, 
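/*
 * The CPU hashers listed above exist mainly to cross-check GPU results before
 * a share is submitted. A hypothetical validation helper following the
 * pattern used by the scanhash_* functions in this patch (endian-swapped
 * 80-byte header, candidate nonce in word 19, CPU re-hash, fulltest against
 * the target); assumes miner.h and its be32enc/swab32 helpers are available:
 */
static bool cpu_check_nonce_sketch(const uint32_t *pdata, const uint32_t *ptarget, uint32_t nonce)
{
	uint32_t _ALIGN(64) endiandata[20];
	uint32_t _ALIGN(64) vhash[8];

	for (int k = 0; k < 20; k++)
		be32enc(&endiandata[k], pdata[k]);
	endiandata[19] = swab32(nonce);       /* place the candidate nonce */

	blake256hash(vhash, endiandata, 14);  /* or any other CPU hasher above */
	return fulltest(vhash, ptarget);      /* true when hash <= target */
}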
const void *input); +void wcoinhash(void *state, const void *input); +void whirlxHash(void *state, const void *input); +void x11evo_hash(void *output, const void *input); +void x11hash(void *output, const void *input); +void x12hash(void *output, const void *input); +void x13hash(void *output, const void *input); +void x14hash(void *output, const void *input); +void x15hash(void *output, const void *input); +void x16r_hash(void *output, const void *input); +void x16s_hash(void *output, const void *input); +void x17hash(void *output, const void *input); +void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize); +void zr5hash(void *output, const void *input); +void zr5hash_pok(void *output, uint32_t *pdata); + #ifdef __cplusplus } #endif diff --git a/missing b/missing deleted file mode 100644 index 1c8ff7049d..0000000000 --- a/missing +++ /dev/null @@ -1,367 +0,0 @@ -#! /bin/sh -# Common stub for a few missing GNU programs while installing. - -scriptversion=2006-05-10.23 - -# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006 -# Free Software Foundation, Inc. -# Originally by Fran,cois Pinard , 1996. - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -# 02110-1301, USA. - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -if test $# -eq 0; then - echo 1>&2 "Try \`$0 --help' for more information" - exit 1 -fi - -run=: -sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p' -sed_minuso='s/.* -o \([^ ]*\).*/\1/p' - -# In the cases where this matters, `missing' is being run in the -# srcdir already. -if test -f configure.ac; then - configure_ac=configure.ac -else - configure_ac=configure.in -fi - -msg="missing on your system" - -case $1 in ---run) - # Try to run requested program, and just exit if it succeeds. - run= - shift - "$@" && exit 0 - # Exit code 63 means version mismatch. This often happens - # when the user try to use an ancient version of a tool on - # a file that requires a minimum version. In this case we - # we should proceed has if the program had been absent, or - # if --run hadn't been passed. - if test $? = 63; then - run=: - msg="probably too old" - fi - ;; - - -h|--h|--he|--hel|--help) - echo "\ -$0 [OPTION]... PROGRAM [ARGUMENT]... - -Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an -error status if there is no known handling for PROGRAM. 
- -Options: - -h, --help display this help and exit - -v, --version output version information and exit - --run try to run the given command, and emulate it if it fails - -Supported PROGRAM values: - aclocal touch file \`aclocal.m4' - autoconf touch file \`configure' - autoheader touch file \`config.h.in' - autom4te touch the output file, or create a stub one - automake touch all \`Makefile.in' files - bison create \`y.tab.[ch]', if possible, from existing .[ch] - flex create \`lex.yy.c', if possible, from existing .c - help2man touch the output file - lex create \`lex.yy.c', if possible, from existing .c - makeinfo touch the output file - tar try tar, gnutar, gtar, then tar without non-portable flags - yacc create \`y.tab.[ch]', if possible, from existing .[ch] - -Send bug reports to ." - exit $? - ;; - - -v|--v|--ve|--ver|--vers|--versi|--versio|--version) - echo "missing $scriptversion (GNU Automake)" - exit $? - ;; - - -*) - echo 1>&2 "$0: Unknown \`$1' option" - echo 1>&2 "Try \`$0 --help' for more information" - exit 1 - ;; - -esac - -# Now exit if we have it, but it failed. Also exit now if we -# don't have it and --version was passed (most likely to detect -# the program). -case $1 in - lex|yacc) - # Not GNU programs, they don't have --version. - ;; - - tar) - if test -n "$run"; then - echo 1>&2 "ERROR: \`tar' requires --run" - exit 1 - elif test "x$2" = "x--version" || test "x$2" = "x--help"; then - exit 1 - fi - ;; - - *) - if test -z "$run" && ($1 --version) > /dev/null 2>&1; then - # We have it, but it failed. - exit 1 - elif test "x$2" = "x--version" || test "x$2" = "x--help"; then - # Could not run --version or --help. This is probably someone - # running `$TOOL --version' or `$TOOL --help' to check whether - # $TOOL exists and not knowing $TOOL uses missing. - exit 1 - fi - ;; -esac - -# If it does not exist, or fails to run (possibly an outdated version), -# try to emulate it. -case $1 in - aclocal*) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`acinclude.m4' or \`${configure_ac}'. You might want - to install the \`Automake' and \`Perl' packages. Grab them from - any GNU archive site." - touch aclocal.m4 - ;; - - autoconf) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`${configure_ac}'. You might want to install the - \`Autoconf' and \`GNU m4' packages. Grab them from any GNU - archive site." - touch configure - ;; - - autoheader) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`acconfig.h' or \`${configure_ac}'. You might want - to install the \`Autoconf' and \`GNU m4' packages. Grab them - from any GNU archive site." - files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}` - test -z "$files" && files="config.h" - touch_files= - for f in $files; do - case $f in - *:*) touch_files="$touch_files "`echo "$f" | - sed -e 's/^[^:]*://' -e 's/:.*//'`;; - *) touch_files="$touch_files $f.in";; - esac - done - touch $touch_files - ;; - - automake*) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'. - You might want to install the \`Automake' and \`Perl' packages. - Grab them from any GNU archive site." - find . -type f -name Makefile.am -print | - sed 's/\.am$/.in/' | - while read f; do touch "$f"; done - ;; - - autom4te) - echo 1>&2 "\ -WARNING: \`$1' is needed, but is $msg. 
- You might have modified some files without having the - proper tools for further handling them. - You can get \`$1' as part of \`Autoconf' from any GNU - archive site." - - file=`echo "$*" | sed -n "$sed_output"` - test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` - if test -f "$file"; then - touch $file - else - test -z "$file" || exec >$file - echo "#! /bin/sh" - echo "# Created by GNU Automake missing as a replacement of" - echo "# $ $@" - echo "exit 0" - chmod +x $file - exit 1 - fi - ;; - - bison|yacc) - echo 1>&2 "\ -WARNING: \`$1' $msg. You should only need it if - you modified a \`.y' file. You may need the \`Bison' package - in order for those modifications to take effect. You can get - \`Bison' from any GNU archive site." - rm -f y.tab.c y.tab.h - if test $# -ne 1; then - eval LASTARG="\${$#}" - case $LASTARG in - *.y) - SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` - if test -f "$SRCFILE"; then - cp "$SRCFILE" y.tab.c - fi - SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` - if test -f "$SRCFILE"; then - cp "$SRCFILE" y.tab.h - fi - ;; - esac - fi - if test ! -f y.tab.h; then - echo >y.tab.h - fi - if test ! -f y.tab.c; then - echo 'main() { return 0; }' >y.tab.c - fi - ;; - - lex|flex) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified a \`.l' file. You may need the \`Flex' package - in order for those modifications to take effect. You can get - \`Flex' from any GNU archive site." - rm -f lex.yy.c - if test $# -ne 1; then - eval LASTARG="\${$#}" - case $LASTARG in - *.l) - SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` - if test -f "$SRCFILE"; then - cp "$SRCFILE" lex.yy.c - fi - ;; - esac - fi - if test ! -f lex.yy.c; then - echo 'main() { return 0; }' >lex.yy.c - fi - ;; - - help2man) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified a dependency of a manual page. You may need the - \`Help2man' package in order for those modifications to take - effect. You can get \`Help2man' from any GNU archive site." - - file=`echo "$*" | sed -n "$sed_output"` - test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` - if test -f "$file"; then - touch $file - else - test -z "$file" || exec >$file - echo ".ab help2man is required to generate this page" - exit 1 - fi - ;; - - makeinfo) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified a \`.texi' or \`.texinfo' file, or any other file - indirectly affecting the aspect of the manual. The spurious - call might also be the consequence of using a buggy \`make' (AIX, - DU, IRIX). You might want to install the \`Texinfo' package or - the \`GNU make' package. Grab either from any GNU archive site." - # The file to touch is that specified with -o ... - file=`echo "$*" | sed -n "$sed_output"` - test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` - if test -z "$file"; then - # ... or it is the one specified with @setfilename ... - infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` - file=`sed -n ' - /^@setfilename/{ - s/.* \([^ ]*\) *$/\1/ - p - q - }' $infile` - # ... or it is derived from the source name (dir/f.texi becomes f.info) - test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info - fi - # If the file does not exist, the user really needs makeinfo; - # let's fail without touching anything. - test -f $file || exit 1 - touch $file - ;; - - tar) - shift - - # We have already tried tar in the generic part. - # Look for gnutar/gtar before invocation to avoid ugly error - # messages. 
- if (gnutar --version > /dev/null 2>&1); then - gnutar "$@" && exit 0 - fi - if (gtar --version > /dev/null 2>&1); then - gtar "$@" && exit 0 - fi - firstarg="$1" - if shift; then - case $firstarg in - *o*) - firstarg=`echo "$firstarg" | sed s/o//` - tar "$firstarg" "$@" && exit 0 - ;; - esac - case $firstarg in - *h*) - firstarg=`echo "$firstarg" | sed s/h//` - tar "$firstarg" "$@" && exit 0 - ;; - esac - fi - - echo 1>&2 "\ -WARNING: I can't seem to be able to run \`tar' with the given arguments. - You may want to install GNU tar or Free paxutils, or check the - command line arguments." - exit 1 - ;; - - *) - echo 1>&2 "\ -WARNING: \`$1' is needed, and is $msg. - You might have modified some files without having the - proper tools for further handling them. Check the \`README' file, - it often tells you about the needed prerequisites for installing - this package. You may also peek at any GNU archive site, in case - some other package would contain this missing \`$1' program." - exit 1 - ;; -esac - -exit 0 - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-end: "$" -# End: diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp index 45d9745102..77ac593f26 100644 --- a/myriadgroestl.cpp +++ b/myriadgroestl.cpp @@ -1,108 +1,130 @@ -#include "uint256.h" -#include "sph/sph_groestl.h" - -#include "cpuminer-config.h" -#include "miner.h" - #include #include +#include #include -extern bool opt_benchmark; +#include "sph/sph_groestl.h" -void myriadgroestl_cpu_init(int thr_id, int threads); -void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn); -void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); +#include "miner.h" -#define SWAP32(x) \ - ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ - (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +void myriadgroestl_cpu_init(int thr_id, uint32_t threads); +void myriadgroestl_cpu_free(int thr_id); +void myriadgroestl_cpu_setBlock(int thr_id, void *data, uint32_t *target); +void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces); -static void myriadhash(void *state, const void *input) +void myriadhash(void *state, const void *input) { - sph_groestl512_context ctx_groestl; + uint32_t _ALIGN(64) hash[16]; + sph_groestl512_context ctx_groestl; + SHA256_CTX sha256; - uint32_t hashA[16], hashB[16]; + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, input, 80); + sph_groestl512_close(&ctx_groestl, hash); - sph_groestl512_init(&ctx_groestl); - sph_groestl512 (&ctx_groestl, input, 80); - sph_groestl512_close(&ctx_groestl, hashA); + SHA256_Init(&sha256); + SHA256_Update(&sha256,(unsigned char *)hash, 64); + SHA256_Final((unsigned char *)hash, &sha256); - SHA256_CTX sha256; - SHA256_Init(&sha256); - SHA256_Update(&sha256,(unsigned char *)hashA, 64); - SHA256_Final((unsigned char *)hashB, &sha256); - memcpy(state, hashB, 32); + memcpy(state, hash, 32); } -extern bool opt_benchmark; - -extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000000ff; +static bool init[MAX_GPUS] = { 0 }; - uint32_t start_nonce = pdata[19]++; - const uint32_t throughPut = 128 * 1024; - - uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); +int 
scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[32]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t start_nonce = pdata[19]; + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] >= 600) ? 20 : 18; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - const uint32_t Htarg = ptarget[7]; + ptarget[7] = 0x0000ff; // init - static bool init[8] = { false, false, false, false, false, false, false, false }; if(!init[thr_id]) { -#if BIG_DEBUG -#else - myriadgroestl_cpu_init(thr_id, throughPut); -#endif + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + myriadgroestl_cpu_init(thr_id, throughput); init[thr_id] = true; } - - uint32_t endiandata[32]; - for (int kk=0; kk < 32; kk++) - be32enc(&endiandata[kk], pdata[kk]); - - // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt) - myriadgroestl_cpu_setBlock(thr_id, endiandata, (void*)ptarget); - + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + myriadgroestl_cpu_setBlock(thr_id, endiandata, ptarget); + do { + memset(work->nonces, 0xff, sizeof(work->nonces)); + // GPU - uint32_t foundNounce = 0xFFFFFFFF; + myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], work->nonces); - myriadgroestl_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce); + *hashes_done = pdata[19] - start_nonce + throughput; - if(foundNounce < 0xffffffff) + if (work->nonces[0] < UINT32_MAX && bench_algo < 0) { - uint32_t tmpHash[8]; - endiandata[19] = SWAP32(foundNounce); - myriadhash(tmpHash, endiandata); - if (tmpHash[7] <= Htarg && - fulltest(tmpHash, ptarget)) { - pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce; - free(outputHash); - return true; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce); + uint32_t _ALIGN(64) vhash[8]; + endiandata[19] = swab32(work->nonces[0]); + myriadhash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != UINT32_MAX) { + endiandata[19] = swab32(work->nonces[1]); + myriadhash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces = 2; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; } - - foundNounce = 0xffffffff; } - if (pdata[19] + throughPut < pdata[19]) + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; - else pdata[19] += throughPut; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = max_nonce - start_nonce; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = pdata[19] - 
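/*
 * The scan loop above treats the 32-bit nonce space carefully: the batch size
 * is clamped to the remaining range up front, and the end-of-range test is
 * done in 64 bits so that pdata[19] + throughput cannot wrap around. The same
 * arithmetic as a standalone sketch (hypothetical helper, not in the patch):
 */
#include <stdint.h>

static bool next_nonce_batch_sketch(uint32_t *cursor, uint32_t max_nonce, uint32_t throughput)
{
	/* 64-bit test: a 32-bit add could wrap and loop forever */
	if ((uint64_t)*cursor + throughput >= max_nonce) {
		*cursor = max_nonce;   /* range exhausted, caller fetches new work */
		return false;
	}
	*cursor += throughput;     /* advance to the next batch of nonces */
	return true;
}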
start_nonce; - free(outputHash); return 0; } +// cleanup +void free_myriad(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + myriadgroestl_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu new file mode 100644 index 0000000000..59d73b79b5 --- /dev/null +++ b/neoscrypt/cuda_neoscrypt.cu @@ -0,0 +1,1552 @@ +// originally from djm34 - github.com/djm34/ccminer-sp-neoscrypt +// kernel code from Nanashi Meiyo-Meijin 1.7.6-r10 (July 2016) + +#include +#include + +#include +#include +#include "cuda_vectors.h" + +typedef uint48 uint4x2; + +#include "miner.h" + +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 500 +#define __byte_perm(x,y,c) x +#define __shfl(x,y,c) x +#define atomicExch(p,x) x +#endif + +static uint32_t* d_NNonce[MAX_GPUS]; + +__device__ uint2x4* W; +__device__ uint2x4* Tr; +__device__ uint2x4* Tr2; +__device__ uint2x4* Input; + +__constant__ uint32_t c_data[64]; +__constant__ uint32_t c_target[2]; +__constant__ uint32_t key_init[16]; +__constant__ uint32_t input_init[16]; + +static const __constant__ uint8 BLAKE2S_IV_Vec = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint8 BLAKE2S_IV_Vechost = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint32_t BLAKE2S_SIGMA_host[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; + +__constant__ uint32_t BLAKE2S_SIGMA[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; + +#define BLOCK_SIZE 64U +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U + +#define SALSA(a,b,c,d) { \ + t = rotateL(a+d, 7U); b ^= t; \ + t = rotateL(b+a, 9U); c ^= t; \ + t = rotateL(c+b, 13U); d ^= t; \ + t = rotateL(d+c, 18U); a ^= t; \ +} + +#define shf_r_clamp32(out,a,b,shift) \ + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(out) : "r"(a), "r"(b), "r"(shift)); + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_t &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t 
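/*
 * WarpShuffle() wraps the pre-CUDA 9 __shfl() intrinsic (with a shared-memory
 * emulation below for pre-Kepler parts). The SALSA_CORE / CHACHA_CORE_PARALLEL
 * macros further down call it with width 4, so the four threads cooperating
 * on one hash can rotate their state columns without touching shared memory.
 * On current toolkits the same exchange would use __shfl_sync(); a
 * hypothetical equivalent, not part of this patch:
 */
__device__ __forceinline__
uint32_t warp_shuffle_width4_sync(uint32_t value, uint32_t src_lane)
{
	/* exchange within groups of 4 lanes; full-warp participation mask */
	return __shfl_sync(0xFFFFFFFFu, value, src_lane, 4);
}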
WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + __shared__ uint32_t shared_mem[32]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + shared_mem[thread] = a; + __threadfence_block(); + + uint32_t result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + __threadfence_block(); + + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_t &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + __shared__ uint32_t shared_mem[32]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + shared_mem[thread] = a1; + __threadfence_block(); + + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + + shared_mem[thread] = a2; + __threadfence_block(); + + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + + shared_mem[thread] = a3; + __threadfence_block(); + + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + __threadfence_block(); +} + +#endif + +#define CHACHA_STEP(a,b,c,d) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateL(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x2103); \ + c += d; b = rotateL(b^c, 7); \ +} + +#if __CUDA_ARCH__ < 500 + +__device__ __forceinline__ +static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2) +{ +#if __CUDA_ARCH__ >= 320 + uint32_t shift = 32U - shift2; + asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift)); + asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift)); +#else + // to check + shift256R(ret, vec4, shift2); +#endif +} + +#define BLAKE(a, b, c, d, key1, key2) { \ + a += key1; \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += key2; \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = 
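/*
 * In CHACHA_STEP above and in the SM 5.x BLAKE macros below, the __byte_perm()
 * selectors stand in for byte-granular rotations: 0x1032 rotates by 16 bits,
 * 0x0321 rotates right by 8, and 0x2103 rotates right by 24 (left by 8),
 * matching the 16/12/8/7 rotation schedule of ChaCha and BLAKE2s.
 * A small equivalence check (hypothetical helpers, not part of this patch):
 */
__device__ __forceinline__ uint32_t rotr32_ref(uint32_t x, uint32_t n)
{
	return (x >> n) | (x << (32u - n));
}

__device__ bool byte_perm_is_rotation(uint32_t x)
{
	return __byte_perm(x, 0, 0x1032) == rotr32_ref(x, 16) &&
	       __byte_perm(x, 0, 0x0321) == rotr32_ref(x,  8) &&
	       __byte_perm(x, 0, 0x2103) == rotr32_ref(x, 24);
}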
rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ +} + +static __forceinline__ __device__ +void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const uint32_t * const __restrict__ TheKey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, 
V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + V.lo ^= V.hi ^ tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, 
V.hi.s1, V.hi.s6, inout); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + for (uint32_t x = 4U; x < 10U; x++) + { + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)out)[0] = V.lo; +} +#endif + +#if __CUDA_ARCH__ >= 500 + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE(a, b, c, d, key1,key2) { \ + a += key1; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key2; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE0(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d^a, 0, 
0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE2(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +static __forceinline__ __device__ +void Blake2S_v2(uint32_t *out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) +{ + uint16 V; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + 
BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vec; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); 
+ BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[9], inout[0]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[5], inout[7]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[2], inout[4]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[10], inout[15]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[14], inout[1]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[11], inout[12]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[6], inout[8]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[3], inout[13]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[2], inout[12]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[6], inout[10]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[0], inout[11]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[8], inout[3]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[4], inout[13]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[7], inout[5]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[15], inout[14]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[1], inout[9]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[12], inout[5]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[1], inout[15]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[14], inout[13]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[4], inout[10]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[0], inout[7]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[6], inout[3]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[9], inout[2]); + BLAKE(V.lo.s3, V.lo.s4, 
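/*
 * The fully unrolled BLAKE_G_PRE / BLAKE calls in Blake2S() and Blake2S_v2()
 * are the ten BLAKE2s rounds with the BLAKE2S_SIGMA message indices folded in
 * as immediates; the _PRE0/_PRE1/_PRE2 variants additionally drop additions of
 * message words known to be zero for this fixed-length input. The generic
 * table-driven round they specialize is sketched below (hypothetical, state as
 * a flat v[16]; relies on the BLAKE2S_SIGMA table defined earlier in this file):
 */
__device__ __forceinline__ uint32_t rotr_g(uint32_t x, uint32_t n)
{
	return (x >> n) | (x << (32u - n));
}

#define G_REF(r, i, a, b, c, d) { \
	a += b + m[BLAKE2S_SIGMA[r][2*(i)]];     d = rotr_g(d ^ a, 16); \
	c += d;                                  b = rotr_g(b ^ c, 12); \
	a += b + m[BLAKE2S_SIGMA[r][2*(i) + 1]]; d = rotr_g(d ^ a, 8);  \
	c += d;                                  b = rotr_g(b ^ c, 7);  \
}

__device__ void blake2s_round_ref(uint32_t v[16], const uint32_t m[16], int r)
{
	G_REF(r, 0, v[0], v[4], v[ 8], v[12]);  /* column steps */
	G_REF(r, 1, v[1], v[5], v[ 9], v[13]);
	G_REF(r, 2, v[2], v[6], v[10], v[14]);
	G_REF(r, 3, v[3], v[7], v[11], v[15]);
	G_REF(r, 4, v[0], v[5], v[10], v[15]);  /* diagonal steps */
	G_REF(r, 5, v[1], v[6], v[11], v[12]);
	G_REF(r, 6, v[2], v[7], v[ 8], v[13]);
	G_REF(r, 7, v[3], v[4], v[ 9], v[14]);
}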
V.hi.s1, V.hi.s6, inout[8], inout[11]); + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[13], inout[11]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[7], inout[14]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[12], inout[1]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[3], inout[9]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[5], inout[0]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[15], inout[4]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[8], inout[6]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[2], inout[10]); + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[6], inout[15]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[14], inout[9]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[11], inout[3]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[0], inout[8]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[12], inout[2]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[13], inout[7]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[1], inout[4]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[10], inout[5]); + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[10], inout[2]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[8], inout[4]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[7], inout[6]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[1], inout[5]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[15], inout[11]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[9], inout[14]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[3], inout[12]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[13], inout[0]); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + ((uint8*)out)[0] = V.lo; +} + +#endif /* __CUDA_ARCH__ >= 500 */ + +#define SALSA_CORE(state) { \ + uint32_t t; \ + SALSA(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1,4); \ + SALSA(state.x, state.w, state.z, state.y); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3,4); \ +} + +#define CHACHA_CORE_PARALLEL(state) { \ + CHACHA_STEP(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3,4); \ + CHACHA_STEP(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1,4); \ +} + +__forceinline__ __device__ +uint4 salsa_small_scalar_rnd(const uint4 X) +{ + uint4 state = X; + + #pragma nounroll + for (int i = 0; i < 10; i++) { + SALSA_CORE(state); + } + + return (X + state); +} + +__device__ __forceinline__ +uint4 chacha_small_parallel_rnd(const uint4 X) +{ + uint4 state = X; + + #pragma nounroll + for (int i = 0; i < 10; i++) { + CHACHA_CORE_PARALLEL(state); + } + return (X + state); +} + +__device__ __forceinline__ +void neoscrypt_chacha(uint4 XV[4]) +{ + uint4 temp; + + XV[0] = chacha_small_parallel_rnd(XV[0] ^ XV[3]); + temp = chacha_small_parallel_rnd(XV[1] ^ XV[0]); + XV[1] = chacha_small_parallel_rnd(XV[2] ^ temp); + XV[3] = chacha_small_parallel_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + +__device__ __forceinline__ +void neoscrypt_salsa(uint4 XV[4]) +{ + uint4 temp; + + XV[0] = salsa_small_scalar_rnd(XV[0] ^ XV[3]); + temp = salsa_small_scalar_rnd(XV[1] ^ XV[0]); + XV[1] = salsa_small_scalar_rnd(XV[2] ^ temp); + 
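/*
 * neoscrypt_chacha() / neoscrypt_salsa() implement the NeoScrypt block-mix
 * over four sub-blocks: each sub-block is XORed with the previous output and
 * run through the 20-round core, and the final step swaps the middle two
 * results (B0'=Y0, B1'=Y2, B2'=Y1, B3'=Y3). A generic restatement of that
 * chaining (hypothetical; relies on the uint4 operators from cuda_vectors.h):
 */
template <typename CoreFn>
__device__ __forceinline__ void neoscrypt_blockmix_sketch(uint4 X[4], CoreFn F)
{
	uint4 y1;
	X[0] = F(X[0] ^ X[3]);   /* Y0 = F(B0 ^ B3) */
	y1   = F(X[1] ^ X[0]);   /* Y1 = F(B1 ^ Y0) */
	X[1] = F(X[2] ^ y1);     /* Y2 = F(B2 ^ Y1), stored as B1' */
	X[3] = F(X[3] ^ X[1]);   /* Y3 = F(B3 ^ Y2) */
	X[2] = y1;               /* Y1 stored as B2' (sub-block shuffle) */
}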
XV[3] = salsa_small_scalar_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + + +#if __CUDA_ARCH__ < 500 +static __forceinline__ __device__ +void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data) +{ + uint2x4 output[8]; + uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 }; + + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + + ((uintx64*)(B))[0] = ((uintx64*)c_data)[0]; + ((uint32_t*)B)[19] = nonce; + ((uint32_t*)B)[39] = nonce; + ((uint32_t*)B)[59] = nonce; + + ((uint816*)input)[0] = ((uint816*)input_init)[0]; + ((uint4x2*)key)[0] = ((uint4x2*)key_init)[0]; + + #pragma unroll 1 + for (int i = 0; i < 31; i++) + { + uint32_t bufidx = 0; + #pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t shifted[9]; + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + uint32_t temp[9]; + //#pragma unroll + for (int k = 0; k < 9; k++) + { + uint32_t indice = (k + qbuf) & 0x3f; + temp[k] = B[indice] ^ shifted[k]; + B[indice] = temp[k]; + } +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = c_data[qbuf & 0x3f], b; + //#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19U - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + for (int k = 0; k<8; k++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[k]) : "r"(temp[k]), "r"(temp[k + 1]), "r"(bitbuf)); +#else + //#error SM 3.0 code missing here + printf("", data18, data20); +#endif + Blake2S(input, input, key); + } + + uint32_t bufidx = 0; + #pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + +#if __CUDA_ARCH__ >= 320 + for (int i = 0; i<64; i++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[(qbuf + i) & 0x3f]), "r"(B[(qbuf + i + 1) & 0x3f4]), "r"(bitbuf)); +#endif + + ((ulonglong4*)output)[0] ^= ((ulonglong4*)input)[0]; + ((uintx64*)output)[0] ^= ((uintx64*)c_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce; + + for (int i = 0; i<8; i++) + (Input + 8U * thread)[i] = output[i]; +} +#endif + +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +void fastkdf256_v2(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data) +{ + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + uint32_t input[16]; + uint32_t key[16] = { 0 }; + uint32_t qbuf, rbuf, bitbuf; + + uint32_t* 
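/*
 * The bufidx computation in fastkdf256_v1() above derives FastKDF's buffer
 * offset: the byte-sum of the 32-byte Blake2s output, reduced mod 256, then
 * split into a word offset (qbuf) and a bit offset (bitbuf) into the 256-byte
 * ring buffer B. The masked-add "bufhelper" expression is a branch-free byte
 * sum; a plain host-side reference of the same reduction (hypothetical):
 */
#include <stdint.h>

static uint32_t fastkdf_offset_ref(const uint32_t prf_out[8])
{
	uint32_t sum = 0;
	for (int i = 0; i < 8; i++) {
		const uint8_t *b = (const uint8_t *)&prf_out[i];
		sum += b[0] + b[1] + b[2] + b[3];  /* same total as the bufhelper trick */
	}
	return sum & 0xff;                     /* byte offset into the 256-byte buffer */
}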
B = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)(B))[0] = ((uintx64*)c_data)[0]; + + B[19] = nonce; + B[39] = nonce; + B[59] = nonce; + + { + uint32_t bufidx = 0; + #pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input_init[x] & 0x00ff00ff) + ((input_init[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t temp[9]; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input_init[0]), "r"(bitbuf)); + temp[0] = B[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[0]), "r"(input_init[1]), "r"(shift)); + temp[1] = B[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[1]), "r"(input_init[2]), "r"(shift)); + temp[2] = B[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[2]), "r"(input_init[3]), "r"(shift)); + temp[3] = B[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[3]), "r"(input_init[4]), "r"(shift)); + temp[4] = B[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[4]), "r"(input_init[5]), "r"(shift)); + temp[5] = B[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[5]), "r"(input_init[6]), "r"(shift)); + temp[6] = B[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[6]), "r"(input_init[7]), "r"(shift)); + temp[7] = B[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input_init[7]), "r"(shift)); + temp[8] = B[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + + #pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + + #pragma unroll + for (int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; + } + + for (int i = 1; i < 31; i++) + 
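	/* FastKDF rounds 1..30: a bytewise sum of the previous Blake2s output selects the
	   offset into the 256-byte buffer B (qbuf words plus bitbuf bits); the output is
	   XORed into B at that offset, the next Blake2s message is rebuilt from the constant
	   block data at the same offset (with the nonce spliced into its rotated position),
	   the key is taken from the freshly mixed buffer words, and Blake2s runs again to
	   drive the next round. */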
{ + uint32_t bufidx = 0; + #pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t temp[9]; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[0]), "r"(bitbuf)); + temp[0] = B[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[0]), "r"(input[1]), "r"(shift)); + temp[1] = B[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[1]), "r"(input[2]), "r"(shift)); + temp[2] = B[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[2]), "r"(input[3]), "r"(shift)); + temp[3] = B[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[3]), "r"(input[4]), "r"(shift)); + temp[4] = B[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[4]), "r"(input[5]), "r"(shift)); + temp[5] = B[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[5]), "r"(input[6]), "r"(shift)); + temp[6] = B[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[6]), "r"(input[7]), "r"(shift)); + temp[7] = B[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[7]), "r"(shift)); + temp[8] = B[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + + #pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + + #pragma unroll + for (int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; + } + + { + uint32_t bufidx = 0; + #pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 
0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + } + + uint2x4 output[8]; + for (int i = 0; i<64; i++) { + const uint32_t a = (qbuf + i) & 0x3f, b = (qbuf + i + 1) & 0x3f; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[a]), "r"(B[b]), "r"(bitbuf)); + } + + output[0] ^= ((uint2x4*)input)[0]; + #pragma unroll + for (int i = 0; i<8; i++) + output[i] ^= ((uint2x4*)c_data)[i]; + + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce;; + ((ulonglong16 *)(Input + 8U * thread))[0] = ((ulonglong16*)output)[0]; +} +#endif + +#if __CUDA_ARCH__ < 500 +static __forceinline__ __device__ +uint32_t fastkdf32_v1(uint32_t thread, const uint32_t nonce, uint32_t* const salt, uint32_t* const s_data) +{ + const uint32_t cdata7 = c_data[7]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + + uint32_t* B0 = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + ((uint816*)input)[0] = ((uint816*)c_data)[0]; + + uint32_t key[BLAKE2S_BLOCK_SIZE / 4]; + ((uint4x2*)key)[0] = ((uint4x2*)salt)[0]; + ((uint4*)key)[2] = make_uint4(0, 0, 0, 0); + ((uint4*)key)[3] = make_uint4(0, 0, 0, 0); + + uint32_t qbuf, rbuf, bitbuf; + uint32_t temp[9]; + + #pragma nounroll + for (int i = 0; i < 31; i++) + { + Blake2S(input, input, key); + + uint32_t bufidx = 0; + #pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + for (int k = 0; k < 9; k++) { + temp[k] = B0[(k + qbuf) & 0x3f]; + } + + ((uint2x4*)temp)[0] ^= ((uint2x4*)shifted)[0]; + temp[8] ^= shifted[8]; + +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = c_data[qbuf & 0x3f], b; + //#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19U - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos != 0) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); +#else + //#error 
SM 3.0 code missing here + printf("", data18, data20); +#endif + for (int k = 0; k < 9; k++) { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + + Blake2S(input, input, key); + + uint32_t bufidx = 0; + #pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for (int k = 7; k < 9; k++) { + temp[k] = B0[(k + qbuf) & 0x3f]; + } + + uint32_t output; +#if __CUDA_ARCH__ >= 320 + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); +#else + output = (MAKE_ULONGLONG(temp[7], temp[8]) >> bitbuf); // to check maybe 7/8 reversed +#endif + output ^= input[7] ^ cdata7; + return output; +} +#endif + +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +uint32_t fastkdf32_v3(uint32_t thread, const uint32_t nonce, uint32_t* const salt, uint32_t* const s_data) +{ + const uint32_t cdata7 = c_data[7]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + + uint32_t* B0 = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + ((uint816*)input)[0] = ((uint816*)c_data)[0]; + + uint32_t key[BLAKE2S_BLOCK_SIZE / 4]; + ((uint4x2*)key)[0] = ((uint4x2*)salt)[0]; + ((uint4*)key)[2] = make_uint4(0, 0, 0, 0); + ((uint4*)key)[3] = make_uint4(0, 0, 0, 0); + + uint32_t qbuf, rbuf, bitbuf; + uint32_t temp[9]; + + #pragma nounroll + for (int i = 0; i < 31; i++) + { + Blake2S_v2(input, input, key); + + uint32_t bufidx = 0; + #pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[0]), "r"(bitbuf)); + temp[0] = B0[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[0]), "r"(input[1]), "r"(shift)); + temp[1] = B0[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[1]), "r"(input[2]), "r"(shift)); + temp[2] = B0[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[2]), "r"(input[3]), "r"(shift)); + temp[3] = B0[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[3]), "r"(input[4]), "r"(shift)); + temp[4] = B0[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[4]), "r"(input[5]), "r"(shift)); + temp[5] = B0[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[5]), "r"(input[6]), "r"(shift)); + temp[6] = B0[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[6]), "r"(input[7]), "r"(shift)); + temp[7] = B0[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[7]), "r"(shift)); + temp[8] = B0[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + #pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), 
"r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19U - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + #pragma unroll + for (int k = 0; k < 9; k++) { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + + Blake2S_v2(input, input, key); + + uint32_t bufidx = 0; + #pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + temp[7] = B0[(qbuf + 7) & 0x3f]; + temp[8] = B0[(qbuf + 8) & 0x3f]; + + uint32_t output; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= input[7] ^ cdata7; + return output; +} +#endif + + +#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,16); \ + c += d; b = ROTR32(b^c, 12); \ + idx = BLAKE2S_SIGMA_host[idx0][idx1 + 1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,8); \ + c += d; b = ROTR32(b^c, 7); \ +} + +static void Blake2Shost(uint32_t * inout, const uint32_t * inkey) +{ + uint16 V; + uint32_t idx; + uint8 tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + V.lo = BLAKE2S_IV_Vechost; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + for (int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); + } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for (int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + 
BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0] = V.lo; +} + + +#define TPB 32 +#define TPB2 64 + +__global__ +__launch_bounds__(TPB2, 1) +void neoscrypt_gpu_hash_start(const int stratum, const uint32_t startNonce) +{ + __shared__ uint32_t s_data[64 * TPB2]; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! + + __syncthreads(); +#if __CUDA_ARCH__ < 500 + fastkdf256_v1(thread, ZNonce, s_data); +#else + fastkdf256_v2(thread, ZNonce, s_data); +#endif +} + +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_chacha1() +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + const uint32_t threads = (gridDim.x * blockDim.y); + const uint32_t shiftTr = 8U * thread; + + uint4 X[4]; + for (int i = 0; i < 4; i++) + { + X[i].x = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 0 * 4 + threadIdx.x); + X[i].y = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 1 * 4 + threadIdx.x); + X[i].z = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 2 * 4 + threadIdx.x); + X[i].w = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 3 * 4 + threadIdx.x); + } + + #pragma nounroll + for (int i = 0; i < 128; i++) + { + uint32_t offset = 8U * (thread + threads * i); + for (int j = 0; j < 4; j++) + ((uint4*)(W + offset))[j * 4 + threadIdx.x] = X[j]; + neoscrypt_chacha(X); + } + + #pragma nounroll + for (int t = 0; t < 128; t++) + { + uint32_t offset = 8U * (thread + threads * (WarpShuffle(X[3].x, 0, 4) & 0x7F)); + for (int j = 0; j < 4; j++) + X[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; + neoscrypt_chacha(X); + } + + #pragma unroll + for (int i = 0; i < 4; i++) + { + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 0 * 4 + threadIdx.x) = X[i].x; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 1 * 4 + threadIdx.x) = X[i].y; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 2 * 4 + threadIdx.x) = X[i].z; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 3 * 4 + threadIdx.x) = X[i].w; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_salsa1() +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + const uint32_t threads = (gridDim.x * blockDim.y); + const uint32_t shiftTr = 8U * thread; + + uint4 Z[4]; + for (int i = 0; i < 4; i++) + { + Z[i].x = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((0 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].y = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((1 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].z = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((2 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].w = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((3 + threadIdx.x) & 3) * 4 + threadIdx.x); + } + + #pragma nounroll + for (int i = 0; i < 128; i++) + { + uint32_t offset = 8U * (thread + threads * i); + for (int j = 0; j < 4; j++) + ((uint4*)(W + offset))[j * 4 + threadIdx.x] = Z[j]; + neoscrypt_salsa(Z); + } + + #pragma nounroll + for (int t = 0; t < 128; t++) + { + uint32_t offset = 8U * (thread + threads * (WarpShuffle(Z[3].x, 0, 4) & 0x7F)); + for (int j = 0; j < 4; j++) + Z[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; + neoscrypt_salsa(Z); + } + #pragma unroll + for (int i 
= 0; i < 4; i++) + { + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((0 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].x; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((1 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].y; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((2 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].z; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((3 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].w; + } +} + +__global__ +__launch_bounds__(TPB2, 8) +void neoscrypt_gpu_hash_ending(const int stratum, const uint32_t startNonce, uint32_t *resNonces) +{ + __shared__ uint32_t s_data[64 * TPB2]; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t shiftTr = thread * 8U; + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; + + __syncthreads(); + + uint2x4 Z[8]; + #pragma unroll + for (int i = 0; i<8; i++) + Z[i] = __ldg4(&(Tr2 + shiftTr)[i]) ^ __ldg4(&(Tr + shiftTr)[i]); + +#if __CUDA_ARCH__ < 500 + uint32_t outbuf = fastkdf32_v1(thread, ZNonce, (uint32_t*)Z, s_data); +#else + uint32_t outbuf = fastkdf32_v3(thread, ZNonce, (uint32_t*)Z, s_data); +#endif + + if (outbuf <= c_target[1]) + { + resNonces[0] = nonce; + //uint32_t tmp = atomicExch(resNonces, nonce); + //if(tmp != UINT32_MAX) + // resNonces[1] = tmp; + } +} + +static __thread uint32_t *hash1 = NULL; +static __thread uint32_t *Trans1 = NULL; +static __thread uint32_t *Trans2 = NULL; // 2 streams +static __thread uint32_t *Trans3 = NULL; // 2 streams + +__host__ +void neoscrypt_init(int thr_id, uint32_t threads) +{ + cuda_get_arch(thr_id); + + CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(W, &hash1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Tr, &Trans1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Tr2, &Trans2, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Input, &Trans3, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); +} + +__host__ +void neoscrypt_free(int thr_id) +{ + cudaFree(d_NNonce[thr_id]); + + cudaFree(hash1); + cudaFree(Trans1); + cudaFree(Trans2); + cudaFree(Trans3); +} + +__host__ +void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum) +{ + CUDA_SAFE_CALL(cudaMemset(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t))); + + const int threadsperblock2 = TPB2; + dim3 grid2((threads + threadsperblock2 - 1) / threadsperblock2); + dim3 block2(threadsperblock2); + + const int threadsperblock = TPB; + dim3 grid3((threads * 4 + threadsperblock - 1) / threadsperblock); + dim3 block3(4, threadsperblock >> 2); + + neoscrypt_gpu_hash_start <<<grid2, block2>>> (stratum, startNounce); //fastkdf + + neoscrypt_gpu_hash_salsa1 <<<grid3, block3>>> (); + neoscrypt_gpu_hash_chacha1 <<<grid3, block3>>> (); + + neoscrypt_gpu_hash_ending <<<grid2, block2>>> (stratum, startNounce, d_NNonce[thr_id]); //fastkdf+end + + CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_NNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); +} + +__host__ +void neoscrypt_setBlockTarget(uint32_t* const pdata, uint32_t* const target) +{ + uint32_t PaddedMessage[64]; + uint32_t input[16], key[16] = { 0 }; + + for (int i = 0; i < 19; i++) + { + 
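	/* Replicate the 19 fixed header words at offsets 0, 20 and 40 of the 64-word padded
	   message; the three nonce slots (words 19, 39 and 59) are zeroed below and are
	   filled with the per-thread nonce on the GPU. */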
PaddedMessage[i] = pdata[i]; + PaddedMessage[i + 20] = pdata[i]; + PaddedMessage[i + 40] = pdata[i]; + } + for (int i = 0; i<4; i++) + PaddedMessage[i + 60] = pdata[i]; + + PaddedMessage[19] = 0; + PaddedMessage[39] = 0; + PaddedMessage[59] = 0; + + ((uint16*)input)[0] = ((uint16*)pdata)[0]; + ((uint8*)key)[0] = ((uint8*)pdata)[0]; + + Blake2Shost(input, key); + + cudaMemcpyToSymbol(input_init, input, 64, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(key_init, key, 64, 0, cudaMemcpyHostToDevice); + + cudaMemcpyToSymbol(c_target, &target[6], 2 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_data, PaddedMessage, 64 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaGetLastError()); +} + diff --git a/neoscrypt/cuda_vectors.h b/neoscrypt/cuda_vectors.h new file mode 100644 index 0000000000..74e8843ce3 --- /dev/null +++ b/neoscrypt/cuda_vectors.h @@ -0,0 +1,570 @@ +#ifndef CUDA_VECTOR_H +#define CUDA_VECTOR_H + + +/////////////////////////////////////////////////////////////////////////////////// +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +//typedef __device_builtin__ struct ulong16 ulong16; + + +typedef struct __align__(32) uint8 +{ + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) ulonglong2to8 +{ +ulonglong2 l0,l1,l2,l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16 +{ + ulonglong2to8 lo, hi; +} ulonglong8to16; + +typedef struct __align__(256) ulonglong16to32 +{ + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(512) ulonglong32to64 +{ + ulonglong16to32 lo, hi; +} ulonglong32to64; + + + +typedef struct __align__(1024) ulonglonglong +{ + ulonglong8to16 s0,s1,s2,s3,s4,s5,s6,s7; +} ulonglonglong; + + + + +typedef struct __align__(64) uint16 +{ + union { + struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;}; + uint8 lo; + }; + union { + struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;}; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint32 +{ + + uint16 lo,hi; +} uint32; + + + +struct __align__(128) ulong8 +{ + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + + +typedef struct __align__(256) ulonglong16 +{ + ulonglong2 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf; +} ulonglong16; + +typedef struct __align__(32) uint48 +{ + uint4 s0, s1; + +} uint48; + +typedef struct __align__(64) uint816 +{ + uint48 s0, s1; + +} uint816; + +typedef struct __align__(128) uint1632 +{ + uint816 s0, s1; + +} uint1632; + +typedef struct __align__(256) uintx64 +{ + uint1632 s0, s1; + +} uintx64; + +typedef struct __align__(512) uintx128 +{ + uintx64 s0, s1; + +} uintx128; + +typedef struct __align__(1024) uintx256 +{ + uintx128 s0, s1; + +} uintx256; + + + +typedef struct __align__(256) uint4x16 +{ + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ +ulonglong2to8 t; t.l0=s0; t.l1=s1; t.l2=s2; t.l3=s3; +return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + ulonglong16to32 t; t.lo = s0; t.hi = 
s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong8to16 &s0, const ulonglong8to16 &s1, const ulonglong8to16 &s2, const ulonglong8to16 &s3, + const ulonglong8to16 &s4, const ulonglong8to16 &s5, const ulonglong8to16 &s6, const ulonglong8to16 &s7) +{ + ulonglonglong t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uint816 make_uint816(const uint48 &s0, const uint48 &s1) +{ + uint816 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uint1632 make_uint1632(const uint816 &s0, const uint816 &s1) +{ + uint1632 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx64 make_uintx64(const uint1632 &s0, const uint1632 &s1) +{ + uintx64 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx128 make_uintx128(const uintx64 &s0, const uintx64 &s1) +{ + uintx128 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uintx256 make_uintx256(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; + return t; +} + + +static __inline__ __device__ uintx256 make_uintx64(const uintx128 &s0, const uintx128 &s1) +{ + uintx256 t; t.s0 = s0; t.s1 = s1; + return t; +} + + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + + + + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo=a; t.hi=b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong2 &s0, const ulonglong2 &s1, + const ulonglong2 &s2, const ulonglong2 &s3, const ulonglong2 &s4, const ulonglong2 &s5, const ulonglong2 &s6, const ulonglong2 &s7, + const ulonglong2 &s8, const ulonglong2 &s9, + const ulonglong2 &sa, const ulonglong2 
&sb, const ulonglong2 &sc, const ulonglong2 &sd, const ulonglong2 &se, const ulonglong2 &sf +) { + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + + + +static __inline__ __host__ __device__ ulong8 make_ulong8( + ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + + + + + +static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); } +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x + b.x, a.y + b.y); } + +static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} //, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} //, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + +////////////// mess++ ////// + +static __forceinline__ __device__ uint48 operator^ (const uint48 &a, const uint48 &b) { + return make_uint48(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint816 operator^ (const uint816 &a, const uint816 &b) { + return make_uint816(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uint1632 operator^ (const uint1632 &a, const uint1632 &b) { + return make_uint1632(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + + +static __forceinline__ __device__ uintx64 operator^ (const uintx64 &a, const uintx64 &b) { + return make_uintx64(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx128 operator^ (const uintx128 &a, const uintx128 &b) { + return make_uintx128(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + +static __forceinline__ __device__ uintx256 operator^ (const uintx256 &a, const uintx256 &b) { + return make_uintx256(a.s0 ^ b.s0, a.s1 ^ b.s1); +} + 
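/*
 * Usage sketch (illustrative only; xor_fastkdf_buffer is a hypothetical helper, not part
 * of this header): the nested aggregates above (uint4 -> uint48 -> uint816 -> uint1632 ->
 * uintx64) let a kernel copy or XOR a whole 256-byte FastKDF buffer with one expression,
 * which the overloaded operators expand into uint4-wide accesses. Assumes this header is
 * included and both pointers are 256-byte aligned, as the __align__(256) uintx64 requires.
 */
static __forceinline__ __device__
void xor_fastkdf_buffer(uint32_t *dst, const uint32_t *src)
{
	/* 64 x uint32_t = 256 bytes XORed per statement, via operator^ (uintx64) above */
	((uintx64*)dst)[0] = ((uintx64*)dst)[0] ^ ((uintx64*)src)[0];
}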
+///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf +); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf +); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uintx64 &a, const uintx64 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (uintx128 &a, const uintx128 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uintx256 &a, const uintx256 &b) { a = a ^ b; } + + +static __forceinline__ __device__ void operator^= (uint816 &a, const uint816 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (uint48 &a, const uint48 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uint32 &a, const uint32 &b) { a = a + b; } + + +static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) { a = a + b; } + +static __forceinline__ __device__ +ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3); +} +static __forceinline__ __device__ +ulonglong2to8 operator+ (const ulonglong2to8 &a, const 
ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3); +} + + +static __forceinline__ __device__ +ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong8to16 operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator+ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi); +} + + +static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) { + return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) { + return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + + +static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; } +static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) { a = a ^ b; } + +#if __CUDA_ARCH__ < 320 + +#define rotateL ROTL32 +#define rotateR ROTR32 + +#else + +static __forceinline__ __device__ uint32_t rotateL(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return 
ret; +} + +static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift) +{ + uint32_t ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift)); + return ret; +} + +#endif + +#if __CUDA_ARCH__ < 320 + +// right shift a 64-bytes integer (256-bits) by 0 8 16 24 bits +// require a uint32_t[9] ret array +// note: djm neoscrypt implementation is near the limits of gpu capabilities +// and weird behaviors can happen when tuning device functions code... +__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +{ + uint8_t *v = (uint8_t*) &vec4.s0; + uint8_t *r = (uint8_t*) ret; + uint8_t bytes = (uint8_t) (shift >> 3); + ret[0] = 0; + for (uint8_t i=bytes; i<32; i++) + r[i] = v[i-bytes]; + ret[8] = vec4.s7 >> (32 - shift); // shuffled part required +} + +#else + +// same for SM 3.5+, really faster ? +__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +{ + uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[8] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s6); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[7] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s5); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[6] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s4); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[5] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s3); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[4] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s2); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[3] = cuda_swab32(truc); + truc2 = cuda_swab32(vec4.s1); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); + ret[2] = cuda_swab32(truc); + truc3 = cuda_swab32(vec4.s0); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc2), "r"(truc3), "r"(shift)); + ret[1] = cuda_swab32(truc); + asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); + ret[0] = cuda_swab32(truc); +} +#endif + +#if __CUDA_ARCH__ < 320 + +// copy 256 bytes +static __device__ __inline__ uintx64 ldg256(const uint4 *ptr) +{ + uintx64 ret; + uint32_t *dst = (uint32_t*) &ret.s0; + uint32_t *src = (uint32_t*) &ptr[0].x; + for (int i=0; i < (256 / sizeof(uint32_t)); i++) { + dst[i] = src[i]; + } + return ret; +} + +#else + +// complicated way to copy 256 bytes ;) +static __device__ __inline__ uintx64 ldg256(const uint4 *ptr) +{ + uintx64 ret; + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s0.s0.s0.s1.x), "=r"(ret.s0.s0.s0.s1.y), "=r"(ret.s0.s0.s0.s1.z), "=r"(ret.s0.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret.s0.s0.s1.s0.x), "=r"(ret.s0.s0.s1.s0.y), "=r"(ret.s0.s0.s1.s0.z), "=r"(ret.s0.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret.s0.s0.s1.s1.x), "=r"(ret.s0.s0.s1.s1.y), "=r"(ret.s0.s0.s1.s1.z), "=r"(ret.s0.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret.s0.s1.s0.s0.x), 
"=r"(ret.s0.s1.s0.s0.y), "=r"(ret.s0.s1.s0.s0.z), "=r"(ret.s0.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret.s0.s1.s0.s1.x), "=r"(ret.s0.s1.s0.s1.y), "=r"(ret.s0.s1.s0.s1.z), "=r"(ret.s0.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+96];" : "=r"(ret.s0.s1.s1.s0.x), "=r"(ret.s0.s1.s1.s0.y), "=r"(ret.s0.s1.s1.s0.z), "=r"(ret.s0.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+112];" : "=r"(ret.s0.s1.s1.s1.x), "=r"(ret.s0.s1.s1.s1.y), "=r"(ret.s0.s1.s1.s1.z), "=r"(ret.s0.s1.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+128];" : "=r"(ret.s1.s0.s0.s0.x), "=r"(ret.s1.s0.s0.s0.y), "=r"(ret.s1.s0.s0.s0.z), "=r"(ret.s1.s0.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+144];" : "=r"(ret.s1.s0.s0.s1.x), "=r"(ret.s1.s0.s0.s1.y), "=r"(ret.s1.s0.s0.s1.z), "=r"(ret.s1.s0.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+160];" : "=r"(ret.s1.s0.s1.s0.x), "=r"(ret.s1.s0.s1.s0.y), "=r"(ret.s1.s0.s1.s0.z), "=r"(ret.s1.s0.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+176];" : "=r"(ret.s1.s0.s1.s1.x), "=r"(ret.s1.s0.s1.s1.y), "=r"(ret.s1.s0.s1.s1.z), "=r"(ret.s1.s0.s1.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+192];" : "=r"(ret.s1.s1.s0.s0.x), "=r"(ret.s1.s1.s0.s0.y), "=r"(ret.s1.s1.s0.s0.z), "=r"(ret.s1.s1.s0.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+208];" : "=r"(ret.s1.s1.s0.s1.x), "=r"(ret.s1.s1.s0.s1.y), "=r"(ret.s1.s1.s0.s1.z), "=r"(ret.s1.s1.s0.s1.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+224];" : "=r"(ret.s1.s1.s1.s0.x), "=r"(ret.s1.s1.s1.s0.y), "=r"(ret.s1.s1.s1.s0.z), "=r"(ret.s1.s1.s1.s0.w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); + return ret; +} +#endif + +#endif // #ifndef CUDA_VECTOR_H diff --git a/neoscrypt/neoscrypt-cpu.c b/neoscrypt/neoscrypt-cpu.c new file mode 100644 index 0000000000..d1ef2ff3bd --- /dev/null +++ b/neoscrypt/neoscrypt-cpu.c @@ -0,0 +1,989 @@ +/* + * Copyright (c) 2009 Colin Percival, 2011 ArtForz + * Copyright (c) 2012 Andrew Moon (floodyberry) + * Copyright (c) 2012 Samuel Neves + * Copyright (c) 2014 John Doering + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include +#include +#include + +#include "neoscrypt.h" + +#ifdef WIN32 +/* sizeof(unsigned long) = 4 for MinGW64 */ +typedef unsigned long long ulong; +#else +typedef unsigned long ulong; +#endif +typedef unsigned int uint; +typedef unsigned char uchar; +typedef unsigned int bool; + +#define MIN(a, b) ((a) < (b) ? a : b) +#define MAX(a, b) ((a) > (b) ? a : b) + +/* SHA-256 */ + +static const uint32_t sha256_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3)) +#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10)) +#define W0(in,i) (U8TO32_BE(&in[i * 4])) +#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) +#define STEP(i) \ + t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ + t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \ + r[7] = r[6]; \ + r[6] = r[5]; \ + r[5] = r[4]; \ + r[4] = r[3] + t0; \ + r[3] = r[2]; \ + r[2] = r[1]; \ + r[1] = r[0]; \ + r[0] = t0 + t1; + + +typedef struct sha256_hash_state_t { + uint32_t H[8]; + uint64_t T; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} sha256_hash_state; + + +static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks) +{ + uint32_t r[8], w[64], t0, t1; + size_t i; + + for (i = 0; i < 8; i++) + r[i] = S->H[i]; + + while (blocks--) { + for (i = 0U; i < 16; i++) { + w[i] = W0(in, i); + } + for (i = 16; i < 64; i++) { + w[i] = W1(i); + } + for (i = 0U; i < 64; i++) { + STEP(i); + } + for (i = 0U; i < 8U; i++) { + r[i] += S->H[i]; + S->H[i] = r[i]; + } + S->T += SCRYPT_HASH_BLOCK_SIZE * 8; + in += SCRYPT_HASH_BLOCK_SIZE; + } +} + +static void neoscrypt_hash_init_sha256(sha256_hash_state *S) +{ + S->H[0] = 0x6a09e667; + S->H[1] = 0xbb67ae85; + S->H[2] = 0x3c6ef372; + S->H[3] = 0xa54ff53a; + S->H[4] = 0x510e527f; + S->H[5] = 0x9b05688c; + S->H[6] = 0x1f83d9ab; + S->H[7] = 0x5be0cd19; + S->T = 0; + S->leftover = 0; +} + +static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen) +{ + size_t blocks, want; 
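	/* Overall buffering: first top up and hash any partial block left over from a
	   previous call, then hash as many whole 64-byte blocks as possible straight from
	   'in', and finally stash the remaining tail in S->buffer for the next call. */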
+ + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash) +{ + uint64_t t = S->T + (S->leftover * 8); + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + sha256_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 56); + } + + U64TO8_BE(S->buffer + 56, t); + sha256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen) +{ + sha256_hash_state st; + neoscrypt_hash_init_sha256(&st); + neoscrypt_hash_update_sha256(&st, m, mlen); + neoscrypt_hash_finish_sha256(&st, hash); +} + + +/* HMAC for SHA-256 */ + +typedef struct sha256_hmac_state_t { + sha256_hash_state inner, outer; +} sha256_hmac_state; + +static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) +{ + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + neoscrypt_hash_init_sha256(&st->inner); + neoscrypt_hash_init_sha256(&st->outer); + + if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + neoscrypt_hash_sha256(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + neoscrypt_hash_update_sha256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + neoscrypt_hash_update_sha256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +} + +static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen) +{ + /* h(inner || m...) 
*/ + neoscrypt_hash_update_sha256(&st->inner, m, mlen); +} + +static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac) +{ + /* h(inner || m) */ + hash_digest innerhash; + neoscrypt_hash_finish_sha256(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + neoscrypt_hash_update_sha256(&st->outer, innerhash, sizeof(innerhash)); + neoscrypt_hash_finish_sha256(&st->outer, mac); +} + + +/* PBKDF2 for SHA-256 */ + +static void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len) +{ + sha256_hmac_state hmac_pw, hmac_pw_salt, work; + hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, k, blocks; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for(i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + neoscrypt_hmac_update_sha256(&work, be, 4); + neoscrypt_hmac_finish_sha256(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for(j = 0; j < N - 1; j++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE); + neoscrypt_hmac_finish_sha256(&work, u); + + /* T[i] ^= UX */ + for(k = 0; k < sizeof(u); k++) + ti[k] ^= u[k]; + } + + memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : output_len); + output += SCRYPT_HASH_DIGEST_SIZE; + output_len -= SCRYPT_HASH_DIGEST_SIZE; + } +} + + +/* NeoScrypt */ + +#if defined(ASM) + +extern void neoscrypt_salsa(uint *X, uint rounds); +extern void neoscrypt_salsa_tangle(uint *X, uint count); +extern void neoscrypt_chacha(uint *X, uint rounds); + +extern void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len); +extern void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len); +extern void neoscrypt_blkxor(void *dstp, const void *srcp, uint len); + +#else + +/* Salsa20, rounds must be a multiple of 2 */ +static void neoscrypt_salsa(uint *X, uint rounds) +{ + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; + +#define quarter(a, b, c, d) \ + t = a + d; t = ROTL32(t, 7); b ^= t; \ + t = b + a; t = ROTL32(t, 9); c ^= t; \ + t = c + b; t = ROTL32(t, 13); d ^= t; \ + t = d + c; t = ROTL32(t, 18); a ^= t; + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x5, x9, x13, x1); + quarter(x10, x14, x2, x6); + quarter(x15, x3, x7, x11); + quarter( x0, x1, x2, x3); + quarter( x5, x6, x7, x4); + quarter(x10, x11, x8, x9); + quarter(x15, x12, x13, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + +/* ChaCha20, rounds must be a multiple of 2 */ +static void neoscrypt_chacha(uint *X, uint rounds) +{ + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, 
x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; + +#define quarter(a,b,c,d) \ + a += b; t = d ^ a; d = ROTL32(t, 16); \ + c += d; t = b ^ c; b = ROTL32(t, 12); \ + a += b; t = d ^ a; d = ROTL32(t, 8); \ + c += d; t = b ^ c; b = ROTL32(t, 7); + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x1, x5, x9, x13); + quarter( x2, x6, x10, x14); + quarter( x3, x7, x11, x15); + quarter( x0, x5, x10, x15); + quarter( x1, x6, x11, x12); + quarter( x2, x7, x8, x13); + quarter( x3, x4, x9, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + + +/* Fast 32-bit / 64-bit memcpy(); + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) +{ + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] = src[i]; + dst[i + 1] = src[i + 1]; + dst[i + 2] = src[i + 2]; + dst[i + 3] = src[i + 3]; + } +} + +/* Fast 32-bit / 64-bit block swapper; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) +{ + ulong *blkA = (ulong *) blkAp; + ulong *blkB = (ulong *) blkBp; + register ulong t0, t1, t2, t3; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + t0 = blkA[i]; + t1 = blkA[i + 1]; + t2 = blkA[i + 2]; + t3 = blkA[i + 3]; + blkA[i] = blkB[i]; + blkA[i + 1] = blkB[i + 1]; + blkA[i + 2] = blkB[i + 2]; + blkA[i + 3] = blkB[i + 3]; + blkB[i] = t0; + blkB[i + 1] = t1; + blkB[i + 2] = t2; + blkB[i + 3] = t3; + } +} + +/* Fast 32-bit / 64-bit block XOR engine; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) +{ + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for (i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] ^= src[i]; + dst[i + 1] ^= src[i + 1]; + dst[i + 2] ^= src[i + 2]; + dst[i + 3] ^= src[i + 3]; + } +} + +#endif + +/* 32-bit / 64-bit optimised memcpy() */ +static void neoscrypt_copy(void *dstp, const void *srcp, uint len) +{ + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = src[i]; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] = srcb[i]; + } +} + +/* 32-bit / 64-bit optimised memory erase aka memset() to zero */ +static void neoscrypt_erase(void *dstp, uint len) +{ + const ulong null = 0; + ulong *dst = (ulong *) dstp; + uint i, tail; + + for (i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = null; + + tail = len & (sizeof(ulong) - 1); + if (tail) { + uchar *dstb = (uchar *) dstp; + + for(i = len - tail; i < len; i++) + dstb[i] = (uchar)null; + } +} + +/* 32-bit / 64-bit optimised XOR engine */ +static void neoscrypt_xor(void *dstp, const void *srcp, uint len) +{ + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for (i = 0; i < (len / sizeof(ulong)); i++) + dst[i] ^= src[i]; + + tail = len & (sizeof(ulong) - 1); + if (tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < 
len; i++) + dstb[i] ^= srcb[i]; + } +} + + +/* BLAKE2s */ + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U + +/* Parameter block of 32 bytes */ +typedef struct blake2s_param_t { + uchar digest_length; + uchar key_length; + uchar fanout; + uchar depth; + uint leaf_length; + uchar node_offset[6]; + uchar node_depth; + uchar inner_length; + uchar salt[8]; + uchar personal[8]; +} blake2s_param; + +/* State block of 180 bytes */ +typedef struct blake2s_state_t { + uint h[8]; + uint t[2]; + uint f[2]; + uchar buf[2 * BLAKE2S_BLOCK_SIZE]; + uint buflen; +} blake2s_state; + +static const uint blake2s_IV[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint8_t blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , +}; + +static void blake2s_compress(blake2s_state *S, const uint *buf) +{ + uint i; + uint m[16]; + uint v[16]; + + neoscrypt_copy(m, buf, 64); + neoscrypt_copy(v, S, 32); + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = S->t[0] ^ blake2s_IV[4]; + v[13] = S->t[1] ^ blake2s_IV[5]; + v[14] = S->f[0] ^ blake2s_IV[6]; + v[15] = S->f[1] ^ blake2s_IV[7]; + +#define G(r,i,a,b,c,d) do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = ROTR32(d ^ a, 16); \ + c = c + d; \ + b = ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = ROTR32(d ^ a, 8); \ + c = c + d; \ + b = ROTR32(b ^ c, 7); \ +} while(0) + +#define ROUND(r) do { \ + G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[ 2], v[ 6], v[10], v[14]); \ + G(r, 3, v[ 3], v[ 7], v[11], v[15]); \ + G(r, 4, v[ 0], v[ 5], v[10], v[15]); \ + G(r, 5, v[ 1], v[ 6], v[11], v[12]); \ + G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ +} while(0) + + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + + for (i = 0; i < 8; i++) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + +#undef G +#undef ROUND +} + +static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) +{ + uint left, fill; + + while(input_size > 0) { + left = S->buflen; + fill = 2 * BLAKE2S_BLOCK_SIZE - left; + if(input_size > fill) { + /* Buffer fill */ + neoscrypt_copy(S->buf + left, input, fill); + S->buflen += fill; + /* Counter increment */ + S->t[0] += BLAKE2S_BLOCK_SIZE; + /* Compress */ + blake2s_compress(S, (uint *) S->buf); + /* Shift buffer left */ + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE); + S->buflen -= BLAKE2S_BLOCK_SIZE; + input += fill; + input_size -= fill; + } else { + neoscrypt_copy(S->buf + left, input, input_size); + S->buflen += input_size; + /* Do not compress */ + input += input_size; + input_size = 0; + } + } +} + +static void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const 
uchar key_size, + void *output, const uchar output_size) +{ + uchar block[BLAKE2S_BLOCK_SIZE]; + blake2s_param P[1]; + blake2s_state S[1]; + + /* Initialise */ + neoscrypt_erase(P, 32); + P->digest_length = output_size; + P->key_length = key_size; + P->fanout = 1; + P->depth = 1; + + neoscrypt_erase(S, 180); + neoscrypt_copy(S, blake2s_IV, 32); + neoscrypt_xor(S, P, 32); + + neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE); + neoscrypt_copy(block, key, key_size); + blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE); + + /* Update */ + blake2s_update(S, (uchar *) input, input_size); + + /* Finish */ + if(S->buflen > BLAKE2S_BLOCK_SIZE) { + S->t[0] += BLAKE2S_BLOCK_SIZE; + blake2s_compress(S, (uint *) S->buf); + S->buflen -= BLAKE2S_BLOCK_SIZE; + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen); + } + S->t[0] += S->buflen; + S->f[0] = ~0U; + neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen); + blake2s_compress(S, (uint *) S->buf); + + /* Write back */ + neoscrypt_copy(output, S, output_size); + + //for (int k = 0; k<4; k++) { printf("cpu blake %d %08x %08x\n", k, ((unsigned int*)output)[2 * k], ((unsigned int*)output)[2 * k + 1]); } +} + + +#define FASTKDF_BUFFER_SIZE 256U + +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +static void neoscrypt_fastkdf(const uchar *password, uint password_len, const uchar *salt, uint salt_len, + uint N, uchar *output, uint output_len) +{ + //for (int i = 0; i<10; i++) { printf("cpu password %d %08x %08x\n", i, ((unsigned int*)password)[2 * i], ((unsigned int*)password)[2 * i+1]); } + const uint stack_align = 0x40; + const uint kdf_buf_size = 256U; //FASTKDF_BUFFER_SIZE + const uint prf_input_size = 64U; //BLAKE2S_BLOCK_SIZE + const uint prf_key_size = 32U; //BLAKE2S_KEY_SIZE + const uint prf_output_size = 32U; //BLAKE2S_OUT_SIZE + uint bufptr, a, b, i, j; + uchar *A, *B, *prf_input, *prf_key, *prf_output; + uchar *stack; + stack = (uchar*)malloc(sizeof(uchar) * 2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align); + /* Align and set up the buffers in stack */ + //uchar stack[2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align]; + + A = &stack[stack_align & ~(stack_align - 1)]; + B = &A[kdf_buf_size + prf_input_size]; + prf_output = &A[2 * kdf_buf_size + prf_input_size + prf_key_size]; + + /* Initialise the password buffer */ + if(password_len > kdf_buf_size) + password_len = kdf_buf_size; + + a = kdf_buf_size / password_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&A[i * password_len], &password[0], password_len); + b = kdf_buf_size - a * password_len; + if(b) + neoscrypt_copy(&A[a * password_len], &password[0], b); + neoscrypt_copy(&A[kdf_buf_size], &password[0], prf_input_size); + + /* Initialise the salt buffer */ + if(salt_len > kdf_buf_size) + salt_len = kdf_buf_size; + + a = kdf_buf_size / salt_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&B[i * salt_len], &salt[0], salt_len); + b = kdf_buf_size - a * salt_len; + if(b) + neoscrypt_copy(&B[a * salt_len], &salt[0], b); + neoscrypt_copy(&B[kdf_buf_size], &salt[0], prf_key_size); + + /* The primary iteration */ + for(i = 0, bufptr = 0; i < N; i++) { + + /* Map the PRF input buffer */ + prf_input = &A[bufptr]; + + /* Map the PRF key buffer */ + prf_key = &B[bufptr]; + + /* PRF */ + // for (int k = 0; k<(prf_input_size/4); k++) { 
printf("cpu bufptr %08x before blake %d %d %08x \n",bufptr, i, k, ((unsigned int*)prf_input)[k]); } + neoscrypt_blake2s(prf_input, prf_input_size, prf_key, prf_key_size, prf_output, prf_output_size); + // for (int k = 0; k<(prf_output_size/4); k++) { printf("cpu after blake %d %d %08x \n", i, k, ((unsigned int*)prf_output)[k]); } + + /* Calculate the next buffer pointer */ + for(j = 0, bufptr = 0; j < prf_output_size; j++) + bufptr += prf_output[j]; + bufptr &= (kdf_buf_size - 1); + + /* Modify the salt buffer */ + neoscrypt_xor(&B[bufptr], &prf_output[0], prf_output_size); + + /* Head modified, tail updated */ + if(bufptr < prf_key_size) + neoscrypt_copy(&B[kdf_buf_size + bufptr], &B[bufptr], MIN(prf_output_size, prf_key_size - bufptr)); + + /* Tail modified, head updated */ + if((kdf_buf_size - bufptr) < prf_output_size) + neoscrypt_copy(&B[0], &B[kdf_buf_size], prf_output_size - (kdf_buf_size - bufptr)); + } + + /* Modify and copy into the output buffer */ + if(output_len > kdf_buf_size) + output_len = kdf_buf_size; + + a = kdf_buf_size - bufptr; + if(a >= output_len) { + neoscrypt_xor(&B[bufptr], &A[0], output_len); + neoscrypt_copy(&output[0], &B[bufptr], output_len); + } else { + neoscrypt_xor(&B[bufptr], &A[0], a); + neoscrypt_xor(&B[0], &A[a], output_len - a); + neoscrypt_copy(&output[0], &B[bufptr], a); + neoscrypt_copy(&output[a], &B[0], output_len - a); + } + // for (int i = 0; i<10; i++) { printf("cpu fastkdf %d %08x %08x\n", i, ((unsigned int*)output)[2 * i], ((unsigned int*)output)[2 * i + 1]); } +} + + +/* Configurable optimised block mixer */ +static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) +{ + uint i, mixer, rounds; + + mixer = mixmode >> 8; + rounds = mixmode & 0xFF; + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + + if (r == 1) { + neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + return; + } + + if (r == 2) { + neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + neoscrypt_blkxor(&X[32], &X[16], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[32], rounds); + else + neoscrypt_salsa(&X[32], rounds); + neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[48], rounds); + else + neoscrypt_salsa(&X[48], rounds); + neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE); + return; + } + + /* Reference code for any reasonable r */ + for (i = 0; i < 2 * r; i++) { + if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE); + else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16 * i], rounds); + else + neoscrypt_salsa(&X[16 * i], rounds); + neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE); + } + for (i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], SCRYPT_BLOCK_SIZE); + for (i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * 
(2 * i + 1)], SCRYPT_BLOCK_SIZE); +} + +/* NeoScrypt core engine: + * p = 1, salt = password; + * Basic customisation (required): + * profile bit 0: + * 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20; + * 1 = Scrypt(1024, 1, 1) with Salsa20/8; + * profile bits 4 to 1: + * 0000 = FastKDF-BLAKE2s; + * 0001 = PBKDF2-HMAC-SHA256; + * Extended customisation (optional): + * profile bit 31: + * 0 = extended customisation absent; + * 1 = extended customisation present; + * profile bits 7 to 5 (rfactor): + * 000 = r of 1; + * 001 = r of 2; + * 010 = r of 4; + * ... + * 111 = r of 128; + * profile bits 12 to 8 (Nfactor): + * 00000 = N of 2; + * 00001 = N of 4; + * 00010 = N of 8; + * ..... + * 00110 = N of 128; + * ..... + * 01001 = N of 1024; + * ..... + * 11110 = N of 2147483648; + * profile bits 30 to 13 are reserved */ +void neoscrypt(unsigned char *output, const unsigned char *input, unsigned int profile) +{ + uint N = 128, r = 2, dblmix = 1, mixmode = 0x14, stack_align = 0x40; + uint kdf, i, j; + uint *X, *Y, *Z, *V; + + if(profile & 0x1) { + N = 1024; /* N = (1 << (Nfactor + 1)); */ + r = 1; /* r = (1 << rfactor); */ + dblmix = 0; /* Salsa only */ + mixmode = 0x08; /* 8 rounds */ + } + + if(profile >> 31) { + N = (1 << (((profile >> 8) & 0x1F) + 1)); + r = (1 << ((profile >> 5) & 0x7)); + } + uchar *stack; + stack = (uchar*)malloc(((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align)*sizeof(uchar)); + /* X = r * 2 * SCRYPT_BLOCK_SIZE */ + X = (uint *) &stack[stack_align & ~(stack_align - 1)]; + /* Z is a copy of X for ChaCha */ + Z = &X[32 * r]; + /* Y is an X sized temporal space */ + Y = &X[64 * r]; + /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ + V = &X[96 * r]; + + /* X = KDF(password, salt) */ + kdf = (profile >> 1) & 0xF; + + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(input, 80, input, 80, 32, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; + + case(0x1): + neoscrypt_pbkdf2_sha256(input, 80, input, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; + } + + /* Process ChaCha 1st, Salsa 2nd and XOR them into FastKDF; otherwise Salsa only */ + + if(dblmix) { + /* blkcpy(Z, X) */ + neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* Z = SMix(Z) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, Z) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + for(i = 0; i < N; i++) { + /* integerify(Z) mod N */ + j = (32 * r) * (Z[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(Z, V) */ + neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + } + +#if (ASM) + /* Must be called before and after SSE2 Salsa */ + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + /* X = SMix(X) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, X) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + for(i = 0; i < N; i++) { + /* integerify(X) mod N */ + j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(X, V) */ + neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + +#if (ASM) + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + if(dblmix) + /* blkxor(X, Z) */ + neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* output = KDF(password, X) */ + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(input, 80, 
(uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32); + break; + + case(0x1): + neoscrypt_pbkdf2_sha256(input, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); + break; + } +} + diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp new file mode 100644 index 0000000000..ba8c63a6a2 --- /dev/null +++ b/neoscrypt/neoscrypt.cpp @@ -0,0 +1,121 @@ +#include +#include +#include + +#include "neoscrypt.h" + +extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget); + +extern void neoscrypt_init(int thr_id, uint32_t threads); +extern void neoscrypt_free(int thr_id); +extern void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum); + +static bool init[MAX_GPUS] = { 0 }; + +int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + int dev_id = device_map[thr_id]; + int intensity = is_windows() ? 18 : 19; + if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB + if (strstr(device_name[dev_id], "TITAN")) intensity = 21; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + throughput = throughput / 32; /* set for max intensity ~= 20 */ + api_set_throughput(thr_id, throughput); + + if (opt_benchmark) + ptarget[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaDeviceSynchronize(); + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaGetLastError(); // reset errors if device is not "reset" + } + if (device_sm[dev_id] <= 300) { + gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices"); + proper_exit(EXIT_CODE_CUDA_ERROR); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g (+5), %u cuda threads", throughput2intensity(throughput), throughput); + + neoscrypt_init(thr_id, throughput); + + init[thr_id] = true; + } + + if (have_stratum) { + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + } else { + for (int k = 0; k < 20; k++) + endiandata[k] = pdata[k]; + } + + neoscrypt_setBlockTarget(endiandata,ptarget); + + do { + memset(work->nonces, 0xff, sizeof(work->nonces)); + neoscrypt_hash_k4(thr_id, throughput, pdata[19], work->nonces, have_stratum); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + if (have_stratum) { + be32enc(&endiandata[19], work->nonces[0]); + } else { + endiandata[19] = work->nonces[0]; + } + neoscrypt((uchar*)vhash, (uchar*) endiandata, 0x80000620U); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + pdata[19] = work->nonces[0] + 1; // cursor + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", work->nonces[0]); + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +void free_neoscrypt(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + 
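+	// cudaThreadSynchronize() is the legacy alias of cudaDeviceSynchronize();
+	// it only drains in-flight work before this thread's GPU buffers are freed: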
neoscrypt_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/neoscrypt/neoscrypt.h b/neoscrypt/neoscrypt.h new file mode 100644 index 0000000000..fc4d7e7c8e --- /dev/null +++ b/neoscrypt/neoscrypt.h @@ -0,0 +1,33 @@ +#if (__cplusplus) +extern "C" { +#endif + +void neoscrypt(unsigned char *output, const unsigned char *input, unsigned int profile); + +#if (__cplusplus) +} +#else + +#define SCRYPT_BLOCK_SIZE 64 +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) +#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]))) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + +#endif + diff --git a/nvapi.cpp b/nvapi.cpp new file mode 100644 index 0000000000..03880b1f8f --- /dev/null +++ b/nvapi.cpp @@ -0,0 +1,460 @@ +/** + * Wrapper to nvapi.dll to query informations missing for x86 binaries (there is no nvml x86) + * based on the work of https://github.com/ircubic/lib_gpu + * + * tpruvot@ccminer.org 06-2016 + */ + +#ifdef _WIN32 + +#include +#include +#include + +#include "compat/nvapi/nvapi_ccminer.h" + +class NvAPILibraryHandle +{ + typedef void *(*QueryPtr)(uint32_t); + +private: + HMODULE library; + QueryPtr nvidia_query; + +public: + NvAPILibraryHandle() + { + bool success = false; +#ifdef _WIN64 + library = LoadLibrary("nvapi64.dll"); +#else + library = LoadLibrary("nvapi.dll"); +#endif + if (library != NULL) { + nvidia_query = reinterpret_cast(GetProcAddress(library, "nvapi_QueryInterface")); + if (nvidia_query != NULL) { + const uint32_t NVAPI_ID_INIT = 0x0150E828; + auto init = static_cast(nvidia_query(NVAPI_ID_INIT)); + NvAPI_Status ret = init(); + success = (ret == NVAPI_OK); + } + } + + if (!success) { + throw std::runtime_error("Unable to locate NVAPI library!"); + } + } + + ~NvAPILibraryHandle() + { + NvAPI_DLL_Unload(); + FreeLibrary(library); + } + + void *query(uint32_t ID) + { + return nvidia_query(ID); + } + +}; + +static std::unique_ptr nvidia_handle; +bool nvapi_dll_loaded = false; + +NvAPI_Status nvapi_dll_init() +{ + try { + if (!nvapi_dll_loaded) { + nvidia_handle = std::make_unique(); + nvapi_dll_loaded = true; + } + } + catch (std::runtime_error) { + nvapi_dll_loaded = false; + return NVAPI_ERROR; + } + + return NVAPI_OK; +} + +// Hidden nvapi.dll functions + +#define NVAPI_ID_IFVERSION 0x01053FA5 +NvAPI_Status NvAPI_DLL_GetInterfaceVersionString(NvAPI_ShortString string) { + static NvAPI_Status (*pointer)(NvAPI_ShortString string) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvAPI_ShortString))nvidia_handle->query(NVAPI_ID_IFVERSION); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(string); +} + +#define NVAPI_ID_PERF_INFO 0x409D9841 +NvAPI_Status NvAPI_DLL_PerfPoliciesGetInfo(NvPhysicalGpuHandle handle, NVAPI_GPU_PERF_INFO* pInfo) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_GPU_PERF_INFO*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, 
NVAPI_GPU_PERF_INFO*))nvidia_handle->query(NVAPI_ID_PERF_INFO); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pInfo); +} + +#define NVAPI_ID_PERF_STATS 0x3D358A0C +NvAPI_Status NvAPI_DLL_PerfPoliciesGetStatus(NvPhysicalGpuHandle handle, NVAPI_GPU_PERF_STATUS* pStatus) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_GPU_PERF_STATUS*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_GPU_PERF_STATUS*))nvidia_handle->query(NVAPI_ID_PERF_STATS); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pStatus); +} + +#define NVAPI_ID_POWER_INFO 0x34206D86 +NvAPI_Status NvAPI_DLL_ClientPowerPoliciesGetInfo(NvPhysicalGpuHandle handle, NVAPI_GPU_POWER_INFO* pInfo) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_GPU_POWER_INFO*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_GPU_POWER_INFO*))nvidia_handle->query(NVAPI_ID_POWER_INFO); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pInfo); +} + +#define NVAPI_ID_POWERPOL_GET 0x70916171 +NvAPI_Status NvAPI_DLL_ClientPowerPoliciesGetStatus(NvPhysicalGpuHandle handle, NVAPI_GPU_POWER_STATUS* pPolicies) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_GPU_POWER_STATUS*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_GPU_POWER_STATUS*))nvidia_handle->query(NVAPI_ID_POWERPOL_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pPolicies); +} + +#define NVAPI_ID_POWERPOL_SET 0xAD95F5ED +NvAPI_Status NvAPI_DLL_ClientPowerPoliciesSetStatus(NvPhysicalGpuHandle handle, NVAPI_GPU_POWER_STATUS* pPolicies) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_GPU_POWER_STATUS*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_GPU_POWER_STATUS*))nvidia_handle->query(NVAPI_ID_POWERPOL_SET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pPolicies); +} + +#define NVAPI_ID_POWERTOPO_GET 0xEDCF624E +NvAPI_Status NvAPI_DLL_ClientPowerTopologyGetStatus(NvPhysicalGpuHandle handle, NVAPI_GPU_POWER_TOPO* topo) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_GPU_POWER_TOPO*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_GPU_POWER_TOPO*))nvidia_handle->query(NVAPI_ID_POWERTOPO_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, topo); +} + +#define NVAPI_ID_THERMAL_INFO 0x0D258BB5 +NvAPI_Status NvAPI_DLL_ClientThermalPoliciesGetInfo(NvPhysicalGpuHandle handle, NVAPI_GPU_THERMAL_INFO* pInfo) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_GPU_THERMAL_INFO*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_GPU_THERMAL_INFO*))nvidia_handle->query(NVAPI_ID_THERMAL_INFO); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pInfo); +} + +#define NVAPI_ID_TLIMIT_GET 0xE9C425A1 +NvAPI_Status NvAPI_DLL_ClientThermalPoliciesGetLimit(NvPhysicalGpuHandle handle, NVAPI_GPU_THERMAL_LIMIT* pLimit) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, 
NVAPI_GPU_THERMAL_LIMIT*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_GPU_THERMAL_LIMIT*))nvidia_handle->query(NVAPI_ID_TLIMIT_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pLimit); +} + +#define NVAPI_ID_TLIMIT_SET 0x34C0B13D +NvAPI_Status NvAPI_DLL_ClientThermalPoliciesSetLimit(NvPhysicalGpuHandle handle, NVAPI_GPU_THERMAL_LIMIT* pLimit) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_GPU_THERMAL_LIMIT*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_GPU_THERMAL_LIMIT*))nvidia_handle->query(NVAPI_ID_TLIMIT_SET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pLimit); +} + +#define NVAPI_ID_SERIALNUM_GET 0x14B83A5F +NvAPI_Status NvAPI_DLL_GetSerialNumber(NvPhysicalGpuHandle handle, NvAPI_ShortString serial) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NvAPI_ShortString) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NvAPI_ShortString))nvidia_handle->query(NVAPI_ID_SERIALNUM_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, serial); +} + +#define NVAPI_ID_VOLTAGE_GET 0x465F9BCF +NvAPI_Status NvAPI_DLL_GetCurrentVoltage(NvPhysicalGpuHandle handle, NVAPI_VOLTAGE_STATUS* status) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_VOLTAGE_STATUS*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_VOLTAGE_STATUS*))nvidia_handle->query(NVAPI_ID_VOLTAGE_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, status); +} + +#define NVAPI_ID_VOLT_STATUS_GET 0xC16C7E2C // Maxwell +NvAPI_Status NvAPI_DLL_GetVoltageDomainsStatus(NvPhysicalGpuHandle handle, NVAPI_VOLT_STATUS* data) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_VOLT_STATUS*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_VOLT_STATUS*))nvidia_handle->query(NVAPI_ID_VOLT_STATUS_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, data); +} + +#define NVAPI_ID_VOLTAGE 0x28766157 // Maxwell 1-008c Real func name is unknown +NvAPI_Status NvAPI_DLL_GetVoltageStep(NvPhysicalGpuHandle handle, NVAPI_VOLT_STATUS* data) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_VOLT_STATUS*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_VOLT_STATUS*))nvidia_handle->query(NVAPI_ID_VOLTAGE); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, data); +} + +#define NVAPI_ID_CLK_RANGE_GET 0x64B43A6A // Pascal +NvAPI_Status NvAPI_DLL_GetClockBoostRanges(NvPhysicalGpuHandle handle, NVAPI_CLOCKS_RANGE* range) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_CLOCKS_RANGE*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_CLOCKS_RANGE*))nvidia_handle->query(NVAPI_ID_CLK_RANGE_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, range); +} + +#define NVAPI_ID_CLK_BOOST_MASK 0x507B4B59 // Pascal +NvAPI_Status 
NvAPI_DLL_GetClockBoostMask(NvPhysicalGpuHandle handle, NVAPI_CLOCK_MASKS* range) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_CLOCK_MASKS*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_CLOCK_MASKS*))nvidia_handle->query(NVAPI_ID_CLK_BOOST_MASK); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, range); +} + +#define NVAPI_ID_CLK_BOOST_TABLE_GET 0x23F1B133 // Pascal +NvAPI_Status NvAPI_DLL_GetClockBoostTable(NvPhysicalGpuHandle handle, NVAPI_CLOCK_TABLE* table) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_CLOCK_TABLE*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_CLOCK_TABLE*))nvidia_handle->query(NVAPI_ID_CLK_BOOST_TABLE_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, table); +} + +#define NVAPI_ID_CLK_BOOST_TABLE_SET 0x0733E009 // Pascal +NvAPI_Status NvAPI_DLL_SetClockBoostTable(NvPhysicalGpuHandle handle, NVAPI_CLOCK_TABLE* table) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_CLOCK_TABLE*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_CLOCK_TABLE*))nvidia_handle->query(NVAPI_ID_CLK_BOOST_TABLE_SET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, table); +} + +#define NVAPI_ID_VFP_CURVE_GET 0x21537AD4 // Pascal +NvAPI_Status NvAPI_DLL_GetVFPCurve(NvPhysicalGpuHandle handle, NVAPI_VFP_CURVE* curve) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_VFP_CURVE*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_VFP_CURVE*))nvidia_handle->query(NVAPI_ID_VFP_CURVE_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, curve); +} + +#define NVAPI_ID_CURVE_GET 0xE440B867 // Pascal 2-030c struct 0C 03 02 00 00 00 00 00 01 00 00 00 06 00 00 00 +#define NVAPI_ID_CURVE_SET 0x39442CFB // Pascal 2-030c struct 0C 03 02 00 00 00 00 00 01 00 00 00 06 00 00 00 + +#define NVAPI_ID_VOLTBOOST_GET 0x9DF23CA1 // Pascal 1-0028 +NvAPI_Status NvAPI_DLL_GetCoreVoltageBoostPercent(NvPhysicalGpuHandle handle, NVAPI_VOLTBOOST_PERCENT* boost) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_VOLTBOOST_PERCENT*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_VOLTBOOST_PERCENT*))nvidia_handle->query(NVAPI_ID_VOLTBOOST_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, boost); +} +#define NVAPI_ID_VOLTBOOST_SET 0xB9306D9B // Pascal 1-0028 +NvAPI_Status NvAPI_DLL_SetCoreVoltageBoostPercent(NvPhysicalGpuHandle handle, NVAPI_VOLTBOOST_PERCENT* boost) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_VOLTBOOST_PERCENT*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_VOLTBOOST_PERCENT*))nvidia_handle->query(NVAPI_ID_VOLTBOOST_SET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, boost); +} + +#define NVAPI_ID_PERFCLOCKS_GET 0x1EA54A3B +NvAPI_Status NvAPI_DLL_GetPerfClocks(NvPhysicalGpuHandle handle, uint32_t num, NVAPI_GPU_PERF_CLOCKS* pClocks) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, 
uint32_t, NVAPI_GPU_PERF_CLOCKS*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, uint32_t, NVAPI_GPU_PERF_CLOCKS*))nvidia_handle->query(NVAPI_ID_PERFCLOCKS_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, num, pClocks); +} + +#define NVAPI_ID_PERFCLOCKS_SET 0x07BCF4AC // error +NvAPI_Status NvAPI_DLL_SetPerfClocks(NvPhysicalGpuHandle handle, uint32_t num, NVAPI_GPU_PERF_CLOCKS* pClocks) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, uint32_t, NVAPI_GPU_PERF_CLOCKS*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, uint32_t, NVAPI_GPU_PERF_CLOCKS*))nvidia_handle->query(NVAPI_ID_PERFCLOCKS_SET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, num, pClocks); +} + +#define NVAPI_ID_PSTATELIMITS_GET 0x88C82104 // wrong prototype or missing struct data ? +NvAPI_Status NvAPI_DLL_GetPstateClientLimits(NvPhysicalGpuHandle handle, NV_GPU_PERF_PSTATE_ID pst, uint32_t* pLimits) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NV_GPU_PERF_PSTATE_ID, uint32_t*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NV_GPU_PERF_PSTATE_ID, uint32_t*))nvidia_handle->query(NVAPI_ID_PSTATELIMITS_GET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pst, pLimits); +} + +#define NVAPI_ID_PSTATELIMITS_SET 0xFDFC7D49 // wrong prototype or missing struct data ? +NvAPI_Status NvAPI_DLL_SetPstateClientLimits(NvPhysicalGpuHandle handle, NV_GPU_PERF_PSTATE_ID pst, uint32_t* pLimits) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NV_GPU_PERF_PSTATE_ID, uint32_t*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NV_GPU_PERF_PSTATE_ID, uint32_t*))nvidia_handle->query(NVAPI_ID_PSTATELIMITS_SET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pst, pLimits); +} + +#define NVAPI_ID_PSTATE20_SET 0x0F4DAE6B +// allow to set gpu/mem core freq delta +NvAPI_Status NvAPI_DLL_SetPstates20v1(NvPhysicalGpuHandle handle, NV_GPU_PERF_PSTATES20_INFO_V1 *pSet) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NV_GPU_PERF_PSTATES20_INFO_V1*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NV_GPU_PERF_PSTATES20_INFO_V1*))nvidia_handle->query(NVAPI_ID_PSTATE20_SET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pSet); +} + +// allow to set gpu core voltage delta +NvAPI_Status NvAPI_DLL_SetPstates20v2(NvPhysicalGpuHandle handle, NV_GPU_PERF_PSTATES20_INFO_V2 *pSet) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NV_GPU_PERF_PSTATES20_INFO_V2*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NV_GPU_PERF_PSTATES20_INFO_V2*))nvidia_handle->query(NVAPI_ID_PSTATE20_SET); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pSet); +} + +// maxwell voltage table +#define NVAPI_ID_VOLTAGES 0x7D656244 // 1-40cc +NvAPI_Status NvAPI_DLL_GetVoltages(NvPhysicalGpuHandle handle, NVAPI_VOLTAGES_TABLE *pInfo) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_VOLTAGES_TABLE*) = NULL; + if(!nvapi_dll_loaded) return 
NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_VOLTAGES_TABLE*))nvidia_handle->query(NVAPI_ID_VOLTAGES); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pInfo); +} + +#define NVAPI_ID_COOLERSETTINGS 0xDA141340 // 4-0558 +NvAPI_Status NvAPI_DLL_GetCoolerSettings(NvPhysicalGpuHandle handle, uint32_t id, NVAPI_COOLER_SETTINGS* pSettings) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, uint32_t, NVAPI_COOLER_SETTINGS*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, uint32_t, NVAPI_COOLER_SETTINGS*))nvidia_handle->query(NVAPI_ID_COOLERSETTINGS); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, id, pSettings); +} + +#define NVAPI_ID_COOLER_SETLEVELS 0x891FA0AE // 1-00A4 +NvAPI_Status NvAPI_DLL_SetCoolerLevels(NvPhysicalGpuHandle handle, uint32_t id, NVAPI_COOLER_LEVEL* pLevel) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, uint32_t, NVAPI_COOLER_LEVEL*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, uint32_t, NVAPI_COOLER_LEVEL*))nvidia_handle->query(NVAPI_ID_COOLER_SETLEVELS); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, id, pLevel); +} + +#define NVAPI_ID_COOLER_RESTORE 0x8F6ED0FB +NvAPI_Status NvAPI_DLL_RestoreCoolerSettings(NvPhysicalGpuHandle handle, NVAPI_COOLER_SETTINGS* pSettings, uint32_t id) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NVAPI_COOLER_SETTINGS*, uint32_t) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NVAPI_COOLER_SETTINGS*, uint32_t))nvidia_handle->query(NVAPI_ID_COOLER_RESTORE); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, pSettings, id); +} + +#define NVAPI_ID_I2CREADEX 0x4D7B0709 // 3-002c +NvAPI_Status NvAPI_DLL_I2CReadEx(NvPhysicalGpuHandle handle, NV_I2C_INFO_EX *i2c, NvU32 *exData) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NV_I2C_INFO_EX*, NvU32*) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NV_I2C_INFO_EX*, NvU32*))nvidia_handle->query(NVAPI_ID_I2CREADEX); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, i2c, exData); +} + +#define NVAPI_ID_I2CWRITEEX 0x283AC65A +NvAPI_Status NvAPI_DLL_I2CWriteEx(NvPhysicalGpuHandle handle, NV_I2C_INFO_EX *i2c, NvU32 *exData) { + static NvAPI_Status (*pointer)(NvPhysicalGpuHandle, NV_I2C_INFO_EX*, NvU32 *exData) = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)(NvPhysicalGpuHandle, NV_I2C_INFO_EX*, NvU32 *exData))nvidia_handle->query(NVAPI_ID_I2CWRITEEX); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(handle, i2c, exData); +} + +#define NVAPI_ID_UNLOAD 0xD22BDD7E +NvAPI_Status NvAPI_DLL_Unload() { + static NvAPI_Status (*pointer)() = NULL; + if(!nvapi_dll_loaded) return NVAPI_API_NOT_INITIALIZED; + if(!pointer) { + pointer = (NvAPI_Status (*)())nvidia_handle->query(NVAPI_ID_UNLOAD); + } + if(!pointer) return NVAPI_NO_IMPLEMENTATION; + return (*pointer)(); +} + +#endif \ No newline at end of file diff --git a/nvml.cpp b/nvml.cpp new file mode 100644 index 0000000000..6a77986a31 --- /dev/null +++ b/nvml.cpp @@ -0,0 +1,2276 @@ +/* + * A trivial little 
dlopen()-based wrapper library for the + * NVIDIA NVML library, to allow runtime discovery of NVML on an + * arbitrary system. This is all very hackish and simple-minded, but + * it serves my immediate needs in the short term until NVIDIA provides + * a static NVML wrapper library themselves, hopefully in + * CUDA 6.5 or maybe sometime shortly after. + * + * This trivial code is made available under the "new" 3-clause BSD license, + * and/or any of the GPL licenses you prefer. + * Feel free to use the code and modify as you see fit. + * + * John E. Stone - john.stone@gmail.com + * Tanguy Pruvot - tpruvot@github + * + */ + +#include +#include +#include +#include +#include + +#include "miner.h" +#include "nvml.h" +#include "cuda_runtime.h" + +#ifdef USE_WRAPNVML + +extern nvml_handle *hnvml; +extern char driver_version[32]; + +static uint32_t device_bus_ids[MAX_GPUS] = { 0 }; + +extern uint32_t device_gpu_clocks[MAX_GPUS]; +extern uint32_t device_mem_clocks[MAX_GPUS]; +extern int32_t device_mem_offsets[MAX_GPUS]; +extern uint8_t device_tlimit[MAX_GPUS]; +extern int8_t device_pstate[MAX_GPUS]; +extern int32_t device_led[MAX_GPUS]; +int32_t device_led_state[MAX_GPUS] = { 0 }; +static __thread bool has_rgb_ok = false; + +uint32_t clock_prev[MAX_GPUS] = { 0 }; +uint32_t clock_prev_mem[MAX_GPUS] = { 0 }; +uint32_t limit_prev[MAX_GPUS] = { 0 }; + +static bool nvml_plimit_set = false; +extern bool need_memclockrst; + +/* + * Wrappers to emulate dlopen() on other systems like Windows + */ +#if defined(_MSC_VER) || defined(_WIN32) || defined(_WIN64) + #include + static void *wrap_dlopen(const char *filename) { + HMODULE h = LoadLibrary(filename); + if (!h && opt_debug) { + applog(LOG_DEBUG, "dlopen(%d): failed to load %s", + GetLastError(), filename); + } + return (void*)h; + } + static void *wrap_dlsym(void *h, const char *sym) { + return (void *)GetProcAddress((HINSTANCE)h, sym); + } + static int wrap_dlclose(void *h) { + /* FreeLibrary returns nonzero on success */ + return (!FreeLibrary((HINSTANCE)h)); + } +#else + /* assume we can use dlopen itself... 
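+	    (wrap_dlopen()/wrap_dlsym()/wrap_dlclose() map 1:1 onto
+	     dlopen(RTLD_NOW)/dlsym()/dlclose() here, and onto
+	     LoadLibrary()/GetProcAddress()/FreeLibrary() in the
+	     Windows branch above)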
*/ + #include + #include + static void *wrap_dlopen(const char *filename) { + void *h = dlopen(filename, RTLD_NOW); + if (h == NULL && opt_debug) { + applog(LOG_DEBUG, "dlopen(%d): failed to load %s", + errno, filename); + } + return (void*)h; + } + + static void *wrap_dlsym(void *h, const char *sym) { + return dlsym(h, sym); + } + static int wrap_dlclose(void *h) { + return dlclose(h); + } +#endif + +nvml_handle * nvml_create() +{ + int i=0; + nvml_handle *nvmlh = NULL; + +#ifdef WIN32 + /* Windows (do not use slashes, else ExpandEnvironmentStrings will mix them) */ +#define libnvidia_ml "%PROGRAMFILES%\\NVIDIA Corporation\\NVSMI\\nvml.dll" +#else + /* linux assumed */ +#define libnvidia_ml "libnvidia-ml.so" +#endif + + char tmp[512]; +#ifdef WIN32 + ExpandEnvironmentStrings(libnvidia_ml, tmp, sizeof(tmp)); +#else + strcpy(tmp, libnvidia_ml); +#endif + + void *nvml_dll = wrap_dlopen(tmp); + if (nvml_dll == NULL) { +#ifdef WIN32 + nvml_dll = wrap_dlopen("nvml.dll"); + if (nvml_dll == NULL) +#endif + return NULL; + } + + nvmlh = (nvml_handle *) calloc(1, sizeof(nvml_handle)); + + nvmlh->nvml_dll = nvml_dll; + + nvmlh->nvmlInit = (nvmlReturn_t (*)(void)) wrap_dlsym(nvmlh->nvml_dll, "nvmlInit_v2"); + if (!nvmlh->nvmlInit) + nvmlh->nvmlInit = (nvmlReturn_t (*)(void)) wrap_dlsym(nvmlh->nvml_dll, "nvmlInit"); + nvmlh->nvmlDeviceGetCount = (nvmlReturn_t (*)(int *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCount_v2"); + if (!nvmlh->nvmlDeviceGetCount) + nvmlh->nvmlDeviceGetCount = (nvmlReturn_t (*)(int *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCount"); + nvmlh->nvmlDeviceGetHandleByIndex = (nvmlReturn_t (*)(int, nvmlDevice_t *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetHandleByIndex_v2"); + nvmlh->nvmlDeviceGetAPIRestriction = (nvmlReturn_t (*)(nvmlDevice_t, nvmlRestrictedAPI_t, nvmlEnableState_t *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetAPIRestriction"); + nvmlh->nvmlDeviceSetAPIRestriction = (nvmlReturn_t (*)(nvmlDevice_t, nvmlRestrictedAPI_t, nvmlEnableState_t)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceSetAPIRestriction"); + nvmlh->nvmlDeviceGetDefaultApplicationsClock = (nvmlReturn_t (*)(nvmlDevice_t, nvmlClockType_t, unsigned int *clock)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetDefaultApplicationsClock"); + nvmlh->nvmlDeviceGetApplicationsClock = (nvmlReturn_t (*)(nvmlDevice_t, nvmlClockType_t, unsigned int *clocks)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetApplicationsClock"); + nvmlh->nvmlDeviceSetApplicationsClocks = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int mem, unsigned int gpu)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceSetApplicationsClocks"); + nvmlh->nvmlDeviceResetApplicationsClocks = (nvmlReturn_t (*)(nvmlDevice_t)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceResetApplicationsClocks"); + nvmlh->nvmlDeviceGetSupportedGraphicsClocks = (nvmlReturn_t (*)(nvmlDevice_t, uint32_t mem, uint32_t *num, uint32_t *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetSupportedGraphicsClocks"); + nvmlh->nvmlDeviceGetSupportedMemoryClocks = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *count, unsigned int *clocksMHz)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetSupportedMemoryClocks"); + nvmlh->nvmlDeviceGetClockInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlClockType_t, unsigned int *clock)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetClockInfo"); + nvmlh->nvmlDeviceGetMaxClockInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlClockType_t, unsigned int *clock)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetMaxClockInfo"); + nvmlh->nvmlDeviceGetPciInfo = (nvmlReturn_t (*)(nvmlDevice_t, 
nvmlPciInfo_t *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPciInfo_v2"); + if (!nvmlh->nvmlDeviceGetPciInfo) + nvmlh->nvmlDeviceGetPciInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlPciInfo_t *)) wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPciInfo"); + nvmlh->nvmlDeviceGetCurrPcieLinkGeneration = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *gen)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCurrPcieLinkGeneration"); + nvmlh->nvmlDeviceGetCurrPcieLinkWidth = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *width)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCurrPcieLinkWidth"); + nvmlh->nvmlDeviceGetMaxPcieLinkGeneration = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *gen)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetMaxPcieLinkGeneration"); + nvmlh->nvmlDeviceGetMaxPcieLinkWidth = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *width)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetMaxPcieLinkWidth"); + nvmlh->nvmlDeviceGetPowerUsage = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerUsage"); + nvmlh->nvmlDeviceGetPowerManagementDefaultLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerManagementDefaultLimit"); + nvmlh->nvmlDeviceGetPowerManagementLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerManagementLimit"); + nvmlh->nvmlDeviceGetPowerManagementLimitConstraints = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *min, unsigned int *max)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerManagementLimitConstraints"); + nvmlh->nvmlDeviceSetPowerManagementLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceSetPowerManagementLimit"); + nvmlh->nvmlDeviceGetName = (nvmlReturn_t (*)(nvmlDevice_t, char *, int)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetName"); + nvmlh->nvmlDeviceGetTemperature = (nvmlReturn_t (*)(nvmlDevice_t, int, unsigned int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetTemperature"); + nvmlh->nvmlDeviceGetFanSpeed = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetFanSpeed"); + nvmlh->nvmlDeviceGetPerformanceState = (nvmlReturn_t (*)(nvmlDevice_t, int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPerformanceState"); /* or nvmlDeviceGetPowerState */ + nvmlh->nvmlDeviceGetSerial = (nvmlReturn_t (*)(nvmlDevice_t, char *, unsigned int)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetSerial"); + nvmlh->nvmlDeviceGetUUID = (nvmlReturn_t (*)(nvmlDevice_t, char *, unsigned int)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetUUID"); + nvmlh->nvmlDeviceGetVbiosVersion = (nvmlReturn_t (*)(nvmlDevice_t, char *, unsigned int)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetVbiosVersion"); + nvmlh->nvmlSystemGetDriverVersion = (nvmlReturn_t (*)(char *, unsigned int)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlSystemGetDriverVersion"); + nvmlh->nvmlErrorString = (char* (*)(nvmlReturn_t)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlErrorString"); + nvmlh->nvmlShutdown = (nvmlReturn_t (*)()) + wrap_dlsym(nvmlh->nvml_dll, "nvmlShutdown"); + // v331 + nvmlh->nvmlDeviceGetEnforcedPowerLimit = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int *limit)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetEnforcedPowerLimit"); + // v340 +#ifdef __linux__ + nvmlh->nvmlDeviceClearCpuAffinity = (nvmlReturn_t (*)(nvmlDevice_t)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceClearCpuAffinity"); + nvmlh->nvmlDeviceGetCpuAffinity = (nvmlReturn_t (*)(nvmlDevice_t, unsigned int sz, unsigned long 
*cpuSet)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCpuAffinity"); + nvmlh->nvmlDeviceSetCpuAffinity = (nvmlReturn_t (*)(nvmlDevice_t)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceSetCpuAffinity"); +#endif + // v346 + nvmlh->nvmlDeviceGetPcieThroughput = (nvmlReturn_t (*)(nvmlDevice_t, nvmlPcieUtilCounter_t, unsigned int *value)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPcieThroughput"); + // v36x (API 8 / Pascal) + nvmlh->nvmlDeviceGetClock = (nvmlReturn_t (*)(nvmlDevice_t, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetClock"); + + if (nvmlh->nvmlInit == NULL || + nvmlh->nvmlShutdown == NULL || + nvmlh->nvmlErrorString == NULL || + nvmlh->nvmlDeviceGetCount == NULL || + nvmlh->nvmlDeviceGetHandleByIndex == NULL || + nvmlh->nvmlDeviceGetPciInfo == NULL || + nvmlh->nvmlDeviceGetName == NULL) + { + if (opt_debug) + applog(LOG_DEBUG, "Failed to obtain required NVML function pointers"); + wrap_dlclose(nvmlh->nvml_dll); + free(nvmlh); + return NULL; + } + + nvmlh->nvmlInit(); + if (nvmlh->nvmlSystemGetDriverVersion) + nvmlh->nvmlSystemGetDriverVersion(driver_version, sizeof(driver_version)); + nvmlh->nvmlDeviceGetCount(&nvmlh->nvml_gpucount); + + /* Query CUDA device count, in case it doesn't agree with NVML, since */ + /* CUDA will only report GPUs with compute capability greater than 1.0 */ + if (cudaGetDeviceCount(&nvmlh->cuda_gpucount) != cudaSuccess) { + if (opt_debug) + applog(LOG_DEBUG, "Failed to query CUDA device count!"); + wrap_dlclose(nvmlh->nvml_dll); + free(nvmlh); + return NULL; + } + + nvmlh->devs = (nvmlDevice_t *) calloc(nvmlh->nvml_gpucount, sizeof(nvmlDevice_t)); + nvmlh->nvml_pci_domain_id = (unsigned int*) calloc(nvmlh->nvml_gpucount, sizeof(unsigned int)); + nvmlh->nvml_pci_bus_id = (unsigned int*) calloc(nvmlh->nvml_gpucount, sizeof(unsigned int)); + nvmlh->nvml_pci_device_id = (unsigned int*) calloc(nvmlh->nvml_gpucount, sizeof(unsigned int)); + nvmlh->nvml_pci_vendor_id = (unsigned int*) calloc(nvmlh->nvml_gpucount, sizeof(unsigned int)); + nvmlh->nvml_pci_subsys_id = (unsigned int*) calloc(nvmlh->nvml_gpucount, sizeof(unsigned int)); + nvmlh->nvml_cuda_device_id = (int*) calloc(nvmlh->nvml_gpucount, sizeof(int)); + nvmlh->cuda_nvml_device_id = (int*) calloc(nvmlh->cuda_gpucount, sizeof(int)); + nvmlh->app_clocks = (nvmlEnableState_t*) calloc(nvmlh->nvml_gpucount, sizeof(nvmlEnableState_t)); + + /* Obtain GPU device handles we're going to need repeatedly... 
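+	    (one nvmlDeviceGetHandleByIndex() call per NVML device,
+	     i.e. for i from 0 to nvml_gpucount - 1; the PCI domain/bus/device
+	     triple of each handle is then matched against the CUDA device
+	     properties below to build the NVML-to-CUDA and CUDA-to-NVML
+	     id mapping tables)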
*/ + for (i=0; invml_gpucount; i++) { + nvmlh->nvmlDeviceGetHandleByIndex(i, &nvmlh->devs[i]); + } + + /* Query PCI info for each NVML device, and build table for mapping of */ + /* CUDA device IDs to NVML device IDs and vice versa */ + for (i=0; invml_gpucount; i++) { + nvmlPciInfo_t pciinfo; + + nvmlh->nvmlDeviceGetPciInfo(nvmlh->devs[i], &pciinfo); + nvmlh->nvml_pci_domain_id[i] = pciinfo.domain; + nvmlh->nvml_pci_bus_id[i] = pciinfo.bus; + nvmlh->nvml_pci_device_id[i] = pciinfo.device; + nvmlh->nvml_pci_vendor_id[i] = pciinfo.pci_device_id; + nvmlh->nvml_pci_subsys_id[i] = pciinfo.pci_subsystem_id; + + nvmlh->app_clocks[i] = NVML_FEATURE_UNKNOWN; + if (nvmlh->nvmlDeviceSetAPIRestriction) { + nvmlh->nvmlDeviceSetAPIRestriction(nvmlh->devs[i], NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, + NVML_FEATURE_ENABLED); + /* there is only this API_SET_APPLICATION_CLOCKS on the 750 Ti (340.58) */ + } + if (nvmlh->nvmlDeviceGetAPIRestriction) { + nvmlh->nvmlDeviceGetAPIRestriction(nvmlh->devs[i], NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, + &nvmlh->app_clocks[i]); + } + } + + /* build mapping of NVML device IDs to CUDA IDs */ + for (i=0; invml_gpucount; i++) { + nvmlh->nvml_cuda_device_id[i] = -1; + } + for (i=0; icuda_gpucount; i++) { + cudaDeviceProp props; + nvmlh->cuda_nvml_device_id[i] = -1; + + if (cudaGetDeviceProperties(&props, i) == cudaSuccess) { + device_bus_ids[i] = props.pciBusID; + for (int j = 0; j < nvmlh->nvml_gpucount; j++) { + if ((nvmlh->nvml_pci_domain_id[j] == (uint32_t) props.pciDomainID) && + (nvmlh->nvml_pci_bus_id[j] == (uint32_t) props.pciBusID) && + (nvmlh->nvml_pci_device_id[j] == (uint32_t) props.pciDeviceID)) { + if (opt_debug) + applog(LOG_DEBUG, "CUDA GPU %d matches NVML GPU %d by busId %u", + i, j, (uint32_t) props.pciBusID); + nvmlh->nvml_cuda_device_id[j] = i; + nvmlh->cuda_nvml_device_id[i] = j; + } + } + } + } + + return nvmlh; +} + +/* apply config clocks to an used device */ +int nvml_set_clocks(nvml_handle *nvmlh, int dev_id) +{ + nvmlReturn_t rc; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + //if (need_nvsettings) /* prefer later than init time */ + // nvs_set_clocks(dev_id); + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!device_gpu_clocks[dev_id] && !device_mem_clocks[dev_id]) + return 0; // nothing to do + + if (nvmlh->app_clocks[n] != NVML_FEATURE_ENABLED) { + applog(LOG_WARNING, "GPU #%d: NVML application clock feature is not allowed!", dev_id); + return -EPERM; + } + + uint32_t mem_prev = clock_prev_mem[dev_id]; + if (!mem_prev) + nvmlh->nvmlDeviceGetApplicationsClock(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_prev); + uint32_t gpu_prev = clock_prev[dev_id]; + if (!gpu_prev) + nvmlh->nvmlDeviceGetApplicationsClock(nvmlh->devs[n], NVML_CLOCK_GRAPHICS, &gpu_prev); + + nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_clk); + rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_GRAPHICS, &gpu_clk); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: unable to query application clocks", dev_id); + return -EINVAL; + } + + if (opt_debug) + applog(LOG_DEBUG, "GPU #%d: default application clocks are %u/%u", dev_id, mem_clk, gpu_clk); + + // get application config values + if (device_mem_clocks[dev_id]) mem_clk = device_mem_clocks[dev_id]; + if (device_gpu_clocks[dev_id]) gpu_clk = device_gpu_clocks[dev_id]; + + // these functions works for the 960 and the 970 (346.72+), and for the 750 Ti with driver ~361+ + uint32_t nclocks = 0, 
mem_clocks[32] = { 0 }; + nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, NULL); + nclocks = min(nclocks, 32); + if (nclocks) + nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, mem_clocks); + for (uint8_t u=0; u < nclocks; u++) { + // ordered by pstate (so highest is first memory clock - P0) + if (mem_clocks[u] <= mem_clk) { + mem_clk = mem_clocks[u]; + break; + } + } + + uint32_t* gpu_clocks = NULL; + nclocks = 0; + nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, NULL); + if (nclocks) { + if (opt_debug) + applog(LOG_DEBUG, "GPU #%d: %u clocks found for mem %u", dev_id, nclocks, mem_clk); + gpu_clocks = (uint32_t*) calloc(1, sizeof(uint32_t) * nclocks + 4); + nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, gpu_clocks); + for (uint8_t u=0; u < nclocks; u++) { + // ordered desc, so get first + if (gpu_clocks[u] <= gpu_clk) { + gpu_clk = gpu_clocks[u]; + break; + } + } + free(gpu_clocks); + } + + rc = nvmlh->nvmlDeviceSetApplicationsClocks(nvmlh->devs[n], mem_clk, gpu_clk); + if (rc == NVML_SUCCESS) + applog(LOG_INFO, "GPU #%d: application clocks set to %u/%u", dev_id, mem_clk, gpu_clk); + else { + applog(LOG_WARNING, "GPU #%d: %u/%u - %s (NVML)", dev_id, mem_clk, gpu_clk, nvmlh->nvmlErrorString(rc)); + return -1; + } + + // store previous clocks for reset on exit (or during wait...) + clock_prev[dev_id] = gpu_prev; + clock_prev_mem[dev_id] = mem_prev; + return 1; +} + +/* reset default app clocks and limits on exit */ +int nvml_reset_clocks(nvml_handle *nvmlh, int dev_id) +{ + int ret = 0; + nvmlReturn_t rc; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (need_nvsettings) + nvs_reset_clocks(dev_id); + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (clock_prev[dev_id]) { + rc = nvmlh->nvmlDeviceResetApplicationsClocks(nvmlh->devs[n]); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: unable to reset application clocks", dev_id); + } + clock_prev[dev_id] = 0; + ret = 1; + } + + if (limit_prev[dev_id]) { + uint32_t plimit = limit_prev[dev_id]; + if (nvmlh->nvmlDeviceGetPowerManagementDefaultLimit && !plimit) { + rc = nvmlh->nvmlDeviceGetPowerManagementDefaultLimit(nvmlh->devs[n], &plimit); + } else if (plimit) { + rc = NVML_SUCCESS; + } + if (rc == NVML_SUCCESS) + nvmlh->nvmlDeviceSetPowerManagementLimit(nvmlh->devs[n], plimit); + ret = 1; + } + return ret; +} + +/** + * Set power state of a device (9xx) + * Code is similar as clocks one, which allow the change of the pstate + */ +int nvml_set_pstate(nvml_handle *nvmlh, int dev_id) +{ + nvmlReturn_t rc; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (device_pstate[dev_id] < 0) + return 0; + + if (nvmlh->app_clocks[n] != NVML_FEATURE_ENABLED) { + applog(LOG_WARNING, "GPU #%d: NVML app. 
clock feature is not allowed!", dev_id); + return -EPERM; + } + + nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_clk); + rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_GRAPHICS, &gpu_clk); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: unable to query application clocks", dev_id); + return -EINVAL; + } + + // get application config values + if (device_mem_clocks[dev_id]) mem_clk = device_mem_clocks[dev_id]; + if (device_gpu_clocks[dev_id]) gpu_clk = device_gpu_clocks[dev_id]; + + // these functions works for the 960 and the 970 (346.72+), and for the 750 Ti with driver ~361+ + uint32_t nclocks = 0, mem_clocks[32] = { 0 }; + int8_t wanted_pstate = device_pstate[dev_id]; + nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, NULL); + nclocks = min(nclocks, 32); + if (nclocks) + nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, mem_clocks); + if ((uint32_t) wanted_pstate+1 > nclocks) { + applog(LOG_WARNING, "GPU #%d: only %u mem clocks available (p-states)", dev_id, nclocks); + } + for (uint8_t u=0; u < nclocks; u++) { + // ordered by pstate (so highest P0 first) + if (u == wanted_pstate) { + mem_clk = mem_clocks[u]; + break; + } + } + + uint32_t* gpu_clocks = NULL; + nclocks = 0; + nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, NULL); + if (nclocks) { + gpu_clocks = (uint32_t*) calloc(1, sizeof(uint32_t) * nclocks + 4); + rc = nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, gpu_clocks); + if (rc == NVML_SUCCESS) { + // ordered desc, get the max app clock (do not limit) + gpu_clk = gpu_clocks[0]; + } + free(gpu_clocks); + } + + rc = nvmlh->nvmlDeviceSetApplicationsClocks(nvmlh->devs[n], mem_clk, gpu_clk); + if (rc != NVML_SUCCESS) { + applog(LOG_WARNING, "GPU #%d: pstate P%d (%u/%u) %s", dev_id, (int) wanted_pstate, + mem_clk, gpu_clk, nvmlh->nvmlErrorString(rc)); + return -1; + } + + if (!opt_quiet) + applog(LOG_INFO, "GPU #%d: app clocks set to P%d (%u/%u)", dev_id, (int) wanted_pstate, mem_clk, gpu_clk); + + clock_prev[dev_id] = 1; + return 1; +} + +int nvml_set_plimit(nvml_handle *nvmlh, int dev_id) +{ + nvmlReturn_t rc = NVML_ERROR_UNKNOWN; + uint32_t gpu_clk = 0, mem_clk = 0; + int n = nvmlh->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!device_plimit[dev_id]) + return 0; // nothing to do + + if (!nvmlh->nvmlDeviceSetPowerManagementLimit) + return -ENOSYS; + + uint32_t plimit = device_plimit[dev_id] * 1000; + uint32_t pmin = 1000, pmax = 0, prev_limit = 0; + if (nvmlh->nvmlDeviceGetPowerManagementLimitConstraints) + rc = nvmlh->nvmlDeviceGetPowerManagementLimitConstraints(nvmlh->devs[n], &pmin, &pmax); + + if (rc != NVML_SUCCESS) { + if (!nvmlh->nvmlDeviceGetPowerManagementLimit) + return -ENOSYS; + } + nvmlh->nvmlDeviceGetPowerManagementLimit(nvmlh->devs[n], &prev_limit); + if (!pmax) pmax = prev_limit; + + plimit = min(plimit, pmax); + plimit = max(plimit, pmin); + rc = nvmlh->nvmlDeviceSetPowerManagementLimit(nvmlh->devs[n], plimit); + if (rc != NVML_SUCCESS) { +#ifndef WIN32 + applog(LOG_WARNING, "GPU #%d: plimit %s", dev_id, nvmlh->nvmlErrorString(rc)); +#endif + return -1; + } else { + device_plimit[dev_id] = plimit / 1000; + nvml_plimit_set = true; + } + + if (!opt_quiet) { + applog(LOG_INFO, "GPU #%d: power limit set to %uW (allowed range is %u-%u)", + dev_id, plimit/1000U, pmin/1000U, pmax/1000U); + } + + limit_prev[dev_id] = prev_limit; + return 1; +} + 
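+/* Minimal usage sketch (illustrative only, kept in #if 0 so it is never
+ * compiled): the intended call order for the NVML helpers above, assuming
+ * hnvml was returned by nvml_create() and dev_id is a valid CUDA device
+ * index. The example_* wrappers are hypothetical, not part of ccminer. */
+#if 0
+static void example_apply_gpu_tuning(int dev_id)
+{
+	if (!hnvml) return;
+	nvml_set_plimit(hnvml, dev_id); /* uses device_plimit[dev_id], in watts */
+	nvml_set_pstate(hnvml, dev_id); /* optional, uses device_pstate[dev_id] */
+	nvml_set_clocks(hnvml, dev_id); /* uses device_gpu_clocks[] / device_mem_clocks[] */
+}
+
+static void example_restore_gpu_tuning(int dev_id)
+{
+	if (hnvml)
+		nvml_reset_clocks(hnvml, dev_id); /* restores app clocks and the power limit */
+}
+#endif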
+uint32_t nvml_get_plimit(nvml_handle *nvmlh, int dev_id) +{ + uint32_t plimit = 0; + int n = nvmlh ? nvmlh->cuda_nvml_device_id[dev_id] : -1; + if (n < 0 || n >= nvmlh->nvml_gpucount) + return 0; + + if (nvmlh->nvmlDeviceGetPowerManagementLimit) { + nvmlh->nvmlDeviceGetPowerManagementLimit(nvmlh->devs[n], &plimit); + } + return plimit; +} + +// ccminer -D -n +#define LSTDEV_PFX " " +void nvml_print_device_info(int dev_id) +{ + if (!hnvml) return; + + int n = hnvml->cuda_nvml_device_id[dev_id]; + if (n < 0 || n >= hnvml->nvml_gpucount) + return; + + nvmlReturn_t rc; + + // fprintf(stderr, "------ Hardware ------\n"); + int gvid = hnvml->nvml_pci_vendor_id[n] & 0xFFFF; + int gpid = hnvml->nvml_pci_vendor_id[n] >> 16; + int svid = hnvml->nvml_pci_subsys_id[n] & 0xFFFF; + int spid = hnvml->nvml_pci_subsys_id[n] >> 16; + + fprintf(stderr, LSTDEV_PFX "ID %04x:%04x/%04x:%04x BUS %04x:%02x:%02x.0\n", gvid, gpid, svid, spid, + (int) hnvml->nvml_pci_domain_id[n], (int) hnvml->nvml_pci_bus_id[n], (int) hnvml->nvml_pci_device_id[n]); + + if (hnvml->nvmlDeviceGetClock) { + uint32_t gpu_clk = 0, mem_clk = 0; + + // fprintf(stderr, "------- Clocks -------\n"); + + hnvml->nvmlDeviceGetClock(hnvml->devs[n], NVML_CLOCK_GRAPHICS, NVML_CLOCK_ID_APP_CLOCK_DEFAULT, &gpu_clk); + rc = hnvml->nvmlDeviceGetClock(hnvml->devs[n], NVML_CLOCK_MEM, NVML_CLOCK_ID_APP_CLOCK_DEFAULT, &mem_clk); + if (rc == NVML_SUCCESS) { + fprintf(stderr, LSTDEV_PFX "DEFAULT MEM %4u GPU %4u MHz\n", mem_clk, gpu_clk); + } + hnvml->nvmlDeviceGetClock(hnvml->devs[n], NVML_CLOCK_GRAPHICS, NVML_CLOCK_ID_APP_CLOCK_TARGET, &gpu_clk); + rc = hnvml->nvmlDeviceGetClock(hnvml->devs[n], NVML_CLOCK_MEM, NVML_CLOCK_ID_APP_CLOCK_TARGET, &mem_clk); + if (rc == NVML_SUCCESS) { + fprintf(stderr, LSTDEV_PFX "TARGET MEM %4u GPU %4u MHz\n", mem_clk, gpu_clk); + } + hnvml->nvmlDeviceGetClock(hnvml->devs[n], NVML_CLOCK_GRAPHICS, NVML_CLOCK_ID_CURRENT, &gpu_clk); + rc = hnvml->nvmlDeviceGetClock(hnvml->devs[n], NVML_CLOCK_MEM, NVML_CLOCK_ID_CURRENT, &mem_clk); + if (rc == NVML_SUCCESS) { + fprintf(stderr, LSTDEV_PFX "CURRENT MEM %4u GPU %4u MHz\n", mem_clk, gpu_clk); + } + } +} + +int nvml_get_gpucount(nvml_handle *nvmlh, int *gpucount) +{ + *gpucount = nvmlh->nvml_gpucount; + return 0; +} + +int cuda_get_gpucount(nvml_handle *nvmlh, int *gpucount) +{ + *gpucount = nvmlh->cuda_gpucount; + return 0; +} + + +int nvml_get_gpu_name(nvml_handle *nvmlh, int cudaindex, char *namebuf, int bufsize) +{ + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetName) + return -ENOSYS; + + if (nvmlh->nvmlDeviceGetName(nvmlh->devs[gpuindex], namebuf, bufsize) != NVML_SUCCESS) + return -1; + + return 0; +} + + +int nvml_get_tempC(nvml_handle *nvmlh, int cudaindex, unsigned int *tempC) +{ + nvmlReturn_t rc; + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetTemperature) + return -ENOSYS; + + rc = nvmlh->nvmlDeviceGetTemperature(nvmlh->devs[gpuindex], 0u /* NVML_TEMPERATURE_GPU */, tempC); + if (rc != NVML_SUCCESS) { + return -1; + } + + return 0; +} + + +int nvml_get_fanpcnt(nvml_handle *nvmlh, int cudaindex, unsigned int *fanpcnt) +{ + nvmlReturn_t rc; + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetFanSpeed) + return -ENOSYS; + + rc = 
nvmlh->nvmlDeviceGetFanSpeed(nvmlh->devs[gpuindex], fanpcnt); + if (rc != NVML_SUCCESS) { + return -1; + } + + return 0; +} + + +int nvml_get_current_clocks(int cudaindex, unsigned int *graphics_clock, unsigned int *mem_clock) +{ + nvmlReturn_t rc; + int gpuindex = hnvml->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= hnvml->nvml_gpucount) return -ENODEV; + if (!hnvml->nvmlDeviceGetClockInfo) return -ENOSYS; + + rc = hnvml->nvmlDeviceGetClockInfo(hnvml->devs[gpuindex], NVML_CLOCK_SM, graphics_clock); + if (rc != NVML_SUCCESS) return -1; + rc = hnvml->nvmlDeviceGetClockInfo(hnvml->devs[gpuindex], NVML_CLOCK_MEM, mem_clock); + if (rc != NVML_SUCCESS) return -1; + + return 0; +} + +/* Not Supported on 750Ti 340.23 */ +int nvml_get_power_usage(nvml_handle *nvmlh, int cudaindex, unsigned int *milliwatts) +{ + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetPowerUsage) + return -ENOSYS; + + nvmlReturn_t res = nvmlh->nvmlDeviceGetPowerUsage(nvmlh->devs[gpuindex], milliwatts); + if (res != NVML_SUCCESS) { + //if (opt_debug) + // applog(LOG_DEBUG, "nvmlDeviceGetPowerUsage: %s", nvmlh->nvmlErrorString(res)); + return -1; + } + + return 0; +} + +/* Not Supported on 750Ti 340.23 */ +int nvml_get_pstate(nvml_handle *nvmlh, int cudaindex, int *pstate) +{ + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetPerformanceState) + return -ENOSYS; + + nvmlReturn_t res = nvmlh->nvmlDeviceGetPerformanceState(nvmlh->devs[gpuindex], pstate); + if (res != NVML_SUCCESS) { + //if (opt_debug) + // applog(LOG_DEBUG, "nvmlDeviceGetPerformanceState: %s", nvmlh->nvmlErrorString(res)); + return -1; + } + + return 0; +} + +int nvml_get_busid(nvml_handle *nvmlh, int cudaindex, int *busid) +{ + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -ENODEV; + + (*busid) = nvmlh->nvml_pci_bus_id[gpuindex]; + return 0; +} + +int nvml_get_serial(nvml_handle *nvmlh, int cudaindex, char *sn, int maxlen) +{ + uint32_t subids = 0; + char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + nvmlReturn_t res; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (nvmlh->nvmlDeviceGetSerial) { + res = nvmlh->nvmlDeviceGetSerial(nvmlh->devs[gpuindex], sn, maxlen); + if (res == NVML_SUCCESS) + return 0; + } + + if (!nvmlh->nvmlDeviceGetUUID) + return -ENOSYS; + + // nvmlDeviceGetUUID: GPU-f2bd642c-369f-5a14-e0b4-0d22dfe9a1fc + // use a part of uuid to generate an unique serial + // todo: check if there is vendor id is inside + memset(uuid, 0, sizeof(uuid)); + res = nvmlh->nvmlDeviceGetUUID(nvmlh->devs[gpuindex], uuid, sizeof(uuid)-1); + if (res != NVML_SUCCESS) { + if (opt_debug) + applog(LOG_DEBUG, "nvmlDeviceGetUUID: %s", nvmlh->nvmlErrorString(res)); + return -1; + } + strncpy(sn, &uuid[4], min((int) strlen(uuid), maxlen)); + sn[maxlen-1] = '\0'; + return 0; +} + +int nvml_get_bios(nvml_handle *nvmlh, int cudaindex, char *desc, int maxlen) +{ + uint32_t subids = 0; + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -ENODEV; + + if (!nvmlh->nvmlDeviceGetVbiosVersion) + return -ENOSYS; + + nvmlReturn_t res = nvmlh->nvmlDeviceGetVbiosVersion(nvmlh->devs[gpuindex], desc, maxlen); + if (res != NVML_SUCCESS) { + 
if (opt_debug) + applog(LOG_DEBUG, "nvmlDeviceGetVbiosVersion: %s", nvmlh->nvmlErrorString(res)); + return -1; + } + return 0; +} + +int nvml_get_info(nvml_handle *nvmlh, int cudaindex, uint16_t &vid, uint16_t &pid) +{ + uint32_t subids = 0; + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -ENODEV; + + subids = nvmlh->nvml_pci_subsys_id[gpuindex]; + if (!subids) subids = nvmlh->nvml_pci_vendor_id[gpuindex]; + pid = subids >> 16; + vid = subids & 0xFFFF; + // Colorful and Inno3D + if (pid == 0) pid = nvmlh->nvml_pci_vendor_id[gpuindex] >> 16; + return 0; +} + +int nvml_destroy(nvml_handle *nvmlh) +{ + nvmlh->nvmlShutdown(); + + wrap_dlclose(nvmlh->nvml_dll); + + free(nvmlh->nvml_pci_bus_id); + free(nvmlh->nvml_pci_device_id); + free(nvmlh->nvml_pci_domain_id); + free(nvmlh->nvml_pci_vendor_id); + free(nvmlh->nvml_pci_subsys_id); + free(nvmlh->nvml_cuda_device_id); + free(nvmlh->cuda_nvml_device_id); + free(nvmlh->app_clocks); + free(nvmlh->devs); + + free(nvmlh); + return 0; +} + +// ---------------------------------------------------------------------------- + +/** + * nvapi alternative for windows x86 binaries + * nvml api doesn't exists as 32bit dll :/// + */ +#ifdef WIN32 +#include "nvapi/nvapi_ccminer.h" + +static unsigned int nvapi_dev_map[MAX_GPUS] = { 0 }; +static NvDisplayHandle hDisplay_a[NVAPI_MAX_PHYSICAL_GPUS * 2] = { 0 }; +static NvPhysicalGpuHandle phys[NVAPI_MAX_PHYSICAL_GPUS] = { 0 }; +static NvU32 nvapi_dev_cnt = 0; +extern bool nvapi_dll_loaded; + +int nvapi_temperature(unsigned int devNum, unsigned int *temperature) +{ + NvAPI_Status ret; + + if (devNum >= nvapi_dev_cnt) + return -ENODEV; + + NV_GPU_THERMAL_SETTINGS thermal; + thermal.version = NV_GPU_THERMAL_SETTINGS_VER; + ret = NvAPI_GPU_GetThermalSettings(phys[devNum], 0, &thermal); + if (ret != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI NvAPI_GPU_GetThermalSettings: %s", string); + return -1; + } + + (*temperature) = (unsigned int) thermal.sensor[0].currentTemp; + + return 0; +} + +int nvapi_fanspeed(unsigned int devNum, unsigned int *speed) +{ + NvAPI_Status ret; + + if (devNum >= nvapi_dev_cnt) + return -ENODEV; + + NvU32 fanspeed = 0; + ret = NvAPI_GPU_GetTachReading(phys[devNum], &fanspeed); + if (ret != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI NvAPI_GPU_GetTachReading: %s", string); + return -1; + } + + (*speed) = (unsigned int) fanspeed; + + return 0; +} + +int nvapi_getpstate(unsigned int devNum, unsigned int *pstate) +{ + NvAPI_Status ret; + + if (devNum >= nvapi_dev_cnt) + return -ENODEV; + + NV_GPU_PERF_PSTATE_ID CurrentPstate = NVAPI_GPU_PERF_PSTATE_UNDEFINED; /* 16 */ + ret = NvAPI_GPU_GetCurrentPstate(phys[devNum], &CurrentPstate); + if (ret != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI NvAPI_GPU_GetCurrentPstate: %s", string); + return -1; + } + else { + // get pstate for the moment... 
often 0 = P0 + (*pstate) = (unsigned int)CurrentPstate; + } + + return 0; +} + +#define UTIL_DOMAIN_GPU 0 +int nvapi_getusage(unsigned int devNum, unsigned int *pct) +{ + NvAPI_Status ret; + + if (devNum >= nvapi_dev_cnt) + return -ENODEV; + + NV_GPU_DYNAMIC_PSTATES_INFO_EX info; + info.version = NV_GPU_DYNAMIC_PSTATES_INFO_EX_VER; + ret = NvAPI_GPU_GetDynamicPstatesInfoEx(phys[devNum], &info); + if (ret != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI GetDynamicPstatesInfoEx: %s", string); + return -1; + } + else { + if (info.utilization[UTIL_DOMAIN_GPU].bIsPresent) + (*pct) = info.utilization[UTIL_DOMAIN_GPU].percentage; + } + + return 0; +} + +int nvapi_getinfo(unsigned int devNum, uint16_t &vid, uint16_t &pid) +{ + NvAPI_Status ret; + NvU32 pDeviceId, pSubSystemId, pRevisionId, pExtDeviceId; + + if (devNum >= nvapi_dev_cnt) + return -ENODEV; + + ret = NvAPI_GPU_GetPCIIdentifiers(phys[devNum], &pDeviceId, &pSubSystemId, &pRevisionId, &pExtDeviceId); + if (ret != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI GetPCIIdentifiers: %s", string); + return -1; + } + + pid = pDeviceId >> 16; + vid = pDeviceId & 0xFFFF; + if (vid == 0x10DE && pSubSystemId) { + vid = pSubSystemId & 0xFFFF; + pid = pSubSystemId >> 16; + // Colorful and Inno3D + if (pid == 0) pid = pDeviceId >> 16; + } + + return 0; +} + +int nvapi_getserial(unsigned int devNum, char *serial, unsigned int maxlen) +{ + NvAPI_Status ret; + if (devNum >= nvapi_dev_cnt) + return -ENODEV; + + memset(serial, 0, maxlen); + + if (maxlen < 11) + return -EINVAL; + + NvAPI_ShortString ser = { 0 }; + ret = NvAPI_DLL_GetSerialNumber(phys[devNum], ser); + if (ret != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI GetSerialNumber: %s", string); + return -1; + } + + uint8_t *bytes = (uint8_t*) ser; + for (int n=0; n<5; n++) sprintf(&serial[n*2], "%02X", bytes[n]); + return 0; +} + +int nvapi_getbios(unsigned int devNum, char *desc, unsigned int maxlen) +{ + NvAPI_Status ret; + if (devNum >= nvapi_dev_cnt) + return -ENODEV; + + if (maxlen < 64) // Short String + return -1; + + ret = NvAPI_GPU_GetVbiosVersionString(phys[devNum], desc); + if (ret != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI GetVbiosVersionString: %s", string); + return -1; + } + return 0; +} + +static int SetAsusRGBLogo(unsigned int devNum, uint32_t RGB, bool ignorePrevState) +{ + NvAPI_Status ret = NVAPI_OK; + NV_I2C_INFO_EX* i2cInfo; + + int delay1 = 20000; + int delay2 = 0; + + uchar4 rgb = { 0 }; + memcpy(&rgb, &RGB, 4); + uchar4 prgb = { 0 }; + int32_t prev = device_led_state[nvapi_devid(devNum)]; + memcpy(&prgb, &prev, 4); + + NV_INIT_STRUCT_ALLOC(NV_I2C_INFO_EX, i2cInfo); + if (i2cInfo == NULL) return -ENOMEM; + + NvU32 data[5] = { 0 }; + NvU32 datv[2] = { 0, 1 }; + NvU32 datw[2] = { 1, 0 }; + if (rgb.z != prgb.z || ignorePrevState) { + data[2] = 4; // R:4 G:5 B:6, Mode = 7 (1 static, 2 breath, 3 blink, 4 demo) + data[3] = 1; + datv[0] = rgb.z | 0x13384000; + + i2cInfo->i2cDevAddress = 0x52; + i2cInfo->pbI2cRegAddress = (NvU8*) (&data[2]); + i2cInfo->regAddrSize = 1; + i2cInfo->pbData = (NvU8*) datv; + i2cInfo->cbRead = 5; + i2cInfo->cbSize = 1; + i2cInfo->portId = 1; + i2cInfo->bIsPortIdSet = 1; + + ret = NvAPI_DLL_I2CWriteEx(phys[devNum], i2cInfo, datw); + usleep(delay1); + 
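+		/* The same NV_I2C_INFO_EX transaction is repeated below for the green
+		 * (register 5), blue (register 6) and mode (register 7) bytes; only the
+		 * register index in data[2] and the payload in datv[0] change. */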
has_rgb_ok = (ret == NVAPI_OK); + } + + if (rgb.y != prgb.y || ignorePrevState) { + data[2] = 5; + data[3] = 1; + datv[0] = rgb.y | 0x4000; + + i2cInfo->i2cDevAddress = 0x52; + i2cInfo->pbI2cRegAddress = (NvU8*) (&data[2]); + i2cInfo->regAddrSize = 1; + i2cInfo->pbData = (NvU8*) datv; + i2cInfo->cbRead = 5; + i2cInfo->cbSize = 1; + i2cInfo->portId = 1; + i2cInfo->bIsPortIdSet = 1; + + ret = NvAPI_DLL_I2CWriteEx(phys[devNum], i2cInfo, datw); + usleep(delay1); + has_rgb_ok = (ret == NVAPI_OK); + } + + if (rgb.y != prgb.y || ignorePrevState) { + data[2] = 6; + data[3] = 1; + datv[0] = rgb.x | 0x4000; + + i2cInfo->i2cDevAddress = 0x52; + i2cInfo->pbI2cRegAddress = (NvU8*) (&data[2]); + i2cInfo->regAddrSize = 1; + i2cInfo->pbData = (NvU8*) datv; + i2cInfo->cbRead = 5; + i2cInfo->cbSize = 1; + i2cInfo->portId = 1; + i2cInfo->bIsPortIdSet = 1; + + ret = NvAPI_DLL_I2CWriteEx(phys[devNum], i2cInfo, datw); + usleep(delay1); + has_rgb_ok = (ret == NVAPI_OK); + } + + if (rgb.w && ignorePrevState) { + data[2] = 7; + data[3] = 1; + datv[0] = rgb.w | 0x4000; + + i2cInfo->i2cDevAddress = 0x52; + i2cInfo->pbI2cRegAddress = (NvU8*) (&data[2]); + i2cInfo->regAddrSize = 1; + i2cInfo->pbData = (NvU8*) datv; + i2cInfo->cbRead = 5; + i2cInfo->cbSize = 1; + i2cInfo->portId = 1; + i2cInfo->bIsPortIdSet = 1; + + ret = NvAPI_DLL_I2CWriteEx(phys[devNum], i2cInfo, datw); + usleep(delay1); + has_rgb_ok = (ret == NVAPI_OK); + } + usleep(delay2); + free(i2cInfo); + return (int) ret; +} + +static int SetGigabyteRGBLogo(unsigned int devNum, uint32_t RGB) +{ + NvAPI_Status ret; + NV_I2C_INFO_EX* i2cInfo; + NV_INIT_STRUCT_ALLOC(NV_I2C_INFO_EX, i2cInfo); + if (i2cInfo == NULL) + return -ENOMEM; + + NvU32 readBuf[25] = { 0 }; + NvU32 data[5] = { 0 }; + data[0] = 1; + data[2] = swab32(RGB & 0xfcfcfcU) | 0x40; + + i2cInfo->i2cDevAddress = 0x48 << 1; + i2cInfo->pbI2cRegAddress = (NvU8*) (&data[2]); + i2cInfo->regAddrSize = 4; // NVAPI_MAX_SIZEOF_I2C_REG_ADDRESS + i2cInfo->pbData = (NvU8*) readBuf; + i2cInfo->cbRead = 2; + i2cInfo->cbSize = sizeof(readBuf); + i2cInfo->portId = 1; + i2cInfo->bIsPortIdSet = 1; + + //ret = NvAPI_DLL_I2CWriteEx(phys[devNum], i2cInfo, data); + ret = NvAPI_DLL_I2CReadEx(phys[devNum], i2cInfo, data); + usleep(20000); + free(i2cInfo); + return (int) ret; +} + +static int SetZotacRGBLogo(unsigned int devNum, uint32_t RGB) +{ + NvAPI_Status ret; + NV_I2C_INFO* i2cInfo; + NV_INIT_STRUCT_ALLOC(NV_I2C_INFO, i2cInfo); + if (i2cInfo == NULL) + return -ENOMEM; + + NvU32 buf[25] = { 0 }; + NvU32 data[5] = { 0 }; + + uint32_t color = 0, level = 0x40; + + uchar4 rgb = { 0 }; + memcpy(&rgb, &RGB, 4); + level = rgb.x & 0xF0; + level |= rgb.y & 0xF0; + level |= rgb.z & 0xF0; + //applog(LOG_DEBUG, "R %u G %u B %u", rgb.z, rgb.y, rgb.x); + + // Not really RGB custom, only some basic colors, so convert + // 0: Red, 1: Yellow, 2: Green, 3: Cyan, 4: Blue, 5: magenta, 6: white + if ((RGB & 0xFF0000) && (RGB & 0xFF00) && (RGB & 0xFF)) color = 6; + else if ((RGB & 0xFF0000) && (RGB & 0xFF)) color = 5; + else if ((RGB & 0xFF00) && (RGB & 0xFF)) color = 3; + else if ((RGB & 0xFF0000) && (RGB & 0xFF00)) color = 1; + else if (RGB & 0xFF) color = 4; + else if (RGB & 0xFF00) color = 2; + + buf[0] = 0xF0; // F0 set colors + buf[0] |= (color << 8); // logo + buf[0] |= (1 << 16); // top + if (RGB != 0) // level : 0x10 to 0xF0 + buf[0] |= (level << 24); + else + buf[0] |= (0x10U << 24); + + // todo: i2c data crc ? 
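+	/* Example of the word built above: RGB 0x00FF00 gives color 2 (green) and
+	 * level 0xF0, so buf[0] = 0xF0 | (2 << 8) | (1 << 16) | (0xF0U << 24). */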
+ + i2cInfo->displayMask = 1; + i2cInfo->bIsDDCPort = 1; + i2cInfo->i2cDevAddress = 0x48 << 1; + i2cInfo->pbI2cRegAddress = (NvU8*) (&data[2]); + i2cInfo->regAddrSize = 1; + i2cInfo->pbData = (NvU8*) buf; + i2cInfo->cbSize = 4; + i2cInfo->i2cSpeed = NVAPI_I2C_SPEED_DEPRECATED; + i2cInfo->i2cSpeedKhz = NVAPI_I2C_SPEED_100KHZ; // 4 + i2cInfo->portId = 1; + i2cInfo->bIsPortIdSet = 1; + + ret = NvAPI_I2CWrite(phys[devNum], i2cInfo); + // required to prevent i2c lock + usleep(20000); + +#if 0 + buf[0] = 0xF7; // F7 toggle leds + if (RGB == 0) + buf[0] |= (1 << 8); // 0 logo on, 1 off + buf[0] |= (1 << 16); // 1 top off + ret = NvAPI_I2CWrite(phys[devNum], i2cInfo); + usleep(20000); +#endif + // other modes: + // 0xF1 breathing green (0x070202F1) + // 0xF2 strobe green (0x070202F2) + // 0xF3 cycle (0x000000F3) + + free(i2cInfo); + return (int) ret; +} + +int nvapi_set_led(unsigned int devNum, int RGB, char *device_name) +{ + uint16_t vid = 0, pid = 0; + NvAPI_Status ret; + if (strstr(device_name, "Gigabyte GTX 10")) { + if (opt_debug) + applog(LOG_DEBUG, "GPU %x: Set RGB led to %06x", (int) phys[devNum], RGB); + return SetGigabyteRGBLogo(devNum, (uint32_t) RGB); + } else if (strstr(device_name, "ASUS GTX 10")) { + if (opt_debug) + applog(LOG_DEBUG, "GPU %x: Set RGB led to %06x", (int) phys[devNum], RGB); + return SetAsusRGBLogo(devNum, (uint32_t) RGB, !has_rgb_ok); + } else if (strstr(device_name, "Zotac GTX 10")) { + if (opt_debug) + applog(LOG_DEBUG, "GPU %x: Set RGB led to %06x", (int) phys[devNum], RGB); + return SetZotacRGBLogo(devNum, (uint32_t) RGB); + } else { + NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM* illu; + NV_INIT_STRUCT_ALLOC(NV_GPU_QUERY_ILLUMINATION_SUPPORT_PARM, illu); + illu->hPhysicalGpu = phys[devNum]; + illu->Attribute = NV_GPU_IA_LOGO_BRIGHTNESS; + ret = NvAPI_GPU_QueryIlluminationSupport(illu); + if (!ret && illu->bSupported) { + NV_GPU_GET_ILLUMINATION_PARM *led; + NV_INIT_STRUCT_ALLOC(NV_GPU_GET_ILLUMINATION_PARM, led); + led->hPhysicalGpu = phys[devNum]; + led->Attribute = NV_GPU_IA_LOGO_BRIGHTNESS; + NvAPI_GPU_GetIllumination(led); + if (opt_debug) + applog(LOG_DEBUG, "GPU %x: Led level was %d, set to %d", (int) phys[devNum], led->Value, RGB); + led->Value = (uint32_t) RGB; + ret = NvAPI_GPU_SetIllumination((NV_GPU_SET_ILLUMINATION_PARM*) led); + free(led); + } + free(illu); + return ret; + } +} + +int nvapi_pstateinfo(unsigned int devNum) +{ + uint32_t n; + NvAPI_Status ret; + uint32_t* mem = (uint32_t*) calloc(1, 0x4000); + if (!mem) + return -ENOMEM; + + unsigned int current = 0xFF; + // useless on init but... 
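+	/* remember the active P-state so it can be flagged with '>' in the
+	 * table printed below */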
+ nvapi_getpstate(devNum, ¤t); + +#if 0 + // try :p + uint32_t* buf = (uint32_t*) calloc(1, 0x8000); + for (int i=8; i < 0x8000 && buf; i+=4) { + buf[0] = 0x10000 + i; + NV_GPU_PERF_PSTATE_ID pst = NVAPI_GPU_PERF_PSTATE_P0; + ret = NvAPI_DLL_GetPstateClientLimits(phys[devNum], pst, buf); + if (ret != NVAPI_INCOMPATIBLE_STRUCT_VERSION) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + applog(LOG_BLUE, "struct size is %06x : %s", buf[0], string); + for (int n=0; n < i/32; n++) + applog_hex(&buf[n*(32/4)], 32); + break; + } + } + free(buf); +#endif + +#if 0 + // Unsure of the meaning of these values + NVAPI_GPU_POWER_TOPO topo = { 0 }; + topo.version = NVAPI_GPU_POWER_TOPO_VER; + if ((ret = NvAPI_DLL_ClientPowerTopologyGetStatus(phys[devNum], &topo)) == NVAPI_OK) { + if (topo.count) + applog(LOG_RAW, " GPU TDP is %.1f~%.1f W ?", + (double) topo.entries[0].power/1000, (double) topo.entries[1].power/1000); + + // Ok on 970, not pascal + NV_GPU_PERF_PSTATES20_INFO_V2 pset2 = { 0 }; + pset2.version = NV_GPU_PERF_PSTATES20_INFO_VER2; + pset2.ov.numVoltages = 1; + pset2.ov.voltages[0].voltDelta_uV.value = 3000; // gpu + 3000 uv; + ret = NvAPI_DLL_SetPstates20v2(phys[devNum], &pset2); +#endif + + NV_GPU_PERF_PSTATES20_INFO* info; + NV_INIT_STRUCT_ON(NV_GPU_PERF_PSTATES20_INFO, info, mem); + if ((ret = NvAPI_GPU_GetPstates20(phys[devNum], info)) != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_RAW, "NVAPI GetPstates20: %s", string); + return -1; + } + + for (n=0; n < info->numPstates; n++) { + NV_GPU_PSTATE20_CLOCK_ENTRY_V1* clocks = info->pstates[n].clocks; + applog(LOG_RAW, "%sP%d: MEM %4u MHz%s GPU %6.1f MHz%s %4u mV%s \x7F %d/%d", + info->pstates[n].pstateId == current ? ">":" ", (int) info->pstates[n].pstateId, + clocks[1].data.single.freq_kHz/1000, clocks[1].bIsEditable ? "*":" ", + (double) clocks[0].data.single.freq_kHz/1000, clocks[0].bIsEditable ? "*":" ", + info->pstates[n].baseVoltages[0].volt_uV/1000, info->pstates[n].baseVoltages[0].bIsEditable ? "*": " ", + info->pstates[n].baseVoltages[0].voltDelta_uV.valueRange.min/1000, // range if editable + info->pstates[n].baseVoltages[0].voltDelta_uV.valueRange.max/1000); + if (clocks[1].freqDelta_kHz.value || clocks[0].freqDelta_kHz.value) { + applog(LOG_RAW, " OC %+4d MHz %+6.1f MHz", + clocks[1].freqDelta_kHz.value/1000, (double) clocks[0].freqDelta_kHz.value/1000); + } + } + // boost over volting (GTX 9xx only ?) + for (n=0; n < info->ov.numVoltages; n++) { + applog(LOG_RAW, " OV: %u%+d mV%s \x7F %d/%d", + info->ov.voltages[n].volt_uV/1000, info->ov.voltages[n].voltDelta_uV.value/1000, info->ov.voltages[n].bIsEditable ? 
"*":" ", + info->ov.voltages[n].voltDelta_uV.valueRange.min/1000, info->ov.voltages[n].voltDelta_uV.valueRange.max/1000); + } + + NV_GPU_CLOCK_FREQUENCIES *freqs; + NV_INIT_STRUCT_ON(NV_GPU_CLOCK_FREQUENCIES, freqs, mem); + freqs->ClockType = NV_GPU_CLOCK_FREQUENCIES_BASE_CLOCK; + ret = NvAPI_GPU_GetAllClockFrequencies(phys[devNum], freqs); + applog(LOG_RAW, " MEM %4.0f MHz GPU %6.1f MHz Base Clocks", + (double) freqs->domain[NVAPI_GPU_PUBLIC_CLOCK_MEMORY].frequency / 1000, + (double) freqs->domain[NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS].frequency / 1000); + + freqs->ClockType = NV_GPU_CLOCK_FREQUENCIES_BOOST_CLOCK; + ret = NvAPI_GPU_GetAllClockFrequencies(phys[devNum], freqs); + applog(LOG_RAW, " MEM %4.0f MHz GPU %6.1f MHz Boost Clocks", + (double) freqs->domain[NVAPI_GPU_PUBLIC_CLOCK_MEMORY].frequency / 1000, + (double) freqs->domain[NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS].frequency / 1000); + + freqs->ClockType = NV_GPU_CLOCK_FREQUENCIES_CURRENT_FREQ; + ret = NvAPI_GPU_GetAllClockFrequencies(phys[devNum], freqs); + applog(LOG_RAW, " MEM %4.0f MHz GPU %6.1f MHz >Current", + (double) freqs->domain[NVAPI_GPU_PUBLIC_CLOCK_MEMORY].frequency / 1000, + (double) freqs->domain[NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS].frequency / 1000); + + // Other clock values ?? + NVAPI_GPU_PERF_CLOCKS *pcl; + NV_INIT_STRUCT_ALLOC(NVAPI_GPU_PERF_CLOCKS, pcl); + int numClock=0; ret = NVAPI_OK; + while (ret == NVAPI_OK) { + if ((ret = NvAPI_DLL_GetPerfClocks(phys[devNum], numClock, pcl)) == NVAPI_OK) { + applog(LOG_RAW, " C%d: MEM %4.0f MHz GPU %6.1f MHz [%5.1f/%6.1f]", numClock, + (double) pcl->memFreq1/1000, (double) pcl->gpuFreq1/1000, (double) pcl->gpuFreqMin/1000, (double) pcl->gpuFreqMax/1000); + // ret = NvAPI_DLL_SetPerfClocks(phys[devNum], numClock, pcl); // error + } + numClock++; + } + + // Pascal only + NVAPI_VOLTBOOST_PERCENT *pvb; + NV_INIT_STRUCT_ON(NVAPI_VOLTBOOST_PERCENT, pvb, mem); + if ((ret = NvAPI_DLL_GetCoreVoltageBoostPercent(phys[devNum], pvb)) == NVAPI_OK) { + NVAPI_VOLTAGE_STATUS *pvdom; + NV_INIT_STRUCT_ALLOC(NVAPI_VOLTAGE_STATUS, pvdom); + NvAPI_DLL_GetCurrentVoltage(phys[devNum], pvdom); + if (pvdom && pvdom->value_uV) + applog(LOG_RAW, " GPU Voltage is %u mV %+d%% boost", pvdom->value_uV/1000, pvb->percent); + else if (pvdom) + applog(LOG_RAW, " GPU Voltage is %u mV", pvdom->value_uV/1000); + free(pvdom); + } else { + // Maxwell 9xx + NVAPI_VOLT_STATUS *mvdom, *mvstep; + NV_INIT_STRUCT_ALLOC(NVAPI_VOLT_STATUS, mvdom); + if (mvdom && (ret = NvAPI_DLL_GetVoltageDomainsStatus(phys[devNum], mvdom)) == NVAPI_OK) { + NV_INIT_STRUCT_ALLOC(NVAPI_VOLT_STATUS, mvstep); + NvAPI_DLL_GetVoltageStep(phys[devNum], mvstep); + if (mvdom->value_uV) applog(LOG_RAW, " GPU Voltage is %.1f mV with %.3f mV resolution", + (double) mvdom->value_uV/1000, (double) mvstep->value_uV/1000); + free(mvstep); + } + free(mvdom); + } + + uint32_t plim = nvapi_get_plimit(devNum); + double min_pw = 0, max_pw = 0; // percent + + NVAPI_GPU_POWER_INFO nfo = { 0 }; + nfo.version = NVAPI_GPU_POWER_INFO_VER; + ret = NvAPI_DLL_ClientPowerPoliciesGetInfo(phys[devNum], &nfo); + if (ret == NVAPI_OK && nfo.valid) { + min_pw = (double)nfo.entries[0].min_power / 1000; + max_pw = (double)nfo.entries[0].max_power / 1000; + } + applog(LOG_RAW, " Power limit is set to %u%%, range [%.0f-%.0f%%]", plim, min_pw, max_pw); + +#if 0 + NVAPI_COOLER_SETTINGS *cooler; + NV_INIT_STRUCT_ON(NVAPI_COOLER_SETTINGS, cooler, mem); + ret = NvAPI_DLL_GetCoolerSettings(phys[devNum], 7, cooler); + if (ret == NVAPI_OK) { + applog(LOG_RAW, " Fan level is set to %u%%", 
cooler->level); // wrong val, seems 1 (auto ?) + NVAPI_COOLER_LEVEL *fan; + NV_INIT_STRUCT_ALLOC(NVAPI_COOLER_LEVEL, fan); + fan->level = 100; + fan->count = 1; + ret = NvAPI_DLL_SetCoolerLevels(phys[devNum], 7, fan); + free(fan); + sleep(10); + ret = NvAPI_DLL_RestoreCoolerSettings(phys[devNum], cooler, 7); + } +#endif + + NV_GPU_THERMAL_SETTINGS *tset; + NV_INIT_STRUCT_ON(NV_GPU_THERMAL_SETTINGS, tset, mem); + + NVAPI_GPU_THERMAL_INFO *tnfo; + NV_INIT_STRUCT_ALLOC(NVAPI_GPU_THERMAL_INFO, tnfo); + NVAPI_GPU_THERMAL_LIMIT *tlim; + NV_INIT_STRUCT_ALLOC(NVAPI_GPU_THERMAL_LIMIT, tlim); + NvAPI_GPU_GetThermalSettings(phys[devNum], 0, tset); + NvAPI_DLL_ClientThermalPoliciesGetInfo(phys[devNum], tnfo); + if ((ret = NvAPI_DLL_ClientThermalPoliciesGetLimit(phys[devNum], tlim)) == NVAPI_OK) { + applog(LOG_RAW, " Thermal limit is set to %u, current Tc %d, range [%u-%u]", + tlim->entries[0].value >> 8, tset->sensor[0].currentTemp, + tnfo->entries[0].min_temp >> 8, tnfo->entries[0].max_temp >> 8); + } + free(tnfo); + free(tlim); + +#if 1 + // Read pascal Clocks Table, Empty on 9xx + //NVAPI_CLOCKS_RANGE* ranges; + //NV_INIT_STRUCT_ON(NVAPI_CLOCKS_RANGE, ranges, mem); + //ret = NvAPI_DLL_GetClockBoostRanges(phys[devNum], ranges); + + NVAPI_CLOCK_MASKS* boost; + NV_INIT_STRUCT_ON(NVAPI_CLOCK_MASKS, boost, mem); + ret = NvAPI_DLL_GetClockBoostMask(phys[devNum], boost); + int gpuClocks = 0, memClocks = 0; + for (n=0; n < 80+23; n++) { + if (boost->clocks[n].memDelta) memClocks++; + if (boost->clocks[n].gpuDelta) gpuClocks++; + } + + // PASCAL GTX ONLY + if (gpuClocks || memClocks) { + NVAPI_CLOCK_TABLE *table; + NV_INIT_STRUCT_ALLOC(NVAPI_CLOCK_TABLE, table); + memcpy(table->mask, boost->mask, 12); + ret = NvAPI_DLL_GetClockBoostTable(phys[devNum], table); + gpuClocks = 0, memClocks = 0; + for (n=0; n < 12; n++) { + if (table->buf0[n] != 0) applog(LOG_RAW, "boost table 0[%u] not empty (%u)", n, table->buf0[n]); + } + for (n=0; n < 80; n++) { + if (table->gpuDeltas[n].freqDelta) { + // note: gpu delta value seems to be x2, not the memory + //applog(LOG_RAW, " Boost gpu clock delta %u set to %d MHz", n, table->gpuDeltas[n].freqDelta/2000); + gpuClocks++; + } + } + for (n=0; n < 23; n++) { + if (table->memFilled[n]) { + //applog(LOG_RAW, " Boost mem clock delta %u set to %d MHz", n, table->memDeltas[n]/1000); + memClocks++; + } + } + for (n=0; n < 1529; n++) { + if (table->buf1[n] != 0) applog(LOG_RAW, "boost table 1[%u] not empty (%u)", n, table->buf1[n]); + } + applog(LOG_RAW, " Boost table contains %d gpu and %d mem levels.", gpuClocks, memClocks); + free(table); + + NVAPI_VFP_CURVE *curve; + NV_INIT_STRUCT_ALLOC(NVAPI_VFP_CURVE, curve); + memcpy(curve->mask, boost->mask, 12); + ret = NvAPI_DLL_GetVFPCurve(phys[devNum], curve); + gpuClocks = 0, memClocks = 0; + for (n=0; n < 80; n++) { + if (curve->gpuEntries[n].freq_kHz || curve->gpuEntries[n].volt_uV) { + // applog(LOG_RAW, "gpu volt table %2u %4u MHz - %6u mV", n, curve->gpuEntries[n].freq_kHz/1000, curve->gpuEntries[n].volt_uV/1000); + gpuClocks++; + } + } + for (n=0; n < 23; n++) { + if (curve->memEntries[n].freq_kHz || curve->memEntries[n].volt_uV) { + // applog(LOG_RAW, "mem volt table %2u %4u MHz - %6u mV", n, curve->memEntries[n].freq_kHz/1000, curve->memEntries[n].volt_uV/1000); + memClocks++; + } + } + for (n=0; n < 1064; n++) { + if (curve->buf1[n] != 0) applog(LOG_RAW, "volt table buf1[%u] not empty (%u)", n, curve->buf1[n]); + } + applog(LOG_RAW, " Volts table contains %d gpu and %d mem levels.", gpuClocks, memClocks); + free(curve); + } + 
+ // Maxwell + else { + NVAPI_VOLTAGES_TABLE* volts; + NV_INIT_STRUCT_ALLOC(NVAPI_VOLTAGES_TABLE, volts); + int entries = 0; + ret = NvAPI_DLL_GetVoltages(phys[devNum], volts); + for (n=0; n < 128; n++) { + if (volts->entries[n].volt_uV) + entries++; + } + applog(LOG_RAW, " Volts table contains %d gpu levels.", entries); + free(volts); + } + + NV_DISPLAY_DRIVER_MEMORY_INFO* meminfo; + NV_INIT_STRUCT_ON(NV_DISPLAY_DRIVER_MEMORY_INFO, meminfo, mem); + meminfo->version = NV_DISPLAY_DRIVER_MEMORY_INFO_VER; + if ((ret = NvAPI_GPU_GetMemoryInfo(phys[devNum], meminfo)) == NVAPI_OK) { + applog(LOG_RAW, " Memory: %u MB, %.1f used", meminfo->dedicatedVideoMemory/1024, + (double) (meminfo->availableDedicatedVideoMemory - meminfo->curAvailableDedicatedVideoMemory)/1024); + } +#if 0 /* some undetermined stats */ + NVAPI_GPU_PERF_INFO pi = { 0 }; + pi.version = NVAPI_GPU_PERF_INFO_VER; + ret = NvAPI_DLL_PerfPoliciesGetInfo(phys[devNum], &pi); + + NVAPI_GPU_PERF_STATUS ps = { 0 }; + ps.version = NVAPI_GPU_PERF_STATUS_VER; + ret = NvAPI_DLL_PerfPoliciesGetStatus(phys[devNum], &ps); + applog(LOG_BLUE, "%llx %lld. %lld. %llx %llx %llx", ps.timeRef, ps.val1, ps.val2, ps.values[0], ps.values[1], ps.values[2]); +#endif + +#endif + free(mem); + return 0; +} + +// workaround for buggy driver 378.49 +unsigned int nvapi_get_gpu_clock(unsigned int devNum) +{ + NvAPI_Status ret = NVAPI_OK; + unsigned int freq = 0; + NV_GPU_CLOCK_FREQUENCIES *freqs; + NV_INIT_STRUCT_ALLOC(NV_GPU_CLOCK_FREQUENCIES, freqs); + freqs->ClockType = NV_GPU_CLOCK_FREQUENCIES_CURRENT_FREQ; + ret = NvAPI_GPU_GetAllClockFrequencies(phys[devNum], freqs); + if (ret == NVAPI_OK) { + freq = freqs->domain[NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS].frequency / 1000; + } + free(freqs); + return freq; // in MHz +} + +uint8_t nvapi_get_plimit(unsigned int devNum) +{ + NvAPI_Status ret = NVAPI_OK; + NVAPI_GPU_POWER_STATUS pol = { 0 }; + pol.version = NVAPI_GPU_POWER_STATUS_VER; + if ((ret = NvAPI_DLL_ClientPowerPoliciesGetStatus(phys[devNum], &pol)) != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI PowerPoliciesGetStatus: %s", string); + return 0; + } + return (uint8_t) (pol.entries[0].power / 1000); // in percent +} + +int nvapi_set_plimit(unsigned int devNum, uint16_t percent) +{ + NvAPI_Status ret = NVAPI_OK; + uint32_t val = percent * 1000; + + NVAPI_GPU_POWER_INFO nfo = { 0 }; + nfo.version = NVAPI_GPU_POWER_INFO_VER; + ret = NvAPI_DLL_ClientPowerPoliciesGetInfo(phys[devNum], &nfo); + if (ret == NVAPI_OK) { + if (val == 0) + val = nfo.entries[0].def_power; + else if (val < nfo.entries[0].min_power) + val = nfo.entries[0].min_power; + else if (val > nfo.entries[0].max_power) + val = nfo.entries[0].max_power; + } + + NVAPI_GPU_POWER_STATUS pol = { 0 }; + pol.version = NVAPI_GPU_POWER_STATUS_VER; + pol.flags = 1; + pol.entries[0].power = val; + if ((ret = NvAPI_DLL_ClientPowerPoliciesSetStatus(phys[devNum], &pol)) != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI PowerPoliciesSetStatus: %s", string); + return -1; + } + return ret; +} + +int nvapi_set_tlimit(unsigned int devNum, uint8_t limit) +{ + NvAPI_Status ret; + uint32_t val = limit; + + if (devNum >= nvapi_dev_cnt) + return -ENODEV; + + NV_GPU_THERMAL_SETTINGS tset = { 0 }; + NVAPI_GPU_THERMAL_INFO tnfo = { 0 }; + NVAPI_GPU_THERMAL_LIMIT tlim = { 0 }; + tset.version = NV_GPU_THERMAL_SETTINGS_VER; + NvAPI_GPU_GetThermalSettings(phys[devNum], 0, &tset); + 
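+	/* thermal limit values are handled as temperature << 8 (1/256 degC
+	 * steps), hence the >> 8 when printing the allowed range below */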
tnfo.version = NVAPI_GPU_THERMAL_INFO_VER; + NvAPI_DLL_ClientThermalPoliciesGetInfo(phys[devNum], &tnfo); + tlim.version = NVAPI_GPU_THERMAL_LIMIT_VER; + if ((ret = NvAPI_DLL_ClientThermalPoliciesGetLimit(phys[devNum], &tlim)) == NVAPI_OK) { + tlim.entries[0].value = val << 8; + tlim.flags = 1; + ret = NvAPI_DLL_ClientThermalPoliciesSetLimit(phys[devNum], &tlim); + if (ret == NVAPI_OK) { + applog(LOG_INFO, "GPU #%u: thermal limit set to %u, current Tc %d, range [%u-%u]", + devNum, val, tset.sensor[0].currentTemp, + tnfo.entries[0].min_temp >> 8, tnfo.entries[0].max_temp >> 8); + } else { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + applog(LOG_WARNING, "GPU #%u: thermal limit: %s, valid range is [%u-%u]", devNum, string, + tnfo.entries[0].min_temp >> 8, tnfo.entries[0].max_temp >> 8); + } + } + return (int) ret; +} + +int nvapi_set_gpuclock(unsigned int devNum, uint32_t clock) +{ + NvAPI_Status ret; + NvS32 delta = 0; + + if (devNum >= nvapi_dev_cnt) + return -ENODEV; +#if 0 + // wrong api to get default base clock when modified, cuda props seems fine + NV_GPU_CLOCK_FREQUENCIES freqs = { 0 }; + freqs.version = NV_GPU_CLOCK_FREQUENCIES_VER; + freqs.ClockType = NV_GPU_CLOCK_FREQUENCIES_BASE_CLOCK; + ret = NvAPI_GPU_GetAllClockFrequencies(phys[devNum], &freqs); + if (ret == NVAPI_OK) { + delta = (clock * 1000) - freqs.domain[NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS].frequency; + } + + NV_GPU_PERF_PSTATES_INFO deffreqs = { 0 }; + deffreqs.version = NV_GPU_PERF_PSTATES_INFO_VER; + ret = NvAPI_GPU_GetPstatesInfoEx(phys[devNum], &deffreqs, 0); // we want default clock grr! + if (ret == NVAPI_OK) { + if (deffreqs.pstates[0].clocks[1].domainId == NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS) + delta = (clock * 1000) - deffreqs.pstates[0].clocks[1].freq*2; + } +#endif + + cudaDeviceProp props = { 0 }; + NvU32 busId = 0xFFFF; + ret = NvAPI_GPU_GetBusId(phys[devNum], &busId); + for (int d=0; d < (int) nvapi_dev_cnt; d++) { + // unsure about devNum, so be safe + cudaGetDeviceProperties(&props, d); + if (props.pciBusID == busId) { + delta = (clock * 1000) - props.clockRate; + break; + } + } + + if (delta == (clock * 1000)) + return ret; + + NV_GPU_PERF_PSTATES20_INFO_V1 pset1 = { 0 }; + pset1.version = NV_GPU_PERF_PSTATES20_INFO_VER1; + pset1.numPstates = 1; + pset1.numClocks = 1; + // Ok on both 1080 and 970 + pset1.pstates[0].clocks[0].domainId = NVAPI_GPU_PUBLIC_CLOCK_GRAPHICS; + pset1.pstates[0].clocks[0].freqDelta_kHz.value = delta; + ret = NvAPI_DLL_SetPstates20v1(phys[devNum], &pset1); + if (ret == NVAPI_OK) { + applog(LOG_INFO, "GPU #%u: boost gpu clock set to %u (delta %d)", devNum, clock, delta/1000); + } + return ret; +} + +int nvapi_set_memclock(unsigned int devNum, uint32_t clock) +{ + NvAPI_Status ret; + NvS32 delta = 0; + + if (devNum >= nvapi_dev_cnt) + return -ENODEV; + + // wrong to get default base clock (when modified) on maxwell (same as cuda props one) + NV_GPU_CLOCK_FREQUENCIES freqs = { 0 }; + freqs.version = NV_GPU_CLOCK_FREQUENCIES_VER; + freqs.ClockType = NV_GPU_CLOCK_FREQUENCIES_BASE_CLOCK; + ret = NvAPI_GPU_GetAllClockFrequencies(phys[devNum], &freqs); // wrong base clocks, useless + if (ret == NVAPI_OK) { + delta = (clock * 1000) - freqs.domain[NVAPI_GPU_PUBLIC_CLOCK_MEMORY].frequency; + } + + // seems ok on maxwell and pascal for the mem clocks + NV_GPU_PERF_PSTATES_INFO deffreqs = { 0 }; + deffreqs.version = NV_GPU_PERF_PSTATES_INFO_VER; + ret = NvAPI_GPU_GetPstatesInfoEx(phys[devNum], &deffreqs, 0x1); // deprecated but req for def clocks + if (ret == NVAPI_OK) { + if 
(deffreqs.pstates[0].clocks[0].domainId == NVAPI_GPU_PUBLIC_CLOCK_MEMORY) + delta = (clock * 1000) - deffreqs.pstates[0].clocks[0].freq; + } + + if (delta == (clock * 1000)) + return ret; + + // todo: bounds check with GetPstates20 + + NV_GPU_PERF_PSTATES20_INFO_V1 pset1 = { 0 }; + pset1.version = NV_GPU_PERF_PSTATES20_INFO_VER1; + pset1.numPstates = 1; + pset1.numClocks = 1; + pset1.pstates[0].clocks[0].domainId = NVAPI_GPU_PUBLIC_CLOCK_MEMORY; + pset1.pstates[0].clocks[0].freqDelta_kHz.value = delta; + ret = NvAPI_DLL_SetPstates20v1(phys[devNum], &pset1); + if (ret == NVAPI_OK) { + applog(LOG_INFO, "GPU #%u: Boost mem clock set to %u (delta %d)", devNum, clock, delta/1000); + } + return ret; +} + +static int nvapi_set_memoffset(unsigned int devNum, int32_t delta, bool log=true) +{ + NvAPI_Status ret; + NvS32 deltaKHz = delta * 1000; + + if (devNum >= nvapi_dev_cnt) + return -ENODEV; + + // todo: bounds check with GetPstates20 + + NV_GPU_PERF_PSTATES20_INFO_V1 pset1 = { 0 }; + pset1.version = NV_GPU_PERF_PSTATES20_INFO_VER1; + pset1.numPstates = 1; + pset1.numClocks = 1; + pset1.pstates[0].clocks[0].domainId = NVAPI_GPU_PUBLIC_CLOCK_MEMORY; + pset1.pstates[0].clocks[0].freqDelta_kHz.value = deltaKHz; + ret = NvAPI_DLL_SetPstates20v1(phys[devNum], &pset1); + if (ret == NVAPI_OK) { + if (log) applog(LOG_INFO, "GPU #%u: Memory clock offset set to %+d MHz", devNum, deltaKHz / 1000); + need_memclockrst = true; + } + return ret; +} + +// Replacement for WIN32 CUDA 6.5 on pascal +int nvapiMemGetInfo(int dev_id, uint64_t *free, uint64_t *total) +{ + NvAPI_Status ret = NVAPI_OK; + NV_DISPLAY_DRIVER_MEMORY_INFO mem = { 0 }; + mem.version = NV_DISPLAY_DRIVER_MEMORY_INFO_VER; + unsigned int devNum = nvapi_dev_map[dev_id % MAX_GPUS]; + if ((ret = NvAPI_GPU_GetMemoryInfo(phys[devNum], &mem)) == NVAPI_OK) { + *total = (uint64_t) mem.dedicatedVideoMemory;// mem.availableDedicatedVideoMemory; + *free = (uint64_t) mem.curAvailableDedicatedVideoMemory; + } + return (int) ret; +} + +int nvapi_init() +{ + int num_gpus = cuda_num_devices(); + NvAPI_Status ret = NvAPI_Initialize(); + if (ret != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI NvAPI_Initialize: %s", string); + return -1; + } + + ret = NvAPI_EnumPhysicalGPUs(phys, &nvapi_dev_cnt); + if (ret != NVAPI_OK) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + if (opt_debug) + applog(LOG_DEBUG, "NVAPI NvAPI_EnumPhysicalGPUs: %s", string); + return -1; + } + + for (int g = 0; g < num_gpus; g++) { + cudaDeviceProp props; + if (cudaGetDeviceProperties(&props, g) == cudaSuccess) { + device_bus_ids[g] = props.pciBusID; + } + nvapi_dev_map[g] = g; // default mapping + } + + for (NvU8 i = 0; i < nvapi_dev_cnt; i++) { + NvAPI_ShortString name; + ret = NvAPI_GPU_GetFullName(phys[i], name); + if (ret == NVAPI_OK) { + for (int g = 0; g < num_gpus; g++) { + NvU32 busId; + ret = NvAPI_GPU_GetBusId(phys[i], &busId); + if (ret == NVAPI_OK && busId == device_bus_ids[g]) { + nvapi_dev_map[g] = i; + if (opt_debug) + applog(LOG_DEBUG, "CUDA GPU %d matches NVAPI GPU %d by busId %u", + g, i, busId); + break; + } + } + } else { + NvAPI_ShortString string; + NvAPI_GetErrorMessage(ret, string); + applog(LOG_DEBUG, "NVAPI NvAPI_GPU_GetFullName: %s", string); + } + } +#if 0 + if (opt_debug) { + NvAPI_ShortString ver; + NvAPI_GetInterfaceVersionString(ver); + applog(LOG_DEBUG, "%s", ver); + } +#endif + + NvU32 udv; + NvAPI_ShortString str; + ret = NvAPI_SYS_GetDriverAndBranchVersion(&udv, 
str); + if (ret == NVAPI_OK) { + sprintf(driver_version,"%d.%02d", udv / 100, udv % 100); + } + + return 0; +} + +int nvapi_init_settings() +{ + // nvapi.dll + int ret = nvapi_dll_init(); + if (ret != NVAPI_OK) + return ret; + + if (!opt_n_threads) { + opt_n_threads = active_gpus; + } + + for (int n=0; n < opt_n_threads; n++) { + int dev_id = device_map[n % MAX_GPUS]; + if (device_plimit[dev_id] && !nvml_plimit_set) { + if (nvapi_set_plimit(nvapi_dev_map[dev_id], device_plimit[dev_id]) == NVAPI_OK) { + uint32_t res = nvapi_get_plimit(nvapi_dev_map[dev_id]); + gpulog(LOG_INFO, n, "Power limit is set to %u%%", res); + } + } + if (device_tlimit[dev_id]) { + nvapi_set_tlimit(nvapi_dev_map[dev_id], device_tlimit[dev_id]); + } + if (device_gpu_clocks[dev_id]) { + ret = nvapi_set_gpuclock(nvapi_dev_map[dev_id], device_gpu_clocks[dev_id]); + if (ret) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage((NvAPI_Status) ret, string); + gpulog(LOG_WARNING, n, "nvapi_set_gpuclock %s", string); + } + } + if (device_mem_offsets[dev_id]) { + ret = nvapi_set_memoffset(nvapi_dev_map[dev_id], device_mem_offsets[dev_id]); + if (ret) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage((NvAPI_Status)ret, string); + gpulog(LOG_WARNING, n, "nvapi_set_memoffset %s", string); + } + } + else if (device_mem_clocks[dev_id]) { + ret = nvapi_set_memclock(nvapi_dev_map[dev_id], device_mem_clocks[dev_id]); + if (ret) { + NvAPI_ShortString string; + NvAPI_GetErrorMessage((NvAPI_Status) ret, string); + gpulog(LOG_WARNING, n, "nvapi_set_memclock %s", string); + } + } + if (device_pstate[dev_id]) { + // dunno how via nvapi or/and pascal + } + if (device_led[dev_id] != -1) { + int err = nvapi_set_led(nvapi_dev_map[dev_id], device_led[dev_id], device_name[dev_id]); + if (err != 0) { + gpulog(LOG_WARNING, n, "Unable to set led value (err %d)", err); + } + device_led_state[dev_id] = device_led[dev_id]; + } + } + + return ret; +} + +void nvapi_toggle_clocks(int thr_id, bool enable) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + if (device_mem_offsets[dev_id]) { + nvapi_set_memoffset(nvapi_dev_map[dev_id], enable ? 
device_mem_offsets[dev_id] : 0, false); + } +} + +unsigned int nvapi_devnum(int dev_id) +{ + return nvapi_dev_map[dev_id]; +} + +int nvapi_devid(unsigned int devNum) +{ + for (int i=0; i < opt_n_threads; i++) { + int dev_id = device_map[i % MAX_GPUS]; + if (nvapi_dev_map[dev_id] = devNum) + return dev_id; + } + return 0; +} + +#endif /* WIN32 : Windows specific (nvapi) */ + +/* api functions -------------------------------------- */ + +// assume 2500 rpm as default, auto-updated if more +static unsigned int fan_speed_max = 2500; + +unsigned int gpu_fanpercent(struct cgpu_info *gpu) +{ + unsigned int pct = 0; + if (hnvml) { + nvml_get_fanpcnt(hnvml, gpu->gpu_id, &pct); + } +#ifdef WIN32 + else { + unsigned int rpm = 0; + nvapi_fanspeed(nvapi_dev_map[gpu->gpu_id], &rpm); + pct = (rpm * 100) / fan_speed_max; + if (pct > 100) { + pct = 100; + fan_speed_max = rpm; + } + } +#endif + return pct; +} + +unsigned int gpu_fanrpm(struct cgpu_info *gpu) +{ + unsigned int rpm = 0; +#ifdef WIN32 + nvapi_fanspeed(nvapi_dev_map[gpu->gpu_id], &rpm); +#endif + return rpm; +} + + +float gpu_temp(struct cgpu_info *gpu) +{ + float tc = 0.0; + unsigned int tmp = 0; + if (hnvml) { + nvml_get_tempC(hnvml, gpu->gpu_id, &tmp); + tc = (float)tmp; + } +#ifdef WIN32 + else { + nvapi_temperature(nvapi_dev_map[gpu->gpu_id], &tmp); + tc = (float)tmp; + } +#endif + return tc; +} + +int gpu_pstate(struct cgpu_info *gpu) +{ + int pstate = -1; + int support = -1; + if (hnvml) { + support = nvml_get_pstate(hnvml, gpu->gpu_id, &pstate); + } +#ifdef WIN32 + if (support == -1) { + unsigned int pst = 0; + nvapi_getpstate(nvapi_dev_map[gpu->gpu_id], &pst); + pstate = (int) pst; + } +#endif + return pstate; +} + +int gpu_busid(struct cgpu_info *gpu) +{ + int busid = -1; + int support = -1; + if (hnvml) { + support = nvml_get_busid(hnvml, gpu->gpu_id, &busid); + } +#ifdef WIN32 + if (support == -1) { + busid = device_bus_ids[gpu->gpu_id]; + } +#endif + return busid; +} + +unsigned int gpu_power(struct cgpu_info *gpu) +{ + unsigned int mw = 0; + int support = -1; + if (hnvml) { + support = nvml_get_power_usage(hnvml, gpu->gpu_id, &mw); + } +#ifdef WIN32 + if (support == -1) { + unsigned int pct = 0; + nvapi_getusage(nvapi_dev_map[gpu->gpu_id], &pct); + pct *= nvapi_get_plimit(nvapi_dev_map[gpu->gpu_id]); + pct /= 100; + mw = pct; // to fix + } +#endif + if (gpu->gpu_power > 0) { + // average + mw = (gpu->gpu_power + mw) / 2; + } + return mw; +} + +unsigned int gpu_plimit(struct cgpu_info *gpu) +{ + unsigned int mw = 0; + int support = -1; + if (hnvml) { + mw = nvml_get_plimit(hnvml, gpu->gpu_id); + support = (mw > 0); + } +#ifdef WIN32 + // NVAPI value is in % (< 100 so) + if (support == -1) { + mw = nvapi_get_plimit(nvapi_dev_map[gpu->gpu_id]); + } +#endif + return mw; +} + +static int translate_vendor_id(uint16_t vid, char *vendorname) +{ + struct VENDORS { + const uint16_t vid; + const char *name; + } vendors[] = { + { 0x1043, "ASUS" }, + { 0x1048, "Elsa" }, + { 0x107D, "Leadtek" }, + { 0x10B0, "Gainward" }, + // { 0x10DE, "NVIDIA" }, + { 0x1458, "Gigabyte" }, + { 0x1462, "MSI" }, + { 0x154B, "PNY" }, // maybe storage devices + { 0x1569, "Palit" }, + { 0x1682, "XFX" }, + { 0x196D, "Club3D" }, + { 0x196E, "PNY" }, + { 0x19DA, "Zotac" }, + { 0x19F1, "BFG" }, + { 0x1ACC, "PoV" }, + { 0x1B4C, "Galax" }, // KFA2 in EU, to check on Pascal cards + { 0x3842, "EVGA" }, + { 0x7377, "Colorful" }, + { 0, "" } + }; + + if (!vendorname) + return -EINVAL; + + for(int v=0; v < ARRAY_SIZE(vendors); v++) { + if (vid == vendors[v].vid) { + 
strcpy(vendorname, vendors[v].name); + return vid; + } + } + if (opt_debug && vid != 0x10DE) + applog(LOG_DEBUG, "nvml: Unknown vendor %04x\n", vid); + return 0; +} + +int gpu_vendor(uint8_t pci_bus_id, char *vendorname) +{ + uint16_t vid = 0, pid = 0; + if (hnvml) { // may not be initialized on start... + for (int id=0; id < hnvml->nvml_gpucount; id++) { + if (hnvml->nvml_pci_bus_id[id] == pci_bus_id) { + int dev_id = hnvml->nvml_cuda_device_id[id]; + nvml_get_info(hnvml, dev_id, vid, pid); + } + } + } else { +#ifdef WIN32 + for (unsigned id = 0; id < nvapi_dev_cnt; id++) { + if (device_bus_ids[id] == pci_bus_id) { + nvapi_getinfo(nvapi_dev_map[id], vid, pid); + break; + } + } +#endif + } + return translate_vendor_id(vid, vendorname); +} + +int gpu_info(struct cgpu_info *gpu) +{ + char vendorname[32] = { 0 }; + int id = gpu->gpu_id; + uint8_t bus_id = 0; + + gpu->nvml_id = -1; + gpu->nvapi_id = -1; + + if (id < 0) + return -1; + + if (hnvml) { + gpu->nvml_id = (int8_t) hnvml->cuda_nvml_device_id[id]; + nvml_get_info(hnvml, id, gpu->gpu_vid, gpu->gpu_pid); + nvml_get_serial(hnvml, id, gpu->gpu_sn, sizeof(gpu->gpu_sn)); + nvml_get_bios(hnvml, id, gpu->gpu_desc, sizeof(gpu->gpu_desc)); + } +#ifdef WIN32 + gpu->nvapi_id = (int8_t) nvapi_dev_map[id]; + nvapi_getinfo(nvapi_dev_map[id], gpu->gpu_vid, gpu->gpu_pid); + nvapi_getserial(nvapi_dev_map[id], gpu->gpu_sn, sizeof(gpu->gpu_sn)); + nvapi_getbios(nvapi_dev_map[id], gpu->gpu_desc, sizeof(gpu->gpu_desc)); +#endif + return 0; +} + +#endif /* USE_WRAPNVML */ + +static int rgb_percent(int RGB, int percent) +{ + uint8_t* comp = (uint8_t*) &RGB; + int res = ((percent*comp[2]) / 100) << 16; + res += ((percent*comp[1]) / 100) << 8; + return res + ((percent*comp[0]) / 100); +} + +void gpu_led_on(int dev_id) +{ +#if defined(WIN32) && defined(USE_WRAPNVML) + int value = device_led[dev_id]; + if (device_led_state[dev_id] != value) { + if (nvapi_set_led(nvapi_dev_map[dev_id], value, device_name[dev_id]) == 0) + device_led_state[dev_id] = value; + } +#endif +} + +void gpu_led_percent(int dev_id, int percent) +{ +#if defined(WIN32) && defined(USE_WRAPNVML) + int value = rgb_percent(device_led[dev_id], percent); + if (device_led_state[dev_id] != value) { + if (nvapi_set_led(nvapi_dev_map[dev_id], value, device_name[dev_id]) == 0) + device_led_state[dev_id] = value; + } +#endif +} + +void gpu_led_off(int dev_id) +{ +#if defined(WIN32) && defined(USE_WRAPNVML) + if (device_led_state[dev_id]) { + if (nvapi_set_led(nvapi_dev_map[dev_id], 0, device_name[dev_id]) == 0) + device_led_state[dev_id] = 0; + } +#endif +} + +#ifdef USE_WRAPNVML +extern double thr_hashrates[MAX_GPUS]; +extern bool opt_debug_threads; +extern bool opt_hwmonitor; +extern int num_cpus; + +void *monitor_thread(void *userdata) +{ + int thr_id = -1; + + while (!abort_flag && !opt_quiet) + { + // This thread monitors card's power lazily during scans, one at a time... 
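+		/* wait for the worker to signal a scan (sampling_signal), then sample
+		 * clocks, temperature, fan and power every 50 ms while sampling_flag
+		 * stays set, and publish the averages in cgpu->monitor */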
+ thr_id = (thr_id + 1) % opt_n_threads; + struct cgpu_info *cgpu = &thr_info[thr_id].gpu; + int dev_id = cgpu->gpu_id; cudaSetDevice(dev_id); + + if (hnvml != NULL && cgpu) + { + char khw[32] = { 0 }; + uint64_t clock = 0, mem_clock = 0; + uint32_t fanpercent = 0, power = 0; + double tempC = 0, khs_per_watt = 0; + uint32_t counter = 0; + int max_loops = 1000; + + pthread_cond_wait(&cgpu->monitor.sampling_signal, &cgpu->monitor.lock); + + do { + unsigned int tmp_clock=0, tmp_memclock=0; + nvml_get_current_clocks(dev_id, &tmp_clock, &tmp_memclock); +#ifdef WIN32 + if (tmp_clock < 200) { + // workaround for buggy drivers 378.x (real clock) + tmp_clock = nvapi_get_gpu_clock(nvapi_dev_map[dev_id]); + } +#endif + if (tmp_clock < 200) { + // some older cards only report a base clock with cuda props. + if (cuda_gpu_info(cgpu) == 0) { + tmp_clock = cgpu->gpu_clock/1000; + tmp_memclock = cgpu->gpu_memclock/1000; + } + } + clock += tmp_clock; + mem_clock += tmp_memclock; + tempC += gpu_temp(cgpu); + fanpercent += gpu_fanpercent(cgpu); + power += gpu_power(cgpu); + counter++; + + usleep(50000); + if (abort_flag) goto abort; + + } while (cgpu->monitor.sampling_flag && (--max_loops)); + + cgpu->monitor.gpu_temp = (uint32_t) (tempC/counter); + cgpu->monitor.gpu_fan = fanpercent/counter; + cgpu->monitor.gpu_power = power/counter; + cgpu->monitor.gpu_clock = (uint32_t) (clock/counter); + cgpu->monitor.gpu_memclock = (uint32_t) (mem_clock/counter); + + if (power) { + khs_per_watt = stats_get_speed(thr_id, thr_hashrates[thr_id]); + khs_per_watt = khs_per_watt / ((double)power / counter); + format_hashrate(khs_per_watt * 1000, khw); + if (strlen(khw)) + sprintf(&khw[strlen(khw)-1], "W %uW ", cgpu->monitor.gpu_power / 1000); + } + + if (opt_hwmonitor && (time(NULL) - cgpu->monitor.tm_displayed) > 60) { + gpulog(LOG_INFO, thr_id, "%u MHz %s%uC FAN %u%%", + cgpu->monitor.gpu_clock/*, cgpu->monitor.gpu_memclock*/, + khw, cgpu->monitor.gpu_temp, cgpu->monitor.gpu_fan + ); + cgpu->monitor.tm_displayed = (uint32_t)time(NULL); + } + + pthread_mutex_unlock(&cgpu->monitor.lock); + } + usleep(500); // safety + } +abort: + if (opt_debug_threads) + applog(LOG_DEBUG, "%s() died", __func__); + return NULL; +} +#endif diff --git a/nvml.h b/nvml.h new file mode 100644 index 0000000000..9145daa46d --- /dev/null +++ b/nvml.h @@ -0,0 +1,266 @@ +/* + * A trivial little dlopen()-based wrapper library for the + * NVIDIA NVML library, to allow runtime discovery of NVML on an + * arbitrary system. This is all very hackish and simple-minded, but + * it serves my immediate needs in the short term until NVIDIA provides + * a static NVML wrapper library themselves, hopefully in + * CUDA 6.5 or maybe sometime shortly after. + * + * This trivial code is made available under the "new" 3-clause BSD license, + * and/or any of the GPL licenses you prefer. + * Feel free to use the code and modify as you see fit. + * + * John E. 
Stone - john.stone@gmail.com + * + */ +#ifdef USE_WRAPNVML + +#include "miner.h" + +void *monitor_thread(void *userdata); + +typedef void * nvmlDevice_t; + +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 + +/* our own version of the PCI info struct */ +typedef struct { + char bus_id_str[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + unsigned int domain; + unsigned int bus; + unsigned int device; + unsigned int pci_device_id; /* combined device and vendor id */ + unsigned int pci_subsystem_id; + unsigned int res0; /* NVML internal use only */ + unsigned int res1; + unsigned int res2; + unsigned int res3; +} nvmlPciInfo_t; + +enum nvmlEnableState_t { + NVML_FEATURE_DISABLED = 0, + NVML_FEATURE_ENABLED = 1, + NVML_FEATURE_UNKNOWN = 2 +}; + +enum nvmlRestrictedAPI_t { + NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0, + NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1, // not for GTX cards + NVML_RESTRICTED_API_COUNT = 2 +}; + +enum nvmlReturn_t { + NVML_SUCCESS = 0, + NVML_ERROR_UNINITIALIZED = 1, + NVML_ERROR_INVALID_ARGUMENT = 2, + NVML_ERROR_NOT_SUPPORTED = 3, + NVML_ERROR_NO_PERMISSION = 4, + NVML_ERROR_ALREADY_INITIALIZED = 5, + NVML_ERROR_NOT_FOUND = 6, + NVML_ERROR_INSUFFICIENT_SIZE = 7, + NVML_ERROR_INSUFFICIENT_POWER = 8, + NVML_ERROR_DRIVER_NOT_LOADED = 9, + NVML_ERROR_TIMEOUT = 10, + NVML_ERROR_IRQ_ISSUE = 11, + NVML_ERROR_LIBRARY_NOT_FOUND = 12, + NVML_ERROR_FUNCTION_NOT_FOUND = 13, + NVML_ERROR_CORRUPTED_INFOROM = 14, + NVML_ERROR_GPU_IS_LOST = 15, + NVML_ERROR_RESET_REQUIRED = 16, + NVML_ERROR_OPERATING_SYSTEM = 17, + NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, + NVML_ERROR_IN_USE = 19, + NVML_ERROR_UNKNOWN = 999 +}; + +enum nvmlClockType_t { + NVML_CLOCK_GRAPHICS = 0, + NVML_CLOCK_SM = 1, + NVML_CLOCK_MEM = 2, + NVML_CLOCK_VIDEO = 3, + NVML_CLOCK_COUNT +}; + +enum nvmlClockId_t { + NVML_CLOCK_ID_CURRENT = 0, + NVML_CLOCK_ID_APP_CLOCK_TARGET = 1, + NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2, + NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3, + NVML_CLOCK_ID_COUNT +}; + +enum nvmlPcieUtilCounter_t { + NVML_PCIE_UTIL_TX_BYTES = 0, + NVML_PCIE_UTIL_RX_BYTES = 1, + NVML_PCIE_UTIL_COUNT +}; + +enum nvmlValueType_t { + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_COUNT +}; + +typedef int nvmlGpuTopologyLevel_t; +typedef int nvmlNvLinkCapability_t; +typedef int nvmlNvLinkErrorCounter_t; +typedef int nvmlNvLinkUtilizationControl_t; + +#define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 +#define NVML_DEVICE_UUID_BUFFER_SIZE 80 +#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 + +/* + * Handle to hold the function pointers for the entry points we need, + * and the shared library itself. 
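+ *
+ * The handle is allocated and filled by nvml_create() (declared below) and
+ * released with nvml_destroy(). Illustrative use (sketch only; assumes at
+ * least one NVML device and omits error checks):
+ *
+ *   nvml_handle *h = nvml_create();
+ *   if (h) {
+ *       unsigned int mhz = 0;
+ *       if (h->nvml_gpucount > 0)
+ *           h->nvmlDeviceGetClockInfo(h->devs[0], NVML_CLOCK_SM, &mhz);
+ *       nvml_destroy(h);
+ *   }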
+ */ +typedef struct { + void *nvml_dll; + int nvml_gpucount; + int cuda_gpucount; + unsigned int *nvml_pci_domain_id; + unsigned int *nvml_pci_bus_id; + unsigned int *nvml_pci_device_id; + unsigned int *nvml_pci_vendor_id; + unsigned int *nvml_pci_subsys_id; + int *nvml_cuda_device_id; /* map NVML dev to CUDA dev */ + int *cuda_nvml_device_id; /* map CUDA dev to NVML dev */ + nvmlDevice_t *devs; + nvmlEnableState_t *app_clocks; + nvmlReturn_t (*nvmlInit)(void); + nvmlReturn_t (*nvmlDeviceGetCount)(int *); + nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(int, nvmlDevice_t *); + nvmlReturn_t (*nvmlDeviceGetAPIRestriction)(nvmlDevice_t, nvmlRestrictedAPI_t, nvmlEnableState_t *); + nvmlReturn_t (*nvmlDeviceSetAPIRestriction)(nvmlDevice_t, nvmlRestrictedAPI_t, nvmlEnableState_t); + nvmlReturn_t (*nvmlDeviceGetDefaultApplicationsClock)(nvmlDevice_t, nvmlClockType_t, unsigned int *); + nvmlReturn_t (*nvmlDeviceGetApplicationsClock)(nvmlDevice_t, nvmlClockType_t, unsigned int *); + nvmlReturn_t (*nvmlDeviceSetApplicationsClocks)(nvmlDevice_t, unsigned int, unsigned int); + nvmlReturn_t (*nvmlDeviceResetApplicationsClocks)(nvmlDevice_t); + nvmlReturn_t (*nvmlDeviceGetSupportedGraphicsClocks)(nvmlDevice_t, uint32_t mem, uint32_t *num, uint32_t *arr); + nvmlReturn_t (*nvmlDeviceGetSupportedMemoryClocks)(nvmlDevice_t, unsigned int *count, unsigned int *clocksMHz); + nvmlReturn_t (*nvmlDeviceGetClockInfo)(nvmlDevice_t, nvmlClockType_t, unsigned int *); + nvmlReturn_t (*nvmlDeviceGetMaxClockInfo)(nvmlDevice_t, nvmlClockType_t, unsigned int *); + nvmlReturn_t (*nvmlDeviceGetPowerManagementDefaultLimit)(nvmlDevice_t, unsigned int *limit); + nvmlReturn_t (*nvmlDeviceGetPowerManagementLimit)(nvmlDevice_t, unsigned int *limit); + nvmlReturn_t (*nvmlDeviceGetPowerManagementLimitConstraints)(nvmlDevice_t, unsigned int *min, unsigned int *max); + nvmlReturn_t (*nvmlDeviceSetPowerManagementLimit)(nvmlDevice_t device, unsigned int limit); + nvmlReturn_t (*nvmlDeviceGetPciInfo)(nvmlDevice_t, nvmlPciInfo_t *); + nvmlReturn_t (*nvmlDeviceGetCurrPcieLinkGeneration)(nvmlDevice_t device, unsigned int *gen); + nvmlReturn_t (*nvmlDeviceGetCurrPcieLinkWidth)(nvmlDevice_t device, unsigned int *width); + nvmlReturn_t (*nvmlDeviceGetMaxPcieLinkGeneration)(nvmlDevice_t device, unsigned int *gen); + nvmlReturn_t (*nvmlDeviceGetMaxPcieLinkWidth)(nvmlDevice_t device, unsigned int *width); + nvmlReturn_t (*nvmlDeviceGetName)(nvmlDevice_t, char *, int); + nvmlReturn_t (*nvmlDeviceGetTemperature)(nvmlDevice_t, int, unsigned int *); + nvmlReturn_t (*nvmlDeviceGetFanSpeed)(nvmlDevice_t, unsigned int *); + nvmlReturn_t (*nvmlDeviceGetPerformanceState)(nvmlDevice_t, int *); /* enum */ + nvmlReturn_t (*nvmlDeviceGetPowerUsage)(nvmlDevice_t, unsigned int *); + nvmlReturn_t (*nvmlDeviceGetSerial)(nvmlDevice_t, char *serial, unsigned int len); + nvmlReturn_t (*nvmlDeviceGetUUID)(nvmlDevice_t, char *uuid, unsigned int len); + nvmlReturn_t (*nvmlDeviceGetVbiosVersion)(nvmlDevice_t, char *version, unsigned int len); + nvmlReturn_t (*nvmlSystemGetDriverVersion)(char *version, unsigned int len); + char* (*nvmlErrorString)(nvmlReturn_t); + nvmlReturn_t (*nvmlShutdown)(void); + // v331 + nvmlReturn_t (*nvmlDeviceGetEnforcedPowerLimit)(nvmlDevice_t, unsigned int *limit); + // v340 +#ifdef __linux__ + nvmlReturn_t (*nvmlDeviceClearCpuAffinity)(nvmlDevice_t); + nvmlReturn_t (*nvmlDeviceGetCpuAffinity)(nvmlDevice_t, unsigned int cpuSetSize, unsigned long* cpuSet); + nvmlReturn_t (*nvmlDeviceSetCpuAffinity)(nvmlDevice_t); +#endif + // v346 + nvmlReturn_t 
(*nvmlDeviceGetPcieThroughput)(nvmlDevice_t, nvmlPcieUtilCounter_t, unsigned int *value); + // v36x (API 8) + nvmlReturn_t (*nvmlDeviceGetClock)(nvmlDevice_t, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz); +#ifdef __linux__ + nvmlReturn_t (*nvmlSystemGetTopologyGpuSet)(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); + nvmlReturn_t (*nvmlDeviceGetTopologyNearestGpus)(nvmlDevice_t, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); + nvmlReturn_t (*nvmlDeviceGetTopologyCommonAncestor)(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); +#endif + nvmlReturn_t (*nvmlDeviceGetNvLinkState)(nvmlDevice_t, unsigned int link, nvmlEnableState_t *isActive); + nvmlReturn_t (*nvmlDeviceGetNvLinkVersion)(nvmlDevice_t, unsigned int link, unsigned int *version); + nvmlReturn_t (*nvmlDeviceGetNvLinkCapability)(nvmlDevice_t, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); + nvmlReturn_t (*nvmlDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t, unsigned int link, nvmlPciInfo_t *pci); + nvmlReturn_t (*nvmlDeviceGetNvLinkErrorCounter)(nvmlDevice_t, unsigned int link, nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); + nvmlReturn_t (*nvmlDeviceResetNvLinkErrorCounters)(nvmlDevice_t, unsigned int link); + nvmlReturn_t (*nvmlDeviceSetNvLinkUtilizationControl)(nvmlDevice_t, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t *control, unsigned int reset); + nvmlReturn_t (*nvmlDeviceGetNvLinkUtilizationControl)(nvmlDevice_t, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t *control); + nvmlReturn_t (*nvmlDeviceGetNvLinkUtilizationCounter)(nvmlDevice_t, unsigned int link, unsigned int counter, unsigned long long *rxcounter, unsigned long long *txcounter); + nvmlReturn_t (*nvmlDeviceFreezeNvLinkUtilizationCounter)(nvmlDevice_t, unsigned int link, unsigned int counter, nvmlEnableState_t freeze); + nvmlReturn_t (*nvmlDeviceResetNvLinkUtilizationCounter)(nvmlDevice_t, unsigned int link, unsigned int counter); + +} nvml_handle; + +nvml_handle * nvml_create(); +int nvml_destroy(nvml_handle *nvmlh); + +// Debug informations +void nvml_print_device_info(int dev_id); + +// Query the number of GPUs seen by NVML +int nvml_get_gpucount(nvml_handle *nvmlh, int *gpucount); + +int nvml_set_plimit(nvml_handle *nvmlh, int dev_id); +int nvml_set_pstate(nvml_handle *nvmlh, int dev_id); + +int nvml_set_clocks(nvml_handle *nvmlh, int dev_id); +int nvml_reset_clocks(nvml_handle *nvmlh, int dev_id); + +/* api functions */ + +unsigned int gpu_fanpercent(struct cgpu_info *gpu); +unsigned int gpu_fanrpm(struct cgpu_info *gpu); +float gpu_temp(struct cgpu_info *gpu); +unsigned int gpu_power(struct cgpu_info *gpu); +unsigned int gpu_plimit(struct cgpu_info *gpu); +int gpu_pstate(struct cgpu_info *gpu); +int gpu_busid(struct cgpu_info *gpu); + +// pid/vid, sn and bios rev +int gpu_info(struct cgpu_info *gpu); + +int gpu_vendor(uint8_t pci_bus_id, char *vendorname); + +/* nvapi functions */ +#ifdef WIN32 +int nvapi_init(); +int nvapi_init_settings(); + +// to debug nvapi.. 
+int nvapi_pstateinfo(unsigned int devNum); +uint8_t nvapi_get_plimit(unsigned int devNum); + +// nvapi devNum from dev_id (cuda GPU #N) +unsigned int nvapi_devnum(int dev_id); +int nvapi_devid(unsigned int devNum); + +void nvapi_toggle_clocks(int thr_id, bool enable); + +// cuda Replacement for 6.5 compat +int nvapiMemGetInfo(int dev_id, uint64_t *free, uint64_t *total); +#endif + +#endif /* USE_WRAPNVML */ + +void gpu_led_on(int dev_id); +void gpu_led_percent(int dev_id, int percent); +void gpu_led_off(int dev_id); + +#define LED_MODE_OFF 0 +#define LED_MODE_SHARES 1 +#define LED_MODE_MINING 2 + +/* ------ nvidia-settings stuff for linux -------------------- */ + +int nvs_init(); +int nvs_set_clocks(int dev_id); +void nvs_reset_clocks(int dev_id); + +// nvidia-settings (X) devNum from dev_id (cuda GPU #N) +int8_t nvs_devnum(int dev_id); +int nvs_devid(int8_t devNum); + +extern bool need_nvsettings; + diff --git a/nvsettings.cpp b/nvsettings.cpp new file mode 100644 index 0000000000..5ea323381d --- /dev/null +++ b/nvsettings.cpp @@ -0,0 +1,251 @@ +/** + * nvidia-settings command line interface for linux - tpruvot 2017 + * + * Notes: need X setup and running, with an opened X session. + * init speed could be improved, running multiple threads + */ + +#include +#include +#include +#include +#include +#include +#include // pid_t + +#include "miner.h" +#include "nvml.h" +#include "cuda_runtime.h" + +#ifdef __linux__ + +#define NVS_PATH "/usr/bin/nvidia-settings" + +static int8_t nvs_dev_map[MAX_GPUS] = { 0 }; +static uint8_t nvs_bus_ids[MAX_GPUS] = { 0 }; +static int32_t nvs_clocks_set[MAX_GPUS] = { 0 }; + +extern int32_t device_mem_offsets[MAX_GPUS]; + +#if 0 /* complicated exec way and not better in fine */ +int nvs_query_fork_int(int nvs_id, const char* field) +{ + pid_t pid; + int pipes[2] = { 0 }; + if (pipe(pipes) < 0) + return -1; + + if ((pid = fork()) == -1) { + close(pipes[0]); + close(pipes[1]); + return -1; + } else if (pid == 0) { + char gpu_field[128] = { 0 }; + sprintf(gpu_field, "[gpu:%d]/%s", nvs_id, field); + + dup2(pipes[1], STDOUT_FILENO); + close(pipes[0]); + //close(pipes[1]); + + if (-1 == execl(NVS_PATH, "nvidia-settings", "-q", gpu_field, "-t", NULL)) { + exit(-1); + } + } else { + int intval = -1; + FILE *p = fdopen(pipes[0], "r"); + close(pipes[1]); + if (!p) { + applog(LOG_WARNING, "%s: fdopen(%d) failed", __func__, pipes[0]); + return -1; + } + int rc = fscanf(p, "%d", &intval); // BUS 0000:2a:00.0 is read 42 + if (rc > 0) { + //applog(LOG_BLUE, "%s res=%d", field, intval); + } + fclose(p); + close(pipes[0]); + return intval; + } + return -1; +} +#endif + +int nvs_query_int(int nvs_id, const char* field, int showerr) +{ + FILE *fp; + char command[256] = { 0 }; + sprintf(command, "%s -t -q '[gpu:%d]/%s' 2>&1", NVS_PATH, nvs_id, field); + fp = popen(command, "r"); + if (fp) { + int intval = -1; + if (!showerr) { + int b = fscanf(fp, "%d", &intval); + if (!b) { + pclose(fp); + return -1; + } + } else { + char msg[512] = { 0 }; + char buf[64] = { 0 }; + ssize_t bytes, len=0, maxlen=sizeof(msg)-1; + while ((bytes=fscanf(fp, "%s", buf)) > 0) { + len += snprintf(&msg[len], maxlen-len, "%s ", buf); + if (len >= maxlen) break; + } + if (strstr(msg, "ERROR")) { + char *xtra = strstr(msg, "; please run"); + if (xtra) *xtra = '\0'; // strip noise + applog(LOG_INFO, "%s", msg); + intval = -1; + } else { + sscanf(msg, "%d", &intval); + } + } + pclose(fp); + return intval; + } + return -1; +} + +int nvs_query_str(int nvs_id, const char* field, char* output, size_t maxlen) +{ + 
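+ // Read a text attribute ("[gpu:N]/field") from the nvidia-settings tool via
+ // popen(); whitespace-separated tokens are appended to 'output', bounded by
+ // maxlen. Returns the number of characters written (or -1 if popen() fails);
+ // if the tool reports an ERROR, the message is logged and 'output' is cleared.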
FILE *fp; + char command[256] = { 0 }; + *output = '\0'; + sprintf(command, "%s -t -q '[gpu:%d]/%s' 2>&1", NVS_PATH, nvs_id, field); + fp = popen(command, "r"); + if (fp) { + char buf[256] = { 0 }; + ssize_t len=0; + ssize_t bytes=0; + while ((bytes=fscanf(fp, "%s", buf)) > 0) { + //applog(LOG_BLUE, "%d %s %d", nvs_id, buf, (int) bytes); + len += snprintf(&output[len], maxlen-len, "%s ", buf); + if (len >= maxlen) break; + } + pclose(fp); + if (strstr(output, "ERROR")) { + char *xtra = strstr(output, "; please run"); + if (xtra) *xtra = '\0'; // strip noise + applog(LOG_INFO, "%s", output); + *output='\0'; + } + return (int) len; + } + return -1; +} + +int nvs_set_int(int nvs_id, const char* field, int value) +{ + FILE *fp; + char command[256] = { 0 }; + int res = -1; + snprintf(command, 256, "%s -a '[gpu:%d]/%s=%d' 2>&1", NVS_PATH, nvs_id, field, value); + fp = popen(command, "r"); + if (fp) { + char msg[512] = { 0 }; + char buf[64] = { 0 }; + ssize_t bytes, len=0, maxlen=sizeof(msg)-1; + while ((bytes=fscanf(fp, "%s", buf)) > 0) { + len += snprintf(&msg[len], maxlen-len, "%s ", buf); + if (len >= maxlen) break; + } + if (strstr(msg, "ERROR")) { + char *xtra = strstr(msg, "; please run"); + if (xtra) *xtra = '\0'; // strip noise + applog(LOG_INFO, "%s", msg); + } else + res = 0; + pclose(fp); + } + return res; +} + +int8_t nvs_devnum(int dev_id) +{ + return nvs_dev_map[dev_id]; +} + +int nvs_devid(int8_t nvs_id) +{ + for (int i=0; i < opt_n_threads; i++) { + int dev_id = device_map[i % MAX_GPUS]; + if (nvs_dev_map[dev_id] == nvs_id) + return dev_id; + } + return 0; +} + +int nvs_init() +{ + struct stat info; + struct timeval tv_start, tv_end, diff; + int x_devices = 0; + int n_threads = opt_n_threads; + if (stat(NVS_PATH, &info)) + return -ENOENT; + + gettimeofday(&tv_start, NULL); + + for (int d = 0; d < MAX_GPUS; d++) { + // this part can be "slow" (100-200ms per device) + int res = nvs_query_int(d, "PCIBus", 1); + if (res < 0) break; + nvs_bus_ids[d] = 0xFFu & res; + x_devices++; + } + + if (opt_debug) { + gettimeofday(&tv_end, NULL); + timeval_subtract(&diff, &tv_end, &tv_start); + applog(LOG_DEBUG, "nvidia-settings pci bus queries took %.2f ms", + (1000.0 * diff.tv_sec) + (0.001 * diff.tv_usec)); + } + + if (!x_devices) + return -ENODEV; + if (!n_threads) n_threads = cuda_num_devices(); + for (int i = 0; i < n_threads; i++) { + int dev_id = device_map[i % MAX_GPUS]; + cudaDeviceProp props; + if (cudaGetDeviceProperties(&props, dev_id) == cudaSuccess) { + for (int8_t d = 0; d < x_devices; d++) { + if (nvs_bus_ids[d] == (uint8_t) props.pciBusID) { + gpulog(LOG_DEBUG, i, "matches X gpu:%d by busId %u", + (int) d, (uint) nvs_bus_ids[d]); + nvs_dev_map[dev_id] = d; + /* char buf[1024] = { 0 }; + nvs_query_str(d, "GPUCurrentClockFreqsString", buf, sizeof(buf)-1); + gpulog(LOG_DEBUG, d, "%s", buf); */ + break; + } + } + } + } + return 0; +} + +int nvs_set_clocks(int dev_id) +{ + int res; + int8_t d = nvs_devnum(dev_id); + if (d < 0) return -ENODEV; + if (!device_mem_offsets[dev_id] || nvs_clocks_set[d]) return 0; + res = nvs_set_int(d, "GPUMemoryTransferRateOffsetAllPerformanceLevels", device_mem_offsets[dev_id]*2); + if (res) nvs_clocks_set[d] = device_mem_offsets[dev_id]*2; + return res; +} + +void nvs_reset_clocks(int dev_id) +{ + int8_t d = nvs_devnum(dev_id); + if (d < 0 || !nvs_clocks_set[d]) return; + nvs_set_int(d, "GPUMemoryTransferRateOffsetAllPerformanceLevels", 0); + nvs_clocks_set[d] = 0; +} + +#else +int nvs_init() { return -ENOSYS; } +int nvs_set_clocks(int dev_id) { return 
-ENOSYS; } +void nvs_reset_clocks(int dev_id) { } +#endif diff --git a/pentablake.cu b/pentablake.cu new file mode 100644 index 0000000000..5c63d4a419 --- /dev/null +++ b/pentablake.cu @@ -0,0 +1,162 @@ +/** + * Penta Blake + */ + +#include +#include +#include "miner.h" + +extern "C" { +#include "sph/sph_blake.h" +} + +/* hash by cpu with blake 256 */ +extern "C" void pentablakehash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[64]; + + sph_blake512_context ctx; + + sph_blake512_init(&ctx); + sph_blake512(&ctx, input, 80); + sph_blake512_close(&ctx, hash); + + sph_blake512(&ctx, hash, 64); + sph_blake512_close(&ctx, hash); + + sph_blake512(&ctx, hash, 64); + sph_blake512_close(&ctx, hash); + + sph_blake512(&ctx, hash, 64); + sph_blake512_close(&ctx, hash); + + sph_blake512(&ctx, hash, 64); + sph_blake512_close(&ctx, hash); + + memcpy(output, hash, 32); +} + +#include "cuda_helper.h" + +static uint32_t *d_hash[MAX_GPUS]; + +extern void quark_blake512_cpu_init(int thr_id, uint32_t threads); +extern void quark_blake512_cpu_free(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int rc = 0; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x000F; + + if (!init[thr_id]) { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); + + quark_blake512_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughput); + CUDA_LOG_ERROR(); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // GPU HASH + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + pentablakehash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces 
= 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + pentablakehash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + return rc; +} + +// cleanup +void free_pentablake(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + + init[thr_id] = false; +} diff --git a/phi/cuda_phi2.cu b/phi/cuda_phi2.cu new file mode 100644 index 0000000000..a0bcf6d42f --- /dev/null +++ b/phi/cuda_phi2.cu @@ -0,0 +1,89 @@ +#include +#include + +#include "cuda_helper.h" + +__global__ __launch_bounds__(128, 8) +void phi_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_hash[offset]); + d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 1; + if (d_NonceBranch[thread]) return; + if (d_branch2) { + uint4 *pdst = (uint4*)(&d_branch2[offset]); + uint4 data; + data = psrc[0]; pdst[0] = data; + data = psrc[1]; pdst[1] = data; + data = psrc[2]; pdst[2] = data; + data = psrc[3]; pdst[3] = data; + } + } +} + +__global__ __launch_bounds__(128, 8) +void phi_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads && !d_NonceBranch[thread]) + { + const uint32_t offset = thread * 16U; + uint4 *psrc = (uint4*) (&d_branch2[offset]); + uint4 *pdst = (uint4*) (&d_hash[offset]); + uint4 data; + data = psrc[0]; pdst[0] = data; + data = psrc[1]; pdst[1] = data; + data = psrc[2]; pdst[2] = data; + data = psrc[3]; pdst[3] = data; + } +} + +__global__ +void phi_final_compress_gpu(const uint32_t threads, uint32_t* d_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; + uint2 *psrc = (uint2*) (&d_hash[offset]); + uint2 *pdst = (uint2*) (&d_hash[offset]); + uint2 data; + data = psrc[4]; pdst[0] ^= data; + data = psrc[5]; pdst[1] ^= data; + data = psrc[6]; pdst[2] ^= data; + data = psrc[7]; pdst[3] ^= data; + } +} + +__host__ +uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // extract algo permution hashes to a second branch buffer + phi_filter_gpu <<>> (threads, inpHashes, d_br2, d_nonces); + return threads; +} + +__host__ +void phi_merge_cuda(const int thr_id, const uint32_t 
threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // put back second branch hashes to the common buffer d_hash + phi_merge_gpu <<>> (threads, outpHashes, d_br2, d_nonces); +} + +__host__ +void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + phi_final_compress_gpu <<>> (threads, d_hashes); +} diff --git a/phi/cuda_phi2_cubehash512.cu b/phi/cuda_phi2_cubehash512.cu new file mode 100644 index 0000000000..e0e7fd7d53 --- /dev/null +++ b/phi/cuda_phi2_cubehash512.cu @@ -0,0 +1,319 @@ +/* phi2 cubehash-512 144-bytes input (80 + 64) */ + +#include +#include + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ + +#if __CUDA_ARCH__ < 350 +#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) +#else +#define LROT(x, bits) __funnelshift_l(x, x, bits) +#endif + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) + +#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } + +#ifdef NO_MIDSTATE + +__device__ __constant__ +static const uint32_t c_IV_512[32] = { + 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, + 0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695, + 0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537, + 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE, + 0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532, + 0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, + 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576, + 0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44 +}; + +#endif + +__device__ __forceinline__ +static void rrounds(uint32_t x[2][2][2][2][2]) +{ + int r; + int j; + int k; + int l; + int m; + +//#pragma unroll 16 + for (r = 0;r < CUBEHASH_ROUNDS;++r) { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + SWAP(x[0][0][k][l][m],x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (m = 0;m < 2;++m) + SWAP(x[1][j][k][0][m],x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 
2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + SWAP(x[0][j][0][l][m],x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) + SWAP(x[1][j][k][l][0],x[1][j][k][l][1]) + + } +} + +__device__ __forceinline__ +static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2]) +{ + // read 32 bytes input from global mem with uint2 chunks + AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]); + AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]); + AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]); + AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]); +} + +__device__ __forceinline__ +static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2]) +{ + // used to write final hash to global mem + AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]); + AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]); + AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]); + AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]); + AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]); + AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]); + AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]); + AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]); +} + +#define Init(x) \ + AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \ + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \ + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \ + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \ + AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \ + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \ + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \ + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \ + AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \ + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \ + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \ + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \ + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \ + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \ + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \ + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]); + +__device__ __forceinline__ +static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data) +{ + /* "xor the block into the first b bytes of the state" */ + block_tox(data, x); + /* "and then transform the state invertibly through r identical rounds" */ + rrounds(x); +} + +__device__ __forceinline__ +static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) +{ + /* "the integer 1 is xored into the last state word x_11111" */ + x[1][1][1][1][1] ^= 1; + + /* "the state is then transformed invertibly through 10r identical rounds" */ + #pragma unroll 10 + for (int i = 0; i < 10; i++) rrounds(x); + + /* "output the first h/8 bytes of the state" */ + hash_fromx(hashval, x); +} + +__host__ void phi2_cubehash512_cpu_init(int thr_id, uint32_t threads) { } + +/***************************************************/ + +/** + * Timetravel and x16 CUBEHASH-80 CUDA implementation + * by tpruvot@github - Jan 2017 / 
May 2018 + */ + +__constant__ static uint32_t c_midstate128[32]; +__constant__ static uint32_t c_PaddedMessage_144[36]; + +#undef SPH_C32 +#undef SPH_C64 +#undef SPH_T32 +#undef SPH_T64 +#include "sph/sph_cubehash.h" + +__host__ +void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata) +{ + sph_cubehash512_context ctx_cubehash; + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64); +#ifndef NO_MIDSTATE + cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice); +#endif + cudaMemcpyToSymbol(c_PaddedMessage_144, endiandata, sizeof(c_PaddedMessage_144), 0, cudaMemcpyHostToDevice); +} + +__global__ +void cubehash512_gpu_hash_144(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNounce + thread; + uint32_t message[8]; + uint32_t x[2][2][2][2][2]; +#ifdef NO_MIDSTATE + Init(x); + + // first 32 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[0]); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[4]); + Update32(x, message); + + // second 32 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[8]); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[12]); + Update32(x, message); +#else + AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]); + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]); + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]); + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]); + AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_midstate128[ 8]); + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]); + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]); + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]); + + AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]); + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]); + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]); + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]); + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]); + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]); + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]); + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]); +#endif + // nonce + state root + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[16]); + message[3] = cuda_swab32(nonce); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[20]); // state + Update32(x, message); + + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[24]); // state + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[28]); // utxo + Update32(x, message); + + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[32]); // utxo + message[4] = 0x80; + message[5] = 0; + message[6] = 0; + message[7] = 0; + Update32(x, message); + + uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]); + Final(x, output); + } +} + +__host__ +void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + cubehash512_gpu_hash_144 <<>> (threads, startNounce, (uint64_t*) d_hash); +} + diff --git a/phi/phi.cu b/phi/phi.cu new file mode 100644 index 0000000000..ba2a9677b6 --- /dev/null +++ b/phi/phi.cu @@ -0,0 +1,223 @@ +// +// +// PHI1612 algo +// Skein + JH + CubeHash + Fugue + Gost + Echo +// +// Implemented by anorganix @ bitcointalk on 
01.10.2017 +// Feel free to send some satoshis to 1Bitcoin8tfbtGAQNFxDRUVUfFgFWKoWi9 +// +// + +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_fugue.h" +#include "sph/sph_streebog.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +extern void skein512_cpu_setBlock_80(void *pdata); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap); +extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); + +#include +#include + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; + +extern "C" void phi_hash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[128] = { 0 }; + + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_cubehash512_context ctx_cubehash; + sph_fugue512_context ctx_fugue; + sph_gost512_context ctx_gost; + sph_echo512_context ctx_echo; + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_gost512_init(&ctx_gost); + sph_gost512(&ctx_gost, (const void*)hash, 64); + sph_gost512_close(&ctx_gost, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + memcpy(output, hash, 32); +} + +#define _DEBUG_PREFIX "phi" +#include "cuda_debug.cuh" + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 
19 : 18; // 2^18 = 262144 cuda threads + if (device_sm[dev_id] >= 600) intensity = 20; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0xf; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput), -1); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + skein512_cpu_setBlock_80((void*)endiandata); + if (use_compat_kernels[thr_id]) + cuda_check_cpu_setTarget(ptarget); + else + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + + do { + int order = 0; + + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) { + streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + } else { + streebog_hash_64_maxwell(thr_id, throughput, d_hash[thr_id]); + tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6])); + cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + } + + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + const uint32_t startNonce = pdata[19]; + uint32_t _ALIGN(64) vhash[8]; + if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce; + be32enc(&endiandata[19], work->nonces[0]); + phi_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + *hashes_done = pdata[19] - first_nonce + throughput; + //work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + //if (work->nonces[1] != 0) { + if (work->nonces[1] != UINT32_MAX) { + work->nonces[1] += startNonce; + be32enc(&endiandata[19], work->nonces[1]); + phi_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } + else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + 
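+ // discard the bad result: reset the device-side nonce buffer and resume
+ // the scan right after the rejected nonce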
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_phi(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); + x13_fugue512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/phi/phi2.cu b/phi/phi2.cu new file mode 100644 index 0000000000..fbdb9c4083 --- /dev/null +++ b/phi/phi2.cu @@ -0,0 +1,268 @@ +// +// PHI2 algo (with smart contracts header) +// CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein +// +// Implemented by tpruvot in May 2018 +// + +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_streebog.h" +#include "sph/sph_echo.h" +#include "lyra2/Lyra2.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +#include +#include + +extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +extern void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata); +extern void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti); + +extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter); +extern void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter); + +extern uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces); +extern void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces); +extern void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes); + +static uint64_t* d_matrix[MAX_GPUS]; +static uint32_t* d_hash_512[MAX_GPUS]; +static uint64_t* d_hash_256[MAX_GPUS]; +static uint32_t* d_hash_br2[MAX_GPUS]; +static uint32_t* d_nonce_br[MAX_GPUS]; + +static bool has_roots; + +extern "C" void phi2_hash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[64]; + unsigned char _ALIGN(128) hashA[64]; + unsigned char _ALIGN(128) hashB[64]; + + sph_cubehash512_context ctx_cubehash; + sph_jh512_context ctx_jh; + sph_gost512_context ctx_gost; + sph_echo512_context ctx_echo; + sph_skein512_context ctx_skein; + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, input, has_roots ? 
144 : 80); + sph_cubehash512_close(&ctx_cubehash, (void*)hashB); + + LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8); + LYRA2(&hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hashA, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + if (hash[0] & 1) { + sph_gost512_init(&ctx_gost); + sph_gost512(&ctx_gost, (const void*)hash, 64); + sph_gost512_close(&ctx_gost, (void*)hash); + } else { + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + } + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + for (int i=0; i<32; i++) + hash[i] ^= hash[i+32]; + + memcpy(output, hash, 32); +} + +//#define _DEBUG +#define _DEBUG_PREFIX "phi-" +#include "cuda_debug.cuh" + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; +static __thread bool gtx750ti = false; + +extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16; + if (device_sm[dev_id] == 500) intensity = 15; + if (device_sm[dev_id] == 600) intensity = 17; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + if (init[thr_id]) throughput = max(throughput & 0xffffff80, 128); // for shared mem + + if (opt_benchmark) + ptarget[7] = 0xff; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + gtx750ti = (strstr(device_name[dev_id], "GTX 750 Ti") != NULL); + + size_t matrix_sz = device_sm[dev_id] > 500 ? 
sizeof(uint64_t) * 16 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_CALL_OR_RET_X(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_256[thr_id], (size_t)32 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_512[thr_id], (size_t)64 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_nonce_br[thr_id], sizeof(uint32_t) * throughput), -1); + if (use_compat_kernels[thr_id]) { + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1); + } + + lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + quark_jh512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + has_roots = false; + uint32_t endiandata[36]; + for (int k = 0; k < 36; k++) { + be32enc(&endiandata[k], pdata[k]); + if (k >= 20 && pdata[k]) has_roots = true; + } + + cuda_check_cpu_setTarget(ptarget); + if (has_roots) + cubehash512_setBlock_144(thr_id, endiandata); + else + cubehash512_setBlock_80(thr_id, endiandata); + + do { + int order = 0; + if (has_roots) + cubehash512_cuda_hash_144(thr_id, throughput, pdata[19], d_hash_512[thr_id]); + else + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); + order++; + TRACE("cube "); + + lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti); + order++; + TRACE("lyra "); + + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++); + TRACE("jh "); + + order++; + if (!use_compat_kernels[thr_id]) { + phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]); + phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + } else { + // todo: nonces vector to reduce amount of hashes to compute + phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]); + streebog_cpu_hash_64(thr_id, throughput, d_hash_512[thr_id]); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order); + phi_merge_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]); + } + TRACE("mix "); + + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++); + TRACE("skein "); + + phi_final_compress_cuda(thr_id, throughput, d_hash_512[thr_id]); + TRACE("xor "); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash_512[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + phi2_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + *hashes_done = pdata[19] - first_nonce + throughput; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash_512[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + phi2_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; 
+ } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + if (pdata[19] > max_nonce) pdata[19] = max_nonce; + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! thr=%x", work->nonces[0], throughput); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_phi2(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + cudaFree(d_matrix[thr_id]); + cudaFree(d_hash_512[thr_id]); + cudaFree(d_hash_256[thr_id]); + cudaFree(d_nonce_br[thr_id]); + if (use_compat_kernels[thr_id]) cudaFree(d_hash_br2[thr_id]); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/polytimos.cu b/polytimos.cu new file mode 100644 index 0000000000..fe7776ca56 --- /dev/null +++ b/polytimos.cu @@ -0,0 +1,216 @@ +/* + * Polytimos algorithm + */ +extern "C" +{ +#include "sph/sph_skein.h" +#include "sph/sph_shabal.h" +#include "sph/sph_echo.h" +#include "sph/sph_luffa.h" +#include "sph/sph_fugue.h" +#include "sph/sph_streebog.h" +} + +#include "miner.h" + +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; + +extern void skein512_cpu_setBlock_80(void *pdata); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); +extern void streebog_sm3_set_target(uint32_t* ptarget); +extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); +extern void skunk_streebog_set_target(uint32_t* ptarget); +extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); + +// CPU Hash +extern "C" void polytimos_hash(void *output, const void *input) +{ + sph_skein512_context ctx_skein; + sph_shabal512_context ctx_shabal; + sph_echo512_context ctx_echo; + sph_luffa512_context ctx_luffa; + sph_fugue512_context ctx_fugue; + sph_gost512_context ctx_gost; + + uint32_t _ALIGN(128) hash[16]; + memset(hash, 0, sizeof hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, (void*) hash); + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, hash, 64); + sph_shabal512_close(&ctx_shabal, hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, hash, 64); + sph_echo512_close(&ctx_echo, hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, hash, 64); + sph_luffa512_close(&ctx_luffa, hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, hash, 64); + 
sph_fugue512_close(&ctx_fugue, hash); + + sph_gost512_init(&ctx_gost); + sph_gost512(&ctx_gost, (const void*) hash, 64); + sph_gost512_close(&ctx_gost, (void*) hash); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_polytimos(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + int dev_id = device_map[thr_id]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x000f; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + + quark_skein512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)), -1); + + init[thr_id] = true; + } + + + uint32_t _ALIGN(64) h_resNonce[2]; + uint32_t _ALIGN(64) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + + cudaMemset(d_resNonce[thr_id], 0xff, 2*sizeof(uint32_t)); + skein512_cpu_setBlock_80(endiandata); + if (use_compat_kernels[thr_id]) { + streebog_sm3_set_target(ptarget); + } else { + skunk_streebog_set_target(ptarget); + } + + do { + int order = 0; + + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) { + streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + } else { + skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + cudaMemcpy(h_resNonce, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + CUDA_LOG_ERROR(); + + if (h_resNonce[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + const uint32_t startNounce = pdata[19]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], startNounce + h_resNonce[0]); + polytimos_hash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[0] = startNounce + h_resNonce[0]; + work_set_target_ratio(work, vhash); + if (h_resNonce[1] != UINT32_MAX) { + uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[1]; + be32enc(&endiandata[19], secNonce); + polytimos_hash(vhash, endiandata); + 
bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + cudaMemset(d_resNonce[thr_id], 0xff, 2*sizeof(uint32_t)); + pdata[19] = startNounce + h_resNonce[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + CUDA_LOG_ERROR(); + + return 0; +} + +// cleanup +extern "C" void free_polytimos(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + x13_fugue512_cpu_free(thr_id); + cudaFree(d_resNonce[thr_id]); + + CUDA_LOG_ERROR(); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/pools.conf b/pools.conf new file mode 100644 index 0000000000..b13b80185f --- /dev/null +++ b/pools.conf @@ -0,0 +1,30 @@ +{ + "_note": "Sample rotation of 2 pools", + + "pools":[{ + "name": "Longpoll sample", + "url": "http://ziftr.suprnova.cc:9991", + "user": "tpruvot.win", + "pass": "x", + "max-diff": 0.0, + "scantime": 40, + "time-limit": 180, + "disabled": false + }, + { + "name": "Stratum sample", + "url": "stratum+tcp://zrc-stratum.suprnova.cc:2257", + "user": "tpruvot.elite", + "pass": "x", + "time-limit": 300 + }], + + "algo" : "zr5", + + "timeout": 60, + + "api-bind" : "0.0.0.0", + "api-remote" : true, + + "no-gbt": true +} diff --git a/pools.cpp b/pools.cpp new file mode 100644 index 0000000000..c8fb1ba8e8 --- /dev/null +++ b/pools.cpp @@ -0,0 +1,410 @@ +/** + * Functions which handle multiple pools data + */ + +#include +#include +#include +#include + +#include "miner.h" +#include "compat.h" +#include "algos.h" + +// to move in miner.h +extern bool allow_gbt; +extern bool allow_mininginfo; +extern bool check_dups; + +extern double opt_max_diff; +extern double opt_max_rate; +extern int opt_scantime; +extern int opt_shares_limit; +extern int opt_time_limit; + +extern char* rpc_url; +extern char* rpc_user; +extern char* rpc_pass; +extern char* short_url; + +extern struct work _ALIGN(64) g_work; +extern struct stratum_ctx stratum; +extern pthread_mutex_t stratum_work_lock; +extern pthread_mutex_t stats_lock; +extern bool get_work(struct thr_info *thr, struct work *work); +extern bool stratum_need_reset; +extern time_t firstwork_time; + +extern volatile time_t g_work_time; +extern volatile int pool_switch_count; +extern volatile bool pool_is_switching; +extern uint8_t conditional_state[MAX_GPUS]; + +extern double thr_hashrates[MAX_GPUS]; + +extern struct option options[]; + +#define CFG_NULL 0 +#define CFG_POOL 1 +struct opt_config_array { + int cat; + const char *name; // json key + const char *longname; // global opt name if different +} cfg_array_keys[] = { + { CFG_POOL, "url", NULL }, /* let this key first, increment pools */ + { CFG_POOL, "user", NULL }, + { CFG_POOL, "pass", NULL }, + { CFG_POOL, "userpass", NULL }, + { CFG_POOL, "name", "pool-name" }, + { CFG_POOL, "algo", "pool-algo" }, + { CFG_POOL, "scantime", "pool-scantime" }, + { CFG_POOL, "max-diff", "pool-max-diff" }, + { CFG_POOL, "max-rate", "pool-max-rate" }, + { CFG_POOL, "disabled", "pool-disabled" }, + { CFG_POOL, "time-limit", "pool-time-limit" }, + { CFG_NULL, 
NULL, NULL } +}; + +// store current credentials in pools container +void pool_set_creds(int pooln) +{ + struct pool_infos *p = &pools[pooln]; + + snprintf(p->url, sizeof(p->url), "%s", rpc_url); + snprintf(p->short_url, sizeof(p->short_url), "%s", short_url); + snprintf(p->user, sizeof(p->user), "%s", rpc_user); + snprintf(p->pass, sizeof(p->pass), "%s", rpc_pass); + + if (!(p->status & POOL_ST_DEFINED)) { + p->id = pooln; + p->status |= POOL_ST_DEFINED; + // init pool options as "unset" + // until cmdline is fully parsed... + p->algo = -1; + p->max_diff = -1.; + p->max_rate = -1.; + p->scantime = -1; + p->shares_limit = -1; + p->time_limit = -1; + + p->allow_mininginfo = allow_mininginfo; + p->allow_gbt = allow_gbt; + p->check_dups = check_dups; + + p->status |= POOL_ST_DEFINED; + } + + if (strlen(rpc_url)) { + if (!strncasecmp(rpc_url, "stratum", 7)) + p->type = POOL_STRATUM; + else /* if (!strncasecmp(rpc_url, "http", 4)) */ + p->type = POOL_GETWORK; // todo: or longpoll + p->status |= POOL_ST_VALID; + } +} + +// fill the unset pools options with cmdline ones +void pool_init_defaults() +{ + struct pool_infos *p; + for (int i=0; ialgo == -1) p->algo = (int) opt_algo; + if (p->max_diff == -1.) p->max_diff = opt_max_diff; + if (p->max_rate == -1.) p->max_rate = opt_max_rate; + if (p->scantime == -1) p->scantime = opt_scantime; + if (p->shares_limit == -1) p->shares_limit = opt_shares_limit; + if (p->time_limit == -1) p->time_limit = opt_time_limit; + } +} + +// attributes only set by a json pools config +void pool_set_attr(int pooln, const char* key, char* arg) +{ + struct pool_infos *p = &pools[pooln]; + if (!strcasecmp(key, "name")) { + snprintf(p->name, sizeof(p->name), "%s", arg); + return; + } + if (!strcasecmp(key, "algo")) { + p->algo = algo_to_int(arg); + return; + } + if (!strcasecmp(key, "scantime")) { + p->scantime = atoi(arg); + return; + } + if (!strcasecmp(key, "max-diff")) { + p->max_diff = atof(arg); + return; + } + if (!strcasecmp(key, "max-rate")) { + p->max_rate = atof(arg); + return; + } + if (!strcasecmp(key, "shares-limit")) { + p->shares_limit = atoi(arg); + return; + } + if (!strcasecmp(key, "time-limit")) { + p->time_limit = atoi(arg); + return; + } + if (!strcasecmp(key, "disabled")) { + int removed = atoi(arg); + if (removed) { + p->status |= POOL_ST_REMOVED; + } + return; + } +} + +// pool switching code +bool pool_switch(int thr_id, int pooln) +{ + int prevn = cur_pooln; + bool algo_switch = false; + struct pool_infos *prev = &pools[cur_pooln]; + struct pool_infos* p = NULL; + + // save prev stratum connection infos (struct) + if (prev->type & POOL_STRATUM) { + // may not be the right moment to free, + // to check if required on submit... 
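
The switching logic that continues below boils down to two ideas: per-pool options left at the -1 "unset" sentinel inherit the command-line defaults (pool_init_defaults above), and making a pool active is essentially re-pointing the global rpc_*/opt_* settings at that pool's entry (pool_switch). A simplified, self-contained sketch of those two steps follows; the types are hypothetical and much smaller than ccminer's struct pool_infos.

#include <cstdio>
#include <string>
#include <vector>

struct PoolOpts {
    std::string url, user, pass;
    int scantime = -1;          // -1 means "not set in pools.conf"
    double max_diff = -1.0;
    bool disabled = false;
};

struct Globals {
    std::string rpc_url, rpc_user, rpc_pass;
    int opt_scantime = 30;      // command-line defaults
    double opt_max_diff = 0.0;
};

// same idea as pool_init_defaults(): fill unset per-pool options from cmdline
static void init_defaults(std::vector<PoolOpts>& pools, const Globals& g)
{
    for (auto& p : pools) {
        if (p.scantime == -1)  p.scantime = g.opt_scantime;
        if (p.max_diff < 0.0)  p.max_diff = g.opt_max_diff;
    }
}

// same idea as pool_switch(): make pool n the source of the global settings
static bool switch_pool(const std::vector<PoolOpts>& pools, Globals& g, size_t n)
{
    if (n >= pools.size() || pools[n].disabled) return false;
    g.rpc_url = pools[n].url;
    g.rpc_user = pools[n].user;
    g.rpc_pass = pools[n].pass;
    g.opt_scantime = pools[n].scantime;
    g.opt_max_diff = pools[n].max_diff;
    return true;
}

int main()
{
    Globals g;
    std::vector<PoolOpts> pools(2);
    pools[0].url = "stratum+tcp://pool-a:3333"; pools[0].scantime = 40;
    pools[1].url = "stratum+tcp://pool-b:3333"; // scantime left unset
    init_defaults(pools, g);                    // pool 1 inherits scantime 30
    switch_pool(pools, g, 1);
    printf("active: %s scantime=%d\n", g.rpc_url.c_str(), g.opt_scantime);
    return 0;
}
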
+ stratum_free_job(&stratum); + prev->stratum = stratum; + } + + if (pooln < num_pools) { + cur_pooln = pooln; + p = &pools[cur_pooln]; + } else { + applog(LOG_ERR, "Switch to inexistant pool %d!", pooln); + return false; + } + + // save global attributes + prev->allow_mininginfo = allow_mininginfo; + prev->allow_gbt = allow_gbt; + prev->check_dups = check_dups; + + pthread_mutex_lock(&stratum_work_lock); + + free(rpc_user); rpc_user = strdup(p->user); + free(rpc_pass); rpc_pass = strdup(p->pass); + free(rpc_url); rpc_url = strdup(p->url); + + short_url = p->short_url; // just a pointer, no alloc + + opt_scantime = p->scantime; + opt_max_diff = p->max_diff; + opt_max_rate = p->max_rate; + opt_shares_limit = p->shares_limit; + opt_time_limit = p->time_limit; + + want_stratum = have_stratum = (p->type & POOL_STRATUM) != 0; + + // yiimp stats reporting + opt_stratum_stats = (strstr(p->pass, "stats") != NULL) || (strcmp(p->user, "benchmark") == 0); + + pthread_mutex_unlock(&stratum_work_lock); + + // algo "blind" switch without free, not proper + // todo: barrier required to free algo resources + if (p->algo != (int) opt_algo) { + + if (opt_algo != ALGO_AUTO) { + + algo_switch = true; + + pthread_mutex_lock(&stats_lock); + for (int n=0; nalgo; + } + + if (prevn != cur_pooln) { + + pool_switch_count++; + net_diff = 0; + g_work_time = 0; + g_work.data[0] = 0; + pool_is_switching = true; + stratum_need_reset = true; + // used to get the pool uptime + firstwork_time = time(NULL); + restart_threads(); + // reset wait states + for (int n=0; nallow_gbt; + allow_mininginfo = p->allow_mininginfo; + check_dups = p->check_dups; + + if (want_stratum) { + + // temporary... until stratum code cleanup + stratum = p->stratum; + stratum.pooln = cur_pooln; + stratum.rpc2 = (p->algo == ALGO_WILDKECCAK || p->algo == ALGO_CRYPTONIGHT); + stratum.rpc2 |= p->algo == ALGO_CRYPTOLIGHT; + + // unlock the stratum thread + tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); + applog(LOG_BLUE, "Switch to stratum pool %d: %s", cur_pooln, + strlen(p->name) ? p->name : p->short_url); + } else { + applog(LOG_BLUE, "Switch to pool %d: %s", cur_pooln, + strlen(p->name) ? 
p->name : p->short_url); + } + + // will unlock the longpoll thread on /LP url receive + want_longpoll = (p->type & POOL_LONGPOLL) || !(p->type & POOL_STRATUM); + if (want_longpoll) { + pthread_mutex_lock(&stratum_work_lock); + // will issue a lp_url request to unlock the longpoll thread + have_longpoll = false; + get_work(&thr_info[0], &g_work); + pthread_mutex_unlock(&stratum_work_lock); + } + + } + + stratum.rpc2 = (p->algo == ALGO_WILDKECCAK || p->algo == ALGO_CRYPTONIGHT); + stratum.rpc2 |= p->algo == ALGO_CRYPTOLIGHT; + + return true; +} + +// search available pool +int pool_get_first_valid(int startfrom) +{ + int next = 0; + struct pool_infos *p; + for (int i=0; istatus & POOL_ST_VALID)) + continue; + if (p->status & (POOL_ST_DISABLED | POOL_ST_REMOVED)) + continue; + next = pooln; + break; + } + return next; +} + +// switch to next available pool +bool pool_switch_next(int thr_id) +{ + if (num_pools > 1) { + int pooln = pool_get_first_valid(cur_pooln+1); + return pool_switch(thr_id, pooln); + } else { + // no switch possible + if (!opt_quiet) + applog(LOG_DEBUG, "No other pools to try..."); + return false; + } +} + +// seturl from api remote (deprecated) +bool pool_switch_url(char *params) +{ + int prevn = cur_pooln, nextn; + parse_arg('o', params); + // cur_pooln modified by parse_arg('o'), get new pool num + nextn = cur_pooln; + // and to handle the "hot swap" from current one... + cur_pooln = prevn; + if (nextn == prevn) + return false; + return pool_switch(-1, nextn); +} + +// Parse pools array in json config +bool parse_pool_array(json_t *obj) +{ + size_t idx; + json_t *p, *val; + + if (!json_is_array(obj)) + return false; + + // array of objects [ {}, {} ] + json_array_foreach(obj, idx, p) + { + if (!json_is_object(p)) + continue; + + for (int i = 0; i < ARRAY_SIZE(cfg_array_keys); i++) + { + int opt = -1; + char *s = NULL; + if (cfg_array_keys[i].cat != CFG_POOL) + continue; + + val = json_object_get(p, cfg_array_keys[i].name); + if (!val) + continue; + + for (int k = 0; k < options_count(); k++) + { + const char *alias = cfg_array_keys[i].longname; + if (alias && !strcasecmp(options[k].name, alias)) { + opt = k; + break; + } + if (!alias && !strcasecmp(options[k].name, cfg_array_keys[i].name)) { + opt = k; + break; + } + } + if (opt == -1) + continue; + + if (json_is_string(val)) { + s = strdup(json_string_value(val)); + if (!s) + continue; + + // applog(LOG_DEBUG, "pool key %s '%s'", options[opt].name, s); + parse_arg(options[opt].val, s); + free(s); + } else { + // numeric or bool + char buf[32] = { 0 }; + double d = 0.; + if (json_is_true(val)) d = 1.; + else if (json_is_integer(val)) + d = 1.0 * json_integer_value(val); + else if (json_is_real(val)) + d = json_real_value(val); + snprintf(buf, sizeof(buf)-1, "%f", d); + // applog(LOG_DEBUG, "pool key %s '%f'", options[opt].name, d); + parse_arg(options[opt].val, buf); + } + } + } + return true; +} + +// debug stuff +void pool_dump_infos() +{ + struct pool_infos *p; + if (opt_benchmark) return; + for (int i=0; ishort_url, p->user, p->scantime); + } +} diff --git a/quark/.deps/.dirstamp b/quark/.deps/.dirstamp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/quark/.dirstamp b/quark/.dirstamp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/quark/animecoin.cu b/quark/animecoin.cu deleted file mode 100644 index 6d395be0d9..0000000000 --- a/quark/animecoin.cu +++ /dev/null @@ -1,291 +0,0 @@ - -extern "C" -{ -#include "sph/sph_blake.h" -#include "sph/sph_bmw.h" -#include "sph/sph_groestl.h" 
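
Returning briefly to the pools.cpp additions above: parse_pool_array() walks the JSON "pools" array with jansson and maps each key onto the matching command-line option before feeding it to parse_arg(). A stripped-down sketch of that array walk is shown below, using only jansson calls that appear in the patch (json_array_foreach, json_object_get, json_is_string, ...); the option-alias table and most error paths are omitted.

#include <cstdio>
#include <jansson.h>

int main()
{
    const char *cfg =
        "{ \"pools\": ["
        "  { \"name\": \"Main\",   \"url\": \"stratum+tcp://pool-a:3333\", \"scantime\": 40 },"
        "  { \"name\": \"Backup\", \"url\": \"stratum+tcp://pool-b:3333\" }"
        "] }";

    json_error_t err;
    json_t *root = json_loads(cfg, 0, &err);
    if (!root) { fprintf(stderr, "json error: %s\n", err.text); return 1; }

    json_t *pools = json_object_get(root, "pools");
    if (json_is_array(pools)) {
        size_t idx;
        json_t *p;
        json_array_foreach(pools, idx, p) {        // one object per pool
            if (!json_is_object(p)) continue;
            json_t *url  = json_object_get(p, "url");
            json_t *scan = json_object_get(p, "scantime");
            printf("pool %zu: url=%s scantime=%lld\n", idx,
                   json_is_string(url) ? json_string_value(url) : "?",
                   json_is_integer(scan) ? (long long)json_integer_value(scan) : -1LL);
        }
    }
    json_decref(root);
    return 0;
}

Build with -ljansson; a real config would of course come from a file rather than an inline string.
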
-#include "sph/sph_skein.h" -#include "sph/sph_jh.h" -#include "sph/sph_keccak.h" -#include "miner.h" -} - -#include - -// aus cpu-miner.c -extern int device_map[8]; - -// Speicher für Input/Output der verketteten Hashfunktionen -static uint32_t *d_hash[8]; - -// Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_animeNonces[8]; -static uint32_t *d_branch1Nonces[8]; -static uint32_t *d_branch2Nonces[8]; -static uint32_t *d_branch3Nonces[8]; - -extern void quark_blake512_cpu_init(int thr_id, int threads); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_bmw512_cpu_init(int thr_id, int threads); -extern void quark_bmw512_cpu_setBlock_80(void *pdata); -extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order); -extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); - -extern void quark_groestl512_cpu_init(int thr_id, int threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_skein512_cpu_init(int thr_id, int threads); -extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_keccak512_cpu_init(int thr_id, int threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_jh512_cpu_init(int thr_id, int threads); -extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); - -extern void quark_compactTest_cpu_init(int thr_id, int threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, size_t *nrm1, - uint32_t *d_nonces2, size_t *nrm2, - int order); -extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, size_t *nrm1, - int order); - -// Original Quarkhash Funktion aus einem miner Quelltext -inline void animehash(void *state, const void *input) -{ - sph_blake512_context ctx_blake; - sph_bmw512_context ctx_bmw; - sph_groestl512_context ctx_groestl; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - - unsigned char hash[64]; - - sph_bmw512_init(&ctx_bmw); - // ZBMW; - sph_bmw512 (&ctx_bmw, (const void*) input, 80); - sph_bmw512_close(&ctx_bmw, (void*) hash); - - sph_blake512_init(&ctx_blake); - // ZBLAKE; - sph_blake512 (&ctx_blake, hash, 64); - sph_blake512_close(&ctx_blake, (void*) hash); - - if (hash[0] & 0x8) - { - sph_groestl512_init(&ctx_groestl); - // ZGROESTL; - sph_groestl512 (&ctx_groestl, 
(const void*) hash, 64); - sph_groestl512_close(&ctx_groestl, (void*) hash); - } - else - { - sph_skein512_init(&ctx_skein); - // ZSKEIN; - sph_skein512 (&ctx_skein, (const void*) hash, 64); - sph_skein512_close(&ctx_skein, (void*) hash); - } - - sph_groestl512_init(&ctx_groestl); - // ZGROESTL; - sph_groestl512 (&ctx_groestl, (const void*) hash, 64); - sph_groestl512_close(&ctx_groestl, (void*) hash); - - sph_jh512_init(&ctx_jh); - // ZJH; - sph_jh512 (&ctx_jh, (const void*) hash, 64); - sph_jh512_close(&ctx_jh, (void*) hash); - - if (hash[0] & 0x8) - { - sph_blake512_init(&ctx_blake); - // ZBLAKE; - sph_blake512 (&ctx_blake, (const void*) hash, 64); - sph_blake512_close(&ctx_blake, (void*) hash); - } - else - { - sph_bmw512_init(&ctx_bmw); - // ZBMW; - sph_bmw512 (&ctx_bmw, (const void*) hash, 64); - sph_bmw512_close(&ctx_bmw, (void*) hash); - } - - sph_keccak512_init(&ctx_keccak); - // ZKECCAK; - sph_keccak512 (&ctx_keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx_keccak, (void*) hash); - - sph_skein512_init(&ctx_skein); - // SKEIN; - sph_skein512 (&ctx_skein, (const void*) hash, 64); - sph_skein512_close(&ctx_skein, (void*) hash); - - if (hash[0] & 0x8) - { - sph_keccak512_init(&ctx_keccak); - // ZKECCAK; - sph_keccak512 (&ctx_keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx_keccak, (void*) hash); - } - else - { - sph_jh512_init(&ctx_jh); - // ZJH; - sph_jh512 (&ctx_jh, (const void*) hash, 64); - sph_jh512_close(&ctx_jh, (void*) hash); - } - - memcpy(state, hash, 32); -} - - -struct HashPredicate -{ - HashPredicate(uint32_t *hashes, uint32_t startNonce) : - m_hashes(hashes), - m_startNonce(startNonce) - { } - - __device__ - bool operator()(const uint32_t x) - { - uint32_t *hash = &m_hashes[(x - m_startNonce)*16]; - return hash[0] & 0x8; - } - - uint32_t *m_hashes; - uint32_t m_startNonce; -}; - -extern bool opt_benchmark; - -extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) -{ - const uint32_t first_nonce = pdata[19]; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00000f; - - const uint32_t Htarg = ptarget[7]; - - const int throughput = 256*2048; // 100; - - static bool init[8] = {0,0,0,0,0,0,0,0}; - if (!init[thr_id]) - { - cudaSetDevice(device_map[thr_id]); - - // Konstanten kopieren, Speicher belegen - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); - quark_blake512_cpu_init(thr_id, throughput); - quark_groestl512_cpu_init(thr_id, throughput); - quark_skein512_cpu_init(thr_id, throughput); - quark_bmw512_cpu_init(thr_id, throughput); - quark_keccak512_cpu_init(thr_id, throughput); - quark_jh512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); - quark_compactTest_cpu_init(thr_id, throughput); - cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); - init[thr_id] = true; - } - - uint32_t endiandata[20]; - for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - - quark_bmw512_cpu_setBlock_80((void*)endiandata); - quark_check_cpu_setTarget(ptarget); - - do { - int order = 0; - size_t nrm1=0, nrm2=0, nrm3=0; - - // erstes BMW512 Hash mit CUDA - quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Blake512 - 
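
The animecoin.cu file removed here branches the chain on a bit of the intermediate result: if (hash[0] & 0x8) the Groestl path is taken, otherwise Skein, and on the GPU the quark_compactTest kernels split the nonce range into per-branch lists so each branch kernel only processes its own nonces. The sketch below is a plain CPU illustration of that partitioning, not the CUDA compaction; intermediate_word() is a hypothetical stand-in for the real intermediate hash.

#include <cstdint>
#include <cstdio>
#include <vector>

// (hypothetical) intermediate hash word for a nonce; stands in for hash[0]
static uint32_t intermediate_word(uint32_t nonce)
{
    uint32_t x = nonce * 2654435761u;   // cheap mix, illustration only
    x ^= x >> 13;
    return x;
}

int main()
{
    const uint32_t start_nonce = 0x1000;
    const uint32_t count = 16;

    std::vector<uint32_t> branch_groestl, branch_skein;
    for (uint32_t i = 0; i < count; i++) {
        uint32_t nonce = start_nonce + i;
        // same predicate as "if (hash[0] & 0x8)" in animehash()
        if (intermediate_word(nonce) & 0x8)
            branch_groestl.push_back(nonce);
        else
            branch_skein.push_back(nonce);
    }

    // each branch kernel would then be launched only on its own list
    printf("groestl branch: %zu nonces, skein branch: %zu nonces\n",
           branch_groestl.size(), branch_skein.size());
    return 0;
}
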
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, - d_branch3Nonces[thr_id], &nrm3, - order++); - - // nur den Skein Branch weiterverfolgen - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Groestl512 - quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); - - // das ist der bedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der bedingte Branch für Bmw512 - quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Skein512 - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); - - // das ist der bedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der bedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); - - // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) - { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - animehash(vhash64, endiandata); - - if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { - - pdata[19] = foundNonce; - *hashes_done = (foundNonce - first_nonce + 1)/2; - return 1; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); - } - } - - pdata[19] += throughput; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = (pdata[19] - first_nonce + 1)/2; - return 0; -} diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index 760d028883..6011beb513 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -1,469 +1,491 @@ -#if 1 - -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; - -// Endian Drehung für 32 Bit Typen -/* -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) - | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); -} -*/ -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return 
__byte_perm(x, 0, 0x0123); -} -// Endian Drehung für 64 Bit Typen -static __device__ unsigned long long cuda_swab64(unsigned long long x) { - uint32_t h = (x >> 32); - uint32_t l = (x & 0xFFFFFFFFULL); - return (((unsigned long long)cuda_swab32(l)) << 32) | ((unsigned long long)cuda_swab32(h)); -} - -// das Hi Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t HIWORD(const unsigned long long &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2hiint(__longlong_as_double(x)); -#else - return (uint32_t)(x >> 32); -#endif -} - -// das Hi Word in einem 64 Bit Typen ersetzen -static __device__ unsigned long long REPLACE_HIWORD(const unsigned long long &x, const uint32_t &y) { - return (x & 0xFFFFFFFFULL) | (((unsigned long long)y) << 32ULL); -} - -// das Lo Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t LOWORD(const unsigned long long &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2loint(__longlong_as_double(x)); -#else - return (uint32_t)(x & 0xFFFFFFFFULL); -#endif -} - -static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI) -{ -#if __CUDA_ARCH__ >= 130 - return __double_as_longlong(__hiloint2double(HI, LO)); -#else - return (unsigned long long)LO | (((unsigned long long)HI) << 32ULL); -#endif -} - -// das Lo Word in einem 64 Bit Typen ersetzen -static __device__ unsigned long long REPLACE_LOWORD(const unsigned long long &x, const uint32_t &y) { - return (x & 0xFFFFFFFF00000000ULL) | ((unsigned long long)y); -} - -// der Versuch, einen Wrapper für einen aus 32 Bit Registern zusammengesetzten uin64_t Typen zu entferfen... -#if 1 -typedef unsigned long long uint64_t; -#else -typedef class uint64 -{ -public: - __device__ uint64() - { - } - __device__ uint64(unsigned long long init) - { - val = make_uint2( LOWORD(init), HIWORD(init) ); - } - __device__ uint64(uint32_t lo, uint32_t hi) - { - val = make_uint2( lo, hi ); - } - __device__ const uint64 operator^(uint64 const& rhs) const - { - return uint64(val.x ^ rhs.val.x, val.y ^ rhs.val.y); - } - __device__ const uint64 operator|(uint64 const& rhs) const - { - return uint64(val.x | rhs.val.x, val.y | rhs.val.y); - } - __device__ const uint64 operator+(unsigned long long const& rhs) const - { - return *this+uint64(rhs); - } - __device__ const uint64 operator+(uint64 const& rhs) const - { - uint64 res; - asm ("add.cc.u32 %0, %2, %4;\n\t" - "addc.cc.u32 %1, %3, %5;\n\t" - : "=r"(res.val.x), "=r"(res.val.y) - : "r"( val.x), "r"( val.y), - "r"(rhs.val.x), "r"(rhs.val.y)); - return res; - } - __device__ const uint64 operator-(uint64 const& rhs) const - { - uint64 res; - asm ("sub.cc.u32 %0, %2, %4;\n\t" - "subc.cc.u32 %1, %3, %5;\n\t" - : "=r"(res.val.x), "=r"(res.val.y) - : "r"( val.x), "r"( val.y), - "r"(rhs.val.x), "r"(rhs.val.y)); - return res; - } - __device__ const uint64 operator<<(int n) const - { - return uint64(unsigned long long(*this)<>(int n) const - { - return uint64(unsigned long long(*this)>>n); - } - __device__ operator unsigned long long() const - { - return MAKE_ULONGLONG(val.x, val.y); - } - uint2 val; -} uint64_t; -#endif - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// die Message it Padding zur Berechnung auf der GPU -__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) - -#define SPH_C64(x) ((uint64_t)(x ## ULL)) - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// diese 64 Bit Rotates werden 
unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt -#if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { - uint2 result; - if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); -} -#else -#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) -#endif -#define SHL(x, n) ((x) << (n)) -#define SHR(x, n) ((x) >> (n)) - -#define CONST_EXP2 q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \ - q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + ROTL64(q[i+7], 32) + \ - q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \ - q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) - -__device__ void Compression512(uint64_t *msg, uint64_t *hash) -{ - // Compression ref. implementation - uint64_t tmp; - uint64_t q[32]; - - tmp = (msg[ 5] ^ hash[ 5]) - (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + (msg[14] ^ hash[14]); - q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1]; - tmp = (msg[ 6] ^ hash[ 6]) - (msg[ 8] ^ hash[ 8]) + (msg[11] ^ hash[11]) + (msg[14] ^ hash[14]) - (msg[15] ^ hash[15]); - q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2]; - tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); - q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3]; - tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 1] ^ hash[ 1]) + (msg[ 8] ^ hash[ 8]) - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); - q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4]; - tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]) - (msg[11] ^ hash[11]) - (msg[14] ^ hash[14]); - q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; - tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 2] ^ hash[ 2]) + (msg[10] ^ hash[10]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); - q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6]; - tmp = (msg[ 4] ^ hash[ 4]) - (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) - (msg[11] ^ hash[11]) + (msg[13] ^ hash[13]); - q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7]; - tmp = (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]) - (msg[14] ^ hash[14]); - q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8]; - tmp = (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) - (msg[ 6] ^ hash[ 6]) + (msg[13] ^ hash[13]) - (msg[15] ^ hash[15]); - q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9]; - tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) + (msg[ 6] ^ hash[ 6]) - (msg[ 7] ^ hash[ 7]) + (msg[14] ^ hash[14]); - q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; - tmp = 
(msg[ 8] ^ hash[ 8]) - (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 7] ^ hash[ 7]) + (msg[15] ^ hash[15]); - q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11]; - tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) + (msg[ 9] ^ hash[ 9]); - q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[12]; - tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 3] ^ hash[ 3]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[10] ^ hash[10]); - q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13]; - tmp = (msg[ 2] ^ hash[ 2]) + (msg[ 4] ^ hash[ 4]) + (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[11] ^ hash[11]); - q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14]; - tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 5] ^ hash[ 5]) + (msg[ 8] ^ hash[ 8]) - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]); - q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; - tmp = (msg[12] ^ hash[12]) - (msg[ 4] ^ hash[ 4]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[13] ^ hash[13]); - q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0]; - - // Expand 1 -#pragma unroll 2 - for(int i=0;i<2;i++) - { - q[i+16] = - (SHR(q[i], 1) ^ SHL(q[i], 2) ^ ROTL64(q[i], 13) ^ ROTL64(q[i], 43)) + - (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ ROTL64(q[i+1], 19) ^ ROTL64(q[i+1], 53)) + - (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ ROTL64(q[i+2], 28) ^ ROTL64(q[i+2], 59)) + - (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ ROTL64(q[i+3], 4) ^ ROTL64(q[i+3], 37)) + - (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ ROTL64(q[i+4], 13) ^ ROTL64(q[i+4], 43)) + - (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ ROTL64(q[i+5], 19) ^ ROTL64(q[i+5], 53)) + - (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ ROTL64(q[i+6], 28) ^ ROTL64(q[i+6], 59)) + - (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ ROTL64(q[i+7], 4) ^ ROTL64(q[i+7], 37)) + - (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ ROTL64(q[i+8], 13) ^ ROTL64(q[i+8], 43)) + - (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ ROTL64(q[i+9], 19) ^ ROTL64(q[i+9], 53)) + - (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ ROTL64(q[i+10], 28) ^ ROTL64(q[i+10], 59)) + - (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ ROTL64(q[i+11], 4) ^ ROTL64(q[i+11], 37)) + - (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ ROTL64(q[i+12], 13) ^ ROTL64(q[i+12], 43)) + - (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ ROTL64(q[i+13], 19) ^ ROTL64(q[i+13], 53)) + - (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ ROTL64(q[i+14], 28) ^ ROTL64(q[i+14], 59)) + - (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ ROTL64(q[i+15], 4) ^ ROTL64(q[i+15], 37)) + - (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) + - ROTL64(msg[i+3], i+4) - ROTL64(msg[i+10], i+11) ) ^ hash[i+7]); - } - -#pragma unroll 4 - for(int i=2;i<6;i++) { - q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) + - ROTL64(msg[i+3], i+4) - ROTL64(msg[i+10], i+11) ) ^ hash[i+7]); - } -#pragma unroll 3 - for(int i=6;i<9;i++) { - q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) + - ROTL64(msg[i+3], i+4) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i+7]); - } -#pragma unroll 4 - for(int i=9;i<13;i++) { - q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) + - ROTL64(msg[i+3], i+4) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i-9]); - } -#pragma unroll 3 - for(int i=13;i<16;i++) { - q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) + - ROTL64(msg[i-13], (i-13)+1) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i-9]); - } - - uint64_t XL64 = 
q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; - uint64_t XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; - - hash[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ msg[ 0]) + ( XL64 ^ q[24] ^ q[ 0]); - hash[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ msg[ 1]) + ( XL64 ^ q[25] ^ q[ 1]); - hash[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ msg[ 2]) + ( XL64 ^ q[26] ^ q[ 2]); - hash[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ msg[ 3]) + ( XL64 ^ q[27] ^ q[ 3]); - hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[ 4]) + ( XL64 ^ q[28] ^ q[ 4]); - hash[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ msg[ 5]) + ( XL64 ^ q[29] ^ q[ 5]); - hash[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ msg[ 6]) + ( XL64 ^ q[30] ^ q[ 6]); - hash[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ msg[ 7]) + ( XL64 ^ q[31] ^ q[ 7]); - - hash[ 8] = ROTL64(hash[4], 9) + ( XH64 ^ q[24] ^ msg[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]); - hash[ 9] = ROTL64(hash[5],10) + ( XH64 ^ q[25] ^ msg[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]); - hash[10] = ROTL64(hash[6],11) + ( XH64 ^ q[26] ^ msg[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); - hash[11] = ROTL64(hash[7],12) + ( XH64 ^ q[27] ^ msg[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); - hash[12] = ROTL64(hash[0],13) + ( XH64 ^ q[28] ^ msg[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); - hash[13] = ROTL64(hash[1],14) + ( XH64 ^ q[29] ^ msg[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); - hash[14] = ROTL64(hash[2],15) + ( XH64 ^ q[30] ^ msg[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); - hash[15] = ROTL64(hash[3],16) + ( XH64 ^ q[31] ^ msg[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); -} -static __constant__ uint64_t d_constMem[16]; -static uint64_t h_constMem[16] = { - SPH_C64(0x8081828384858687), - SPH_C64(0x88898A8B8C8D8E8F), - SPH_C64(0x9091929394959697), - SPH_C64(0x98999A9B9C9D9E9F), - SPH_C64(0xA0A1A2A3A4A5A6A7), - SPH_C64(0xA8A9AAABACADAEAF), - SPH_C64(0xB0B1B2B3B4B5B6B7), - SPH_C64(0xB8B9BABBBCBDBEBF), - SPH_C64(0xC0C1C2C3C4C5C6C7), - SPH_C64(0xC8C9CACBCCCDCECF), - SPH_C64(0xD0D1D2D3D4D5D6D7), - SPH_C64(0xD8D9DADBDCDDDEDF), - SPH_C64(0xE0E1E2E3E4E5E6E7), - SPH_C64(0xE8E9EAEBECEDEEEF), - SPH_C64(0xF0F1F2F3F4F5F6F7), - SPH_C64(0xF8F9FAFBFCFDFEFF) -}; - -__global__ void quark_bmw512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; - - // Init - uint64_t h[16]; - /* - h[ 0] = SPH_C64(0x8081828384858687); - h[ 1] = SPH_C64(0x88898A8B8C8D8E8F); - h[ 2] = SPH_C64(0x9091929394959697); - h[ 3] = SPH_C64(0x98999A9B9C9D9E9F); - h[ 4] = SPH_C64(0xA0A1A2A3A4A5A6A7); - h[ 5] = SPH_C64(0xA8A9AAABACADAEAF); - h[ 6] = SPH_C64(0xB0B1B2B3B4B5B6B7); - h[ 7] = SPH_C64(0xB8B9BABBBCBDBEBF); - h[ 8] = SPH_C64(0xC0C1C2C3C4C5C6C7); - h[ 9] = SPH_C64(0xC8C9CACBCCCDCECF); - h[10] = SPH_C64(0xD0D1D2D3D4D5D6D7); - h[11] = SPH_C64(0xD8D9DADBDCDDDEDF); - h[12] = SPH_C64(0xE0E1E2E3E4E5E6E7); - h[13] = SPH_C64(0xE8E9EAEBECEDEEEF); - h[14] = SPH_C64(0xF0F1F2F3F4F5F6F7); - h[15] = SPH_C64(0xF8F9FAFBFCFDFEFF); - */ -#pragma unroll 16 - for(int i=0;i<16;i++) - h[i] = d_constMem[i]; - // Nachricht kopieren (Achtung, die Nachricht hat 64 Byte, - // BMW arbeitet mit 128 Byte!!! - uint64_t message[16]; -#pragma unroll 8 - for(int i=0;i<8;i++) - message[i] = inpHash[i]; -#pragma unroll 6 - for(int i=9;i<15;i++) - message[i] = 0; - - // Padding einfügen (Byteorder?!?) - message[8] = SPH_C64(0x80); - // Länge (in Bits, d.h. 
64 Byte * 8 = 512 Bits - message[15] = SPH_C64(512); - - // Compression 1 - Compression512(message, h); - - // Final -#pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = 0xaaaaaaaaaaaaaaa0ull + (uint64_t)i; - - Compression512(h, message); - - // fertig - uint64_t *outpHash = &g_hash[8 * hashPosition]; - -#pragma unroll 8 - for(int i=0;i<8;i++) - outpHash[i] = message[i+8]; - } -} - -__global__ void quark_bmw512_gpu_hash_80(int threads, uint32_t startNounce, uint64_t *g_hash) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = startNounce + thread; - - // Init - uint64_t h[16]; -#pragma unroll 16 - for(int i=0;i<16;i++) - h[i] = d_constMem[i]; - - // Nachricht kopieren (Achtung, die Nachricht hat 64 Byte, - // BMW arbeitet mit 128 Byte!!! - uint64_t message[16]; -#pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = c_PaddedMessage80[i]; - - // die Nounce durch die thread-spezifische ersetzen - message[9] = REPLACE_HIWORD(message[9], cuda_swab32(nounce)); - - // Compression 1 - Compression512(message, h); - - // Final -#pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = 0xaaaaaaaaaaaaaaa0ull + (uint64_t)i; - - Compression512(h, message); - - // fertig - uint64_t *outpHash = &g_hash[8 * thread]; - -#pragma unroll 8 - for(int i=0;i<8;i++) - outpHash[i] = message[i+8]; - } -} - -// Setup-Funktionen -__host__ void quark_bmw512_cpu_init(int thr_id, int threads) -{ - // nix zu tun ;-) - // jetzt schon :D - cudaMemcpyToSymbol( d_constMem, - h_constMem, - sizeof(h_constMem), - 0, cudaMemcpyHostToDevice); -} - -// Bmw512 für 80 Byte grosse Eingangsdaten -__host__ void quark_bmw512_cpu_setBlock_80(void *pdata) -{ - // Message mit Padding bereitstellen - // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. - unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); - uint64_t *message = (uint64_t*)PaddedMessage; - // Padding einfügen (Byteorder?!?) - message[10] = SPH_C64(0x80); - // Länge (in Bits, d.h. 
80 Byte * 8 = 640 Bits - message[15] = SPH_C64(640); - - // die Message zur Berechnung auf der GPU - cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); -} - -__host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - quark_bmw512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); -} - -__host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - quark_bmw512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_hash); - MyStreamSynchronize(NULL, order, thr_id); -} - -#endif +#include +#include + +#define WANT_BMW512_80 + +#include "cuda_helper.h" + +__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) + +#include "cuda_bmw512_sm3.cuh" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif + +#undef SHL +#undef SHR +#undef CONST_EXP2 + +#define SHR(x, n) SHR2(x, n) +#define SHL(x, n) SHL2(x, n) +#define ROL(x, n) ROL2(x, n) + +#define CONST_EXP2(i) \ + q[i+0] + ROL(q[i+1], 5) + q[i+2] + ROL(q[i+3], 11) + \ + q[i+4] + ROL(q[i+5], 27) + q[i+6] + SWAPUINT2(q[i+7]) + \ + q[i+8] + ROL(q[i+9], 37) + q[i+10] + ROL(q[i+11], 43) + \ + q[i+12] + ROL(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) + +__device__ +void Compression512_64_first(uint2 *msg, uint2 *hash) +{ + // Compression ref. 
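
A note on the uint2 arithmetic this rewritten compression function relies on: each 64-bit word is carried as two 32-bit halves (x = low, y = high), and the ROL/SHL/SHR macros above map to the pair-based helpers from cuda_helper.h (ROL2/SHL2/SHR2, with vectorize/devectorize converting to and from uint64_t), which use funnel-shift instructions on newer GPUs. The sketch below is a plain host-side C++ check of those semantics, not the CUDA implementation; shift counts for SHL/SHR are assumed to stay in 1..31 as they do in BMW-512.

#include <cstdint>
#include <cstdio>

struct u2 { uint32_t x, y; };   // x = low 32 bits, y = high 32 bits

static u2 vectorize_(uint64_t v)   { return { (uint32_t)v, (uint32_t)(v >> 32) }; }
static uint64_t devectorize_(u2 v) { return ((uint64_t)v.y << 32) | v.x; }

static u2 rol2(u2 a, unsigned n)   // rotate left, n in 1..63
{
    u2 r;
    if (n < 32) {
        r.y = (a.y << n) | (a.x >> (32 - n));
        r.x = (a.x << n) | (a.y >> (32 - n));
    } else {                       // rotating by 32+ swaps the halves first
        n -= 32;
        r.y = n ? (a.x << n) | (a.y >> (32 - n)) : a.x;
        r.x = n ? (a.y << n) | (a.x >> (32 - n)) : a.y;
    }
    return r;
}

static u2 shl2(u2 a, unsigned n)   // logical shift left, n in 1..31
{
    return { a.x << n, (a.y << n) | (a.x >> (32 - n)) };
}

static u2 shr2(u2 a, unsigned n)   // logical shift right, n in 1..31
{
    return { (a.x >> n) | (a.y << (32 - n)), a.y >> n };
}

int main()
{
    const uint64_t v = 0x0123456789ABCDEFull;
    const u2 a = vectorize_(v);

    // cross-check the pair-based helpers against native 64-bit arithmetic
    bool ok = devectorize_(rol2(a, 37)) == ((v << 37) | (v >> 27))
           && devectorize_(rol2(a, 13)) == ((v << 13) | (v >> 51))
           && devectorize_(shl2(a, 3))  == (v << 3)
           && devectorize_(shr2(a, 2))  == (v >> 2);

    printf("uint2 emulation matches native 64-bit ops: %s\n", ok ? "yes" : "no");
    return 0;
}
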
implementation + uint2 q[32]; + uint2 tmp; + + tmp = (msg[5] ^ hash[5]) - (msg[7] ^ hash[7]) + (hash[10]) + (hash[13]) + (hash[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[1]; + + tmp = (msg[6] ^ hash[6]) - (msg[8] ^ hash[8]) + (hash[11]) + (hash[14]) - (msg[15] ^ hash[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[2]; + tmp = (msg[0] ^ hash[0]) + (msg[7] ^ hash[7]) + (hash[9]) - (hash[12]) + (msg[15] ^ hash[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[3]; + tmp = (msg[0] ^ hash[0]) - (msg[1] ^ hash[1]) + (msg[8] ^ hash[8]) - (hash[10]) + (hash[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[4]; + tmp = (msg[1] ^ hash[1]) + (msg[2] ^ hash[2]) + (hash[9]) - (hash[11]) - (hash[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; + tmp = (msg[3] ^ hash[3]) - (msg[2] ^ hash[2]) + (hash[10]) - (hash[12]) + (msg[15] ^ hash[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[6]; + tmp = (msg[4] ^ hash[4]) - (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) - (hash[11]) + (hash[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[7]; + tmp = (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[5] ^ hash[5]) - (hash[12]) - (hash[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[8]; + + tmp = (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) - (msg[6] ^ hash[6]) + (hash[13]) - (msg[15] ^ hash[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[9]; + tmp = (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) + (msg[6] ^ hash[6]) - (msg[7] ^ hash[7]) + (hash[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; + tmp = (msg[8] ^ hash[8]) - (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[7] ^ hash[7]) + (msg[15] ^ hash[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[11]; + tmp = (msg[8] ^ hash[8]) - (msg[0] ^ hash[0]) - (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) + (hash[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[12]; + tmp = (msg[1] ^ hash[1]) + (msg[3] ^ hash[3]) - (msg[6] ^ hash[6]) - (hash[9]) + (hash[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[13]; + tmp = (msg[2] ^ hash[2]) + (msg[4] ^ hash[4]) + (msg[7] ^ hash[7]) + (hash[10]) + (hash[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[14]; + tmp = (msg[3] ^ hash[3]) - (msg[5] ^ hash[5]) + (msg[8] ^ hash[8]) - (hash[11]) - (hash[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; + tmp = (msg[12] ^ hash[12]) - (msg[4] ^ hash[4]) - (msg[6] ^ hash[6]) - (hash[9]) + (hash[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[0]; + + q[0 + 16] = + (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROL(q[0], 13) ^ ROL(q[0], 43)) + + (SHR(q[0 + 1], 2) ^ SHL(q[0 + 1], 1) ^ ROL(q[0 + 1], 19) ^ ROL(q[0 + 1], 53)) + + (SHR(q[0 + 2], 2) ^ SHL(q[0 + 2], 2) ^ ROL(q[0 + 2], 28) ^ ROL(q[0 + 2], 59)) + + (SHR(q[0 + 3], 1) ^ SHL(q[0 + 3], 3) ^ ROL(q[0 + 3], 4) ^ ROL(q[0 + 3], 37)) + + (SHR(q[0 + 4], 1) ^ SHL(q[0 + 4], 2) ^ ROL(q[0 + 4], 13) ^ ROL(q[0 + 4], 43)) + + (SHR(q[0 + 5], 2) ^ SHL(q[0 + 5], 1) ^ ROL(q[0 + 5], 19) ^ ROL(q[0 + 5], 53)) + + (SHR(q[0 + 6], 2) ^ SHL(q[0 + 6], 2) ^ ROL(q[0 + 6], 28) ^ ROL(q[0 + 6], 59)) + + (SHR(q[0 + 7], 1) ^ SHL(q[0 + 7], 3) ^ ROL(q[0 + 7], 4) ^ ROL(q[0 + 7], 37)) + + (SHR(q[0 + 8], 1) ^ SHL(q[0 + 8], 2) ^ ROL(q[0 + 8], 13) ^ ROL(q[0 + 8], 43)) + + (SHR(q[0 + 9], 2) ^ SHL(q[0 + 9], 1) ^ ROL(q[0 + 9], 19) ^ 
ROL(q[0 + 9], 53)) + + (SHR(q[0 + 10], 2) ^ SHL(q[0 + 10], 2) ^ ROL(q[0 + 10], 28) ^ ROL(q[0 + 10], 59)) + + (SHR(q[0 + 11], 1) ^ SHL(q[0 + 11], 3) ^ ROL(q[0 + 11], 4) ^ ROL(q[0 + 11], 37)) + + (SHR(q[0 + 12], 1) ^ SHL(q[0 + 12], 2) ^ ROL(q[0 + 12], 13) ^ ROL(q[0 + 12], 43)) + + (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROL(q[0 + 13], 19) ^ ROL(q[0 + 13], 53)) + + (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROL(q[0 + 14], 28) ^ ROL(q[0 + 14], 59)) + + (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROL(q[0 + 15], 4) ^ ROL(q[0 + 15], 37)) + + ((make_uint2(0x55555550ul,0x55555555) + ROL(msg[0], 0 + 1) + + ROL(msg[0 + 3], 0 + 4)) ^ hash[0 + 7]); + + q[1 + 16] = + (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROL(q[1], 13) ^ ROL(q[1], 43)) + + (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROL(q[1 + 1], 19) ^ ROL(q[1 + 1], 53)) + + (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROL(q[1 + 2], 28) ^ ROL(q[1 + 2], 59)) + + (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROL(q[1 + 3], 4) ^ ROL(q[1 + 3], 37)) + + (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROL(q[1 + 4], 13) ^ ROL(q[1 + 4], 43)) + + (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROL(q[1 + 5], 19) ^ ROL(q[1 + 5], 53)) + + (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROL(q[1 + 6], 28) ^ ROL(q[1 + 6], 59)) + + (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROL(q[1 + 7], 4) ^ ROL(q[1 + 7], 37)) + + (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROL(q[1 + 8], 13) ^ ROL(q[1 + 8], 43)) + + (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROL(q[1 + 9], 19) ^ ROL(q[1 + 9], 53)) + + (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROL(q[1 + 10], 28) ^ ROL(q[1 + 10], 59)) + + (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROL(q[1 + 11], 4) ^ ROL(q[1 + 11], 37)) + + (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROL(q[1 + 12], 13) ^ ROL(q[1 + 12], 43)) + + (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROL(q[1 + 13], 19) ^ ROL(q[1 + 13], 53)) + + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROL(q[1 + 14], 28) ^ ROL(q[1 + 14], 59)) + + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROL(q[1 + 15], 4) ^ ROL(q[1 + 15], 37)) + + ((make_uint2(0xAAAAAAA5, 0x5AAAAAAA) + ROL(msg[1], 1 + 1) + + ROL(msg[1 + 3], 1 + 4)) ^ hash[1 + 7]); + + q[2 + 16] = CONST_EXP2(2) + + ((make_uint2(0xFFFFFFFA, 0x5FFFFFFF) + ROL(msg[2], 2 + 1) + + ROL(msg[2 + 3], 2 + 4) - ROL(msg[2 + 10], 2 + 11)) ^ hash[2 + 7]); + q[3 + 16] = CONST_EXP2(3) + + ((make_uint2(0x5555554F, 0x65555555) + ROL(msg[3], 3 + 1) + + ROL(msg[3 + 3], 3 + 4) - ROL(msg[3 + 10], 3 + 11)) ^ hash[3 + 7]); + q[4 + 16] = CONST_EXP2(4) + + ((make_uint2(0xAAAAAAA4, 0x6AAAAAAA) +ROL(msg[4], 4 + 1) + + ROL(msg[4 + 3], 4 + 4) - ROL(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]); + q[5 + 16] = CONST_EXP2(5) + + ((make_uint2(0xFFFFFFF9, 0x6FFFFFFF) + ROL(msg[5], 5 + 1) + + ROL(msg[5 + 3], 5 + 4) - ROL(msg[5 + 10], 5 + 11)) ^ hash[5 + 7]); + + #pragma unroll 3 + for (int i = 6; i<9; i++) { + q[i + 16] = CONST_EXP2(i) + + ((vectorize((i + 16)*(0x0555555555555555ull)) + ROL(msg[i], i + 1) - + ROL(msg[i - 6], (i - 6) + 1)) ^ hash[i + 7]); + } + + #pragma unroll 4 + for (int i = 9; i<13; i++) { + q[i + 16] = CONST_EXP2(i) + + ((vectorize((i + 16)*(0x0555555555555555ull)) + + ROL(msg[i + 3], i + 4) - ROL(msg[i - 6], (i - 6) + 1)) ^ hash[i - 9]); + } + + q[13 + 16] = CONST_EXP2(13) + + ((make_uint2(0xAAAAAAA1, 0x9AAAAAAA) + ROL(msg[13], 13 + 1) + + ROL(msg[13 - 13], (13 - 13) + 1) - ROL(msg[13 - 6], (13 - 6) + 1)) ^ hash[13 - 9]); + q[14 + 16] = CONST_EXP2(14) + + ((make_uint2(0xFFFFFFF6, 0x9FFFFFFF) + ROL(msg[14], 14 + 1) + + ROL(msg[14 - 13], (14 - 13) + 1) - ROL(msg[14 - 6], (14 - 6) + 1)) ^ hash[14 - 9]); + q[15 + 16] = CONST_EXP2(15) + + 
((make_uint2(0x5555554B, 0xA5555555) + ROL(msg[15], 15 + 1) + + ROL(msg[15 - 13], (15 - 13) + 1) - ROL(msg[15 - 6], (15 - 6) + 1)) ^ hash[15 - 9]); + + + uint2 XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; + uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; + + hash[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ msg[0]) + (XL64 ^ q[24] ^ q[0]); + hash[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ msg[1]) + (XL64 ^ q[25] ^ q[1]); + hash[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ msg[2]) + (XL64 ^ q[26] ^ q[2]); + hash[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ msg[3]) + (XL64 ^ q[27] ^ q[3]); + hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[4]) + (XL64 ^ q[28] ^ q[4]); + hash[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ msg[5]) + (XL64 ^ q[29] ^ q[5]); + hash[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ msg[6]) + (XL64 ^ q[30] ^ q[6]); + hash[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ msg[7]) + (XL64 ^ q[31] ^ q[7]); + + hash[8] = ROL(hash[4], 9) + (XH64 ^ q[24] ^ msg[8]) + (SHL(XL64, 8) ^ q[23] ^ q[8]); + hash[9] = ROL(hash[5], 10) + (XH64 ^ q[25]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); + hash[10] = ROL(hash[6], 11) + (XH64 ^ q[26]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); + hash[11] = ROL(hash[7], 12) + (XH64 ^ q[27]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); + hash[12] = ROL(hash[0], 13) + (XH64 ^ q[28]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); + hash[13] = ROL(hash[1], 14) + (XH64 ^ q[29]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); + hash[14] = ROL(hash[2], 15) + (XH64 ^ q[30]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); + hash[15] = ROL(hash[3], 16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); +} + +__device__ +void Compression512(uint2 *msg, uint2 *hash) +{ + // Compression ref. implementation + uint2 q[32]; + uint2 tmp; + + tmp = (msg[ 5] ^ hash[ 5]) - (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + (msg[14] ^ hash[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[1]; + tmp = (msg[ 6] ^ hash[ 6]) - (msg[ 8] ^ hash[ 8]) + (msg[11] ^ hash[11]) + (msg[14] ^ hash[14]) - (msg[15] ^ hash[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[2]; + tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[3]; + tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 1] ^ hash[ 1]) + (msg[ 8] ^ hash[ 8]) - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[4]; + tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]) - (msg[11] ^ hash[11]) - (msg[14] ^ hash[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; + tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 2] ^ hash[ 2]) + (msg[10] ^ hash[10]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[6]; + tmp = (msg[ 4] ^ hash[ 4]) - (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) - (msg[11] ^ hash[11]) + (msg[13] ^ hash[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[7]; + tmp = (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]) - (msg[14] ^ hash[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[8]; + tmp = (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) - (msg[ 6] ^ hash[ 6]) + (msg[13] ^ hash[13]) - (msg[15] ^ hash[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[9]; + tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) + (msg[ 6] ^ hash[ 
6]) - (msg[ 7] ^ hash[ 7]) + (msg[14] ^ hash[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; + tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 7] ^ hash[ 7]) + (msg[15] ^ hash[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[11]; + tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) + (msg[ 9] ^ hash[ 9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[12]; + tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 3] ^ hash[ 3]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[10] ^ hash[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[13]; + tmp = (msg[ 2] ^ hash[ 2]) + (msg[ 4] ^ hash[ 4]) + (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[11] ^ hash[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[14]; + tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 5] ^ hash[ 5]) + (msg[ 8] ^ hash[ 8]) - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; + tmp = (msg[12] ^ hash[12]) - (msg[ 4] ^ hash[ 4]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[13] ^ hash[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[0]; + + q[0+16] = + (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROL(q[0], 13) ^ ROL(q[0], 43)) + + (SHR(q[0+1], 2) ^ SHL(q[0+1], 1) ^ ROL(q[0+1], 19) ^ ROL(q[0+1], 53)) + + (SHR(q[0+2], 2) ^ SHL(q[0+2], 2) ^ ROL(q[0+2], 28) ^ ROL(q[0+2], 59)) + + (SHR(q[0+3], 1) ^ SHL(q[0+3], 3) ^ ROL(q[0+3], 4) ^ ROL(q[0+3], 37)) + + (SHR(q[0+4], 1) ^ SHL(q[0+4], 2) ^ ROL(q[0+4], 13) ^ ROL(q[0+4], 43)) + + (SHR(q[0+5], 2) ^ SHL(q[0+5], 1) ^ ROL(q[0+5], 19) ^ ROL(q[0+5], 53)) + + (SHR(q[0+6], 2) ^ SHL(q[0+6], 2) ^ ROL(q[0+6], 28) ^ ROL(q[0+6], 59)) + + (SHR(q[0+7], 1) ^ SHL(q[0+7], 3) ^ ROL(q[0+7], 4) ^ ROL(q[0+7], 37)) + + (SHR(q[0+8], 1) ^ SHL(q[0+8], 2) ^ ROL(q[0+8], 13) ^ ROL(q[0+8], 43)) + + (SHR(q[0+9], 2) ^ SHL(q[0+9], 1) ^ ROL(q[0+9], 19) ^ ROL(q[0+9], 53)) + + (SHR(q[0+10], 2) ^ SHL(q[0+10], 2) ^ ROL(q[0+10], 28) ^ ROL(q[0+10], 59)) + + (SHR(q[0+11], 1) ^ SHL(q[0+11], 3) ^ ROL(q[0+11], 4) ^ ROL(q[0+11], 37)) + + (SHR(q[0+12], 1) ^ SHL(q[0+12], 2) ^ ROL(q[0+12], 13) ^ ROL(q[0+12], 43)) + + (SHR(q[0+13], 2) ^ SHL(q[0+13], 1) ^ ROL(q[0+13], 19) ^ ROL(q[0+13], 53)) + + (SHR(q[0+14], 2) ^ SHL(q[0+14], 2) ^ ROL(q[0+14], 28) ^ ROL(q[0+14], 59)) + + (SHR(q[0+15], 1) ^ SHL(q[0+15], 3) ^ ROL(q[0+15], 4) ^ ROL(q[0+15], 37)) + + ((make_uint2(0x55555550ul, 0x55555555) + ROL(msg[0], 0 + 1) + + ROL(msg[0+3], 0+4) - ROL(msg[0+10], 0+11) ) ^ hash[0+7]); + + q[1 + 16] = + (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROL(q[1], 13) ^ ROL(q[1], 43)) + + (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROL(q[1 + 1], 19) ^ ROL(q[1 + 1], 53)) + + (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROL(q[1 + 2], 28) ^ ROL(q[1 + 2], 59)) + + (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROL(q[1 + 3], 4) ^ ROL(q[1 + 3], 37)) + + (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROL(q[1 + 4], 13) ^ ROL(q[1 + 4], 43)) + + (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROL(q[1 + 5], 19) ^ ROL(q[1 + 5], 53)) + + (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROL(q[1 + 6], 28) ^ ROL(q[1 + 6], 59)) + + (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROL(q[1 + 7], 4) ^ ROL(q[1 + 7], 37)) + + (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROL(q[1 + 8], 13) ^ ROL(q[1 + 8], 43)) + + (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROL(q[1 + 9], 19) ^ ROL(q[1 + 9], 53)) + + (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROL(q[1 + 10], 28) ^ ROL(q[1 + 10], 59)) + + (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROL(q[1 + 
11], 4) ^ ROL(q[1 + 11], 37)) + + (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROL(q[1 + 12], 13) ^ ROL(q[1 + 12], 43)) + + (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROL(q[1 + 13], 19) ^ ROL(q[1 + 13], 53)) + + (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROL(q[1 + 14], 28) ^ ROL(q[1 + 14], 59)) + + (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROL(q[1 + 15], 4) ^ ROL(q[1 + 15], 37)) + + ((make_uint2(0xAAAAAAA5, 0x5AAAAAAA) + ROL(msg[1], 1 + 1) + + ROL(msg[1 + 3], 1 + 4) - ROL(msg[1 + 10], 1 + 11)) ^ hash[1 + 7]); + + q[2 + 16] = CONST_EXP2(2) + + ((make_uint2(0xFFFFFFFA, 0x5FFFFFFF) + ROL(msg[2], 2 + 1) + + ROL(msg[2+3], 2+4) - ROL(msg[2+10], 2+11) ) ^ hash[2+7]); + q[3 + 16] = CONST_EXP2(3) + + ((make_uint2(0x5555554F, 0x65555555) + ROL(msg[3], 3 + 1) + + ROL(msg[3 + 3], 3 + 4) - ROL(msg[3 + 10], 3 + 11)) ^ hash[3 + 7]); + q[4 + 16] = CONST_EXP2(4) + + ((make_uint2(0xAAAAAAA4, 0x6AAAAAAA) + ROL(msg[4], 4 + 1) + + ROL(msg[4 + 3], 4 + 4) - ROL(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]); + q[5 + 16] = CONST_EXP2(5) + + ((make_uint2(0xFFFFFFF9, 0x6FFFFFFF) + ROL(msg[5], 5 + 1) + + ROL(msg[5 + 3], 5 + 4) - ROL(msg[5 + 10], 5 + 11)) ^ hash[5 + 7]); + q[6 + 16] = CONST_EXP2(6) + + ((make_uint2(0x5555554E, 0x75555555)+ ROL(msg[6], 6 + 1) + + ROL(msg[6 + 3], 6 + 4) - ROL(msg[6 - 6], (6 - 6) + 1)) ^ hash[6 + 7]); + q[7 + 16] = CONST_EXP2(7) + + ((make_uint2(0xAAAAAAA3, 0x7AAAAAAA) + ROL(msg[7], 7 + 1) + + ROL(msg[7 + 3], 7 + 4) - ROL(msg[7 - 6], (7 - 6) + 1)) ^ hash[7 + 7]); + q[8 + 16] = CONST_EXP2(8) + + ((make_uint2(0xFFFFFFF8, 0x7FFFFFFF) + ROL(msg[8], 8 + 1) + + ROL(msg[8 + 3], 8 + 4) - ROL(msg[8 - 6], (8 - 6) + 1)) ^ hash[8 + 7]); + q[9 + 16] = CONST_EXP2(9) + + ((make_uint2(0x5555554D, 0x85555555) + ROL(msg[9], 9 + 1) + + ROL(msg[9 + 3], 9 + 4) - ROL(msg[9 - 6], (9 - 6) + 1)) ^ hash[9 - 9]); + q[10 + 16] = CONST_EXP2(10) + + ((make_uint2(0xAAAAAAA2, 0x8AAAAAAA) + ROL(msg[10], 10 + 1) + + ROL(msg[10 + 3], 10 + 4) - ROL(msg[10 - 6], (10 - 6) + 1)) ^ hash[10 - 9]); + q[11 + 16] = CONST_EXP2(11) + + ((make_uint2(0xFFFFFFF7, 0x8FFFFFFF) + ROL(msg[11], 11 + 1) + + ROL(msg[11 + 3], 11 + 4) - ROL(msg[11 - 6], (11 - 6) + 1)) ^ hash[11 - 9]); + q[12 + 16] = CONST_EXP2(12) + + ((make_uint2(0x5555554C, 0x95555555) + ROL(msg[12], 12 + 1) + + ROL(msg[12 + 3], 12 + 4) - ROL(msg[12 - 6], (12 - 6) + 1)) ^ hash[12 - 9]); + q[13 + 16] = CONST_EXP2(13) + + ((make_uint2(0xAAAAAAA1, 0x9AAAAAAA) + ROL(msg[13], 13 + 1) + + ROL(msg[13 - 13], (13 - 13) + 1) - ROL(msg[13 - 6], (13 - 6) + 1)) ^ hash[13 - 9]); + q[14 + 16] = CONST_EXP2(14) + + ((make_uint2(0xFFFFFFF6, 0x9FFFFFFF) + ROL(msg[14], 14 + 1) + + ROL(msg[14 - 13], (14 - 13) + 1) - ROL(msg[14 - 6], (14 - 6) + 1)) ^ hash[14 - 9]); + q[15 + 16] = CONST_EXP2(15) + + ((make_uint2(0x5555554B, 0xA5555555) + ROL(msg[15], 15 + 1) + + ROL(msg[15 - 13], (15 - 13) + 1) - ROL(msg[15 - 6], (15 - 6) + 1)) ^ hash[15 - 9]); + + uint2 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; + + hash[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ msg[ 0]) + (XL64 ^ q[24] ^ q[ 0]); + hash[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ msg[ 1]) + (XL64 ^ q[25] ^ q[ 1]); + hash[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ msg[ 2]) + (XL64 ^ q[26] ^ q[ 2]); + hash[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ msg[ 3]) + (XL64 ^ q[27] ^ q[ 3]); + hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[ 4]) + (XL64 ^ q[28] ^ q[ 4]); + hash[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ msg[ 5]) + (XL64 ^ q[29] ^ q[ 5]); + hash[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ msg[ 6]) + (XL64 ^ q[30] ^ q[ 
6]); + hash[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ msg[ 7]) + (XL64 ^ q[31] ^ q[ 7]); + + hash[ 8] = ROL(hash[4], 9) + (XH64 ^ q[24] ^ msg[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]); + hash[ 9] = ROL(hash[5],10) + (XH64 ^ q[25] ^ msg[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]); + hash[10] = ROL(hash[6],11) + (XH64 ^ q[26] ^ msg[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + hash[11] = ROL(hash[7],12) + (XH64 ^ q[27] ^ msg[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + hash[12] = ROL(hash[0],13) + (XH64 ^ q[28] ^ msg[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + hash[13] = ROL(hash[1],14) + (XH64 ^ q[29] ^ msg[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + hash[14] = ROL(hash[2],15) + (XH64 ^ q[30] ^ msg[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + hash[15] = ROL(hash[3],16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); +} + +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(32, 16) +#else +__launch_bounds__(64, 8) +#endif +void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + uint32_t hashPosition = nounce - startNounce; + uint64_t *inpHash = &g_hash[hashPosition * 8]; + + // Init + uint2 h[16] = { + { 0x84858687UL, 0x80818283UL }, + { 0x8C8D8E8FUL, 0x88898A8BUL }, + { 0x94959697UL, 0x90919293UL }, + { 0x9C9D9E9FUL, 0x98999A9BUL }, + { 0xA4A5A6A7UL, 0xA0A1A2A3UL }, + { 0xACADAEAFUL, 0xA8A9AAABUL }, + { 0xB4B5B6B7UL, 0xB0B1B2B3UL }, + { 0xBCBDBEBFUL, 0xB8B9BABBUL }, + { 0xC4C5C6C7UL, 0xC0C1C2C3UL }, + { 0xCCCDCECFUL, 0xC8C9CACBUL }, + { 0xD4D5D6D7UL, 0xD0D1D2D3UL }, + { 0xDCDDDEDFUL, 0xD8D9DADBUL }, + { 0xE4E5E6E7UL, 0xE0E1E2E3UL }, + { 0xECEDEEEFUL, 0xE8E9EAEBUL }, + { 0xF4F5F6F7UL, 0xF0F1F2F3UL }, + { 0xFCFDFEFFUL, 0xF8F9FAFBUL } + }; + + // Nachricht kopieren (Achtung, die Nachricht hat 64 Byte, + // BMW arbeitet mit 128 Byte!!! + uint2 message[16]; + #pragma unroll + for(int i=0;i<8;i++) + message[i] = vectorize(inpHash[i]); + + #pragma unroll 6 + for(int i=9;i<15;i++) + message[i] = make_uint2(0,0); + + // Padding einfügen (Byteorder?!?) + message[8] = make_uint2(0x80,0); + // Länge (in Bits, d.h. 
+		message[15] = make_uint2(512,0);
+
+		// Compression 1
+		Compression512_64_first(message, h);
+
+		// Final
+		#pragma unroll
+		for(int i=0;i<16;i++)
+		{
+			message[i].y = 0xaaaaaaaa;
+			message[i].x = 0xaaaaaaa0ul + (uint32_t)i;
+		}
+		Compression512(h, message);
+
+		// done
+		uint64_t *outpHash = &g_hash[hashPosition * 8];
+
+		#pragma unroll
+		for(int i=0;i<8;i++)
+			outpHash[i] = devectorize(message[i+8]);
+	}
+}
+
+__global__ __launch_bounds__(256, 2)
+void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = startNounce + thread;
+
+		// Init
+		uint2 h[16] = {
+			{ 0x84858687UL, 0x80818283UL },
+			{ 0x8C8D8E8FUL, 0x88898A8BUL },
+			{ 0x94959697UL, 0x90919293UL },
+			{ 0x9C9D9E9FUL, 0x98999A9BUL },
+			{ 0xA4A5A6A7UL, 0xA0A1A2A3UL },
+			{ 0xACADAEAFUL, 0xA8A9AAABUL },
+			{ 0xB4B5B6B7UL, 0xB0B1B2B3UL },
+			{ 0xBCBDBEBFUL, 0xB8B9BABBUL },
+			{ 0xC4C5C6C7UL, 0xC0C1C2C3UL },
+			{ 0xCCCDCECFUL, 0xC8C9CACBUL },
+			{ 0xD4D5D6D7UL, 0xD0D1D2D3UL },
+			{ 0xDCDDDEDFUL, 0xD8D9DADBUL },
+			{ 0xE4E5E6E7UL, 0xE0E1E2E3UL },
+			{ 0xECEDEEEFUL, 0xE8E9EAEBUL },
+			{ 0xF4F5F6F7UL, 0xF0F1F2F3UL },
+			{ 0xFCFDFEFFUL, 0xF8F9FAFBUL }
+		};
+		// Copy the message (the padded 80-byte block header prepared on the host)
+		uint2 message[16];
+#pragma unroll 16
+		for(int i=0;i<16;i++)
+			message[i] = vectorize(c_PaddedMessage80[i]);
+
+		// replace the nonce with this thread's nonce
+		message[9].y = cuda_swab32(nounce); //REPLACE_HIDWORD(message[9], cuda_swab32(nounce));
+
+		// Compression 1
+		Compression512(message, h);
+
+#pragma unroll 16
+		for(int i=0;i<16;i++)
+			message[i] = make_uint2(0xaaaaaaa0+i,0xaaaaaaaa);
+
+		Compression512(h, message);
+
+		// done
+		uint64_t *outpHash = &g_hash[thread * 8];
+
+#pragma unroll 8
+		for(int i=0;i<8;i++)
+			outpHash[i] = devectorize(message[i+8]);
+	}
+}
+
+__host__
+void quark_bmw512_cpu_setBlock_80(void *pdata)
+{
+	unsigned char PaddedMessage[128];
+	memcpy(PaddedMessage, pdata, 80);
+	memset(PaddedMessage+80, 0, 48);
+	uint64_t *message = (uint64_t*)PaddedMessage;
+	message[10] = SPH_C64(0x80);
+	message[15] = SPH_C64(640);
+	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+}
+
+__host__
+void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	int dev_id = device_map[thr_id];
+
+	if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
+		quark_bmw512_gpu_hash_80<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
+	else
+		quark_bmw512_gpu_hash_80_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
+}
+
+__host__
+void quark_bmw512_cpu_init(int thr_id, uint32_t threads)
+{
+	cuda_get_arch(thr_id);
+}
+
+__host__
+void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+{
+	const uint32_t threadsperblock = 32;
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	int dev_id = device_map[thr_id];
+	if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
+		quark_bmw512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	else
+		quark_bmw512_gpu_hash_64_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+}
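For reference, a minimal host-side sketch of how these wrappers are typically driven (illustrative only; the surrounding scanhash variables `thr_id`, `throughput`, `endiandata`, `pdata` and `d_hash` are assumptions, not part of this change):

	quark_bmw512_cpu_init(thr_id, throughput);               // query the device arch once
	quark_bmw512_cpu_setBlock_80((void*) endiandata);        // upload the 80-byte header + padding
	quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash, 0); // hash one nonce range

diff --git 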
a/quark/cuda_bmw512_sm3.cuh b/quark/cuda_bmw512_sm3.cuh new file mode 100644 index 0000000000..1298b13c85 --- /dev/null +++ b/quark/cuda_bmw512_sm3.cuh @@ -0,0 +1,271 @@ +#include +#include + +#ifdef __INTELLISENSE__ +/* for vstudio code colors */ +#define __CUDA_ARCH__ 300 +#endif + +#include "cuda_helper.h" + +#define SHL(x, n) ((x) << (n)) +#define SHR(x, n) ((x) >> (n)) + +#define CONST_EXP2 \ + q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \ + q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + SWAPDWORDS(q[i+7]) + \ + q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \ + q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500) || defined(_DEBUG) +__constant__ uint64_t d_constMem[16] = { + SPH_C64(0x8081828384858687), + SPH_C64(0x88898A8B8C8D8E8F), + SPH_C64(0x9091929394959697), + SPH_C64(0x98999A9B9C9D9E9F), + SPH_C64(0xA0A1A2A3A4A5A6A7), + SPH_C64(0xA8A9AAABACADAEAF), + SPH_C64(0xB0B1B2B3B4B5B6B7), + SPH_C64(0xB8B9BABBBCBDBEBF), + SPH_C64(0xC0C1C2C3C4C5C6C7), + SPH_C64(0xC8C9CACBCCCDCECF), + SPH_C64(0xD0D1D2D3D4D5D6D7), + SPH_C64(0xD8D9DADBDCDDDEDF), + SPH_C64(0xE0E1E2E3E4E5E6E7), + SPH_C64(0xE8E9EAEBECEDEEEF), + SPH_C64(0xF0F1F2F3F4F5F6F7), + SPH_C64(0xF8F9FAFBFCFDFEFF) +}; +# ifdef __GNUC__ +// windows and linux doesnt require the same ifdef for __constant__ +# pragma GCC diagnostic ignored "-Wunused-variable" +# endif +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 500 || defined(_DEBUG)) + +__device__ +void Compression512_30(uint64_t *msg, uint64_t *hash) +{ + // Compression ref. implementation + uint64_t tmp; + uint64_t q[32]; + + tmp = (msg[ 5] ^ hash[ 5]) - (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + (msg[14] ^ hash[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1]; + tmp = (msg[ 6] ^ hash[ 6]) - (msg[ 8] ^ hash[ 8]) + (msg[11] ^ hash[11]) + (msg[14] ^ hash[14]) - (msg[15] ^ hash[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2]; + tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3]; + tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 1] ^ hash[ 1]) + (msg[ 8] ^ hash[ 8]) - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4]; + tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]) - (msg[11] ^ hash[11]) - (msg[14] ^ hash[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; + tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 2] ^ hash[ 2]) + (msg[10] ^ hash[10]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6]; + tmp = (msg[ 4] ^ hash[ 4]) - (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) - (msg[11] ^ hash[11]) + (msg[13] ^ hash[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7]; + tmp = (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]) - (msg[14] ^ hash[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8]; + tmp = (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) - (msg[ 6] ^ hash[ 6]) + (msg[13] ^ hash[13]) - (msg[15] ^ hash[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9]; + tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ 
hash[ 3]) + (msg[ 6] ^ hash[ 6]) - (msg[ 7] ^ hash[ 7]) + (msg[14] ^ hash[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; + tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 7] ^ hash[ 7]) + (msg[15] ^ hash[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11]; + tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) + (msg[ 9] ^ hash[ 9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[12]; + tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 3] ^ hash[ 3]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[10] ^ hash[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13]; + tmp = (msg[ 2] ^ hash[ 2]) + (msg[ 4] ^ hash[ 4]) + (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[11] ^ hash[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14]; + tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 5] ^ hash[ 5]) + (msg[ 8] ^ hash[ 8]) - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; + tmp = (msg[12] ^ hash[12]) - (msg[ 4] ^ hash[ 4]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[13] ^ hash[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0]; + + // Expand 1 +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ ROTL64(q[i], 13) ^ ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ ROTL64(q[i+1], 19) ^ ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ ROTL64(q[i+2], 28) ^ ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ ROTL64(q[i+3], 4) ^ ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ ROTL64(q[i+4], 13) ^ ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ ROTL64(q[i+5], 19) ^ ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ ROTL64(q[i+6], 28) ^ ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ ROTL64(q[i+7], 4) ^ ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ ROTL64(q[i+8], 13) ^ ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ ROTL64(q[i+9], 19) ^ ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ ROTL64(q[i+10], 28) ^ ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ ROTL64(q[i+11], 4) ^ ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ ROTL64(q[i+12], 13) ^ ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ ROTL64(q[i+13], 19) ^ ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ ROTL64(q[i+14], 28) ^ ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ ROTL64(q[i+15], 4) ^ ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) + + ROTL64(msg[i+3], i+4) - ROTL64(msg[i+10], i+11) ) ^ hash[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) + + ROTL64(msg[i+3], i+4) - ROTL64(msg[i+10], i+11) ) ^ hash[i+7]); + } +#pragma unroll 3 + for(int i=6;i<9;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) + + ROTL64(msg[i+3], i+4) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i+7]); + } +#pragma unroll 4 + for(int i=9;i<13;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) + + ROTL64(msg[i+3], i+4) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i-9]); + } +#pragma unroll 3 + for(int i=13;i<16;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], 
i+1) + + ROTL64(msg[i-13], (i-13)+1) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i-9]); + } + + uint64_t XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + uint64_t XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + hash[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ msg[ 0]) + ( XL64 ^ q[24] ^ q[ 0]); + hash[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ msg[ 1]) + ( XL64 ^ q[25] ^ q[ 1]); + hash[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ msg[ 2]) + ( XL64 ^ q[26] ^ q[ 2]); + hash[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ msg[ 3]) + ( XL64 ^ q[27] ^ q[ 3]); + hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[ 4]) + ( XL64 ^ q[28] ^ q[ 4]); + hash[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ msg[ 5]) + ( XL64 ^ q[29] ^ q[ 5]); + hash[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ msg[ 6]) + ( XL64 ^ q[30] ^ q[ 6]); + hash[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ msg[ 7]) + ( XL64 ^ q[31] ^ q[ 7]); + + hash[ 8] = ROTL64(hash[4], 9) + ( XH64 ^ q[24] ^ msg[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]); + hash[ 9] = ROTL64(hash[5],10) + ( XH64 ^ q[25] ^ msg[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]); + hash[10] = ROTL64(hash[6],11) + ( XH64 ^ q[26] ^ msg[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + hash[11] = ROTL64(hash[7],12) + ( XH64 ^ q[27] ^ msg[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + hash[12] = ROTL64(hash[0],13) + ( XH64 ^ q[28] ^ msg[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + hash[13] = ROTL64(hash[1],14) + ( XH64 ^ q[29] ^ msg[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + hash[14] = ROTL64(hash[2],15) + ( XH64 ^ q[30] ^ msg[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + hash[15] = ROTL64(hash[3],16) + ( XH64 ^ q[31] ^ msg[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); +} + +__global__ +void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + uint32_t hashPosition = nounce - startNounce; + uint64_t *inpHash = &g_hash[hashPosition * 8]; + + // Init + uint64_t h[16]; + /* + h[ 0] = SPH_C64(0x8081828384858687); + h[ 1] = SPH_C64(0x88898A8B8C8D8E8F); + h[ 2] = SPH_C64(0x9091929394959697); + h[ 3] = SPH_C64(0x98999A9B9C9D9E9F); + h[ 4] = SPH_C64(0xA0A1A2A3A4A5A6A7); + h[ 5] = SPH_C64(0xA8A9AAABACADAEAF); + h[ 6] = SPH_C64(0xB0B1B2B3B4B5B6B7); + h[ 7] = SPH_C64(0xB8B9BABBBCBDBEBF); + h[ 8] = SPH_C64(0xC0C1C2C3C4C5C6C7); + h[ 9] = SPH_C64(0xC8C9CACBCCCDCECF); + h[10] = SPH_C64(0xD0D1D2D3D4D5D6D7); + h[11] = SPH_C64(0xD8D9DADBDCDDDEDF); + h[12] = SPH_C64(0xE0E1E2E3E4E5E6E7); + h[13] = SPH_C64(0xE8E9EAEBECEDEEEF); + h[14] = SPH_C64(0xF0F1F2F3F4F5F6F7); + h[15] = SPH_C64(0xF8F9FAFBFCFDFEFF); + */ + + // Nachricht kopieren (Achtung, die Nachricht hat 64 Byte, + // BMW arbeitet mit 128 Byte!!! 
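+	// i.e. build the 128-byte BMW block: words 0..7 = input hash, word 8 = 0x80
+	// padding marker, words 9..14 = 0, word 15 = message length in bits (512)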
+ uint64_t message[16]; + + #pragma unroll 8 + for(int i=0;i<8;i++) + message[i] = inpHash[i]; + + #pragma unroll 6 + for(int i=9;i<15;i++) + message[i] = 0; + + message[8] = SPH_C64(0x80); + // 64 Bytes * 8 = 512 Bits + message[15] = SPH_C64(512); + + #pragma unroll 16 + for(int i=0;i<16;i++) + h[i] = d_constMem[i]; + + // Compression 1 + Compression512_30(message, h); + + // Final + #pragma unroll 16 + for(int i=0;i<16;i++) + message[i] = 0xaaaaaaaaaaaaaaa0ull + (uint64_t)i; + + Compression512_30(h, message); + + uint64_t *outpHash = &g_hash[hashPosition * 8]; + + #pragma unroll 8 + for(int i=0;i<8;i++) + outpHash[i] = message[i+8]; + } +} + +__global__ +void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + + // Init + uint64_t h[16]; + #pragma unroll 16 + for(int i=0;i<16;i++) + h[i] = d_constMem[i]; + + uint64_t message[16]; + #pragma unroll 16 + for(int i=0;i<16;i++) + message[i] = c_PaddedMessage80[i]; + + message[9] = REPLACE_HIDWORD(message[9], cuda_swab32(nounce)); + + // Compression 1 + Compression512_30(message, h); + + // Final + #pragma unroll 16 + for(int i=0;i<16;i++) + message[i] = 0xaaaaaaaaaaaaaaa0ull + (uint64_t)i; + + Compression512_30(h, message); + + uint2 *outpHash = (uint2*) (&g_hash[thread * 8U]); + #pragma unroll 8 + for(int i=0;i<8;i++) + outpHash[i] = ((uint2*)message)[i+8]; + } +} + +#else /* stripped stubs for other archs */ +__global__ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} +__global__ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} +#endif + diff --git a/quark/cuda_jh512.cu b/quark/cuda_jh512.cu index c55ac1a977..3a1bbabe82 100644 --- a/quark/cuda_jh512.cu +++ b/quark/cuda_jh512.cu @@ -1,356 +1,508 @@ -#include - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -typedef struct { - uint32_t x[8][4]; /*the 1024-bit state, ( x[i][0] || x[i][1] || x[i][2] || x[i][3] ) is the ith row of the state in the pseudocode*/ - uint32_t buffer[16]; /*the 512-bit message block to be hashed;*/ -} hashState; - -/*42 round constants, each round constant is 32-byte (256-bit)*/ -__constant__ uint32_t c_INIT_bitslice[8][4]; -__constant__ unsigned char c_E8_bitslice_roundconstant[42][32]; - -const uint32_t h_INIT_bitslice[8][4] = { - { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a}, - { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2}, - { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea}, - { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba}, - { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e}, - { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d}, - { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657}, - { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc} }; - -const unsigned char h_E8_bitslice_roundconstant[42][32]={ -{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40}, -{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31}, -{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc}, 
-{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3}, -{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23}, -{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97}, -{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14}, -{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4}, -{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36}, -{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f}, -{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b}, -{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62}, -{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5}, -{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f}, -{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a}, -{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf}, -{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0}, -{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a}, -{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6}, -{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67}, -{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18}, -{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e}, -{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1}, -{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83}, 
-{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef}, -{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65}, -{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c}, -{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71}, -{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0}, -{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f}, -{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad}, -{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6}, -{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63}, -{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f}, -{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a}, -{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5}, -{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48}, -{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e}, -{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7}, -{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde}, -{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a}, -{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; - -/*swapping bit 2i with bit 2i+1 of 32-bit x*/ -#define SWAP1(x) (x) = ((((x) & 0x55555555UL) << 1) | (((x) & 0xaaaaaaaaUL) >> 1)); -/*swapping bits 4i||4i+1 with bits 4i+2||4i+3 of 32-bit x*/ -#define SWAP2(x) (x) = ((((x) & 0x33333333UL) << 2) | (((x) & 0xccccccccUL) >> 2)); -/*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of 32-bit x*/ -#define SWAP4(x) (x) = ((((x) & 0x0f0f0f0fUL) << 4) | (((x) & 0xf0f0f0f0UL) >> 4)); -/*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ -//#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 
8)); -#define SWAP8(x) (x) = __byte_perm(x, x, 0x2301); -/*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 32-bit x*/ -//#define SWAP16(x) (x) = ((((x) & 0x0000ffffUL) << 16) | (((x) & 0xffff0000UL) >> 16)); -#define SWAP16(x) (x) = __byte_perm(x, x, 0x1032); - -/*The MDS transform*/ -#define L(m0,m1,m2,m3,m4,m5,m6,m7) \ - (m4) ^= (m1); \ - (m5) ^= (m2); \ - (m6) ^= (m0) ^ (m3); \ - (m7) ^= (m0); \ - (m0) ^= (m5); \ - (m1) ^= (m6); \ - (m2) ^= (m4) ^ (m7); \ - (m3) ^= (m4); - -/*The Sbox*/ -#define Sbox(m0,m1,m2,m3,cc) \ - m3 = ~(m3); \ - m0 ^= ((~(m2)) & (cc)); \ - temp0 = (cc) ^ ((m0) & (m1));\ - m0 ^= ((m2) & (m3)); \ - m3 ^= ((~(m1)) & (m2)); \ - m1 ^= ((m0) & (m2)); \ - m2 ^= ((m0) & (~(m3))); \ - m0 ^= ((m1) | (m3)); \ - m3 ^= ((m1) & (m2)); \ - m1 ^= (temp0 & (m0)); \ - m2 ^= temp0; - -__device__ __forceinline__ void Sbox_and_MDS_layer(hashState* state, uint32_t roundnumber) -{ - uint32_t temp0; - uint32_t cc0, cc1; - //Sbox and MDS layer -#pragma unroll 4 - for (int i = 0; i < 4; i++) { - cc0 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i]; - cc1 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i+4]; - Sbox(state->x[0][i],state->x[2][i], state->x[4][i], state->x[6][i], cc0); - Sbox(state->x[1][i],state->x[3][i], state->x[5][i], state->x[7][i], cc1); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - } -} - -__device__ __forceinline__ void RoundFunction0(hashState* state, uint32_t roundnumber) -{ - Sbox_and_MDS_layer(state, roundnumber); - -#pragma unroll 4 - for (int j = 1; j < 8; j = j+2) - { -#pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP1(state->x[j][i]); - } -} - -__device__ __forceinline__ void RoundFunction1(hashState* state, uint32_t roundnumber) -{ - Sbox_and_MDS_layer(state, roundnumber); - -#pragma unroll 4 - for (int j = 1; j < 8; j = j+2) - { -#pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP2(state->x[j][i]); - } -} - -__device__ __forceinline__ void RoundFunction2(hashState* state, uint32_t roundnumber) -{ - Sbox_and_MDS_layer(state, roundnumber); - -#pragma unroll 4 - for (int j = 1; j < 8; j = j+2) - { -#pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP4(state->x[j][i]); - } -} - -__device__ __forceinline__ void RoundFunction3(hashState* state, uint32_t roundnumber) -{ - Sbox_and_MDS_layer(state, roundnumber); - -#pragma unroll 4 - for (int j = 1; j < 8; j = j+2) - { -#pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP8(state->x[j][i]); - } -} - -__device__ __forceinline__ void RoundFunction4(hashState* state, uint32_t roundnumber) -{ - Sbox_and_MDS_layer(state, roundnumber); - -#pragma unroll 4 - for (int j = 1; j < 8; j = j+2) - { -#pragma unroll 4 - for (int i = 0; i < 4; i++) SWAP16(state->x[j][i]); - } -} - -__device__ __forceinline__ void RoundFunction5(hashState* state, uint32_t roundnumber) -{ - uint32_t temp0; - - Sbox_and_MDS_layer(state, roundnumber); - -#pragma unroll 4 - for (int j = 1; j < 8; j = j+2) - { -#pragma unroll 2 - for (int i = 0; i < 4; i = i+2) { - temp0 = state->x[j][i]; state->x[j][i] = state->x[j][i+1]; state->x[j][i+1] = temp0; - } - } -} - -__device__ __forceinline__ void RoundFunction6(hashState* state, uint32_t roundnumber) -{ - uint32_t temp0; - - Sbox_and_MDS_layer(state, roundnumber); - -#pragma unroll 4 - for (int j = 1; j < 8; j = j+2) - { -#pragma unroll 2 - for (int i = 0; i < 2; i++) { - temp0 = state->x[j][i]; state->x[j][i] = state->x[j][i+2]; state->x[j][i+2] = temp0; - } - } -} - 
-/*The bijective function E8, in bitslice form */ -__device__ __forceinline__ void E8(hashState *state) -{ - /*perform 6 rounds*/ -//#pragma unroll 6 - for (int i = 0; i < 42; i+=7) - { - RoundFunction0(state, i); - RoundFunction1(state, i+1); - RoundFunction2(state, i+2); - RoundFunction3(state, i+3); - RoundFunction4(state, i+4); - RoundFunction5(state, i+5); - RoundFunction6(state, i+6); - } -} - -/*The compression function F8 */ -__device__ __forceinline__ void F8(hashState *state) -{ - /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ -#pragma unroll 16 - for (int i = 0; i < 16; i++) state->x[i >> 2][i & 3] ^= ((uint32_t*)state->buffer)[i]; - - /*the bijective function E8 */ - E8(state); - - /*xor the 512-bit message with the second half of the 1024-bit hash state*/ -#pragma unroll 16 - for (int i = 0; i < 16; i++) state->x[(16+i) >> 2][(16+i) & 3] ^= ((uint32_t*)state->buffer)[i]; -} - - -__device__ __forceinline__ void JHHash(const uint32_t *data, uint32_t *hashval) -{ - hashState state; - - /*load the intital hash value H0 into state*/ - /* - #define INIT(a,b,c,d) ((a) | ((b)<<8) | ((c)<<16) | ((d)<<24)) - state.x[0][0] = INIT(0x6f,0xd1,0x4b,0x96); - state.x[0][1] = INIT(0x3e,0x00,0xaa,0x17); - state.x[0][2] = INIT(0x63,0x6a,0x2e,0x05); - state.x[0][3] = INIT(0x7a,0x15,0xd5,0x43); - state.x[1][0] = INIT(0x8a,0x22,0x5e,0x8d); - state.x[1][1] = INIT(0x0c,0x97,0xef,0x0b); - state.x[1][2] = INIT(0xe9,0x34,0x12,0x59); - state.x[1][3] = INIT(0xf2,0xb3,0xc3,0x61); - state.x[2][0] = INIT(0x89,0x1d,0xa0,0xc1); - state.x[2][1] = INIT(0x53,0x6f,0x80,0x1e); - state.x[2][2] = INIT(0x2a,0xa9,0x05,0x6b); - state.x[2][3] = INIT(0xea,0x2b,0x6d,0x80); - state.x[3][0] = INIT(0x58,0x8e,0xcc,0xdb); - state.x[3][1] = INIT(0x20,0x75,0xba,0xa6); - state.x[3][2] = INIT(0xa9,0x0f,0x3a,0x76); - state.x[3][3] = INIT(0xba,0xf8,0x3b,0xf7); - state.x[4][0] = INIT(0x01,0x69,0xe6,0x05); - state.x[4][1] = INIT(0x41,0xe3,0x4a,0x69); - state.x[4][2] = INIT(0x46,0xb5,0x8a,0x8e); - state.x[4][3] = INIT(0x2e,0x6f,0xe6,0x5a); - state.x[5][0] = INIT(0x10,0x47,0xa7,0xd0); - state.x[5][1] = INIT(0xc1,0x84,0x3c,0x24); - state.x[5][2] = INIT(0x3b,0x6e,0x71,0xb1); - state.x[5][3] = INIT(0x2d,0x5a,0xc1,0x99); - state.x[6][0] = INIT(0xcf,0x57,0xf6,0xec); - state.x[6][1] = INIT(0x9d,0xb1,0xf8,0x56); - state.x[6][2] = INIT(0xa7,0x06,0x88,0x7c); - state.x[6][3] = INIT(0x57,0x16,0xb1,0x56); - state.x[7][0] = INIT(0xe3,0xc2,0xfc,0xdf); - state.x[7][1] = INIT(0xe6,0x85,0x17,0xfb); - state.x[7][2] = INIT(0x54,0x5a,0x46,0x78); - state.x[7][3] = INIT(0xcc,0x8c,0xdd,0x4b); - */ -#pragma unroll 8 - for(int j=0;j<8;j++) - { -#pragma unroll 4 - for(int i=0;i<4;i++) - state.x[j][i] = c_INIT_bitslice[j][i]; - } - -#pragma unroll 16 - for (int i=0; i < 16; ++i) state.buffer[i] = data[i]; - F8(&state); - - /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ - state.buffer[0] = 0x80; -#pragma unroll 14 - for (int i=1; i < 15; i++) state.buffer[i] = 0; - state.buffer[15] = 0x00020000; - F8(&state); - - /*truncating the final hash value to generate the message digest*/ -#pragma unroll 16 - for (int i=0; i < 16; ++i) hashval[i] = state.x[4][i]; -} - -// Die Hash-Funktion -__global__ void quark_jh512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - - JHHash(Hash, Hash); - } -} - - -// Setup-Funktionen -__host__ void quark_jh512_cpu_init(int thr_id, int threads) -{ - - cudaMemcpyToSymbol( c_E8_bitslice_roundconstant, - h_E8_bitslice_roundconstant, - sizeof(h_E8_bitslice_roundconstant), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( c_INIT_bitslice, - h_INIT_bitslice, - sizeof(h_INIT_bitslice), - 0, cudaMemcpyHostToDevice); -} - -__host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - quark_jh512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); -} - +/** + * JH512 64 and 80 kernels + * + * JH80 by tpruvot - 2017 - under GPLv3 + **/ +#include + +// #include // printf +// #include // sleep + +/* 1344 bytes, align 16 is there to allow ld.const.v4 (made auto. by the compiler) */ +__constant__ static __align__(16) uint32_t c_E8_bslice32[42][8] = { + // Round 0 (Function0) + { 0xa2ded572, 0x90d6ab81, 0x67f815df, 0xf6875a4d, 0x0a15847b, 0xc54f9f4e, 0x571523b7, 0x402bd1c3 }, + { 0xe03a98ea, 0xb4960266, 0x9cfa455c, 0x8a53bbf2, 0x99d2c503, 0x1a1456b5, 0x9a99b266, 0x31a2db88 }, // 1 + { 0x5c5aa303, 0x8019051c, 0xdb0e199a, 0x1d959e84, 0x0ab23f40, 0xadeb336f, 0x1044c187, 0xdccde75e }, // 2 + { 0x9213ba10, 0x39812c0a, 0x416bbf02, 0x5078aa37, 0x156578dc, 0xd2bf1a3f, 0xd027bbf7, 0xd3910041 }, // 3 + { 0x0d5a2d42, 0x0ba75c18, 0x907eccf6, 0xac442bc7, 0x9c9f62dd, 0xd665dfd1, 0xce97c092, 0x23fcc663 }, // 4 + { 0x036c6e97, 0xbb03f1ee, 0x1ab8e09e, 0xfa618e5d, 0x7e450521, 0xb29796fd, 0xa8ec6c44, 0x97818394 }, // 5 + { 0x37858e4a, 0x8173fe8a, 0x2f3003db, 0x6c69b8f8, 0x2d8d672a, 0x4672c78a, 0x956a9ffb, 0x14427fc0 }, // 6 + // Round 7 (Function0) + { 0x8f15f4c5, 0xb775de52, 0xc45ec7bd, 0xbc88e4ae, 0xa76f4475, 0x1e00b882, 0x80bb118f, 0xf4a3a698 }, + { 0x338ff48e, 0x20edf1b6, 0x1563a3a9, 0xfde05a7c, 0x24565faa, 0x5ae9ca36, 0x89f9b7d5, 0x362c4206 }, + { 0x433529ce, 0x591ff5d0, 0x3d98fe4e, 0x86814e6f, 0x74f93a53, 0x81ad9d0e, 0xa74b9a73, 0x9f5ad8af }, + { 0x670605a7, 0x26077447, 0x6a6234ee, 0x3f1080c6, 0xbe280b8b, 0x6f7ea0e0, 0x2717b96e, 0x7b487ec6 }, + { 0xa50a550d, 0x81727686, 0xc0a4f84a, 0xd48d6050, 0x9fe7e391, 0x415a9e7e, 0x9ef18e97, 0x62b0e5f3 }, + { 0xec1f9ffc, 0xf594d74f, 0x7a205440, 0xd895fa9d, 0x001ae4e3, 0x117e2e55, 0x84c9f4ce, 0xa554c324 }, + { 0x2872df5b, 0xef7c8905, 0x286efebd, 0x2ed349ee, 0xe27ff578, 0x85937e44, 0xb2c4a50f, 0x7f5928eb }, + // Round 14 (Function0) + { 0x37695f70, 0x04771bc7, 0x4a3124b3, 0xe720b951, 0xf128865e, 0xe843fe74, 0x65e4d61d, 0x8a87d423 }, + { 0xa3e8297d, 0xfb301b1d, 0xf2947692, 0xe01bdc5b, 0x097acbdd, 0x4f4924da, 0xc1d9309b, 0xbf829cf2 }, + { 0x31bae7a4, 0x32fcae3b, 0xffbf70b4, 0x39d3bb53, 0x0544320d, 0xc1c39f45, 0x48bcf8de, 0xa08b29e0 }, + { 0xfd05c9e5, 0x01b771a2, 0x0f09aef7, 0x95ed44e3, 0x12347094, 0x368e3be9, 0x34f19042, 0x4a982f4f }, + { 0x631d4088, 0xf14abb7e, 0x15f66ca0, 0x30c60ae2, 0x4b44c147, 0xc5b67046, 0xffaf5287, 0xe68c6ecc }, + { 0x56a4d5a4, 0x45ce5773, 0x00ca4fbd, 0xadd16430, 0x4b849dda, 0x68cea6e8, 0xae183ec8, 0x67255c14 }, + { 0xf28cdaa3, 
0x20b2601f, 0x16e10ecb, 0x7b846fc2, 0x5806e933, 0x7facced1, 0x9a99949a, 0x1885d1a0 }, + // Round 21 (Function0) + { 0xa15b5932, 0x67633d9f, 0xd319dd8d, 0xba6b04e4, 0xc01c9a50, 0xab19caf6, 0x46b4a5aa, 0x7eee560b }, + { 0xea79b11f, 0x5aac571d, 0x742128a9, 0x76d35075, 0x35f7bde9, 0xfec2463a, 0xee51363b, 0x01707da3 }, + { 0xafc135f7, 0x15638341, 0x42d8a498, 0xa8db3aea, 0x20eced78, 0x4d3bc3fa, 0x79676b9e, 0x832c8332 }, + { 0x1f3b40a7, 0x6c4e3ee7, 0xf347271c, 0xfd4f21d2, 0x34f04059, 0x398dfdb8, 0x9a762db7, 0xef5957dc }, + { 0x490c9b8d, 0xd0ae3b7d, 0xdaeb492b, 0x84558d7a, 0x49d7a25b, 0xf0e9a5f5, 0x0d70f368, 0x658ef8e4 }, + { 0xf4a2b8a0, 0x92946891, 0x533b1036, 0x4f88e856, 0x9e07a80c, 0x555cb05b, 0x5aec3e75, 0x4cbcbaf8 }, + { 0x993bbbe3, 0x28acae64, 0x7b9487f3, 0x6db334dc, 0xd6f4da75, 0x50a5346c, 0x5d1c6b72, 0x71db28b8 }, + // Round 28 (Function0) + { 0xf2e261f8, 0xf1bcac1c, 0x2a518d10, 0xa23fce43, 0x3364dbe3, 0x3cd1bb67, 0xfc75dd59, 0xb043e802 }, + { 0xca5b0a33, 0xc3943b92, 0x75a12988, 0x1e4d790e, 0x4d19347f, 0xd7757479, 0x5c5316b4, 0x3fafeeb6 }, + { 0xf7d4a8ea, 0x5324a326, 0x21391abe, 0xd23c32ba, 0x097ef45c, 0x4a17a344, 0x5127234c, 0xadd5a66d }, + { 0xa63e1db5, 0xa17cf84c, 0x08c9f2af, 0x4d608672, 0x983d5983, 0xcc3ee246, 0x563c6b91, 0xf6c76e08 }, + { 0xb333982f, 0xe8b6f406, 0x5e76bcb1, 0x36d4c1be, 0xa566d62b, 0x1582ee74, 0x2ae6c4ef, 0x6321efbc }, + { 0x0d4ec1fd, 0x1614c17e, 0x69c953f4, 0x16fae006, 0xc45a7da7, 0x3daf907e, 0x26585806, 0x3f9d6328 }, + { 0xe3f2c9d2, 0x16512a74, 0x0cd29b00, 0x9832e0f2, 0x30ceaa5f, 0xd830eb0d, 0x300cd4b7, 0x9af8cee3 }, + // Round 35 (Function0) + { 0x7b9ec54b, 0x574d239b, 0x9279f1b5, 0x316796e6, 0x6ee651ff, 0xf3a6e6cc, 0xd3688604, 0x05750a17 }, + { 0xd98176b1, 0xb3cb2bf4, 0xce6c3213, 0x47154778, 0x8452173c, 0x825446ff, 0x62a205f8, 0x486a9323 }, + { 0x0758df38, 0x442e7031, 0x65655e4e, 0x86ca0bd0, 0x897cfcf2, 0xa20940f0, 0x8e5086fc, 0x4e477830 }, + { 0x39eea065, 0x26b29721, 0x8338f7d1, 0x6ff81301, 0x37e95ef7, 0xd1ed44a3, 0xbd3a2ce4, 0xe7de9fef }, + { 0x15dfa08b, 0x7ceca7d8, 0xd9922576, 0x7eb027ab, 0xf6f7853c, 0xda7d8d53, 0xbe42dc12, 0xdea83eaa }, + { 0x93ce25aa, 0xdaef5fc0, 0xd86902bd, 0xa5194a17, 0xfd43f65a, 0x33664d97, 0xf908731a, 0x6a21fd4c }, + { 0x3198b435, 0xa163d09a, 0x701541db, 0x72409751, 0xbb0f1eea, 0xbf9d75f6, 0x9b54cded, 0xe26f4791 } + // 42 rounds... 
+}; + +/*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 32-bit x*/ +//#define SWAP16(x) (x) = ((((x) & 0x0000ffffUL) << 16) | (((x) & 0xffff0000UL) >> 16)); +#define SWAP16(x) (x) = __byte_perm(x, 0, 0x1032); + +/*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ +//#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8)); +#define SWAP8(x) (x) = __byte_perm(x, 0, 0x2301); + +/* +__device__ __forceinline__ +static void SWAP4(uint32_t &x) { + uint32_t y = x & 0xF0F0F0F0; + x = (x ^ y) << 4; + x |= y >> 4; +} +__device__ __forceinline__ +static void SWAP2(uint32_t &x) { + uint32_t y = (x & 0xCCCCCCCC); + x = (x ^ y) << 2; + x |= y >> 2; +} +__device__ __forceinline__ +static void SWAP1(uint32_t &x) { + uint32_t y = (x & 0xAAAAAAAA); + x = (x ^ y) << 1; + x |= y >> 1; +} +*/ + +__device__ __forceinline__ +static void SWAP4x4(uint32_t *x) { + #pragma nounroll + // y is used as tmp register too + for (uint32_t y=0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xF0F0F0F0;\n\t" + "xor.b32 %0, %0, %1; shr.b32 %1, %1, 4;\n\t" + "vshl.u32.u32.u32.clamp.add %0, %0, 4, %1;" + : "+r"(*x) : "r"(y)); + } +} + +__device__ __forceinline__ +static void SWAP2x4(uint32_t *x) { + #pragma nounroll + // y is used as tmp register too + for (uint32_t y=0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xCCCCCCCC;\n\t" + "xor.b32 %0, %0, %1; shr.b32 %1, %1, 2; \n\t" + "vshl.u32.u32.u32.clamp.add %0, %0, 2, %1;" + : "+r"(*x) : "r"(y)); + } +} + +__device__ __forceinline__ +static void SWAP1x4(uint32_t *x) { + #pragma nounroll + // y is used as tmp register too + for (uint32_t y=0; y<4; y++, ++x) { + asm("and.b32 %1, %0, 0xAAAAAAAA;\n\t" + "xor.b32 %0, %0, %1; shr.b32 %1, %1, 1; \n\t" + "vshl.u32.u32.u32.clamp.add %0, %0, 1, %1;" + : "+r"(*x) : "r"(y)); + } +} + +/* The MDS transform */ +#define L(m0,m1,m2,m3,m4,m5,m6,m7) \ + m4 ^= m1; \ + m5 ^= m2; \ + m6 ^= m0 ^ m3; \ + m7 ^= m0; \ + m0 ^= m5; \ + m1 ^= m6; \ + m2 ^= m4 ^ m7; \ + m3 ^= m4; + +/* The Sbox */ +#define Sbox(m0, m1, m2, m3, cc) \ + m3 = ~(m3); \ + m0 ^= (~(m2)) & cc; \ + temp0 = cc ^ (m0 & m1); \ + m0 ^= m2 & m3; \ + m3 ^= (~(m1)) & m2; \ + m1 ^= m0 & m2; \ + m2 ^= m0 & (~(m3)); \ + m0 ^= m1 | m3; \ + m3 ^= m1 & m2; \ + m1 ^= temp0 & m0; \ + m2 ^= temp0; + +__device__ __forceinline__ +static void Sbox_and_MDS_layer(uint32_t x[8][4], const int rnd) +{ + uint2* cc = (uint2*) &c_E8_bslice32[rnd]; + + // Sbox and MDS layer + #pragma unroll + for (int i = 0; i < 4; i++, ++cc) { + uint32_t temp0; + Sbox(x[0][i], x[2][i], x[4][i], x[6][i], cc->x); + Sbox(x[1][i], x[3][i], x[5][i], x[7][i], cc->y); + L(x[0][i], x[2][i], x[4][i], x[6][i], x[1][i], x[3][i], x[5][i], x[7][i]); + } +} + +__device__ __forceinline__ +static void RoundFunction0(uint32_t x[8][4], const int rnd) +{ + Sbox_and_MDS_layer(x, rnd + 0); // 0, 7, 14 .. 
35 + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { // 1, 3, 5, 7 (Even) + SWAP1x4(x[j]); + // SWAP1(x[j][0]); SWAP1(x[j][1]); SWAP1(x[j][2]); SWAP1(x[j][3]); + } +} + +__device__ __forceinline__ +static void RoundFunction1(uint32_t x[8][4], const int rnd) +{ + Sbox_and_MDS_layer(x, rnd + 1); + + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { + SWAP2x4(x[j]); + // SWAP2(x[j][0]); SWAP2(x[j][1]); SWAP2(x[j][2]); SWAP2(x[j][3]); + } +} + +__device__ __forceinline__ +static void RoundFunction2(uint32_t x[8][4], const int rnd) +{ + Sbox_and_MDS_layer(x, rnd + 2); + + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { + SWAP4x4(x[j]); + // SWAP4(x[j][0]); SWAP4(x[j][1]); SWAP4(x[j][2]); SWAP4(x[j][3]); + } +} + +__device__ __forceinline__ +static void RoundFunction3(uint32_t x[8][4], const int rnd) +{ + Sbox_and_MDS_layer(x, rnd + 3); + + //uint32_t* xj = x[j]; + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) { + SWAP8(x[j][0]); + SWAP8(x[j][1]); + SWAP8(x[j][2]); + SWAP8(x[j][3]); + } +} + +__device__ __forceinline__ +static void RoundFunction4(uint32_t x[8][4], const int rnd) +{ + Sbox_and_MDS_layer(x, rnd + 4); + + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) + { + //uint32_t* xj = x[j]; + #pragma unroll + for (int i = 0; i < 4; i++) + SWAP16(x[j][i]); + } +} + +__device__ __forceinline__ +static void RoundFunction5(uint32_t x[8][4], const int rnd) +{ + Sbox_and_MDS_layer(x, rnd + 5); + + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) + { + xchg(x[j][0], x[j][1]); + xchg(x[j][2], x[j][3]); + } +} + +__device__ __forceinline__ +static void RoundFunction6(uint32_t x[8][4], const int rnd) +{ + Sbox_and_MDS_layer(x, rnd + 6); + + #pragma unroll 4 + for (int j = 1; j < 8; j += 2) + { + xchg(x[j][0], x[j][2]); + xchg(x[j][1], x[j][3]); + } +} + +/* The bijective function E8, in bitslice form */ +__device__ +static void E8(uint32_t x[8][4]) +{ + /* perform 6 loops of 7 rounds */ + for (int r = 0; r < 42; r += 7) + { + RoundFunction0(x, r); + RoundFunction1(x, r); + RoundFunction2(x, r); + RoundFunction3(x, r); + RoundFunction4(x, r); + RoundFunction5(x, r); + RoundFunction6(x, r); + } +} + +__global__ +//__launch_bounds__(256,2) +void quark_jh512_gpu_hash_64(const uint32_t threads, const uint32_t startNounce, uint32_t* g_hash, uint32_t * g_nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + uint32_t *Hash = &g_hash[(size_t)16 * hashPosition]; + + uint32_t h[16]; + AS_UINT4(&h[ 0]) = AS_UINT4(&Hash[ 0]); + AS_UINT4(&h[ 4]) = AS_UINT4(&Hash[ 4]); + AS_UINT4(&h[ 8]) = AS_UINT4(&Hash[ 8]); + AS_UINT4(&h[12]) = AS_UINT4(&Hash[12]); + + uint32_t x[8][4] = { /* init */ + { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, + { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, + { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, + { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, + { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, + { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, + { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, + { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } + }; + + #pragma unroll + for (int i = 0; i < 16; i++) + x[i/4][i & 3] ^= h[i]; + + E8(x); + + #pragma unroll + for (int i = 0; i < 16; i++) + x[(i+16)/4][(i+16) & 3] ^= h[i]; + + x[0][0] ^= 0x80U; + x[3][3] ^= 0x00020000U; + + E8(x); + + x[4][0] ^= 0x80U; + x[7][3] ^= 0x00020000U; + + AS_UINT4(&Hash[ 0]) = AS_UINT4(&x[4][0]); + AS_UINT4(&Hash[ 4]) = AS_UINT4(&x[5][0]); + AS_UINT4(&Hash[ 8]) = AS_UINT4(&x[6][0]); + AS_UINT4(&Hash[12]) = AS_UINT4(&x[7][0]); + } +} + +__host__ +void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + quark_jh512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); +} + +// Setup function +__host__ void quark_jh512_cpu_init(int thr_id, uint32_t threads) {} + +#define WANT_JH80_MIDSTATE +#ifdef WANT_JH80 + +__constant__ +static uint32_t c_PaddedMessage80[20]; // padded message (80 bytes) + +__host__ +void jh512_setBlock_80(int thr_id, uint32_t *endiandata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +__global__ +void jh512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t h[20]; + AS_UINT4(&h[ 0]) = AS_UINT4(&c_PaddedMessage80[ 0]); + AS_UINT4(&h[ 4]) = AS_UINT4(&c_PaddedMessage80[ 4]); + AS_UINT4(&h[ 8]) = AS_UINT4(&c_PaddedMessage80[ 8]); + AS_UINT4(&h[12]) = AS_UINT4(&c_PaddedMessage80[12]); + AS_UINT2(&h[16]) = AS_UINT2(&c_PaddedMessage80[16]); + h[18] = c_PaddedMessage80[18]; + h[19] = cuda_swab32(startNounce + thread); + + uint32_t x[8][4] = { /* init */ + { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a }, + { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 }, + { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea }, + { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba }, + { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e }, + { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d }, + { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657 }, + { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc } + }; + + // 1 (could be precomputed) + #pragma unroll + for (int i = 0; i < 16; i++) + x[i/4][i & 3] ^= h[i]; + E8(x); + #pragma unroll + for (int i = 0; i < 16; i++) + x[(i+16)/4][(i+16) & 3] ^= h[i]; + + // 2 (16 bytes with nonce) + #pragma unroll + for (int i = 0; i < 4; i++) + x[0][i] ^= h[16+i]; + x[1][0] ^= 0x80U; + E8(x); + #pragma unroll + for (int i = 0; i < 4; i++) + x[4][i] ^= h[16+i]; + x[5][0] ^= 0x80U; + + // 3 close + x[3][3] ^= 
0x80020000U; // 80 bytes = 640bits (0x280) + E8(x); + x[7][3] ^= 0x80020000U; + + uint32_t *Hash = &g_outhash[(size_t)16 * thread]; + AS_UINT4(&Hash[ 0]) = AS_UINT4(&x[4][0]); + AS_UINT4(&Hash[ 4]) = AS_UINT4(&x[5][0]); + AS_UINT4(&Hash[ 8]) = AS_UINT4(&x[6][0]); + AS_UINT4(&Hash[12]) = AS_UINT4(&x[7][0]); + } +} + +__host__ +void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + jh512_gpu_hash_80 <<>> (threads, startNounce, d_hash); +} + +#endif + +#ifdef WANT_JH80_MIDSTATE + +__constant__ static uint32_t c_JHState[32]; +__constant__ static uint32_t c_Message[4]; + +__global__ +void jh512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // 1 (precomputed state) + uint32_t x[8][4]; + AS_UINT4(&x[0][0]) = AS_UINT4(&c_JHState[ 0]); + AS_UINT4(&x[1][0]) = AS_UINT4(&c_JHState[ 4]); + AS_UINT4(&x[2][0]) = AS_UINT4(&c_JHState[ 8]); + AS_UINT4(&x[3][0]) = AS_UINT4(&c_JHState[12]); + + AS_UINT4(&x[4][0]) = AS_UINT4(&c_JHState[16]); + AS_UINT4(&x[5][0]) = AS_UINT4(&c_JHState[20]); + AS_UINT4(&x[6][0]) = AS_UINT4(&c_JHState[24]); + AS_UINT4(&x[7][0]) = AS_UINT4(&c_JHState[28]); + + // 2 (16 bytes with nonce) + uint32_t h[4]; + AS_UINT2(&h[0]) = AS_UINT2(&c_Message[0]); + h[2] = c_Message[2]; + h[3] = cuda_swab32(startNounce + thread); + + #pragma unroll + for (int i = 0; i < 4; i++) + x[0][i] ^= h[i]; + x[1][0] ^= 0x80U; + E8(x); + #pragma unroll + for (int i = 0; i < 4; i++) + x[4][i] ^= h[i]; + x[5][0] ^= 0x80U; + + // 3 close + x[3][3] ^= 0x80020000U; // 80 bytes = 640bits (0x280) + E8(x); + x[7][3] ^= 0x80020000U; + + uint32_t *Hash = &g_outhash[(size_t)16 * thread]; + AS_UINT4(&Hash[ 0]) = AS_UINT4(&x[4][0]); + AS_UINT4(&Hash[ 4]) = AS_UINT4(&x[5][0]); + AS_UINT4(&Hash[ 8]) = AS_UINT4(&x[6][0]); + AS_UINT4(&Hash[12]) = AS_UINT4(&x[7][0]); + } +} + +__host__ +void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + jh512_gpu_hash_80 <<>> (threads, startNounce, d_hash); +} + +extern "C" { +#undef SPH_C32 +#undef SPH_T32 +#undef SPH_C64 +#undef SPH_T64 +#include +} + +__host__ +void jh512_setBlock_80(int thr_id, uint32_t *endiandata) +{ + sph_jh512_context ctx_jh; + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, endiandata, 64); + + cudaMemcpyToSymbol(c_JHState, ctx_jh.H.narrow, 128, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_Message, &endiandata[16], sizeof(c_Message), 0, cudaMemcpyHostToDevice); +} + +#endif diff --git a/quark/cuda_quark.h b/quark/cuda_quark.h new file mode 100644 index 0000000000..fbb0c1da43 --- /dev/null +++ b/quark/cuda_quark.h @@ -0,0 +1,35 @@ +#include "cuda_helper.h" + +/* commonly used cuda quark kernels prototypes */ + +extern void quark_blake512_cpu_init(int thr_id, uint32_t threads); +extern void quark_blake512_cpu_free(int thr_id); +extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata); +extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t 
*d_hash, int order); + +extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); +extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); + +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); +extern void quark_skein512_cpu_hash_64(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads); +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); +extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); +extern void quark_compactTest_cpu_free(int thr_id); +extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, + uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order); +extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, + uint32_t *d_nonces1, uint32_t *nrm1, int order); + +extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index eabef09fc2..68a411a711 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -1,430 +1,319 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -#define USE_SHUFFLE 0 - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// die Message it Padding zur Berechnung auf der GPU -__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) - -// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ - -__constant__ uint8_t c_sigma[16][16]; - -const uint8_t host_sigma[16][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 
}, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; - -// das Hi Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t HIWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2hiint(__longlong_as_double(x)); -#else - return (uint32_t)(x >> 32); -#endif -} - -// das Hi Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL); -} - -// das Lo Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t LOWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2loint(__longlong_as_double(x)); -#else - return (uint32_t)(x & 0xFFFFFFFFULL); -#endif -} - -// das Lo Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); -} - -__device__ __forceinline__ uint64_t SWAP64(uint64_t x) -{ - // Input: 77665544 33221100 - // Output: 00112233 44556677 - uint64_t temp[2]; - temp[0] = __byte_perm(HIWORD(x), 0, 0x0123); - temp[1] = __byte_perm(LOWORD(x), 0, 0x0123); - - return temp[0] | (temp[1]<<32); -} - -__constant__ uint64_t c_u512[16]; - -const uint64_t host_u512[16] = -{ - 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, - 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, - 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, - 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, - 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, - 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, - 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, - 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL -}; - - -// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt -#if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) { - uint2 result; - if(offset < 32) { - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); -} -#else -#define ROTR(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) -#endif - -#define G(a,b,c,d,e) \ - v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ - v[d] = ROTR( v[d] ^ v[a],32); \ - v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c],25); \ - v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ - v[d] = ROTR( v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c],11); - - -__device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits ) -{ - uint64_t 
v[16], m[16], i; - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) - { - m[i] = SWAP64(block[i]); - } - -#pragma unroll 8 - for( i = 0; i < 8; ++i ) v[i] = h[i]; - - v[ 8] = u512[0]; - v[ 9] = u512[1]; - v[10] = u512[2]; - v[11] = u512[3]; - v[12] = u512[4]; - v[13] = u512[5]; - v[14] = u512[6]; - v[15] = u512[7]; - - v[12] ^= bits; - v[13] ^= bits; - -//#pragma unroll 16 - for( i = 0; i < 16; ++i ) - { - /* column step */ - G( 0, 4, 8, 12, 0 ); - G( 1, 5, 9, 13, 2 ); - G( 2, 6, 10, 14, 4 ); - G( 3, 7, 11, 15, 6 ); - /* diagonal step */ - G( 0, 5, 10, 15, 8 ); - G( 1, 6, 11, 12, 10 ); - G( 2, 7, 8, 13, 12 ); - G( 3, 4, 9, 14, 14 ); - } - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; -} - -// Endian Drehung für 32 Bit Typen - -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return __byte_perm(x, 0, 0x0123); -} - -/* -// Endian Drehung für 64 Bit Typen -static __device__ uint64_t cuda_swab64(uint64_t x) { - uint32_t h = (x >> 32); - uint32_t l = (x & 0xFFFFFFFFULL); - return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h)); -} -*/ - -static __constant__ uint64_t d_constMem[8]; -static const uint64_t h_constMem[8] = { - 0x6a09e667f3bcc908ULL, - 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, - 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, - 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, - 0x5be0cd19137e2179ULL }; - -// Hash-Padding -static __constant__ uint64_t d_constHashPadding[8]; -static const uint64_t h_constHashPadding[8] = { - 0x0000000000000080ull, - 0, - 0, - 0, - 0, - 0x0100000000000000ull, - 0, - 0x0002000000000000ull }; - -__global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - -#if USE_SHUFFLE - const int warpID = threadIdx.x & 0x0F; // 16 warps - const int warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke - const int maxHashPosition = thread<<3; -#endif - -#if USE_SHUFFLE - if (warpBlockID < ( (threads+15)>>4 )) -#else - if (thread < threads) -#endif - { - // bestimme den aktuellen Zähler - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - //uint64_t *inpHash = &g_hash[8 * hashPosition]; - uint64_t *inpHash = &g_hash[hashPosition<<3]; - - // State vorbereiten - uint64_t h[8]; - /* - h[0] = 0x6a09e667f3bcc908ULL; - h[1] = 0xbb67ae8584caa73bULL; - h[2] = 0x3c6ef372fe94f82bULL; - h[3] = 0xa54ff53a5f1d36f1ULL; - h[4] = 0x510e527fade682d1ULL; - h[5] = 0x9b05688c2b3e6c1fULL; - h[6] = 0x1f83d9abfb41bd6bULL; - h[7] = 0x5be0cd19137e2179ULL; - */ -#pragma unroll 8 - for(int i=0;i<8;i++) - h[i] = d_constMem[i]; - - // 128 Byte für die Message - uint64_t buf[16]; - - // Message für die erste Runde in Register holen -#pragma unroll 8 - for (int i=0; i < 8; ++i) buf[i] = inpHash[i]; - - /* - buf[ 8] = 0x0000000000000080ull; - buf[ 9] = 0; - buf[10] = 0; - buf[11] = 0; - buf[12] = 0; - buf[13] = 0x0100000000000000ull; - buf[14] = 0; - buf[15] = 0x0002000000000000ull; - */ -#pragma unroll 8 - for(int i=0;i<8;i++) - buf[i+8] = d_constHashPadding[i]; - - // die einzige Hashing-Runde - quark_blake512_compress( h, buf, c_sigma, c_u512, 512 ); - - // Hash rauslassen -#if __CUDA_ARCH__ >= 130 - // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind - uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition]; -#pragma unroll 8 - for (int i=0; i < 8; ++i) { - outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) ); - outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) ); - } -#else - // in dieser Version passieren auch ein paar 64 Bit Shifts - uint64_t *outHash = &g_hash[8 * hashPosition]; -#pragma unroll 8 - for (int i=0; i < 8; ++i) - { - //outHash[i] = cuda_swab64( h[i] ); - outHash[i] = SWAP64(h[i]); - } -#endif - } -} - -__global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - uint32_t nounce = startNounce + thread; - - // State vorbereiten - uint64_t h[8]; - /* - h[0] = 0x6a09e667f3bcc908ULL; - h[1] = 0xbb67ae8584caa73bULL; - h[2] = 0x3c6ef372fe94f82bULL; - h[3] = 0xa54ff53a5f1d36f1ULL; - h[4] = 0x510e527fade682d1ULL; - h[5] = 0x9b05688c2b3e6c1fULL; - h[6] = 0x1f83d9abfb41bd6bULL; - h[7] = 0x5be0cd19137e2179ULL; - */ -#pragma unroll 8 - for(int i=0;i<8;i++) - h[i] = d_constMem[i]; - // 128 Byte für die Message - uint64_t buf[16]; - - // Message für die erste Runde in Register holen -#pragma unroll 16 - for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i]; - - // die Nounce durch die thread-spezifische ersetzen - buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce)); - - // die einzige Hashing-Runde - quark_blake512_compress( h, buf, c_sigma, c_u512, 640 ); - - // Hash rauslassen -#if __CUDA_ARCH__ >= 130 - // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind - uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; -#pragma unroll 8 - for (int i=0; i < 8; ++i) { - outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) ); - outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) ); - } -#else - // in dieser Version passieren auch ein paar 64 Bit Shifts - uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; -#pragma unroll 8 - for (int i=0; i < 8; ++i) - { - //outHash[i] = cuda_swab64( h[i] ); - outHash[i] = SWAP64(h[i]); - } -#endif - } -} - - -// ---------------------------- END CUDA quark_blake512 functions ------------------------------------ - -// Setup-Funktionen -__host__ void quark_blake512_cpu_init(int thr_id, int threads) -{ - // Kopiere die 
Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_sigma, - host_sigma, - sizeof(host_sigma), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( c_u512, - host_u512, - sizeof(host_u512), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( d_constMem, - h_constMem, - sizeof(h_constMem), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( d_constHashPadding, - h_constHashPadding, - sizeof(h_constHashPadding), - 0, cudaMemcpyHostToDevice); -} - -// Blake512 für 80 Byte grosse Eingangsdaten -__host__ void quark_blake512_cpu_setBlock_80(void *pdata) -{ - // Message mit Padding bereitstellen - // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. - unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); - PaddedMessage[80] = 0x80; - PaddedMessage[111] = 1; - PaddedMessage[126] = 0x02; - PaddedMessage[127] = 0x80; - - // die Message zur Berechnung auf der GPU - cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); -} - -__host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - quark_blake512_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, order, thr_id); -} - -__host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - quark_blake512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, order, thr_id); -} +#include +#include +#include // off_t + +#include "miner.h" +#include "cuda_helper.h" + +#define ROTR(x,n) ROTR64(x,n) + +// use sp kernel on SM 5+ +#define SP_KERNEL + +#define USE_SHUFFLE 0 + +__constant__ +static uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) + +// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ + +__device__ __constant__ +static const uint8_t c_sigma_big[16][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, + + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 
2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + +__device__ __constant__ +static const uint64_t c_u512[16] = +{ + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, + 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, + 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, + 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, + 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL +}; + +#define G(a,b,c,d,x) { \ + uint32_t idx1 = sigma[i][x]; \ + uint32_t idx2 = sigma[i][x+1]; \ + v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = SWAPDWORDS(v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROTR( v[b] ^ v[c], 25); \ + v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = ROTR( v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = ROTR( v[b] ^ v[c], 11); \ +} + +__device__ __forceinline__ +void quark_blake512_compress(uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0) +{ + uint64_t v[16]; + uint64_t m[16]; + + #pragma unroll + for(int i=0; i < 16; i++) { + m[i] = cuda_swab64(block[i]); + } + + //#pragma unroll 8 + for(int i=0; i < 8; i++) + v[i] = h[i]; + + v[ 8] = u512[0]; + v[ 9] = u512[1]; + v[10] = u512[2]; + v[11] = u512[3]; + v[12] = u512[4] ^ T0; + v[13] = u512[5] ^ T0; + v[14] = u512[6]; + v[15] = u512[7]; + + //#pragma unroll 16 + for(int i=0; i < 16; i++) + { + /* column step */ + G( 0, 4, 8, 12, 0 ); + G( 1, 5, 9, 13, 2 ); + G( 2, 6, 10, 14, 4 ); + G( 3, 7, 11, 15, 6 ); + /* diagonal step */ + G( 0, 5, 10, 15, 8 ); + G( 1, 6, 11, 12, 10 ); + G( 2, 7, 8, 13, 12 ); + G( 3, 4, 9, 14, 14 ); + } + + h[0] ^= v[0] ^ v[8]; + h[1] ^= v[1] ^ v[9]; + h[2] ^= v[2] ^ v[10]; + h[3] ^= v[3] ^ v[11]; + h[4] ^= v[4] ^ v[12]; + h[5] ^= v[5] ^ v[13]; + h[6] ^= v[6] ^ v[14]; + h[7] ^= v[7] ^ v[15]; +} + +__global__ __launch_bounds__(256, 4) +void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash) +{ +#if !defined(SP_KERNEL) || __CUDA_ARCH__ < 500 + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + +#if USE_SHUFFLE + const uint32_t warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke + + if (warpBlockID < ( (threads+15)>>4 )) +#else + if (thread < threads) +#endif + { + uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + off_t hashPosition = nounce - startNounce; + uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8 + + // 128 Bytes + uint64_t buf[16]; + + // State + uint64_t h[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL + }; + + // Message for first round + #pragma unroll 8 + for (int i=0; i < 8; ++i) + buf[i] = inpHash[i]; + + // Hash Pad + buf[8] = 0x0000000000000080ull; + buf[9] = 0; + buf[10] = 0; + buf[11] = 0; + buf[12] = 0; + buf[13] = 0x0100000000000000ull; + buf[14] = 0; + buf[15] = 0x0002000000000000ull; + + // Ending round + quark_blake512_compress(h, buf, c_sigma_big, c_u512, 512); + +#if __CUDA_ARCH__ <= 350 + uint32_t *outHash = (uint32_t*)&g_hash[hashPosition * 8U]; + #pragma unroll 8 + for (int i=0; i < 8; i++) { + outHash[2*i+0] = cuda_swab32( _HIDWORD(h[i]) ); + outHash[2*i+1] = cuda_swab32( _LODWORD(h[i]) ); + } +#else + uint64_t *outHash = &g_hash[hashPosition * 8U]; + for (int i=0; i < 8; i++) { + outHash[i] = cuda_swab64(h[i]); + } +#endif + } +#endif /* SP */ +} + +__global__ __launch_bounds__(256,4) +void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) +{ +//#if !defined(SP_KERNEL) || __CUDA_ARCH__ < 500 + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t buf[16]; + #pragma unroll + for (int i=0; i < 16; ++i) + buf[i] = c_PaddedMessage80[i]; + + // The test Nonce + const uint32_t nounce = startNounce + thread; + ((uint32_t*)buf)[19] = cuda_swab32(nounce); + + uint64_t h[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL + }; + + quark_blake512_compress(h, buf, c_sigma_big, c_u512, 640); + +#if __CUDA_ARCH__ <= 350 + uint32_t *outHash = (uint32_t*)outputHash + (thread * 16U); + #pragma unroll 8 + for (uint32_t i=0; i < 8; i++) { + outHash[2*i] = cuda_swab32( _HIDWORD(h[i]) ); + outHash[2*i+1] = cuda_swab32( _LODWORD(h[i]) ); + } +#else + uint64_t *outHash = (uint64_t*)outputHash + (thread * 8U); + for (uint32_t i=0; i < 8; i++) { + outHash[i] = cuda_swab64( h[i] ); + } +#endif + } +//#endif +} + +#ifdef SP_KERNEL +#include "cuda_quark_blake512_sp.cuh" +#endif + +__host__ +void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order) +{ +#ifdef SP_KERNEL + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) + quark_blake512_cpu_hash_64_sp(threads, startNounce, d_nonceVector, d_outputHash); + else +#endif + { + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + quark_blake512_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash); + } + MyStreamSynchronize(NULL, order, thr_id); +} + +__host__ +void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) +{ +#ifdef SP_KERNEL + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) + quark_blake512_cpu_hash_80_sp(threads, startNounce, d_outputHash); + else +#endif + { + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 
block(threadsperblock); + + quark_blake512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); + } +} + +// ---------------------------- END CUDA quark_blake512 functions ------------------------------------ + +__host__ +void quark_blake512_cpu_init(int thr_id, uint32_t threads) +{ + cuda_get_arch(thr_id); +} + +__host__ +void quark_blake512_cpu_free(int thr_id) +{ +} + +// ----------------------------- Host midstate for 80-bytes input ------------------------------------ + +#undef SPH_C32 +#undef SPH_T32 +#undef SPH_C64 +#undef SPH_T64 + +extern "C" { +#include "sph/sph_blake.h" +} + +__host__ +void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *endiandata) +{ +#ifdef SP_KERNEL + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) + quark_blake512_cpu_setBlock_80_sp(thr_id, (uint64_t*) endiandata); + else +#endif + { + uint64_t message[16]; + + memcpy(message, endiandata, 80); + message[10] = 0x80; + message[11] = 0; + message[12] = 0; + message[13] = 0x0100000000000000ull; + message[14] = 0; + message[15] = 0x8002000000000000ull; // 0x280 + + cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice); + } + CUDA_LOG_ERROR(); +} diff --git a/quark/cuda_quark_blake512_sp.cuh b/quark/cuda_quark_blake512_sp.cuh new file mode 100644 index 0000000000..069620a680 --- /dev/null +++ b/quark/cuda_quark_blake512_sp.cuh @@ -0,0 +1,670 @@ +/* sp implementation of blake */ + +#include +#include +#include "miner.h" + +// Should stay outside the ifdef on WIN64 (wtf) +#include "cuda_vector_uint2x4.h" +__constant__ static uint2 c_PaddedM[16]; +__constant__ static uint2x4 c_Hostprecalc[4]; + +#if __CUDA_ARCH__ >= 500 || !defined(__CUDA_ARCH__) + +#undef G +#define vectorizelow(/* uint32_t*/ v) make_uint2(v,0) +#define vectorizehigh(/*uint32_t*/ v) make_uint2(0,v) + +static __device__ __forceinline__ uint2 cuda_swap(uint2 v) { + const uint32_t t = cuda_swab32(v.x); + v.x = cuda_swab32(v.y); + v.y = t; + return v; +} +static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) { + uint2 result; + result.y = u.x ^ v.x; + result.x = u.y ^ v.y; + return result; +} + +__constant__ uint2 c_512_u2[16] = +{ + { 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e }, + { 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 }, + { 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf }, + { 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 }, + { 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 }, + { 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed }, + { 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 }, + { 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 } +}; + +__constant__ uint8_t c_sigma[6][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + +// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ + +#define Gprecalc(a,b,c,d,idx1,idx2) { \ + v[a] += (block[idx2] ^ c_512_u2[idx1]) + v[b]; \ + v[d] = eorswap32( v[d] , v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 25); \ + v[a] += (block[idx1] ^ c_512_u2[idx2]) + v[b]; \ + v[d] = ROR16(v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2(v[b] ^ v[c], 11); \ +} + +#define 
GprecalcHost(a,b,c,d,idx1,idx2) { \ + v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = ROTR64( v[d] ^ v[a],32); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 25); \ + v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 11); \ +} + +#define G(a,b,c,d,x) { \ + uint32_t idx1 = c_sigma[i][x]; \ + uint32_t idx2 = c_sigma[i][x+1]; \ + v[a] += (block[idx1] ^ c_512_u2[idx2]) + v[b]; \ + v[d] = eorswap32(v[d] , v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2( v[b] ^ v[c], 25); \ + v[a] += (block[idx2] ^ c_512_u2[idx1]) + v[b]; \ + v[d] = ROR16( v[d] ^ v[a]); \ + v[c] += v[d]; \ + v[b] = ROR2( v[b] ^ v[c], 11); \ +} + +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(256, 1) +#endif +void quark_blake512_gpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2* g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t hashPosition = nounce - startNounce; + + uint2 msg[16]; + + uint2x4 *phash = (uint2x4*)&g_hash[hashPosition * 8U]; + uint2x4 *outpt = (uint2x4*)msg; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + + uint2 block[16]; + block[0].x = cuda_swab32(msg[0].y); + block[0].y = cuda_swab32(msg[0].x); + block[1].x = cuda_swab32(msg[1].y); + block[1].y = cuda_swab32(msg[1].x); + block[2].x = cuda_swab32(msg[2].y); + block[2].y = cuda_swab32(msg[2].x); + block[3].x = cuda_swab32(msg[3].y); + block[3].y = cuda_swab32(msg[3].x); + block[4].x = cuda_swab32(msg[4].y); + block[4].y = cuda_swab32(msg[4].x); + block[5].x = cuda_swab32(msg[5].y); + block[5].y = cuda_swab32(msg[5].x); + block[6].x = cuda_swab32(msg[6].y); + block[6].y = cuda_swab32(msg[6].x); + block[7].x = cuda_swab32(msg[7].y); + block[7].y = cuda_swab32(msg[7].x); + + block[8] = vectorizehigh(0x80000000); + block[9] = vectorizelow(0x0); + block[10] = vectorizelow(0x0); + block[11] = vectorizelow(0x0); + block[12] = vectorizelow(0x0); + block[13] = vectorizelow(0x1); + block[14] = vectorizelow(0x0); + block[15] = vectorizelow(0x200); + + const uint2 h[8] = { + { 0xf3bcc908UL, 0x6a09e667UL }, + { 0x84caa73bUL, 0xbb67ae85UL }, + { 0xfe94f82bUL, 0x3c6ef372UL }, + { 0x5f1d36f1UL, 0xa54ff53aUL }, + { 0xade682d1UL, 0x510e527fUL }, + { 0x2b3e6c1fUL, 0x9b05688cUL }, + { 0xfb41bd6bUL, 0x1f83d9abUL }, + { 0x137e2179UL, 0x5be0cd19UL } + }; + + uint2 v[16] = { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + c_512_u2[0], c_512_u2[1], c_512_u2[2], c_512_u2[3], + c_512_u2[4], c_512_u2[5], c_512_u2[6], c_512_u2[7] + }; + v[12].x ^= 512U; + v[13].x ^= 512U; + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 
14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + + #if __CUDA_ARCH__ == 500 + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) 
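	/*
	 * Editorial sketch (not part of the patch): BLAKE-512 runs 16 rounds and its
	 * message-schedule table sigma has only 10 rows, so rounds 10..15 reuse rows
	 * 0..5. That is why this SM 5.0 branch re-emits the first six rounds'
	 * Gprecalc sequences verbatim, while the #else branch below covers the same
	 * six rounds with a loop over c_sigma[0..5]. A host-side reference loop for
	 * all 16 rounds would look roughly like this (generic G and sigma names are
	 * assumed, not taken from the patch):
	 *
	 *     for (int r = 0; r < 16; r++) {
	 *         const uint8_t *s = sigma[r % 10];     // 10-row permutation table
	 *         G(0,4, 8,12, s[ 0],s[ 1]);  G(1,5, 9,13, s[ 2],s[ 3]);  // columns
	 *         G(2,6,10,14, s[ 4],s[ 5]);  G(3,7,11,15, s[ 6],s[ 7]);
	 *         G(0,5,10,15, s[ 8],s[ 9]);  G(1,6,11,12, s[10],s[11]);  // diagonals
	 *         G(2,7, 8,13, s[12],s[13]);  G(3,4, 9,14, s[14],s[15]);
	 *     }
	 */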
+ Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + #else + + for (int i = 0; i < 6; i++) + { + /* column step */ + G(0, 4, 8, 12, 0); + G(1, 5, 9, 13, 2); + G(2, 6, 10, 14, 4); + G(3, 7, 11, 15, 6); + /* diagonal step */ + G(0, 5, 10, 15, 8); + G(1, 6, 11, 12, 10); + G(2, 7, 8, 13, 12); + G(3, 4, 9, 14, 14); + } + #endif + + v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]); + v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]); + v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]); + v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]); + v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]); + v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]); + v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]); + v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]); + + phash = (uint2x4*)v; + outpt = (uint2x4*)&g_hash[hashPosition * 8U]; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + } +} + + +__global__ +__launch_bounds__(128, 8) +void quark_blake512_gpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nounce = startNounce + thread; + uint2 block[16]; + + block[0] = c_PaddedM[0]; + block[1] = c_PaddedM[1]; + block[2] = c_PaddedM[2]; + block[3] = c_PaddedM[3]; + block[4] = c_PaddedM[4]; + block[5] = c_PaddedM[5]; + block[6] = c_PaddedM[6]; + block[7] = c_PaddedM[7]; + block[8] = c_PaddedM[8]; + block[9].y = c_PaddedM[9].y; + + block[10] = vectorizehigh(0x80000000); + block[11] = vectorizelow(0); + block[12] = vectorizelow(0); + block[13] = vectorizelow(0x1); + block[14] = vectorizelow(0); + block[15] = vectorizelow(0x280); + block[9].x = nounce; + + const uint2 h[8] = { + { 0xf3bcc908UL, 0x6a09e667UL }, + { 0x84caa73bUL, 0xbb67ae85UL }, + { 0xfe94f82bUL, 0x3c6ef372UL }, + { 0x5f1d36f1UL, 0xa54ff53aUL }, + { 0xade682d1UL, 0x510e527fUL }, + { 0x2b3e6c1fUL, 0x9b05688cUL }, + { 0xfb41bd6bUL, 0x1f83d9abUL }, + { 0x137e2179UL, 0x5be0cd19UL } + }; + + uint2 v[16]; + uint2x4 *outpt = (uint2x4*)v; + + outpt[0] = c_Hostprecalc[0]; + outpt[1] = c_Hostprecalc[1]; + outpt[2] = c_Hostprecalc[2]; + outpt[3] = c_Hostprecalc[3]; + + v[0] += (block[9] ^ c_512_u2[8]); + v[15] = ROR16(v[15] ^ v[0]); + v[10] += v[15]; + v[5] = ROR2(v[5] ^ v[10], 11); + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + +// Gprecalc(1, 5, 9, 13, 0x8, 0x4) + v[1] += v[5]; + v[13] = eorswap32(v[13], v[1]); + v[9] += v[13]; + + v[5] = ROR2(v[5] ^ v[9], 25); + v[1] += (block[8] ^ c_512_u2[4]) + v[5]; + v[13] = ROR16(v[13] ^ v[1]); + v[9] += v[13]; + v[5] = ROR2(v[5] ^ v[9], 11); + +// Gprecalc(2, 6, 10, 14, 0xf, 0x9) + v[2] += (block[9] ^ c_512_u2[0xf]); + v[14] = eorswap32(v[14], v[2]); + v[10] += v[14]; + v[6] = ROR2(v[6] ^ v[10], 25); + v[2] += (block[0xf] ^ c_512_u2[9]) + v[6]; + v[14] = ROR16(v[14] ^ v[2]); + v[10] += v[14]; + v[6] = ROR2(v[6] ^ v[10], 11); + +// Gprecalc(3, 7, 11, 15, 0x6, 0xd) + v[15] = eorswap32( v[15] , v[3]); + v[11] += v[15]; + v[7] = ROR2(v[7] ^ v[11], 25); + v[3] += (block[6] ^ c_512_u2[0xd]) + v[7]; + v[15] = ROR16(v[15] ^ v[3]); + v[11] += v[15]; + v[7] = ROR2(v[7] ^ v[11], 11); + + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 
0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) + + Gprecalc(0, 4, 8, 12, 0x5, 0xc) + Gprecalc(1, 5, 9, 13, 0xf, 0x1) + Gprecalc(2, 6, 10, 14, 0xd, 0xe) + Gprecalc(3, 7, 11, 15, 0xa, 0x4) + Gprecalc(0, 5, 10, 15, 0x7, 0x0) + Gprecalc(1, 6, 11, 12, 0x3, 0x6) + Gprecalc(2, 7, 8, 13, 0x2, 0x9) + Gprecalc(3, 4, 9, 14, 0xb, 0x8) + + Gprecalc(0, 4, 8, 12, 0xb, 0xd) + Gprecalc(1, 5, 9, 13, 0xe, 0x7) + Gprecalc(2, 6, 10, 14, 0x1, 0xc) + Gprecalc(3, 7, 11, 15, 0x9, 0x3) + Gprecalc(0, 5, 10, 15, 0x0, 0x5) + Gprecalc(1, 6, 11, 12, 0x4, 0xf) + Gprecalc(2, 7, 8, 13, 0x6, 0x8) + Gprecalc(3, 4, 9, 14, 0xa, 0x2) + + Gprecalc(0, 4, 8, 12, 0xf, 0x6) + Gprecalc(1, 5, 9, 13, 0x9, 0xe) + Gprecalc(2, 6, 10, 14, 0x3, 0xb) + Gprecalc(3, 7, 11, 15, 0x8, 0x0) + Gprecalc(0, 5, 10, 15, 0x2, 0xc) + Gprecalc(1, 6, 11, 12, 0x7, 0xd) + Gprecalc(2, 7, 8, 13, 0x4, 0x1) + Gprecalc(3, 4, 9, 14, 0x5, 0xa) + + Gprecalc(0, 4, 8, 12, 0x2, 0xa) + Gprecalc(1, 5, 9, 13, 0x4, 0x8) + Gprecalc(2, 6, 10, 14, 0x6, 0x7) + Gprecalc(3, 7, 11, 15, 0x5, 0x1) + Gprecalc(0, 5, 10, 15, 0xb, 0xf) + Gprecalc(1, 6, 11, 12, 0xe, 0x9) + Gprecalc(2, 7, 8, 13, 0xc, 0x3) + Gprecalc(3, 4, 9, 14, 0x0, 0xd) + + Gprecalc(0, 4, 8, 12, 0x1, 0x0) + Gprecalc(1, 5, 9, 13, 0x3, 0x2) + Gprecalc(2, 6, 10, 14, 0x5, 0x4) + Gprecalc(3, 7, 11, 15, 0x7, 0x6) + Gprecalc(0, 5, 10, 15, 0x9, 0x8) + Gprecalc(1, 6, 11, 12, 0xb, 0xa) + Gprecalc(2, 7, 8, 13, 0xd, 0xc) + Gprecalc(3, 4, 9, 14, 0xf, 0xe) + + Gprecalc(0, 4, 8, 12, 0xa, 0xe) + Gprecalc(1, 5, 9, 13, 0x8, 0x4) + Gprecalc(2, 6, 10, 14, 0xf, 0x9) + Gprecalc(3, 7, 11, 15, 0x6, 0xd) + Gprecalc(0, 5, 10, 15, 0xc, 0x1) + Gprecalc(1, 6, 11, 12, 0x2, 0x0) + Gprecalc(2, 7, 8, 13, 0x7, 0xb) + Gprecalc(3, 4, 9, 14, 0x3, 0x5) + + Gprecalc(0, 4, 8, 12, 0x8, 0xb) + Gprecalc(1, 5, 9, 13, 0x0, 0xc) + Gprecalc(2, 6, 10, 14, 0x2, 0x5) + Gprecalc(3, 7, 11, 15, 0xd, 0xf) + Gprecalc(0, 5, 10, 15, 0xe, 0xa) + Gprecalc(1, 6, 11, 12, 0x6, 0x3) + Gprecalc(2, 7, 8, 13, 0x1, 0x7) + Gprecalc(3, 4, 9, 14, 0x4, 0x9) + + Gprecalc(0, 4, 8, 12, 0x9, 0x7) + Gprecalc(1, 5, 9, 13, 0x1, 0x3) + Gprecalc(2, 6, 10, 14, 0xc, 0xd) + Gprecalc(3, 7, 11, 15, 0xe, 0xb) + Gprecalc(0, 5, 10, 15, 0x6, 0x2) + Gprecalc(1, 6, 11, 12, 0xa, 0x5) + Gprecalc(2, 7, 8, 13, 0x0, 0x4) + Gprecalc(3, 4, 9, 14, 0x8, 0xf) + + Gprecalc(0, 4, 8, 12, 0x0, 0x9) + Gprecalc(1, 5, 9, 13, 0x7, 0x5) + Gprecalc(2, 6, 10, 14, 0x4, 0x2) + Gprecalc(3, 7, 11, 15, 0xf, 0xa) + Gprecalc(0, 5, 10, 15, 0x1, 0xe) + Gprecalc(1, 6, 11, 12, 0xc, 0xb) + Gprecalc(2, 7, 8, 13, 0x8, 0x6) + Gprecalc(3, 4, 9, 14, 0xd, 0x3) + + Gprecalc(0, 4, 8, 12, 0xc, 0x2) + Gprecalc(1, 5, 9, 13, 0xa, 0x6) + Gprecalc(2, 6, 10, 14, 0xb, 0x0) + Gprecalc(3, 7, 11, 15, 0x3, 0x8) + Gprecalc(0, 5, 10, 15, 0xd, 0x4) + Gprecalc(1, 6, 11, 12, 0x5, 0x7) + Gprecalc(2, 7, 8, 13, 0xe, 0xf) + Gprecalc(3, 4, 9, 14, 0x9, 0x1) 
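	/*
	 * Editorial note: what follows is the standard BLAKE-512 finalization with a
	 * zero salt, h'[i] = h[i] ^ v[i] ^ v[i + 8] for i = 0..7, fused with a 64-bit
	 * byte swap (cuda_swap) so the digest words are stored back big-endian,
	 * matching the cuda_swab64() output path of the non-sp kernel. A plain,
	 * non-vectorized sketch of the same step (swab64 is an assumed helper):
	 *
	 *     for (int i = 0; i < 8; i++)
	 *         out[i] = swab64(h[i] ^ v[i] ^ v[i + 8]);
	 */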
+ + v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]); + v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]); + v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]); + v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]); + v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]); + v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]); + v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]); + v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]); + + uint2x4 *phash = (uint2x4*)v; + outpt = (uint2x4*) &outputHash[thread * 8U]; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + } +} + +// ---------------------------- END CUDA quark_blake512 functions ------------------------------------ + +__host__ void quark_blake512_cpu_setBlock_80_sp(int thr_id, uint64_t *pdata) +{ + uint64_t block[16]; + uint64_t PaddedMessage[10]; + uint64_t *peker = (uint64_t*) &PaddedMessage[0]; + for (int i = 0; i < 10; i++) + PaddedMessage[i] = cuda_swab64(pdata[i]); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol((c_PaddedM), PaddedMessage, 10 * sizeof(uint64_t))); + + block[0] = peker[0]; + block[1] = peker[1]; + block[2] = peker[2]; + block[3] = peker[3]; + block[4] = peker[4]; + block[5] = peker[5]; + block[6] = peker[6]; + block[7] = peker[7]; + block[8] = peker[8]; + block[9] = peker[9]; + block[10] = 0x8000000000000000; + block[11] = 0; + block[12] = 0; + block[13] = 1; + block[14] = 0; + block[15] = 0x280; + + const uint64_t u512[16] = { + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, + 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, + 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, + 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, + 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL + }; + + uint64_t h[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL + }; + + uint64_t v[16] = { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640U, u512[5] ^ 640U, u512[6], u512[7] + }; + + GprecalcHost(0, 4, 8, 12, 0x1, 0x0) + GprecalcHost(1, 5, 9, 13, 0x3, 0x2) + GprecalcHost(2, 6, 10, 14, 0x5, 0x4) + GprecalcHost(3, 7, 11, 15, 0x7, 0x6) + + GprecalcHost(1, 6, 11, 12, 0xb, 0xa) + GprecalcHost(2, 7, 8, 13, 0xd, 0xc) + + v[0] += (block[8] ^ u512[9]) + v[5]; + v[15] = ROTR64(v[15] ^ v[0], 32); \ + v[10] += v[15]; + v[5] = ROTR64(v[5] ^ v[10], 25); + v[0] += v[5]; + + GprecalcHost(3, 4, 9, 14, 0xf, 0xe); + + v[1] += (block[0x4] ^ u512[0x8]); + v[2] += v[6]; + + v[3] += (block[0xd] ^ u512[6]) + v[7]; + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Hostprecalc, v, 128, 0, cudaMemcpyHostToDevice)); +} + +#else +// __CUDA_ARCH__ < 500 +__host__ void quark_blake512_cpu_setBlock_80_sp(int thr_id, uint64_t *pdata) {} +__global__ void quark_blake512_gpu_hash_64_sp(uint32_t, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) {} +__global__ void quark_blake512_gpu_hash_80_sp(uint32_t, uint32_t startNounce, uint2 *outputHash) {} +#endif + +__host__ +void quark_blake512_cpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash) +{ + const uint32_t threadsperblock = 32; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + quark_blake512_gpu_hash_64_sp <<>>(threads, startNounce, d_nonceVector, (uint2*)d_outputHash); +} + +__host__ +void quark_blake512_cpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint32_t 
*d_outputHash) +{ + const uint32_t threadsperblock = 64; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + quark_blake512_gpu_hash_80_sp <<>>(threads, startNounce, (uint2*)d_outputHash); +} diff --git a/quark/cuda_quark_checkhash.cu b/quark/cuda_quark_checkhash.cu deleted file mode 100644 index 8502bf4336..0000000000 --- a/quark/cuda_quark_checkhash.cu +++ /dev/null @@ -1,105 +0,0 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// das Hash Target gegen das wir testen sollen -__constant__ uint32_t pTarget[8]; - -uint32_t *d_resNounce[8]; -uint32_t *h_resNounce[8]; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -__global__ void quark_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[16 * hashPosition]; - - uint32_t hash[8]; -#pragma unroll 8 - for (int i=0; i < 8; i++) - hash[i] = inpHash[i]; - - // kopiere Ergebnis - int i, position = -1; - bool rc = true; - -#pragma unroll 8 - for (i = 7; i >= 0; i--) { - if (hash[i] > pTarget[i]) { - if(position < i) { - position = i; - rc = false; - } - } - if (hash[i] < pTarget[i]) { - if(position < i) { - position = i; - rc = true; - } - } - } - - if(rc == true) - if(resNounce[0] > nounce) - resNounce[0] = nounce; - } -} - -// Setup-Funktionen -__host__ void quark_check_cpu_init(int thr_id, int threads) -{ - cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)); - cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)); -} - -// Target Difficulty setzen -__host__ void quark_check_cpu_setTarget(const void *ptarget) -{ - // die Message zur Berechnung auf der GPU - cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); -} - -__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) -{ - uint32_t result = 0xffffffff; - cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); - - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - quark_check_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, order, thr_id); - - // Ergebnis zum Host kopieren (in page locked memory, damits schneller geht) - cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - - // cudaMemcpy() ist asynchron! 
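	/*
	 * Editorial note: the German comment above translates to "cudaMemcpy() is
	 * asynchronous!", which is why the removed code follows the copy with an
	 * explicit synchronize before reading h_resNounce. cudaThreadSynchronize()
	 * is the legacy spelling; a sketch with the same intent on current CUDA
	 * toolkits would be:
	 *
	 *     cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id],
	 *                sizeof(uint32_t), cudaMemcpyDeviceToHost);
	 *     cudaDeviceSynchronize();  // replaces the deprecated cudaThreadSynchronize()
	 */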
- cudaThreadSynchronize(); - result = *h_resNounce[thr_id]; - - return result; -} diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu index ed2e2c9ddb..3c9137ea7f 100644 --- a/quark/cuda_quark_compactionTest.cu +++ b/quark/cuda_quark_compactionTest.cu @@ -1,363 +1,398 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#include "sm_30_intrinsics.h" - -#include -#include -#include - -// aus cpu-miner.c -extern int device_map[8]; - -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; - -static uint32_t *d_tempBranch1Nonces[8]; -static uint32_t *d_numValid[8]; -static uint32_t *h_numValid[8]; - -static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// True/False tester -typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); - -__device__ uint32_t QuarkTrueTest(uint32_t *inpHash) -{ - return ((inpHash[0] & 0x08) == 0x08); -} - -__device__ uint32_t QuarkFalseTest(uint32_t *inpHash) -{ - return ((inpHash[0] & 0x08) == 0); -} - -__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest; -cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8]; - -// Setup-Funktionen -__host__ void quark_compactTest_cpu_init(int thr_id, int threads) -{ - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); - - cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); - cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); - - // wir brauchen auch Speicherplatz auf dem Device - cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2); - cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t)); - cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t)); - - uint32_t s1; - s1 = (threads / 256) * 2; - - cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) - cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) -} - -// Die Summenfunktion (vom NVIDIA SDK) -__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) -{ - extern __shared__ uint32_t sums[]; - int id = ((blockIdx.x * blockDim.x) + threadIdx.x); - //int lane_id = id % warpSize; - int lane_id = id % width; - // determine a warp_id within a block - //int warp_id = threadIdx.x / warpSize; - int warp_id = threadIdx.x / width; - - sums[lane_id] = 0; - - // Below is the basic structure of using a shfl instruction - // for a scan. - // Record "value" as a variable - we accumulate it along the way - uint32_t value; - if(testFunc != NULL) - { - if (id < threads) - { - uint32_t *inpHash; - if(d_validNonceTable == NULL) - { - // keine Nonce-Liste - inpHash = &inpHashes[id<<4]; - }else - { - // Nonce-Liste verfügbar - int nonce = d_validNonceTable[id] - startNounce; - inpHash = &inpHashes[nonce<<4]; - } - value = (*testFunc)(inpHash); - }else - { - value = 0; - } - }else - { - value = data[id]; - } - - __syncthreads(); - - // Now accumulate in log steps up the chain - // compute sums, with another thread's value who is - // distance delta away (i). 
Note - // those threads where the thread 'i' away would have - // been out of bounds of the warp are unaffected. This - // creates the scan sum. -#pragma unroll - - for (int i=1; i<=width; i*=2) - { - uint32_t n = __shfl_up((int)value, i, width); - - if (lane_id >= i) value += n; - } - - // value now holds the scan value for the individual thread - // next sum the largest values for each warp - - // write the sum of the warp to smem - //if (threadIdx.x % warpSize == warpSize-1) - if (threadIdx.x % width == width-1) - { - sums[warp_id] = value; - } - - __syncthreads(); - - // - // scan sum the warp sums - // the same shfl scan operation, but performed on warp sums - // - if (warp_id == 0) - { - uint32_t warp_sum = sums[lane_id]; - - for (int i=1; i<=width; i*=2) - { - uint32_t n = __shfl_up((int)warp_sum, i, width); - - if (lane_id >= i) warp_sum += n; - } - - sums[lane_id] = warp_sum; - } - - __syncthreads(); - - // perform a uniform add across warps in the block - // read neighbouring warp's sum and add it to threads value - uint32_t blockSum = 0; - - if (warp_id > 0) - { - blockSum = sums[warp_id-1]; - } - - value += blockSum; - - // Now write out our result - data[id] = value; - - // last thread has sum, write write out the block's sum - if (partial_sums != NULL && threadIdx.x == blockDim.x-1) - { - partial_sums[blockIdx.x] = value; - } -} - -// Uniform add: add partial sums array -__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) -{ - __shared__ uint32_t buf; - int id = ((blockIdx.x * blockDim.x) + threadIdx.x); - - if (id > len) return; - - if (threadIdx.x == 0) - { - buf = partial_sums[blockIdx.x]; - } - - __syncthreads(); - data[id] += buf; -} - -// Der Scatter -__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) -{ - int id = ((blockIdx.x * blockDim.x) + threadIdx.x); - uint32_t actNounce = id; - uint32_t value; - if (id < threads) - { -// uint32_t nounce = startNounce + id; - uint32_t *inpHash; - if(d_validNonceTable == NULL) - { - // keine Nonce-Liste - inpHash = &inpHashes[id<<4]; - }else - { - // Nonce-Liste verfügbar - int nonce = d_validNonceTable[id] - startNounce; - actNounce = nonce; - inpHash = &inpHashes[nonce<<4]; - } - - value = (*testFunc)(inpHash); - }else - { - value = 0; - } - - if( value ) - { - int idx = sum[id]; - if(idx > 0) - outp[idx-1] = startNounce + actNounce; - } -} - -__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val) -{ - if(val == 0) - return 0; - - uint32_t mask = 0x80000000; - while( (val & mask) == 0 ) mask = mask >> 1; - - if( (val & (~mask)) != 0 ) - return mask << 1; - - return mask; -} - -__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm, - uint32_t *d_nonces1, cuda_compactTestFunction_t function, - uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) -{ - int orgThreads = threads; - threads = (int)quark_compactTest_roundUpExp((uint32_t)threads); - // threadsPerBlock ausrechnen - int blockSize = 256; - int nSummen = threads / blockSize; - - int thr1 = (threads+blockSize-1) / blockSize; - int thr2 = threads / (blockSize*blockSize); - int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize; - int thr3 = (nSummen + blockSize2-1) / blockSize2; - - bool callThrid = (thr2 > 0) ? 
true : false; - - // Erster Initialscan - quark_compactTest_gpu_SCAN<<>>( - d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); - - // weitere Scans - if(callThrid) - { - quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); - quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); - }else - { - quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); - } - - // Sync + Anzahl merken - cudaStreamSynchronize(NULL); - - if(callThrid) - cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - else - cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - - - // Addieren - if(callThrid) - { - quark_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); - } - quark_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); - - // Scatter - quark_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, - function, orgThreads, startNounce, inpHashes, d_validNonceTable); - - // Sync - cudaStreamSynchronize(NULL); -} - -////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) -__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm, - uint32_t *d_nonces1, uint32_t *d_nonces2, - uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) -{ - quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); - quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); - - /* - // threadsPerBlock ausrechnen - int blockSize = 256; - int thr1 = threads / blockSize; - int thr2 = threads / (blockSize*blockSize); - - // 1 - quark_compactTest_gpu_SCAN<<>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes); - quark_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); - quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); - cudaStreamSynchronize(NULL); - cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - quark_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); - quark_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); - - // 2 - quark_compactTest_gpu_SCAN<<>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes); - quark_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); - quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 
32 : thr2); - cudaStreamSynchronize(NULL); - cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - quark_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); - quark_compactTest_gpu_ADD<<>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); - - // Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten - // Schritt 3: Scatter - quark_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes); - quark_compactTest_gpu_SCATTER<<>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes); - cudaStreamSynchronize(NULL); - */ -} - -__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, size_t *nrm1, - uint32_t *d_nonces2, size_t *nrm2, - int order) -{ - // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind - // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! - - quark_compactTest_cpu_dualCompaction(thr_id, threads, - h_numValid[thr_id], d_nonces1, d_nonces2, - startNounce, inpHashes, d_validNonceTable); - - cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser - *nrm1 = (size_t)h_numValid[thr_id][0]; - *nrm2 = (size_t)h_numValid[thr_id][1]; -} - -__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, size_t *nrm1, - int order) -{ - // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind - // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! - - quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); - - cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser - *nrm1 = (size_t)h_numValid[thr_id][0]; -} +/** + * __shfl_up require SM 3.0 arch! 
+ * + * SM 2 alternative method by tpruvot@github 2015 + */ +#include +#include + +#include "cuda_helper.h" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 300 +#define __shfl_up(var, delta, width) (0) +#endif + +static uint32_t *h_numValid[MAX_GPUS]; +static uint32_t *d_tempBranch1Nonces[MAX_GPUS]; +static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen + +// True/False tester +typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); + +__device__ uint32_t QuarkTrueTest(uint32_t *inpHash) +{ + return ((inpHash[0] & 0x08) == 0x08); +} + +__device__ uint32_t QuarkFalseTest(uint32_t *inpHash) +{ + return ((inpHash[0] & 0x08) == 0); +} + +__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest; + +cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS]; + +// Setup/Alloc Function +__host__ +void quark_compactTest_cpu_init(int thr_id, uint32_t threads) +{ + int dev_id = device_map[thr_id]; + cuda_get_arch(thr_id); + + cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); + cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); + + if (cuda_arch[dev_id] >= 300) { + uint32_t s1 = (threads / 256) * 2; + CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2)); + CUDA_SAFE_CALL(cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block) + CUDA_SAFE_CALL(cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block) + } else { + CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads)); + } + + cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t)); +} + +// Because all alloc should have a free... +__host__ +void quark_compactTest_cpu_free(int thr_id) +{ + int dev_id = device_map[thr_id]; + + cudaFreeHost(h_numValid[thr_id]); + + if (cuda_arch[dev_id] >= 300) { + cudaFree(d_tempBranch1Nonces[thr_id]); + cudaFree(d_partSum[0][thr_id]); + cudaFree(d_partSum[1][thr_id]); + } else { + cudaFree(d_tempBranch1Nonces[thr_id]); + } +} + +__global__ +void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, + uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) +{ +#if __CUDA_ARCH__ >= 300 + __shared__ uint32_t sums[32]; + int id = ((blockIdx.x * blockDim.x) + threadIdx.x); + //int lane_id = id % warpSize; + int lane_id = id % width; + // determine a warp_id within a block + //int warp_id = threadIdx.x / warpSize; + int warp_id = threadIdx.x / width; + + sums[lane_id] = 0; + + // Below is the basic structure of using a shfl instruction + // for a scan. + // Record "value" as a variable - we accumulate it along the way + uint32_t value; + if(testFunc != NULL) + { + if (id < threads) + { + uint32_t *inpHash; + if(d_validNonceTable == NULL) + { + // keine Nonce-Liste + inpHash = &inpHashes[id<<4]; + } else { + // Nonce-Liste verfügbar + int nonce = d_validNonceTable[id] - startNounce; + inpHash = &inpHashes[nonce<<4]; + } + value = (*testFunc)(inpHash); + } else { + value = 0; + } + } else { + value = data[id]; + } + + __syncthreads(); + + // Now accumulate in log steps up the chain + // compute sums, with another thread's value who is + // distance delta away (i). 
Note + // those threads where the thread 'i' away would have + // been out of bounds of the warp are unaffected. This + // creates the scan sum. + + #pragma unroll + for (int i=1; i<=width; i*=2) + { + uint32_t n = __shfl_up((int)value, i, width); + if (lane_id >= i) value += n; + } + + // value now holds the scan value for the individual thread + // next sum the largest values for each warp + + // write the sum of the warp to smem + //if (threadIdx.x % warpSize == warpSize-1) + if (threadIdx.x % width == width-1) + { + sums[warp_id] = value; + } + + __syncthreads(); + + // + // scan sum the warp sums + // the same shfl scan operation, but performed on warp sums + // + if (warp_id == 0) + { + uint32_t warp_sum = sums[lane_id]; + + for (int i=1; i<=width; i*=2) + { + uint32_t n = __shfl_up((int)warp_sum, i, width); + if (lane_id >= i) warp_sum += n; + } + + sums[lane_id] = warp_sum; + } + + __syncthreads(); + + // perform a uniform add across warps in the block + // read neighbouring warp's sum and add it to threads value + uint32_t blockSum = 0; + + if (warp_id > 0) + { + blockSum = sums[warp_id-1]; + } + + value += blockSum; + + // Now write out our result + data[id] = value; + + // last thread has sum, write write out the block's sum + if (partial_sums != NULL && threadIdx.x == blockDim.x-1) + { + partial_sums[blockIdx.x] = value; + } +#endif // SM3+ +} + +// Uniform add: add partial sums array +__global__ +void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) +{ + __shared__ uint32_t buf; + int id = ((blockIdx.x * blockDim.x) + threadIdx.x); + + if (id > len) return; + + if (threadIdx.x == 0) + { + buf = partial_sums[blockIdx.x]; + } + + __syncthreads(); + data[id] += buf; +} + +__global__ +void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, + uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) +{ + int id = ((blockIdx.x * blockDim.x) + threadIdx.x); + uint32_t actNounce = id; + uint32_t value; + if (id < threads) + { + uint32_t *inpHash; + if(d_validNonceTable == NULL) + { + // keine Nonce-Liste + inpHash = &inpHashes[id<<4]; + } else { + // Nonce-Liste verfügbar + int nonce = d_validNonceTable[id] - startNounce; + actNounce = nonce; + inpHash = &inpHashes[nonce<<4]; + } + + value = (*testFunc)(inpHash); + } else { + value = 0; + } + + if (value) { + int idx = sum[id]; + if(idx > 0) + outp[idx-1] = startNounce + actNounce; + } +} + +__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val) +{ + if(val == 0) + return 0; + + uint32_t mask = 0x80000000; + while( (val & mask) == 0 ) mask = mask >> 1; + + if( (val & (~mask)) != 0 ) + return mask << 1; + + return mask; +} + +__host__ +void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,uint32_t *d_nonces1, + cuda_compactTestFunction_t function, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) +{ + int orgThreads = threads; + threads = quark_compactTest_roundUpExp(threads); + // threadsPerBlock ausrechnen + int blockSize = 256; + int nSummen = threads / blockSize; + + int thr1 = (threads+blockSize-1) / blockSize; + int thr2 = threads / (blockSize*blockSize); + int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize; + int thr3 = (nSummen + blockSize2-1) / blockSize2; + + bool callThrid = (thr2 > 0) ? 
true : false; + + // Erster Initialscan + quark_compactTest_gpu_SCAN <<>>( + d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); + + // weitere Scans + if(callThrid) { + quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); + quark_compactTest_gpu_SCAN<<<1, thr2>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); + } else { + quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); + } + + // Sync + Anzahl merken + cudaStreamSynchronize(NULL); + + if(callThrid) + cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); + else + cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + if(callThrid) { + quark_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); + } + quark_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); + + quark_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, + function, orgThreads, startNounce, inpHashes, d_validNonceTable); + + // Sync + cudaStreamSynchronize(NULL); +} + +#if __CUDA_ARCH__ < 300 +__global__ __launch_bounds__(128, 8) +void quark_filter_gpu_sm2(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_hash[offset]); + d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 0x8; + if (d_NonceBranch[thread]) return; + // uint4 = 4x uint32_t = 16 bytes + uint4 *pdst = (uint4*) (&d_branch2[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__global__ __launch_bounds__(128, 8) +void quark_merge_gpu_sm2(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads && !d_NonceBranch[thread]) + { + const uint32_t offset = thread * 16U; + uint4 *pdst = (uint4*) (&d_hash[offset]); + uint4 *psrc = (uint4*) (&d_branch2[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} +#else +__global__ void quark_filter_gpu_sm2(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch) {} +__global__ void quark_merge_gpu_sm2(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch) {} +#endif + +__host__ +uint32_t quark_filter_cpu_sm2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // extract algo permution hashes to a second branch buffer + quark_filter_gpu_sm2 <<>> (threads, inpHashes, d_branch2, d_tempBranch1Nonces[thr_id]); + return threads; +} + +__host__ +void quark_merge_cpu_sm2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // put back second branch hashes to the common buffer d_hash + quark_merge_gpu_sm2 <<>> (threads, outpHashes, d_branch2, d_tempBranch1Nonces[thr_id]); +} + +////// ACHTUNG: Diese funktion 
geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) +__host__ +void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1, + uint32_t *d_nonces2, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) +{ + quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); + quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); +} + +__host__ +void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, + uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order) +{ + // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind + // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! + + quark_compactTest_cpu_dualCompaction(thr_id, threads, + h_numValid[thr_id], d_nonces1, d_nonces2, + startNounce, inpHashes, d_validNonceTable); + + cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser + *nrm1 = h_numValid[thr_id][0]; + *nrm2 = h_numValid[thr_id][1]; +} + +__host__ +void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, + uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, int order) +{ + // Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind + // "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! + + quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); + + cudaStreamSynchronize(NULL); + *nrm1 = h_numValid[thr_id][0]; +} \ No newline at end of file diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index dbe23c8f9f..91903acec7 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -1,173 +1,211 @@ // Auf QuarkCoin spezialisierte Version von Groestl inkl. 
Bitslice -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include +#include // off_t + +#include -// aus cpu-miner.c -extern int device_map[8]; +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 500 +#endif -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +#define TPB 256 +#define THF 4U -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; +#if __CUDA_ARCH__ >= 300 +#include "groestl_functions_quad.h" +#include "groestl_transf_quad.h" +#endif -// diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props[8]; +#define WANT_GROESTL80 +#ifdef WANT_GROESTL80 +__constant__ static uint32_t c_Message80[20]; +#endif -// 64 Register Variante für Compute 3.0 -#include "groestl_functions_quad.cu" -#include "bitslice_transformations_quad.cu" +#include "cuda_quark_groestl512_sm2.cuh" -__global__ void __launch_bounds__(256, 4) - quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +__global__ __launch_bounds__(TPB, THF) +void quark_groestl512_gpu_hash_64_quad(const uint32_t threads, const uint32_t startNounce, uint32_t * g_hash, uint32_t * __restrict g_nonceVector) { - // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen - int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; - if (thread < threads) - { - // GROESTL - uint32_t message[8]; - uint32_t state[8]; - - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[hashPosition<<4]; - -#pragma unroll 4 - for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)]; -#pragma unroll 4 - for(int k=4;k<8;k++) message[k] = 0; - - if ((threadIdx.x&0x03) == 0) message[4] = 0x80; - if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000; - - uint32_t msgBitsliced[8]; - to_bitslice_quad(message, msgBitsliced); - - groestl512_progressMessage_quad(state, msgBitsliced); - - // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash - uint32_t *outpHash = &g_hash[hashPosition<<4]; - uint32_t hash[16]; - from_bitslice_quad(state, hash); - - if ((threadIdx.x & 0x03) == 0) - { -#pragma unroll 16 - for(int k=0;k<16;k++) outpHash[k] = hash[k]; - } - } +#if __CUDA_ARCH__ >= 300 + + // BEWARE : 4-WAY CODE (one hash need 4 threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; + + if (thread < threads) + { + uint32_t message[8]; + uint32_t state[8]; + + uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); + off_t hashPosition = nounce - startNounce; + uint32_t *pHash = &g_hash[hashPosition << 4]; + + const uint32_t thr = threadIdx.x & 0x3; // % THF + + /*| M0 M1 M2 M3 | M4 M5 M6 M7 | (input) + --|-------------|-------------| + T0| 0 4 8 12 | 80 | + T1| 1 5 13 | | + T2| 2 6 14 | | + T3| 3 7 15 | 01 | + --|-------------|-------------| */ + + #pragma unroll + for(int k=0;k<4;k++) message[k] = pHash[thr + (k * THF)]; + + #pragma unroll + for(int k=4;k<8;k++) message[k] = 0; + + if (thr == 0) message[4] = 0x80U; // end of data tag + if (thr == 3) message[7] = 0x01000000U; + + uint32_t msgBitsliced[8]; + to_bitslice_quad(message, msgBitsliced); + + groestl512_progressMessage_quad(state, msgBitsliced); + + uint32_t hash[16]; + from_bitslice_quad(state, hash); + + // uint4 = 4x4 uint32_t = 16 bytes + if (thr == 0) { + uint4 *phash = (uint4*) hash; + uint4 *outpt = (uint4*) pHash; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + outpt[2] = phash[2]; + outpt[3] = phash[3]; + } + } +#endif } -__global__ void __launch_bounds__(256, 4) - quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +__host__ +void quark_groestl512_cpu_init(int thr_id, uint32_t threads) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2; - if (thread < threads) - { - // GROESTL - uint32_t message[8]; - uint32_t state[8]; - - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[hashPosition<<4]; - -#pragma unroll 4 - for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)]; -#pragma unroll 4 - for(int k=4;k<8;k++) message[k] = 0; - - if ((threadIdx.x&0x03) == 0) message[4] = 0x80; - if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000; - - uint32_t msgBitsliced[8]; - to_bitslice_quad(message, msgBitsliced); - - for (int round=0; round<2; round++) - { - groestl512_progressMessage_quad(state, msgBitsliced); - - if (round < 1) - { - // Verkettung zweier Runden inclusive Padding. 
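/*
 * Illustrative sketch (not part of this patch): the thread-to-hash mapping
 * behind the quad kernels above. One Groestl-512 hash is computed by
 * THF = 4 cooperating threads, so the host launcher scales the grid by 4.
 * The helper name below is hypothetical.
 */
static void demo_quad_launch_dims(uint32_t nb_hashes, dim3 &grid, dim3 &block)
{
	const uint32_t tpb = 256;      // TPB in this file
	const uint32_t factor = 4;     // THF: device threads per hash
	block = dim3(tpb);
	grid  = dim3(factor * ((nb_hashes + tpb - 1) / tpb));
}
/*
 * Inside the kernel each group of 4 consecutive threads shares one hash:
 *   thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;  // hash index
 *   thr    = threadIdx.x & 0x3;                             // lane 0..3
 * Lane thr loads message words thr, thr+4, thr+8 and thr+12, lane 0 adds the
 * 0x80 end-of-data byte, lane 3 the closing padding word (0x01000000), and
 * only lane 0 writes the 64-byte digest back after from_bitslice_quad().
 */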
- msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x%4)==3)<<13)); - msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341); - msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341); - msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341); - msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341); - msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341); - msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341); - msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x%4)==0)<<4)); - } - } - - // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash - uint32_t *outpHash = &g_hash[hashPosition<<4]; - uint32_t hash[16]; - from_bitslice_quad(state, hash); - - if ((threadIdx.x & 0x03) == 0) - { -#pragma unroll 16 - for(int k=0;k<16;k++) outpHash[k] = hash[k]; - } - } + int dev_id = device_map[thr_id]; + cuda_get_arch(thr_id); + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) + quark_groestl512_sm20_init(thr_id, threads); } -// Setup-Funktionen -__host__ void quark_groestl512_cpu_init(int thr_id, int threads) +__host__ +void quark_groestl512_cpu_free(int thr_id) { - cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) + quark_groestl512_sm20_free(thr_id); } -__host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - int threadsperblock = 256; + uint32_t threadsperblock = TPB; + + // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle + // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl + const uint32_t factor = THF; - // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle - // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl - const int factor = 4; + dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); + dim3 block(threadsperblock); - // berechne wie viele Thread Blocks wir brauchen - dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); - dim3 block(threadsperblock); + int dev_id = device_map[thr_id]; - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) + quark_groestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); + else + quark_groestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order); +} + +// -------------------------------------------------------------------------------------------------------------------------------------------- - quark_groestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); +#ifdef WANT_GROESTL80 - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, order, thr_id); +__host__ +void groestl512_setBlock_80(int thr_id, uint32_t *endiandata) +{ + cudaMemcpyToSymbol(c_Message80, endiandata, sizeof(c_Message80), 0, cudaMemcpyHostToDevice); } -__host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__global__ __launch_bounds__(TPB, THF) +void groestl512_gpu_hash_80_quad(const uint32_t threads, const 
uint32_t startNounce, uint32_t * g_outhash) { - int threadsperblock = 256; +#if __CUDA_ARCH__ >= 300 + // BEWARE : 4-WAY CODE (one hash need 4 threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; + if (thread < threads) + { + const uint32_t thr = threadIdx.x & 0x3; // % THF + + /*| M0 M1 M2 M3 M4 | M5 M6 M7 | (input) + --|----------------|----------| + T0| 0 4 8 12 16 | 80 | + T1| 1 5 17 | | + T2| 2 6 18 | | + T3| 3 7 Nc | 01 | + --|----------------|----------| TPR */ + + uint32_t message[8]; + + #pragma unroll 5 + for(int k=0; k<5; k++) message[k] = c_Message80[thr + (k * THF)]; + + #pragma unroll 3 + for(int k=5; k<8; k++) message[k] = 0; + + if (thr == 0) message[5] = 0x80U; + if (thr == 3) { + message[4] = cuda_swab32(startNounce + thread); + message[7] = 0x01000000U; + } + + uint32_t msgBitsliced[8]; + to_bitslice_quad(message, msgBitsliced); + + uint32_t state[8]; + groestl512_progressMessage_quad(state, msgBitsliced); + + uint32_t hash[16]; + from_bitslice_quad(state, hash); + + if (thr == 0) { /* 4 threads were done */ + const off_t hashPosition = thread; + //if (!thread) hash[15] = 0xFFFFFFFF; + uint4 *outpt = (uint4*) &g_outhash[hashPosition << 4]; + uint4 *phash = (uint4*) hash; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + outpt[2] = phash[2]; + outpt[3] = phash[3]; + } + } +#endif +} + +__host__ +void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + int dev_id = device_map[thr_id]; - // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle - // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl - const int factor = 4; + if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) { + const uint32_t threadsperblock = TPB; + const uint32_t factor = THF; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); - dim3 block(threadsperblock); + dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); + dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + groestl512_gpu_hash_80_quad <<>> (threads, startNounce, d_hash); - quark_doublegroestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); + } else { - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, order, thr_id); + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + groestl512_gpu_hash_80_sm2 <<>> (threads, startNounce, d_hash); + } } + +#endif diff --git a/quark/cuda_quark_groestl512_sm2.cuh b/quark/cuda_quark_groestl512_sm2.cuh new file mode 100644 index 0000000000..c07340d897 --- /dev/null +++ b/quark/cuda_quark_groestl512_sm2.cuh @@ -0,0 +1,359 @@ +// SM 2.x variant (tpruvot) + +#ifdef __INTELLISENSE__ +//#define __CUDA_ARCH__ 210 +#define __CUDACC__ +#include +#include +#define __byte_perm(a,b,c) (a) +#define tex1Dfetch(t, n) (n) +#endif + +#define USE_SHARED 1 + +static unsigned int *d_textures[MAX_GPUS][8]; + +#define PC32up(j, r) ((uint32_t)((j) + (r))) +#define PC32dn(j, r) 0 +#define QC32up(j, r) 0xFFFFFFFF +#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) + +#define B32_0(x) __byte_perm(x, 0, 0x4440) +//((x) & 0xFF) +#define B32_1(x) __byte_perm(x, 0, 0x4441) +//(((x) >> 8) & 0xFF) +#define B32_2(x) __byte_perm(x, 0, 0x4442) +//(((x) >> 16) & 0xFF) +#define B32_3(x) 
__byte_perm(x, 0, 0x4443) +//((x) >> 24) + +#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) +#define T0dn(x) (*((uint32_t*)mixtabs + ( 256+(x)))) +#define T1up(x) (*((uint32_t*)mixtabs + ( 512+(x)))) +#define T1dn(x) (*((uint32_t*)mixtabs + ( 768+(x)))) +#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x)))) +#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) +#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) +#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x)))) + +texture t0up1; +texture t0dn1; +texture t1up1; +texture t1dn1; +texture t2up1; +texture t2dn1; +texture t3up1; +texture t3dn1; + +extern uint32_t T0up_cpu[]; +extern uint32_t T0dn_cpu[]; +extern uint32_t T1up_cpu[]; +extern uint32_t T1dn_cpu[]; +extern uint32_t T2up_cpu[]; +extern uint32_t T2dn_cpu[]; +extern uint32_t T3up_cpu[]; +extern uint32_t T3dn_cpu[]; + +#if __CUDA_ARCH__ < 300 || defined(_DEBUG) + +#if (!USE_SHARED) +#include "groestl_simple.cuh" +#endif + +__device__ __forceinline__ +void quark_groestl512_perm_P(uint32_t *a, char *mixtabs) +{ + #pragma unroll 1 + for(int r=0; r<14; r++) + { + uint32_t t[32]; + + #pragma unroll 16 + for (int k=0; k<16; k++) + a[(k*2)+0] ^= PC32up(k<< 4, r); + + #pragma unroll 16 + for(int k=0;k<32;k+=2) { + uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); + uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); + uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); + uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); + + t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ + T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); + + t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ + T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); + } + + #pragma unroll 32 + for(int k=0; k<32; k++) + a[k] = t[k]; + } +} + +__device__ __forceinline__ +void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs) +{ + #pragma unroll 1 + for(int r=0; r<14; r++) + { + uint32_t t[32]; + + #pragma unroll 16 + for (int k=0; k<16; k++) { + a[(k*2)+0] ^= QC32up(k << 4, r); + a[(k*2)+1] ^= QC32dn(k << 4, r); + } + + #pragma unroll 16 + for(int k=0;k<32;k+=2) + { + uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); + uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); + uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); + uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); + + t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ + T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); + + t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ + T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); + } + #pragma unroll 32 + for(int k=0; k<32; k++) + a[k] = t[k]; + } +} + +#endif + +__global__ +void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +{ +#if __CUDA_ARCH__ < 300 || defined(_DEBUG) + +#if USE_SHARED + __shared__ char mixtabs[8 * 1024]; + if (threadIdx.x < 256) { + *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); + *((uint32_t*)mixtabs + ( 256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); + *((uint32_t*)mixtabs + ( 512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); + *((uint32_t*)mixtabs + ( 768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); 
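/*
 * Illustrative sketch (not part of this patch): the shared-memory staging
 * pattern used here. The eight 256-entry T-tables (8 KB in total) are copied
 * once per block, one row per thread for the first 256 threads, and the block
 * synchronizes before any thread indexes them through the T0up()..T3dn()
 * macros. The real code fetches through the bound textures (tex1Dfetch); the
 * plain-pointer version below is a hypothetical equivalent.
 */
__device__ void demo_stage_tables(uint32_t *smem_tabs, const uint32_t * __restrict__ gmem_tabs)
{
	// gmem_tabs: 8 tables of 256 words laid out back to back (2048 words)
	if (threadIdx.x < 256) {
		#pragma unroll
		for (int t = 0; t < 8; t++)
			smem_tabs[t * 256 + threadIdx.x] = gmem_tabs[t * 256 + threadIdx.x];
	}
	__syncthreads(); // the whole 8 KB must be resident before any table lookup
}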
+ *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); + *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); + } + __syncthreads(); +#endif + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // GROESTL + uint32_t message[32]; + uint32_t state[32]; + + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + off_t hashPosition = nounce - startNounce; + uint32_t *pHash = &g_hash[hashPosition * 16]; + + #pragma unroll 4 + for (int i=0; i<16; i += 4) + AS_UINT4(&message[i]) = AS_UINT4(&pHash[i]); + + message[16] = 0x80U; + #pragma unroll 14 + for(int i=17; i<31; i++) message[i] = 0; + message[31] = 0x01000000U; + + #pragma unroll 32 + for(int i=0; i<32; i++) state[i] = message[i]; + state[31] ^= 0x20000U; + + // Perm +#if USE_SHARED + quark_groestl512_perm_P(state, mixtabs); + state[31] ^= 0x20000U; + quark_groestl512_perm_Q(message, mixtabs); + #pragma unroll 32 + for(int i=0; i<32; i++) state[i] ^= message[i]; + #pragma unroll 16 + for(int i=16; i<32; i++) message[i] = state[i]; + quark_groestl512_perm_P(state, mixtabs); +#else + tex_groestl512_perm_P(state); + state[31] ^= 0x20000U; + tex_groestl512_perm_Q(message); + #pragma unroll 32 + for(int i=0; i<32; i++) state[i] ^= message[i]; + #pragma unroll 16 + for(int i=16; i<32; i++) message[i] = state[i]; + tex_groestl512_perm_P(state); +#endif + + #pragma unroll 16 + for(int i=16; i<32; i++) state[i] ^= message[i]; + + uint4 *outpt = (uint4*)(pHash); + uint4 *phash = (uint4*)(&state[16]); + outpt[0] = phash[0]; + outpt[1] = phash[1]; + outpt[2] = phash[2]; + outpt[3] = phash[3]; + } +#endif +} + +#define texDef(id, texname, texmem, texsource, texsize) { \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} + +__host__ +void quark_groestl512_sm20_init(int thr_id, uint32_t threads) +{ + // Texturen mit obigem Makro initialisieren + texDef(0, t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); + texDef(1, t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); + texDef(2, t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256); + texDef(3, t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); + texDef(4, t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256); + texDef(5, t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); + texDef(6, t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); + texDef(7, t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); +} + +__host__ +void quark_groestl512_sm20_free(int thr_id) +{ + if (!d_textures[thr_id][0]) return; + for (int i=0; i<8; i++) + cudaFree(d_textures[thr_id][i]); + d_textures[thr_id][0] = NULL; +} + +__host__ +void quark_groestl512_sm20_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + int threadsperblock = 512; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + quark_groestl512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); +} + +__host__ +void quark_doublegroestl512_sm20_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, 
uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + int threadsperblock = 512; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + quark_groestl512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); + quark_groestl512_gpu_hash_64<<>>(threads, startNounce, d_hash, d_nonceVector); +} + +// -------------------------------------------------------------------------------------------------------------------------------------------- + +#ifdef WANT_GROESTL80 + +// defined in groest512.cu +// __constant__ static uint32_t c_Message80[20]; + +__global__ +//__launch_bounds__(256) +void groestl512_gpu_hash_80_sm2(const uint32_t threads, const uint32_t startNounce, uint32_t * g_outhash) +{ +#if __CUDA_ARCH__ < 300 || defined(_DEBUG) + +#if USE_SHARED + __shared__ char mixtabs[8 * 1024]; + if (threadIdx.x < 256) { + *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); + *((uint32_t*)mixtabs + ( 256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x); + *((uint32_t*)mixtabs + ( 512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x); + *((uint32_t*)mixtabs + ( 768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x); + *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x); + *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x); + *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x); + } + __syncthreads(); +#endif + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t message[32]; + + #pragma unroll 5 + for (int i=0; i < 20; i += 4) + AS_UINT4(&message[i]) = AS_UINT4(&c_Message80[i]); + + message[19] = cuda_swab32(startNounce + thread); + message[20] = 0x80U; // end tag + + #pragma unroll + for(int i=21; i<31; i++) message[i] = 0U; + message[31] = 0x01000000U; // end block + + uint32_t state[32]; + #pragma unroll + for(int i=0; i<32; i++) state[i] = message[i]; + state[31] ^= 0x00020000U; // "...00000201" + +#if USE_SHARED + quark_groestl512_perm_P(state, mixtabs); + quark_groestl512_perm_Q(message, mixtabs); + + state[31] ^= 0x00020000U; + #pragma unroll 32 + for(int i=0; i<32; i++) state[i] ^= message[i]; + + #pragma unroll 16 + for(int i=16; i<32; i++) message[i] = state[i]; + + quark_groestl512_perm_P(state, mixtabs); +#else + tex_groestl512_perm_P(state); + tex_groestl512_perm_Q(message); + + state[31] ^= 0x00020000U; + #pragma unroll 32 + for(int i=0; i<32; i++) state[i] ^= message[i]; + + #pragma unroll 16 + for(int i=16; i<32; i++) message[i] = state[i]; + + tex_groestl512_perm_P(state); +#endif + #pragma unroll 16 + for(int i=16; i<32; i++) state[i] ^= message[i]; + + // uint4 = 4 x uint32_t = 16 bytes, x 4 => 64 bytes + const off_t hashPosition = thread; + + uint4 *outpt = (uint4*) (&g_outhash[hashPosition << 4]); + uint4 *phash = (uint4*) (&state[16]); + outpt[0] = phash[0]; + outpt[1] = phash[1]; + outpt[2] = phash[2]; + outpt[3] = phash[3]; + } +#endif +} + +#endif // WANT_GROESTL80 diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu index 775d5e2ee4..1a6136ff7b 100644 --- a/quark/cuda_quark_keccak512.cu +++ b/quark/cuda_quark_keccak512.cu @@ -1,180 +1,277 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long 
uint64_t; - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -#include "cuda_helper.h" - -#define U32TO64_LE(p) \ - (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) - -#define U64TO32_LE(p, v) \ - *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); - -static const uint64_t host_keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull -}; - -__constant__ uint64_t c_keccak_round_constants[24]; - -static __device__ __forceinline__ void -keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { - size_t i; - uint64_t t[5], u[5], v, w; - - /* absorb input */ -#pragma unroll 9 - for (i = 0; i < 72 / 8; i++, in += 2) - s[i] ^= U32TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} - -__global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; - - // Nachricht kopieren - uint32_t message[18]; -#pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = inpHash[i]; - - message[16] = 0x01; - message[17] = 0x80000000; - - // State initialisieren - uint64_t keccak_gpu_state[25]; -#pragma unroll 25 - for (int i=0; i<25; i++) - keccak_gpu_state[i] = 0; - - // den Block einmal gut durchschütteln - keccak_block(keccak_gpu_state, message, c_keccak_round_constants); - - // das Hash erzeugen - uint32_t hash[16]; - -#pragma unroll 8 - for (size_t i = 0; i < 64; i += 8) { - U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); - } - - // fertig - uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition]; - -#pragma unroll 16 - for(int i=0;i<16;i++) - outpHash[i] = hash[i]; - } -} - -// Setup-Funktionen -__host__ void quark_keccak512_cpu_init(int thr_id, int threads) -{ - // Kopiere die Hash-Tabellen in den GPU-Speicher - cudaMemcpyToSymbol( c_keccak_round_constants, - host_keccak_round_constants, - sizeof(host_keccak_round_constants), - 0, cudaMemcpyHostToDevice); -} - -__host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - quark_keccak512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); -} +#include +#include +#include // off_t + +#include "cuda_helper.h" + +#define U32TO64_LE(p) \ + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + +#define U64TO32_LE(p, v) \ + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + +static const uint64_t host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint64_t d_keccak_round_constants[24]; + +__device__ __forceinline__ +static void keccak_block(uint2 *s) +{ + size_t i; + uint2 t[5], u[5], v, w; + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROL2(t[1], 1); + u[1] = t[0] ^ ROL2(t[2], 1); + u[2] = t[1] ^ ROL2(t[3], 1); + u[3] = t[2] ^ ROL2(t[4], 1); + u[4] = t[3] ^ ROL2(t[0], 1); + + /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[1]; + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL2(s[19], 8); + s[19] = ROL2(s[23], 56); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; + v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= vectorize(d_keccak_round_constants[i]); + } +} + +__global__ +void quark_keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + off_t hashPosition = nounce - startNounce; + uint64_t *inpHash = &g_hash[hashPosition * 8]; + uint2 keccak_gpu_state[25]; + + for (int i = 0; i<8; i++) { + keccak_gpu_state[i] = vectorize(inpHash[i]); + } + keccak_gpu_state[8] = vectorize(0x8000000000000001ULL); + + for (int i=9; i<25; i++) { + keccak_gpu_state[i] = make_uint2(0, 0); + } + keccak_block(keccak_gpu_state); + + for(int i=0; i<8; i++) { + inpHash[i] = devectorize(keccak_gpu_state[i]); + } + } +} + +__device__ __forceinline__ +static void keccak_block_v30(uint64_t *s, const uint32_t *in) +{ + size_t i; + uint64_t t[5], u[5], v, w; + + #pragma unroll 9 + for (i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= d_keccak_round_constants[i]; + } +} + +__global__ +void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + off_t hashPosition = nounce - startNounce; + uint32_t *inpHash = (uint32_t*)&g_hash[hashPosition * 8]; + + uint32_t message[18]; + #pragma unroll 16 + for(int i=0;i<16;i++) + message[i] = inpHash[i]; + + message[16] = 0x01; + message[17] = 0x80000000; + + uint64_t keccak_gpu_state[25]; + #pragma unroll 25 + for (int i=0; i<25; i++) + keccak_gpu_state[i] = 0; + + keccak_block_v30(keccak_gpu_state, message); + + uint32_t hash[16]; + #pragma unroll 8 + for (size_t i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); + } + + uint32_t *outpHash = (uint32_t*)&g_hash[hashPosition * 8]; + #pragma unroll 16 + for(int i=0; i<16; i++) + outpHash[i] = hash[i]; + } +} + +__host__ +void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + int dev_id = device_map[thr_id]; + + if (device_sm[dev_id] >= 320) + quark_keccak512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + else + quark_keccak512_gpu_hash_64_v30<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + + MyStreamSynchronize(NULL, order, thr_id); +} + +void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads); +void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen); +void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +__host__ +void quark_keccak512_cpu_init(int thr_id, uint32_t threads) +{ + // required for the 64 bytes one + cudaMemcpyToSymbol(d_keccak_round_constants, host_keccak_round_constants, + sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice); + + jackpot_keccak512_cpu_init(thr_id, threads); +} + +__host__ +void keccak512_setBlock_80(int thr_id, uint32_t *endiandata) +{ + jackpot_keccak512_cpu_setBlock((void*)endiandata, 80); +} + +__host__ +void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + jackpot_keccak512_cpu_hash(thr_id, threads, startNounce, d_hash, 0); +} diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index d32e9c0bbd..7c4d99ab3a 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -1,449 +1,1103 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - -#include -#include - -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -#define SPH_C64(x) ((uint64_t)(x ## ULL)) - -// aus cpu-miner.c -extern "C" extern int device_map[8]; -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -// Take a look at: https://www.schneier.com/skein1.3.pdf - -#if __CUDA_ARCH__ >= 350 -__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { - uint2 result; - if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), 
"r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); -} -#else -#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) -#endif -#define SHL(x, n) ((x) << (n)) -#define SHR(x, n) ((x) >> (n)) - -// Zum testen Hostcode... -/* Hier erstmal die Tabelle mit den Konstanten für die Mix-Funktion. Kann später vll. - mal direkt in den Code eingesetzt werden -*/ - -/* - * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). - */ - -#define M9_0_0 0 -#define M9_0_1 1 -#define M9_0_2 2 -#define M9_0_3 3 -#define M9_0_4 4 -#define M9_0_5 5 -#define M9_0_6 6 -#define M9_0_7 7 - -#define M9_1_0 1 -#define M9_1_1 2 -#define M9_1_2 3 -#define M9_1_3 4 -#define M9_1_4 5 -#define M9_1_5 6 -#define M9_1_6 7 -#define M9_1_7 8 - -#define M9_2_0 2 -#define M9_2_1 3 -#define M9_2_2 4 -#define M9_2_3 5 -#define M9_2_4 6 -#define M9_2_5 7 -#define M9_2_6 8 -#define M9_2_7 0 - -#define M9_3_0 3 -#define M9_3_1 4 -#define M9_3_2 5 -#define M9_3_3 6 -#define M9_3_4 7 -#define M9_3_5 8 -#define M9_3_6 0 -#define M9_3_7 1 - -#define M9_4_0 4 -#define M9_4_1 5 -#define M9_4_2 6 -#define M9_4_3 7 -#define M9_4_4 8 -#define M9_4_5 0 -#define M9_4_6 1 -#define M9_4_7 2 - -#define M9_5_0 5 -#define M9_5_1 6 -#define M9_5_2 7 -#define M9_5_3 8 -#define M9_5_4 0 -#define M9_5_5 1 -#define M9_5_6 2 -#define M9_5_7 3 - -#define M9_6_0 6 -#define M9_6_1 7 -#define M9_6_2 8 -#define M9_6_3 0 -#define M9_6_4 1 -#define M9_6_5 2 -#define M9_6_6 3 -#define M9_6_7 4 - -#define M9_7_0 7 -#define M9_7_1 8 -#define M9_7_2 0 -#define M9_7_3 1 -#define M9_7_4 2 -#define M9_7_5 3 -#define M9_7_6 4 -#define M9_7_7 5 - -#define M9_8_0 8 -#define M9_8_1 0 -#define M9_8_2 1 -#define M9_8_3 2 -#define M9_8_4 3 -#define M9_8_5 4 -#define M9_8_6 5 -#define M9_8_7 6 - -#define M9_9_0 0 -#define M9_9_1 1 -#define M9_9_2 2 -#define M9_9_3 3 -#define M9_9_4 4 -#define M9_9_5 5 -#define M9_9_6 6 -#define M9_9_7 7 - -#define M9_10_0 1 -#define M9_10_1 2 -#define M9_10_2 3 -#define M9_10_3 4 -#define M9_10_4 5 -#define M9_10_5 6 -#define M9_10_6 7 -#define M9_10_7 8 - -#define M9_11_0 2 -#define M9_11_1 3 -#define M9_11_2 4 -#define M9_11_3 5 -#define M9_11_4 6 -#define M9_11_5 7 -#define M9_11_6 8 -#define M9_11_7 0 - -#define M9_12_0 3 -#define M9_12_1 4 -#define M9_12_2 5 -#define M9_12_3 6 -#define M9_12_4 7 -#define M9_12_5 8 -#define M9_12_6 0 -#define M9_12_7 1 - -#define M9_13_0 4 -#define M9_13_1 5 -#define M9_13_2 6 -#define M9_13_3 7 -#define M9_13_4 8 -#define M9_13_5 0 -#define M9_13_6 1 -#define M9_13_7 2 - -#define M9_14_0 5 -#define M9_14_1 6 -#define M9_14_2 7 -#define M9_14_3 8 -#define M9_14_4 0 -#define M9_14_5 1 -#define M9_14_6 2 -#define M9_14_7 3 - -#define M9_15_0 6 -#define M9_15_1 7 -#define M9_15_2 8 -#define M9_15_3 0 -#define M9_15_4 1 -#define M9_15_5 2 -#define M9_15_6 3 -#define M9_15_7 4 - -#define M9_16_0 7 -#define M9_16_1 8 -#define M9_16_2 0 -#define M9_16_3 1 -#define M9_16_4 2 -#define M9_16_5 3 -#define M9_16_6 4 -#define M9_16_7 5 - -#define M9_17_0 8 -#define M9_17_1 0 -#define M9_17_2 1 -#define M9_17_3 2 -#define M9_17_4 3 -#define M9_17_5 4 -#define M9_17_6 5 -#define M9_17_7 6 - -#define M9_18_0 0 -#define M9_18_1 1 -#define M9_18_2 2 -#define M9_18_3 3 -#define M9_18_4 4 -#define M9_18_5 5 -#define M9_18_6 6 -#define M9_18_7 7 - -/* - * 
M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). - */ - -#define M3_0_0 0 -#define M3_0_1 1 -#define M3_1_0 1 -#define M3_1_1 2 -#define M3_2_0 2 -#define M3_2_1 0 -#define M3_3_0 0 -#define M3_3_1 1 -#define M3_4_0 1 -#define M3_4_1 2 -#define M3_5_0 2 -#define M3_5_1 0 -#define M3_6_0 0 -#define M3_6_1 1 -#define M3_7_0 1 -#define M3_7_1 2 -#define M3_8_0 2 -#define M3_8_1 0 -#define M3_9_0 0 -#define M3_9_1 1 -#define M3_10_0 1 -#define M3_10_1 2 -#define M3_11_0 2 -#define M3_11_1 0 -#define M3_12_0 0 -#define M3_12_1 1 -#define M3_13_0 1 -#define M3_13_1 2 -#define M3_14_0 2 -#define M3_14_1 0 -#define M3_15_0 0 -#define M3_15_1 1 -#define M3_16_0 1 -#define M3_16_1 2 -#define M3_17_0 2 -#define M3_17_1 0 -#define M3_18_0 0 -#define M3_18_1 1 - -#define XCAT(x, y) XCAT_(x, y) -#define XCAT_(x, y) x ## y - -#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) -#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) - -#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ - k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ - ^ SPH_C64(0x1BD11BDAA9FC1A22); \ - t2 = t0 ^ t1; \ - } - -#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ - w0 = (w0 + SKBI(k, s, 0)); \ - w1 = (w1 + SKBI(k, s, 1)); \ - w2 = (w2 + SKBI(k, s, 2)); \ - w3 = (w3 + SKBI(k, s, 3)); \ - w4 = (w4 + SKBI(k, s, 4)); \ - w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = (w7 + SKBI(k, s, 7) + (uint64_t)s); \ - } - -#define TFBIG_MIX(x0, x1, rc) { \ - x0 = x0 + x1; \ - x1 = ROTL64(x1, rc) ^ x0; \ - } - -#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ - TFBIG_MIX(w0, w1, rc0); \ - TFBIG_MIX(w2, w3, rc1); \ - TFBIG_MIX(w4, w5, rc2); \ - TFBIG_MIX(w6, w7, rc3); \ - } - -#define TFBIG_4e(s) { \ - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ - TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ - TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ - TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ - } - -#define TFBIG_4o(s) { \ - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ - TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ - TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ - TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ - } - -static __constant__ uint64_t d_constMem[8]; -static uint64_t h_constMem[8] = { - SPH_C64(0x4903ADFF749C51CE), - SPH_C64(0x0D95DE399746DF03), - SPH_C64(0x8FD1934127C79BCE), - SPH_C64(0x9A255629FF352CB1), - SPH_C64(0x5DB62599DF6CA7B0), - SPH_C64(0xEABE394CA9D5C3F4), - SPH_C64(0x991112C71A75B523), - SPH_C64(0xAE18A40B660FCC33) }; - -__global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // Skein - uint64_t p[8]; - uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8; - uint64_t t0, t1, t2; - - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[8 * hashPosition]; - - // Initialisierung - h0 = d_constMem[0]; - h1 = d_constMem[1]; - h2 = d_constMem[2]; - h3 = d_constMem[3]; - h4 = d_constMem[4]; - h5 = d_constMem[5]; - h6 = d_constMem[6]; - h7 = d_constMem[7]; - - // 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg -#pragma unroll 8 - for(int i=0;i<8;i++) - p[i] = inpHash[i]; - - t0 = 64; // ptr - t1 = 480ull << 55; // etype - TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - h0 = inpHash[0] ^ p[0]; - h1 = inpHash[1] ^ p[1]; - h2 = inpHash[2] ^ p[2]; - h3 = inpHash[3] ^ p[3]; - h4 = inpHash[4] ^ p[4]; - h5 = inpHash[5] ^ p[5]; - h6 = inpHash[6] ^ p[6]; - h7 = inpHash[7] ^ p[7]; - - // 2. Runde -> etype = 510, ptr = 8, bcount = 0, data = 0 -#pragma unroll 8 - for(int i=0;i<8;i++) - p[i] = 0; - - t0 = 8; // ptr - t1 = 510ull << 55; // etype - TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - // fertig - uint64_t *outpHash = &g_hash[8 * hashPosition]; - -#pragma unroll 8 - for(int i=0;i<8;i++) - outpHash[i] = p[i]; - } -} - -// Setup-Funktionen -__host__ void quark_skein512_cpu_init(int thr_id, int threads) -{ - // nix zu tun ;-) - cudaMemcpyToSymbol( d_constMem, - h_constMem, - sizeof(h_constMem), - 0, cudaMemcpyHostToDevice); -} - -__host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) -{ - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - quark_skein512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, order, thr_id); -} +/* SKEIN 64 and 80 based on Alexis Provos version */ + +#define TPB52 512 +#define TPB50 256 + +#include +#include +#include + +/* ************************ */ + +/* + * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). 
+ */ + +#define M9_0_0 0 +#define M9_0_1 1 +#define M9_0_2 2 +#define M9_0_3 3 +#define M9_0_4 4 +#define M9_0_5 5 +#define M9_0_6 6 +#define M9_0_7 7 + +#define M9_1_0 1 +#define M9_1_1 2 +#define M9_1_2 3 +#define M9_1_3 4 +#define M9_1_4 5 +#define M9_1_5 6 +#define M9_1_6 7 +#define M9_1_7 8 + +#define M9_2_0 2 +#define M9_2_1 3 +#define M9_2_2 4 +#define M9_2_3 5 +#define M9_2_4 6 +#define M9_2_5 7 +#define M9_2_6 8 +#define M9_2_7 0 + +#define M9_3_0 3 +#define M9_3_1 4 +#define M9_3_2 5 +#define M9_3_3 6 +#define M9_3_4 7 +#define M9_3_5 8 +#define M9_3_6 0 +#define M9_3_7 1 + +#define M9_4_0 4 +#define M9_4_1 5 +#define M9_4_2 6 +#define M9_4_3 7 +#define M9_4_4 8 +#define M9_4_5 0 +#define M9_4_6 1 +#define M9_4_7 2 + +#define M9_5_0 5 +#define M9_5_1 6 +#define M9_5_2 7 +#define M9_5_3 8 +#define M9_5_4 0 +#define M9_5_5 1 +#define M9_5_6 2 +#define M9_5_7 3 + +#define M9_6_0 6 +#define M9_6_1 7 +#define M9_6_2 8 +#define M9_6_3 0 +#define M9_6_4 1 +#define M9_6_5 2 +#define M9_6_6 3 +#define M9_6_7 4 + +#define M9_7_0 7 +#define M9_7_1 8 +#define M9_7_2 0 +#define M9_7_3 1 +#define M9_7_4 2 +#define M9_7_5 3 +#define M9_7_6 4 +#define M9_7_7 5 + +#define M9_8_0 8 +#define M9_8_1 0 +#define M9_8_2 1 +#define M9_8_3 2 +#define M9_8_4 3 +#define M9_8_5 4 +#define M9_8_6 5 +#define M9_8_7 6 + +#define M9_9_0 0 +#define M9_9_1 1 +#define M9_9_2 2 +#define M9_9_3 3 +#define M9_9_4 4 +#define M9_9_5 5 +#define M9_9_6 6 +#define M9_9_7 7 + +#define M9_10_0 1 +#define M9_10_1 2 +#define M9_10_2 3 +#define M9_10_3 4 +#define M9_10_4 5 +#define M9_10_5 6 +#define M9_10_6 7 +#define M9_10_7 8 + +#define M9_11_0 2 +#define M9_11_1 3 +#define M9_11_2 4 +#define M9_11_3 5 +#define M9_11_4 6 +#define M9_11_5 7 +#define M9_11_6 8 +#define M9_11_7 0 + +#define M9_12_0 3 +#define M9_12_1 4 +#define M9_12_2 5 +#define M9_12_3 6 +#define M9_12_4 7 +#define M9_12_5 8 +#define M9_12_6 0 +#define M9_12_7 1 + +#define M9_13_0 4 +#define M9_13_1 5 +#define M9_13_2 6 +#define M9_13_3 7 +#define M9_13_4 8 +#define M9_13_5 0 +#define M9_13_6 1 +#define M9_13_7 2 + +#define M9_14_0 5 +#define M9_14_1 6 +#define M9_14_2 7 +#define M9_14_3 8 +#define M9_14_4 0 +#define M9_14_5 1 +#define M9_14_6 2 +#define M9_14_7 3 + +#define M9_15_0 6 +#define M9_15_1 7 +#define M9_15_2 8 +#define M9_15_3 0 +#define M9_15_4 1 +#define M9_15_5 2 +#define M9_15_6 3 +#define M9_15_7 4 + +#define M9_16_0 7 +#define M9_16_1 8 +#define M9_16_2 0 +#define M9_16_3 1 +#define M9_16_4 2 +#define M9_16_5 3 +#define M9_16_6 4 +#define M9_16_7 5 + +#define M9_17_0 8 +#define M9_17_1 0 +#define M9_17_2 1 +#define M9_17_3 2 +#define M9_17_4 3 +#define M9_17_5 4 +#define M9_17_6 5 +#define M9_17_7 6 + +#define M9_18_0 0 +#define M9_18_1 1 +#define M9_18_2 2 +#define M9_18_3 3 +#define M9_18_4 4 +#define M9_18_5 5 +#define M9_18_6 6 +#define M9_18_7 7 + +/* + * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). 
+ */ + +#define M3_0_0 0 +#define M3_0_1 1 +#define M3_1_0 1 +#define M3_1_1 2 +#define M3_2_0 2 +#define M3_2_1 0 +#define M3_3_0 0 +#define M3_3_1 1 +#define M3_4_0 1 +#define M3_4_1 2 +#define M3_5_0 2 +#define M3_5_1 0 +#define M3_6_0 0 +#define M3_6_1 1 +#define M3_7_0 1 +#define M3_7_1 2 +#define M3_8_0 2 +#define M3_8_1 0 +#define M3_9_0 0 +#define M3_9_1 1 +#define M3_10_0 1 +#define M3_10_1 2 +#define M3_11_0 2 +#define M3_11_1 0 +#define M3_12_0 0 +#define M3_12_1 1 +#define M3_13_0 1 +#define M3_13_1 2 +#define M3_14_0 2 +#define M3_14_1 0 +#define M3_15_0 0 +#define M3_15_1 1 +#define M3_16_0 1 +#define M3_16_1 2 +#define M3_17_0 2 +#define M3_17_1 0 +#define M3_18_0 0 +#define M3_18_1 1 + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) +#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) + +#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + make_uint2(s,0); \ +} + +#define TFBIG_MIX(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROL2(x1, rc) ^ x0; \ +} + +#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX(w0, w1, rc0); \ + TFBIG_MIX(w2, w3, rc1); \ + TFBIG_MIX(w4, w5, rc2); \ + TFBIG_MIX(w6, w7, rc3); \ +} + +#define TFBIG_4e(s) { \ + TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ +} + +#define TFBIG_4o(s) { \ + TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ +} + +/* uint2 variant for SM3.2+ */ + +#define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ + k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ + ^ vectorize(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ +} + +#define TFBIG_ADDKEY_UI2(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + vectorize(s)); \ +} + +#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + (s)); \ +} + +#define TFBIG_MIX_UI2(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROL2(x1, rc) ^ x0; \ +} + +#define TFBIG_MIX_PRE(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROTL64(x1, rc) ^ x0; \ +} + +#define 
TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_UI2(w0, w1, rc0); \ + TFBIG_MIX_UI2(w2, w3, rc1); \ + TFBIG_MIX_UI2(w4, w5, rc2); \ + TFBIG_MIX_UI2(w6, w7, rc3); \ +} + +#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_PRE(w0, w1, rc0); \ + TFBIG_MIX_PRE(w2, w3, rc1); \ + TFBIG_MIX_PRE(w4, w5, rc2); \ + TFBIG_MIX_PRE(w6, w7, rc3); \ +} + +#define TFBIG_4e_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ +} + +#define TFBIG_4e_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ +} + +#define TFBIG_4o_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ +} + +#define TFBIG_4o_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ +} + +#define macro1() {\ + p[0] += p[1]; p[2] += p[3]; p[4] += p[5]; p[6] += p[7]; p[1] = ROL2(p[1],46) ^ p[0]; \ + p[3] = ROL2(p[3],36) ^ p[2]; p[5] = ROL2(p[5],19) ^ p[4]; p[7] = ROL2(p[7], 37) ^ p[6]; \ + p[2] += p[1]; p[4] += p[7]; p[6] += p[5]; p[0] += p[3]; p[1] = ROL2(p[1],33) ^ p[2]; \ + p[7] = ROL2(p[7],27) ^ p[4]; p[5] = ROL2(p[5],14) ^ p[6]; p[3] = ROL2(p[3], 42) ^ p[0]; \ + p[4] += p[1]; p[6] += p[3]; p[0] += p[5]; p[2] += p[7]; p[1] = ROL2(p[1],17) ^ p[4]; \ + p[3] = ROL2(p[3],49) ^ p[6]; p[5] = ROL2(p[5],36) ^ p[0]; p[7] = ROL2(p[7], 39) ^ p[2]; \ + p[6] += p[1]; p[0] += p[7]; p[2] += p[5]; p[4] += p[3]; p[1] = ROL2(p[1],44) ^ p[6]; \ + p[7] = ROL2(p[7], 9) ^ p[0]; p[5] = ROL2(p[5],54) ^ p[2]; p[3] = ROR8(p[3]) ^ p[4]; \ +} + +#define macro2() { \ + p[0] += p[1]; p[2] += p[3]; p[4] += p[5]; p[6] += p[7]; p[1] = ROL2(p[1], 39) ^ p[0]; \ + p[3] = ROL2(p[3], 30) ^ p[2]; p[5] = ROL2(p[5], 34) ^ p[4]; p[7] = ROL24(p[7]) ^ p[6]; \ + p[2] += p[1]; p[4] += p[7]; p[6] += p[5]; p[0] += p[3]; p[1] = ROL2(p[1], 13) ^ p[2]; \ + p[7] = ROL2(p[7], 50) ^ p[4]; p[5] = ROL2(p[5], 10) ^ p[6]; p[3] = ROL2(p[3], 17) ^ p[0]; \ + p[4] += p[1]; p[6] += p[3]; p[0] += p[5]; p[2] += p[7]; p[1] = ROL2(p[1], 25) ^ p[4]; \ + p[3] = ROL2(p[3], 29) ^ p[6]; p[5] = ROL2(p[5], 39) ^ p[0]; p[7] = ROL2(p[7], 43) ^ p[2]; \ + p[6] += p[1]; p[0] += 
p[7]; p[2] += p[5]; p[4] += p[3]; p[1] = ROL8(p[1]) ^ p[6]; \ + p[7] = ROL2(p[7], 35) ^ p[0]; p[5] = ROR8(p[5]) ^ p[2]; p[3] = ROL2(p[3], 22) ^ p[4]; \ +} + +#define macro3() { \ + hash64[0]+= hash64[1]; hash64[2]+= hash64[3]; hash64[4]+= hash64[5]; hash64[6]+= hash64[7]; \ + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; \ + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; \ + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; \ + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; \ + hash64[2]+= hash64[1]; hash64[4]+= hash64[7]; hash64[6]+= hash64[5]; hash64[0]+= hash64[3]; \ + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; \ + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; \ + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; \ + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; \ + hash64[4]+= hash64[1]; hash64[6]+= hash64[3]; hash64[0]+= hash64[5]; hash64[2]+= hash64[7]; \ + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; \ + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; \ + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; \ + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; \ + hash64[6]+= hash64[1]; hash64[0]+= hash64[7]; hash64[2]+= hash64[5]; hash64[4]+= hash64[3]; \ + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; \ + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; \ + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; \ + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; \ +} + +#define macro4() {\ + hash64[0]+= hash64[1]; hash64[2]+= hash64[3]; hash64[4]+= hash64[5]; hash64[6]+= hash64[7]; \ + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; \ + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; \ + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; \ + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; \ + hash64[2]+= hash64[1]; hash64[4]+= hash64[7]; hash64[6]+= hash64[5]; hash64[0]+= hash64[3]; \ + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; \ + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; \ + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; \ + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; \ + hash64[4]+= hash64[1]; hash64[6]+= hash64[3]; hash64[0]+= hash64[5]; hash64[2]+= hash64[7]; \ + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; \ + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; \ + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; \ + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; \ + hash64[6]+= hash64[1]; hash64[0]+= hash64[7]; hash64[2]+= hash64[5]; hash64[4]+= hash64[3]; \ + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; \ + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; \ + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; \ + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; \ +} + +__constant__ const uint2 buffer[112] = { + {0x749C51CE, 0x4903ADFF}, {0x9746DF03, 0x0D95DE39}, {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, + {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C434, 0xEABE394C}, {0x1A75B523, 0x891112C7}, {0x660FCC33, 0xAE18A40B}, + {0x9746DF03, 0x0D95DE39}, {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, + {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x891112C7}, {0x660FCC73, 0x9E18A40B}, {0x98173EC5, 0xCAB2076D}, + {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, + {0x1A75B523, 0x991112C7}, {0x660FCC73, 0x9E18A40B}, {0x98173F04, 0xCAB2076D}, {0x749C51D0, 0x4903ADFF}, + {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x991112C7}, + {0x660FCC33, 0xAE18A40B}, {0x98173F04, 0xCAB2076D}, {0x749C51CE, 0x3903ADFF}, {0x9746DF06, 0x0D95DE39}, + {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x991112C7}, {0x660FCC33, 0xAE18A40B}, + {0x98173EC4, 0xCAB2076D}, 
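/*
 * The entries of this buffer[112] table are Threefish-512 subkeys precomputed from the
 * fixed Skein-512 IV, stored as uint2 {lo, hi} pairs: for subkey s, words 0..4 are
 * h[(s+i) % 9], word 5 adds tweak t[s % 3], word 6 adds t[(s+1) % 3] and word 7 adds the
 * round index s. A hedged host-side sketch of how one group could be built (h[0..8] is
 * the extended key, t[0..2] the tweak with t[2] = t[0] ^ t[1]; names hypothetical):
 *
 *   uint64_t sub[8];
 *   for (int i = 0; i < 8; i++) sub[i] = h[(s + i) % 9];
 *   sub[5] += t[s % 3];
 *   sub[6] += t[(s + 1) % 3];
 *   sub[7] += (uint64_t)s;
 */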
{0x749C51CE, 0x3903ADFF}, {0x9746DF43, 0xFD95DE39}, {0x27C79BD2, 0x8FD19341}, + {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x991112C7}, {0x660FCC33, 0xAE18A40B}, {0x98173EC4, 0xCAB2076D}, + {0x749C51CE, 0x4903ADFF}, {0x9746DF43, 0xFD95DE39}, {0x27C79C0E, 0x8FD19341}, {0xFF352CB6, 0x9A255629}, + {0x1A75B523, 0x991112C7}, {0x660FCC33, 0xAE18A40B}, {0x98173EC4, 0xCAB2076D}, {0x749C51CE, 0x4903ADFF}, + {0x9746DF03, 0x0D95DE39}, {0x27C79C0E, 0x8FD19341}, {0xFF352CB1, 0x8A255629}, {0xDF6CA7B6, 0x5DB62599}, + {0x660FCC33, 0xAE18A40B}, {0x98173EC4, 0xCAB2076D}, {0x749C51CE, 0x4903ADFF}, {0x9746DF03, 0x0D95DE39}, + {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x8A255629}, {0xDF6CA7F0, 0x4DB62599}, {0xA9D5C3FB, 0xEABE394C}, + {0x98173EC4, 0xCAB2076D}, {0x749C51CE, 0x4903ADFF}, {0x9746DF03, 0x0D95DE39}, {0x27C79BCE, 0x8FD19341}, + {0xFF352CB1, 0x9A255629}, {0xDF6CA7F0, 0x4DB62599}, {0xA9D5C434, 0xEABE394C}, {0x1A75B52B, 0x991112C7}, + {0x749C51CE, 0x4903ADFF}, {0x9746DF03, 0x0D95DE39}, {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, + {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C434, 0xEABE394C}, {0x1A75B523, 0x891112C7}, {0x660FCC3C, 0xAE18A40B}, + {0x9746DF03, 0x0D95DE39}, {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, + {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x891112C7}, {0x660FCC73, 0x9E18A40B}, {0x98173ece, 0xcab2076d}, + {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, + {0x1A75B523, 0x991112C7}, {0x660FCC73, 0x9E18A40B}, {0x98173F04, 0xCAB2076D}, {0x749C51D9, 0x4903ADFF}, + {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x991112C7}, + {0x660FCC33, 0xAE18A40B}, {0x98173F04, 0xCAB2076D}, {0x749C51CE, 0x3903ADFF}, {0x9746DF0F, 0x0D95DE39}, + {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x991112C7}, {0x660FCC33, 0xAE18A40B}, + {0x98173EC4, 0xCAB2076D}, {0x749C51CE, 0x3903ADFF}, {0x9746DF43, 0xFD95DE39}, {0x27C79BDB, 0x8FD19341} +}; + +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(TPB52, 3) +#else +__launch_bounds__(TPB50, 5) +#endif +void quark_skein512_gpu_hash_64(const uint32_t threads, const uint32_t startNonce, uint64_t* __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads){ + + // Skein + uint2 p[8], h[9]; + + const uint32_t hashPosition = (g_nonceVector == NULL) ? 
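/*
 * When this kernel runs behind a nonce-compaction pass, g_nonceVector holds the absolute
 * nonces that survived the previous stage; subtracting startNonce recovers the slot in
 * g_hash. The same lookup, written out as a hedged sketch (8 x 64-bit words per hash):
 *
 *   uint32_t pos  = g_nonceVector ? (g_nonceVector[thread] - startNonce) : thread;
 *   uint64_t *h64 = &g_hash[pos * 8u];
 */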
thread : g_nonceVector[thread] - startNonce; + + uint64_t *Hash = &g_hash[hashPosition<<3]; + + uint2x4 *phash = (uint2x4*)Hash; + *(uint2x4*)&p[0] = __ldg4(&phash[0]); + *(uint2x4*)&p[4] = __ldg4(&phash[1]); + + h[0] = p[0]; h[1] = p[1]; h[2] = p[2]; h[3] = p[3]; + h[4] = p[4]; h[5] = p[5]; h[6] = p[6]; h[7] = p[7]; + + p[0] += buffer[ 0]; p[1] += buffer[ 1]; p[2] += buffer[ 2]; p[3] += buffer[3]; + p[4] += buffer[ 4]; p[5] += buffer[ 5]; p[6] += buffer[ 6]; p[7] += buffer[7]; + macro1(); + p[0] += buffer[ 8]; p[1] += buffer[ 9]; p[2] += buffer[ 10]; p[3] += buffer[11]; + p[4] += buffer[ 12]; p[5] += buffer[ 13]; p[6] += buffer[ 14]; p[7] += buffer[15]; + macro2(); + p[0] += buffer[ 16]; p[1] += buffer[ 17]; p[2] += buffer[ 18]; p[3] += buffer[19]; + p[4] += buffer[ 20]; p[5] += buffer[ 21]; p[6] += buffer[ 22]; p[7] += buffer[23]; + macro1(); + p[0] += buffer[ 24]; p[1] += buffer[ 25]; p[2] += buffer[ 26]; p[3] += buffer[27]; + p[4] += buffer[ 28]; p[5] += buffer[ 29]; p[6] += buffer[ 30]; p[7] += buffer[31]; + macro2(); + p[0] += buffer[ 32]; p[1] += buffer[ 33]; p[2] += buffer[ 34]; p[3] += buffer[35]; + p[4] += buffer[ 36]; p[5] += buffer[ 37]; p[6] += buffer[ 38]; p[7] += buffer[39]; + macro1(); + p[0] += buffer[ 40]; p[1] += buffer[ 41]; p[2] += buffer[ 42]; p[3] += buffer[43]; + p[4] += buffer[ 44]; p[5] += buffer[ 45]; p[6] += buffer[ 46]; p[7] += buffer[47]; + macro2(); + p[0] += buffer[ 48]; p[1] += buffer[ 49]; p[2] += buffer[ 50]; p[3] += buffer[51]; + p[4] += buffer[ 52]; p[5] += buffer[ 53]; p[6] += buffer[ 54]; p[7] += buffer[55]; + macro1(); + p[0] += buffer[ 56]; p[1] += buffer[ 57]; p[2] += buffer[ 58]; p[3] += buffer[59]; + p[4] += buffer[ 60]; p[5] += buffer[ 61]; p[6] += buffer[ 62]; p[7] += buffer[63]; + macro2(); + p[0] += buffer[ 64]; p[1] += buffer[ 65]; p[2] += buffer[ 66]; p[3] += buffer[67]; + p[4] += buffer[ 68]; p[5] += buffer[ 69]; p[6] += buffer[ 70]; p[7] += buffer[71]; + macro1(); + p[0] += buffer[ 72]; p[1] += buffer[ 73]; p[2] += buffer[ 74]; p[3] += buffer[75]; + p[4] += buffer[ 76]; p[5] += buffer[ 77]; p[6] += buffer[ 78]; p[7] += buffer[79]; + macro2(); + p[0] += buffer[ 80]; p[1] += buffer[ 81]; p[2] += buffer[ 82]; p[3] += buffer[83]; + p[4] += buffer[ 84]; p[5] += buffer[ 85]; p[6] += buffer[ 86]; p[7] += buffer[87]; + macro1(); + p[0] += buffer[ 88]; p[1] += buffer[ 89]; p[2] += buffer[ 90]; p[3] += buffer[91]; + p[4] += buffer[ 92]; p[5] += buffer[ 93]; p[6] += buffer[ 94]; p[7] += buffer[95]; + macro2(); + p[0] += buffer[ 96]; p[1] += buffer[ 97]; p[2] += buffer[ 98]; p[3] += buffer[99]; + p[4] += buffer[100]; p[5] += buffer[101]; p[6] += buffer[102]; p[7] += buffer[103]; + macro1(); + p[0] += buffer[104]; p[1] += buffer[105]; p[2] += buffer[106]; p[3] += buffer[107]; + p[4] += buffer[108]; p[5] += buffer[109]; p[6] += buffer[110]; p[7] += buffer[111]; + macro2(); + p[0]+= make_uint2(0xA9D5C3F4, 0xEABE394C); p[1]+= make_uint2(0x1A75B523, 0x991112C7); + p[2]+= make_uint2(0x660FCC33, 0xAE18A40B); p[3]+= make_uint2(0x98173EC4, 0xCAB2076D); + p[4]+= make_uint2(0x749C51CE, 0x4903ADFF); p[5]+= make_uint2(0x9746DF43, 0xFD95DE39); + p[6]+= make_uint2(0x27C79C0E, 0x8FD19341); p[7]+= make_uint2(0xFF352CBF, 0x9A255629); + macro1(); + p[0]+= make_uint2(0x1A75B523, 0x991112C7); p[1]+= make_uint2(0x660FCC33, 0xAE18A40B); + p[2]+= make_uint2(0x98173EC4, 0xCAB2076D); p[3]+= make_uint2(0x749C51CE, 0x4903ADFF); + p[4]+= make_uint2(0x9746DF03, 0x0D95DE39); p[5]+= make_uint2(0x27C79C0E, 0x8FD19341); + p[6]+= make_uint2(0xFF352CB1, 0x8A255629); p[7]+= 
make_uint2(0xDF6CA7BF, 0x5DB62599); + macro2(); + p[0] += vectorize(0xAE18A40B660FCC33); p[1] += vectorize(0xcab2076d98173ec4); + p[2] += vectorize(0x4903ADFF749C51CE); p[3] += vectorize(0x0D95DE399746DF03); + p[4] += vectorize(0x8FD1934127C79BCE); p[5] += vectorize(0x8A255629FF352CB1); + p[6] += vectorize(0x4DB62599DF6CA7F0); p[7] += vectorize(0xEABE394CA9D5C3F4 + 16); + macro1(); + p[0] += vectorize(0xcab2076d98173ec4); p[1] += vectorize(0x4903ADFF749C51CE); + p[2] += vectorize(0x0D95DE399746DF03); p[3] += vectorize(0x8FD1934127C79BCE); + p[4] += vectorize(0x9A255629FF352CB1); p[5] += vectorize(0x4DB62599DF6CA7F0); + p[6] += vectorize(0xEABE394CA9D5C3F4 + 0x0000000000000040); + p[7] += vectorize(0x991112C71A75B523 + 17); + macro2(); + p[0] += vectorize(0x4903ADFF749C51CE); p[1] += vectorize(0x0D95DE399746DF03); + p[2] += vectorize(0x8FD1934127C79BCE); p[3] += vectorize(0x9A255629FF352CB1); + p[4] += vectorize(0x5DB62599DF6CA7B0); p[5] += vectorize(0xEABE394CA9D5C3F4 + 0x0000000000000040); + p[6] += vectorize(0x891112C71A75B523); p[7] += vectorize(0xAE18A40B660FCC33 + 18); + + #define h0 p[0] + #define h1 p[1] + #define h2 p[2] + #define h3 p[3] + #define h4 p[4] + #define h5 p[5] + #define h6 p[6] + #define h7 p[7] + + h0 ^= h[0]; h1 ^= h[1]; h2 ^= h[2]; h3 ^= h[3]; + h4 ^= h[4]; h5 ^= h[5]; h6 ^= h[6]; h7 ^= h[7]; + + uint2 skein_h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ vectorize(0x1BD11BDAA9FC1A22); + + uint2 hash64[8]; + + hash64[5] = h5 + 8; + + hash64[0] = h0 + h1; + hash64[1] = ROL2(h1, 46) ^ hash64[0]; + hash64[2] = h2 + h3; + hash64[3] = ROL2(h3, 36) ^ hash64[2]; + hash64[4] = h4 + hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] = (h6 + h7 + make_uint2(0,0xff000000)); + hash64[7] = ROL2(h7, 37) ^ hash64[6]; + hash64[2]+= hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4]+= hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6]+= hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0]+= hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4]+= hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6]+= hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0]+= hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2]+= hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6]+= hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0]+= hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2]+= hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4]+= hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + + hash64[0]+= h1; hash64[1]+= h2; hash64[2]+= h3; hash64[3]+= h4; + hash64[4]+= h5; + hash64[5]+= h6 + make_uint2(0,0xff000000); + hash64[6]+= h7 + vectorize(0xff00000000000008); + hash64[7]+= skein_h8 + 1; + macro3(); + hash64[0]+= h2; hash64[1]+= h3; hash64[2]+= h4; hash64[3]+= h5; + hash64[4]+= h6; + hash64[5]+= h7 + vectorize(0xff00000000000008); + hash64[6]+= skein_h8 + 8; + hash64[7]+= h0 + 2; + macro4(); + hash64[0] = (hash64[0] + h3); hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); hash64[5] = (hash64[5] + skein_h8 + 8); + hash64[6] = (hash64[6] + h0 + make_uint2(0,0xff000000)); + hash64[7] = (hash64[7] + h1 + 3); + macro3(); + hash64[0] = (hash64[0] + h4); hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); hash64[5] = (hash64[5] + h0 + 
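/*
 * This part of the kernel is the Skein output transform: make_uint2(0, 0xff000000) is
 * the uint2 form of the 64-bit tweak word 0xff00000000000000 (type OUT plus first/final
 * flags), the bare 8 is the output byte counter, and the small trailing constants
 * 1, 2, ... 18 are the Threefish round indices folded into word 7 of each subkey.
 * In 64-bit terms (hedged sketch):
 *
 *   uint64_t t0 = 8;                      // output counter
 *   uint64_t t1 = 0xff00000000000000ULL;  // OUT | FIRST | FINAL
 *   uint64_t t2 = t0 ^ t1;                // 0xff00000000000008, also used literally below
 */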
make_uint2(0,0xff000000)); + hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008)); + hash64[7] = (hash64[7] + h2 + 4); + macro4(); + hash64[0] = (hash64[0] + h5); hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); hash64[3] = (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008)); + hash64[6] = (hash64[6] + h2 + 8); hash64[7] = (hash64[7] + h3 + 5); + macro3(); + hash64[0] = (hash64[0] + h6); hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); hash64[5] = (hash64[5] + h2 + 8); + hash64[6] = (hash64[6] + h3 + make_uint2(0,0xff000000)); + hash64[7] = (hash64[7] + h4 + 6); + macro4(); + hash64[0] = (hash64[0] + h7); hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); hash64[5] = (hash64[5] + h3 + make_uint2(0,0xff000000)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008)); + hash64[7] = (hash64[7] + h5 + 7); + macro3(); + hash64[0] = (hash64[0] + skein_h8); hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008)); + hash64[6] = (hash64[6] + h5 + 8); hash64[7] = (hash64[7] + h6 + 8); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h0)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h1)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h2)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h3)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h4)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h5) + 8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h6) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h7) + 9); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h1)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h2)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h3)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h4)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h5)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h6) + 0xff00000000000000); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h7) + 0xff00000000000008); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(skein_h8) + 10); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h2)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h3)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h4)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h5)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h6)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h7) + 0xff00000000000008); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(skein_h8) + 8); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h0) + 11); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h3)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h4)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h5)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h6)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h7)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(skein_h8) + 
8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h0) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h1) + 12); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h4)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h5)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h6)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h7)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(skein_h8)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h0) + 0xff00000000000000); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h1) + 0xff00000000000008); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h2) + 13); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h5)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h6)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h7)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(skein_h8)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h0)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h1) + 0xff00000000000008); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h2) + 8); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h3) + 14); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h6)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h7)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(skein_h8)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h0)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h1)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h2) + 8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h3) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h4) + 15); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h7)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(skein_h8)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h0)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h1)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h2)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h3) + 0xff00000000000000); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h4) + 0xff00000000000008); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h5) + 16); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(skein_h8)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h0)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h1)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h2)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h3)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h4) + 0xff00000000000008); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h5) + 8); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h6) + 17); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h0)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h1)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h2)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h3)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h4)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h5) + 8); + hash64[6] = 
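/*
 * The vectorize()/devectorize() pairs in this stretch convert uint2 to uint64_t and back
 * so that the subkey additions get a full 64-bit carry instead of two independent 32-bit
 * adds. One of these lines, spelled out as a hedged sketch:
 *
 *   uint64_t w = ((uint64_t)hash64[6].y << 32) | hash64[6].x;  // devectorize
 *   w += ((uint64_t)h6.y << 32) | h6.x;                        // + key word
 *   w += 0xff00000000000000ULL;                                // + tweak contribution
 *   hash64[6] = make_uint2((uint32_t)w, (uint32_t)(w >> 32));  // vectorize
 */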
vectorize(devectorize(hash64[6]) + devectorize(h6) + 0xff00000000000000);
+ hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h7) + 18);
+
+ phash = (uint2x4*)hash64;
+ uint2x4 *outpt = (uint2x4*)Hash;
+ outpt[0] = phash[0];
+ outpt[1] = phash[1];
+
+ #undef h0
+ #undef h1
+ #undef h2
+ #undef h3
+ #undef h4
+ #undef h5
+ #undef h6
+ #undef h7
+ }
+}
+
+__host__
+//void quark_skein512_cpu_hash_64(int thr_id,uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_hash)
+void quark_skein512_cpu_hash_64(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+{
+ uint32_t tpb = TPB52;
+ int dev_id = device_map[thr_id];
+
+ if (device_sm[dev_id] <= 500) tpb = TPB50;
+ const dim3 grid((threads + tpb-1)/tpb);
+ const dim3 block(tpb);
+ quark_skein512_gpu_hash_64 <<<grid, block>>>(threads, startNonce, (uint64_t*)d_hash, d_nonceVector);
+
+}
+
+// 120 * 8 = 960 ... too big ?
+static __constant__ uint2 c_buffer[120]; // padded message (80 bytes + 72*8 bytes midstate + align)
+
+__global__
+#if __CUDA_ARCH__ > 500
+__launch_bounds__(TPB52, 3)
+#else
+__launch_bounds__(TPB50, 5)
+#endif
+void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *output64)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ // Skein
+ uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8;
+ uint2 t0, t1, t2;
+
+ uint32_t nonce = cuda_swab32(startNounce + thread);
+ uint2 nonce2 = make_uint2(c_buffer[0].x, nonce);
+
+ uint2 p[8];
+ p[1] = nonce2;
+
+ h0 = c_buffer[ 1];
+ h1 = c_buffer[ 2];
+ h2 = c_buffer[ 3];
+ h3 = c_buffer[ 4];
+ h4 = c_buffer[ 5];
+ h5 = c_buffer[ 6];
+ h6 = c_buffer[ 7];
+ h7 = c_buffer[ 8];
+ h8 = c_buffer[ 9];
+
+ t0 = vectorize(0x50ull);
+ t1 = vectorize(0xB000000000000000ull);
+ t2 = t0^t1;
+
+ p[1]=nonce2 + h1; p[0]= c_buffer[10] + p[1];
+ p[2]=c_buffer[11];
+ p[3]=c_buffer[12];
+ p[4]=c_buffer[13];
+ p[5]=c_buffer[14];
+ p[6]=c_buffer[15];
+ p[7]=c_buffer[16];
+
+// macro1();
+ p[1] = ROL2(p[1], 46) ^ p[0];
+ p[2] += p[1];
+ p[0] += p[3];
+ p[1] = ROL2(p[1], 33) ^ p[2];
+ p[3] = c_buffer[17] ^ p[0];
+ p[4] += p[1];
+ p[6] += p[3];
+ p[0] += p[5];
+ p[2] += p[7];
+ p[1] = ROL2(p[1], 17) ^ p[4];
+ p[3] = ROL2(p[3], 49) ^ p[6];
+ p[5] = c_buffer[18] ^ p[0];
+ p[7] = c_buffer[19] ^ p[2];
+ p[6] += p[1];
+ p[0] += p[7];
+ p[2] += p[5];
+ p[4] += p[3];
+ p[1] = ROL2(p[1], 44) ^ p[6];
+ p[7] = ROL2(p[7], 9) ^ p[0];
+ p[5] = ROL2(p[5], 54) ^ p[2];
+ p[3] = ROR8(p[3]) ^ p[4];
+
+ p[0]+=h1; p[1]+=h2; p[2]+=h3; p[3]+=h4; p[4]+=h5;
+ p[5]+=c_buffer[20]; p[7]+=c_buffer[21]; p[6]+=c_buffer[22];
+ macro2();
+ p[0]+=h2; p[1]+=h3; p[2]+=h4; p[3]+=h5; p[4]+=h6;
+ p[5]+=c_buffer[22]; p[7]+=c_buffer[23]; p[6]+=c_buffer[24];
+ macro1();
+ p[0]+=h3; p[1]+=h4; p[2]+=h5; p[3]+=h6; p[4]+=h7;
+ p[5]+=c_buffer[24]; p[7]+=c_buffer[25]; p[6]+=c_buffer[26];
+ macro2();
+ p[0]+=h4; p[1]+=h5; p[2]+=h6; p[3]+=h7; p[4]+=h8;
+ p[5]+=c_buffer[26]; p[7]+=c_buffer[27]; p[6]+=c_buffer[28];
+ macro1();
+ p[0]+=h5; p[1]+=h6; p[2]+=h7; p[3]+=h8; p[4]+=h0;
+ p[5]+=c_buffer[28]; p[7]+=c_buffer[29]; p[6]+=c_buffer[30];
+ macro2();
+ p[0]+=h6; p[1]+=h7; p[2]+=h8; p[3]+=h0; p[4]+=h1;
+ p[5]+=c_buffer[30]; p[7]+=c_buffer[31]; p[6]+=c_buffer[32];
+ macro1();
+ p[0]+=h7; p[1]+=h8; p[2]+=h0; p[3]+=h1; p[4]+=h2;
+ p[5]+=c_buffer[32]; p[7]+=c_buffer[33]; p[6]+=c_buffer[34];
+ macro2();
+ p[0]+=h8; p[1]+=h0; p[2]+=h1; p[3]+=h2; p[4]+=h3;
+ p[5]+=c_buffer[34]; p[7]+=c_buffer[35]; p[6]+=c_buffer[36];
+ macro1();
+ p[0]+=h0; p[1]+=h1; p[2]+=h2; 
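/*
 * The c_buffer[] words added in this 80-byte kernel are Threefish subkeys precomputed on
 * the host by skein512_cpu_setBlock_80() further down, so per nonce only the adds,
 * rotates and XORs remain. The host wrapper launches it with the usual ceiling division;
 * a hedged restatement of the code below:
 *
 *   uint32_t tpb = (device_sm[dev_id] <= 500) ? TPB50 : TPB52;
 *   dim3 grid((threads + tpb - 1) / tpb);   // one thread per nonce
 *   dim3 block(tpb);
 *   skein512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
 */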
p[3]+=h3; p[4]+=h4; + p[5]+=c_buffer[36]; p[7]+=c_buffer[37]; p[6]+=c_buffer[38]; + macro2(); + p[0]+=h1; p[1]+=h2; p[2]+=h3; p[3]+=h4; p[4]+=h5; + p[5]+=c_buffer[38]; p[7]+=c_buffer[39]; p[6]+=c_buffer[40]; + macro1(); + p[0]+=h2; p[1]+=h3; p[2]+=h4; p[3]+=h5; p[4]+=h6; + p[5]+=c_buffer[40]; p[7]+=c_buffer[41]; p[6]+=c_buffer[42]; + macro2(); + p[0]+=h3; p[1]+=h4; p[2]+=h5; p[3]+=h6; p[4]+=h7; + p[5]+=c_buffer[42]; p[7]+=c_buffer[43]; p[6]+=c_buffer[44]; + macro1(); + p[0]+=h4; p[1]+=h5; p[2]+=h6; p[3]+=h7; p[4]+=h8; + p[5]+=c_buffer[44]; p[7]+=c_buffer[45]; p[6]+=c_buffer[46]; + macro2(); + p[0]+=h5; p[1]+=h6; p[2]+=h7; p[3]+=h8; p[4]+=h0; + p[5]+=c_buffer[46]; p[7]+=c_buffer[47]; p[6]+=c_buffer[48]; + macro1(); + p[0]+=h6; p[1]+=h7; p[2]+=h8; p[3]+=h0; p[4]+=h1; + p[5]+=c_buffer[48]; p[7]+=c_buffer[49]; p[6]+=c_buffer[50]; + macro2(); + p[0]+=h7; p[1]+=h8; p[2]+=h0; p[3]+=h1; p[4]+=h2; + p[5]+=c_buffer[50]; p[7]+=c_buffer[51]; p[6]+=c_buffer[52]; + macro1(); + p[0]+=h8; p[1]+=h0; p[2]+=h1; p[3]+=h2; p[4]+=h3; + p[5]+=c_buffer[52]; p[7]+=c_buffer[53]; p[6]+=c_buffer[54]; + macro2(); + p[0]+=h0; p[1]+=h1; p[2]+=h2; p[3]+=h3; p[4]+=h4; + p[5]+=c_buffer[54]; p[7]+=c_buffer[55]; p[6]+=c_buffer[56]; + + p[0]^= c_buffer[57]; + p[1]^= nonce2; + + t0 = vectorize(8); // extra + t1 = vectorize(0xFF00000000000000ull); // etype +// t2 = vectorize(0xB000000000000050ull); + + h0 = p[0]; + h1 = p[1]; + h2 = p[2]; + h3 = p[3]; + h4 = p[4]; + h5 = p[5]; + h6 = p[6]; + h7 = p[7]; + + TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); + + p[0] = p[1] = p[2] = p[3] = p[4] =p[5] =p[6] = p[7] = vectorize(0); + + TFBIG_4e_UI2(0); + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + uint64_t *outpHash = &output64[thread<<3]; + #pragma unroll 8 + for (int i = 0; i < 8; i++) + outpHash[i] = devectorize(p[i]); + } +} + +__host__ +void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap) +{ + uint32_t tpb = TPB52; + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] <= 500) tpb = TPB50; + + const dim3 grid((threads + tpb-1)/tpb); + const dim3 block(tpb); + + // hash function is cut in 2 parts to reduce kernel size + skein512_gpu_hash_80 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash); +} + +__host__ +void skein512_cpu_setBlock_80(void *pdata) +{ + uint64_t message[20]; + memcpy(&message[0], pdata, 80); + + uint64_t p[8]; + uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8; + uint64_t t0, t1, t2; + + h0 = 0x4903ADFF749C51CEull; + h1 = 0x0D95DE399746DF03ull; + h2 = 0x8FD1934127C79BCEull; + h3 = 0x9A255629FF352CB1ull; + h4 = 0x5DB62599DF6CA7B0ull; + h5 = 0xEABE394CA9D5C3F4ull; + h6 = 0x991112C71A75B523ull; + h7 = 0xAE18A40B660FCC33ull; + // h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ SPH_C64(0x1BD11BDAA9FC1A22); + h8 = 0xcab2076d98173ec4ULL; + + t0 = 64; // ptr + t1 = 0x7000000000000000ull; + t2 = 0x7000000000000040ull; + + memcpy(&p[0], &message[0], 64); + + TFBIG_4e_PRE(0); + TFBIG_4o_PRE(1); + TFBIG_4e_PRE(2); + TFBIG_4o_PRE(3); + TFBIG_4e_PRE(4); + TFBIG_4o_PRE(5); + TFBIG_4e_PRE(6); + TFBIG_4o_PRE(7); + TFBIG_4e_PRE(8); + TFBIG_4o_PRE(9); + TFBIG_4e_PRE(10); + TFBIG_4o_PRE(11); + 
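/*
 * These _PRE calls run the full Threefish-512 permutation (TFBIG_4e/4o for s = 0..17
 * plus a final key injection at s = 18) on the CPU with plain 64-bit words: the first
 * 64 bytes of the block header do not depend on the nonce, so they are compressed once
 * here and only the nonce-carrying tail is left to the GPU. The chaining value is the
 * usual feed-forward, which the code stores into message[10..17] a few lines below;
 * as a sketch:
 *
 *   for (int i = 0; i < 8; i++)
 *       message[10 + i] = message[i] ^ p[i];   // becomes h0..h7 for the second block
 */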
TFBIG_4e_PRE(12); + TFBIG_4o_PRE(13); + TFBIG_4e_PRE(14); + TFBIG_4o_PRE(15); + TFBIG_4e_PRE(16); + TFBIG_4o_PRE(17); + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + message[10] = message[0] ^ p[0]; + message[11] = message[1] ^ p[1]; + message[12] = message[2] ^ p[2]; + message[13] = message[3] ^ p[3]; + message[14] = message[4] ^ p[4]; + message[15] = message[5] ^ p[5]; + message[16] = message[6] ^ p[6]; + message[17] = message[7] ^ p[7]; + + message[18] = t2; + + uint64_t buffer[128]; + +// buffer[ 0] = message[ 8]; + buffer[ 0] = message[ 9]; + h0 = buffer[ 1] = message[10]; + h1 = buffer[ 2] = message[11]; + h2 = buffer[ 3] = message[12]; + h3 = buffer[ 4] = message[13]; + h4 = buffer[ 5] = message[14]; + h5 = buffer[ 6] = message[15]; + h6 = buffer[ 7] = message[16]; + h7 = buffer[ 8] = message[17]; + h8 = buffer[ 9] = h0^h1^h2^h3^h4^h5^h6^h7^0x1BD11BDAA9FC1A22ULL; + + t0 = 0x50ull; + t1 = 0xB000000000000000ull; + t2 = t0^t1; + + p[0] = message[ 8] + h0; + p[2] = h2; p[3] = h3; p[4] = h4; + p[5] = h5 + t0; p[6] = h6 + t1; p[7] = h7; + p[2] += p[3]; p[4] += p[5]; p[6] += p[7]; + p[3] = ROTL64(p[3], 36) ^ p[2]; + p[5] = ROTL64(p[5], 19) ^ p[4]; + p[7] = ROTL64(p[7], 37) ^ p[6]; + p[4] += p[7]; + p[6] += p[5]; + p[7] = ROTL64(p[7], 27) ^ p[4]; + p[5] = ROTL64(p[5], 14) ^ p[6]; + + buffer[10] = p[0]; + buffer[11] = p[2]; + buffer[12] = p[3]; + buffer[13] = p[4]; + buffer[14] = p[5]; + buffer[15] = p[6]; + buffer[16] = p[7]; + buffer[17] = ROTL64(p[3], 42); + buffer[18] = ROTL64(p[5], 36); + buffer[19] = ROTL64(p[7], 39); + + buffer[20] = h6+t1; + buffer[21] = h8+1; + buffer[22] = h7+t2; + buffer[23] = h0+2; + buffer[24] = h8+t0; + buffer[25] = h1+3; + buffer[26] = h0+t1; + buffer[27] = h2+4; + buffer[28] = h1+t2; + buffer[29] = h3+5; + buffer[30] = h2+t0; + buffer[31] = h4+6; + buffer[32] = h3+t1; + buffer[33] = h5+7; + buffer[34] = h4+t2; + buffer[35] = h6+8; + buffer[36] = h5+t0; + buffer[37] = h7+9; + buffer[38] = h6+t1; + buffer[39] = h8+10; + buffer[40] = h7+t2; + buffer[41] = h0+11; + buffer[42] = h8+t0; + buffer[43] = h1+12; + buffer[44] = h0+t1; + buffer[45] = h2+13; + buffer[46] = h1+t2; + buffer[47] = h3+14; + buffer[48] = h2+t0; + buffer[49] = h4+15; + buffer[50] = h3+t1; + buffer[51] = h5+16; + buffer[52] = h4+t2; + buffer[53] = h6+17; + buffer[54] = h5+t0; + buffer[55] = h7+18; + buffer[56] = h6+t1; + + buffer[57] = message[8]; + + cudaMemcpyToSymbol(c_buffer, buffer, sizeof(c_buffer), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaGetLastError()); +} + +__host__ +void quark_skein512_cpu_init(int thr_id, uint32_t threads) +{ + cuda_get_arch(thr_id); +} + diff --git a/quark/groestl_functions_quad.h b/quark/groestl_functions_quad.h new file mode 100644 index 0000000000..7688839bbd --- /dev/null +++ b/quark/groestl_functions_quad.h @@ -0,0 +1,350 @@ +#include "cuda_helper.h" + +__device__ __forceinline__ +void G256_Mul2(uint32_t *regs) +{ + uint32_t tmp = regs[7]; + regs[7] = regs[6]; + regs[6] = regs[5]; + regs[5] = regs[4]; + regs[4] = regs[3] ^ tmp; + regs[3] = regs[2] ^ tmp; + regs[2] = regs[1]; + regs[1] = regs[0] ^ tmp; + regs[0] = tmp; +} + +__device__ __forceinline__ +void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int rnd) +{ + x0 = ~x0; + x1 = ~x1; + x2 = ~x2; + x3 = ~x3; + x4 = ~x4; + x5 = ~x5; + x6 = ~x6; + x7 = ~x7; + +#if 0 + if ((threadIdx.x & 3) != 3) + return; + + int andmask = 0xFFFF0000; +#else + /* from sp: faster (branching 
problem with if ?) */ + uint32_t andmask = -((threadIdx.x & 3) == 3) & 0xFFFF0000U; +#endif + + x0 ^= ((- (rnd & 0x01) ) & andmask); + x1 ^= ((-((rnd & 0x02)>>1)) & andmask); + x2 ^= ((-((rnd & 0x04)>>2)) & andmask); + x3 ^= ((-((rnd & 0x08)>>3)) & andmask); + + x4 ^= (0xAAAA0000 & andmask); + x5 ^= (0xCCCC0000 & andmask); + x6 ^= (0xF0F00000 & andmask); + x7 ^= (0xFF000000 & andmask); +} + +__device__ __forceinline__ +void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int rnd) +{ + if (threadIdx.x & 3) + return; + + int andmask = 0xFFFF; + + x0 ^= ((- (rnd & 0x01) ) & andmask); + x1 ^= ((-((rnd & 0x02)>>1)) & andmask); + x2 ^= ((-((rnd & 0x04)>>2)) & andmask); + x3 ^= ((-((rnd & 0x08)>>3)) & andmask); + + x4 ^= 0xAAAAU; + x5 ^= 0xCCCCU; + x6 ^= 0xF0F0U; + x7 ^= 0xFF00U; +} + +__device__ __forceinline__ +void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, + uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0) +{ + uint32_t t0,t1,t2; + + t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1)); + t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0; + t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1; + + t0 = (x2^x3) & (y2^y3); + x3 = (x3 & y3) ^ t0 ^ t1; + x2 = (x2 & y2) ^ t0 ^ t2; + + t0 = (x0^x1) & (y0^y1); + x1 = (x1 & y1) ^ t0 ^ t1; + x0 = (x0 & y0) ^ t0 ^ t2; +} + +__device__ __forceinline__ +void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0) +{ + uint32_t t0,t1,t2,t3,t4,t5,t6,a,b; + + t3 = x7; + t2 = x6; + t1 = x5; + t0 = x4; + + G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0); + + a = (x4 ^ x0); + t0 ^= a; + t2 ^= (x7 ^ x3) ^ (x5 ^ x1); + t1 ^= (x5 ^ x1) ^ a; + t3 ^= (x6 ^ x2) ^ a; + + b = t0 ^ t1; + t4 = (t2 ^ t3) & b; + a = t4 ^ t3 ^ t1; + t5 = (t3 & t1) ^ a; + t6 = (t2 & t0) ^ a ^ (t2 ^ t0); + + t4 = (t5 ^ t6) & b; + t1 = (t6 & t1) ^ t4; + t0 = (t5 & t0) ^ t4; + + t4 = (t5 ^ t6) & (t2^t3); + t3 = (t6 & t3) ^ t4; + t2 = (t5 & t2) ^ t4; + + G16mul_quad(x3, x2, x1, x0, t1, t0, t3, t2); + + G16mul_quad(x7, x6, x5, x4, t1, t0, t3, t2); +} + +__device__ __forceinline__ +void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7) +{ + uint32_t t0, t1; + t0 = x0 ^ x1 ^ x2; + t1 = x5 ^ x6; + x2 = t0 ^ t1 ^ x7; + x6 = t0 ^ x3 ^ x6; + x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7; + x4 = x0 ^ x4 ^ t1; + x2 = t0 ^ t1 ^ x7; + x1 = x0 ^ x1 ^ t1; + x7 = x0 ^ t1 ^ x7; + x5 = x0 ^ t1; +} + +__device__ __forceinline__ +void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7) +{ + uint32_t t0,t2,t3,t5; + + x1 ^= x4; + t0 = x1 ^ x6; + x1 ^= x5; + + t2 = x0 ^ x2; + x2 = x3 ^ x5; + t2 ^= x2 ^ x6; + x2 ^= x7; + t3 = x4 ^ x2 ^ x6; + + t5 = x0 ^ x6; + x4 = x3 ^ x7; + x0 = x3 ^ x5; + + x6 = t0; + x3 = t2; + x7 = t3; + x5 = t5; +} + +__device__ __forceinline__ +void sbox_quad(uint32_t *r) +{ + transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]); + + G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]); + + transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]); + + r[0] = ~r[0]; + r[1] = ~r[1]; + r[5] = ~r[5]; + r[6] = ~r[6]; +} + +__device__ __forceinline__ +void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0) +{ + uint32_t t0,t1; + + int tpos = threadIdx.x & 0x03; + int shift1 = tpos << 1; + int shift2 = 
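/*
 * __byte_perm(x, y, sel) assembles a 32-bit result by picking bytes from the 64-bit pool
 * {x, y}: each hex digit of sel, starting at the least significant, selects the source
 * byte (0-3 = bytes of x, 4-7 = bytes of y) for the corresponding result byte. The three
 * patterns used throughout these shift helpers therefore mean:
 *
 *   __byte_perm(x, 0, 0x1010);   // replicate the low 16 bits of x into both halves
 *   __byte_perm(x, 0, 0x3232);   // replicate the high 16 bits of x into both halves
 *   __byte_perm(a, b, 0x5410);   // low half of a in bits 0..15, low half of b above it
 */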
shift1+1 + ((tpos == 3)<<2); + + t0 = __byte_perm(x0, 0, 0x1010)>>shift1; + t1 = __byte_perm(x0, 0, 0x3232)>>shift2; + x0 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x1, 0, 0x1010)>>shift1; + t1 = __byte_perm(x1, 0, 0x3232)>>shift2; + x1 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x2, 0, 0x1010)>>shift1; + t1 = __byte_perm(x2, 0, 0x3232)>>shift2; + x2 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x3, 0, 0x1010)>>shift1; + t1 = __byte_perm(x3, 0, 0x3232)>>shift2; + x3 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x4, 0, 0x1010)>>shift1; + t1 = __byte_perm(x4, 0, 0x3232)>>shift2; + x4 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x5, 0, 0x1010)>>shift1; + t1 = __byte_perm(x5, 0, 0x3232)>>shift2; + x5 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x6, 0, 0x1010)>>shift1; + t1 = __byte_perm(x6, 0, 0x3232)>>shift2; + x6 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x7, 0, 0x1010)>>shift1; + t1 = __byte_perm(x7, 0, 0x3232)>>shift2; + x7 = __byte_perm(t0, t1, 0x5410); +} + +__device__ __forceinline__ +void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0) +{ + uint32_t t0,t1; + + int tpos = threadIdx.x & 0x03; + int shift1 = (1-(tpos>>1)) + ((tpos & 0x01)<<2); + int shift2 = shift1+2 + ((tpos == 1)<<2); + + t0 = __byte_perm(x0, 0, 0x1010)>>shift1; + t1 = __byte_perm(x0, 0, 0x3232)>>shift2; + x0 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x1, 0, 0x1010)>>shift1; + t1 = __byte_perm(x1, 0, 0x3232)>>shift2; + x1 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x2, 0, 0x1010)>>shift1; + t1 = __byte_perm(x2, 0, 0x3232)>>shift2; + x2 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x3, 0, 0x1010)>>shift1; + t1 = __byte_perm(x3, 0, 0x3232)>>shift2; + x3 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x4, 0, 0x1010)>>shift1; + t1 = __byte_perm(x4, 0, 0x3232)>>shift2; + x4 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x5, 0, 0x1010)>>shift1; + t1 = __byte_perm(x5, 0, 0x3232)>>shift2; + x5 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x6, 0, 0x1010)>>shift1; + t1 = __byte_perm(x6, 0, 0x3232)>>shift2; + x6 = __byte_perm(t0, t1, 0x5410); + + t0 = __byte_perm(x7, 0, 0x1010)>>shift1; + t1 = __byte_perm(x7, 0, 0x3232)>>shift2; + x7 = __byte_perm(t0, t1, 0x5410); +} + +#if __CUDA_ARCH__ < 300 +/** + * __shfl() returns the value of var held by the thread whose ID is given by srcLane. + * If srcLane is outside the range 0..width-1, the thread’s own value of var is returned. 
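 * On current toolchains (CUDA 9.0 and later) the warp-synchronous form
 * __shfl_sync(0xffffffff, var, srcLane, width) provides the same operation; the fallback
 * macro below simply returns var so that pre-SM 3.0 builds still compile, since those
 * devices have no shuffle instruction at all.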
+ */ +#undef __shfl +#define __shfl(var, srcLane, width) (uint32_t)(var) +#endif + +__device__ __forceinline__ +void G256_MixFunction_quad(uint32_t *r) +{ +#define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432) +#define A(v, u) __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4) +#define S(idx, l) SHIFT64_16( A(idx, (l+1)), A(idx, l) ) + +#define DOUBLE_ODD(i, bc) ( S(i, (bc)) ^ A(i, (bc) + 1) ) +#define DOUBLE_EVEN(i, bc) ( S(i, (bc)) ^ A(i, (bc) ) ) + +#define SINGLE_ODD(i, bc) ( S(i, (bc)) ) +#define SINGLE_EVEN(i, bc) ( A(i, (bc)) ) + uint32_t b[8]; + +#pragma unroll 8 + for(int i=0;i<8;i++) + b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3); + + G256_Mul2(b); +#pragma unroll 8 + for(int i=0;i<8;i++) + b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6); + + G256_Mul2(b); +#pragma unroll 8 + for(int i=0;i<8;i++) + r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5); + +#undef S +#undef A +#undef SHIFT64_16 +#undef t +#undef X +} + +__device__ __forceinline__ +void groestl512_perm_P_quad(uint32_t *r) +{ + for(int round=0;round<14;round++) + { + G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); + sbox_quad(r); + G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + } +} + +__device__ __forceinline__ +void groestl512_perm_Q_quad(uint32_t *r) +{ + for(int round=0;round<14;round++) + { + G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round); + sbox_quad(r); + G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]); + G256_MixFunction_quad(r); + } +} + +__device__ __forceinline__ +void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message) +{ +#pragma unroll 8 + for(int u=0;u<8;u++) state[u] = message[u]; + + if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000; + groestl512_perm_P_quad(state); + if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000; + groestl512_perm_Q_quad(message); +#pragma unroll 8 + for(int u=0;u<8;u++) state[u] ^= message[u]; +#pragma unroll 8 + for(int u=0;u<8;u++) message[u] = state[u]; + groestl512_perm_P_quad(message); +#pragma unroll 8 + for(int u=0;u<8;u++) state[u] ^= message[u]; +} diff --git a/quark/groestl_simple.cuh b/quark/groestl_simple.cuh new file mode 100644 index 0000000000..f1c7a1f7b7 --- /dev/null +++ b/quark/groestl_simple.cuh @@ -0,0 +1,111 @@ +/** + * Simplified groestl512 big perm code + * tpruvot - 2017 + */ + +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 210 +#define __CUDACC__ +#include +#include +#define __byte_perm(a,b,c) (a) +#define tex1Dfetch(t, n) (n) +#endif + +// todo: merge with cuda_quark_groestl512_sm20.cu (used for groestl512-80) + +#if __CUDA_ARCH__ < 300 || defined(_DEBUG) + +#ifndef SPH_C32 +#define SPH_C32(x) ((uint32_t)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#endif + +#ifndef PC32up +#define PC32up(j, r) ((uint32_t)((j) + (r))) +#define PC32dn(j, r) 0 +#define QC32up(j, r) 0xFFFFFFFF +#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) +#endif + +#define tT0up(x) tex1Dfetch(t0up1, x) +#define tT0dn(x) tex1Dfetch(t0dn1, x) +#define tT1up(x) tex1Dfetch(t1up1, x) +#define tT1dn(x) tex1Dfetch(t1dn1, x) +#define tT2up(x) tex1Dfetch(t2up1, x) +#define tT2dn(x) tex1Dfetch(t2dn1, x) +#define tT3up(x) tex1Dfetch(t3up1, x) +#define tT3dn(x) tex1Dfetch(t3dn1, x) + +#undef B32_0 +#define B32_0(x) ((x) & 0xFFu) + +__device__ __forceinline__ +static void tex_groestl512_perm_P(uint32_t *a) +{ + #pragma unroll 1 + for(int r=0; r<14; 
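/*
 * This is the table-driven Groestl round: each tTnup()/tTndn() lookup returns one half
 * of a precomputed 64-bit "S-box byte times MDS column" value, so a round reduces to the
 * AddRoundConstant XORs plus eight lookups and XORs per column, while the
 * (k + offset) & 0x1f indexing of a[] performs the wide ShiftBytes of permutation P.
 * Schematically, for one column (b0..b7 are the gathered bytes; names illustrative):
 *
 *   t_even = T0up[b0] ^ T1up[b1] ^ T2up[b2] ^ T3up[b3]
 *          ^ T0dn[b4] ^ T1dn[b5] ^ T2dn[b6] ^ T3dn[b7];
 *   t_odd  = T0dn[b0] ^ T1dn[b1] ^ T2dn[b2] ^ T3dn[b3]
 *          ^ T0up[b4] ^ T1up[b5] ^ T2up[b6] ^ T3up[b7];
 */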
r++) + { + uint32_t t[32]; + + #pragma unroll 16 + for (int k=0; k<16; k++) + a[(k*2)+0] ^= PC32up(k<< 4, r); + + #pragma unroll 16 + for(int k=0; k<32; k+=2) + { + uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); + uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); + uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); + uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); + + t[k + 0] = tT0up( t0_0 ) ^ tT1up( t2_1 ) ^ tT2up( t4_2 ) ^ tT3up( t6_3 ) ^ + tT0dn( t9_0 ) ^ tT1dn( t11_1 ) ^ tT2dn( t13_2 ) ^ tT3dn( t23_3 ); + + t[k + 1] = tT0dn( t0_0 ) ^ tT1dn( t2_1 ) ^ tT2dn( t4_2 ) ^ tT3dn( t6_3 ) ^ + tT0up( t9_0 ) ^ tT1up( t11_1 ) ^ tT2up( t13_2 ) ^ tT3up( t23_3 ); + } + + #pragma unroll 32 + for(int k=0; k<32; k++) + a[k] = t[k]; + } +} + +__device__ __forceinline__ +static void tex_groestl512_perm_Q(uint32_t *a) +{ + #pragma unroll 1 + for(int r=0; r<14; r++) + { + uint32_t t[32]; + + #pragma unroll 16 + for (int k=0; k<16; k++) { + a[(k*2)+0] ^= QC32up(k<< 4, r); + a[(k*2)+1] ^= QC32dn(k<< 4, r); + } + + #pragma unroll 16 + for(int k=0; k<32; k+=2) + { + uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); + uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); + uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); + uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]); + + t[k + 0] = tT0up( t2_0 ) ^ tT1up( t6_1 ) ^ tT2up( t10_2 ) ^ tT3up( t22_3 ) ^ + tT0dn( t1_0 ) ^ tT1dn( t5_1 ) ^ tT2dn( t9_2 ) ^ tT3dn( t13_3 ); + + t[k + 1] = tT0dn( t2_0 ) ^ tT1dn( t6_1 ) ^ tT2dn( t10_2 ) ^ tT3dn( t22_3 ) ^ + tT0up( t1_0 ) ^ tT1up( t5_1 ) ^ tT2up( t9_2 ) ^ tT3up( t13_3 ); + } + + #pragma unroll 32 + for(int k=0; k<32; k++) + a[k] = t[k]; + } +} + +#endif diff --git a/quark/groestl_transf_quad.h b/quark/groestl_transf_quad.h new file mode 100644 index 0000000000..545a5e680c --- /dev/null +++ b/quark/groestl_transf_quad.h @@ -0,0 +1,133 @@ +/* File included in quark/groestl (quark/jha,nist5/X11+) and groest/myriad coins for SM 3+ */ + +#define merge8(z,x,y)\ + z=__byte_perm(x, y, 0x5140); \ + +#define SWAP8(x,y)\ + x=__byte_perm(x, y, 0x5410); \ + y=__byte_perm(x, y, 0x7632); + +#define SWAP4(x,y)\ + t = (y<<4); \ + t = (x ^ t); \ + t = 0xf0f0f0f0UL & t; \ + x = (x ^ t); \ + t= t>>4;\ + y= y ^ t; + +#define SWAP2(x,y)\ + t = (y<<2); \ + t = (x ^ t); \ + t = 0xccccccccUL & t; \ + x = (x ^ t); \ + t= t>>2;\ + y= y ^ t; + +#define SWAP1(x,y)\ + t = (y+y); \ + t = (x ^ t); \ + t = 0xaaaaaaaaUL & t; \ + x = (x ^ t); \ + t= t>>1;\ + y= y ^ t; + + +__device__ __forceinline__ +void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) +{ + uint32_t other[8]; + uint32_t d[8]; + uint32_t t; + const unsigned int n = threadIdx.x & 3; + + #pragma unroll + for (int i = 0; i < 8; i++) { + input[i] = __shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4); + other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4); + input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); + other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); + if (threadIdx.x & 1) { + input[i] = __byte_perm(input[i], 0, 0x1032); + other[i] = __byte_perm(other[i], 0, 0x1032); + } + } + + merge8(d[0], input[0], input[4]); + merge8(d[1], other[0], other[4]); + merge8(d[2], input[1], input[5]); + merge8(d[3], other[1], other[5]); + merge8(d[4], input[2], input[6]); + merge8(d[5], other[2], other[6]); + merge8(d[6], input[3], input[7]); + 
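/*
 * to_bitslice_quad() converts the byte-oriented state into the bit-sliced layout used by
 * the *_quad permutations, where a group of four threads shares one state. The
 * SWAP1/SWAP2/SWAP4 macros used here are standard delta-swaps; SWAP1, for instance, is
 * equivalent to this hedged rewrite:
 *
 *   uint32_t t = (x ^ (y << 1)) & 0xAAAAAAAAu; // bit pairs that need exchanging
 *   x ^= t;                                    // odd bits of x now come from even bits of y
 *   y ^= t >> 1;                               // even bits of y now come from odd bits of x
 */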
merge8(d[7], other[3], other[7]); + + SWAP1(d[0], d[1]); + SWAP1(d[2], d[3]); + SWAP1(d[4], d[5]); + SWAP1(d[6], d[7]); + + SWAP2(d[0], d[2]); + SWAP2(d[1], d[3]); + SWAP2(d[4], d[6]); + SWAP2(d[5], d[7]); + + SWAP4(d[0], d[4]); + SWAP4(d[1], d[5]); + SWAP4(d[2], d[6]); + SWAP4(d[3], d[7]); + + output[0] = d[0]; + output[1] = d[1]; + output[2] = d[2]; + output[3] = d[3]; + output[4] = d[4]; + output[5] = d[5]; + output[6] = d[6]; + output[7] = d[7]; +} + +__device__ __forceinline__ +void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) +{ + uint32_t d[8]; + uint32_t t; + + d[0] = __byte_perm(input[0], input[4], 0x7531); + d[1] = __byte_perm(input[1], input[5], 0x7531); + d[2] = __byte_perm(input[2], input[6], 0x7531); + d[3] = __byte_perm(input[3], input[7], 0x7531); + + SWAP1(d[0], d[1]); + SWAP1(d[2], d[3]); + + SWAP2(d[0], d[2]); + SWAP2(d[1], d[3]); + + t = __byte_perm(d[0], d[2], 0x5410); + d[2] = __byte_perm(d[0], d[2], 0x7632); + d[0] = t; + + t = __byte_perm(d[1], d[3], 0x5410); + d[3] = __byte_perm(d[1], d[3], 0x7632); + d[1] = t; + + SWAP4(d[0], d[2]); + SWAP4(d[1], d[3]); + + output[0] = d[0]; + output[2] = d[1]; + output[4] = d[0] >> 16; + output[6] = d[1] >> 16; + output[8] = d[2]; + output[10] = d[3]; + output[12] = d[2] >> 16; + output[14] = d[3] >> 16; + + #pragma unroll 8 + for (int i = 0; i < 16; i+=2) { + if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032); + output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)&3, 4), 0x7610); + output[i+1] = __shfl((int)output[i], (threadIdx.x+2)&3, 4); + if (threadIdx.x & 3) output[i] = output[i+1] = 0; + } +} diff --git a/quark/nist5.cu b/quark/nist5.cu new file mode 100644 index 0000000000..25aff74311 --- /dev/null +++ b/quark/nist5.cu @@ -0,0 +1,184 @@ +extern "C" +{ +#include "sph/sph_blake.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +} + +#include "miner.h" + +#include "cuda_helper.h" +#include "quark/cuda_quark.h" + +static uint32_t *d_hash[MAX_GPUS]; + +// Original nist5hash Funktion aus einem miner Quelltext +extern "C" void nist5hash(void *state, const void *input) +{ + sph_blake512_context ctx_blake; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + + uint8_t hash[64]; + + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*) hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + memcpy(state, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + uint32_t throughput = cuda_default_throughput(thr_id, 1 << 20); // 256*256*16 + if (init[thr_id]) throughput = min(throughput, 
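/*
 * throughput is the number of nonces hashed per GPU pass (the 1 << 20 default above
 * corresponds to intensity 20) and is clamped so the final pass stops at max_nonce.
 * The per-pass bookkeeping in the loop below amounts to (hedged sketch):
 *
 *   *hashes_done = pdata[19] - first_nonce + throughput;        // scanned so far
 *   if ((uint64_t)throughput + pdata[19] >= max_nonce) {
 *       pdata[19] = max_nonce;                                  // last partial pass
 *   } else {
 *       pdata[19] += throughput;                                // next start nonce
 *   }
 */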
max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x00FF; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + // Constants copy/init (no device alloc in these algos) + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + + // char[64] work space for hashes results + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput)); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + +#ifdef USE_STREAMS + cudaStream_t stream[5]; + for (int i = 0; i < 5; i++) + cudaStreamCreate(&stream[i]); +#endif + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + work->valid_nonces = 0; + + do { + int order = 0; + + // Hash with CUDA + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + nist5hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + nist5hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + goto out; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + +out: +// *hashes_done = pdata[19] - first_nonce; +#ifdef USE_STREAMS + for (int i = 0; i < 5; i++) + cudaStreamDestroy(stream[i]); +#endif + + return work->valid_nonces; +} + +// ressources cleanup +extern "C" void free_nist5(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index 0bc46d9fc0..445b1cfebf 
100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -1,273 +1,339 @@ - -extern "C" -{ -#include "sph/sph_blake.h" -#include "sph/sph_bmw.h" -#include "sph/sph_groestl.h" -#include "sph/sph_skein.h" -#include "sph/sph_jh.h" -#include "sph/sph_keccak.h" -#include "miner.h" -} - -#include - -// aus cpu-miner.c -extern int device_map[8]; - -// Speicher für Input/Output der verketteten Hashfunktionen -static uint32_t *d_hash[8]; - -// Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_quarkNonces[8]; -static uint32_t *d_branch1Nonces[8]; -static uint32_t *d_branch2Nonces[8]; -static uint32_t *d_branch3Nonces[8]; - -extern void quark_blake512_cpu_init(int thr_id, int threads); -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_bmw512_cpu_init(int thr_id, int threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_groestl512_cpu_init(int thr_id, int threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_skein512_cpu_init(int thr_id, int threads); -extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_keccak512_cpu_init(int thr_id, int threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_jh512_cpu_init(int thr_id, int threads); -extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); - -extern void quark_compactTest_cpu_init(int thr_id, int threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, size_t *nrm1, - uint32_t *d_nonces2, size_t *nrm2, - int order); -extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, size_t *nrm1, - int order); - -// Original Quarkhash Funktion aus einem miner Quelltext -inline void quarkhash(void *state, const void *input) -{ - sph_blake512_context ctx_blake; - sph_bmw512_context ctx_bmw; - sph_groestl512_context ctx_groestl; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - - unsigned char hash[64]; - - sph_blake512_init(&ctx_blake); - // ZBLAKE; - sph_blake512 (&ctx_blake, input, 80); - sph_blake512_close(&ctx_blake, (void*) hash); - - sph_bmw512_init(&ctx_bmw); - // ZBMW; - sph_bmw512 (&ctx_bmw, (const 
void*) hash, 64); - sph_bmw512_close(&ctx_bmw, (void*) hash); - - if (hash[0] & 0x8) - { - sph_groestl512_init(&ctx_groestl); - // ZGROESTL; - sph_groestl512 (&ctx_groestl, (const void*) hash, 64); - sph_groestl512_close(&ctx_groestl, (void*) hash); - } - else - { - sph_skein512_init(&ctx_skein); - // ZSKEIN; - sph_skein512 (&ctx_skein, (const void*) hash, 64); - sph_skein512_close(&ctx_skein, (void*) hash); - } - - sph_groestl512_init(&ctx_groestl); - // ZGROESTL; - sph_groestl512 (&ctx_groestl, (const void*) hash, 64); - sph_groestl512_close(&ctx_groestl, (void*) hash); - - sph_jh512_init(&ctx_jh); - // ZJH; - sph_jh512 (&ctx_jh, (const void*) hash, 64); - sph_jh512_close(&ctx_jh, (void*) hash); - - if (hash[0] & 0x8) - { - sph_blake512_init(&ctx_blake); - // ZBLAKE; - sph_blake512 (&ctx_blake, (const void*) hash, 64); - sph_blake512_close(&ctx_blake, (void*) hash); - } - else - { - sph_bmw512_init(&ctx_bmw); - // ZBMW; - sph_bmw512 (&ctx_bmw, (const void*) hash, 64); - sph_bmw512_close(&ctx_bmw, (void*) hash); - } - - sph_keccak512_init(&ctx_keccak); - // ZKECCAK; - sph_keccak512 (&ctx_keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx_keccak, (void*) hash); - - sph_skein512_init(&ctx_skein); - // SKEIN; - sph_skein512 (&ctx_skein, (const void*) hash, 64); - sph_skein512_close(&ctx_skein, (void*) hash); - - if (hash[0] & 0x8) - { - sph_keccak512_init(&ctx_keccak); - // ZKECCAK; - sph_keccak512 (&ctx_keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx_keccak, (void*) hash); - } - else - { - sph_jh512_init(&ctx_jh); - // ZJH; - sph_jh512 (&ctx_jh, (const void*) hash, 64); - sph_jh512_close(&ctx_jh, (void*) hash); - } - - memcpy(state, hash, 32); -} - - -extern bool opt_benchmark; - -extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) -{ - const uint32_t first_nonce = pdata[19]; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - const uint32_t Htarg = ptarget[7]; - - const int throughput = 256*4096; // 100; - - static bool init[8] = {0,0,0,0,0,0,0,0}; - if (!init[thr_id]) - { - cudaSetDevice(device_map[thr_id]); - - // Konstanten kopieren, Speicher belegen - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); - quark_blake512_cpu_init(thr_id, throughput); - quark_groestl512_cpu_init(thr_id, throughput); - quark_skein512_cpu_init(thr_id, throughput); - quark_bmw512_cpu_init(thr_id, throughput); - quark_keccak512_cpu_init(thr_id, throughput); - quark_jh512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); - quark_compactTest_cpu_init(thr_id, throughput); - cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); - init[thr_id] = true; - } - - uint32_t endiandata[20]; - for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); - - quark_blake512_cpu_setBlock_80((void*)endiandata); - quark_check_cpu_setTarget(ptarget); - - do { - int order = 0; - size_t nrm1=0, nrm2=0, nrm3=0; - - // erstes Blake512 Hash mit CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für BMW512 - quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], 
d_hash[thr_id], NULL, - d_branch3Nonces[thr_id], &nrm3, - order++); - - // nur den Skein Branch weiterverfolgen - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Groestl512 - quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); - - // das ist der bedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der bedingte Branch für Bmw512 - quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Skein512 - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - - // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); - - // das ist der bedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - - // das ist der bedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); - - // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) - { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - quarkhash(vhash64, endiandata); - - if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { - - pdata[19] = foundNonce; - *hashes_done = (foundNonce - first_nonce + 1)/2; - return 1; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); - } - } - - pdata[19] += throughput; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = (pdata[19] - first_nonce + 1)/2; - return 0; -} +extern "C" +{ +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +} + +#include "miner.h" + +#include "cuda_helper.h" +#include "cuda_quark.h" + +#include + +extern uint32_t quark_filter_cpu_sm2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2); +extern void quark_merge_cpu_sm2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2); + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t* d_hash_br2[MAX_GPUS]; // SM 2 + +// Speicher zur Generierung der Noncevektoren für die bedingten Hashes +static uint32_t *d_branch1Nonces[MAX_GPUS]; +static uint32_t *d_branch2Nonces[MAX_GPUS]; +static uint32_t *d_branch3Nonces[MAX_GPUS]; + +// Original Quarkhash Funktion aus 
einem miner Quelltext +extern "C" void quarkhash(void *state, const void *input) +{ + unsigned char _ALIGN(128) hash[64]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*) hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512 (&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + + if (hash[0] & 0x8) + { + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + } + else + { + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + } + + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + if (hash[0] & 0x8) + { + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, (const void*) hash, 64); + sph_blake512_close(&ctx_blake, (void*) hash); + } + else + { + sph_bmw512_init(&ctx_bmw); + sph_bmw512 (&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + } + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + if (hash[0] & 0x8) + { + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + } + else + { + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + } + + memcpy(state, hash, 32); +} + +#ifdef _DEBUG +#define TRACE(algo) { \ + if (max_nonce == 1 && pdata[19] <= 1) { \ + uint32_t* debugbuf = NULL; \ + cudaMallocHost(&debugbuf, 32); \ + cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \ + printf("quark %s %08x %08x %08x %08x...%08x... 
\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ + swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \ + cudaFreeHost(debugbuf); \ + } \ +} +#else +#define TRACE(algo) {} +#endif + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + int dev_id = device_map[thr_id]; + uint32_t def_thr = 1U << 20; // 256*4096 + uint32_t throughput = cuda_default_throughput(thr_id, def_thr); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x00F; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cudaGetLastError(); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_compactTest_cpu_init(thr_id, throughput); + + if (cuda_arch[dev_id] >= 300) { + cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); + cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); + cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); + } else { + cudaMalloc(&d_hash_br2[thr_id], (size_t) 64 * throughput); + } + + cuda_check_cpu_init(thr_id, throughput); + CUDA_SAFE_CALL(cudaGetLastError()); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + uint32_t nrm1=0, nrm2=0, nrm3=0; + + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake :"); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + + if (cuda_arch[dev_id] >= 300) { + + quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, + d_branch3Nonces[thr_id], &nrm3, order++); + + // nur den Skein Branch weiterverfolgen + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für Groestl512 + quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für JH512 + quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) + quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], + d_branch1Nonces[thr_id], &nrm1, + d_branch2Nonces[thr_id], &nrm2, + order++); + + // das ist der bedingte Branch für Blake512 + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der bedingte Branch für Bmw512 + quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], 
d_branch2Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für Keccak512 + quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für Skein512 + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) + quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], + d_branch1Nonces[thr_id], &nrm1, + d_branch2Nonces[thr_id], &nrm2, + order++); + + quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + + work->nonces[0] = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + work->nonces[1] = 0; + } else { + /* algo permutations are made with 2 different buffers */ + + quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + TRACE("perm1 :"); + + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + + quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + TRACE("perm2 :"); + + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + + quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + TRACE("perm3 :"); + + CUDA_LOG_ERROR(); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + quarkhash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + quarkhash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + 
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + return 0; +} + +// cleanup +extern "C" void free_quark(int thr_id) +{ + int dev_id = device_map[thr_id]; + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + if (cuda_arch[dev_id] >= 300) { + cudaFree(d_branch1Nonces[thr_id]); + cudaFree(d_branch2Nonces[thr_id]); + cudaFree(d_branch3Nonces[thr_id]); + } else { + cudaFree(d_hash_br2[thr_id]); + } + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + quark_compactTest_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/qubit/deep.cu b/qubit/deep.cu new file mode 100644 index 0000000000..0de2a9ce32 --- /dev/null +++ b/qubit/deep.cu @@ -0,0 +1,157 @@ +/* + * deepcoin algorithm + * + */ +extern "C" { +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" + +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; + +extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); +extern void qubit_luffa512_cpu_setBlock_80(void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +extern "C" void deephash(void *state, const void *input) +{ + uint8_t _ALIGN(64) hash[64]; + + // luffa-80 cubehash-64 echo-64 + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_echo512_context ctx_echo; + + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, input, 80); + sph_luffa512_close(&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512 (&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + memcpy(state, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8 + if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce)); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000f; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); + + qubit_luffa512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + 
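	/* [editor's note, not part of the patch] Only header words 0..18 are
	 * byte-swapped above: word 19 is the nonce, which the kernel itself
	 * replaces per thread (see the REPLACE_HIDWORD() call in
	 * qubit_luffa512_gpu_hash_80 later in this patch).
	 *
	 * qubit_luffa512_cpu_setBlock_80(), also added later in this patch, pads
	 * the 80-byte header to 128 bytes, copies it to constant memory and lets
	 * qubit_cpu_precalc() absorb the first two 32-byte Luffa message blocks
	 * on the host. A rough sketch of the resulting work split, assuming the
	 * usual 80-byte block header with the nonce in data[19]:
	 *
	 *   bytes  0..63 : absorbed once per job on the CPU (statechainvpre)
	 *   bytes 64..79 : absorbed per nonce on the GPU, with the padding block
	 *                  appended in finalization512() (statebuffer)
	 */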
qubit_luffa512_cpu_setBlock_80((void*)endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + deephash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + deephash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} + +// cleanup +extern "C" void free_deep(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/qubit/luffa.cu b/qubit/luffa.cu new file mode 100644 index 0000000000..057dbada3c --- /dev/null +++ b/qubit/luffa.cu @@ -0,0 +1,133 @@ +/* + * luffa 80 algo (Introduced by Doomcoin) + */ +extern "C" { +#include "sph/sph_luffa.h" +} + +#include "miner.h" + +#include "cuda_helper.h" + +static uint32_t *d_hash[MAX_GPUS]; + +extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); +extern void qubit_luffa512_cpu_setBlock_80(void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +extern "C" void luffa_hash(void *state, const void *input) +{ + uint8_t _ALIGN(64) hash[64]; + + sph_luffa512_context ctx_luffa; + + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, input, 80); + sph_luffa512_close(&ctx_luffa, (void*) hash); + + memcpy(state, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x0000f; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + 
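	/* [editor's note, not part of the patch] "throughput" is the number of
	 * nonces hashed per kernel launch; the log line below reports it as an
	 * intensity, which in this code base is essentially log2 of that thread
	 * count. A minimal sketch of the mapping, for illustration only
	 * (intensity_of is a hypothetical helper, not a ccminer function):
	 *
	 *   static double intensity_of(uint32_t threads)
	 *   {
	 *       return log2((double) threads);  // 1U << 21 -> 21.0
	 *   }
	 *
	 * so the 1U << 21 default used above by scanhash_luffa() is reported as
	 * intensity 21, and the 1U << 19 default of scanhash_deep() as 19.
	 */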
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); + + qubit_luffa512_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], 0); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + luffa_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + luffa_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_luffa(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + cuda_check_cpu_free(thr_id); + + init[thr_id] = false; + cudaDeviceSynchronize(); +} diff --git a/qubit/qubit.cu b/qubit/qubit.cu new file mode 100644 index 0000000000..9520ea367f --- /dev/null +++ b/qubit/qubit.cu @@ -0,0 +1,175 @@ +/* + * qubit algorithm + * + */ +extern "C" { +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" + +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; + +extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); +extern void qubit_luffa512_cpu_setBlock_80(void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +extern "C" void qubithash(void *state, const void *input) +{ + uint8_t _ALIGN(128) hash[64]; + + // luffa1-cubehash2-shavite3-simd4-echo5 + + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, input, 80); + sph_luffa512_close(&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512 (&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, 
(void*) hash); + + sph_simd512_init(&ctx_simd); + sph_simd512 (&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512 (&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + memcpy(state, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8 + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x007f; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + qubit_luffa512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // Hash with CUDA + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + qubithash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + qubithash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" 
void free_qubit(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + x11_simd512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/qubit/qubit_luffa512.cu b/qubit/qubit_luffa512.cu new file mode 100644 index 0000000000..57584ee65c --- /dev/null +++ b/qubit/qubit_luffa512.cu @@ -0,0 +1,516 @@ +/******************************************************************************* + * luffa512 for 80-bytes input (with midstate precalc by klausT) + */ + +#include +#include +#include + +#include "cuda_helper.h" + +static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +static __constant__ uint32_t statebufferpre[8]; +static __constant__ uint32_t statechainvpre[40]; + +#define MULT2(a,j) {\ + tmp = a[7+(8*j)];\ + a[7+(8*j)] = a[6+(8*j)];\ + a[6+(8*j)] = a[5+(8*j)];\ + a[5+(8*j)] = a[4+(8*j)];\ + a[4+(8*j)] = a[3+(8*j)] ^ tmp;\ + a[3+(8*j)] = a[2+(8*j)] ^ tmp;\ + a[2+(8*j)] = a[1+(8*j)];\ + a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ + a[0+(8*j)] = tmp;\ +} + +#define TWEAK(a0,a1,a2,a3,j) { \ + a0 = (a0<<(j))|(a0>>(32-j));\ + a1 = (a1<<(j))|(a1>>(32-j));\ + a2 = (a2<<(j))|(a2>>(32-j));\ + a3 = (a3<<(j))|(a3>>(32-j));\ +} + +#define STEP(c0,c1) { \ + SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ + SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ + MIXWORD(chainv[0],chainv[4]);\ + MIXWORD(chainv[1],chainv[5]);\ + MIXWORD(chainv[2],chainv[6]);\ + MIXWORD(chainv[3],chainv[7]);\ + ADD_CONSTANT(chainv[0],chainv[4],c0,c1);\ +} + +#define SUBCRUMB(a0,a1,a2,a3,a4)\ + a4 = a0;\ + a0 |= a1;\ + a2 ^= a3;\ + a1 = ~a1;\ + a0 ^= a3;\ + a3 &= a4;\ + a1 ^= a3;\ + a3 ^= a2;\ + a2 &= a0;\ + a0 = ~a0;\ + a2 ^= a1;\ + a1 |= a3;\ + a4 ^= a1;\ + a3 ^= a2;\ + a2 &= a1;\ + a1 ^= a0;\ + a0 = a4; + +#define MIXWORD(a0,a4)\ + a4 ^= a0;\ + a0 = (a0<<2) | (a0>>(30));\ + a0 ^= a4;\ + a4 = (a4<<14) | (a4>>(18));\ + a4 ^= a0;\ + a0 = (a0<<10) | (a0>>(22));\ + a0 ^= a4;\ + a4 = (a4<<1) | (a4>>(31)); + +#define ADD_CONSTANT(a0,b0,c0,c1)\ + a0 ^= c0;\ + b0 ^= c1; + +/* initial values of chaining variables */ +__constant__ uint32_t c_IV[40]; +static const uint32_t h_IV[40] = { + 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, + 0x6e292011,0x90152df4,0xee058139,0xdef610bb, + 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, + 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, + 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, + 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, + 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, + 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, + 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, + 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; + +__constant__ uint32_t c_CNS[80]; +static const uint32_t h_CNS[80] = { + 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, + 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, + 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, + 0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d, + 0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4, + 0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28, + 0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b, + 0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704, + 0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72, + 0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7, + 0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719, + 0xd9847356,0x36eda57f,0xa2c78434,0x703aace7, + 0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91, + 0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be, + 0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5, + 0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355, + 
0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab, + 0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0, + 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, + 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31}; + + +/***************************************************/ +__device__ __forceinline__ +void rnd512(uint32_t *statebuffer, uint32_t *statechainv) +{ + int i,j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + + #pragma unroll 8 + for(i=0; i<8; i++) { + t[i]=0; + #pragma unroll 5 + for(j=0; j<5; j++) + t[i] ^= statechainv[i+8*j]; + } + + MULT2(t, 0); + + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + statechainv[i+8*j] ^= t[i]; + } + + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + t[i+8*j] = statechainv[i+8*j]; + } + + #pragma unroll + for(j=0; j<5; j++) + MULT2(statechainv, j); + + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + statechainv[8*j+i] ^= t[8*((j+1)%5)+i]; + } + + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + t[i+8*j] = statechainv[i+8*j]; + } + + #pragma unroll + for(j=0; j<5; j++) + MULT2(statechainv, j); + + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + statechainv[8*j+i] ^= t[8*((j+4)%5)+i]; + } + + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + statechainv[i+8*j] ^= statebuffer[i]; + MULT2(statebuffer, 0); + } + + #pragma unroll + for(i=0; i<8; i++) + chainv[i] = statechainv[i]; + + #pragma unroll + for(i=0; i<8; i++) + STEP(c_CNS[(2*i)], c_CNS[(2*i)+1]); + + #pragma unroll + for(i=0; i<8; i++) { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i+8]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); + + #pragma unroll + for(i=0; i<8; i++) + STEP(c_CNS[(2*i)+16], c_CNS[(2*i)+16+1]); + + #pragma unroll + for(i=0; i<8; i++) { + statechainv[i+8] = chainv[i]; + chainv[i] = statechainv[i+16]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); + + #pragma unroll + for(i=0; i<8; i++) + STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); + + #pragma unroll + for(i=0; i<8; i++) { + statechainv[i+16] = chainv[i]; + chainv[i] = statechainv[i+24]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); + + #pragma unroll + for(i=0; i<8; i++) + STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); + + #pragma unroll + for(i=0; i<8; i++) { + statechainv[i+24] = chainv[i]; + chainv[i] = statechainv[i+32]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); + + #pragma unroll + for(i=0; i<8; i++) + STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); + + #pragma unroll 8 + for(i=0; i<8; i++) + statechainv[i+32] = chainv[i]; +} + +static void rnd512_cpu(uint32_t *statebuffer, uint32_t *statechainv) +{ + int i, j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + + for (i = 0; i<8; i++) { + t[i] = statechainv[i]; + for (j = 1; j<5; j++) + t[i] ^= statechainv[i + 8 * j]; + } + + MULT2(t, 0); + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + statechainv[i + 8 * j] ^= t[i]; + } + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + t[i + 8 * j] = statechainv[i + 8 * j]; + } + + for (j = 0; j<5; j++) + MULT2(statechainv, j); + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + t[i + 8 * j] = statechainv[i + 8 * j]; + } + + for (j = 0; j<5; j++) + MULT2(statechainv, j); + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + + 
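	/* [editor's note, not part of the patch] The loops above implement the
	 * message-independent part of Luffa's injection function: the five
	 * 256-bit chains are XOR-combined and diffused via MULT2(), the
	 * "multiply by 2" step. The loop below then XORs the 32-byte message
	 * block (statebuffer) into each chain, multiplying the block by 2
	 * between chains, before the per-chain permutations further down
	 * (8 STEP rounds each, with TWEAK rotating the upper half of chains
	 * 1..4). This host copy mirrors the __device__ rnd512() above and is
	 * only used by qubit_cpu_precalc() to build the midstate.
	 */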
for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + statechainv[i + 8 * j] ^= statebuffer[i]; + MULT2(statebuffer, 0); + } + + for (i = 0; i<8; i++) + chainv[i] = statechainv[i]; + + for (i = 0; i<8; i++) + STEP(h_CNS[(2 * i)], h_CNS[(2 * i) + 1]); + + for (i = 0; i<8; i++) { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + + for (i = 0; i<8; i++) + STEP(h_CNS[(2 * i) + 16], h_CNS[(2 * i) + 16 + 1]); + + for (i = 0; i<8; i++) { + statechainv[i + 8] = chainv[i]; + chainv[i] = statechainv[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + + for (i = 0; i<8; i++) + STEP(h_CNS[(2 * i) + 32], h_CNS[(2 * i) + 32 + 1]); + + for (i = 0; i<8; i++) { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + + for (i = 0; i<8; i++) + STEP(h_CNS[(2 * i) + 48], h_CNS[(2 * i) + 48 + 1]); + + for (i = 0; i<8; i++) { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + + for (i = 0; i<8; i++) + STEP(h_CNS[(2 * i) + 64], h_CNS[(2 * i) + 64 + 1]); + + for (i = 0; i<8; i++) + statechainv[i + 32] = chainv[i]; +} + +/***************************************************/ +__device__ __forceinline__ +void Update512(uint32_t* statebuffer, uint32_t *statechainv, const uint32_t *const __restrict__ data) +{ + #pragma unroll + for (int i = 0; i<8; i++) + statebuffer[i] = cuda_swab32((data[i])); + rnd512(statebuffer, statechainv); + + #pragma unroll + for(int i=0; i<8; i++) + statebuffer[i] = cuda_swab32((data[i+8])); + rnd512(statebuffer, statechainv); + + #pragma unroll + for(int i=0; i<4; i++) + statebuffer[i] = cuda_swab32((data[i+16])); +} + + +/***************************************************/ +__device__ __forceinline__ +void finalization512(uint32_t* statebuffer, uint32_t *statechainv, uint32_t *b) +{ + int i,j; + + statebuffer[4] = 0x80000000U; + + #pragma unroll 3 + for(int i=5; i<8; i++) + statebuffer[i] = 0; + rnd512(statebuffer, statechainv); + + /*---- blank round with m=0 ----*/ + #pragma unroll + for(i=0; i<8; i++) + statebuffer[i] =0; + rnd512(statebuffer, statechainv); + + #pragma unroll + for(i=0; i<8; i++) { + b[i] = 0; + #pragma unroll 5 + for(j=0; j<5; j++) + b[i] ^= statechainv[i+8*j]; + b[i] = cuda_swab32((b[i])); + } + + #pragma unroll + for(i=0; i<8; i++) + statebuffer[i]=0; + rnd512(statebuffer, statechainv); + + #pragma unroll + for(i=0; i<8; i++) + { + b[8+i] = 0; + #pragma unroll 5 + for(j=0; j<5; j++) + b[8+i] ^= statechainv[i+8*j]; + b[8+i] = cuda_swab32((b[8+i])); + } +} + + +/***************************************************/ +__global__ +void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + union { + uint64_t buf64[16]; + uint32_t buf32[32]; + } buff; + + #pragma unroll 8 + for (int i=8; i < 16; i++) + buff.buf64[i] = c_PaddedMessage80[i]; + + // die Nounce durch die thread-spezifische ersetzen + buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce)); + + uint32_t statebuffer[8], statechainv[40]; + + #pragma unroll + for (int i = 0; i<4; i++) + statebuffer[i] = cuda_swab32(buff.buf32[i + 16]); + + #pragma unroll 4 + for (int i = 4; i<8; i++) + statebuffer[i] = statebufferpre[i]; + + #pragma unroll + for (int i = 0; i<40; i++) + statechainv[i] 
= statechainvpre[i]; + + uint32_t *outHash = &outputHash[thread * 16]; + finalization512(statebuffer, statechainv, outHash); + } +} + +__host__ +void qubit_luffa512_cpu_init(int thr_id, uint32_t threads) +{ + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice)); +} + +__host__ +void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + size_t shared_size = 0; + + qubit_luffa512_gpu_hash_80 <<>> (threads, startNounce, d_outputHash); +} + +__host__ +static void qubit_cpu_precalc(uint32_t* message) +{ + uint32_t statebuffer[8]; + uint32_t statechainv[40] = + { + 0x6d251e69, 0x44b051e0, 0x4eaa6fb4, 0xdbf78465, + 0x6e292011, 0x90152df4, 0xee058139, 0xdef610bb, + 0xc3b44b95, 0xd9d2f256, 0x70eee9a0, 0xde099fa3, + 0x5d9b0557, 0x8fc944b3, 0xcf1ccf0e, 0x746cd581, + 0xf7efc89d, 0x5dba5781, 0x04016ce5, 0xad659c05, + 0x0306194f, 0x666d1836, 0x24aa230a, 0x8b264ae7, + 0x858075d5, 0x36d79cce, 0xe571f7d7, 0x204b1f67, + 0x35870c6a, 0x57e9e923, 0x14bcb808, 0x7cde72ce, + 0x6c68e9be, 0x5ec41e22, 0xc825b7c7, 0xaffb4363, + 0xf5df3999, 0x0fc688f1, 0xb07224cc, 0x03e86cea + }; + + for (int i = 0; i<8; i++) + statebuffer[i] = cuda_swab32(message[i]); + rnd512_cpu(statebuffer, statechainv); + + for (int i = 0; i<8; i++) + statebuffer[i] = cuda_swab32(message[i+8]); + + rnd512_cpu(statebuffer, statechainv); + + cudaMemcpyToSymbol(statebufferpre, statebuffer, sizeof(statebuffer), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(statechainvpre, statechainv, sizeof(statechainv), 0, cudaMemcpyHostToDevice); +} + +__host__ +void qubit_luffa512_cpu_setBlock_80(void *pdata) +{ + unsigned char PaddedMessage[128]; + + memcpy(PaddedMessage, pdata, 80); + memset(PaddedMessage+80, 0, 48); + PaddedMessage[80] = 0x80; + PaddedMessage[111] = 1; + PaddedMessage[126] = 0x02; + PaddedMessage[127] = 0x80; + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)); + qubit_cpu_precalc((uint32_t*) PaddedMessage); +} diff --git a/res/ccminer.ico b/res/ccminer.ico new file mode 100644 index 0000000000..3bd81bbcee Binary files /dev/null and b/res/ccminer.ico differ diff --git a/res/ccminer.rc b/res/ccminer.rc new file mode 100644 index 0000000000..bc285bfaf6 --- /dev/null +++ b/res/ccminer.rc @@ -0,0 +1,105 @@ +// Microsoft Visual C++ generated resource script. +// +#include "resource.h" + +#define APSTUDIO_READONLY_SYMBOLS +///////////////////////////////////////////////////////////////////////////// +// +// Generated from the TEXTINCLUDE 2 resource. 
+// +#include "winres.h" + +///////////////////////////////////////////////////////////////////////////// +#undef APSTUDIO_READONLY_SYMBOLS + +///////////////////////////////////////////////////////////////////////////// +// English (United States) resources + +#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU) +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US +#pragma code_page(1252) + +#ifdef APSTUDIO_INVOKED +///////////////////////////////////////////////////////////////////////////// +// +// TEXTINCLUDE +// + +1 TEXTINCLUDE +BEGIN + "resource.h\0" +END + +2 TEXTINCLUDE +BEGIN + "#include ""winres.h""\r\n" + "\0" +END + +3 TEXTINCLUDE +BEGIN + "\r\n" + "\0" +END + +#endif // APSTUDIO_INVOKED + + +///////////////////////////////////////////////////////////////////////////// +// +// Icon +// + +// Icon with lowest ID value placed first to ensure application icon +// remains consistent on all systems. +IDI_ICON1 ICON "ccminer.ico" + +///////////////////////////////////////////////////////////////////////////// +// +// Version +// + +VS_VERSION_INFO VERSIONINFO + FILEVERSION 2,3,1,0 + PRODUCTVERSION 2,3,1,0 + FILEFLAGSMASK 0x3fL +#ifdef _DEBUG + FILEFLAGS 0x21L +#else + FILEFLAGS 0x20L +#endif + FILEOS 0x40004L + FILETYPE 0x1L + FILESUBTYPE 0x0L +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904e4" + BEGIN + VALUE "FileVersion", "2.3.1" + VALUE "LegalCopyright", "Copyright (C) 2019" + VALUE "ProductName", "ccminer" + VALUE "ProductVersion", "2.3.1" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1252 + END +END + +#endif // English (United States) resources +///////////////////////////////////////////////////////////////////////////// + + + +#ifndef APSTUDIO_INVOKED +///////////////////////////////////////////////////////////////////////////// +// +// Generated from the TEXTINCLUDE 3 resource. 
+// + + +///////////////////////////////////////////////////////////////////////////// +#endif // not APSTUDIO_INVOKED + diff --git a/res/resource.h b/res/resource.h new file mode 100644 index 0000000000..1b0177d10d Binary files /dev/null and b/res/resource.h differ diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp new file mode 100644 index 0000000000..89e7472626 --- /dev/null +++ b/scrypt-jane.cpp @@ -0,0 +1,713 @@ +/* + * scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane + * + * Public Domain or MIT License, whichever is easier + * + * Adapted to ccminer by tpruvot@github (2015) + */ + +#include "miner.h" + +#include "scrypt/scrypt-jane.h" +#include "scrypt/code/scrypt-jane-portable.h" +#include "scrypt/code/scrypt-jane-chacha.h" +#include "scrypt/keccak.h" + +#include "scrypt/salsa_kernel.h" + +#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */ +#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */ +#define scrypt_maxr scrypt_r_32kb /* 32kb */ +#define scrypt_maxp 25 /* (1 << 25) = ~33 million */ + +// ---------------------------- BEGIN keccak functions ------------------------------------ + +#define SCRYPT_HASH "Keccak-512" +#define SCRYPT_HASH_DIGEST_SIZE 64 +#define SCRYPT_KECCAK_F 1600 +#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 1024 */ +#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 576 */ +#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t state[SCRYPT_KECCAK_F / 64]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint64_t keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +static void keccak_block(scrypt_hash_state *S, const uint8_t *in) +{ + size_t i; + uint64_t *s = S->state, t[5], u[5], v, w; + + /* absorb input */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8) + s[i] ^= U8TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] 
= rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +static void scrypt_hash_init(scrypt_hash_state *S) { + memset(S, 0, sizeof(*S)); +} + +static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) +{ + size_t want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + keccak_block(S, S->buffer); + } + + /* handle the current data */ + while (inlen >= SCRYPT_HASH_BLOCK_SIZE) { + keccak_block(S, in); + in += SCRYPT_HASH_BLOCK_SIZE; + inlen -= SCRYPT_HASH_BLOCK_SIZE; + } + + /* handle leftover data */ + S->leftover = (uint32_t)inlen; + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) +{ + size_t i; + + S->buffer[S->leftover] = 0x01; + memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1)); + S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80; + keccak_block(S, S->buffer); + + for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) { + U64TO8_LE(&hash[i], S->state[i / 8]); + } +} + +// ---------------------------- END keccak functions ------------------------------------ + +// ---------------------------- BEGIN PBKDF2 functions ------------------------------------ + +typedef struct scrypt_hmac_state_t { + scrypt_hash_state inner, outer; +} scrypt_hmac_state; + + +static void scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) +{ + scrypt_hash_state st; + + scrypt_hash_init(&st); + scrypt_hash_update(&st, m, mlen); + scrypt_hash_finish(&st, hash); +} + +/* hmac */ +static void scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) +{ + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + scrypt_hash_init(&st->inner); + scrypt_hash_init(&st->outer); + + if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + scrypt_hash(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +} + +static void scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) +{ + /* h(inner || m...) */ + scrypt_hash_update(&st->inner, m, mlen); +} + +static void scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) +{ + /* h(inner || m) */ + scrypt_hash_digest innerhash; + scrypt_hash_finish(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash)); + scrypt_hash_finish(&st->outer, mac); +} + +/* + * Special version where N = 1 + * - mikaelh + */ +static void scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint8_t *out, uint64_t bytes) +{ + scrypt_hmac_state hmac_pw, hmac_pw_salt, work; + scrypt_hash_digest ti, u; + uint8_t be[4]; + uint32_t i, blocks; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + scrypt_hmac_init(&hmac_pw, password, password_len); + + /* hmac(password, salt...) 
*/ + hmac_pw_salt = hmac_pw; + scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for (i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + scrypt_hmac_update(&work, be, 4); + scrypt_hmac_finish(&work, ti); + memcpy(u, ti, sizeof(u)); + + memcpy(out, ti, (size_t) (bytes > SCRYPT_HASH_DIGEST_SIZE ? SCRYPT_HASH_DIGEST_SIZE : bytes)); + out += SCRYPT_HASH_DIGEST_SIZE; + bytes -= SCRYPT_HASH_DIGEST_SIZE; + } +} + +// ---------------------------- END PBKDF2 functions ------------------------------------ + +static void scrypt_fatal_error_default(const char *msg) { + fprintf(stderr, "%s\n", msg); + exit(1); +} + +static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default; + +void scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) { + scrypt_fatal_error = fn; +} + +typedef struct scrypt_aligned_alloc_t { + uint8_t *mem, *ptr; +} scrypt_aligned_alloc; + +#if defined(SCRYPT_TEST_SPEED) +static uint8_t *mem_base = (uint8_t *)0; +static size_t mem_bump = 0; + +/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ +static scrypt_aligned_alloc scrypt_alloc(uint64_t size) +{ + scrypt_aligned_alloc aa; + if (!mem_base) { + mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); + if (!mem_base) + scrypt_fatal_error("scrypt: out of memory"); + mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + } + aa.mem = mem_base + mem_bump; + aa.ptr = aa.mem; + mem_bump += (size_t)size; + return aa; +} + +static void scrypt_free(scrypt_aligned_alloc *aa) +{ + mem_bump = 0; +} +#else +static scrypt_aligned_alloc scrypt_alloc(uint64_t size) +{ + static const size_t max_alloc = (size_t)-1; + scrypt_aligned_alloc aa; + size += (SCRYPT_BLOCK_BYTES - 1); + if (size > max_alloc) + scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); + aa.mem = (uint8_t *)malloc((size_t)size); + aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + if (!aa.mem) + scrypt_fatal_error("scrypt: out of memory"); + return aa; +} + +static void scrypt_free(scrypt_aligned_alloc *aa) +{ + free(aa->mem); +} +#endif + + +// yacoin: increasing Nfactor gradually +unsigned char GetNfactor(unsigned int nTimestamp) +{ + int l = 0; + + unsigned int Nfactor = 0; + + // Yacoin defaults + unsigned int Ntimestamp = 1367991200; + unsigned int minN = 4; + unsigned int maxN = 30; + + if (strlen(jane_params) > 0) { + if (!strcmp(jane_params, "YAC") || !strcasecmp(jane_params, "Yacoin")) {} // No-Op + // + // NO WARRANTY FOR CORRECTNESS. 
Look for the int64 nChainStartTime constant + // in the src/main.cpp file of the official wallet clients as well as the + // const unsigned char minNfactor and const unsigned char maxNfactor + // + else if (!strcmp(jane_params, "YBC") || !strcasecmp(jane_params, "YBCoin")) { + // YBCoin: 1372386273, minN: 4, maxN: 30 + Ntimestamp = 1372386273; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "ZZC") || !strcasecmp(jane_params, "ZZCoin")) { + // ZcCoin: 1375817223, minN: 12, maxN: 30 + Ntimestamp = 1375817223; minN= 12; maxN= 30; + } else if (!strcmp(jane_params, "FEC") || !strcasecmp(jane_params, "FreeCoin")) { + // FreeCoin: 1375801200, minN: 6, maxN: 32 + Ntimestamp = 1375801200; minN= 6; maxN= 32; + } else if (!strcmp(jane_params, "ONC") || !strcasecmp(jane_params, "OneCoin")) { + // OneCoin: 1371119462, minN: 6, maxN: 30 + Ntimestamp = 1371119462; minN= 6; maxN= 30; + } else if (!strcmp(jane_params, "QQC") || !strcasecmp(jane_params, "QQCoin")) { + // QQCoin: 1387769316, minN: 4, maxN: 30 + Ntimestamp = 1387769316; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "GPL") || !strcasecmp(jane_params, "GoldPressedLatinum")) { + // GoldPressedLatinum:1377557832, minN: 4, maxN: 30 + Ntimestamp = 1377557832; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "MRC") || !strcasecmp(jane_params, "MicroCoin")) { + // MicroCoin:1389028879, minN: 4, maxN: 30 + Ntimestamp = 1389028879; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "APC") || !strcasecmp(jane_params, "AppleCoin")) { + // AppleCoin:1384720832, minN: 4, maxN: 30 + Ntimestamp = 1384720832; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "CPR") || !strcasecmp(jane_params, "Copperbars")) { + // Copperbars:1376184687, minN: 4, maxN: 30 + Ntimestamp = 1376184687; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "CACH") || !strcasecmp(jane_params, "CacheCoin")) { + // CacheCoin:1388949883, minN: 4, maxN: 30 + Ntimestamp = 1388949883; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "UTC") || !strcasecmp(jane_params, "UltraCoin")) { + // MicroCoin:1388361600, minN: 4, maxN: 30 + Ntimestamp = 1388361600; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "VEL") || !strcasecmp(jane_params, "VelocityCoin")) { + // VelocityCoin:1387769316, minN: 4, maxN: 30 + Ntimestamp = 1387769316; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "ITC") || !strcasecmp(jane_params, "InternetCoin")) { + // InternetCoin:1388385602, minN: 4, maxN: 30 + Ntimestamp = 1388385602; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "RAD") || !strcasecmp(jane_params, "RadioactiveCoin")) { + // InternetCoin:1389196388, minN: 4, maxN: 30 + Ntimestamp = 1389196388; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "LEO") || !strcasecmp(jane_params, "LEOCoin")) { + // LEOCoin:1402845776, minN: 4, maxN: 30 + Ntimestamp = 1402845776; minN= 4; maxN= 30; + } else { + if (sscanf(jane_params, "%u,%u,%u", &Ntimestamp, &minN, &maxN) != 3) + if (sscanf(jane_params, "%u", &Nfactor) == 1) return Nfactor; // skip bounding against minN, maxN + else applog(LOG_INFO, "Unable to parse scrypt-jane parameters: '%s'. 
Defaulting to Yacoin.", jane_params); + } + } + // determination based on the constants determined above + if (nTimestamp <= Ntimestamp) + return minN; + + unsigned long int s = nTimestamp - Ntimestamp; + while ((s >> 1) > 3) { + l += 1; + s >>= 1; + } + + s &= 3; + + int n = (l * 170 + s * 25 - 2320) / 100; + + if (n < 0) n = 0; + + if (n > 255) + printf("GetNfactor(%d) - something wrong(n == %d)\n", nTimestamp, n); + + Nfactor = n; + if (NfactormaxN) return maxN; + return Nfactor; +} + +static bool init[MAX_GPUS] = { 0 }; + +// cleanup +void free_scrypt_jane(int thr_id) +{ + int dev_id = device_map[thr_id]; + + if (!init[thr_id]) + return; + + cudaSetDevice(dev_id); + cudaDeviceSynchronize(); + cudaDeviceReset(); // well, simple way to free ;) + + init[thr_id] = false; +} + +#define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ + | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +static int s_Nfactor = 0; + +int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, + unsigned char *scratchbuf, struct timeval *tv_start, struct timeval *tv_end) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + uint32_t N; + + if (s_Nfactor == 0 && strlen(jane_params) > 0) + applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params); + + int Nfactor = GetNfactor(bswap_32x4(pdata[17])); + if (Nfactor > scrypt_maxN) { + scrypt_fatal_error("scrypt: N out of range"); + } + N = (1 << (Nfactor + 1)); + + if (Nfactor != s_Nfactor) + { + opt_nfactor = Nfactor; + applog(LOG_INFO, "N-factor is %d (%d)!", Nfactor, N); + if (s_Nfactor != 0) { + // handle N-factor increase at runtime + // by adjusting the lookup_gap by factor 2 + if (s_Nfactor == Nfactor-1) + for (int i=0; i < 8; ++i) + device_lookup_gap[i] *= 2; + } + s_Nfactor = Nfactor; + } + + static __thread int throughput = 0; + if(!init[thr_id]) { + int dev_id = device_map[thr_id]; + + cudaSetDevice(dev_id); + cudaDeviceSynchronize(); + cudaDeviceReset(); + cudaSetDevice(dev_id); + + throughput = cuda_throughput(thr_id); + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + init[thr_id] = true; + } + + if(throughput == 0) + return -1; + + gettimeofday(tv_start, NULL); + + uint32_t *data[2] = { new uint32_t[20*throughput], new uint32_t[20*throughput] }; + uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) }; + + uint32_t n = pdata[19]; + + /* byte swap pdata into data[0]/[1] arrays */ + for (int k=0; k<2; ++k) { + for(int z=0;z<20;z++) data[k][z] = bswap_32x4(pdata[z]); + for(int i=1;i 0) fprintf(stderr, "%d out of %d hashes differ.\n", err, throughput); + } +#endif + } else { + + // all on gpu + + n += throughput; + if (opt_debug && (iteration % 64 == 0)) + applog(LOG_DEBUG, "GPU #%d: n=%x", device_map[thr_id], n); + + cuda_scrypt_serialize(thr_id, nxt); + pre_keccak512(thr_id, nxt, nonce[nxt], throughput); + cuda_scrypt_core(thr_id, nxt, N); + //cuda_scrypt_flush(thr_id, nxt); + if (!cuda_scrypt_sync(thr_id, nxt)) { + break; + } + + post_keccak512(thr_id, nxt, nonce[nxt], throughput); + cuda_scrypt_done(thr_id, nxt); + + cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true); + //cuda_scrypt_flush(thr_id, nxt); // made by cuda_scrypt_sync + if (!cuda_scrypt_sync(thr_id, nxt)) { + break; + } + } + + for (int i=0; iteration > 0 && i -#include -#include - -static const uint32_t keypad[12] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0x00000280 -}; -static const uint32_t innerpad[11] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 -}; -static const uint32_t outerpad[8] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 -}; -static const uint32_t finalblk[16] = { - 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8]; - uint32_t pad[16]; - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); - - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); - - sha256_init(tstate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; - - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, - const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); - - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); -} - - -#if HAVE_SHA256_4WAY - -static const uint32_t keypad_4way[4 * 12] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000280, 0x00000280, 0x00000280, 0x00000280 -}; -static const uint32_t innerpad_4way[4 * 11] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 -}; -static const uint32_t outerpad_4way[4 * 8] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 
0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000300, 0x00000300, 0x00000300, 0x00000300 -}; -static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[4 * 8] __attribute__((aligned(16))); - uint32_t pad[4 * 16] __attribute__((aligned(16))); - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); - - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); - - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[4 * 8] __attribute__((aligned(16))); - uint32_t ostate2[4 * 8] __attribute__((aligned(16))); - uint32_t ibuf[4 * 16] __attribute__((aligned(16))); - uint32_t obuf[4 * 16] __attribute__((aligned(16))); - int i, j; - - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); - - memcpy(ibuf, salt + 4 * 16, 4 * 16); - memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); - memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); - ibuf[4 * 4 + 0] = i + 1; - ibuf[4 * 4 + 1] = i + 1; - ibuf[4 * 4 + 2] = i + 1; - ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[4 * 16] __attribute__((aligned(16))); - int i; - - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); - memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - - sha256_transform_4way(ostate, buf, 0); - for (i = 0; i < 4 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_4WAY */ - - -#if HAVE_SHA256_8WAY - -static const uint32_t finalblk_8way[8 * 16] 
__attribute__((aligned(32))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8 * 8] __attribute__((aligned(32))); - uint32_t pad[8 * 16] __attribute__((aligned(32))); - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8 * 8] __attribute__((aligned(32))); - uint32_t ostate2[8 * 8] __attribute__((aligned(32))); - uint32_t ibuf[8 * 16] __attribute__((aligned(32))); - uint32_t obuf[8 * 16] __attribute__((aligned(32))); - int i, j; - - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); - - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; - - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); - ibuf[8 * 4 + 0] = i + 1; - ibuf[8 * 4 + 1] = i + 1; - ibuf[8 * 4 + 2] = i + 1; - ibuf[8 * 4 + 3] = i + 1; - ibuf[8 * 4 + 4] = i + 1; - ibuf[8 * 4 + 5] = i + 1; - 
ibuf[8 * 4 + 6] = i + 1; - ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[8 * 16] __attribute__((aligned(32))); - int i; - - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 0x00000300; - sha256_transform_8way(ostate, buf, 0); - - for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_8WAY */ - - -#if defined(__x86_64__) - -#define SCRYPT_MAX_WAYS 1 -#define HAVE_SCRYPT_3WAY 0 -#define scrypt_best_throughput() 1 -static void scrypt_core(uint32_t *X, uint32_t *V); -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#if defined(USE_AVX2) -#undef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 21 -#define HAVE_SCRYPT_6WAY 0 -void scrypt_core_6way(uint32_t *X, uint32_t *V); -#endif - -#elif defined(__i386__) - -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -static void scrypt_core(uint32_t *X, uint32_t *V); - -#elif defined(__arm__) && defined(__APCS_32__) - -static void scrypt_core(uint32_t *X, uint32_t *V); -#if defined(__ARM_NEON__) -#undef HAVE_SHA256_4WAY -#define SCRYPT_MAX_WAYS 1 -#define HAVE_SCRYPT_3WAY 0 -#define scrypt_best_throughput() 1 -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#endif - -#endif - -static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) -{ - uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; - int i; - - x00 = (B[ 0] ^= Bx[ 0]); - x01 = (B[ 1] ^= Bx[ 1]); - x02 = (B[ 2] ^= Bx[ 2]); - x03 = (B[ 3] ^= Bx[ 3]); - x04 = (B[ 4] ^= Bx[ 4]); - x05 = (B[ 5] ^= Bx[ 5]); - x06 = (B[ 6] ^= Bx[ 6]); - x07 = (B[ 7] ^= Bx[ 7]); - x08 = (B[ 8] ^= Bx[ 8]); - x09 = (B[ 9] ^= Bx[ 9]); - x10 = (B[10] ^= Bx[10]); - x11 = (B[11] ^= Bx[11]); - x12 = (B[12] ^= Bx[12]); - x13 = (B[13] ^= Bx[13]); - x14 = (B[14] ^= Bx[14]); - x15 = (B[15] ^= Bx[15]); - for (i = 0; i < 8; i += 2) { -#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns. */ - x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); - x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); - - x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); - x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); - - x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); - x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); - - x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); - x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); - - /* Operate on rows. 
*/ - x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); - x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); - - x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); - x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); - - x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); - x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); - - x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); - x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); -#undef R - } - B[ 0] += x00; - B[ 1] += x01; - B[ 2] += x02; - B[ 3] += x03; - B[ 4] += x04; - B[ 5] += x05; - B[ 6] += x06; - B[ 7] += x07; - B[ 8] += x08; - B[ 9] += x09; - B[10] += x10; - B[11] += x11; - B[12] += x12; - B[13] += x13; - B[14] += x14; - B[15] += x15; -} - -static inline void scrypt_core(uint32_t *X, uint32_t *V) -{ - uint32_t i, j, k; - - for (i = 0; i < 1024; i++) { - memcpy(&V[i * 32], X, 128); - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } - for (i = 0; i < 1024; i++) { - j = 32 * (X[16] & 1023); - for (k = 0; k < 32; k++) - X[k] ^= V[j + k]; - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } -} - -#ifndef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -#endif - -#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63) - -unsigned char *scrypt_buffer_alloc() -{ - return malloc(SCRYPT_BUFFER_SIZE); -} - -static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[8], ostate[8]; - uint32_t X[32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate, midstate, 32); - HMAC_SHA256_80_init(input, tstate, ostate); - PBKDF2_SHA256_80_128(tstate, ostate, input, X); - - scrypt_core(X, V); - - PBKDF2_SHA256_128_32(tstate, ostate, X, output); -} - -#if HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[4 * 8] __attribute__((aligned(128))); - uint32_t ostate[4 * 8] __attribute__((aligned(128))); - uint32_t W[4 * 32] __attribute__((aligned(128))); - uint32_t X[4 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W, tstate, ostate); - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; - scrypt_core(X + 0 * 32, V); - scrypt_core(X + 1 * 32, V); - scrypt_core(X + 2 * 32, V); - scrypt_core(X + 3 * 32, V); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#if HAVE_SCRYPT_3WAY - -static void scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[3 * 8], ostate[3 * 8]; - uint32_t X[3 * 32] __attribute__((aligned(64))); - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - memcpy(tstate + 16, midstate, 32); - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - HMAC_SHA256_80_init(input + 
40, tstate + 16, ostate + 16); - PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); - - scrypt_core_3way(X, V); - - PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); -} - -#if HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[12 * 8] __attribute__((aligned(128))); - uint32_t ostate[12 * 8] __attribute__((aligned(128))); - uint32_t W[12 * 32] __attribute__((aligned(128))); - uint32_t X[12 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; - scrypt_core_3way(X + 0 * 96, V); - scrypt_core_3way(X + 1 * 96, V); - scrypt_core_3way(X + 2 * 96, V); - scrypt_core_3way(X + 3 * 96, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#endif /* HAVE_SCRYPT_3WAY */ - -#if HAVE_SCRYPT_6WAY -static void scrypt_1024_1_1_256_24way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[24 * 8] __attribute__((aligned(128))); - uint32_t ostate[24 * 8] __attribute__((aligned(128))); - uint32_t W[24 * 32] __attribute__((aligned(128))); - uint32_t X[24 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - tstate[8 * 8 * j + 8 * i + k] = midstate[i]; - HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); - HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); - PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 
128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - scrypt_core_6way(X + 0 * 32, V); - scrypt_core_6way(X + 6 * 32, V); - scrypt_core_6way(X + 12 * 32, V); - scrypt_core_6way(X + 18 * 32, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; -} -#endif /* HAVE_SCRYPT_6WAY */ - -int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; - const uint32_t Htarg = ptarget[7]; - int throughput = scrypt_best_throughput(); - int i; - -#if HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - - do { - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if defined(HAVE_SHA256_4WAY) - if (throughput == 4) - scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf); - else -#endif -#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) - if (throughput == 12) - scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf); - else -#endif -#if defined(HAVE_SCRYPT_6WAY) - if (throughput == 24) - scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf); - else -#endif -#if defined(HAVE_SCRYPT_3WAY) - if (throughput == 3) - scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf); - else -#endif - scrypt_1024_1_1_256(data, hash, midstate, scratchbuf); - - for (i = 0; i < throughput; i++) { - if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdata[19] = data[i * 20 + 19]; - return 1; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - pdata[19] + 1; - pdata[19] = n; - return 0; -} diff --git a/scrypt.cpp b/scrypt.cpp new file mode 100644 index 0000000000..68e81e4e09 --- /dev/null +++ b/scrypt.cpp @@ -0,0 +1,1115 @@ +/* + * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#ifdef WIN32
+#include <ppl.h>
+using namespace Concurrency;
+#else
+#include <omp.h>
+#endif
+
+#include "miner.h"
+#include "scrypt/salsa_kernel.h"
+#include "scrypt/sha256.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <emmintrin.h>
+#ifndef __APPLE__
+#include <malloc.h>
+#endif
+#include <new>
+
+#if _MSC_VER > 1800
+#undef _THROW1
+#if __cplusplus < 201101L
+#define _THROW1(x) throw(std::bad_alloc)
+#else
+#define _THROW1(x) noexcept(false)
+#endif
+#elif !defined(_MSC_VER)
+#if __cplusplus < 201101L
+#define _THROW1(x) throw(std::bad_alloc)
+#else
+#define _THROW1(x) noexcept(false)
+#endif
+#endif
+
+// A thin wrapper around the builtin __m128i type
+class uint32x4_t
+{
+public:
+#if WIN32
+ void * operator new(size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); }
+ void operator delete(void *p) { _aligned_free(p); }
+ void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); }
+ void operator delete[](void *p) { _aligned_free(p); }
+#else
+ void * operator new(size_t size) _THROW1(_STD bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
+ void operator delete(void *p) { free(p); }
+ void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
+ void operator delete[](void *p) { free(p); }
+#endif
+ uint32x4_t() { };
+ uint32x4_t(const __m128i init) { val = init; }
+ uint32x4_t(const uint32_t init) { val = _mm_set1_epi32((int)init); }
+ uint32x4_t(const uint32_t a, const uint32_t b, const uint32_t c, const uint32_t d) { val = _mm_setr_epi32((int)a,(int)b,(int)c,(int)d); }
+ inline operator const __m128i() const { return val; }
+ inline const uint32x4_t operator+(const uint32x4_t &other) const { return _mm_add_epi32(val, other); }
+ inline const uint32x4_t operator+(const uint32_t other) const { return _mm_add_epi32(val, _mm_set1_epi32((int)other)); }
+ inline uint32x4_t& operator+=(const uint32x4_t other) { val = _mm_add_epi32(val, other); return *this; }
+ inline uint32x4_t& operator+=(const uint32_t other) { val = _mm_add_epi32(val, _mm_set1_epi32((int)other)); return *this; }
+ inline const uint32x4_t operator&(const uint32_t other) const { return _mm_and_si128(val, _mm_set1_epi32((int)other)); }
+ inline const uint32x4_t operator&(const uint32x4_t &other) const { return _mm_and_si128(val, other); }
+ inline const uint32x4_t operator|(const uint32x4_t &other) const { return _mm_or_si128(val, other); }
+ inline const uint32x4_t operator^(const uint32x4_t &other) const { return _mm_xor_si128(val, other); }
+ inline const uint32x4_t operator<<(const int num) const { return 
_mm_slli_epi32(val, num); } + inline const uint32x4_t operator>>(const int num) const { return _mm_srli_epi32(val, num); } + inline const uint32_t operator[](const int num) const { return ((uint32_t*)&val)[num]; } + protected: + __m128i val; +}; + +// non-member overload +inline const uint32x4_t operator+(const uint32_t left, const uint32x4_t &right) { return _mm_add_epi32(_mm_set1_epi32((int)left), right); } + + +// +// Code taken from sha2.cpp and vectorized, with minimal changes where required +// Not all subroutines are actually used. +// + +#define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ + | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + +static __inline uint32x4_t swab32x4(const uint32x4_t &v) +{ + return bswap_32x4(v); +} + +static const uint32_t sha256_h[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static const uint32_t sha256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +void sha256_initx4(uint32x4_t *statex4) +{ + for (int i=0; i<8; ++i) + statex4[i] = sha256_h[i]; +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + sha256_k[i]) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +void sha256_transformx4(uint32x4_t *state, const uint32x4_t *block, int swap) +{ + uint32x4_t W[64]; + uint32x4_t S[8]; + uint32x4_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + if (swap) { + for (i = 0; i < 16; i++) + W[i] = swab32x4(block[i]); + } else + memcpy(W, block, 4*64); + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + /* 2. Initialize working variables. */ + memcpy(S, state, 4*32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +static const uint32_t sha256d_hash1[16] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + +static void sha256dx4(uint32x4_t *hash, uint32x4_t *data) +{ + uint32x4_t S[16]; + + sha256_initx4(S); + sha256_transformx4(S, data, 0); + sha256_transformx4(S, data + 16, 0); + for (int i=8; i<16; ++i) + S[i] = sha256d_hash1[i]; + sha256_initx4(hash); + sha256_transformx4(hash, S, 0); +} + +static inline void sha256d_preextendx4(uint32x4_t *W) +{ + W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; + W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1]; + W[18] = s1(W[16]) + W[11] + W[ 2]; + W[19] = s1(W[17]) + W[12] + s0(W[ 4]); + W[20] = W[13] + s0(W[ 5]) + W[ 4]; + W[21] = W[14] + s0(W[ 6]) + W[ 5]; + W[22] = W[15] + s0(W[ 7]) + W[ 6]; + W[23] = W[16] + s0(W[ 8]) + W[ 7]; + W[24] = W[17] + s0(W[ 9]) + W[ 8]; + W[25] = s0(W[10]) + W[ 9]; + W[26] = s0(W[11]) + W[10]; + W[27] = s0(W[12]) + W[11]; + W[28] = s0(W[13]) + W[12]; + W[29] = s0(W[14]) + W[13]; + W[30] = s0(W[15]) + W[14]; + W[31] = s0(W[16]) + W[15]; +} + +static inline void sha256d_prehashx4(uint32x4_t *S, const uint32x4_t *W) +{ + uint32x4_t t0, t1; + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); +} + +static inline void sha256d_msx4(uint32x4_t *hash, uint32x4_t *W, + const uint32_t *midstate, const uint32_t *prehash) +{ + uint32x4_t S[64]; + uint32x4_t t0, t1; + int i; + + S[18] = W[18]; + S[19] = W[19]; + S[20] = W[20]; + S[22] = W[22]; + S[23] = W[23]; + S[24] = W[24]; + S[30] = W[30]; + S[31] = W[31]; + + W[18] += s0(W[3]); + W[19] += W[3]; + W[20] += s1(W[18]); + W[21] = s1(W[19]); + W[22] += s1(W[20]); + W[23] += s1(W[21]); + W[24] += s1(W[22]); + W[25] = s1(W[23]) + W[18]; + W[26] = s1(W[24]) + W[19]; + W[27] = s1(W[25]) + W[20]; + W[28] = s1(W[26]) + W[21]; + W[29] = s1(W[27]) + W[22]; + W[30] += s1(W[28]) + W[23]; + W[31] += s1(W[29]) + W[24]; + for (i = 32; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + for (i=0; i<8; ++i) + S[i] = prehash[i]; + + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); 
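+	// Note: the RNDr() macro rotates which of S[0..7] plays the a..h roles through
+	// its (64 - i) % 8 .. (71 - i) % 8 indexing, so no end-of-round shuffle of the
+	// working variables is needed. Rounds 0-2 were already applied to the prehash
+	// state by sha256d_prehashx4(), which is why this sequence resumes at round 3.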
+ RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + for (i = 0; i < 8; i++) + S[i] += midstate[i]; + + W[18] = S[18]; + W[19] = S[19]; + W[20] = S[20]; + W[22] = S[22]; + W[23] = S[23]; + W[24] = S[24]; + W[30] = S[30]; + W[31] = S[31]; + + for (i=8; i<16; ++i) + S[i] = sha256d_hash1[i]; + S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; + S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1]; + S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2]; + S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; + S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; + S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; + S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; + S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; + S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8]; + S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; + S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; + S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; + S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; + S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; + S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; + S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; + for (i = 32; i < 60; i += 2) { + S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; + S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; + } + S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; + + sha256_initx4(hash); + + RNDr(hash, S, 0); + RNDr(hash, S, 1); + RNDr(hash, S, 2); + RNDr(hash, S, 3); + RNDr(hash, S, 4); + RNDr(hash, S, 5); + RNDr(hash, S, 6); + RNDr(hash, S, 7); + RNDr(hash, S, 8); + RNDr(hash, S, 9); + RNDr(hash, S, 10); + RNDr(hash, S, 11); + RNDr(hash, S, 12); + RNDr(hash, S, 13); + RNDr(hash, S, 14); + RNDr(hash, S, 15); + RNDr(hash, S, 16); + RNDr(hash, S, 17); + RNDr(hash, S, 18); + RNDr(hash, S, 19); + RNDr(hash, S, 20); + RNDr(hash, S, 21); + RNDr(hash, S, 22); + RNDr(hash, S, 23); + RNDr(hash, S, 24); + RNDr(hash, S, 25); + RNDr(hash, S, 26); + RNDr(hash, S, 27); + RNDr(hash, S, 28); + RNDr(hash, S, 29); + RNDr(hash, S, 30); + RNDr(hash, S, 31); + RNDr(hash, S, 32); + RNDr(hash, S, 33); + RNDr(hash, S, 34); + RNDr(hash, S, 35); + RNDr(hash, S, 36); + RNDr(hash, S, 37); + RNDr(hash, S, 38); + RNDr(hash, S, 39); + RNDr(hash, S, 40); + RNDr(hash, S, 41); + RNDr(hash, S, 42); + RNDr(hash, S, 43); + RNDr(hash, S, 44); + RNDr(hash, S, 45); + RNDr(hash, S, 46); + RNDr(hash, S, 47); + 
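+	// The final compression here is deliberately cut short: full rounds run only up
+	// to 56, rounds 57-60 are folded directly into hash[2], hash[1], hash[0] and
+	// hash[7] after this sequence, and rounds 61-63 are skipped, since only hash[7]
+	// (the word a sha256d miner checks against the target first) needs its exact
+	// final value; note that sha256_h[7] is added back only there.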
RNDr(hash, S, 48); + RNDr(hash, S, 49); + RNDr(hash, S, 50); + RNDr(hash, S, 51); + RNDr(hash, S, 52); + RNDr(hash, S, 53); + RNDr(hash, S, 54); + RNDr(hash, S, 55); + RNDr(hash, S, 56); + + hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) + + S[57] + sha256_k[57]; + hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) + + S[58] + sha256_k[58]; + hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) + + S[59] + sha256_k[59]; + hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) + + S[60] + sha256_k[60] + + sha256_h[7]; +} + +// +// Code taken from original scrypt.cpp and vectorized with minimal changes. +// + +static const uint32x4_t keypadx4[12] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 +}; +static const uint32x4_t innerpadx4[11] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 +}; +static const uint32x4_t outerpadx4[8] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 +}; +static const uint32x4_t finalblkx4[16] = { + 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 +}; + +static inline void HMAC_SHA256_80_initx4(const uint32x4_t *key, + uint32x4_t *tstate, uint32x4_t *ostate) +{ + uint32x4_t ihash[8]; + uint32x4_t pad[16]; + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 16, 4*16); + memcpy(pad + 4, keypadx4, 4*48); + sha256_transformx4(tstate, pad, 0); + memcpy(ihash, tstate, 4*32); + + sha256_initx4(ostate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transformx4(ostate, pad, 0); + + sha256_initx4(tstate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 16; i++) + pad[i] = 0x36363636; + sha256_transformx4(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128x4(const uint32x4_t *tstate, + const uint32x4_t *ostate, const uint32x4_t *salt, uint32x4_t *output) +{ + uint32x4_t istate[8], ostate2[8]; + uint32x4_t ibuf[16], obuf[16]; + int i, j; + + memcpy(istate, tstate, 4*32); + sha256_transformx4(istate, salt, 0); + + memcpy(ibuf, salt + 16, 4*16); + memcpy(ibuf + 5, innerpadx4, 4*44); + memcpy(obuf + 8, outerpadx4, 4*32); + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 4*32); + ibuf[4] = i + 1; + sha256_transformx4(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 4*32); + sha256_transformx4(ostate2, obuf, 0); + for (j = 0; j < 8; j++) + output[8 * i + j] = swab32x4(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32x4(uint32x4_t *tstate, uint32x4_t *ostate, + const uint32x4_t *salt, uint32x4_t *output) +{ + uint32x4_t buf[16]; + int i; + + sha256_transformx4(tstate, salt, 1); + sha256_transformx4(tstate, salt + 16, 1); + sha256_transformx4(tstate, finalblkx4, 0); + memcpy(buf, tstate, 4*32); + memcpy(buf + 8, outerpadx4, 4*32); + + sha256_transformx4(ostate, buf, 0); + for (i = 0; i < 8; i++) + output[i] = swab32x4(ostate[i]); +} + + +// +// Original scrypt.cpp HMAC SHA256 functions +// + +static const uint32_t keypad[12] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 +}; +static const uint32_t innerpad[11] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 +}; +static const uint32_t outerpad[8] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 +}; +static const uint32_t finalblk[16] = { + 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[8]; + uint32_t pad[16]; + int i; + + /* 
tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 16, 16); + memcpy(pad + 4, keypad, 48); + sha256_transform(tstate, pad, 0); + memcpy(ihash, tstate, 32); + + sha256_init(ostate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform(ostate, pad, 0); + + sha256_init(tstate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 16; i++) + pad[i] = 0x36363636; + sha256_transform(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + int i, j; + + memcpy(istate, tstate, 32); + sha256_transform(istate, salt, 0); + + memcpy(ibuf, salt + 16, 16); + memcpy(ibuf + 5, innerpad, 44); + memcpy(obuf + 8, outerpad, 32); + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 32); + ibuf[4] = i + 1; + sha256_transform(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 32); + sha256_transform(ostate2, obuf, 0); + for (j = 0; j < 8; j++) + output[8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, + const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[16]; + + sha256_transform(tstate, salt, 1); + sha256_transform(tstate, salt + 16, 1); + sha256_transform(tstate, finalblk, 0); + memcpy(buf, tstate, 32); + memcpy(buf + 8, outerpad, 32); + + sha256_transform(ostate, buf, 0); + for (int i = 0; i < 8; i++) + output[i] = swab32(ostate[i]); +} + +static int lastFactor = 0; + +static void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad); + +static bool init[MAX_GPUS] = { 0 }; + +// cleanup +void free_scrypt(int thr_id) +{ + int dev_id = device_map[thr_id]; + + if (!init[thr_id]) + return; + + // trivial way to free all... + cudaSetDevice(dev_id); + cudaDeviceSynchronize(); + cudaDeviceReset(); + + init[thr_id] = false; +} + +// Scrypt proof of work algorithm +// using SSE2 vectorized HMAC SHA256 on CPU and +// a salsa core implementation on GPU with CUDA +// +int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, + unsigned char *scratchbuf, struct timeval *tv_start, struct timeval *tv_end) +{ + int result = 0; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + static __thread int throughput = 0; + + if (!init[thr_id]) { + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); + cudaDeviceSynchronize(); + cudaDeviceReset(); + cudaSetDevice(dev_id); + + throughput = cuda_throughput(thr_id); + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + init[thr_id] = true; + } + + if (throughput == 0) + return -1; + + gettimeofday(tv_start, NULL); + + uint32_t n = pdata[19]; + const uint32_t Htarg = ptarget[7]; + + // no default set with --cputest + if (opt_nfactor == 0) opt_nfactor = 9; + uint32_t N = (1UL<<(opt_nfactor+1)); + uint32_t *scratch = new uint32_t[N*32]; // scratchbuffer for CPU based validation + + uint32_t nonce[2]; + uint32_t* hash[2] = { cuda_hashbuffer(thr_id,0), cuda_hashbuffer(thr_id,1) }; + uint32_t* X[2] = { cuda_transferbuffer(thr_id,0), cuda_transferbuffer(thr_id,1) }; + + bool sha_on_cpu = (parallel < 2); + bool sha_multithreaded = (parallel == 1); + uint32x4_t* datax4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 20] : NULL, sha_on_cpu ? 
new uint32x4_t[throughput/4 * 20] : NULL }; + uint32x4_t* hashx4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL }; + uint32x4_t* tstatex4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL }; + uint32x4_t* ostatex4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 8] : NULL }; + uint32x4_t* Xx4[2] = { sha_on_cpu ? new uint32x4_t[throughput/4 * 32] : NULL, sha_on_cpu ? new uint32x4_t[throughput/4 * 32] : NULL }; + + // log n-factor + if (!opt_quiet && lastFactor != opt_nfactor) { + applog(LOG_WARNING, "scrypt factor set to %d (%u)", opt_nfactor, N); + lastFactor = opt_nfactor; + } + + uint32_t _ALIGN(64) midstate[8]; + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + + if (sha_on_cpu) { + for (int i = 0; i < throughput/4; ++i) { + for (int j = 0; j < 20; j++) { + datax4[0][20*i+j] = uint32x4_t(pdata[j]); + datax4[1][20*i+j] = uint32x4_t(pdata[j]); + } + } + } + else prepare_sha256(thr_id, pdata, midstate); + + int cur = 1, nxt = 0; + int iteration = 0; + int num_shares = (4*opt_n_threads) || 1; // opt_n_threads can be 0 with --cputest + int share_workload = ((((throughput + num_shares-1) / num_shares) + 3) / 4) * 4; + + do { + nonce[nxt] = n; + + if (sha_on_cpu) + { + for (int i = 0; i < throughput/4; i++) { + datax4[nxt][i * 20 + 19] = uint32x4_t(n+0, n+1, n+2, n+3); + n += 4; + } + if (sha_multithreaded) + { +#ifdef WIN32 + parallel_for (0, num_shares, [&](int share) { + for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) { + for (int l = 0; l < 8; l++) + tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]); + HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]); + PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]); + } + } ); +#else + #pragma omp parallel for + for (int share = 0; share < num_shares; share++) { + for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) { + for (int l = 0; l < 8; l++) + tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]); + HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]); + PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]); + } + } +#endif + } + else /* sha_multithreaded */ + { + for (int k = 0; k < throughput/4; k++) { + for (int l = 0; l < 8; l++) + tstatex4[nxt][k * 8 + l] = uint32x4_t(midstate[l]); + HMAC_SHA256_80_initx4(&datax4[nxt][k * 20], &tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8]); + PBKDF2_SHA256_80_128x4(&tstatex4[nxt][k * 8], &ostatex4[nxt][k * 8], &datax4[nxt][k * 20], &Xx4[nxt][k * 32]); + } + } + + for (int i = 0; i < throughput/4; i++) { + for (int j = 0; j < 32; j++) { + uint32x4_t &t = Xx4[nxt][i * 32 + j]; + X[nxt][(4*i+0)*32+j] = t[0]; X[nxt][(4*i+1)*32+j] = t[1]; + X[nxt][(4*i+2)*32+j] = t[2]; X[nxt][(4*i+3)*32+j] = t[3]; + } + } + + cuda_scrypt_serialize(thr_id, nxt); + cuda_scrypt_HtoD(thr_id, X[nxt], nxt); + + cuda_scrypt_core(thr_id, nxt, N); + cuda_scrypt_done(thr_id, nxt); + + cuda_scrypt_DtoH(thr_id, X[nxt], nxt, false); + //cuda_scrypt_flush(thr_id, nxt); + if(!cuda_scrypt_sync(thr_id, nxt)) + { + result = -1; + break; + } + + for (int i = 0; i < throughput/4; i++) { + for (int j = 0; j < 32; j++) { + Xx4[cur][i * 32 + j] = uint32x4_t( + 
X[cur][(4*i+0)*32+j], X[cur][(4*i+1)*32+j], + X[cur][(4*i+2)*32+j], X[cur][(4*i+3)*32+j] + ); + } + } + + if (sha_multithreaded) + { +#ifdef WIN32 + parallel_for (0, num_shares, [&](int share) { + for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) { + PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]); + } + } ); +#else + #pragma omp parallel for + for (int share = 0; share < num_shares; share++) { + for (int k = (share_workload*share)/4; k < (share_workload*(share+1))/4 && k < throughput/4; k++) { + PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]); + } + } +#endif + } else { + + for (int k = 0; k < throughput/4; k++) { + PBKDF2_SHA256_128_32x4(&tstatex4[cur][k * 8], &ostatex4[cur][k * 8], &Xx4[cur][k * 32], &hashx4[cur][k * 8]); + } + } + + for (int i = 0; i < throughput/4; i++) { + for (int j = 0; j < 8; j++) { + uint32x4_t &t = hashx4[cur][i * 8 + j]; + hash[cur][(4*i+0)*8+j] = t[0]; hash[cur][(4*i+1)*8+j] = t[1]; + hash[cur][(4*i+2)*8+j] = t[2]; hash[cur][(4*i+3)*8+j] = t[3]; + } + } + } + else /* sha_on_cpu */ + { + n += throughput; + + cuda_scrypt_serialize(thr_id, nxt); + pre_sha256(thr_id, nxt, nonce[nxt], throughput); + + cuda_scrypt_core(thr_id, nxt, N); + // cuda_scrypt_flush(thr_id, nxt); + if (!cuda_scrypt_sync(thr_id, nxt)) { + printf("error\n"); + result = -1; + break; + } + + post_sha256(thr_id, nxt, throughput); + cuda_scrypt_done(thr_id, nxt); + + cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true); + // cuda_scrypt_flush(thr_id, nxt); + if (!cuda_scrypt_sync(thr_id, nxt)) { + printf("error\n"); + result = -1; + break; + } + } + + if (iteration > 0 || opt_n_threads == 0) + { + for (int i = 0; i < throughput; i++) + { + if (hash[cur][i * 8 + 7] <= Htarg && fulltest(hash[cur] + i * 8, ptarget)) + { + // CPU based validation to rule out GPU errors (scalar CPU code) + uint32_t _ALIGN(64) inp[32], ref[32], tstate[8], ostate[8], refhash[8], ldata[20]; + + memcpy(ldata, pdata, 80); ldata[19] = nonce[cur] + i; + memcpy(tstate, midstate, 32); + HMAC_SHA256_80_init(ldata, tstate, ostate); + PBKDF2_SHA256_80_128(tstate, ostate, ldata, inp); + computeGold(inp, ref, (uchar*)scratch); + bool good = true; + + if (sha_on_cpu) { + if (memcmp(&X[cur][i * 32], ref, 32*sizeof(uint32_t)) != 0) good = false; + } else { + PBKDF2_SHA256_128_32(tstate, ostate, ref, refhash); + if (memcmp(&hash[cur][i * 8], refhash, 32) != 0) good = false; + } + + if (!good) { + gpulog(LOG_WARNING, thr_id, "result does not validate on CPU! 
(i=%d, s=%d)", i, cur); + } else { + *hashes_done = n - pdata[19]; + work_set_target_ratio(work, refhash); + pdata[19] = nonce[cur] + i; + result = 1; + goto byebye; + } + } + } + } + + cur = (cur+1)&1; + nxt = (nxt+1)&1; + ++iteration; + + //printf("n=%d, thr=%d, max=%d, rest=%d\n", n, throughput, max_nonce, work_restart[thr_id].restart); + } while (n <= max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - pdata[19]; + pdata[19] = n; +byebye: + delete[] datax4[0]; delete[] datax4[1]; delete[] hashx4[0]; delete[] hashx4[1]; + delete[] tstatex4[0]; delete[] tstatex4[1]; delete[] ostatex4[0]; delete[] ostatex4[1]; + delete[] Xx4[0]; delete[] Xx4[1]; + delete [] scratch; + gettimeofday(tv_end, NULL); + return result; +} + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + +static void xor_salsa8(uint32_t * const B, const uint32_t * const C) +{ + uint32_t x0 = (B[ 0] ^= C[ 0]), x1 = (B[ 1] ^= C[ 1]), x2 = (B[ 2] ^= C[ 2]), x3 = (B[ 3] ^= C[ 3]); + uint32_t x4 = (B[ 4] ^= C[ 4]), x5 = (B[ 5] ^= C[ 5]), x6 = (B[ 6] ^= C[ 6]), x7 = (B[ 7] ^= C[ 7]); + uint32_t x8 = (B[ 8] ^= C[ 8]), x9 = (B[ 9] ^= C[ 9]), xa = (B[10] ^= C[10]), xb = (B[11] ^= C[11]); + uint32_t xc = (B[12] ^= C[12]), xd = (B[13] ^= C[13]), xe = (B[14] ^= C[14]), xf = (B[15] ^= C[15]); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. 
*/ + x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18); + + /* Operate on columns. */ + x4 ^= ROTL(x0 + xc, 7); x9 ^= ROTL(x5 + x1, 7); xe ^= ROTL(xa + x6, 7); x3 ^= ROTL(xf + xb, 7); + x8 ^= ROTL(x4 + x0, 9); xd ^= ROTL(x9 + x5, 9); x2 ^= ROTL(xe + xa, 9); x7 ^= ROTL(x3 + xf, 9); + xc ^= ROTL(x8 + x4, 13); x1 ^= ROTL(xd + x9, 13); x6 ^= ROTL(x2 + xe, 13); xb ^= ROTL(x7 + x3, 13); + x0 ^= ROTL(xc + x8, 18); x5 ^= ROTL(x1 + xd, 18); xa ^= ROTL(x6 + x2, 18); xf ^= ROTL(xb + x7, 18); + + /* Operate on rows. */ + x1 ^= ROTL(x0 + x3, 7); x6 ^= ROTL(x5 + x4, 7); xb ^= ROTL(xa + x9, 7); xc ^= ROTL(xf + xe, 7); + x2 ^= ROTL(x1 + x0, 9); x7 ^= ROTL(x6 + x5, 9); x8 ^= ROTL(xb + xa, 9); xd ^= ROTL(xc + xf, 9); + x3 ^= ROTL(x2 + x1, 13); x4 ^= ROTL(x7 + x6, 13); x9 ^= ROTL(x8 + xb, 13); xe ^= ROTL(xd + xc, 13); + x0 ^= ROTL(x3 + x2, 18); x5 ^= ROTL(x4 + x7, 18); xa ^= ROTL(x9 + x8, 18); xf ^= ROTL(xe + xd, 18); + + B[ 0] += x0; B[ 1] += x1; B[ 2] += x2; B[ 3] += x3; B[ 4] += x4; B[ 5] += x5; B[ 6] += x6; B[ 7] += x7; + B[ 8] += x8; B[ 9] += x9; B[10] += xa; B[11] += xb; B[12] += xc; B[13] += xd; B[14] += xe; B[15] += xf; +} + +/** + * @param X input/ouput + * @param V scratch buffer + * @param N factor (def. 1024) + */ +static void scrypt_core(uint32_t *X, uint32_t *V, uint32_t N) +{ + for (uint32_t i = 0; i < N; i++) { + memcpy(&V[i * 32], X, 128); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } + for (uint32_t i = 0; i < N; i++) { + uint32_t j = 32 * (X[16] & (N - 1)); + for (uint8_t k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } +} + +/** + * Compute reference data set on the CPU + * @param input input data as provided to device + * @param reference reference data, computed but preallocated + * @param scratchpad scrypt scratchpad + **/ +static void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad) +{ + uint32_t X[32] = { 0 }; + uint32_t *V = (uint32_t*) scratchpad; + uint32_t N = (1<<(opt_nfactor+1)); // default 9 = 1024 + + for (int k = 0; k < 32; k++) + X[k] = input[k]; + + scrypt_core(X, V, N); + + for (int k = 0; k < 32; k++) + reference[k] = X[k]; +} + +/* cputest */ +void scrypthash(void* output, const void* input) +{ + uint32_t _ALIGN(64) X[32], ref[32] = { 0 }, tstate[8], ostate[8], midstate[8]; + uint32_t _ALIGN(64) data[20]; + uchar *scratchbuf; + + // no default set with --cputest + if (opt_nfactor == 0) opt_nfactor = 9; + + scratchbuf = (uchar*) calloc(4 * 128 + 63, 1UL << (opt_nfactor+1)); + + memcpy(data, input, 80); + + sha256_init(midstate); + sha256_transform(midstate, data, 0); /* ok */ + + memcpy(tstate, midstate, 32); + HMAC_SHA256_80_init(data, tstate, ostate); + PBKDF2_SHA256_80_128(tstate, ostate, data, X); /* ok */ + + if (scratchbuf) { + computeGold(X, ref, scratchbuf); + PBKDF2_SHA256_128_32(tstate, ostate, ref, (uint32_t*) output); + } else { + memset(output, 0, 32); + } + + free(scratchbuf); +} diff --git a/scrypt/blake.cu b/scrypt/blake.cu new file mode 100644 index 0000000000..5e8443a9f8 --- /dev/null +++ b/scrypt/blake.cu @@ -0,0 +1,453 @@ +// +// =============== BLAKE part on nVidia GPU ====================== +// +// This is the 
generic "default" implementation when no architecture +// specific implementation is available in the kernel. +// +// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64 +// +// TODO: CUDA porting work remains to be done. +// + +#include +#include + +#include "miner.h" +#include "salsa_kernel.h" + +#include "cuda_helper.h" +typedef uint32_t sph_u32; +#define SPH_ROTL32 ROTL32 +#define SPH_ROTR32 ROTR32 + +__constant__ uint64_t ptarget64[4]; +__constant__ uint32_t pdata[20]; + +// define some error checking macros +#define DELIMITER '/' +#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) + +#undef checkCudaErrors +#define checkCudaErrors(x) \ +{ \ + cudaGetLastError(); \ + x; \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess && !abort_flag) \ + applog(LOG_ERR, "GPU #%d: cudaError %d (%s) (%s line %d)\n", device_map[thr_id], err, cudaGetErrorString(err), __FILENAME__, __LINE__); \ +} + +// from salsa_kernel.cu +extern std::map context_idata[2]; +extern std::map context_odata[2]; +extern std::map context_streams[2]; +extern std::map context_hash[2]; + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). + * + * @param dst the destination buffer + * @param val the 32-bit value to encode + */ +static __device__ void +cuda_sph_enc32be(void *dst, sph_u32 val) +{ + *(sph_u32 *)dst = cuda_swab32(val); +} + +#define Z00 0 +#define Z01 1 +#define Z02 2 +#define Z03 3 +#define Z04 4 +#define Z05 5 +#define Z06 6 +#define Z07 7 +#define Z08 8 +#define Z09 9 +#define Z0A A +#define Z0B B +#define Z0C C +#define Z0D D +#define Z0E E +#define Z0F F + +#define Z10 E +#define Z11 A +#define Z12 4 +#define Z13 8 +#define Z14 9 +#define Z15 F +#define Z16 D +#define Z17 6 +#define Z18 1 +#define Z19 C +#define Z1A 0 +#define Z1B 2 +#define Z1C B +#define Z1D 7 +#define Z1E 5 +#define Z1F 3 + +#define Z20 B +#define Z21 8 +#define Z22 C +#define Z23 0 +#define Z24 5 +#define Z25 2 +#define Z26 F +#define Z27 D +#define Z28 A +#define Z29 E +#define Z2A 3 +#define Z2B 6 +#define Z2C 7 +#define Z2D 1 +#define Z2E 9 +#define Z2F 4 + +#define Z30 7 +#define Z31 9 +#define Z32 3 +#define Z33 1 +#define Z34 D +#define Z35 C +#define Z36 B +#define Z37 E +#define Z38 2 +#define Z39 6 +#define Z3A 5 +#define Z3B A +#define Z3C 4 +#define Z3D 0 +#define Z3E F +#define Z3F 8 + +#define Z40 9 +#define Z41 0 +#define Z42 5 +#define Z43 7 +#define Z44 2 +#define Z45 4 +#define Z46 A +#define Z47 F +#define Z48 E +#define Z49 1 +#define Z4A B +#define Z4B C +#define Z4C 6 +#define Z4D 8 +#define Z4E 3 +#define Z4F D + +#define Z50 2 +#define Z51 C +#define Z52 6 +#define Z53 A +#define Z54 0 +#define Z55 B +#define Z56 8 +#define Z57 3 +#define Z58 4 +#define Z59 D +#define Z5A 7 +#define Z5B 5 +#define Z5C F +#define Z5D E +#define Z5E 1 +#define Z5F 9 + +#define Z60 C +#define Z61 5 +#define Z62 1 +#define Z63 F +#define Z64 E +#define Z65 D +#define Z66 4 +#define Z67 A +#define Z68 0 +#define Z69 7 +#define Z6A 6 +#define Z6B 3 +#define Z6C 9 +#define Z6D 2 +#define Z6E 8 +#define Z6F B + +#define Z70 D +#define Z71 B +#define Z72 7 +#define Z73 E +#define Z74 C +#define Z75 1 +#define Z76 3 +#define Z77 9 +#define Z78 5 +#define Z79 0 +#define Z7A F +#define Z7B 4 +#define Z7C 8 +#define Z7D 6 +#define Z7E 2 +#define Z7F A + +#define Z80 6 +#define Z81 F +#define Z82 E +#define Z83 9 +#define Z84 B +#define Z85 3 +#define Z86 0 +#define 
Z87 8 +#define Z88 C +#define Z89 2 +#define Z8A D +#define Z8B 7 +#define Z8C 1 +#define Z8D 4 +#define Z8E A +#define Z8F 5 + +#define Z90 A +#define Z91 2 +#define Z92 8 +#define Z93 4 +#define Z94 7 +#define Z95 6 +#define Z96 1 +#define Z97 5 +#define Z98 F +#define Z99 B +#define Z9A 9 +#define Z9B E +#define Z9C 3 +#define Z9D C +#define Z9E D +#define Z9F 0 + +#define Mx(r, i) Mx_(Z ## r ## i) +#define Mx_(n) Mx__(n) +#define Mx__(n) M ## n + +#define CSx(r, i) CSx_(Z ## r ## i) +#define CSx_(n) CSx__(n) +#define CSx__(n) CS ## n + +#define CS0 SPH_C32(0x243F6A88) +#define CS1 SPH_C32(0x85A308D3) +#define CS2 SPH_C32(0x13198A2E) +#define CS3 SPH_C32(0x03707344) +#define CS4 SPH_C32(0xA4093822) +#define CS5 SPH_C32(0x299F31D0) +#define CS6 SPH_C32(0x082EFA98) +#define CS7 SPH_C32(0xEC4E6C89) +#define CS8 SPH_C32(0x452821E6) +#define CS9 SPH_C32(0x38D01377) +#define CSA SPH_C32(0xBE5466CF) +#define CSB SPH_C32(0x34E90C6C) +#define CSC SPH_C32(0xC0AC29B7) +#define CSD SPH_C32(0xC97C50DD) +#define CSE SPH_C32(0x3F84D5B5) +#define CSF SPH_C32(0xB5470917) + +#define GS(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T32(a + b + (m0 ^ c1)); \ + d = SPH_ROTR32(d ^ a, 16); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 12); \ + a = SPH_T32(a + b + (m1 ^ c0)); \ + d = SPH_ROTR32(d ^ a, 8); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 7); \ + } while (0) + +#define ROUND_S(r) do { \ + GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ + GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ + GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ + GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ + GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ + GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ + GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ + GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ + } while (0) + +#define COMPRESS32 do { \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M0 = input[0]; \ + M1 = input[1]; \ + M2 = input[2]; \ + M3 = input[3]; \ + M4 = input[4]; \ + M5 = input[5]; \ + M6 = input[6]; \ + M7 = input[7]; \ + M8 = input[8]; \ + M9 = input[9]; \ + MA = input[10]; \ + MB = input[11]; \ + MC = input[12]; \ + MD = input[13]; \ + ME = input[14]; \ + MF = input[15]; \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + ROUND_S(4); \ + ROUND_S(5); \ + ROUND_S(6); \ + ROUND_S(7); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +__global__ +void cuda_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate ) +{ + uint32_t input[16]; + uint64_t output[4]; + + #pragma unroll + for (int i=0; i < 16; ++i) input[i] = pdata[i]; + + sph_u32 H0 = 0x6A09E667; + sph_u32 H1 = 0xBB67AE85; + sph_u32 H2 = 0x3C6EF372; + sph_u32 H3 = 0xA54FF53A; + sph_u32 H4 = 0x510E527F; + sph_u32 H5 = 0x9B05688C; + sph_u32 H6 = 0x1F83D9AB; + sph_u32 H7 = 0x5BE0CD19; + sph_u32 S0 = 0; + sph_u32 S1 = 0; + sph_u32 S2 = 0; + 
sph_u32 S3 = 0; + sph_u32 T0 = 0; + sph_u32 T1 = 0; + T0 = SPH_T32(T0 + 512); + COMPRESS32; + + #pragma unroll + for (int i=0; i < 3; ++i) input[i] = pdata[16+i]; + input[3] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); + input[4] = 0x80000000; + #pragma unroll 8 + for (int i=5; i < 13; ++i) input[i] = 0; + input[13] = 0x00000001; + input[14] = T1; + input[15] = T0 + 128; + + T0 = SPH_T32(T0 + 128); + COMPRESS32; + + cuda_sph_enc32be((unsigned char*)output + 4*6, H6); + cuda_sph_enc32be((unsigned char*)output + 4*7, H7); + if (validate || output[3] <= ptarget64[3]) + { + // this data is only needed when we actually need to save the hashes + cuda_sph_enc32be((unsigned char*)output + 4*0, H0); + cuda_sph_enc32be((unsigned char*)output + 4*1, H1); + cuda_sph_enc32be((unsigned char*)output + 4*2, H2); + cuda_sph_enc32be((unsigned char*)output + 4*3, H3); + cuda_sph_enc32be((unsigned char*)output + 4*4, H4); + cuda_sph_enc32be((unsigned char*)output + 4*5, H5); + } + + if (validate) + { + g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x); + #pragma unroll + for (int i=0; i < 4; ++i) g_out[i] = output[i]; + } + + if (output[3] <= ptarget64[3]) { + uint64_t *g_good64 = (uint64_t*)g_good; + if (output[3] < g_good64[3]) { + g_good64[3] = output[3]; + g_good64[2] = output[2]; + g_good64[1] = output[1]; + g_good64[0] = output[0]; + g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x); + } + } +} + +static std::map context_good[2]; + +static bool init[MAX_GPUS] = { 0 }; + +bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) +{ + if (!init[thr_id]) + { + // allocate pinned host memory for good hashes + uint32_t *tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp; + + init[thr_id] = true; + } + checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 80, 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 32, 0, cudaMemcpyHostToDevice)); + + return context_good[0][thr_id] && context_good[1][thr_id]; +} + +void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h) +{ + checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id])); + + cuda_blake256_hash<<>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h); + + // copy hashes from device memory to host (ALL hashes, lots of data...) + if (do_d2h && hash != NULL) { + size_t mem_size = throughput * sizeof(uint32_t) * 8; + checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size, + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } + else if (hash != NULL) { + // asynchronous copy of winning nonce (just 4 bytes...) + checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t), + cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); + } +} + +void default_free_blake256(int thr_id) +{ + if (init[thr_id]) { + cudaFree(context_good[0][thr_id]); + cudaFree(context_good[1][thr_id]); + init[thr_id] = false; + } +} + diff --git a/scrypt/code/scrypt-conf.h b/scrypt/code/scrypt-conf.h new file mode 100644 index 0000000000..46685a5186 --- /dev/null +++ b/scrypt/code/scrypt-conf.h @@ -0,0 +1,28 @@ +/* + pick the best algo at runtime or compile time? 
+ ---------------------------------------------- + SCRYPT_CHOOSE_COMPILETIME (gcc only!) + SCRYPT_CHOOSE_RUNTIME +*/ +#define SCRYPT_CHOOSE_RUNTIME + + +/* + hash function to use + ------------------------------- + SCRYPT_BLAKE256 + SCRYPT_BLAKE512 + SCRYPT_SHA256 + SCRYPT_SHA512 + SCRYPT_SKEIN512 +*/ +//#define SCRYPT_SHA256 + + +/* + block mixer to use + ----------------------------- + SCRYPT_CHACHA + SCRYPT_SALSA +*/ +//#define SCRYPT_SALSA diff --git a/scrypt/code/scrypt-jane-chacha.h b/scrypt/code/scrypt-jane-chacha.h new file mode 100644 index 0000000000..a4017e0109 --- /dev/null +++ b/scrypt/code/scrypt-jane-chacha.h @@ -0,0 +1,60 @@ +#define SCRYPT_MIX_BASE "ChaCha20/8" + +typedef uint32_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U32TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + +#define SCRYPT_P 1 +#define SCRYPT_R 1 +#define SCRYPT_BLOCK_BYTES 64 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_chacha.h" + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN chacha_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix() { + size_t cpuflags = detect_cpu(); + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations() { + size_t cpuflags = detect_cpu(); + size_t flags = 0; + + return flags; +} +#endif + +static int +scrypt_test_mix() { + static const uint8_t expected[16] = { + 0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_CHACHA_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} + diff --git a/scrypt/code/scrypt-jane-mix_chacha.h b/scrypt/code/scrypt-jane-mix_chacha.h new file mode 100644 index 0000000000..85ee9c1ce4 --- /dev/null +++ b/scrypt/code/scrypt-jane-mix_chacha.h @@ -0,0 +1,69 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "ChaCha20/8 Ref" + +#undef SCRYPT_CHACHA_INCLUDED +#define SCRYPT_CHACHA_INCLUDED +#define SCRYPT_CHACHA_BASIC + +static void +chacha_core_basic(uint32_t state[16]) { + size_t rounds = 8; + uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; + + x0 = state[0]; + x1 = state[1]; + x2 = state[2]; + x3 = state[3]; + x4 = state[4]; + x5 = state[5]; + x6 = state[6]; + x7 = state[7]; + x8 = state[8]; + x9 = state[9]; + x10 = state[10]; + x11 = state[11]; + x12 = state[12]; + x13 = state[13]; + x14 = state[14]; + x15 = state[15]; + + #define quarter(a,b,c,d) \ + a += b; t = d^a; d = ROTL32(t,16); \ + c += d; t = b^c; b = ROTL32(t,12); \ + a += b; t = d^a; d = ROTL32(t, 8); \ + c += d; t = b^c; b = ROTL32(t, 7); + + for (; rounds; rounds -= 2) { + quarter( x0, x4, x8,x12) + quarter( x1, x5, x9,x13) + quarter( x2, x6,x10,x14) + quarter( x3, x7,x11,x15) + quarter( x0, x5,x10,x15) + quarter( x1, x6,x11,x12) + quarter( x2, x7, x8,x13) + quarter( x3, x4, x9,x14) + } + + state[0] += x0; + state[1] += x1; + state[2] += x2; + state[3] += x3; + state[4] += x4; + state[5] += x5; + state[6] += x6; + state[7] += 
x7; + state[8] += x8; + state[9] += x9; + state[10] += x10; + state[11] += x11; + state[12] += x12; + state[13] += x13; + state[14] += x14; + state[15] += x15; + + #undef quarter +} + +#endif \ No newline at end of file diff --git a/scrypt/code/scrypt-jane-portable-x86.h b/scrypt/code/scrypt-jane-portable-x86.h new file mode 100644 index 0000000000..44f97f9940 --- /dev/null +++ b/scrypt/code/scrypt-jane-portable-x86.h @@ -0,0 +1,32 @@ + +typedef enum cpu_flags_x86_t { }cpu_flags_x86; + +typedef enum cpu_vendors_x86_t { + cpu_nobody, + cpu_intel, + cpu_amd +} cpu_vendors_x86; + +typedef struct x86_regs_t { + uint32_t eax, ebx, ecx, edx; +} x86_regs; + + +#if defined(SCRYPT_TEST_SPEED) +size_t cpu_detect_mask = (size_t)-1; +#endif + +static size_t +detect_cpu(void) { + size_t cpu_flags = 0; + return cpu_flags; +} + +#if defined(SCRYPT_TEST_SPEED) +static const char * +get_top_cpuflag_desc(size_t flag) { + return "Basic"; +} +#endif + +#define asm_calling_convention diff --git a/scrypt/code/scrypt-jane-portable.h b/scrypt/code/scrypt-jane-portable.h new file mode 100644 index 0000000000..ef5b93d48b --- /dev/null +++ b/scrypt/code/scrypt-jane-portable.h @@ -0,0 +1,284 @@ +/* determine os */ +#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__) + #include + #include + #define OS_WINDOWS +#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__) + #include + #include + #include + + #define OS_SOLARIS +#else + #include + #include + #include /* need this to define BSD */ + #include + #include + + #define OS_NIX + #if defined(__linux__) + #include + #define OS_LINUX + #elif defined(BSD) + #define OS_BSD + + #if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__)) + #define OS_OSX + #elif defined(macintosh) || defined(Macintosh) + #define OS_MAC + #elif defined(__OpenBSD__) + #define OS_OPENBSD + #endif + #endif +#endif + + +/* determine compiler */ +#if defined(_MSC_VER) + #define COMPILER_MSVC _MSC_VER + #if ((COMPILER_MSVC > 1200) || defined(_mm_free)) + #define COMPILER_MSVC6PP_AND_LATER + #endif + #if (COMPILER_MSVC >= 1500) + #define COMPILER_HAS_TMMINTRIN + #endif + + #pragma warning(disable : 4127) /* conditional expression is constant */ + #pragma warning(disable : 4100) /* unreferenced formal parameter */ + + #ifndef _CRT_SECURE_NO_WARNINGS + #define _CRT_SECURE_NO_WARNINGS + #endif + + #include + #include /* _rotl */ + #include + + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef signed int int32_t; + typedef unsigned __int64 uint64_t; + typedef signed __int64 int64_t; + + #define ROTL32(a,b) _rotl(a,b) + #define ROTR32(a,b) _rotr(a,b) + #define ROTL64(a,b) _rotl64(a,b) + #define ROTR64(a,b) _rotr64(a,b) + #undef NOINLINE + #define NOINLINE __declspec(noinline) + #undef INLINE + #define INLINE __forceinline + #undef FASTCALL + #define FASTCALL __fastcall + #undef CDECL + #define CDECL __cdecl + #undef STDCALL + #define STDCALL __stdcall + #undef NAKED + #define NAKED __declspec(naked) + #define MM16 __declspec(align(16)) +#endif +#if defined(__ICC) + #define COMPILER_INTEL +#endif +#if defined(__GNUC__) + #if (__GNUC__ >= 3) + #define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__ + #else + #define COMPILER_GCC_PATCHLEVEL 0 + #endif + #define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL) + #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) + #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + #define ROTL64(a,b) (((a) << (b)) | 
((a) >> (64 - b))) + #define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b))) + #undef NOINLINE + #if (COMPILER_GCC >= 30000) + #define NOINLINE __attribute__((noinline)) + #else + #define NOINLINE + #endif + #undef INLINE + #if (COMPILER_GCC >= 30000) + #define INLINE __attribute__((always_inline)) + #else + #define INLINE inline + #endif + #undef FASTCALL + #if (COMPILER_GCC >= 30400) + #define FASTCALL __attribute__((fastcall)) + #else + #define FASTCALL + #endif + #undef CDECL + #define CDECL __attribute__((cdecl)) + #undef STDCALL + #define STDCALL __attribute__((stdcall)) + #define MM16 __attribute__((aligned(16))) + #include +#endif +#if defined(__MINGW32__) || defined(__MINGW64__) + #define COMPILER_MINGW +#endif +#if defined(__PATHCC__) + #define COMPILER_PATHCC +#endif + +#define OPTIONAL_INLINE +#if defined(OPTIONAL_INLINE) + #undef OPTIONAL_INLINE + #define OPTIONAL_INLINE INLINE +#else + #define OPTIONAL_INLINE +#endif + +#define CRYPTO_FN NOINLINE STDCALL + +/* determine cpu */ +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64) + #define CPU_X86_64 +#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500)) + #define CPU_X86 500 +#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400)) + #define CPU_X86 400 +#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__) + #define CPU_X86 300 +#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64) + #define CPU_IA64 +#endif + +#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9) + #define CPU_SPARC + #if defined(__sparcv9) + #define CPU_SPARC64 + #endif +#endif + +#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64)) + #define CPU_64BITS + #undef FASTCALL + #define FASTCALL + #undef CDECL + #define CDECL + #undef STDCALL + #define STDCALL +#endif + +#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC) + #define CPU_PPC + #if defined(_ARCH_PWR7) + #define CPU_POWER7 + #elif defined(__64BIT__) + #define CPU_PPC64 + #else + #define CPU_PPC32 + #endif +#endif + +#if defined(__hppa__) || defined(__hppa) + #define CPU_HPPA +#endif + +#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) + #define CPU_ALPHA +#endif + +/* endian */ + +#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \ + (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \ + (defined(CPU_X86) || defined(CPU_X86_64)) || \ + (defined(vax) || defined(MIPSEL) || defined(_MIPSEL))) +#define CPU_LE +#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \ + (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \ + (defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB)) +#define CPU_BE +#else + /* unknown endian! 
*/ +#endif + + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + +#define U8TO32_LE(p) \ + (((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \ + ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U32TO8_LE(p, v) \ + (p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \ + (p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24); + +#define U8TO64_BE(p) \ + (((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4)) + +#define U8TO64_LE(p) \ + (((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32)) + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + +#define U64TO8_LE(p, v) \ + U32TO8_LE((p), (uint32_t)((v) )); \ + U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); + +#define U32_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \ + (v) = ((v) << 16) | ((v) >> 16); \ +} + +#define U64_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \ + (v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \ + (v) = ((v) << 32) | ((v) >> 32); \ +} + +static int +scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { + uint32_t differentbits = 0; + while (len--) + differentbits |= (*x++ ^ *y++); + return (1 & ((differentbits - 1) >> 8)); +} + +void +scrypt_ensure_zero(void *p, size_t len) { +#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) + __stosb((unsigned char *)p, 0, len); +#elif (defined(CPU_X86) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushl %%edi;\n" + "pushl %%ecx;\n" + "rep stosb;\n" + "popl %%ecx;\n" + "popl %%edi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#elif (defined(CPU_X86_64) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushq %%rdi;\n" + "pushq %%rcx;\n" + "rep stosb;\n" + "popq %%rcx;\n" + "popq %%rdi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#else + volatile uint8_t *b = (volatile uint8_t *)p; + size_t i; + for (i = 0; i < len; i++) + b[i] = 0; +#endif +} + +#include "scrypt-jane-portable-x86.h" + diff --git a/scrypt/code/scrypt-jane-romix-basic.h b/scrypt/code/scrypt-jane-romix-basic.h new file mode 100644 index 0000000000..1cdb3fb06f --- /dev/null +++ b/scrypt/code/scrypt-jane-romix-basic.h @@ -0,0 +1,67 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +/* function type returned by scrypt_getROMix, used with cpu detection */ +typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r); +#endif + +/* romix pre/post nop function */ +static void asm_calling_convention +scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { +} + +/* romix pre/post endian conversion function */ +static void asm_calling_convention +scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { +#if !defined(CPU_LE) + static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; + size_t i; + if (endian_test.w == 0x100) { + nblocks *= SCRYPT_BLOCK_WORDS; + for (i = 0; i < nblocks; i++) { + SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); + } + } +#endif +} + +/* chunkmix test function */ +typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t 
*Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); +typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); + +static int +scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { + /* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */ + const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; + scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v; + uint8_t final[16]; + size_t i; + + for (i = 0; i < words; i++) { + v = (scrypt_mix_word_t)i; + v = (v << 8) | v; + v = (v << 16) | v; + chunk[0][i] = v; + } + + prefn(chunk[0], blocks); + mixfn(chunk[1], chunk[0], NULL, r); + postfn(chunk[1], blocks); + + /* grab the last 16 bytes of the final block */ + for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) { + SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]); + } + + return scrypt_verify(expected, final, 16); +} + +/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */ +static scrypt_mix_word_t * +scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) { + return base + (i * len); +} + +/* returns a pointer to block i */ +static scrypt_mix_word_t * +scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) { + return base + (i * SCRYPT_BLOCK_WORDS); +} diff --git a/scrypt/code/scrypt-jane-romix-template.h b/scrypt/code/scrypt-jane-romix-template.h new file mode 100644 index 0000000000..7879c58f84 --- /dev/null +++ b/scrypt/code/scrypt-jane-romix-template.h @@ -0,0 +1,179 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) + +#if defined(SCRYPT_CHOOSE_COMPILETIME) +#undef SCRYPT_ROMIX_FN +#define SCRYPT_ROMIX_FN scrypt_ROMix +#endif + +#undef SCRYPT_HAVE_ROMIX +#define SCRYPT_HAVE_ROMIX + +#if !defined(SCRYPT_CHUNKMIX_FN) + +#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic + +/* + Bout = ChunkMix(Bin) + + 2*r: number of blocks in the chunk +*/ +static void asm_calling_convention +SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { + scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block; + uint32_t i, j, blocksPerChunk = r * 2, half = 0; + + /* 1: X = B_{2r - 1} */ + block = scrypt_block(Bin, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] = block[i]; + + if (Bxor) { + block = scrypt_block(Bxor, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] ^= block[i]; + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + block = scrypt_block(Bin, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + + if (Bxor) { + block = scrypt_block(Bxor, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + } + SCRYPT_MIX_FN(X); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + block = scrypt_block(Bout, (i / 2) + half); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + block[j] = X[j]; + } +} +#endif + +/* + X = ROMix(X) + + X: chunk to mix + Y: scratch chunk + N: number of rounds + V[N]: array of chunks to randomly index in to + 2*r: number of blocks in a chunk +*/ + +static void NOINLINE FASTCALL +SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * 
chunkWords]*/, uint32_t N, uint32_t r) { + uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; + scrypt_mix_word_t *block = V; + + SCRYPT_ROMIX_TANGLE_FN(X, r * 2); + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); + for (i = 0; i < N - 1; i++, block += chunkWords) { + /* 3: V_i = X */ + /* 4: X = H(X) */ + SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); + } + SCRYPT_CHUNKMIX_FN(X, block, NULL, r); + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 7: j = Integerify(X) % N */ + j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ + SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); + + /* 7: j = Integerify(Y) % N */ + j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ + SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); + } + + /* 10: B' = X */ + /* implicit */ + + SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); +} + +/* + * Special version with hard-coded r = 1 + * - mikaelh + */ +static void NOINLINE FASTCALL +scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) { + const uint32_t r = 1; + uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; + scrypt_mix_word_t *block = V; + + SCRYPT_ROMIX_TANGLE_FN(X, r * 2); + + /* 1: X = B */ + /* implicit */ + + /* 2: for i = 0 to N - 1 do */ + memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); + for (i = 0; i < N - 1; i++, block += chunkWords) { + /* 3: V_i = X */ + /* 4: X = H(X) */ +#ifdef SCRYPT_CHUNKMIX_1_FN + SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block); +#else + SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); +#endif + } +#ifdef SCRYPT_CHUNKMIX_1_FN + SCRYPT_CHUNKMIX_1_FN(X, block); +#else + SCRYPT_CHUNKMIX_FN(X, block, NULL, r); +#endif + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < N; i += 2) { + /* 7: j = Integerify(X) % N */ + j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ +#ifdef SCRYPT_CHUNKMIX_1_XOR_FN + SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords)); +#else + SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); +#endif + + /* 7: j = Integerify(Y) % N */ + j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); + + /* 8: X = H(Y ^ V_j) */ +#ifdef SCRYPT_CHUNKMIX_1_XOR_FN + SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords)); +#else + SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); +#endif + } + + /* 10: B' = X */ + /* implicit */ + + SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); +} + +#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */ + + +#undef SCRYPT_CHUNKMIX_FN +#undef SCRYPT_ROMIX_FN +#undef SCRYPT_MIX_FN +#undef SCRYPT_ROMIX_TANGLE_FN +#undef SCRYPT_ROMIX_UNTANGLE_FN + diff --git a/scrypt/fermi_kernel.cu b/scrypt/fermi_kernel.cu new file mode 100644 index 0000000000..d58bd7f46b --- /dev/null +++ b/scrypt/fermi_kernel.cu @@ -0,0 +1,907 @@ +// +// Kernel that runs best on Fermi devices +// +// - shared memory use reduced by nearly factor 2 over legacy kernel +// by transferring only half work units (16 x uint32_t) at once. +// - uses ulong2/uint4 based memory transfers (each thread moves 16 bytes), +// allowing for shorter unrolled loops. This relies on Fermi's better +// memory controllers to get high memory troughput. 
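//
// A minimal sketch of the 16-byte transfer pattern described above (the helper
// name is hypothetical and it assumes 16-byte aligned pointers; the kernels
// below spell out the equivalent *((ulonglong2*)dst) = *((ulonglong2*)src)
// casts inline):

static __device__ __forceinline__ void copy16(uint32_t *dst, const uint32_t *src)
{
	// one 128-bit transaction per thread instead of four 32-bit loads/stores
	*reinterpret_cast<uint4 *>(dst) = *reinterpret_cast<const uint4 *>(src);
}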
+// +// NOTE: compile this .cu module for compute_20,sm_20 with --maxrregcount=63 +// +// TODO: batch-size support for this kernel +// + +#include + +#include +#include "miner.h" + +#include "salsa_kernel.h" +#include "fermi_kernel.h" + +#define THREADS_PER_WU 1 // single thread per hash + +#define TEXWIDTH 32768 + +// forward references +template __global__ void fermi_scrypt_core_kernelA(uint32_t *g_idata, unsigned int N); +template __global__ void fermi_scrypt_core_kernelB(uint32_t *g_odata, unsigned int N); +template __global__ void fermi_scrypt_core_kernelB_tex(uint32_t *g_odata, unsigned int N); +template __global__ void fermi_scrypt_core_kernelA_LG(uint32_t *g_idata, unsigned int N, unsigned int LOOKUP_GAP); +template __global__ void fermi_scrypt_core_kernelB_LG(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP); +template __global__ void fermi_scrypt_core_kernelB_LG_tex(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP); + +// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// using texture references for the "tex" variants of the B kernels +texture texRef1D_4_V; +texture texRef2D_4_V; + +FermiKernel::FermiKernel() : KernelInterface() +{ +} + +bool FermiKernel::bindtexture_1D(uint32_t *d_V, size_t size) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef1D_4_V.normalized = 0; + texRef1D_4_V.filterMode = cudaFilterModePoint; + texRef1D_4_V.addressMode[0] = cudaAddressModeClamp; + checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size)); + return true; +} + +bool FermiKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef2D_4_V.normalized = 0; + texRef2D_4_V.filterMode = cudaFilterModePoint; + texRef2D_4_V.addressMode[0] = cudaAddressModeClamp; + texRef2D_4_V.addressMode[1] = cudaAddressModeClamp; + // maintain texture width of TEXWIDTH (max. limit is 65000) + while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; } + while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; } +// fprintf(stderr, "total size: %u, %u bytes\n", pitch * height, width * sizeof(uint32_t) * 4 * height); +// fprintf(stderr, "binding width width=%d, height=%d, pitch=%d\n", width, height,pitch); + checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch)); + return true; +} + +bool FermiKernel::unbindtexture_1D() +{ + checkCudaErrors(cudaUnbindTexture(texRef1D_4_V)); + return true; +} + +bool FermiKernel::unbindtexture_2D() +{ + checkCudaErrors(cudaUnbindTexture(texRef2D_4_V)); + return true; +} + +void FermiKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool FermiKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + + int shared = WARPS_PER_BLOCK * WU_PER_WARP * (16+4) * sizeof(uint32_t); + + // First phase: Sequential writes to scratchpad. 
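	// Dispatch note: with LOOKUP_GAP == 1, kernel A stores all N scratchpad states,
	// so the phase-two kernels can read V[j] directly. The _LG variants store only
	// every LOOKUP_GAP-th state (see the "i % LOOKUP_GAP == 0" test in
	// fermi_scrypt_core_kernelA_LG), trading scratchpad memory for extra mix
	// iterations when a read hits a state that was not stored. texture_cache == 1/2
	// selects the 1-D/2-D texture-fetch versions of the read kernels, and
	// IS_SCRYPT()/IS_SCRYPT_JANE() picks the Salsa20/8 or ChaCha20/8 mix through
	// the ALGO template parameter.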
+ + if (LOOKUP_GAP == 1) { + if (IS_SCRYPT()) fermi_scrypt_core_kernelA<<< grid, threads, shared, stream >>>(d_idata, N); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelA<<< grid, threads, shared, stream >>>(d_idata, N); + } else { + if (IS_SCRYPT()) fermi_scrypt_core_kernelA_LG<<< grid, threads, shared, stream >>>(d_idata, N, LOOKUP_GAP); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelA_LG<<< grid, threads, shared, stream >>>(d_idata, N, LOOKUP_GAP); + } + + // Second phase: Random read access from scratchpad. + + if (LOOKUP_GAP == 1) { + if (texture_cache) { + if (texture_cache == 1) { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB_tex<<< grid, threads, shared, stream >>>(d_odata, N); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_tex<<< grid, threads, shared, stream >>>(d_odata, N); + } else if (texture_cache == 2) { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB_tex<<< grid, threads, shared, stream >>>(d_odata, N); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_tex<<< grid, threads, shared, stream >>>(d_odata, N); + } + else success = false; + } else { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, N); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, N); + } + } else { + if (texture_cache) { + if (texture_cache == 1) { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB_LG_tex<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG_tex<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + } else if (texture_cache == 2) { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB_LG_tex<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG_tex<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + } + else success = false; + } else { + if (IS_SCRYPT()) fermi_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + if (IS_SCRYPT_JANE()) fermi_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, N, LOOKUP_GAP); + } + } + + return success; +} + +#if 0 + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + +#define QUARTER(a,b,c,d) \ + a += b; d ^= a; d = ROTL(d,16); \ + c += d; b ^= c; b = ROTL(b,12); \ + a += b; d ^= a; d = ROTL(d,8); \ + c += d; b ^= c; b = ROTL(b,7); + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. 
*/ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#else + +#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + +#define ADD4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 += s1; d2 += s2; d3 += s3; d4 += s4; + +#define XOR4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 ^= s1; d2 ^= s2; d3 ^= s3; d4 ^= s4; + +#define ROTL4(d1,d2,d3,d4,amt) \ + d1 = ROTL(d1, amt); d2 = ROTL(d2, amt); d3 = ROTL(d3, amt); d4 = ROTL(d4, amt); + +#define QROUND(a1,a2,a3,a4, b1,b2,b3,b4, c1,c2,c3,c4, amt) \ + ADD4 (a1,a2,a3,a4, c1,c2,c3,c4) \ + XOR4 (b1,b2,b3,b4, a1,a2,a3,a4) \ + ROTL4(b1,b2,b3,b4, amt) + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. 
*/ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#endif + +#define ROTL7(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=(((a00)<<7) | ((a00)>>25) );\ +a1^=(((a10)<<7) | ((a10)>>25) );\ +a2^=(((a20)<<7) | ((a20)>>25) );\ +a3^=(((a30)<<7) | ((a30)>>25) );\ +};\ + +#define ROTL9(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=(((a00)<<9) | ((a00)>>23) );\ +a1^=(((a10)<<9) | ((a10)>>23) );\ +a2^=(((a20)<<9) | ((a20)>>23) );\ +a3^=(((a30)<<9) | ((a30)>>23) );\ +};\ + +#define ROTL13(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=(((a00)<<13) | ((a00)>>19) );\ +a1^=(((a10)<<13) | ((a10)>>19) );\ +a2^=(((a20)<<13) | ((a20)>>19) );\ +a3^=(((a30)<<13) | ((a30)>>19) );\ +};\ + +#define ROTL18(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=(((a00)<<18) | ((a00)>>14) );\ +a1^=(((a10)<<18) | ((a10)>>14) );\ +a2^=(((a20)<<18) | ((a20)>>14) );\ +a3^=(((a30)<<18) | ((a30)>>14) );\ +};\ + +static __device__ void xor_salsa8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= 
C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +static __device__ __forceinline__ uint4& operator^=(uint4& left, const uint4& right) +{ + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Scrypt core kernel for Fermi class devices. +//! 
@param g_idata input data in global memory +//! @param g_odata output data in global memory +//////////////////////////////////////////////////////////////////////////////// +template __global__ +void fermi_scrypt_core_kernelA(uint32_t *g_idata, unsigned int N) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + const unsigned int LOOKUP_GAP = 1; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z; + + // registers to store an entire work unit + uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+Z])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu+16])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+16+Z])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + for (int i = 1; i < N; i++) { + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu + i*32])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu + i*32 + 16])) = *((ulonglong2*)XB[wu]); + } +} + +template __global__ +void fermi_scrypt_core_kernelB(uint32_t *g_odata, unsigned int N) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + const unsigned int LOOKUP_GAP = 1; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z; + + // registers to store an entire work unit + uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + (N-1)*32])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + (N-1)*32 + 16])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, 
B); break; + } + + for (int i = 0; i < N; i++) { + + XX[16] = 32 * (C[0].x & (N-1)); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z]])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] ^= *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z] + 16])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] ^= *((uint4*)&XX[4*idx]); + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); + +} + +template __global__ void +fermi_scrypt_core_kernelB_tex(uint32_t *g_odata, unsigned int N) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + const unsigned int LOOKUP_GAP = 1; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_odata += 32 * offset; + + // registers to store an entire work unit + uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + (N-1)*32 + Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + (N-1)*32 + 16+Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + + for (int i = 0; i < N; i++) { + + XX[16] = 32 * (C[0].x & (N-1)); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] ^= *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + 16+Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? 
+ tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] ^= *((uint4*)&XX[4*idx]); + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); +} + +// +// Lookup-Gap variations of the above functions +// + +template __global__ void +fermi_scrypt_core_kernelA_LG(uint32_t *g_idata, unsigned int N, unsigned int LOOKUP_GAP) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z; + + // registers to store an entire work unit + uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+Z])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu+16])) = *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&g_idata[32*(wu+Y)+16+Z])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + for (int i = 1; i < N; i++) { + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + + if (i % LOOKUP_GAP == 0) { +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu + (i/LOOKUP_GAP)*32])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&V[SCRATCH*wu + (i/LOOKUP_GAP)*32 + 16])) = *((ulonglong2*)XB[wu]); + } + } +} + +template __global__ void +fermi_scrypt_core_kernelB_LG(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / WU_PER_WARP] + SCRATCH*Y + Z; + + // registers to store an entire work unit 
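+ // B holds the first 64 bytes and C the second 64 bytes of this
+ // work unit's 128-byte scrypt state X.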
+ uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + + uint32_t pos = (N-1)/LOOKUP_GAP; uint32_t loop = 1 + (N-1)-pos*LOOKUP_GAP; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + pos*32])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + pos*32 + 16])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + while (loop--) + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + + for (int i = 0; i < N; i++) { + + uint32_t j = C[0].x & (N-1); + uint32_t pos = j / LOOKUP_GAP; uint32_t loop = j - pos*LOOKUP_GAP; + XX[16] = 32 * pos; + + uint4 b[4], c[4]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z]])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) b[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)XB[wu]) = *((ulonglong2*)(&V[SCRATCH*wu + XB[wu][16-Z] + 16])); +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) c[idx] = *((uint4*)&XX[4*idx]); + + while (loop--) + switch(ALGO) { + case A_SCRYPT: xor_salsa8(b, c); xor_salsa8(c, b); break; + case A_SCRYPT_JANE: xor_chacha8(b, c); xor_chacha8(c, b); break; + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] ^= b[idx]; +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] ^= c[idx]; + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); + +} + +template __global__ void +fermi_scrypt_core_kernelB_LG_tex(uint32_t *g_odata, unsigned int N, unsigned int LOOKUP_GAP) +{ + extern __shared__ unsigned char x[]; + uint32_t ((*X)[WU_PER_WARP][16+4]) = (uint32_t (*)[WU_PER_WARP][16+4]) x; + + int warpIdx = threadIdx.x / warpSize; + int warpThread = threadIdx.x % warpSize; + + // variables supporting the large memory transaction magic + unsigned int Y = warpThread/4; + unsigned int Z = 4*(warpThread%4); + + // add block specific offsets + int WARPS_PER_BLOCK = blockDim.x / 32; + int offset = blockIdx.x * WU_PER_BLOCK + warpIdx * WU_PER_WARP; + g_odata += 32 * offset; + + // registers to store an entire work unit + uint4 B[4], C[4]; + + uint32_t ((*XB)[16+4]) = (uint32_t (*)[16+4])&X[warpIdx][Y][Z]; + uint32_t *XX = X[warpIdx][warpThread]; + + uint32_t pos = (N-1)/LOOKUP_GAP; uint32_t loop = 1 + (N-1)-pos*LOOKUP_GAP; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + pos*32 + Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? 
+ tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + pos*32 + 16+Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] = *((uint4*)&XX[4*idx]); + + while (loop--) + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + + for (int i = 0; i < N; i++) { + + uint32_t j = C[0].x & (N-1); + uint32_t pos = j / LOOKUP_GAP; uint32_t loop = j - pos*LOOKUP_GAP; + XX[16] = 32 * pos; + + uint4 b[4], c[4]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) b[idx] = *((uint4*)&XX[4*idx]); + +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) { unsigned int loc = (SCRATCH*(offset+wu+Y) + XB[wu][16-Z] + 16+Z)/4; + *((uint4*)XB[wu]) = ((TEX_DIM == 1) ? + tex1Dfetch(texRef1D_4_V, loc) : + tex2D(texRef2D_4_V, 0.5f + (loc%TEXWIDTH), 0.5f + (loc/TEXWIDTH))); } +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) c[idx] = *((uint4*)&XX[4*idx]); + + while (loop--) + switch(ALGO) { + case A_SCRYPT: xor_salsa8(b, c); xor_salsa8(c, b); break; + case A_SCRYPT_JANE: xor_chacha8(b, c); xor_chacha8(c, b); break; + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) B[idx] ^= b[idx]; +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) C[idx] ^= c[idx]; + + switch(ALGO) { + case A_SCRYPT: xor_salsa8(B, C); xor_salsa8(C, B); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); xor_chacha8(C, B); break; + } + } + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = B[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+Z])) = *((ulonglong2*)XB[wu]); + +#pragma unroll 4 + for (int idx=0; idx < 4; idx++) *((uint4*)&XX[4*idx]) = C[idx]; +#pragma unroll 4 + for (int wu=0; wu < 32; wu+=8) + *((ulonglong2*)(&g_odata[32*(wu+Y)+16+Z])) = *((ulonglong2*)XB[wu]); +} diff --git a/scrypt/fermi_kernel.h b/scrypt/fermi_kernel.h new file mode 100644 index 0000000000..54f822d2f2 --- /dev/null +++ b/scrypt/fermi_kernel.h @@ -0,0 +1,28 @@ +#ifndef FERMI_KERNEL_H +#define FERMI_KERNEL_H + +#include "salsa_kernel.h" + +class FermiKernel : public KernelInterface +{ +public: + FermiKernel(); + + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V); + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache); + virtual bool bindtexture_1D(uint32_t *d_V, size_t size); + virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch); + virtual bool unbindtexture_1D(); + virtual bool unbindtexture_2D(); + + virtual char get_identifier() { return 'F'; }; + virtual int get_major_version() { return 1; } + virtual int get_minor_version() { return 0; } + virtual int max_warps_per_block() { return 16; }; + virtual int get_texel_width() { return 4; }; + 
virtual bool support_lookup_gap() { return true; } + virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferShared; } +}; + +#endif // #ifndef FERMI_KERNEL_H diff --git a/scrypt/keccak.cu b/scrypt/keccak.cu new file mode 100644 index 0000000000..7675442964 --- /dev/null +++ b/scrypt/keccak.cu @@ -0,0 +1,492 @@ +// +// =============== KECCAK part on nVidia GPU ====================== +// +// The keccak512 (SHA-3) is used in the PBKDF2 for scrypt-jane coins +// in place of the SHA2 based PBKDF2 used in scrypt coins. +// +// NOTE: compile this .cu module for compute_20,sm_20 with --maxrregcount=64 +// + +#include + +#include "miner.h" +#include "cuda_helper.h" + +#include "keccak.h" +#include "salsa_kernel.h" + +// define some error checking macros +#define DELIMITER '/' +#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) + +#undef checkCudaErrors +#define checkCudaErrors(x) \ +{ \ + cudaGetLastError(); \ + x; \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess && !abort_flag) \ + applog(LOG_ERR, "GPU #%d: cudaError %d (%s) (%s line %d)\n", device_map[thr_id], err, cudaGetErrorString(err), __FILENAME__, __LINE__); \ +} + +// from salsa_kernel.cu +extern std::map context_idata[2]; +extern std::map context_odata[2]; +extern std::map context_streams[2]; +extern std::map context_hash[2]; + +#ifndef ROTL64 +#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) +#endif + +// CB +#define U32TO64_LE(p) \ + (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) + +#define U64TO32_LE(p, v) \ + *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); + +static __device__ void mycpy64(uint32_t *d, const uint32_t *s) { +#pragma unroll 16 + for (int k=0; k < 16; ++k) d[k] = s[k]; +} + +static __device__ void mycpy56(uint32_t *d, const uint32_t *s) { +#pragma unroll 14 + for (int k=0; k < 14; ++k) d[k] = s[k]; +} + +static __device__ void mycpy32(uint32_t *d, const uint32_t *s) { +#pragma unroll 8 + for (int k=0; k < 8; ++k) d[k] = s[k]; +} + +static __device__ void mycpy8(uint32_t *d, const uint32_t *s) { +#pragma unroll 2 + for (int k=0; k < 2; ++k) d[k] = s[k]; +} + +static __device__ void mycpy4(uint32_t *d, const uint32_t *s) { + *d = *s; +} + +// ---------------------------- BEGIN keccak functions ------------------------------------ + +#define KECCAK_HASH "Keccak-512" + +typedef struct keccak_hash_state_t { + uint64_t state[25]; // 25*2 + uint32_t buffer[72/4]; // 72 +} keccak_hash_state; + +__device__ void statecopy0(keccak_hash_state *d, keccak_hash_state *s) +{ +#pragma unroll 25 + for (int i=0; i < 25; ++i) + d->state[i] = s->state[i]; +} + +__device__ void statecopy8(keccak_hash_state *d, keccak_hash_state *s) +{ +#pragma unroll 25 + for (int i=0; i < 25; ++i) + d->state[i] = s->state[i]; +#pragma unroll 2 + for (int i=0; i < 2; ++i) + d->buffer[i] = s->buffer[i]; +} + +static const uint64_t host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 
0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint64_t c_keccak_round_constants[24]; +__constant__ uint32_t c_data[20]; + +__device__ +void keccak_block(keccak_hash_state *S, const uint32_t *in) +{ + uint64_t *s = S->state, t[5], u[5], v, w; + + /* absorb input */ + #pragma unroll 9 + for (int i = 0; i < 72 / 8; i++, in += 2) + s[i] ^= U32TO64_LE(in); + + for (int i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= c_keccak_round_constants[i]; + } +} + +__device__ +void keccak_hash_init(keccak_hash_state *S) +{ + #pragma unroll 25 + for (int i=0; i<25; ++i) + S->state[i] = 0ULL; +} + +// assuming there is no leftover data and exactly 72 bytes are incoming +// we can directly call into the block hashing function +__device__ void keccak_hash_update72(keccak_hash_state *S, const uint32_t *in) { + keccak_block(S, in); +} + +__device__ void keccak_hash_update8(keccak_hash_state *S, const uint32_t *in) { + mycpy8(S->buffer, in); +} + +__device__ void keccak_hash_update4_8(keccak_hash_state *S, const uint32_t *in) { + mycpy4(S->buffer+8/4, in); +} + 
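+// The numeric suffixes of these helpers are the byte counts that occur in the
+// scrypt-jane PBKDF2 flow with Keccak-512's 72-byte rate: an 80-byte input is
+// absorbed as 72+8 bytes, the 128-byte salt as 72+56 bytes, the 4-byte
+// big-endian block counter lands at buffer offset 8 or 56, and the matching
+// finish variants therefore pad at 12 and 60 bytes.
+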
+__device__ void keccak_hash_update4_56(keccak_hash_state *S, const uint32_t *in) { + mycpy4(S->buffer+56/4, in); +} + +__device__ void keccak_hash_update56(keccak_hash_state *S, const uint32_t *in) { + mycpy56(S->buffer, in); +} + +__device__ void keccak_hash_update64(keccak_hash_state *S, const uint32_t *in) { + mycpy64(S->buffer, in); +} + +__device__ +void keccak_hash_finish8(keccak_hash_state *S, uint32_t *hash) +{ + S->buffer[8/4] = 0x01; + #pragma unroll 15 + for (int i=8/4+1; i < 72/4; ++i) S->buffer[i] = 0; + S->buffer[72/4 - 1] |= 0x80000000U; + keccak_block(S, (const uint32_t*)S->buffer); + + #pragma unroll 8 + for (int i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), S->state[i / 8]); + } +} + +__device__ +void keccak_hash_finish12(keccak_hash_state *S, uint32_t *hash) +{ + S->buffer[12/4] = 0x01; + #pragma unroll 14 + for (int i=12/4+1; i < 72/4; ++i) S->buffer[i] = 0; + S->buffer[72/4 - 1] |= 0x80000000U; + keccak_block(S, (const uint32_t*)S->buffer); + + #pragma unroll 8 + for (int i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), S->state[i / 8]); + } +} + +__device__ +void keccak_hash_finish60(keccak_hash_state *S, uint32_t *hash) +{ + S->buffer[60/4] = 0x01; + #pragma unroll + for (int i=60/4+1; i < 72/4; ++i) S->buffer[i] = 0; + S->buffer[72/4 - 1] |= 0x80000000U; + keccak_block(S, (const uint32_t*)S->buffer); + + #pragma unroll 8 + for (int i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), S->state[i / 8]); + } +} + +__device__ +void keccak_hash_finish64(keccak_hash_state *S, uint32_t *hash) +{ + S->buffer[64/4] = 0x01; + #pragma unroll + for (int i=64/4+1; i < 72/4; ++i) S->buffer[i] = 0; + S->buffer[72/4 - 1] |= 0x80000000U; + keccak_block(S, (const uint32_t*)S->buffer); + + #pragma unroll 8 + for (int i = 0; i < 64; i += 8) { + U64TO32_LE((&hash[i/4]), S->state[i / 8]); + } +} + +// ---------------------------- END keccak functions ------------------------------------ + +// ---------------------------- BEGIN PBKDF2 functions ------------------------------------ + +typedef struct pbkdf2_hmac_state_t { + keccak_hash_state inner, outer; +} pbkdf2_hmac_state; + + +__device__ void pbkdf2_hash(uint32_t *hash, const uint32_t *m) +{ + keccak_hash_state st; + keccak_hash_init(&st); + keccak_hash_update72(&st, m); + keccak_hash_update8(&st, m+72/4); + keccak_hash_finish8(&st, hash); +} + +/* hmac */ +__device__ +void pbkdf2_hmac_init80(pbkdf2_hmac_state *st, const uint32_t *key) +{ + uint32_t pad[72/4] = { 0 }; + //#pragma unroll 18 + //for (int i = 0; i < 72/4; i++) + // pad[i] = 0; + + keccak_hash_init(&st->inner); + keccak_hash_init(&st->outer); + + /* key > blocksize bytes, hash it */ + pbkdf2_hash(pad, key); + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + #pragma unroll 18 + for (int i = 0; i < 72/4; i++) + pad[i] ^= 0x36363636U; + keccak_hash_update72(&st->inner, pad); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + #pragma unroll 18 + for (int i = 0; i < 72/4; i++) + pad[i] ^= 0x6a6a6a6aU; + keccak_hash_update72(&st->outer, pad); +} + +// assuming there is no leftover data and exactly 72 bytes are incoming +// we can directly call into the block hashing function +__device__ void pbkdf2_hmac_update72(pbkdf2_hmac_state *st, const uint32_t *m) { + /* h(inner || m...) */ + keccak_hash_update72(&st->inner, m); +} + +__device__ void pbkdf2_hmac_update8(pbkdf2_hmac_state *st, const uint32_t *m) { + /* h(inner || m...) 
*/ + keccak_hash_update8(&st->inner, m); +} + +__device__ void pbkdf2_hmac_update4_8(pbkdf2_hmac_state *st, const uint32_t *m) { + /* h(inner || m...) */ + keccak_hash_update4_8(&st->inner, m); +} + +__device__ void pbkdf2_hmac_update4_56(pbkdf2_hmac_state *st, const uint32_t *m) { + /* h(inner || m...) */ + keccak_hash_update4_56(&st->inner, m); +} + +__device__ void pbkdf2_hmac_update56(pbkdf2_hmac_state *st, const uint32_t *m) { + /* h(inner || m...) */ + keccak_hash_update56(&st->inner, m); +} + +__device__ void pbkdf2_hmac_finish12(pbkdf2_hmac_state *st, uint32_t *mac) { + /* h(inner || m) */ + uint32_t innerhash[16]; + keccak_hash_finish12(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + keccak_hash_update64(&st->outer, innerhash); + keccak_hash_finish64(&st->outer, mac); +} + +__device__ void pbkdf2_hmac_finish60(pbkdf2_hmac_state *st, uint32_t *mac) { + /* h(inner || m) */ + uint32_t innerhash[16]; + keccak_hash_finish60(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + keccak_hash_update64(&st->outer, innerhash); + keccak_hash_finish64(&st->outer, mac); +} + +__device__ void pbkdf2_statecopy8(pbkdf2_hmac_state *d, pbkdf2_hmac_state *s) { + statecopy8(&d->inner, &s->inner); + statecopy0(&d->outer, &s->outer); +} + +// ---------------------------- END PBKDF2 functions ------------------------------------ + +__global__ __launch_bounds__(128) +void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce) +{ + uint32_t data[20]; + + const uint32_t thread = (blockIdx.x * blockDim.x) + threadIdx.x; + nonce += thread; + g_idata += thread * 32; + + #pragma unroll + for (int i=0; i<19; i++) + data[i] = cuda_swab32(c_data[i]); + data[19] = cuda_swab32(nonce); + +// scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)data, 80, (uint8_t*)g_idata, 128); + + pbkdf2_hmac_state hmac_pw; + + /* hmac(password, ...) */ + pbkdf2_hmac_init80(&hmac_pw, data); + + /* hmac(password, salt...) */ + pbkdf2_hmac_update72(&hmac_pw, data); + pbkdf2_hmac_update8(&hmac_pw, data+72/4); + + pbkdf2_hmac_state work; + uint32_t ti[16]; + + /* U1 = hmac(password, salt || be(i)) */ + uint32_t be = 0x01000000U;//cuda_swab32(1); + pbkdf2_statecopy8(&work, &hmac_pw); + pbkdf2_hmac_update4_8(&work, &be); + pbkdf2_hmac_finish12(&work, ti); + mycpy64(g_idata, ti); + + be = 0x02000000U;//cuda_swab32(2); + pbkdf2_statecopy8(&work, &hmac_pw); + pbkdf2_hmac_update4_8(&work, &be); + pbkdf2_hmac_finish12(&work, ti); + mycpy64(g_idata+16, ti); +} + + +__global__ __launch_bounds__(128) +void cuda_post_keccak512(uint32_t *g_odata, uint32_t *g_hash, uint32_t nonce) +{ + uint32_t data[20]; + + const uint32_t thread = (blockIdx.x * blockDim.x) + threadIdx.x; + g_hash += thread * 8; + g_odata += thread * 32; + nonce += thread; + + #pragma unroll + for (int i=0; i<19; i++) + data[i] = cuda_swab32(c_data[i]); + data[19] = cuda_swab32(nonce); + +// scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)g_odata, 128, (uint8_t*)g_hash, 32); + + pbkdf2_hmac_state hmac_pw; + + /* hmac(password, ...) */ + pbkdf2_hmac_init80(&hmac_pw, data); + + /* hmac(password, salt...) 
*/ + pbkdf2_hmac_update72(&hmac_pw, g_odata); + pbkdf2_hmac_update56(&hmac_pw, g_odata+72/4); + + uint32_t ti[16]; + + /* U1 = hmac(password, salt || be(i)) */ + uint32_t be = 0x01000000U;//cuda_swab32(1); + pbkdf2_hmac_update4_56(&hmac_pw, &be); + pbkdf2_hmac_finish60(&hmac_pw, ti); + mycpy32(g_hash, ti); +} + +// +// callable host code to initialize constants and to call kernels +// + +extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20]) +{ + static bool init[MAX_GPUS] = { 0 }; + + if (!init[thr_id]) + { + checkCudaErrors(cudaMemcpyToSymbol(c_keccak_round_constants, host_keccak_round_constants, sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice)); + init[thr_id] = true; + } + checkCudaErrors(cudaMemcpyToSymbol(c_data, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); +} + +extern "C" void pre_keccak512(int thr_id, int stream, uint32_t nonce, int throughput) +{ + dim3 block(128); + dim3 grid((throughput+127)/128); + + cuda_pre_keccak512<<>>(context_idata[stream][thr_id], nonce); +} + +extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throughput) +{ + dim3 block(128); + dim3 grid((throughput+127)/128); + + cuda_post_keccak512<<>>(context_odata[stream][thr_id], context_hash[stream][thr_id], nonce); +} diff --git a/scrypt/keccak.h b/scrypt/keccak.h new file mode 100644 index 0000000000..62ac1cd7fc --- /dev/null +++ b/scrypt/keccak.h @@ -0,0 +1,8 @@ +#ifndef KECCAK_H +#define KEKKAC_H + +extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20]); +extern "C" void pre_keccak512(int thr_id, int stream, uint32_t nonce, int throughput); +extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throughput); + +#endif // #ifndef KEKKAC_H diff --git a/scrypt/kepler_kernel.cu b/scrypt/kepler_kernel.cu new file mode 100644 index 0000000000..247fdb8cea --- /dev/null +++ b/scrypt/kepler_kernel.cu @@ -0,0 +1,791 @@ +/* Copyright (C) 2013 David G. Andersen. All rights reserved. + * with modifications by Christian Buchner + * + * Use of this code is covered under the Apache 2.0 license, which + * can be found in the file "LICENSE" + */ + +// TODO: attempt V.Volkov style ILP (factor 4) + +#include + +#include +#include + +#include "miner.h" + +#include "salsa_kernel.h" +#include "kepler_kernel.h" + +#define TEXWIDTH 32768 +#define THREADS_PER_WU 4 // four threads per hash + +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane) +#else +#define __shfl2 __shfl +#endif + +typedef enum +{ + ANDERSEN, + SIMPLE +} MemoryAccess; + +// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 
32 hashes) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// iteration count N +__constant__ uint32_t c_N; +__constant__ uint32_t c_N_1; // N-1 +// scratch buffer size SCRATCH +__constant__ uint32_t c_SCRATCH; +__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP) +__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP) - 1 + +// using texture references for the "tex" variants of the B kernels +texture texRef1D_4_V; +texture texRef2D_4_V; + +template __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); + +static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) { + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) { + left.x += right.x; + left.y += right.y; + left.z += right.z; + left.w += right.w; + return left; +} + +static __device__ uint4 shfl4(const uint4 bx, int target_thread) { + return make_uint4( + __shfl2((int)bx.x, target_thread), + __shfl2((int)bx.y, target_thread), + __shfl2((int)bx.z, target_thread), + __shfl2((int)bx.w, target_thread) + ); +} + +/* write_keys writes the 8 keys being processed by a warp to the global + * scratchpad. To effectively use memory bandwidth, it performs the writes + * (and reads, for read_keys) 128 bytes at a time per memory location + * by __shfl'ing the 4 entries in bx to the threads in the next-up + * thread group. It then has eight threads together perform uint4 + * (128 bit) writes to the destination region. This seems to make + * quite effective use of memory bandwidth. An approach that spread + * uint32s across more threads was slower because of the increased + * computation it required. + * + * "start" is the loop iteration producing the write - the offset within + * the block's memory. + * + * Internally, this algorithm first __shfl's the 4 bx entries to + * the next up thread group, and then uses a conditional move to + * ensure that odd-numbered thread groups exchange the b/bx ordering + * so that the right parts are written together. + * + * Thanks to Babu for helping design the 128-bit-per-write version. + * + * _direct lets the caller specify the absolute start location instead of + * the relative start location, as an attempt to reduce some recomputation. + */ + +template __device__ __forceinline__ +void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start) +{ + uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + + if (SCHEME == ANDERSEN) { + int target_thread = (threadIdx.x + 4)%32; + uint4 t = b, t2 = shfl4(bx, target_thread); + int t2_start = __shfl2((int)start, target_thread) + 4; + bool c = (threadIdx.x & 0x4); + *((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t); + *((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2); + } else if (SCHEME == SIMPLE) { + *((uint4 *)(&scratch[start ])) = b; + *((uint4 *)(&scratch[start+16])) = bx; + } +} + +template __device__ __forceinline__ +void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start) +{ + uint32_t *scratch; + + if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + if (SCHEME == ANDERSEN) { + int t2_start = __shfl2((int)start, (threadIdx.x + 4)%32) + 4; + if (TEX_DIM > 0) { start /= 4; t2_start /= 4; } + bool c = (threadIdx.x & 0x4); + if (TEX_DIM == 0) { + b = *((uint4 *)(&scratch[c ? t2_start : start])); + bx = *((uint4 *)(&scratch[c ? 
start : t2_start])); + } else if (TEX_DIM == 1) { + b = tex1Dfetch(texRef1D_4_V, c ? t2_start : start); + bx = tex1Dfetch(texRef1D_4_V, c ? start : t2_start); + } else if (TEX_DIM == 2) { + b = tex2D(texRef2D_4_V, 0.5f + ((c ? t2_start : start)%TEXWIDTH), 0.5f + ((c ? t2_start : start)/TEXWIDTH)); + bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH)); + } + uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx); + bx = shfl4(bx, (threadIdx.x + 28)%32); + } else { + if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start])); + else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4); + else if (TEX_DIM == 2) b = tex2D(texRef2D_4_V, 0.5f + ((start/4)%TEXWIDTH), 0.5f + ((start/4)/TEXWIDTH)); + if (TEX_DIM == 0) bx = *((uint4 *)(&scratch[start+16])); + else if (TEX_DIM == 1) bx = tex1Dfetch(texRef1D_4_V, (start+16)/4); + else if (TEX_DIM == 2) bx = tex2D(texRef2D_4_V, 0.5f + (((start+16)/4)%TEXWIDTH), 0.5f + (((start+16)/4)/TEXWIDTH)); + } +} + + +__device__ __forceinline__ +void primary_order_shuffle(uint4 &b, uint4 &bx) +{ + /* Inner loop shuffle targets */ + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + b.w = __shfl2((int)b.w, x1); + b.z = __shfl2((int)b.z, x2); + b.y = __shfl2((int)b.y, x3); + + uint32_t tmp = b.y; b.y = b.w; b.w = tmp; + + bx.w = __shfl2((int)bx.w, x1); + bx.z = __shfl2((int)bx.z, x2); + bx.y = __shfl2((int)bx.y, x3); + tmp = bx.y; bx.y = bx.w; bx.w = tmp; +} + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. + * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. + */ + +__device__ __forceinline__ +void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + // Read in permuted order. Key loads are not our bottleneck right now. + b.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4]; + b.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4]; + b.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4]; + b.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4]; + bx.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16]; + bx.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16]; + bx.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16]; + bx.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16]; + + primary_order_shuffle(b, bx); +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. 
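+ *
+ * "External order" is simply the sequential layout of the 32 words of one
+ * hash in B; the permuted internal order only exists in registers while
+ * the mixing kernels run.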
+ */ + +__device__ __forceinline__ +void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + primary_order_shuffle(b, bx); + + B[key_offset + 4*thread_in_block + (thread_in_block+0)%4] = b.x; + B[key_offset + 4*thread_in_block + (thread_in_block+1)%4] = b.y; + B[key_offset + 4*thread_in_block + (thread_in_block+2)%4] = b.z; + B[key_offset + 4*thread_in_block + (thread_in_block+3)%4] = b.w; + B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16] = bx.x; + B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16] = bx.y; + B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16] = bx.z; + B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16] = bx.w; +} + + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. + * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. + */ + +__device__ __forceinline__ +void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + // Read in permuted order. Key loads are not our bottleneck right now. + b.x = B[key_offset + 4*0 + thread_in_block%4]; + b.y = B[key_offset + 4*1 + thread_in_block%4]; + b.z = B[key_offset + 4*2 + thread_in_block%4]; + b.w = B[key_offset + 4*3 + thread_in_block%4]; + bx.x = B[key_offset + 4*0 + thread_in_block%4 + 16]; + bx.y = B[key_offset + 4*1 + thread_in_block%4 + 16]; + bx.z = B[key_offset + 4*2 + thread_in_block%4 + 16]; + bx.w = B[key_offset + 4*3 + thread_in_block%4 + 16]; +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. + */ + +__device__ __forceinline__ +void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + B[key_offset + 4*0 + thread_in_block%4] = b.x; + B[key_offset + 4*1 + thread_in_block%4] = b.y; + B[key_offset + 4*2 + thread_in_block%4] = b.z; + B[key_offset + 4*3 + thread_in_block%4] = b.w; + B[key_offset + 4*0 + thread_in_block%4 + 16] = bx.x; + B[key_offset + 4*1 + thread_in_block%4 + 16] = bx.y; + B[key_offset + 4*2 + thread_in_block%4 + 16] = bx.z; + B[key_offset + 4*3 + thread_in_block%4 + 16] = bx.w; +} + + +template __device__ __forceinline__ +void load_key(const uint32_t *B, uint4 &b, uint4 &bx) +{ + switch(ALGO) { + case A_SCRYPT: load_key_salsa(B, b, bx); break; + case A_SCRYPT_JANE: load_key_chacha(B, b, bx); break; + } +} + +template __device__ __forceinline__ +void store_key(uint32_t *B, uint4 &b, uint4 &bx) +{ + switch(ALGO) { + case A_SCRYPT: store_key_salsa(B, b, bx); break; + case A_SCRYPT_JANE: store_key_chacha(B, b, bx); break; + } +} + + +/* + * salsa_xor_core (Salsa20/8 cypher) + * The original scrypt called: + * xor_salsa8(&X[0], &X[16]); <-- the "b" loop + * xor_salsa8(&X[16], &X[0]); <-- the "bx" loop + * This version is unrolled to handle both of these loops in a single + * call to avoid unnecessary data movement. 
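+ *
+ * Per thread, XOR_ROTATE_ADD(dst, s1, s2, amt) below is one Salsa20
+ * quarter-round step, dst ^= rotl(s1 + s2, amt), with the usual rotation
+ * constants 7, 9, 13 and 18. Each loop iteration performs a column round
+ * followed by a row round, so the four iterations yield the 8 rounds of
+ * Salsa20/8.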
+ */ + +#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<>(32-amt))); } + +__device__ __forceinline__ +void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + uint4 x; + + b ^= bx; + x = b; + + // Enter in "primary order" (t0 has 0, 4, 8, 12) + // (t1 has 5, 9, 13, 1) + // (t2 has 10, 14, 2, 6) + // (t3 has 15, 3, 7, 11) + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Mixing phase of salsa + XOR_ROTATE_ADD(x.y, x.x, x.w, 7); + XOR_ROTATE_ADD(x.z, x.y, x.x, 9); + XOR_ROTATE_ADD(x.w, x.z, x.y, 13); + XOR_ROTATE_ADD(x.x, x.w, x.z, 18); + + /* Transpose rows and columns. */ + /* Unclear if this optimization is needed: These are ordered based + * upon the dependencies needed in the later xors. Compiler should be + * able to figure this out, but might as well give it a hand. */ + x.y = __shfl2((int)x.y, x3); + x.w = __shfl2((int)x.w, x1); + x.z = __shfl2((int)x.z, x2); + + /* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first, + * but the register targets are rewritten here to swap x[1] and x[3] so that + * they can be directly shuffled to and from our peer threads without + * reassignment. The reverse shuffle then puts them back in the right place. + */ + + XOR_ROTATE_ADD(x.w, x.x, x.y, 7); + XOR_ROTATE_ADD(x.z, x.w, x.x, 9); + XOR_ROTATE_ADD(x.y, x.z, x.w, 13); + XOR_ROTATE_ADD(x.x, x.y, x.z, 18); + + x.w = __shfl2((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + } + + b += x; + // The next two lines are the beginning of the BX-centric loop iteration + bx ^= b; + x = bx; + + // This is a copy of the same loop above, identical but stripped of comments. + // Duplicated so that we can complete a bx-based loop with fewer register moves. + #pragma unroll + for (int j = 0; j < 4; j++) { + XOR_ROTATE_ADD(x.y, x.x, x.w, 7); + XOR_ROTATE_ADD(x.z, x.y, x.x, 9); + XOR_ROTATE_ADD(x.w, x.z, x.y, 13); + XOR_ROTATE_ADD(x.x, x.w, x.z, 18); + + x.y = __shfl2((int)x.y, x3); + x.w = __shfl2((int)x.w, x1); + x.z = __shfl2((int)x.z, x2); + + XOR_ROTATE_ADD(x.w, x.x, x.y, 7); + XOR_ROTATE_ADD(x.z, x.w, x.x, 9); + XOR_ROTATE_ADD(x.y, x.z, x.w, 13); + XOR_ROTATE_ADD(x.x, x.y, x.z, 18); + + x.w = __shfl2((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + } + + // At the end of these iterations, the data is in primary order again. +#undef XOR_ROTATE_ADD + + bx += x; +} + + +/* + * chacha_xor_core (ChaCha20/8 cypher) + * This version is unrolled to handle both of these loops in a single + * call to avoid unnecessary data movement. 
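+ *
+ * Per thread, CHACHA_PRIMITIVE(pt, rt, ps, amt) below is one ChaCha
+ * quarter-round step, pt += ps; rt = rotl(rt ^ pt, amt), with the usual
+ * rotation constants 16, 12, 8 and 7. Each loop iteration performs a
+ * column round followed by a diagonal round, so the four iterations yield
+ * the 8 rounds of ChaCha20/8.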
+ * + * load_key and store_key must not use primary order when + * using ChaCha20/8, but rather the basic transposed order + * (referred to as "column mode" below) + */ + +#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<>(32-amt))); } + +__device__ __forceinline__ +void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + uint4 x; + + b ^= bx; + x = b; + + // Enter in "column" mode (t0 has 0, 4, 8, 12) + // (t1 has 1, 5, 9, 13) + // (t2 has 2, 6, 10, 14) + // (t3 has 3, 7, 11, 15) + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Column Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); + + // Diagonal Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl2((int)x.y, x3); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x1); + } + + b += x; + // The next two lines are the beginning of the BX-centric loop iteration + bx ^= b; + x = bx; + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Column Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); + + // Diagonal Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl2((int)x.y, x3); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x1); + } + +#undef CHACHA_PRIMITIVE + + bx += x; +} + + +template __device__ __forceinline__ +void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + switch(ALGO) { + case A_SCRYPT: salsa_xor_core(b, bx, x1, x2, x3); break; + case A_SCRYPT_JANE: chacha_xor_core(b, bx, x1, x2, x3); break; + } +} + + +/* + * The hasher_gen_kernel operates on a group of 1024-bit input keys + * in B, stored as: + * B = { k1B k1Bx k2B k2Bx ... } + * and fills up the scratchpad with the iterative hashes derived from + * those keys: + * scratch { k1h1B k1h1Bx K1h2B K1h2Bx ... K2h1B K2h1Bx K2h2B K2h2Bx ... } + * scratch is 1024 times larger than the input keys B. + * It is extremely important to stream writes effectively into scratch; + * less important to coalesce the reads from B. + * + * Key ordering note: Keys are input from B in "original" order: + * K = {k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 } + * After inputting into kernel_gen, each component k and kx of the + * key is transmuted into a permuted internal order to make processing faster: + * K = k, kx with: + * k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11 + * and similarly for kx. 
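+ *
+ * In scrypt (ROMix) terms, the kernelA variants below implement the
+ * sequential fill: for i = 0..N-1: V[i] = X; X = BlockMix(X), where
+ * BlockMix is the Salsa20/8 or ChaCha20/8 core selected by ALGO.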
+ */ + +template __global__ +void kepler_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end) +{ + uint4 b, bx; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int i=begin; + + if (i == 0) { + load_key(d_idata, b, bx); + write_keys_direct(b, bx, start); + ++i; + } else read_keys_direct(b, bx, start+32*(i-1)); + + while (i < end) { + block_mixer(b, bx, x1, x2, x3); + write_keys_direct(b, bx, start+32*i); + ++i; + } +} + +template __global__ +void kepler_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP) +{ + uint4 b, bx; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int i=begin; + + if (i == 0) { + load_key(d_idata, b, bx); + write_keys_direct(b, bx, start); + ++i; + } else { + int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP; + read_keys_direct(b, bx, start+32*pos); + while(loop--) block_mixer(b, bx, x1, x2, x3); + } + + while (i < end) { + block_mixer(b, bx, x1, x2, x3); + if (i % LOOKUP_GAP == 0) + write_keys_direct(b, bx, start+32*(i/LOOKUP_GAP)); + ++i; + } +} + + +/* + * hasher_hash_kernel runs the second phase of scrypt after the scratch + * buffer is filled with the iterative hashes: It bounces through + * the scratch buffer in pseudorandom order, mixing the key as it goes. 
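+ *
+ * In scrypt (ROMix) terms, each iteration computes j = Integerify(X) mod N
+ * and then X = BlockMix(X ^ V[j]); in the lookup-gap variant, V[j] is first
+ * reconstructed from the nearest stored scratchpad entry.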
+ */ + +template __global__ +void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end) +{ + uint4 b, bx; + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4); + if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + if (begin == 0) { + read_keys_direct(b, bx, start+32*c_N_1); + block_mixer(b, bx, x1, x2, x3); + } else load_key(d_odata, b, bx); + + for (int i = begin; i < end; i++) { + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + uint4 t, tx; read_keys_direct(t, tx, start+32*j); + b ^= t; bx ^= tx; + block_mixer(b, bx, x1, x2, x3); + } + + store_key(d_odata, b, bx); +} + +template __global__ +void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP) +{ + uint4 b, bx; + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4); + if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + if (begin == 0) { + int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP); + read_keys_direct(b, bx, start+32*pos); + while(loop--) block_mixer(b, bx, x1, x2, x3); + } else load_key(d_odata, b, bx); + + if (SCHEME == SIMPLE) + { + // better divergent thread handling submitted by nVidia engineers, but + // supposedly this does not run with the ANDERSEN memory access scheme + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int pos = j/LOOKUP_GAP; + int loop = -1; + uint4 t, tx; + + int i = begin; + while(i < end) { + if (loop==-1) { + j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + pos = j/LOOKUP_GAP; + loop = j-pos*LOOKUP_GAP; + read_keys_direct(t, tx, start+32*pos); + } + if (loop==0) { + b ^= t; bx ^= tx; + t=b;tx=bx; + } + block_mixer(t, tx, x1, x2, x3); + if (loop==0) { + b=t;bx=tx; + i++; + } + loop--; + } + } + else + { + // this is my original implementation, now used with the ANDERSEN + // memory access scheme only. 
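+ // Each iteration reloads the nearest stored entry at index j/LOOKUP_GAP
+ // and re-applies block_mixer (j mod LOOKUP_GAP) times to rebuild V[j]
+ // before it is XORed into the running state.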
+ for (int i = begin; i < end; i++) { + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP; + uint4 t, tx; read_keys_direct(t, tx, start+32*pos); + while(loop--) block_mixer(t, tx, x1, x2, x3); + b ^= t; bx ^= tx; + block_mixer(b, bx, x1, x2, x3); + } + } + +//for (int i = begin; i < end; i++) { +// int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); +// int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP; +// uint4 t, tx; read_keys_direct(t, tx, start+32*pos); +// while(loop--) block_mixer(t, tx, x1, x2, x3); +// b ^= t; bx ^= tx; +// block_mixer(b, bx, x1, x2, x3); +//} + + store_key(d_odata, b, bx); +} + +KeplerKernel::KeplerKernel() : KernelInterface() +{ +} + +bool KeplerKernel::bindtexture_1D(uint32_t *d_V, size_t size) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef1D_4_V.normalized = 0; + texRef1D_4_V.filterMode = cudaFilterModePoint; + texRef1D_4_V.addressMode[0] = cudaAddressModeClamp; + checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size)); + return true; +} + +bool KeplerKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef2D_4_V.normalized = 0; + texRef2D_4_V.filterMode = cudaFilterModePoint; + texRef2D_4_V.addressMode[0] = cudaAddressModeClamp; + texRef2D_4_V.addressMode[1] = cudaAddressModeClamp; + // maintain texture width of TEXWIDTH (max. limit is 65000) + while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; } + while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; } + checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch)); + return true; +} + +bool KeplerKernel::unbindtexture_1D() +{ + checkCudaErrors(cudaUnbindTexture(texRef1D_4_V)); + return true; +} + +bool KeplerKernel::unbindtexture_2D() +{ + checkCudaErrors(cudaUnbindTexture(texRef2D_4_V)); + return true; +} + +void KeplerKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool KeplerKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, + uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + + // make some constants available to kernel, update only initially and when changing + static uint32_t prev_N[MAX_GPUS] = { 0 }; + + if (N != prev_N[thr_id]) { + uint32_t h_N = N; + uint32_t h_N_1 = N-1; + uint32_t h_SCRATCH = SCRATCH; + uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP); + uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1; + + cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + + prev_N[thr_id] = N; + } + + // First phase: Sequential writes to scratchpad. 
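+ // The N iterations are issued in chunks of device_batchsize[thr_id], so a
+ // single kernel launch never covers the whole scratchpad fill; this keeps
+ // individual launches short.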
+ + int batch = device_batchsize[thr_id]; + //int num_sleeps = 2* ((N + (batch-1)) / batch); + //int sleeptime = 100; + + unsigned int pos = 0; + do + { + if (LOOKUP_GAP == 1) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + } else { + if (IS_SCRYPT()) kepler_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + } + pos += batch; + } while (pos < N); + + // Second phase: Random read access from scratchpad. + + pos = 0; + do + { + if (LOOKUP_GAP == 1) { + + if (texture_cache == 0) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } else if (texture_cache == 1) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } else if (texture_cache == 2) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } + + } else { + + if (texture_cache == 0) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } else if (texture_cache == 1) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } else if (texture_cache == 2) { + if (IS_SCRYPT()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + } + + pos += batch; + } while (pos < N); + + return success; +} diff --git a/scrypt/kepler_kernel.h b/scrypt/kepler_kernel.h new file mode 100644 index 0000000000..afe78dabee --- /dev/null +++ b/scrypt/kepler_kernel.h @@ -0,0 +1,29 @@ +#ifndef KEPLER_KERNEL_H +#define KEPLER_KERNEL_H + +#include "salsa_kernel.h" + +class KeplerKernel : public KernelInterface +{ +public: + KeplerKernel(); + + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V); + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache); + virtual bool bindtexture_1D(uint32_t *d_V, size_t size); + virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch); + virtual bool unbindtexture_1D(); + virtual bool unbindtexture_2D(); + + virtual char get_identifier() { return 'k'; }; + virtual int get_major_version() { return 3; }; + virtual int 
get_minor_version() { return 0; }; + + virtual int max_warps_per_block() { return 32; }; + virtual int get_texel_width() { return 4; }; + virtual int threads_per_wu() { return 4; } + virtual bool support_lookup_gap() { return true; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; } +}; + +#endif // #ifndef KEPLER_KERNEL_H diff --git a/scrypt/nv_kernel.cu b/scrypt/nv_kernel.cu new file mode 100644 index 0000000000..90b4c8a7e9 --- /dev/null +++ b/scrypt/nv_kernel.cu @@ -0,0 +1,712 @@ +// +// Experimental Kernel for Kepler (Compute 3.5) devices +// code submitted by nVidia performance engineer Alexey Panteleev +// with modifications by Christian Buchner +// +// for Compute 3.5 +// NOTE: compile this .cu module for compute_35,sm_35 with --maxrregcount=80 +// for Compute 3.0 +// NOTE: compile this .cu module for compute_30,sm_30 with --maxrregcount=63 +// + +#include + +#include +#include + +#include "miner.h" +#include "salsa_kernel.h" +#include "nv_kernel.h" + +#define THREADS_PER_WU 1 // single thread per hash + +#define TEXWIDTH 32768 + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define __ldg(x) (*(x)) +#endif + +// grab lane ID +static __device__ __inline__ unsigned int __laneId() { unsigned int laneId; asm( "mov.u32 %0, %%laneid;" : "=r"( laneId ) ); return laneId; } + +// forward references +template __global__ void nv_scrypt_core_kernelA(uint32_t *g_idata, int begin, int end); +template __global__ void nv_scrypt_core_kernelB(uint32_t *g_odata, int begin, int end); +template __global__ void nv_scrypt_core_kernelA_LG(uint32_t *g_idata, int begin, int end, unsigned int LOOKUP_GAP); +template __global__ void nv_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned int LOOKUP_GAP); + +// scratchbuf constants (pointers to scratch buffer for each work unit) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// using texture references for the "tex" variants of the B kernels +texture texRef1D_4_V; +texture texRef2D_4_V; + +// iteration count N +__constant__ uint32_t c_N; +__constant__ uint32_t c_N_1; // N - 1 +__constant__ uint32_t c_spacing; // (N+LOOKUP_GAP-1)/LOOKUP_GAP + +NVKernel::NVKernel() : KernelInterface() +{ +} + +bool NVKernel::bindtexture_1D(uint32_t *d_V, size_t size) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef1D_4_V.normalized = 0; + texRef1D_4_V.filterMode = cudaFilterModePoint; + texRef1D_4_V.addressMode[0] = cudaAddressModeClamp; + checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size)); + return true; +} + +bool NVKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef2D_4_V.normalized = 0; + texRef2D_4_V.filterMode = cudaFilterModePoint; + texRef2D_4_V.addressMode[0] = cudaAddressModeClamp; + texRef2D_4_V.addressMode[1] = cudaAddressModeClamp; + // maintain texture width of TEXWIDTH (max. 
limit is 65000) + while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; } + while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; } + checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch)); + return true; +} + +bool NVKernel::unbindtexture_1D() +{ + checkCudaErrors(cudaUnbindTexture(texRef1D_4_V)); + return true; +} + +bool NVKernel::unbindtexture_2D() +{ + checkCudaErrors(cudaUnbindTexture(texRef2D_4_V)); + return true; +} + +void NVKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool NVKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + + // make some constants available to kernel, update only initially and when changing + static uint32_t prev_N[MAX_GPUS] = { 0 }; + + if (N != prev_N[thr_id]) { + uint32_t h_N = N; + uint32_t h_N_1 = N-1; + uint32_t h_spacing = (N+LOOKUP_GAP-1)/LOOKUP_GAP; + + cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_spacing, &h_spacing, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + + prev_N[thr_id] = N; + } + + // First phase: Sequential writes to scratchpad. + const int batch = device_batchsize[thr_id]; + unsigned int pos = 0; + + do + { + if (LOOKUP_GAP == 1) { + if (IS_SCRYPT()) nv_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelA<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + } + else { + if (IS_SCRYPT()) nv_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelA_LG<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + } + + pos += batch; + } while (pos < N); + + // Second phase: Random read access from scratchpad. 
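+ // Each iteration XORs in a pseudo-randomly indexed scratchpad entry
+ // (slot = C[0].x & (N-1)) and mixes again. texture_cache selects how the
+ // B kernel reads the scratchpad: 0 = plain global loads via __ldg(),
+ // 1 = 1D texture fetches, 2 = 2D texture fetches (compare the TEX_DIM
+ // cases in __transposed_read_BC below).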
+ pos = 0; + do + { + if (LOOKUP_GAP == 1) { + if (texture_cache == 0) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } + else if (texture_cache == 1) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } + else if (texture_cache == 2) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } + } else { + if (texture_cache == 0) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + else if (texture_cache == 1) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + else if (texture_cache == 2) { + if (IS_SCRYPT()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) nv_scrypt_core_kernelB_LG<<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + } + + pos += batch; + } while (pos < N); + + return success; +} + +static __device__ uint4& operator^=(uint4& left, const uint4& right) +{ + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +__device__ __forceinline__ uint4 shfl4(const uint4 val, unsigned int lane, unsigned int width) +{ + return make_uint4( + (unsigned int)__shfl((int)val.x, lane, width), + (unsigned int)__shfl((int)val.y, lane, width), + (unsigned int)__shfl((int)val.z, lane, width), + (unsigned int)__shfl((int)val.w, lane, width) + ); +} + +__device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[4], uint4 *D, int spacing) +{ + unsigned int laneId = __laneId(); + + unsigned int lane8 = laneId%8; + unsigned int tile = laneId/8; + + uint4 T1[8], T2[8]; + + /* Source matrix, A-H are threads, 0-7 are data items, thread A is marked with `*`: + + *A0 B0 C0 D0 E0 F0 G0 H0 + *A1 B1 C1 D1 E1 F1 G1 H1 + *A2 B2 C2 D2 E2 F2 G2 H2 + *A3 B3 C3 D3 E3 F3 G3 H3 + *A4 B4 C4 D4 E4 F4 G4 H4 + *A5 B5 C5 D5 E5 F5 G5 H5 + *A6 B6 C6 D6 E6 F6 G6 H6 + *A7 B7 C7 D7 E7 F7 G7 H7 + */ + + // rotate rows + T1[0] = B[0]; + T1[1] = shfl4(B[1], lane8 + 7, 8); + T1[2] = shfl4(B[2], lane8 + 6, 8); + T1[3] = shfl4(B[3], lane8 + 5, 8); + T1[4] = shfl4(C[0], lane8 + 4, 8); + T1[5] = shfl4(C[1], lane8 + 3, 8); + T1[6] = shfl4(C[2], lane8 + 2, 8); + T1[7] = shfl4(C[3], lane8 + 1, 8); + + /* Matrix after row rotates: + + *A0 B0 C0 D0 E0 F0 G0 H0 + H1 *A1 B1 C1 D1 E1 F1 G1 + G2 H2 *A2 B2 C2 D2 E2 F2 + F3 G3 H3 *A3 B3 C3 D3 E3 + E4 F4 G4 H4 *A4 B4 C4 D4 + D5 E5 F5 G5 H5 *A5 B5 C5 + C6 D6 E6 F6 G6 H6 *A6 B6 + B7 C7 D7 E7 F7 G7 H7 *A7 + */ + + // rotate columns up using a barrel shifter simulation + // column X is rotated up by (X+1) items +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((lane8+1) & 1) ? 
T1[(n+1) % 8] : T1[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T1[n] = ((lane8+1) & 2) ? T2[(n+2) % 8] : T2[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((lane8+1) & 4) ? T1[(n+4) % 8] : T1[n]; + + /* Matrix after column rotates: + + H1 H2 H3 H4 H5 H6 H7 H0 + G2 G3 G4 G5 G6 G7 G0 G1 + F3 F4 F5 F6 F7 F0 F1 F2 + E4 E5 E6 E7 E0 E1 E2 E3 + D5 D6 D7 D0 D1 D2 D3 D4 + C6 C7 C0 C1 C2 C3 C4 C5 + B7 B0 B1 B2 B3 B4 B5 B6 + *A0 *A1 *A2 *A3 *A4 *A5 *A6 *A7 + */ + + // rotate rows again using address math and write to D, in reverse row order + D[spacing*2*(32*tile )+ lane8 ] = T2[7]; + D[spacing*2*(32*tile+4 )+(lane8+7)%8] = T2[6]; + D[spacing*2*(32*tile+8 )+(lane8+6)%8] = T2[5]; + D[spacing*2*(32*tile+12)+(lane8+5)%8] = T2[4]; + D[spacing*2*(32*tile+16)+(lane8+4)%8] = T2[3]; + D[spacing*2*(32*tile+20)+(lane8+3)%8] = T2[2]; + D[spacing*2*(32*tile+24)+(lane8+2)%8] = T2[1]; + D[spacing*2*(32*tile+28)+(lane8+1)%8] = T2[0]; +} + +template __device__ __forceinline__ void __transposed_read_BC(const uint4 *S, uint4 (&B)[4], uint4 (&C)[4], int spacing, int row) +{ + unsigned int laneId = __laneId(); + + unsigned int lane8 = laneId%8; + unsigned int tile = laneId/8; + + // Perform the same transposition as in __transposed_write_BC, but in reverse order. + // See the illustrations in comments for __transposed_write_BC. + + // read and rotate rows, in reverse row order + uint4 T1[8], T2[8]; + const uint4 *loc; + loc = &S[(spacing*2*(32*tile ) + lane8 + 8*__shfl(row, 0, 8))]; + T1[7] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+4 ) + (lane8+7)%8 + 8*__shfl(row, 1, 8))]; + T1[6] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+8 ) + (lane8+6)%8 + 8*__shfl(row, 2, 8))]; + T1[5] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+12) + (lane8+5)%8 + 8*__shfl(row, 3, 8))]; + T1[4] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+16) + (lane8+4)%8 + 8*__shfl(row, 4, 8))]; + T1[3] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+20) + (lane8+3)%8 + 8*__shfl(row, 5, 8))]; + T1[2] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+24) + (lane8+2)%8 + 8*__shfl(row, 6, 8))]; + T1[1] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + loc = &S[(spacing*2*(32*tile+28) + (lane8+1)%8 + 8*__shfl(row, 7, 8))]; + T1[0] = TEX_DIM==0 ? __ldg(loc) : TEX_DIM==1 ? 
tex1Dfetch(texRef1D_4_V, loc-(uint4*)c_V[0]) : tex2D(texRef2D_4_V, 0.5f + ((loc-(uint4*)c_V[0])%TEXWIDTH), 0.5f + ((loc-(uint4*)c_V[0])/TEXWIDTH)); + + // rotate columns down using a barrel shifter simulation + // column X is rotated down by (X+1) items, or up by (8-(X+1)) = (7-X) items +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((7-lane8) & 1) ? T1[(n+1) % 8] : T1[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T1[n] = ((7-lane8) & 2) ? T2[(n+2) % 8] : T2[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((7-lane8) & 4) ? T1[(n+4) % 8] : T1[n]; + + // rotate rows + B[0] = T2[0]; + B[1] = shfl4(T2[1], lane8 + 1, 8); + B[2] = shfl4(T2[2], lane8 + 2, 8); + B[3] = shfl4(T2[3], lane8 + 3, 8); + C[0] = shfl4(T2[4], lane8 + 4, 8); + C[1] = shfl4(T2[5], lane8 + 5, 8); + C[2] = shfl4(T2[6], lane8 + 6, 8); + C[3] = shfl4(T2[7], lane8 + 7, 8); + +} + +template __device__ __forceinline__ void __transposed_xor_BC(const uint4 *S, uint4 (&B)[4], uint4 (&C)[4], int spacing, int row) +{ + uint4 BT[4], CT[4]; + __transposed_read_BC(S, BT, CT, spacing, row); + +#pragma unroll 4 + for(int n = 0; n < 4; n++) + { + B[n] ^= BT[n]; + C[n] ^= CT[n]; + } +} + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define ROTL(a, b) ((a)<<(b))|((a)>>(32-(b))) +#else + // Kepler (Compute 3.5) + #define ROTL(a, b) __funnelshift_l( a, a, b ); +#endif + + + +#if 0 + +#define QUARTER(a,b,c,d) \ + a += b; d ^= a; d = ROTL(d,16); \ + c += d; b ^= c; b = ROTL(b,12); \ + a += b; d ^= a; d = ROTL(d,8); \ + c += d; b ^= c; b = ROTL(b,7); + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. 
*/ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#else + +#define ADD4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 += s1; d2 += s2; d3 += s3; d4 += s4; + +#define XOR4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 ^= s1; d2 ^= s2; d3 ^= s3; d4 ^= s4; + +#define ROTL4(d1,d2,d3,d4,amt) \ + d1 = ROTL(d1, amt); d2 = ROTL(d2, amt); d3 = ROTL(d3, amt); d4 = ROTL(d4, amt); + +#define QROUND(a1,a2,a3,a4, b1,b2,b3,b4, c1,c2,c3,c4, amt) \ + ADD4 (a1,a2,a3,a4, c1,c2,c3,c4) \ + XOR4 (b1,b2,b3,b4, a1,a2,a3,a4) \ + ROTL4(b1,b2,b3,b4, amt) + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. 
*/ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#endif + + +#define ROTL7(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 7); a1^=ROTL(a10, 7); a2^=ROTL(a20, 7); a3^=ROTL(a30, 7);\ +};\ + +#define ROTL9(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 9); a1^=ROTL(a10, 9); a2^=ROTL(a20, 9); a3^=ROTL(a30, 9);\ +};\ + +#define ROTL13(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 13); a1^=ROTL(a10, 13); a2^=ROTL(a20, 13); a3^=ROTL(a30, 13);\ +};\ + +#define ROTL18(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 18); a1^=ROTL(a10, 18); a2^=ROTL(a20, 18); a3^=ROTL(a30, 18);\ +};\ + +static __device__ void xor_salsa8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. 
*/ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + + +template static __device__ void block_mixer(uint4 *B, uint4 *C) +{ + switch (ALGO) { + case A_SCRYPT: xor_salsa8(B, C); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); break; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Experimental Scrypt core kernel for Kepler devices. +//! @param g_idata input data in global memory +//! 
@param g_odata output data in global memory +//////////////////////////////////////////////////////////////////////////////// +template __global__ +void nv_scrypt_core_kernelA(uint32_t *g_idata, int begin, int end) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + int i = begin; + + if(i == 0) { + __transposed_read_BC<0>((uint4*)g_idata, B, C, 1, 0); + __transposed_write_BC(B, C, (uint4*)V, c_N); + ++i; + } else + __transposed_read_BC<0>((uint4*)(V + (i-1)*32), B, C, c_N, 0); + + while(i < end) { + block_mixer(B, C); block_mixer(C, B); + __transposed_write_BC(B, C, (uint4*)(V + i*32), c_N); + ++i; + } +} + +template __global__ +void nv_scrypt_core_kernelA_LG(uint32_t *g_idata, int begin, int end, unsigned int LOOKUP_GAP) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + int i = begin; + + if(i == 0) { + __transposed_read_BC<0>((uint4*)g_idata, B, C, 1, 0); + __transposed_write_BC(B, C, (uint4*)V, c_spacing); + ++i; + } else { + int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP; + __transposed_read_BC<0>((uint4*)(V + pos*32), B, C, c_spacing, 0); + while(loop--) { block_mixer(B, C); block_mixer(C, B); } + } + + while(i < end) { + block_mixer(B, C); block_mixer(C, B); + if (i % LOOKUP_GAP == 0) + __transposed_write_BC(B, C, (uint4*)(V + (i/LOOKUP_GAP)*32), c_spacing); + ++i; + } +} + +template __global__ +void nv_scrypt_core_kernelB(uint32_t *g_odata, int begin, int end) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + + if(begin == 0) { + __transposed_read_BC((uint4*)V, B, C, c_N, c_N_1); + block_mixer(B, C); block_mixer(C, B); + } else + __transposed_read_BC<0>((uint4*)g_odata, B, C, 1, 0); + + for (int i = begin; i < end; i++) { + int slot = C[0].x & c_N_1; + __transposed_xor_BC((uint4*)(V), B, C, c_N, slot); + block_mixer(B, C); block_mixer(C, B); + } + + __transposed_write_BC(B, C, (uint4*)(g_odata), 1); +} + +template __global__ +void nv_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned int LOOKUP_GAP) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + + if(begin == 0) { + int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP); + __transposed_read_BC((uint4*)V, B, C, c_spacing, pos); + while(loop--) { block_mixer(B, C); block_mixer(C, B); } + } else { + __transposed_read_BC((uint4*)g_odata, B, C, 1, 0); + } + + for (int i = begin; i < end; i++) { + int slot = C[0].x & c_N_1; + int pos = slot/LOOKUP_GAP, loop = slot-pos*LOOKUP_GAP; + uint4 b[4], c[4]; __transposed_read_BC((uint4*)(V), b, c, c_spacing, pos); + while(loop--) { block_mixer(b, c); block_mixer(c, b); } +#pragma unroll 4 + for(int n = 0; n < 4; n++) { B[n] ^= b[n]; C[n] ^= c[n]; } + block_mixer(B, C); block_mixer(C, B); + } + + __transposed_write_BC(B, C, (uint4*)(g_odata), 1); +} diff --git a/scrypt/nv_kernel.h b/scrypt/nv_kernel.h new file mode 100644 index 0000000000..fa839cfb27 --- /dev/null +++ b/scrypt/nv_kernel.h @@ -0,0 +1,30 @@ +#ifndef NV_KERNEL_H +#define NV_KERNEL_H + +#include "salsa_kernel.h" + +class NVKernel : public KernelInterface +{ +public: + NVKernel(); + + virtual void set_scratchbuf_constants(int MAXWARPS, 
uint32_t** h_V); + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache); + + virtual bool bindtexture_1D(uint32_t *d_V, size_t size); + virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch); + virtual bool unbindtexture_1D(); + virtual bool unbindtexture_2D(); + + virtual char get_identifier() { return 'K'; }; + virtual int get_major_version() { return 3; }; + virtual int get_minor_version() { return 0; }; + + virtual int max_warps_per_block() { return 32; }; + virtual int get_texel_width() { return 4; }; + virtual bool support_lookup_gap() { return true; } + virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; } +}; + +#endif // #ifndef NV_KERNEL_H diff --git a/scrypt/nv_kernel2.cu b/scrypt/nv_kernel2.cu new file mode 100644 index 0000000000..5f4e2c8ca2 --- /dev/null +++ b/scrypt/nv_kernel2.cu @@ -0,0 +1,644 @@ +// +// Experimental Kernel for Kepler (Compute 3.5) devices +// code submitted by nVidia performance engineer Alexey Panteleev +// with modifications by Christian Buchner +// +// for Compute 3.5 +// NOTE: compile this .cu module for compute_35,sm_35 with --maxrregcount=80 +// for Compute 3.0 +// NOTE: compile this .cu module for compute_30,sm_30 with --maxrregcount=63 +// + +#include + +#include +#include +#include "miner.h" + +#include "salsa_kernel.h" +#include "nv_kernel2.h" + +#define THREADS_PER_WU 1 // single thread per hash + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define __ldg(x) (*(x)) +#endif + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +// grab lane ID +static __device__ __inline__ unsigned int __laneId() { unsigned int laneId; asm( "mov.u32 %0, %%laneid;" : "=r"( laneId ) ); return laneId; } + +// forward references +template __global__ void nv2_scrypt_core_kernelA(uint32_t *g_idata, int begin, int end); +template __global__ void nv2_scrypt_core_kernelB(uint32_t *g_odata, int begin, int end); +template __global__ void nv2_scrypt_core_kernelA_LG(uint32_t *g_idata, int begin, int end, unsigned int LOOKUP_GAP); +template __global__ void nv2_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned int LOOKUP_GAP); + +// scratchbuf constants (pointers to scratch buffer for each work unit) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// iteration count N +__constant__ uint32_t c_N; +__constant__ uint32_t c_N_1; // N - 1 +__constant__ uint32_t c_spacing; // (N+LOOKUP_GAP-1)/LOOKUP_GAP + + +NV2Kernel::NV2Kernel() : KernelInterface() +{ +} + +void NV2Kernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool NV2Kernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + bool scrypt = IS_SCRYPT(); + bool chacha = IS_SCRYPT_JANE(); + + // make some constants available to kernel, update only initially and when changing + static uint32_t prev_N[MAX_GPUS] = { 0 }; + + if (N != prev_N[thr_id]) { + uint32_t h_N = N; + uint32_t h_N_1 = N-1; + uint32_t h_spacing = (N+LOOKUP_GAP-1)/LOOKUP_GAP; + + 
cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_spacing, &h_spacing, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + + prev_N[thr_id] = N; + } + + // First phase: Sequential writes to scratchpad. + const int batch = device_batchsize[thr_id]; + unsigned int pos = 0; + + do + { + if (LOOKUP_GAP == 1) { + if (scrypt) nv2_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + if (chacha) nv2_scrypt_core_kernelA<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + } else { + if (scrypt) nv2_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + if (chacha) nv2_scrypt_core_kernelA_LG<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + } + pos += batch; + } while (pos < N); + + // Second phase: Random read access from scratchpad. + pos = 0; + do + { + if (LOOKUP_GAP == 1) { + if (scrypt) nv2_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (chacha) nv2_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } else { + if (scrypt) nv2_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (chacha) nv2_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + + pos += batch; + } while (pos < N); + + return success; +} + +static __device__ uint4& operator^=(uint4& left, const uint4& right) +{ + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +__device__ __forceinline__ uint4 shfl4(const uint4 val, unsigned int lane, unsigned int width) +{ + return make_uint4( + (unsigned int)__shfl((int)val.x, lane, width), + (unsigned int)__shfl((int)val.y, lane, width), + (unsigned int)__shfl((int)val.z, lane, width), + (unsigned int)__shfl((int)val.w, lane, width) + ); +} + +__device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[4], uint4 *D, int spacing) +{ + unsigned int laneId = __laneId(); + + unsigned int lane8 = laneId%8; + unsigned int tile = laneId/8; + + uint4 T1[8], T2[8]; + + /* Source matrix, A-H are threads, 0-7 are data items, thread A is marked with `*`: + + *A0 B0 C0 D0 E0 F0 G0 H0 + *A1 B1 C1 D1 E1 F1 G1 H1 + *A2 B2 C2 D2 E2 F2 G2 H2 + *A3 B3 C3 D3 E3 F3 G3 H3 + *A4 B4 C4 D4 E4 F4 G4 H4 + *A5 B5 C5 D5 E5 F5 G5 H5 + *A6 B6 C6 D6 E6 F6 G6 H6 + *A7 B7 C7 D7 E7 F7 G7 H7 + */ + + // rotate rows + T1[0] = B[0]; + T1[1] = shfl4(B[1], lane8 + 7, 8); + T1[2] = shfl4(B[2], lane8 + 6, 8); + T1[3] = shfl4(B[3], lane8 + 5, 8); + T1[4] = shfl4(C[0], lane8 + 4, 8); + T1[5] = shfl4(C[1], lane8 + 3, 8); + T1[6] = shfl4(C[2], lane8 + 2, 8); + T1[7] = shfl4(C[3], lane8 + 1, 8); + + /* Matrix after row rotates: + + *A0 B0 C0 D0 E0 F0 G0 H0 + H1 *A1 B1 C1 D1 E1 F1 G1 + G2 H2 *A2 B2 C2 D2 E2 F2 + F3 G3 H3 *A3 B3 C3 D3 E3 + E4 F4 G4 H4 *A4 B4 C4 D4 + D5 E5 F5 G5 H5 *A5 B5 C5 + C6 D6 E6 F6 G6 H6 *A6 B6 + B7 C7 D7 E7 F7 G7 H7 *A7 + */ + + // rotate columns up using a barrel shifter simulation + // column X is rotated up by (X+1) items +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((lane8+1) & 1) ? T1[(n+1) % 8] : T1[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T1[n] = ((lane8+1) & 2) ? 
T2[(n+2) % 8] : T2[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((lane8+1) & 4) ? T1[(n+4) % 8] : T1[n]; + + /* Matrix after column rotates: + + H1 H2 H3 H4 H5 H6 H7 H0 + G2 G3 G4 G5 G6 G7 G0 G1 + F3 F4 F5 F6 F7 F0 F1 F2 + E4 E5 E6 E7 E0 E1 E2 E3 + D5 D6 D7 D0 D1 D2 D3 D4 + C6 C7 C0 C1 C2 C3 C4 C5 + B7 B0 B1 B2 B3 B4 B5 B6 + *A0 *A1 *A2 *A3 *A4 *A5 *A6 *A7 + */ + + // rotate rows again using address math and write to D, in reverse row order + D[spacing*2*(32*tile )+ lane8 ] = T2[7]; + D[spacing*2*(32*tile+4 )+(lane8+7)%8] = T2[6]; + D[spacing*2*(32*tile+8 )+(lane8+6)%8] = T2[5]; + D[spacing*2*(32*tile+12)+(lane8+5)%8] = T2[4]; + D[spacing*2*(32*tile+16)+(lane8+4)%8] = T2[3]; + D[spacing*2*(32*tile+20)+(lane8+3)%8] = T2[2]; + D[spacing*2*(32*tile+24)+(lane8+2)%8] = T2[1]; + D[spacing*2*(32*tile+28)+(lane8+1)%8] = T2[0]; +} + +__device__ __forceinline__ void __transposed_read_BC(const uint4 *S, uint4 (&B)[4], uint4 (&C)[4], int spacing, int row) +{ + unsigned int laneId = __laneId(); + + unsigned int lane8 = laneId%8; + unsigned int tile = laneId/8; + + // Perform the same transposition as in __transposed_write_BC, but in reverse order. + // See the illustrations in comments for __transposed_write_BC. + + // read and rotate rows, in reverse row order + uint4 T1[8], T2[8]; + T1[7] = __ldg(&S[(spacing*2*(32*tile ) + lane8 + 8*__shfl(row, 0, 8))]); + T1[6] = __ldg(&S[(spacing*2*(32*tile+4 ) + (lane8+7)%8 + 8*__shfl(row, 1, 8))]); + T1[5] = __ldg(&S[(spacing*2*(32*tile+8 ) + (lane8+6)%8 + 8*__shfl(row, 2, 8))]); + T1[4] = __ldg(&S[(spacing*2*(32*tile+12) + (lane8+5)%8 + 8*__shfl(row, 3, 8))]); + T1[3] = __ldg(&S[(spacing*2*(32*tile+16) + (lane8+4)%8 + 8*__shfl(row, 4, 8))]); + T1[2] = __ldg(&S[(spacing*2*(32*tile+20) + (lane8+3)%8 + 8*__shfl(row, 5, 8))]); + T1[1] = __ldg(&S[(spacing*2*(32*tile+24) + (lane8+2)%8 + 8*__shfl(row, 6, 8))]); + T1[0] = __ldg(&S[(spacing*2*(32*tile+28) + (lane8+1)%8 + 8*__shfl(row, 7, 8))]); + + // rotate columns down using a barrel shifter simulation + // column X is rotated down by (X+1) items, or up by (8-(X+1)) = (7-X) items +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((7-lane8) & 1) ? T1[(n+1) % 8] : T1[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T1[n] = ((7-lane8) & 2) ? T2[(n+2) % 8] : T2[n]; +#pragma unroll 8 + for(int n = 0; n < 8; n++) T2[n] = ((7-lane8) & 4) ? 
T1[(n+4) % 8] : T1[n]; + + // rotate rows + B[0] = T2[0]; + B[1] = shfl4(T2[1], lane8 + 1, 8); + B[2] = shfl4(T2[2], lane8 + 2, 8); + B[3] = shfl4(T2[3], lane8 + 3, 8); + C[0] = shfl4(T2[4], lane8 + 4, 8); + C[1] = shfl4(T2[5], lane8 + 5, 8); + C[2] = shfl4(T2[6], lane8 + 6, 8); + C[3] = shfl4(T2[7], lane8 + 7, 8); + +} + +__device__ __forceinline__ void __transposed_xor_BC(const uint4 *S, uint4 (&B)[4], uint4 (&C)[4], int spacing, int row) +{ + uint4 BT[4], CT[4]; + __transposed_read_BC(S, BT, CT, spacing, row); + +#pragma unroll 4 + for(int n = 0; n < 4; n++) + { + B[n] ^= BT[n]; + C[n] ^= CT[n]; + } +} + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define ROTL(a, b) ((a)<<(b))|((a)>>(32-(b))) +#else + // Kepler (Compute 3.5) + #define ROTL(a, b) __funnelshift_l( a, a, b ); +#endif + + + +#if 0 + +#define QUARTER(a,b,c,d) \ + a += b; d ^= a; d = ROTL(d,16); \ + c += d; b ^= c; b = ROTL(b,12); \ + a += b; d ^= a; d = ROTL(d,8); \ + c += d; b ^= c; b = ROTL(b,7); + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. */ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + /* Operate on columns. 
*/ + QUARTER( x[0], x[4], x[ 8], x[12] ) + QUARTER( x[1], x[5], x[ 9], x[13] ) + QUARTER( x[2], x[6], x[10], x[14] ) + QUARTER( x[3], x[7], x[11], x[15] ) + + /* Operate on diagonals */ + QUARTER( x[0], x[5], x[10], x[15] ) + QUARTER( x[1], x[6], x[11], x[12] ) + QUARTER( x[2], x[7], x[ 8], x[13] ) + QUARTER( x[3], x[4], x[ 9], x[14] ) + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#else + +#define ADD4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 += s1; d2 += s2; d3 += s3; d4 += s4; + +#define XOR4(d1,d2,d3,d4,s1,s2,s3,s4) \ + d1 ^= s1; d2 ^= s2; d3 ^= s3; d4 ^= s4; + +#define ROTL4(d1,d2,d3,d4,amt) \ + d1 = ROTL(d1, amt); d2 = ROTL(d2, amt); d3 = ROTL(d3, amt); d4 = ROTL(d4, amt); + +#define QROUND(a1,a2,a3,a4, b1,b2,b3,b4, c1,c2,c3,c4, amt) \ + ADD4 (a1,a2,a3,a4, c1,c2,c3,c4) \ + XOR4 (b1,b2,b3,b4, a1,a2,a3,a4) \ + ROTL4(b1,b2,b3,b4, amt) + +static __device__ void xor_chacha8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. 
*/ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + /* Operate on columns. */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 16); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[12],x[13],x[14],x[15], x[ 4],x[ 5],x[ 6],x[ 7], 8); + QROUND(x[ 8],x[ 9],x[10],x[11], x[ 4],x[ 5],x[ 6],x[ 7], x[12],x[13],x[14],x[15], 7); + + /* Operate on diagonals */ + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 16); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 12); + QROUND(x[ 0],x[ 1],x[ 2],x[ 3], x[15],x[12],x[13],x[14], x[ 5],x[ 6],x[ 7],x[ 4], 8); + QROUND(x[10],x[11],x[ 8],x[ 9], x[ 5],x[ 6],x[ 7],x[ 4], x[15],x[12],x[13],x[14], 7); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + +#endif + + +#define ROTL7(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 7); a1^=ROTL(a10, 7); a2^=ROTL(a20, 7); a3^=ROTL(a30, 7);\ +};\ + +#define ROTL9(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 9); a1^=ROTL(a10, 9); a2^=ROTL(a20, 9); a3^=ROTL(a30, 9);\ +};\ + +#define ROTL13(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 13); a1^=ROTL(a10, 13); a2^=ROTL(a20, 13); a3^=ROTL(a30, 13);\ +};\ + +#define ROTL18(a0,a1,a2,a3,a00,a10,a20,a30){\ +a0^=ROTL(a00, 18); a1^=ROTL(a10, 18); a2^=ROTL(a20, 18); a3^=ROTL(a30, 18);\ +};\ + +static __device__ void xor_salsa8(uint4 *B, uint4 *C) +{ + uint32_t x[16]; + x[0]=(B[0].x ^= C[0].x); + x[1]=(B[0].y ^= C[0].y); + x[2]=(B[0].z ^= C[0].z); + x[3]=(B[0].w ^= C[0].w); + x[4]=(B[1].x ^= C[1].x); + x[5]=(B[1].y ^= C[1].y); + x[6]=(B[1].z ^= C[1].z); + x[7]=(B[1].w ^= C[1].w); + x[8]=(B[2].x ^= C[2].x); + x[9]=(B[2].y ^= C[2].y); + x[10]=(B[2].z ^= C[2].z); + x[11]=(B[2].w ^= C[2].w); + x[12]=(B[3].x ^= C[3].x); + x[13]=(B[3].y ^= C[3].y); + x[14]=(B[3].z ^= C[3].z); + x[15]=(B[3].w ^= C[3].w); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. 
*/ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + /* Operate on columns. */ + ROTL7(x[4],x[9],x[14],x[3],x[0]+x[12],x[1]+x[5],x[6]+x[10],x[11]+x[15]); + ROTL9(x[8],x[13],x[2],x[7],x[0]+x[4],x[5]+x[9],x[10]+x[14],x[3]+x[15]); + ROTL13(x[12],x[1],x[6],x[11],x[4]+x[8],x[9]+x[13],x[2]+x[14],x[3]+x[7]); + ROTL18(x[0],x[5],x[10],x[15],x[8]+x[12],x[1]+x[13],x[2]+x[6],x[7]+x[11]); + + /* Operate on rows. */ + ROTL7(x[1],x[6],x[11],x[12],x[0]+x[3],x[4]+x[5],x[9]+x[10],x[14]+x[15]); + ROTL9(x[2],x[7],x[8],x[13],x[0]+x[1],x[5]+x[6],x[10]+x[11],x[12]+x[15]); + ROTL13(x[3],x[4],x[9],x[14],x[1]+x[2],x[6]+x[7],x[8]+x[11],x[12]+x[13]); + ROTL18(x[0],x[5],x[10],x[15],x[2]+x[3],x[4]+x[7],x[8]+x[9],x[13]+x[14]); + + B[0].x += x[0]; B[0].y += x[1]; B[0].z += x[2]; B[0].w += x[3]; B[1].x += x[4]; B[1].y += x[5]; B[1].z += x[6]; B[1].w += x[7]; + B[2].x += x[8]; B[2].y += x[9]; B[2].z += x[10]; B[2].w += x[11]; B[3].x += x[12]; B[3].y += x[13]; B[3].z += x[14]; B[3].w += x[15]; +} + + +template static __device__ void block_mixer(uint4 *B, uint4 *C) +{ + switch (ALGO) + { + case A_SCRYPT: xor_salsa8(B, C); break; + case A_SCRYPT_JANE: xor_chacha8(B, C); break; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Experimental Scrypt core kernel for Titan devices. +//! @param g_idata input data in global memory +//! 
@param g_odata output data in global memory +//////////////////////////////////////////////////////////////////////////////// +template __global__ void nv2_scrypt_core_kernelA(uint32_t *g_idata, int begin, int end) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + int i = begin; + + if(i == 0) { + __transposed_read_BC((uint4*)g_idata, B, C, 1, 0); + __transposed_write_BC(B, C, (uint4*)V, c_N); + ++i; + } else + __transposed_read_BC((uint4*)(V + (i-1)*32), B, C, c_N, 0); + + while(i < end) { + block_mixer(B, C); block_mixer(C, B); + __transposed_write_BC(B, C, (uint4*)(V + i*32), c_N); + ++i; + } +} + +template __global__ void nv2_scrypt_core_kernelA_LG(uint32_t *g_idata, int begin, int end, unsigned int LOOKUP_GAP) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_idata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + int i = begin; + + if(i == 0) { + __transposed_read_BC((uint4*)g_idata, B, C, 1, 0); + __transposed_write_BC(B, C, (uint4*)V, c_spacing); + ++i; + } else { + int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP; + __transposed_read_BC((uint4*)(V + pos*32), B, C, c_spacing, 0); + while(loop--) { block_mixer(B, C); block_mixer(C, B); } + } + + while(i < end) { + block_mixer(B, C); block_mixer(C, B); + if (i % LOOKUP_GAP == 0) + __transposed_write_BC(B, C, (uint4*)(V + (i/LOOKUP_GAP)*32), c_spacing); + ++i; + } +} + +template __global__ void nv2_scrypt_core_kernelB(uint32_t *g_odata, int begin, int end) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + + if(begin == 0) { + __transposed_read_BC((uint4*)V, B, C, c_N, c_N_1); + block_mixer(B, C); block_mixer(C, B); + } else + __transposed_read_BC((uint4*)g_odata, B, C, 1, 0); + + for (int i = begin; i < end; i++) { + int slot = C[0].x & c_N_1; + __transposed_xor_BC((uint4*)(V), B, C, c_N, slot); + block_mixer(B, C); block_mixer(C, B); + } + + __transposed_write_BC(B, C, (uint4*)(g_odata), 1); +} + +template __global__ void nv2_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned int LOOKUP_GAP) +{ + int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize; + g_odata += 32 * offset; + uint32_t * V = c_V[offset / warpSize]; + uint4 B[4], C[4]; + + if(begin == 0) { + int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP); + __transposed_read_BC((uint4*)V, B, C, c_spacing, pos); + while(loop--) { block_mixer(B, C); block_mixer(C, B); } + } else { + __transposed_read_BC((uint4*)g_odata, B, C, 1, 0); + } + + for (int i = begin; i < end; i++) { + int slot = C[0].x & c_N_1; + int pos = slot/LOOKUP_GAP, loop = slot-pos*LOOKUP_GAP; + uint4 b[4], c[4]; __transposed_read_BC((uint4*)(V), b, c, c_spacing, pos); + while(loop--) { block_mixer(b, c); block_mixer(c, b); } +#pragma unroll 4 + for(int n = 0; n < 4; n++) { B[n] ^= b[n]; C[n] ^= c[n]; } + block_mixer(B, C); block_mixer(C, B); + } + + __transposed_write_BC(B, C, (uint4*)(g_odata), 1); +} + +#endif /* prevent SM 2 */ + diff --git a/scrypt/nv_kernel2.h b/scrypt/nv_kernel2.h new file mode 100644 index 0000000000..79e1b1c81b --- /dev/null +++ b/scrypt/nv_kernel2.h @@ -0,0 +1,30 @@ +#ifndef NV2_KERNEL_H +#define NV2_KERNEL_H + +#include "miner.h" +#include + +#include "salsa_kernel.h" + +class NV2Kernel : public KernelInterface +{ +public: + NV2Kernel(); + 
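+	// Config prefix 'T' selects this kernel in a device launch config string
+	// (see validate_config() in salsa_kernel.cu). The scratchpad is read
+	// through __ldg() rather than textures, hence no_textures() returns true.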
+	virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V);
+	virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache);
+
+	virtual char get_identifier() { return 'T'; };
+	virtual int get_major_version() { return 3; };
+	virtual int get_minor_version() { return 5; };
+
+	virtual int max_warps_per_block() { return 24; };
+	virtual int get_texel_width() { return 4; };
+	virtual bool no_textures() { return true; }
+	virtual bool support_lookup_gap() { return true; }
+
+	virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; }
+	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }
+};
+
+#endif // #ifndef NV2_KERNEL_H
diff --git a/scrypt/salsa_kernel.cu b/scrypt/salsa_kernel.cu
new file mode 100644
index 0000000000..9422988faa
--- /dev/null
+++ b/scrypt/salsa_kernel.cu
@@ -0,0 +1,869 @@
+//
+// Contains the autotuning logic and some utility functions.
+// Note that all CUDA kernels have been moved to other .cu files
+//
+
+#include <stdio.h>
+#include <map>
+#include <algorithm>
+#include <unistd.h> // usleep
+#include <ctype.h> // tolower
+#include "cuda_helper.h"
+
+#include "salsa_kernel.h"
+
+#include "nv_kernel2.h"
+#include "titan_kernel.h"
+#include "nv_kernel.h"
+#include "kepler_kernel.h"
+#include "fermi_kernel.h"
+#include "test_kernel.h"
+
+#include "miner.h"
+
+#if defined(_WIN64) || defined(__x86_64__) || defined(__64BIT__)
+#define MAXMEM 0x300000000ULL // 12 GB (the largest Kepler)
+#else
+#define MAXMEM 0xFFFFFFFFULL // nearly 4 GB (32 bit limitations)
+#endif
+
+// require CUDA 5.5 driver API
+#define DMAJ 5
+#define DMIN 5
+
+// define some error checking macros
+#define DELIMITER '/'
+#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ?
strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) + +#undef checkCudaErrors +#define checkCudaErrors(x) \ +{ \ + cudaGetLastError(); \ + x; \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess && !abort_flag) \ + applog(LOG_ERR, "GPU #%d: Err %d: %s (%s:%d)", device_map[thr_id], err, cudaGetErrorString(err), __FILENAME__, __LINE__); \ +} + +// some globals containing pointers to device memory (for chunked allocation) +// [MAX_GPUS] indexes up to MAX_GPUS threads (0...MAX_GPUS-1) +int MAXWARPS[MAX_GPUS]; +uint32_t* h_V[MAX_GPUS][TOTAL_WARP_LIMIT*64]; // NOTE: the *64 prevents buffer overflow for --keccak +uint32_t h_V_extra[MAX_GPUS][TOTAL_WARP_LIMIT*64]; // with really large kernel launch configurations + +KernelInterface *Best_Kernel_Heuristics(cudaDeviceProp *props) +{ + KernelInterface *kernel = NULL; + uint64_t N = 1UL << (opt_nfactor+1); + + if (IS_SCRYPT() || (IS_SCRYPT_JANE() && N <= 8192)) + { + // high register count kernels (scrypt, low N-factor scrypt-jane) + if (props->major > 3 || (props->major == 3 && props->minor >= 5)) + kernel = new NV2Kernel(); // we don't want this for Keccak though + else if (props->major == 3 && props->minor == 0) + kernel = new NVKernel(); + else + kernel = new FermiKernel(); + } + else + { + // high N-factor scrypt-jane = low registers count kernels + if (props->major > 3 || (props->major == 3 && props->minor >= 5)) + kernel = new TitanKernel(); + else if (props->major == 3 && props->minor == 0) + kernel = new KeplerKernel(); + else + kernel = new TestKernel(); + } + return kernel; +} + + +bool validate_config(char *config, int &b, int &w, KernelInterface **kernel = NULL, cudaDeviceProp *props = NULL) +{ + bool success = false; + char kernelid = ' '; + if (config != NULL) + { + if (config[0] == 'T' || config[0] == 'K' || config[0] == 'F' || config[0] == 'L' || + config[0] == 't' || config[0] == 'k' || config[0] == 'f' || + config[0] == 'Z' || config[0] == 'Y' || config[0] == 'X') { + kernelid = config[0]; + config++; + } + + if (config[0] >= '0' && config[0] <= '9') + if (sscanf(config, "%dx%d", &b, &w) == 2) + success = true; + + if (success && kernel != NULL) + { + switch (kernelid) + { + case 'T': case 'Z': *kernel = new NV2Kernel(); break; + case 't': *kernel = new TitanKernel(); break; + case 'K': case 'Y': *kernel = new NVKernel(); break; + case 'k': *kernel = new KeplerKernel(); break; + case 'F': case 'L': *kernel = new FermiKernel(); break; + case 'f': case 'X': *kernel = new TestKernel(); break; + case ' ': // choose based on device architecture + *kernel = Best_Kernel_Heuristics(props); + break; + } + } + } + return success; +} + +std::map context_blocks; +std::map context_wpb; +std::map context_concurrent; +std::map context_kernel; +std::map context_idata[2]; +std::map context_odata[2]; +std::map context_streams[2]; +std::map context_X[2]; +std::map context_H[2]; +std::map context_serialize[2]; + +// for SHA256 hashing on GPU +std::map context_tstate[2]; +std::map context_ostate[2]; +std::map context_hash[2]; + +int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurrent, int &wpb); + +int cuda_throughput(int thr_id) +{ + int GRID_BLOCKS, WARPS_PER_BLOCK; + if (context_blocks.find(thr_id) == context_blocks.end()) + { +#if 0 + CUcontext ctx; + cuCtxCreate( &ctx, CU_CTX_SCHED_YIELD, device_map[thr_id] ); + cuCtxSetCurrent(ctx); +#else + checkCudaErrors(cudaSetDevice(device_map[thr_id])); + checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield)); +#endif + + KernelInterface *kernel; + bool concurrent; + 
GRID_BLOCKS = find_optimal_blockcount(thr_id, kernel, concurrent, WARPS_PER_BLOCK); + + if(GRID_BLOCKS == 0) + return 0; + + unsigned int THREADS_PER_WU = kernel->threads_per_wu(); + unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * 32; + unsigned int state_size = WU_PER_LAUNCH * sizeof(uint32_t) * 8; + + // allocate device memory for scrypt_core inputs and outputs + uint32_t *tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_idata[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_idata[1][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_odata[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, mem_size)); context_odata[1][thr_id] = tmp; + + // allocate pinned host memory for scrypt hashes + checkCudaErrors(cudaHostAlloc((void **) &tmp, state_size, cudaHostAllocDefault)); context_H[0][thr_id] = tmp; + checkCudaErrors(cudaHostAlloc((void **) &tmp, state_size, cudaHostAllocDefault)); context_H[1][thr_id] = tmp; + + if (IS_SCRYPT()) + { + if (parallel < 2) + { + // allocate pinned host memory for scrypt_core input/output + checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[0][thr_id] = tmp; + checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[1][thr_id] = tmp; + } + else + { + // allocate tstate, ostate, scrypt hash device memory + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_tstate[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_tstate[1][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_ostate[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_ostate[1][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[1][thr_id] = tmp; + } + } + else /* if (IS_SCRYPT_JANE()) */ + { + // allocate pinned host memory for scrypt_core input/output + checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[0][thr_id] = tmp; + checkCudaErrors(cudaHostAlloc((void **) &tmp, mem_size, cudaHostAllocDefault)); context_X[1][thr_id] = tmp; + + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[0][thr_id] = tmp; + checkCudaErrors(cudaMalloc((void **) &tmp, state_size)); context_hash[1][thr_id] = tmp; + } + + // create two CUDA streams + cudaStream_t tmp2; + checkCudaErrors( cudaStreamCreate(&tmp2) ); context_streams[0][thr_id] = tmp2; + checkCudaErrors( cudaStreamCreate(&tmp2) ); context_streams[1][thr_id] = tmp2; + + // events used to serialize the kernel launches (we don't want any overlapping of kernels) + cudaEvent_t tmp4; + checkCudaErrors(cudaEventCreateWithFlags(&tmp4, cudaEventDisableTiming)); context_serialize[0][thr_id] = tmp4; + checkCudaErrors(cudaEventCreateWithFlags(&tmp4, cudaEventDisableTiming)); context_serialize[1][thr_id] = tmp4; + checkCudaErrors(cudaEventRecord(context_serialize[1][thr_id])); + + context_kernel[thr_id] = kernel; + context_concurrent[thr_id] = concurrent; + context_blocks[thr_id] = GRID_BLOCKS; + context_wpb[thr_id] = WARPS_PER_BLOCK; + } + + GRID_BLOCKS = context_blocks[thr_id]; + WARPS_PER_BLOCK = context_wpb[thr_id]; + unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); + return WU_PER_LAUNCH; +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, 
int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class + { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class + { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class + { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class + { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class + { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class + { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class - GK104 = 1536 cores / 8 SMs + { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class + { 0x50, 128 }, // Maxwell First Generation (SM 5.0) GTX750/750Ti + { 0x52, 128 }, // Maxwell Second Generation (SM 5.2) GTX980 = 2048 cores / 16 SMs - GTX970 1664 cores / 13 SMs + { 0x61, 128 }, // Pascal GeForce (SM 6.1) + { -1, -1 }, + }; + + int index = 0; + while (nGpuArchCoresPerSM[index].SM != -1) + { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + index++; + } + + // If we don't find the values, we default use the previous one to run properly + applog(LOG_WARNING, "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, 128); + return 128; +} + +#ifdef WIN32 +#include +static int console_width() { + CONSOLE_SCREEN_BUFFER_INFO csbi; + GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi); + return csbi.srWindow.Right - csbi.srWindow.Left + 1; +} +#else +static inline int console_width() { + return 999; +} +#endif + +int find_optimal_blockcount(int thr_id, KernelInterface* &kernel, bool &concurrent, int &WARPS_PER_BLOCK) +{ + int cw = console_width(); + int optimal_blocks = 0; + + cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, device_map[thr_id])); + concurrent = (props.concurrentKernels > 0); + + WARPS_PER_BLOCK = -1; + + // if not specified, use interactive mode for devices that have the watchdog timer enabled + if (device_interactive[thr_id] == -1) + device_interactive[thr_id] = props.kernelExecTimeoutEnabled; + + // turn off texture cache if not otherwise specified + if (device_texturecache[thr_id] == -1) + device_texturecache[thr_id] = 0; + + // if not otherwise specified or required, turn single memory allocations off as they reduce + // the amount of memory that we can allocate on Windows Vista, 7 and 8 (WDDM driver model issue) + if (device_singlememory[thr_id] == -1) device_singlememory[thr_id] = 0; + + // figure out which kernel implementation to use + if (!validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK, &kernel, &props)) { + kernel = NULL; + if (device_config[thr_id] != NULL) { + if (device_config[thr_id][0] == 'T' || device_config[thr_id][0] == 'Z') + kernel = new NV2Kernel(); + else if (device_config[thr_id][0] == 't') + kernel = new TitanKernel(); + else if (device_config[thr_id][0] == 'K' || device_config[thr_id][0] == 'Y') + kernel = new NVKernel(); + else if (device_config[thr_id][0] == 'k') + kernel = new KeplerKernel(); + else if (device_config[thr_id][0] == 'F' || device_config[thr_id][0] == 'L') + kernel = new FermiKernel(); + else if (device_config[thr_id][0] == 'f' || device_config[thr_id][0] == 'X') + kernel = new TestKernel(); + } + if (kernel == NULL) kernel = Best_Kernel_Heuristics(&props); + } + + if (kernel->get_major_version() > props.major || 
kernel->get_major_version() == props.major && kernel->get_minor_version() > props.minor) + { + applog(LOG_ERR, "GPU #%d: FATAL: the '%c' kernel requires %d.%d capability!", device_map[thr_id], kernel->get_identifier(), kernel->get_major_version(), kernel->get_minor_version()); + return 0; + } + + // set whatever cache configuration and shared memory bank mode the kernel prefers + checkCudaErrors(cudaDeviceSetCacheConfig(kernel->cache_config())); + checkCudaErrors(cudaDeviceSetSharedMemConfig(kernel->shared_mem_config())); + + // some kernels (e.g. Titan) do not support the texture cache + if (kernel->no_textures() && device_texturecache[thr_id]) { + applog(LOG_WARNING, "GPU #%d: the '%c' kernel ignores the texture cache argument", device_map[thr_id], kernel->get_identifier()); + device_texturecache[thr_id] = 0; + } + + // Texture caching only works with single memory allocation + if (device_texturecache[thr_id]) device_singlememory[thr_id] = 1; + + if (kernel->single_memory() && !device_singlememory[thr_id]) { + applog(LOG_WARNING, "GPU #%d: the '%c' kernel requires single memory allocation", device_map[thr_id], kernel->get_identifier()); + device_singlememory[thr_id] = 1; + } + + if (device_lookup_gap[thr_id] == 0) device_lookup_gap[thr_id] = 1; + if (!kernel->support_lookup_gap() && device_lookup_gap[thr_id] > 1) + { + applog(LOG_WARNING, "GPU #%d: the '%c' kernel does not support a lookup gap", device_map[thr_id], kernel->get_identifier()); + device_lookup_gap[thr_id] = 1; + } + + if (opt_debug) { + applog(LOG_INFO, "GPU #%d: interactive: %d, tex-cache: %d%s, single-alloc: %d", device_map[thr_id], + (device_interactive[thr_id] != 0) ? 1 : 0, + (device_texturecache[thr_id] != 0) ? device_texturecache[thr_id] : 0, (device_texturecache[thr_id] != 0) ? "D" : "", + (device_singlememory[thr_id] != 0) ? 1 : 0 ); + } + + // number of threads collaborating on one work unit (hash) + unsigned int THREADS_PER_WU = kernel->threads_per_wu(); + unsigned int LOOKUP_GAP = device_lookup_gap[thr_id]; + unsigned int BACKOFF = device_backoff[thr_id]; + unsigned int N = (1 << (opt_nfactor+1)); + double szPerWarp = (double)(SCRATCH * WU_PER_WARP * sizeof(uint32_t)); + //applog(LOG_INFO, "WU_PER_WARP=%u, THREADS_PER_WU=%u, LOOKUP_GAP=%u, BACKOFF=%u, SCRATCH=%u", WU_PER_WARP, THREADS_PER_WU, LOOKUP_GAP, BACKOFF, SCRATCH); + applog(LOG_INFO, "GPU #%d: %d hashes / %.1f MB per warp.", device_map[thr_id], WU_PER_WARP, szPerWarp / (1024.0 * 1024.0)); + + // compute highest MAXWARPS numbers for kernels allowing cudaBindTexture to succeed + int MW_1D_4 = 134217728 / (SCRATCH * WU_PER_WARP / 4); // for uint4_t textures + int MW_1D_2 = 134217728 / (SCRATCH * WU_PER_WARP / 2); // for uint2_t textures + int MW_1D = kernel->get_texel_width() == 2 ? MW_1D_2 : MW_1D_4; + + uint32_t *d_V = NULL; + if (device_singlememory[thr_id]) + { + // if no launch config was specified, we simply + // allocate the single largest memory chunk on the device that we can get + if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK)) { + MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK; + } + else { + // compute no. of warps to allocate the largest number producing a single memory block + // PROBLEM: one some devices, ALL allocations will fail if the first one failed. This sucks. 
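// Illustrative aside, not part of the patch: the per-warp scratchpad size behind
// the "hashes / MB per warp" log message above follows from the SCRATCH and
// WU_PER_WARP macros in salsa_kernel.h. A worked example, assuming scrypt with
// N=1024, LOOKUP_GAP=1 and a kernel that runs 4 threads per work unit (other
// kernels use a different THREADS_PER_WU):
#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t N = 1024, LOOKUP_GAP = 1, THREADS_PER_WU = 4;
    const uint32_t SCRATCH = ((N + LOOKUP_GAP - 1) / LOOKUP_GAP) * 32; // 32768 words per hash
    const uint32_t WU_PER_WARP = 32 / THREADS_PER_WU;                  // 8 hashes per warp
    const double szPerWarp = (double)SCRATCH * WU_PER_WARP * sizeof(uint32_t);
    printf("%u hashes / %.1f MB per warp\n", (unsigned)WU_PER_WARP, szPerWarp / (1024.0 * 1024.0));
    // prints: 8 hashes / 1.0 MB per warp
    return 0;
}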
+ size_t MEM_LIMIT = (size_t)min((unsigned long long)MAXMEM, (unsigned long long)props.totalGlobalMem); + int warpmax = (int)min((unsigned long long)TOTAL_WARP_LIMIT, (unsigned long long)(MEM_LIMIT / szPerWarp)); + + // run a bisection algorithm for memory allocation (way more reliable than the previous approach) + int best = 0; + int warp = (warpmax+1)/2; + int interval = (warpmax+1)/2; + while (interval > 0) + { + cudaGetLastError(); // clear the error state + cudaMalloc((void **)&d_V, (size_t)(szPerWarp * warp)); + if (cudaGetLastError() == cudaSuccess) { + checkCudaErrors(cudaFree(d_V)); d_V = NULL; + if (warp > best) best = warp; + if (warp == warpmax) break; + interval = (interval+1)/2; + warp += interval; + if (warp > warpmax) warp = warpmax; + } + else + { + interval = interval/2; + warp -= interval; + if (warp < 1) warp = 1; + } + } + // back off a bit from the largest possible allocation size + MAXWARPS[thr_id] = ((100-BACKOFF)*best+50)/100; + } + + // now allocate a buffer for determined MAXWARPS setting + cudaGetLastError(); // clear the error state + cudaMalloc((void **)&d_V, (size_t)SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t)); + if (cudaGetLastError() == cudaSuccess) { + for (int i=0; i < MAXWARPS[thr_id]; ++i) + h_V[thr_id][i] = d_V + SCRATCH * WU_PER_WARP * i; + + if (device_texturecache[thr_id] == 1) + { + if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK)) + { + if ( optimal_blocks * WARPS_PER_BLOCK > MW_1D ) { + applog(LOG_ERR, "GPU #%d: '%s' exceeds limits for 1D cache. Using 2D cache instead.", device_map[thr_id], device_config[thr_id]); + device_texturecache[thr_id] = 2; + } + } + // bind linear memory to a 1D texture reference + if (kernel->get_texel_width() == 2) + kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * min(MAXWARPS[thr_id],MW_1D_2) * sizeof(uint32_t)); + else + kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * min(MAXWARPS[thr_id],MW_1D_4) * sizeof(uint32_t)); + } + else if (device_texturecache[thr_id] == 2) + { + // bind pitch linear memory to a 2D texture reference + if (kernel->get_texel_width() == 2) + kernel->bindtexture_2D(d_V, SCRATCH/2, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t)); + else + kernel->bindtexture_2D(d_V, SCRATCH/4, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t)); + } + } + else + { + applog(LOG_ERR, "GPU #%d: FATAL: Launch config '%s' requires too much memory!", device_map[thr_id], device_config[thr_id]); + return 0; + } + } + else + { + if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK)) + MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK; + else + MAXWARPS[thr_id] = TOTAL_WARP_LIMIT; + + // chunked memory allocation up to device limits + int warp; + for (warp = 0; warp < MAXWARPS[thr_id]; ++warp) { + // work around partition camping problems by adding a random start address offset to each allocation + h_V_extra[thr_id][warp] = (props.major == 1) ? 
(16 * (rand()%(16384/16))) : 0; + cudaGetLastError(); // clear the error state + cudaMalloc((void **) &h_V[thr_id][warp], (SCRATCH * WU_PER_WARP + h_V_extra[thr_id][warp])*sizeof(uint32_t)); + if (cudaGetLastError() == cudaSuccess) h_V[thr_id][warp] += h_V_extra[thr_id][warp]; + else { + h_V_extra[thr_id][warp] = 0; + + // back off by several warp allocations to have some breathing room + int remove = (BACKOFF*warp+50)/100; + for (int i=0; warp > 0 && i < remove; ++i) { + warp--; + checkCudaErrors(cudaFree(h_V[thr_id][warp]-h_V_extra[thr_id][warp])); + h_V[thr_id][warp] = NULL; h_V_extra[thr_id][warp] = 0; + } + + break; + } + } + MAXWARPS[thr_id] = warp; + } + kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]); + + if (validate_config(device_config[thr_id], optimal_blocks, WARPS_PER_BLOCK)) + { + if (optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id]) + { + applog(LOG_ERR, "GPU #%d: FATAL: Given launch config '%s' requires too much memory.", device_map[thr_id], device_config[thr_id]); + return 0; + } + + if (WARPS_PER_BLOCK > kernel->max_warps_per_block()) + { + applog(LOG_ERR, "GPU #%d: FATAL: Given launch config '%s' exceeds warp limit for '%c' kernel.", device_map[thr_id], device_config[thr_id], kernel->get_identifier()); + return 0; + } + } + else + { + if (device_config[thr_id] != NULL && strcasecmp("auto", device_config[thr_id])) + applog(LOG_WARNING, "GPU #%d: Given launch config '%s' does not validate.", device_map[thr_id], device_config[thr_id]); + + if (opt_autotune) + { + applog(LOG_INFO, "GPU #%d: Performing auto-tuning, please wait 2 minutes...", device_map[thr_id]); + + // allocate device memory + uint32_t *d_idata = NULL, *d_odata = NULL; + unsigned int mem_size = MAXWARPS[thr_id] * WU_PER_WARP * sizeof(uint32_t) * 32; + checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size)); + checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size)); + + // pre-initialize some device memory + uint32_t *h_idata = (uint32_t*)malloc(mem_size); + for (unsigned int i=0; i < mem_size/sizeof(uint32_t); ++i) h_idata[i] = i*2654435761UL; // knuth's method + checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice)); + free(h_idata); + + double best_hash_sec = 0.0; + int best_wpb = 0; + + // auto-tuning loop + { + // we want to have enough total warps for half the multiprocessors at least + // compute highest MAXWARPS number that we can support based on texture cache mode + int MINTW = props.multiProcessorCount / 2; + int MAXTW = (device_texturecache[thr_id] == 1) ? 
min(MAXWARPS[thr_id],MW_1D) : MAXWARPS[thr_id]; + + // we want to have blocks for half the multiprocessors at least + int MINB = props.multiProcessorCount / 2; + int MAXB = MAXTW; + + double tmin = 0.05; + + applog(LOG_INFO, "GPU #%d: maximum total warps (BxW): %d", (int) device_map[thr_id], MAXTW); + + for (int GRID_BLOCKS = MINB; !abort_flag && GRID_BLOCKS <= MAXB; ++GRID_BLOCKS) + { + double Hash[32+1] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + for (WARPS_PER_BLOCK = 1; !abort_flag && WARPS_PER_BLOCK <= kernel->max_warps_per_block(); ++WARPS_PER_BLOCK) + { + double hash_sec = 0; + if (GRID_BLOCKS * WARPS_PER_BLOCK >= MINTW && + GRID_BLOCKS * WARPS_PER_BLOCK <= MAXTW) + { + // setup execution parameters + dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); + dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); + + struct timeval tv_start, tv_end; + double tdelta = 0; + + checkCudaErrors(cudaDeviceSynchronize()); + gettimeofday(&tv_start, NULL); + int repeat = 0; + do // average several measurements for better exactness + { + kernel->run_kernel( + grid, threads, WARPS_PER_BLOCK, thr_id, NULL, d_idata, d_odata, N, + LOOKUP_GAP, device_interactive[thr_id], true, device_texturecache[thr_id] + ); + if(cudaDeviceSynchronize() != cudaSuccess) + break; + ++repeat; + gettimeofday(&tv_end, NULL); + // for a better result averaging, measure for at least 50ms (10ms for Keccak) + } while ((tdelta=(1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec))) < tmin); + if (cudaGetLastError() != cudaSuccess) continue; + + tdelta /= repeat; // BUGFIX: this averaging over multiple measurements was missing + + // for scrypt: in interactive mode only find launch configs where kernel launch times are short enough + // TODO: instead we could reduce the batchsize parameter to meet the launch time requirement. + if (IS_SCRYPT() && device_interactive[thr_id] + && GRID_BLOCKS > 2*props.multiProcessorCount && tdelta > 1.0/30) + { + if (WARPS_PER_BLOCK == 1) goto skip; else goto skip2; + } + + hash_sec = (double)WU_PER_LAUNCH / tdelta; + Hash[WARPS_PER_BLOCK] = hash_sec; + if (hash_sec > best_hash_sec) { + optimal_blocks = GRID_BLOCKS; + best_hash_sec = hash_sec; + best_wpb = WARPS_PER_BLOCK; + } + } + } +skip2: + if (opt_debug) { + + if (GRID_BLOCKS == MINB) { + char line[512] = " "; + for (int i=1; i<=kernel->max_warps_per_block(); ++i) { + char tmp[16]; sprintf(tmp, i < 10 ? 
" x%-2d" : " x%-2d ", i); + strcat(line, tmp); + if (cw == 80 && (i % 8 == 0 && i != kernel->max_warps_per_block())) + strcat(line, "\n "); + } + applog(LOG_DEBUG, line); + } + + char kMGT = ' '; bool flag; + for (int j=0; j < 4; ++j) { + flag=false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 1000, i++); + if (flag) for (int i=1; i<=kernel->max_warps_per_block(); Hash[i] /= 1000, i++); + else break; + if (kMGT == ' ') kMGT = 'k'; + else if (kMGT == 'k') kMGT = 'M'; + else if (kMGT == 'M') kMGT = 'G'; + else if (kMGT == 'G') kMGT = 'T'; + } + const char *format = "%5.4f%c"; + flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 1, i++); if (flag) format = "%5.3f%c"; + flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 10, i++); if (flag) format = "%5.2f%c"; + flag = false; for (int i=1; i<=kernel->max_warps_per_block(); flag|=Hash[i] >= 100, i++); if (flag) format = "%5.1f%c"; + + char line[512]; sprintf(line, "%3d:", GRID_BLOCKS); + for (int i=1; i<=kernel->max_warps_per_block(); ++i) { + char tmp[16]; + if (Hash[i]>0) + sprintf(tmp, format, Hash[i], (imax_warps_per_block())?'|':' '); + else + sprintf(tmp, " %c", (imax_warps_per_block())?'|':' '); + strcat(line, tmp); + if (cw == 80 && (i % 8 == 0 && i != kernel->max_warps_per_block())) + strcat(line, "\n "); + } + int n = strlen(line)-1; line[n++] = '|'; line[n++] = ' '; line[n++] = kMGT; line[n++] = '\0'; + strcat(line, "H/s"); + applog(LOG_DEBUG, line); + } + } +skip: ; + } + + checkCudaErrors(cudaFree(d_odata)); + checkCudaErrors(cudaFree(d_idata)); + + WARPS_PER_BLOCK = best_wpb; + applog(LOG_INFO, "GPU #%d: %7.2f hash/s with configuration %c%dx%d", device_map[thr_id], best_hash_sec, kernel->get_identifier(), optimal_blocks, WARPS_PER_BLOCK); + } + else + { + // Heuristics to find a good kernel launch configuration + + // base the initial block estimate on the number of multiprocessors + int device_cores = props.multiProcessorCount * _ConvertSMVer2Cores(props.major, props.minor); + + // defaults, in case nothing else is chosen below + optimal_blocks = 4 * device_cores / WU_PER_WARP; + WARPS_PER_BLOCK = 2; + + // Based on compute capability, pick a known good block x warp configuration. + if (props.major >= 3) + { + if (props.major == 3 && props.minor == 5) // GK110 (Tesla K20X, K20, GeForce GTX TITAN) + { + // TODO: what to do with Titan and Tesla K20(X)? + // for now, do the same as for GTX 660Ti (2GB) + optimal_blocks = (int)(optimal_blocks * 0.8809524); + WARPS_PER_BLOCK = 2; + } + else // GK104, GK106, GK107 ... + { + if (MAXWARPS[thr_id] > (int)(optimal_blocks * 1.7261905) * 2) + { + // this results in 290x2 configuration on GTX 660Ti (3GB) + // but it requires 3GB memory on the card! 
+ optimal_blocks = (int)(optimal_blocks * 1.7261905); + WARPS_PER_BLOCK = 2; + } + else + { + // this results in 148x2 configuration on GTX 660Ti (2GB) + optimal_blocks = (int)(optimal_blocks * 0.8809524); + WARPS_PER_BLOCK = 2; + } + } + } + // 1st generation Fermi (compute 2.0) GF100, GF110 + else if (props.major == 2 && props.minor == 0) + { + // this results in a 60x4 configuration on GTX 570 + optimal_blocks = 4 * device_cores / WU_PER_WARP; + WARPS_PER_BLOCK = 4; + } + // 2nd generation Fermi (compute 2.1) GF104,106,108,114,116 + else if (props.major == 2 && props.minor == 1) + { + // this results in a 56x2 configuration on GTX 460 + optimal_blocks = props.multiProcessorCount * 8; + WARPS_PER_BLOCK = 2; + } + + // in case we run out of memory with the automatically chosen configuration, + // first back off with WARPS_PER_BLOCK, then reduce optimal_blocks. + if (WARPS_PER_BLOCK==3 && optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id]) + WARPS_PER_BLOCK = 2; + while (optimal_blocks > 0 && optimal_blocks * WARPS_PER_BLOCK > MAXWARPS[thr_id]) + optimal_blocks--; + } + } + + applog(LOG_INFO, "GPU #%d: using launch configuration %c%dx%d", device_map[thr_id], kernel->get_identifier(), optimal_blocks, WARPS_PER_BLOCK); + + if (device_singlememory[thr_id]) + { + if (MAXWARPS[thr_id] != optimal_blocks * WARPS_PER_BLOCK) + { + MAXWARPS[thr_id] = optimal_blocks * WARPS_PER_BLOCK; + if (device_texturecache[thr_id] == 1) + kernel->unbindtexture_1D(); + else if (device_texturecache[thr_id] == 2) + kernel->unbindtexture_2D(); + checkCudaErrors(cudaFree(d_V)); d_V = NULL; + + cudaGetLastError(); // clear the error state + cudaMalloc((void **)&d_V, (size_t)SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t)); + if (cudaGetLastError() == cudaSuccess) { + for (int i=0; i < MAXWARPS[thr_id]; ++i) + h_V[thr_id][i] = d_V + SCRATCH * WU_PER_WARP * i; + + if (device_texturecache[thr_id] == 1) + { + // bind linear memory to a 1D texture reference + if (kernel->get_texel_width() == 2) + kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t)); + else + kernel->bindtexture_1D(d_V, SCRATCH * WU_PER_WARP * MAXWARPS[thr_id] * sizeof(uint32_t)); + } + else if (device_texturecache[thr_id] == 2) + { + // bind pitch linear memory to a 2D texture reference + if (kernel->get_texel_width() == 2) + kernel->bindtexture_2D(d_V, SCRATCH/2, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t)); + else + kernel->bindtexture_2D(d_V, SCRATCH/4, WU_PER_WARP * MAXWARPS[thr_id], SCRATCH*sizeof(uint32_t)); + } + + // update pointers to scratch buffer in constant memory after reallocation + kernel->set_scratchbuf_constants(MAXWARPS[thr_id], h_V[thr_id]); + } + else + { + applog(LOG_ERR, "GPU #%d: Unable to allocate enough memory for launch config '%s'.", device_map[thr_id], device_config[thr_id]); + } + } + } + else + { + // back off unnecessary memory allocations to have some breathing room + while (MAXWARPS[thr_id] > 0 && MAXWARPS[thr_id] > optimal_blocks * WARPS_PER_BLOCK) { + (MAXWARPS[thr_id])--; + checkCudaErrors(cudaFree(h_V[thr_id][MAXWARPS[thr_id]]-h_V_extra[thr_id][MAXWARPS[thr_id]])); + h_V[thr_id][MAXWARPS[thr_id]] = NULL; h_V_extra[thr_id][MAXWARPS[thr_id]] = 0; + } + } + + return optimal_blocks; +} + +void cuda_scrypt_HtoD(int thr_id, uint32_t *X, int stream) +{ + unsigned int GRID_BLOCKS = context_blocks[thr_id]; + unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; + unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); + unsigned int mem_size = 
WU_PER_LAUNCH * sizeof(uint32_t) * 32; + + // copy host memory to device + cudaMemcpyAsync(context_idata[stream][thr_id], X, mem_size, cudaMemcpyHostToDevice, context_streams[stream][thr_id]); +} + +void cuda_scrypt_serialize(int thr_id, int stream) +{ + // if the device can concurrently execute multiple kernels, then we must + // wait for the serialization event recorded by the other stream + if (context_concurrent[thr_id] || device_interactive[thr_id]) + cudaStreamWaitEvent(context_streams[stream][thr_id], context_serialize[(stream+1)&1][thr_id], 0); +} + +void cuda_scrypt_done(int thr_id, int stream) +{ + // record the serialization event in the current stream + cudaEventRecord(context_serialize[stream][thr_id], context_streams[stream][thr_id]); +} + +void cuda_scrypt_flush(int thr_id, int stream) +{ + // flush the work queue (required for WDDM drivers) + cudaStreamSynchronize(context_streams[stream][thr_id]); +} + +void cuda_scrypt_core(int thr_id, int stream, unsigned int N) +{ + unsigned int GRID_BLOCKS = context_blocks[thr_id]; + unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; + unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); + unsigned int LOOKUP_GAP = device_lookup_gap[thr_id]; + + // setup execution parameters + dim3 grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1); + dim3 threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1); + + context_kernel[thr_id]->run_kernel(grid, threads, WARPS_PER_BLOCK, thr_id, + context_streams[stream][thr_id], context_idata[stream][thr_id], context_odata[stream][thr_id], + N, LOOKUP_GAP, device_interactive[thr_id], opt_benchmark, device_texturecache[thr_id] + ); +} + +void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA) +{ + unsigned int GRID_BLOCKS = context_blocks[thr_id]; + unsigned int WARPS_PER_BLOCK = context_wpb[thr_id]; + unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu(); + unsigned int mem_size = WU_PER_LAUNCH * sizeof(uint32_t) * (postSHA ? 8 : 32); + // copy result from device to host (asynchronously) + checkCudaErrors(cudaMemcpyAsync(X, postSHA ? context_hash[stream][thr_id] : context_odata[stream][thr_id], mem_size, cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); +} + +bool cuda_scrypt_sync(int thr_id, int stream) +{ + cudaError_t err; + uint32_t wait_us = 0; + + if (device_interactive[thr_id] && !opt_benchmark) + { + // For devices that also do desktop rendering or compositing, we want to free up some time slots. + // That requires making a pause in work submission when there is no active task on the GPU, + // and Device Synchronize ensures that. 
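// Illustrative aside, not part of the patch: a minimal standalone sketch of the
// two-stream ping-pong implemented by cuda_scrypt_serialize() and
// cuda_scrypt_done() above. Each stream waits on the event recorded by the other
// stream before launching, so kernel launches never overlap even on devices that
// support concurrent kernels. The kernel and function names are hypothetical.
__global__ void dummy_work() { }

void pingpong_demo()
{
    cudaStream_t stream[2];
    cudaEvent_t  done[2];
    for (int i = 0; i < 2; i++) {
        cudaStreamCreate(&stream[i]);
        cudaEventCreateWithFlags(&done[i], cudaEventDisableTiming);
    }
    cudaEventRecord(done[1]);                           // prime one event, as cuda_throughput() does

    for (int iter = 0; iter < 8; iter++) {
        int s = iter & 1;
        cudaStreamWaitEvent(stream[s], done[s ^ 1], 0); // serialize against the other stream
        dummy_work<<<1, 32, 0, stream[s]>>>();
        cudaEventRecord(done[s], stream[s]);            // release the other stream
    }
    cudaDeviceSynchronize();

    for (int i = 0; i < 2; i++) {
        cudaEventDestroy(done[i]);
        cudaStreamDestroy(stream[i]);
    }
}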
+ + // this call was replaced by the loop below to workaround the high CPU usage issue + //err = cudaDeviceSynchronize(); + + while((err = cudaStreamQuery(context_streams[0][thr_id])) == cudaErrorNotReady || + (err == cudaSuccess && (err = cudaStreamQuery(context_streams[1][thr_id])) == cudaErrorNotReady)) { + usleep(50); wait_us+=50; + } + + usleep(50); wait_us+=50; + } else { + // this call was replaced by the loop below to workaround the high CPU usage issue + //err = cudaStreamSynchronize(context_streams[stream][thr_id]); + + while((err = cudaStreamQuery(context_streams[stream][thr_id])) == cudaErrorNotReady) { + usleep(50); wait_us+=50; + } + } + + if (err != cudaSuccess) { + if (!abort_flag) + applog(LOG_ERR, "GPU #%d: CUDA error `%s` while waiting the kernel.", device_map[thr_id], cudaGetErrorString(err)); + return false; + } + + //if (opt_debug) { + // applog(LOG_DEBUG, "GPU #%d: %s %u us", device_map[thr_id], __FUNCTION__, wait_us); + //} + + return true; +} + +uint32_t* cuda_transferbuffer(int thr_id, int stream) +{ + return context_X[stream][thr_id]; +} + +uint32_t* cuda_hashbuffer(int thr_id, int stream) +{ + return context_H[stream][thr_id]; +} diff --git a/scrypt/salsa_kernel.h b/scrypt/salsa_kernel.h new file mode 100644 index 0000000000..f0ea15aea0 --- /dev/null +++ b/scrypt/salsa_kernel.h @@ -0,0 +1,109 @@ +#ifndef SALSA_KERNEL_H +#define SALSA_KERNEL_H + +#include +#include +#include +#ifndef __APPLE__ +#include +#endif +#include +#include + +#include "miner.h" + +// from ccminer.cpp +extern short device_map[MAX_GPUS]; +extern int device_batchsize[MAX_GPUS]; // cudaminer -b +extern int device_interactive[MAX_GPUS]; // cudaminer -i +extern int device_texturecache[MAX_GPUS]; // cudaminer -C +extern int device_singlememory[MAX_GPUS]; // cudaminer -m +extern int device_lookup_gap[MAX_GPUS]; // -L +extern int device_backoff[MAX_GPUS]; // WIN32/LINUX var +extern char *device_config[MAX_GPUS]; // -l +extern char *device_name[MAX_GPUS]; + +extern bool opt_autotune; +extern int opt_nfactor; +extern char *jane_params; +extern int parallel; + +extern void get_currentalgo(char* buf, int sz); + +typedef unsigned int uint32_t; // define this as 32 bit type derived from int + +// scrypt variants +#define A_SCRYPT 0 +#define A_SCRYPT_JANE 1 +static char algo[64] = { 0 }; +static int scrypt_algo = -1; +static __inline int get_scrypt_type() { + if (scrypt_algo != -1) return scrypt_algo; + get_currentalgo(algo, 64); + if (!strncasecmp(algo,"scrypt-jane",11)) scrypt_algo = A_SCRYPT_JANE; + else if (!strncasecmp(algo,"scrypt",6)) scrypt_algo = A_SCRYPT; + return scrypt_algo; +} +static __inline bool IS_SCRYPT() { get_scrypt_type(); return (scrypt_algo == A_SCRYPT); } +static __inline bool IS_SCRYPT_JANE() { get_scrypt_type(); return (scrypt_algo == A_SCRYPT_JANE); } + +// CUDA externals +extern int cuda_throughput(int thr_id); +extern uint32_t *cuda_transferbuffer(int thr_id, int stream); +extern uint32_t *cuda_hashbuffer(int thr_id, int stream); + +extern void cuda_scrypt_HtoD(int thr_id, uint32_t *X, int stream); +extern void cuda_scrypt_serialize(int thr_id, int stream); +extern void cuda_scrypt_core(int thr_id, int stream, unsigned int N); +extern void cuda_scrypt_done(int thr_id, int stream); +extern void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA); +extern bool cuda_scrypt_sync(int thr_id, int stream); +extern void cuda_scrypt_flush(int thr_id, int stream); + +// If we're in C++ mode, we're either compiling .cu files or scrypt.cpp + +#ifdef __NVCC__ + +/** + * An 
pure virtual interface for a CUDA kernel implementation. + * TODO: encapsulate the kernel launch parameters in some kind of wrapper. + */ +class KernelInterface +{ +public: + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) = 0; + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) = 0; + virtual bool bindtexture_1D(uint32_t *d_V, size_t size) { return true; } + virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) { return true; } + virtual bool unbindtexture_1D() { return true; } + virtual bool unbindtexture_2D() { return true; } + + virtual char get_identifier() = 0; + virtual int get_major_version() { return 1; } + virtual int get_minor_version() { return 0; } + virtual int max_warps_per_block() = 0; + virtual int get_texel_width() = 0; + virtual bool no_textures() { return false; }; + virtual bool single_memory() { return false; }; + virtual int threads_per_wu() { return 1; } + virtual bool support_lookup_gap() { return false; } + virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeDefault; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferNone; } +}; + +// Not performing error checking is actually bad, but... +#define checkCudaErrors(x) x +#define getLastCudaError(x) + +#endif // #ifdef __NVCC__ + +// Define work unit size +#define TOTAL_WARP_LIMIT 4096 +#define WU_PER_WARP (32 / THREADS_PER_WU) +#define WU_PER_BLOCK (WU_PER_WARP*WARPS_PER_BLOCK) +#define WU_PER_LAUNCH (GRID_BLOCKS*WU_PER_BLOCK) + +// make scratchpad size dependent on N and LOOKUP_GAP +#define SCRATCH (((N+LOOKUP_GAP-1)/LOOKUP_GAP)*32) + +#endif // #ifndef SALSA_KERNEL_H diff --git a/scrypt/scrypt-jane.h b/scrypt/scrypt-jane.h new file mode 100644 index 0000000000..e8c270bf32 --- /dev/null +++ b/scrypt/scrypt-jane.h @@ -0,0 +1,29 @@ +#ifndef SCRYPT_JANE_H +#define SCRYPT_JANE_H + +/* + Nfactor: Increases CPU & Memory Hardness + N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used + + rfactor: Increases Memory Hardness + r = (1 << rfactor): How large a chunk is + + pfactor: Increases CPU Hardness + p = (1 << pfactor): Number of times to mix the main chunk + + A block is the basic mixing unit (salsa/chacha block = 64 bytes) + A chunk is (2 * r) blocks + + ~Memory used = (N + 2) * ((2 * r) * block size) +*/ + +#include +#include +#include + +typedef void (*scrypt_fatal_errorfn)(const char *msg); +void scrypt_set_fatal_error(scrypt_fatal_errorfn fn); + +void scrypt_N_1_1(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, uint32_t N, unsigned char *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V); + +#endif /* SCRYPT_JANE_H */ diff --git a/scrypt/sha256.cu b/scrypt/sha256.cu new file mode 100644 index 0000000000..1ecaf92953 --- /dev/null +++ b/scrypt/sha256.cu @@ -0,0 +1,437 @@ +// +// =============== SHA256 part on nVidia GPU ====================== +// +// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64 +// + +#include + +#include +#include "miner.h" + +#include "salsa_kernel.h" + +#include "sha256.h" + +// define some error checking macros +#define DELIMITER '/' +#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? 
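// Illustrative aside, not part of the patch: a minimal do-nothing subclass showing
// which pure virtuals of KernelInterface (declared in salsa_kernel.h above) a new
// kernel implementation must provide. The class name is hypothetical and it does
// no real work.
class NullKernel : public KernelInterface
{
public:
    void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) { }
    bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id,
                    cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata,
                    unsigned int N, unsigned int LOOKUP_GAP,
                    bool interactive, bool benchmark, int texture_cache) { return true; }
    char get_identifier() { return 'n'; }
    int  max_warps_per_block() { return 16; }
    int  get_texel_width() { return 4; }
};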
strrchr(__FILE__, DELIMITER)+1 : __FILE__ ) + +#undef checkCudaErrors +#define checkCudaErrors(x) { \ + cudaGetLastError(); \ + x; \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess && !abort_flag) \ + applog(LOG_ERR, "GPU #%d: cudaError %d (%s) (%s line %d)\n", (int) device_map[thr_id], err, cudaGetErrorString(err), __FILENAME__, __LINE__); \ +} + +// from salsa_kernel.cu +extern std::map context_idata[2]; +extern std::map context_odata[2]; +extern std::map context_streams[2]; +extern std::map context_tstate[2]; +extern std::map context_ostate[2]; +extern std::map context_hash[2]; + +static const uint32_t host_sha256_h[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static const uint32_t host_sha256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + sha256_k[i]) + +static const uint32_t host_keypad[12] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 +}; + +static const uint32_t host_innerpad[11] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 +}; + +static const uint32_t host_outerpad[8] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 +}; + +static const uint32_t host_finalblk[16] = { + 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 +}; + +// +// CUDA code +// + +__constant__ uint32_t sha256_h[8]; +__constant__ uint32_t sha256_k[64]; +__constant__ uint32_t keypad[12]; +__constant__ uint32_t innerpad[11]; +__constant__ uint32_t outerpad[8]; +__constant__ uint32_t finalblk[16]; +__constant__ uint32_t pdata[20]; +__constant__ uint32_t midstate[8]; + +__device__ void mycpy12(uint32_t *d, const uint32_t *s) { +#pragma unroll 3 + for (int k=0; k < 3; k++) d[k] = s[k]; +} + +__device__ void mycpy16(uint32_t *d, const uint32_t *s) { +#pragma unroll 4 + for (int k=0; k < 4; k++) d[k] = s[k]; +} + +__device__ void mycpy32(uint32_t *d, const uint32_t *s) { +#pragma unroll 8 + for (int k=0; k < 8; k++) 
d[k] = s[k]; +} + +__device__ void mycpy44(uint32_t *d, const uint32_t *s) { +#pragma unroll 11 + for (int k=0; k < 11; k++) d[k] = s[k]; +} + +__device__ void mycpy48(uint32_t *d, const uint32_t *s) { +#pragma unroll 12 + for (int k=0; k < 12; k++) d[k] = s[k]; +} + +__device__ void mycpy64(uint32_t *d, const uint32_t *s) { +#pragma unroll 16 + for (int k=0; k < 16; k++) d[k] = s[k]; +} + +__device__ uint32_t cuda_swab32(uint32_t x) +{ + return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) + | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); +} + +__device__ void mycpy32_swab32(uint32_t *d, const uint32_t *s) { +#pragma unroll 8 + for (int k=0; k < 8; k++) d[k] = cuda_swab32(s[k]); +} + +__device__ void mycpy64_swab32(uint32_t *d, const uint32_t *s) { +#pragma unroll 16 + for (int k=0; k < 16; k++) d[k] = cuda_swab32(s[k]); +} + +__device__ void cuda_sha256_init(uint32_t *state) +{ + mycpy32(state, sha256_h); +} + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. Modified for lower register use. + */ +__device__ void cuda_sha256_transform(uint32_t *state, const uint32_t *block) +{ + uint32_t W[64]; // only 4 of these are accessed during each partial Mix + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Initialize working variables. */ + mycpy32(S, state); + + /* 2. Prepare message schedule W and Mix. */ + mycpy16(W, block); + RNDr(S, W, 0); RNDr(S, W, 1); RNDr(S, W, 2); RNDr(S, W, 3); + + mycpy16(W+4, block+4); + RNDr(S, W, 4); RNDr(S, W, 5); RNDr(S, W, 6); RNDr(S, W, 7); + + mycpy16(W+8, block+8); + RNDr(S, W, 8); RNDr(S, W, 9); RNDr(S, W, 10); RNDr(S, W, 11); + + mycpy16(W+12, block+12); + RNDr(S, W, 12); RNDr(S, W, 13); RNDr(S, W, 14); RNDr(S, W, 15); + +#pragma unroll 2 + for (i = 16; i < 20; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 16); RNDr(S, W, 17); RNDr(S, W, 18); RNDr(S, W, 19); + +#pragma unroll 2 + for (i = 20; i < 24; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 20); RNDr(S, W, 21); RNDr(S, W, 22); RNDr(S, W, 23); + +#pragma unroll 2 + for (i = 24; i < 28; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 24); RNDr(S, W, 25); RNDr(S, W, 26); RNDr(S, W, 27); + +#pragma unroll 2 + for (i = 28; i < 32; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 28); RNDr(S, W, 29); RNDr(S, W, 30); RNDr(S, W, 31); + +#pragma unroll 2 + for (i = 32; i < 36; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 32); RNDr(S, W, 33); RNDr(S, W, 34); RNDr(S, W, 35); + +#pragma unroll 2 + for (i = 36; i < 40; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 36); RNDr(S, W, 37); RNDr(S, W, 38); RNDr(S, W, 39); + +#pragma unroll 2 + for (i = 40; i < 44; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 40); RNDr(S, W, 41); RNDr(S, W, 42); RNDr(S, W, 43); + +#pragma unroll 2 + for (i = 44; i < 
48; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 44); RNDr(S, W, 45); RNDr(S, W, 46); RNDr(S, W, 47); + +#pragma unroll 2 + for (i = 48; i < 52; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 48); RNDr(S, W, 49); RNDr(S, W, 50); RNDr(S, W, 51); + +#pragma unroll 2 + for (i = 52; i < 56; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 52); RNDr(S, W, 53); RNDr(S, W, 54); RNDr(S, W, 55); + +#pragma unroll 2 + for (i = 56; i < 60; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 56); RNDr(S, W, 57); RNDr(S, W, 58); RNDr(S, W, 59); + +#pragma unroll 2 + for (i = 60; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } + RNDr(S, W, 60); RNDr(S, W, 61); RNDr(S, W, 62); RNDr(S, W, 63); + + /* 3. Mix local working variables into global state */ +#pragma unroll 8 + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +// +// HMAC SHA256 functions, modified to work with pdata and nonce directly +// + +__device__ void cuda_HMAC_SHA256_80_init(uint32_t *tstate, uint32_t *ostate, uint32_t nonce) +{ + uint32_t ihash[8]; + uint32_t pad[16]; + int i; + + /* tstate is assumed to contain the midstate of key */ + mycpy12(pad, pdata + 16); + pad[3] = nonce; + mycpy48(pad + 4, keypad); + cuda_sha256_transform(tstate, pad); + mycpy32(ihash, tstate); + + cuda_sha256_init(ostate); +#pragma unroll 8 + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; +#pragma unroll 8 + for (i=8; i < 16; i++) + pad[i] = 0x5c5c5c5c; + cuda_sha256_transform(ostate, pad); + + cuda_sha256_init(tstate); +#pragma unroll 8 + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; +#pragma unroll 8 + for (i=8; i < 16; i++) + pad[i] = 0x36363636; + cuda_sha256_transform(tstate, pad); +} + +__device__ void cuda_PBKDF2_SHA256_80_128(const uint32_t *tstate, + const uint32_t *ostate, uint32_t *output, uint32_t nonce) +{ + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + + mycpy32(istate, tstate); + cuda_sha256_transform(istate, pdata); + + mycpy12(ibuf, pdata + 16); + ibuf[3] = nonce; + ibuf[4] = 1; + mycpy44(ibuf + 5, innerpad); + + mycpy32(obuf, istate); + mycpy32(obuf + 8, outerpad); + cuda_sha256_transform(obuf, ibuf); + + mycpy32(ostate2, ostate); + cuda_sha256_transform(ostate2, obuf); + mycpy32_swab32(output, ostate2); // TODO: coalescing would be desired + + mycpy32(obuf, istate); + ibuf[4] = 2; + cuda_sha256_transform(obuf, ibuf); + + mycpy32(ostate2, ostate); + cuda_sha256_transform(ostate2, obuf); + mycpy32_swab32(output+8, ostate2); // TODO: coalescing would be desired + + mycpy32(obuf, istate); + ibuf[4] = 3; + cuda_sha256_transform(obuf, ibuf); + + mycpy32(ostate2, ostate); + cuda_sha256_transform(ostate2, obuf); + mycpy32_swab32(output+16, ostate2); // TODO: coalescing would be desired + + mycpy32(obuf, istate); + ibuf[4] = 4; + cuda_sha256_transform(obuf, ibuf); + + mycpy32(ostate2, ostate); + cuda_sha256_transform(ostate2, obuf); + mycpy32_swab32(output+24, ostate2); // TODO: coalescing would be desired +} + +__global__ void cuda_pre_sha256(uint32_t g_inp[32], uint32_t g_tstate_ext[8], uint32_t 
g_ostate_ext[8], uint32_t nonce) +{ + nonce += (blockIdx.x * blockDim.x) + threadIdx.x; + g_inp += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_tstate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_ostate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + + uint32_t tstate[8], ostate[8]; + mycpy32(tstate, midstate); + + cuda_HMAC_SHA256_80_init(tstate, ostate, nonce); + + mycpy32(g_tstate_ext, tstate); // TODO: coalescing would be desired + mycpy32(g_ostate_ext, ostate); // TODO: coalescing would be desired + + cuda_PBKDF2_SHA256_80_128(tstate, ostate, g_inp, nonce); +} + +__global__ void cuda_post_sha256(uint32_t g_output[8], uint32_t g_tstate_ext[8], uint32_t g_ostate_ext[8], uint32_t g_salt_ext[32]) +{ + g_output += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_tstate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_ostate_ext += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x); + g_salt_ext += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x); + + uint32_t tstate[16]; + mycpy32(tstate, g_tstate_ext); // TODO: coalescing would be desired + + uint32_t halfsalt[16]; + mycpy64_swab32(halfsalt, g_salt_ext); // TODO: coalescing would be desired + cuda_sha256_transform(tstate, halfsalt); + mycpy64_swab32(halfsalt, g_salt_ext+16); // TODO: coalescing would be desired + cuda_sha256_transform(tstate, halfsalt); + cuda_sha256_transform(tstate, finalblk); + + uint32_t buf[16]; + mycpy32(buf, tstate); + mycpy32(buf + 8, outerpad); + + uint32_t ostate[16]; + mycpy32(ostate, g_ostate_ext); + + cuda_sha256_transform(ostate, buf); + mycpy32_swab32(g_output, ostate); // TODO: coalescing would be desired +} + +// +// callable host code to initialize constants and to call kernels +// + +void prepare_sha256(int thr_id, uint32_t host_pdata[20], uint32_t host_midstate[8]) +{ + static bool init[MAX_GPUS] = { 0 }; + + if (!init[thr_id]) + { + checkCudaErrors(cudaMemcpyToSymbol(sha256_h, host_sha256_h, sizeof(host_sha256_h), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(sha256_k, host_sha256_k, sizeof(host_sha256_k), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(keypad, host_keypad, sizeof(host_keypad), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(innerpad, host_innerpad, sizeof(host_innerpad), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(outerpad, host_outerpad, sizeof(host_outerpad), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(finalblk, host_finalblk, sizeof(host_finalblk), 0, cudaMemcpyHostToDevice)); + init[thr_id] = true; + } + checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpyToSymbol(midstate, host_midstate, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); +} + +void pre_sha256(int thr_id, int stream, uint32_t nonce, int throughput) +{ + dim3 block(128); + dim3 grid((throughput+127)/128); + + cuda_pre_sha256<<>>(context_idata[stream][thr_id], context_tstate[stream][thr_id], context_ostate[stream][thr_id], nonce); +} + +void post_sha256(int thr_id, int stream, int throughput) +{ + dim3 block(128); + dim3 grid((throughput+127)/128); + + cuda_post_sha256<<>>(context_hash[stream][thr_id], context_tstate[stream][thr_id], context_ostate[stream][thr_id], context_odata[stream][thr_id]); +} diff --git a/scrypt/sha256.h b/scrypt/sha256.h new file mode 100644 index 0000000000..cacb08f180 --- /dev/null +++ b/scrypt/sha256.h @@ -0,0 +1,10 @@ +#ifndef SHA256_H +#define SHA256_H + +#include + +extern "C" 
void prepare_sha256(int thr_id, uint32_t cpu_pdata[20], uint32_t cpu_midstate[8]); +extern "C" void pre_sha256(int thr_id, int stream, uint32_t nonce, int throughput); +extern "C" void post_sha256(int thr_id, int stream, int throughput); + +#endif // #ifndef SHA256_H diff --git a/scrypt/test_kernel.cu b/scrypt/test_kernel.cu new file mode 100644 index 0000000000..ab5b03c9e5 --- /dev/null +++ b/scrypt/test_kernel.cu @@ -0,0 +1,781 @@ +/* Copyright (C) 2013 David G. Andersen. All rights reserved. + * with modifications by Christian Buchner + * + * Use of this code is covered under the Apache 2.0 license, which + * can be found in the file "LICENSE" + * + * The array notation for b[] and bx[] arrays was converted to uint4, + * in preparation for some experimental changes to memory access patterns. + * Also this kernel is going to be a testbed for adaptation to Fermi devices. + */ + +// TODO: experiment with different memory access patterns in write/read_keys_direct functions +// TODO: attempt V.Volkov style ILP (factor 4) + +#include + +#include +#include "miner.h" + +#include "salsa_kernel.h" +#include "test_kernel.h" + +#define TEXWIDTH 32768 +#define THREADS_PER_WU 4 // four threads per hash + +typedef enum +{ + ANDERSEN, + SIMPLE +} MemoryAccess; + + +// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// iteration count N +__constant__ uint32_t c_N; +__constant__ uint32_t c_N_1; // N-1 +// scratch buffer size SCRATCH +__constant__ uint32_t c_SCRATCH; +__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP) +__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP) - 1 + +// using texture references for the "tex" variants of the B kernels +texture texRef1D_4_V; +texture texRef2D_4_V; + +template __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); + +static __device__ uint4& operator^=(uint4& left, const uint4& right) { + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +static __device__ uint4& operator+=(uint4& left, const uint4& right) { + left.x += right.x; + left.y += right.y; + left.z += right.z; + left.w += right.w; + return left; +} + +/* write_keys writes the 8 keys being processed by a warp to the global + * scratchpad. To effectively use memory bandwidth, it performs the writes + * (and reads, for read_keys) 128 bytes at a time per memory location + * by __shfl'ing the 4 entries in bx to the threads in the next-up + * thread group. It then has eight threads together perform uint4 + * (128 bit) writes to the destination region. This seems to make + * quite effective use of memory bandwidth. An approach that spread + * uint32s across more threads was slower because of the increased + * computation it required. + * + * "start" is the loop iteration producing the write - the offset within + * the block's memory. + * + * Internally, this algorithm first __shfl's the 4 bx entries to + * the next up thread group, and then uses a conditional move to + * ensure that odd-numbered thread groups exchange the b/bx ordering + * so that the right parts are written together. + * + * Thanks to Babu for helping design the 128-bit-per-write version. + * + * _direct lets the caller specify the absolute start location instead of + * the relative start location, as an attempt to reduce some recomputation. 
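// Illustrative aside, not part of the patch: this file is a testbed for Fermi,
// which has no __shfl instruction, so the shuffle described above is implemented
// with a padded per-warp shared-memory array (rows of 32+1 words, matching the
// layout used below) rather than a warp shuffle. A minimal standalone sketch of
// that exchange with the lane+4 neighbour; the kernel name is hypothetical and,
// like the kernels below, it relies on warp-synchronous execution. Launch with
// (blockDim.x/32) * 33 * sizeof(uint32_t) bytes of dynamic shared memory.
#include <cstdint>

__global__ void shmem_exchange_demo(uint32_t *out)
{
    extern __shared__ unsigned char shared[];
    uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared);
    unsigned int wrp = threadIdx.x / 32, lane = threadIdx.x % 32;

    uint32_t mine = threadIdx.x;                // stand-in for one bx component
    tmp[wrp][lane] = mine;                      // publish my value to the warp
    uint32_t peer = tmp[wrp][(lane + 4) % 32];  // read the lane+4 thread's value
    out[blockIdx.x * blockDim.x + threadIdx.x] = peer;
}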
+ */ + +template __device__ __forceinline__ +void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start) +{ + uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + + if (SCHEME == ANDERSEN) { + uint4 t=b, t2; + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + uint32_t *s = &tmp[threadIdx.x/32][threadIdx.x%32]; + uint32_t *st = &tmp[threadIdx.x/32][(threadIdx.x + 4)%32]; + *s = bx.x; t2.x = *st; + *s = bx.y; t2.y = *st; + *s = bx.z; t2.z = *st; + *s = bx.w; t2.w = *st; + *s = start; int t2_start = *st + 4; + bool c = (threadIdx.x & 0x4); + *((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t); + *((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2); + } else { + *((uint4 *)(&scratch[start ])) = b; + *((uint4 *)(&scratch[start+16])) = bx; + } +} + +template __device__ __forceinline__ +void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start) +{ + uint32_t *scratch; + + if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + if (SCHEME == ANDERSEN) { + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + uint32_t *s = &tmp[threadIdx.x/32][threadIdx.x%32]; + *s = start; int t2_start = tmp[threadIdx.x/32][(threadIdx.x + 4)%32] + 4; + if (TEX_DIM > 0) { start /= 4; t2_start /= 4; } + bool c = (threadIdx.x & 0x4); + if (TEX_DIM == 0) { + b = *((uint4 *)(&scratch[c ? t2_start : start])); + bx = *((uint4 *)(&scratch[c ? start : t2_start])); + } else if (TEX_DIM == 1) { + b = tex1Dfetch(texRef1D_4_V, c ? t2_start : start); + bx = tex1Dfetch(texRef1D_4_V, c ? start : t2_start); + } else if (TEX_DIM == 2) { + b = tex2D(texRef2D_4_V, 0.5f + ((c ? t2_start : start)%TEXWIDTH), 0.5f + ((c ? t2_start : start)/TEXWIDTH)); + bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH)); + } + uint4 temp = b; b = (c ? bx : b); bx = (c ? temp : bx); + uint32_t *st = &tmp[threadIdx.x/32][(threadIdx.x + 28)%32]; + *s = bx.x; bx.x = *st; + *s = bx.y; bx.y = *st; + *s = bx.z; bx.z = *st; + *s = bx.w; bx.w = *st; + } else { + if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start])); + else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4); + else if (TEX_DIM == 2) b = tex2D(texRef2D_4_V, 0.5f + ((start/4)%TEXWIDTH), 0.5f + ((start/4)/TEXWIDTH)); + if (TEX_DIM == 0) bx = *((uint4 *)(&scratch[start+16])); + else if (TEX_DIM == 1) bx = tex1Dfetch(texRef1D_4_V, (start+16)/4); + else if (TEX_DIM == 2) bx = tex2D(texRef2D_4_V, 0.5f + (((start+16)/4)%TEXWIDTH), 0.5f + (((start+16)/4)/TEXWIDTH)); + } +} + + +__device__ __forceinline__ +void primary_order_shuffle(uint4 &b, uint4 &bx) +{ + /* Inner loop shuffle targets */ + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32; + uint32_t *s = &tmp[wrp][lane]; + uint32_t *s1 = &tmp[wrp][x1]; + uint32_t *s2 = &tmp[wrp][x2]; + uint32_t *s3 = &tmp[wrp][x3]; + + *s = b.w; b.w = *s1; + *s = b.z; b.z = *s2; + *s = b.y; b.y = *s3; + uint32_t temp = b.y; b.y = b.w; b.w = temp; + + *s = bx.w; bx.w = *s1; + *s = bx.z; bx.z = *s2; + *s = bx.y; bx.y = *s3; + temp = bx.y; bx.y = bx.w; bx.w = temp; +} + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. 
+ * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. + */ + +__device__ __forceinline__ +void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + // Read in permuted order. Key loads are not our bottleneck right now. + b.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4]; + b.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4]; + b.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4]; + b.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4]; + bx.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16]; + bx.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16]; + bx.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16]; + bx.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16]; + + primary_order_shuffle(b, bx); + +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. + */ + +__device__ __forceinline__ +void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + primary_order_shuffle(b, bx); + + B[key_offset + 4*thread_in_block + (thread_in_block+0)%4] = b.x; + B[key_offset + 4*thread_in_block + (thread_in_block+1)%4] = b.y; + B[key_offset + 4*thread_in_block + (thread_in_block+2)%4] = b.z; + B[key_offset + 4*thread_in_block + (thread_in_block+3)%4] = b.w; + B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16] = bx.x; + B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16] = bx.y; + B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16] = bx.z; + B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16] = bx.w; +} + + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. + * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. + */ + +__device__ __forceinline__ +void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + // Read in permuted order. Key loads are not our bottleneck right now. + b.x = B[key_offset + 4*0 + thread_in_block%4]; + b.y = B[key_offset + 4*1 + thread_in_block%4]; + b.z = B[key_offset + 4*2 + thread_in_block%4]; + b.w = B[key_offset + 4*3 + thread_in_block%4]; + bx.x = B[key_offset + 4*0 + thread_in_block%4 + 16]; + bx.y = B[key_offset + 4*1 + thread_in_block%4 + 16]; + bx.z = B[key_offset + 4*2 + thread_in_block%4 + 16]; + bx.w = B[key_offset + 4*3 + thread_in_block%4 + 16]; +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. 
+ */ + +__device__ __forceinline__ +void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx) +{ + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int key_offset = scrypt_block * 32; + uint32_t thread_in_block = threadIdx.x % 4; + + B[key_offset + 4*0 + thread_in_block%4] = b.x; + B[key_offset + 4*1 + thread_in_block%4] = b.y; + B[key_offset + 4*2 + thread_in_block%4] = b.z; + B[key_offset + 4*3 + thread_in_block%4] = b.w; + B[key_offset + 4*0 + thread_in_block%4 + 16] = bx.x; + B[key_offset + 4*1 + thread_in_block%4 + 16] = bx.y; + B[key_offset + 4*2 + thread_in_block%4 + 16] = bx.z; + B[key_offset + 4*3 + thread_in_block%4 + 16] = bx.w; +} + + +template __device__ __forceinline__ +void load_key(const uint32_t *B, uint4 &b, uint4 &bx) +{ + switch(ALGO) { + case A_SCRYPT: load_key_salsa(B, b, bx); break; + case A_SCRYPT_JANE: load_key_chacha(B, b, bx); break; + } +} + +template __device__ __forceinline__ +void store_key(uint32_t *B, uint4 &b, uint4 &bx) +{ + switch(ALGO) { + case A_SCRYPT: store_key_salsa(B, b, bx); break; + case A_SCRYPT_JANE: store_key_chacha(B, b, bx); break; + } +} + + +/* + * salsa_xor_core (Salsa20/8 cypher) + * The original scrypt called: + * xor_salsa8(&X[0], &X[16]); <-- the "b" loop + * xor_salsa8(&X[16], &X[0]); <-- the "bx" loop + * This version is unrolled to handle both of these loops in a single + * call to avoid unnecessary data movement. + */ + +#define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<>(32-amt))); } + +__device__ __forceinline__ +void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32; + uint32_t *s = &tmp[wrp][lane]; + uint32_t *s1 = &tmp[wrp][x1]; + uint32_t *s2 = &tmp[wrp][x2]; + uint32_t *s3 = &tmp[wrp][x3]; + + uint4 x; + + b ^= bx; + x = b; + + // Enter in "primary order" (t0 has 0, 4, 8, 12) + // (t1 has 5, 9, 13, 1) + // (t2 has 10, 14, 2, 6) + // (t3 has 15, 3, 7, 11) + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Mixing phase of salsa + XOR_ROTATE_ADD(x.y, x.x, x.w, 7); + XOR_ROTATE_ADD(x.z, x.y, x.x, 9); + XOR_ROTATE_ADD(x.w, x.z, x.y, 13); + XOR_ROTATE_ADD(x.x, x.w, x.z, 18); + + /* Transpose rows and columns. */ + /* Unclear if this optimization is needed: These are ordered based + * upon the dependencies needed in the later xors. Compiler should be + * able to figure this out, but might as well give it a hand. */ + *s = x.y; x.y = *s3; + *s = x.w; x.w = *s1; + *s = x.z; x.z = *s2; + + /* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first, + * but the register targets are rewritten here to swap x[1] and x[3] so that + * they can be directly shuffled to and from our peer threads without + * reassignment. The reverse shuffle then puts them back in the right place. + */ + + XOR_ROTATE_ADD(x.w, x.x, x.y, 7); + XOR_ROTATE_ADD(x.z, x.w, x.x, 9); + XOR_ROTATE_ADD(x.y, x.z, x.w, 13); + XOR_ROTATE_ADD(x.x, x.y, x.z, 18); + + *s = x.w; x.w = *s3; + *s = x.y; x.y = *s1; + *s = x.z; x.z = *s2; + } + + b += x; + // The next two lines are the beginning of the BX-centric loop iteration + bx ^= b; + x = bx; + + // This is a copy of the same loop above, identical but stripped of comments. + // Duplicated so that we can complete a bx-based loop with fewer register moves. 
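	// Note: each "*s = ...;  ... = *s1;" style store/load pair in these loops
	// emulates a warp __shfl() by bouncing one word through the per-warp
	// tmp[][32+1] row in shared memory (sized for this purpose in run_kernel
	// below); the extra +1 column keeps the 32 lanes of a warp on distinct
	// shared-memory banks.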
+ #pragma unroll + for (int j = 0; j < 4; j++) { + XOR_ROTATE_ADD(x.y, x.x, x.w, 7); + XOR_ROTATE_ADD(x.z, x.y, x.x, 9); + XOR_ROTATE_ADD(x.w, x.z, x.y, 13); + XOR_ROTATE_ADD(x.x, x.w, x.z, 18); + + *s = x.y; x.y = *s3; + *s = x.w; x.w = *s1; + *s = x.z; x.z = *s2; + + XOR_ROTATE_ADD(x.w, x.x, x.y, 7); + XOR_ROTATE_ADD(x.z, x.w, x.x, 9); + XOR_ROTATE_ADD(x.y, x.z, x.w, 13); + XOR_ROTATE_ADD(x.x, x.y, x.z, 18); + + *s = x.w; x.w = *s3; + *s = x.y; x.y = *s1; + *s = x.z; x.z = *s2; + } + + // At the end of these iterations, the data is in primary order again. +#undef XOR_ROTATE_ADD + + bx += x; +} + + +/* + * chacha_xor_core (ChaCha20/8 cypher) + * This version is unrolled to handle both of these loops in a single + * call to avoid unnecessary data movement. + * + * load_key and store_key must not use primary order when + * using ChaCha20/8, but rather the basic transposed order + * (referred to as "column mode" below) + */ + +#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<>(32-amt))); } + +__device__ __forceinline__ +void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + unsigned int wrp = threadIdx.x/32, lane = threadIdx.x%32; + uint32_t *s = &tmp[wrp][lane]; + uint32_t *s1 = &tmp[wrp][x1]; + uint32_t *s2 = &tmp[wrp][x2]; + uint32_t *s3 = &tmp[wrp][x3]; + + uint4 x; + + b ^= bx; + x = b; + + // Enter in "column" mode (t0 has 0, 4, 8, 12) + // (t1 has 1, 5, 9, 13) + // (t2 has 2, 6, 10, 14) + // (t3 has 3, 7, 11, 15) + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Column Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + *s = x.y; x.y = *s1; + *s = x.z; x.z = *s2; + *s = x.w; x.w = *s3; + + // Diagonal Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + *s = x.y; x.y = *s3; + *s = x.z; x.z = *s2; + *s = x.w; x.w = *s1; + } + + b += x; + // The next two lines are the beginning of the BX-centric loop iteration + bx ^= b; + x = bx; + + #pragma unroll + for (int j = 0; j < 4; j++) { + + // Column Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + *s = x.y; x.y = *s1; + *s = x.z; x.z = *s2; + *s = x.w; x.w = *s3; + + // Diagonal Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + *s = x.y; x.y = *s3; + *s = x.z; x.z = *s2; + *s = x.w; x.w = *s1; + } + +#undef CHACHA_PRIMITIVE + + bx += x; +} + +template __device__ __forceinline__ +void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + switch(ALGO) { + case A_SCRYPT: salsa_xor_core(b, bx, x1, x2, x3); break; + case A_SCRYPT_JANE: chacha_xor_core(b, bx, x1, x2, x3); break; + } +} + + +/* + * The hasher_gen_kernel operates on a group of 1024-bit input keys + * in B, stored as: + * B = { k1B k1Bx k2B k2Bx ... } + * and fills up the scratchpad with the iterative hashes derived from + * those keys: + * scratch { k1h1B k1h1Bx K1h2B K1h2Bx ... K2h1B K2h1Bx K2h2B K2h2Bx ... } + * scratch is 1024 times larger than the input keys B. 
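 *
 * Rough sizing sketch (illustrative): every kept iteration stores 32 uint32_t
 * words (b plus bx) per work unit, i.e. about
 *     ceil(N / LOOKUP_GAP) * 128 bytes of scratch per hash,
 * so N = 1024 with no lookup gap needs 128 KiB per hash - 1024x the 128-byte key.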
+ * It is extremely important to stream writes effectively into scratch; + * less important to coalesce the reads from B. + * + * Key ordering note: Keys are input from B in "original" order: + * K = {k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 } + * After inputting into kernel_gen, each component k and kx of the + * key is transmuted into a permuted internal order to make processing faster: + * K = k, kx with: + * k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11 + * and similarly for kx. + */ + +template __global__ +void test_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end) +{ + uint4 b, bx; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int i=begin; + + if (i == 0) { + load_key(d_idata, b, bx); + write_keys_direct(b, bx, start); + ++i; + } else read_keys_direct(b, bx, start+32*(i-1)); + + while (i < end) { + block_mixer(b, bx, x1, x2, x3); + write_keys_direct(b, bx, start+32*i); + ++i; + } +} + +template __global__ +void test_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP) +{ + uint4 b, bx; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int i=begin; + + if (i == 0) { + load_key(d_idata, b, bx); + write_keys_direct(b, bx, start); + ++i; + } else { + int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP; + read_keys_direct(b, bx, start+32*pos); + while(loop--) block_mixer(b, bx, x1, x2, x3); + } + + while (i < end) { + block_mixer(b, bx, x1, x2, x3); + if (i % LOOKUP_GAP == 0) + write_keys_direct(b, bx, start+32*(i/LOOKUP_GAP)); + ++i; + } +} + + +/* + * hasher_hash_kernel runs the second phase of scrypt after the scratch + * buffer is filled with the iterative hashes: It bounces through + * the scratch buffer in pseudorandom order, mixing the key as it goes. 
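// For orientation, in the scrypt (A_SCRYPT) case this second phase is the GPU
// form of the standard scrypt ROMix loop with r = 1. A host-side sketch
// (illustrative only; xor_salsa8 is the usual Salsa20/8 mixing routine,
// declared but not defined here):

void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]);   // standard scrypt mix

static void romix_phase2_reference(uint32_t X[32], const uint32_t *V, uint32_t N)
{
    for (uint32_t i = 0; i < N; i++) {
        uint32_t j = X[16] & (N - 1);       // Integerify(X) mod N (N is a power of two)
        for (int k = 0; k < 32; k++)
            X[k] ^= V[j * 32 + k];          // fold in the pseudorandomly chosen entry
        xor_salsa8(&X[0], &X[16]);          // BlockMix, first half  (the "b" loop)
        xor_salsa8(&X[16], &X[0]);          // BlockMix, second half (the "bx" loop)
    }
}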
+ */ + +template __global__ +void test_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end) +{ + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + + uint4 b, bx; + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4); + if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + if (begin == 0) { + read_keys_direct(b, bx, start+32*c_N_1); + block_mixer(b, bx, x1, x2, x3); + } else load_key(d_odata, b, bx); + + for (int i = begin; i < end; i++) { + tmp[threadIdx.x/32][threadIdx.x%32] = bx.x; + int j = (tmp[threadIdx.x/32][(threadIdx.x & 0x1c)] & (c_N_1)); + uint4 t, tx; read_keys_direct(t, tx, start+32*j); + b ^= t; bx ^= tx; + block_mixer(b, bx, x1, x2, x3); + } + + store_key(d_odata, b, bx); +} + +template __global__ +void test_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP) +{ + extern __shared__ unsigned char shared[]; + uint32_t (*tmp)[32+1] = (uint32_t (*)[32+1])(shared); + + uint4 b, bx; + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4); + if (TEX_DIM == 0) start %= c_SCRATCH_WU_PER_WARP; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + if (begin == 0) { + int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP); + read_keys_direct(b, bx, start+32*pos); + while(loop--) block_mixer(b, bx, x1, x2, x3); + } else load_key(d_odata, b, bx); + + for (int i = begin; i < end; i++) { + tmp[threadIdx.x/32][threadIdx.x%32] = bx.x; + int j = (tmp[threadIdx.x/32][(threadIdx.x & 0x1c)] & (c_N_1)); + int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP; + uint4 t, tx; read_keys_direct(t, tx, start+32*pos); + while(loop--) block_mixer(t, tx, x1, x2, x3); + b ^= t; bx ^= tx; + block_mixer(b, bx, x1, x2, x3); + } + + store_key(d_odata, b, bx); +} + + +TestKernel::TestKernel() : KernelInterface() +{ +} + +bool TestKernel::bindtexture_1D(uint32_t *d_V, size_t size) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef1D_4_V.normalized = 0; + texRef1D_4_V.filterMode = cudaFilterModePoint; + texRef1D_4_V.addressMode[0] = cudaAddressModeClamp; + checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size)); + return true; +} + +bool TestKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch) +{ + cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc(); + texRef2D_4_V.normalized = 0; + texRef2D_4_V.filterMode = cudaFilterModePoint; + texRef2D_4_V.addressMode[0] = cudaAddressModeClamp; + texRef2D_4_V.addressMode[1] = cudaAddressModeClamp; + // maintain texture width of TEXWIDTH (max. 
limit is 65000) + while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; } + while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; } + checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch)); + return true; +} + +bool TestKernel::unbindtexture_1D() +{ + checkCudaErrors(cudaUnbindTexture(texRef1D_4_V)); + return true; +} + +bool TestKernel::unbindtexture_2D() +{ + checkCudaErrors(cudaUnbindTexture(texRef2D_4_V)); + return true; +} + +void TestKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool TestKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + + // compute required shared memory per block for __shfl() emulation + size_t shared = ((threads.x + 31) / 32) * (32+1) * sizeof(uint32_t); + + // make some constants available to kernel, update only initially and when changing + static uint32_t prev_N[MAX_GPUS] = { 0 }; + + if (N != prev_N[thr_id]) { + uint32_t h_N = N; + uint32_t h_N_1 = N-1; + uint32_t h_SCRATCH = SCRATCH; + uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP); + uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1; + + cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + + prev_N[thr_id] = N; + } + + // First phase: Sequential writes to scratchpad. + + int batch = device_batchsize[thr_id]; + + unsigned int pos = 0; + do { + if (LOOKUP_GAP == 1) { + if (IS_SCRYPT()) test_scrypt_core_kernelA <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelA <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N)); + } else { + if (IS_SCRYPT()) test_scrypt_core_kernelA_LG <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelA_LG <<< grid, threads, shared, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + } + pos += batch; + } while (pos < N); + + // Second phase: Random read access from scratchpad. 
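	// As in the first phase, the N iterations are issued in batch-sized slices
	// (device_batchsize[thr_id]), so no single launch walks the whole scratchpad;
	// this keeps individual kernel runtimes short, which matters in interactive
	// mode on a GPU that also drives a display.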
+ pos = 0; + do { + if (LOOKUP_GAP == 1) { + if (texture_cache == 0) { + if (IS_SCRYPT()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + } + else if (texture_cache == 1) { + if (IS_SCRYPT()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + } + else if (texture_cache == 2) { + if (IS_SCRYPT()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N)); + } + } else { + if (texture_cache == 0) { + if (IS_SCRYPT()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + else if (texture_cache == 1) { + if (IS_SCRYPT()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + else if (texture_cache == 2) { + if (IS_SCRYPT()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (IS_SCRYPT_JANE()) test_scrypt_core_kernelB_LG<<< grid, threads, shared, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + } + + pos += batch; + } while (pos < N); + + return success; +} diff --git a/scrypt/test_kernel.h b/scrypt/test_kernel.h new file mode 100644 index 0000000000..e084f72a0d --- /dev/null +++ b/scrypt/test_kernel.h @@ -0,0 +1,30 @@ +#ifndef TEST_KERNEL_H +#define TEST_KERNEL_H + +#include "salsa_kernel.h" + +class TestKernel : public KernelInterface +{ +public: + TestKernel(); + + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V); + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache); + virtual bool bindtexture_1D(uint32_t *d_V, size_t size); + virtual bool bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch); + virtual bool unbindtexture_1D(); + virtual bool unbindtexture_2D(); + + virtual char get_identifier() { return 'f'; }; + virtual int get_major_version() { return 1; }; + virtual int get_minor_version() { return 0; }; + + virtual int max_warps_per_block() { return 32; }; + virtual int get_texel_width() { return 4; }; + virtual int threads_per_wu() { return 4; } + virtual bool support_lookup_gap() { return true; } + virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; } +}; + +#endif // #ifndef TEST_KERNEL_H diff --git a/scrypt/titan_kernel.cu b/scrypt/titan_kernel.cu new file mode 100644 index 0000000000..57672a2220 --- /dev/null +++ b/scrypt/titan_kernel.cu @@ -0,0 +1,753 @@ +/* Copyright (C) 2013 David G. Andersen. All rights reserved. 
+ * with modifications by Christian Buchner + * + * Use of this code is covered under the Apache 2.0 license, which + * can be found in the file "LICENSE" + */ + +// attempt V.Volkov style ILP (factor 4) + +#include + +#include +#include +#include "miner.h" + +#include "salsa_kernel.h" +#include "titan_kernel.h" + +#define THREADS_PER_WU 4 // four threads per hash + +typedef enum +{ + ANDERSEN, + SIMPLE +} MemoryAccess; + +#if __CUDA_ARCH__ < 320 + // Kepler (Compute 3.0) + #define __ldg(x) (*(x)) +#endif + +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane) +#else +#define __shfl2 __shfl +#endif + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +// scratchbuf constants (pointers to scratch buffer for each warp, i.e. 32 hashes) +__constant__ uint32_t* c_V[TOTAL_WARP_LIMIT]; + +// iteration count N +__constant__ uint32_t c_N; +__constant__ uint32_t c_N_1; // N-1 +// scratch buffer size SCRATCH +__constant__ uint32_t c_SCRATCH; +__constant__ uint32_t c_SCRATCH_WU_PER_WARP; // (SCRATCH * WU_PER_WARP) +__constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP)-1 + +template __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); + +static __device__ uint4& operator ^= (uint4& left, const uint4& right) { + left.x ^= right.x; + left.y ^= right.y; + left.z ^= right.z; + left.w ^= right.w; + return left; +} + +static __device__ uint4& operator += (uint4& left, const uint4& right) { + left.x += right.x; + left.y += right.y; + left.z += right.z; + left.w += right.w; + return left; +} + + +static __device__ uint4 shfl4(const uint4 bx, int target_thread) { + return make_uint4( + __shfl2((int)bx.x, target_thread), __shfl2((int)bx.y, target_thread), + __shfl2((int)bx.z, target_thread), __shfl2((int)bx.w, target_thread) + ); +} + +/* write_keys writes the 8 keys being processed by a warp to the global + * scratchpad. To effectively use memory bandwidth, it performs the writes + * (and reads, for read_keys) 128 bytes at a time per memory location + * by __shfl'ing the 4 entries in bx to the threads in the next-up + * thread group. It then has eight threads together perform uint4 + * (128 bit) writes to the destination region. This seems to make + * quite effective use of memory bandwidth. An approach that spread + * uint32s across more threads was slower because of the increased + * computation it required. + * + * "start" is the loop iteration producing the write - the offset within + * the block's memory. + * + * Internally, this algorithm first __shfl's the 4 bx entries to + * the next up thread group, and then uses a conditional move to + * ensure that odd-numbered thread groups exchange the b/bx ordering + * so that the right parts are written together. + * + * Thanks to Babu for helping design the 128-bit-per-write version. + * + * _direct lets the caller specify the absolute start location instead of + * the relative start location, as an attempt to reduce some recomputation. + */ + +template __device__ __forceinline__ +void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start) +{ + uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + if (SCHEME == ANDERSEN) { + int target_thread = (threadIdx.x + 4)&31; + uint4 t = b, t2 = shfl4(bx, target_thread); + int t2_start = __shfl2((int)start, target_thread) + 4; + bool c = (threadIdx.x & 0x4); + *((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t); + *((uint4 *)(&scratch[c ? 
start : t2_start])) = (c ? t : t2); + } else { + *((uint4 *)(&scratch[start ])) = b; + *((uint4 *)(&scratch[start+16])) = bx; + } +} + +template __device__ __forceinline__ +void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start) +{ + uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; + if (SCHEME == ANDERSEN) { + int t2_start = __shfl2((int)start, (threadIdx.x + 4)&31) + 4; + bool c = (threadIdx.x & 0x4); + b = __ldg((uint4 *)(&scratch[c ? t2_start : start])); + bx = __ldg((uint4 *)(&scratch[c ? start : t2_start])); + uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx); + bx = shfl4(bx, (threadIdx.x + 28)&31); + } else { + b = *((uint4 *)(&scratch[start])); + bx = *((uint4 *)(&scratch[start+16])); + } +} + +__device__ __forceinline__ +void primary_order_shuffle(uint32_t b[4], uint32_t bx[4]) { + /* Inner loop shuffle targets */ + int x1 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+1)&3); + int x2 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+2)&3); + int x3 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+3)&3); + + b[3] = __shfl2((int)b[3], x1); + b[2] = __shfl2((int)b[2], x2); + b[1] = __shfl2((int)b[1], x3); + uint32_t tmp = b[1]; b[1] = b[3]; b[3] = tmp; + + bx[3] = __shfl2((int)bx[3], x1); + bx[2] = __shfl2((int)bx[2], x2); + bx[1] = __shfl2((int)bx[1], x3); + tmp = bx[1]; bx[1] = bx[3]; bx[3] = tmp; +} + +__device__ __forceinline__ +void primary_order_shuffle(uint4 &b, uint4 &bx) { + /* Inner loop shuffle targets */ + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+1)&3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+2)&3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+3)&3); + + b.w = __shfl2((int)b.w, x1); + b.z = __shfl2((int)b.z, x2); + b.y = __shfl2((int)b.y, x3); + uint32_t tmp = b.y; b.y = b.w; b.w = tmp; + + bx.w = __shfl2((int)bx.w, x1); + bx.z = __shfl2((int)bx.z, x2); + bx.y = __shfl2((int)bx.y, x3); + tmp = bx.y; bx.y = bx.w; bx.w = tmp; +} + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. + * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. + */ +__device__ __forceinline__ +void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx) +{ + uint32_t scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + uint32_t thread_in_block = threadIdx.x & 3U; + uint32_t key_offset = scrypt_block * 32 + (thread_in_block*4); + + // Read in permuted order. Key loads are not our bottleneck right now. + b.x = B[key_offset + (thread_in_block+0) & 3U]; + b.y = B[key_offset + (thread_in_block+1) & 3U]; + b.z = B[key_offset + (thread_in_block+2) & 3U]; + b.w = B[key_offset + (thread_in_block+3) & 3U]; + + key_offset += 16; + bx.x = B[key_offset + (thread_in_block+0) & 3U]; + bx.y = B[key_offset + (thread_in_block+1) & 3U]; + bx.z = B[key_offset + (thread_in_block+2) & 3U]; + bx.w = B[key_offset + (thread_in_block+3) & 3U]; + + primary_order_shuffle(b, bx); +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. 
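 *
 * Precedence note on the indexing above and below: '+' binds tighter than '&',
 * so "key_offset + (thread_in_block+0) & 3U" parses as
 * "(key_offset + thread_in_block + 0) & 3U". If the rotated pattern of the
 * other kernels is intended, the mask presumably needs its own parentheses:
 *     b.x = B[key_offset + ((thread_in_block + 0) & 3U)];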
+ */ +__device__ __forceinline__ +void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx) +{ + uint32_t scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + uint32_t thread_in_block = threadIdx.x & 3U; + uint32_t key_offset = scrypt_block * 32 + (thread_in_block*4); + + primary_order_shuffle(b, bx); + + B[key_offset + (thread_in_block+0) & 3U] = b.x; + B[key_offset + (thread_in_block+1) & 3U] = b.y; + B[key_offset + (thread_in_block+2) & 3U] = b.z; + B[key_offset + (thread_in_block+3) & 3U] = b.w; + + key_offset += 16; + B[key_offset + (thread_in_block+0) & 3U] = bx.x; + B[key_offset + (thread_in_block+1) & 3U] = bx.y; + B[key_offset + (thread_in_block+2) & 3U] = bx.z; + B[key_offset + (thread_in_block+3) & 3U] = bx.w; +} + + +/* + * load_key loads a 32*32bit key from a contiguous region of memory in B. + * The input keys are in external order (i.e., 0, 1, 2, 3, ...). + * After loading, each thread has its four b and four bx keys stored + * in internal processing order. + */ +__device__ __forceinline__ +void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx) +{ + uint32_t scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + uint32_t thread_in_block = threadIdx.x & 3U; + uint32_t key_offset = scrypt_block * 32 + thread_in_block; + + // Read in permuted order. Key loads are not our bottleneck right now. + b.x = B[key_offset ]; + b.y = B[key_offset + 4*1]; + b.z = B[key_offset + 4*2]; + b.w = B[key_offset + 4*3]; + + key_offset += 16; + bx.x = B[key_offset ]; + bx.y = B[key_offset + 4 ]; + bx.z = B[key_offset + 4*2]; + bx.w = B[key_offset + 4*3]; +} + +/* + * store_key performs the opposite transform as load_key, taking + * internally-ordered b and bx and storing them into a contiguous + * region of B in external order. + */ +__device__ __forceinline__ +void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx) +{ + uint32_t scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + uint32_t thread_in_block = threadIdx.x & 3U; + uint32_t key_offset = scrypt_block * 32U + thread_in_block; + + B[key_offset ] = b.x; + B[key_offset + 4 ] = b.y; + B[key_offset + 4*2] = b.z; + B[key_offset + 4*3] = b.w; + + key_offset += 16; + B[key_offset ] = bx.x; + B[key_offset + 4 ] = bx.y; + B[key_offset + 4*2] = bx.z; + B[key_offset + 4*3] = bx.w; +} + + +template __device__ __forceinline__ +void load_key(const uint32_t *B, uint4 &b, uint4 &bx) +{ + switch(ALGO) { + case A_SCRYPT: load_key_salsa(B, b, bx); break; + case A_SCRYPT_JANE: load_key_chacha(B, b, bx); break; + } +} + +template __device__ __forceinline__ +void store_key(uint32_t *B, uint4 &b, uint4 &bx) +{ + switch(ALGO) { + case A_SCRYPT: store_key_salsa(B, b, bx); break; + case A_SCRYPT_JANE: store_key_chacha(B, b, bx); break; + } +} + + +/* + * salsa_xor_core (Salsa20/8 cypher) + * The original scrypt called: + * xor_salsa8(&X[0], &X[16]); <-- the "b" loop + * xor_salsa8(&X[16], &X[0]); <-- the "bx" loop + * This version is unrolled to handle both of these loops in a single + * call to avoid unnecessary data movement. 
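// For reference, a plain host-side version of the xor_salsa8 routine named in
// the comment above (standard scrypt Salsa20/8 mix; illustrative sketch, the
// names are local to this example and nothing in the kernels calls it):

static inline uint32_t rotl32_host(uint32_t a, int n) { return (a << n) | (a >> (32 - n)); }

static void xor_salsa8_host(uint32_t B[16], const uint32_t Bx[16])
{
    uint32_t x[16];
    for (int i = 0; i < 16; i++)
        x[i] = (B[i] ^= Bx[i]);
    for (int r = 0; r < 8; r += 2) {
        /* operate on columns */
        x[ 4] ^= rotl32_host(x[ 0]+x[12], 7);  x[ 8] ^= rotl32_host(x[ 4]+x[ 0], 9);
        x[12] ^= rotl32_host(x[ 8]+x[ 4],13);  x[ 0] ^= rotl32_host(x[12]+x[ 8],18);
        x[ 9] ^= rotl32_host(x[ 5]+x[ 1], 7);  x[13] ^= rotl32_host(x[ 9]+x[ 5], 9);
        x[ 1] ^= rotl32_host(x[13]+x[ 9],13);  x[ 5] ^= rotl32_host(x[ 1]+x[13],18);
        x[14] ^= rotl32_host(x[10]+x[ 6], 7);  x[ 2] ^= rotl32_host(x[14]+x[10], 9);
        x[ 6] ^= rotl32_host(x[ 2]+x[14],13);  x[10] ^= rotl32_host(x[ 6]+x[ 2],18);
        x[ 3] ^= rotl32_host(x[15]+x[11], 7);  x[ 7] ^= rotl32_host(x[ 3]+x[15], 9);
        x[11] ^= rotl32_host(x[ 7]+x[ 3],13);  x[15] ^= rotl32_host(x[11]+x[ 7],18);
        /* operate on rows */
        x[ 1] ^= rotl32_host(x[ 0]+x[ 3], 7);  x[ 2] ^= rotl32_host(x[ 1]+x[ 0], 9);
        x[ 3] ^= rotl32_host(x[ 2]+x[ 1],13);  x[ 0] ^= rotl32_host(x[ 3]+x[ 2],18);
        x[ 6] ^= rotl32_host(x[ 5]+x[ 4], 7);  x[ 7] ^= rotl32_host(x[ 6]+x[ 5], 9);
        x[ 4] ^= rotl32_host(x[ 7]+x[ 6],13);  x[ 5] ^= rotl32_host(x[ 4]+x[ 7],18);
        x[11] ^= rotl32_host(x[10]+x[ 9], 7);  x[ 8] ^= rotl32_host(x[11]+x[10], 9);
        x[ 9] ^= rotl32_host(x[ 8]+x[11],13);  x[10] ^= rotl32_host(x[ 9]+x[ 8],18);
        x[12] ^= rotl32_host(x[15]+x[14], 7);  x[13] ^= rotl32_host(x[12]+x[15], 9);
        x[14] ^= rotl32_host(x[13]+x[12],13);  x[15] ^= rotl32_host(x[14]+x[13],18);
    }
    for (int i = 0; i < 16; i++)
        B[i] += x[i];
}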
+ */ + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + #define XOR_ROTATE_ADD(dst, s1, s2, amt) { uint32_t tmp = s1+s2; dst ^= ((tmp<>(32-amt))); } +#else + // Kepler (Compute 3.5) + #define ROTL(a, b) __funnelshift_l( a, a, b ); + #define XOR_ROTATE_ADD(dst, s1, s2, amt) dst ^= ROTL(s1+s2, amt); +#endif + + +__device__ __forceinline__ +void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + uint4 x; + + b ^= bx; + x = b; + + // Enter in "primary order" (t0 has 0, 4, 8, 12) + // (t1 has 5, 9, 13, 1) + // (t2 has 10, 14, 2, 6) + // (t3 has 15, 3, 7, 11) + + #pragma unroll + for (int j = 0; j < 4; j++) + { + // Mixing phase of salsa + XOR_ROTATE_ADD(x.y, x.x, x.w, 7); + XOR_ROTATE_ADD(x.z, x.y, x.x, 9); + XOR_ROTATE_ADD(x.w, x.z, x.y, 13); + XOR_ROTATE_ADD(x.x, x.w, x.z, 18); + + /* Transpose rows and columns. */ + /* Unclear if this optimization is needed: These are ordered based + * upon the dependencies needed in the later xors. Compiler should be + * able to figure this out, but might as well give it a hand. */ + x.y = __shfl2((int)x.y, x3); + x.w = __shfl2((int)x.w, x1); + x.z = __shfl2((int)x.z, x2); + + /* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first, + * but the register targets are rewritten here to swap x[1] and x[3] so that + * they can be directly shuffled to and from our peer threads without + * reassignment. The reverse shuffle then puts them back in the right place. + */ + + XOR_ROTATE_ADD(x.w, x.x, x.y, 7); + XOR_ROTATE_ADD(x.z, x.w, x.x, 9); + XOR_ROTATE_ADD(x.y, x.z, x.w, 13); + XOR_ROTATE_ADD(x.x, x.y, x.z, 18); + + x.w = __shfl2((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + } + + b += x; + // The next two lines are the beginning of the BX-centric loop iteration + bx ^= b; + x = bx; + + // This is a copy of the same loop above, identical but stripped of comments. + // Duplicated so that we can complete a bx-based loop with fewer register moves. + #pragma unroll 4 + for (int j = 0; j < 4; j++) + { + XOR_ROTATE_ADD(x.y, x.x, x.w, 7); + XOR_ROTATE_ADD(x.z, x.y, x.x, 9); + XOR_ROTATE_ADD(x.w, x.z, x.y, 13); + XOR_ROTATE_ADD(x.x, x.w, x.z, 18); + + x.y = __shfl2((int)x.y, x3); + x.w = __shfl2((int)x.w, x1); + x.z = __shfl2((int)x.z, x2); + + XOR_ROTATE_ADD(x.w, x.x, x.y, 7); + XOR_ROTATE_ADD(x.z, x.w, x.x, 9); + XOR_ROTATE_ADD(x.y, x.z, x.w, 13); + XOR_ROTATE_ADD(x.x, x.y, x.z, 18); + + x.w = __shfl2((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + } + + // At the end of these iterations, the data is in primary order again. +#undef XOR_ROTATE_ADD + + bx += x; +} + + +/* + * chacha_xor_core (ChaCha20/8 cypher) + * This version is unrolled to handle both of these loops in a single + * call to avoid unnecessary data movement. 
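// The CHACHA_PRIMITIVE macro defined below is one half-step of the standard
// ChaCha quarter-round, with (a, b, c, d) = (x.x, x.y, x.z, x.w) per thread.
// A plain host-side sketch of the full quarter-round (illustrative only;
// qr_rotl and chacha_quarter_round are local example names):

static inline uint32_t qr_rotl(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }

static inline void chacha_quarter_round(uint32_t &a, uint32_t &b, uint32_t &c, uint32_t &d)
{
    a += b; d = qr_rotl(d ^ a, 16);
    c += d; b = qr_rotl(b ^ c, 12);
    a += b; d = qr_rotl(d ^ a,  8);
    c += d; b = qr_rotl(b ^ c,  7);
}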
+ * + * load_key and store_key must not use primary order when + * using ChaCha20/8, but rather the basic transposed order + * (referred to as "column mode" below) + */ + +#if __CUDA_ARCH__ < 320 + // Kepler (Compute 3.0) + #define CHACHA_PRIMITIVE(pt, rt, ps, amt) { uint32_t tmp = rt ^ (pt += ps); rt = ((tmp<>(32-amt))); } +#else + // Kepler (Compute 3.5) + #define ROTL(a, b) __funnelshift_l( a, a, b ); + #define CHACHA_PRIMITIVE(pt, rt, ps, amt) { pt += ps; rt = ROTL(rt ^ pt,amt); } +#endif + +__device__ __forceinline__ +void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + uint4 x = b ^= bx; + + //b ^= bx; + //x = b; + + // Enter in "column" mode (t0 has 0, 4, 8, 12) + // (t1 has 1, 5, 9, 13) + // (t2 has 2, 6, 10, 14) + // (t3 has 3, 7, 11, 15) + + //#pragma unroll + for (int j = 0; j < 4; j++) { + + // Column Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); + + // Diagonal Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl2((int)x.y, x3); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x1); + } + + b += x; + // The next two lines are the beginning of the BX-centric loop iteration + bx ^= b; + x = bx; + + //#pragma unroll + for (int j = 0; j < 4; j++) + { + + // Column Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); + + // Diagonal Mixing phase of chacha + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12) + CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) + CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) + + x.y = __shfl2((int)x.y, x3); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x1); + } + +#undef CHACHA_PRIMITIVE + + bx += x; +} + + +template __device__ __forceinline__ +void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3) +{ + switch(ALGO) { + case A_SCRYPT: salsa_xor_core(b, bx, x1, x2, x3); break; + case A_SCRYPT_JANE: chacha_xor_core(b, bx, x1, x2, x3); break; + } +} + + +/* + * The hasher_gen_kernel operates on a group of 1024-bit input keys + * in B, stored as: + * B = { k1B k1Bx k2B k2Bx ... } + * and fills up the scratchpad with the iterative hashes derived from + * those keys: + * scratch { k1h1B k1h1Bx K1h2B K1h2Bx ... K2h1B K2h1Bx K2h2B K2h2Bx ... } + * scratch is 1024 times larger than the input keys B. + * It is extremely important to stream writes effectively into scratch; + * less important to coalesce the reads from B. + * + * Key ordering note: Keys are input from B in "original" order: + * K = {k1, k2, k3, k4, k5, ..., kx15, kx16, kx17, ..., kx31 } + * After inputting into kernel_gen, each component k and kx of the + * key is transmuted into a permuted internal order to make processing faster: + * K = k, kx with: + * k = 0, 4, 8, 12, 5, 9, 13, 1, 10, 14, 2, 6, 15, 3, 7, 11 + * and similarly for kx. 
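 *
 * The listed order can be reproduced mechanically: thread t of a four-thread
 * group ends up holding words 4*((t + k) % 4) + t for k = 0..3, so the
 * illustrative host snippet
 *     for (int t = 0; t < 4; t++)
 *         for (int k = 0; k < 4; k++)
 *             printf("%d ", 4 * ((t + k) % 4) + t);
 * prints 0 4 8 12 5 9 13 1 10 14 2 6 15 3 7 11.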
+ */ + +template __global__ +void titan_scrypt_core_kernelA(const uint32_t *d_idata, int begin, int end) +{ + uint4 b, bx; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int i=begin; + + if (i == 0) { + load_key(d_idata, b, bx); + write_keys_direct(b, bx, start); + ++i; + } else read_keys_direct(b, bx, start+32*(i-1)); + + while (i < end) { + block_mixer(b, bx, x1, x2, x3); + write_keys_direct(b, bx, start+32*i); + ++i; + } +} + +template __global__ +void titan_scrypt_core_kernelA_LG(const uint32_t *d_idata, int begin, int end, unsigned int LOOKUP_GAP) +{ + uint4 b, bx; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = (scrypt_block*c_SCRATCH + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int i=begin; + + if (i == 0) { + load_key(d_idata, b, bx); + write_keys_direct(b, bx, start); + ++i; + } else { + int pos = (i-1)/LOOKUP_GAP, loop = (i-1)-pos*LOOKUP_GAP; + read_keys_direct(b, bx, start+32*pos); + while(loop--) block_mixer(b, bx, x1, x2, x3); + } + + while (i < end) { + block_mixer(b, bx, x1, x2, x3); + if (i % LOOKUP_GAP == 0) + write_keys_direct(b, bx, start+32*(i/LOOKUP_GAP)); + ++i; + } +} + + +/* + * hasher_hash_kernel runs the second phase of scrypt after the scratch + * buffer is filled with the iterative hashes: It bounces through + * the scratch buffer in pseudorandom order, mixing the key as it goes. 
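// Cost sketch for the LOOKUP_GAP kernels below (illustrative): only every
// LOOKUP_GAP-th scratchpad entry is kept, so memory shrinks accordingly while
// each random access re-runs block_mixer (j % LOOKUP_GAP) times, on average
// about (LOOKUP_GAP - 1) / 2 extra mixes per lookup.

static inline size_t lookup_gap_bytes_per_hash(unsigned int N, unsigned int LOOKUP_GAP)
{
    size_t kept = (N + LOOKUP_GAP - 1) / LOOKUP_GAP;   // entries actually written by kernelA_LG
    return kept * 32u * sizeof(uint32_t);              // 128 bytes per kept entry
}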
+ */ + +template __global__ +void titan_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end) +{ + uint4 b, bx; + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = ((scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + if (begin == 0) { + read_keys_direct(b, bx, start+32*c_N_1); + block_mixer(b, bx, x1, x2, x3); + } else load_key(d_odata, b, bx); + + for (int i = begin; i < end; i++) { + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + uint4 t, tx; read_keys_direct(t, tx, start+32*j); + b ^= t; bx ^= tx; + block_mixer(b, bx, x1, x2, x3); + } + + store_key(d_odata, b, bx); +} + +template __global__ +void titan_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigned int LOOKUP_GAP) +{ + uint4 b, bx; + + int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU; + int start = ((scrypt_block*c_SCRATCH) + (SCHEME==ANDERSEN?8:4)*(threadIdx.x%4)) % c_SCRATCH_WU_PER_WARP; + + int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3); + int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); + int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); + + if (begin == 0) { + int pos = c_N_1/LOOKUP_GAP, loop = 1 + (c_N_1-pos*LOOKUP_GAP); + read_keys_direct(b, bx, start+32*pos); + while(loop--) + block_mixer(b, bx, x1, x2, x3); + } + else + load_key(d_odata, b, bx); + + if (SCHEME == SIMPLE) + { + // better divergent thread handling submitted by nVidia engineers, but + // supposedly this does not run with the ANDERSEN memory access scheme + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int pos = j/LOOKUP_GAP; + int loop = -1; + uint4 t, tx; + + int i = begin; + while(i < end) + { + if (loop == -1) { + j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + pos = j/LOOKUP_GAP; + loop = j-pos*LOOKUP_GAP; + read_keys_direct(t, tx, start+32*pos); + } + if (loop == 0) { + b ^= t; bx ^= tx; + t=b;tx=bx; + } + + block_mixer(t, tx, x1, x2, x3); + if (loop == 0) { + b=t;bx=tx; + i++; + } + loop--; + } + } + else + { + // this is my original implementation, now used with the ANDERSEN + // memory access scheme only. 
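	// Each iteration: take j from bx.x of the work unit's first thread, reload
	// the nearest stored entry V[j / LOOKUP_GAP], advance it (j % LOOKUP_GAP)
	// block_mixer steps to recover V[j], then fold it into (b, bx) and mix once more.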
+ for (int i = begin; i < end; i++) { + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP; + uint4 t, tx; read_keys_direct(t, tx, start+32*pos); + while (loop--) + block_mixer(t, tx, x1, x2, x3); + b ^= t; bx ^= tx; + block_mixer(b, bx, x1, x2, x3); + } + } + + store_key(d_odata, b, bx); +} + + +TitanKernel::TitanKernel() : KernelInterface() +{ +} + +void TitanKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V) +{ + checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice)); +} + +bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, + uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache) +{ + bool success = true; + bool scrypt = IS_SCRYPT(); + bool chacha = IS_SCRYPT_JANE(); + + // make some constants available to kernel, update only initially and when changing + static uint32_t prev_N[MAX_GPUS] = { 0 }; + + if (N != prev_N[thr_id]) { + uint32_t h_N = N; + uint32_t h_N_1 = N-1; + uint32_t h_SCRATCH = SCRATCH; + uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP); + uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1; + + cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream); + + prev_N[thr_id] = N; + } + + // First phase: Sequential writes to scratchpad. + + int batch = device_batchsize[thr_id]; + + unsigned int pos = 0; + do { + if (LOOKUP_GAP == 1) { + if (scrypt) titan_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + if (chacha) titan_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N)); + } else { + if (scrypt) titan_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + if (chacha) titan_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP); + } + pos += batch; + + } while (pos < N); + + // Second phase: Random read access from scratchpad. 
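	// Note: unlike the shared-memory based kernels, these launches request no
	// dynamic shared memory; the four threads of a work unit exchange data with
	// __shfl() directly on Kepler and newer.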
+ + pos = 0; + do { + if (LOOKUP_GAP == 1) { + if (scrypt) titan_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + if (chacha) titan_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N)); + } else { + if (scrypt) titan_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + if (chacha) titan_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP); + } + pos += batch; + + } while (pos < N); + + return success; +} + +#endif /* prevent SM 2 */ diff --git a/scrypt/titan_kernel.h b/scrypt/titan_kernel.h new file mode 100644 index 0000000000..720b9a3a80 --- /dev/null +++ b/scrypt/titan_kernel.h @@ -0,0 +1,26 @@ +#ifndef TITAN_KERNEL_H +#define TITAN_KERNEL_H + +#include "salsa_kernel.h" + +class TitanKernel : public KernelInterface +{ +public: + TitanKernel(); + + virtual void set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V); + virtual bool run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream, uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache); + + virtual char get_identifier() { return 't'; } + virtual int get_major_version() { return 3; } + virtual int get_minor_version() { return 5; } + + virtual int max_warps_per_block() { return 32; } + virtual int get_texel_width() { return 4; } + virtual bool no_textures() { return true; } + virtual int threads_per_wu() { return 4; } + virtual bool support_lookup_gap() { return true; } + virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; } +}; + +#endif // #ifndef TITAN_KERNEL_H diff --git a/serialize.hpp b/serialize.hpp new file mode 100644 index 0000000000..e06a43469e --- /dev/null +++ b/serialize.hpp @@ -0,0 +1,875 @@ +// Copyright (c) 2009-2010 Satoshi Nakamoto +// Copyright (c) 2009-2014 The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_SERIALIZE_H +#define BITCOIN_SERIALIZE_H + +//#include "compat/endian.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __APPLE__ +#include + +#define htobe16(x) OSSwapHostToBigInt16(x) +#define htole16(x) OSSwapHostToLittleInt16(x) +#define be16toh(x) OSSwapBigToHostInt16(x) +#define le16toh(x) OSSwapLittleToHostInt16(x) + +#define htobe32(x) OSSwapHostToBigInt32(x) +#define htole32(x) OSSwapHostToLittleInt32(x) +#define be32toh(x) OSSwapBigToHostInt32(x) +#define le32toh(x) OSSwapLittleToHostInt32(x) + +#define htobe64(x) OSSwapHostToBigInt64(x) +#define htole64(x) OSSwapHostToLittleInt64(x) +#define be64toh(x) OSSwapBigToHostInt64(x) +#define le64toh(x) OSSwapLittleToHostInt64(x) +#endif + +class CScript; + +static const unsigned int MAX_SIZE = 0x02000000; + +/** + * Used to bypass the rule against non-const reference to temporary + * where it makes sense with wrappers such as CFlatData or CTxDB + */ +template +inline T& REF(const T& val) +{ + return const_cast(val); +} + +/** + * Used to acquire a non-const pointer "this" to generate bodies + * of const serialization operations from a template + */ +template +inline T* NCONST_PTR(const T* val) +{ + return const_cast(val); +} + +/** + * Get begin pointer of vector (non-const version). 
+ * @note These functions avoid the undefined case of indexing into an empty + * vector, as well as that of indexing after the end of the vector. + */ +template +inline T* begin_ptr(std::vector& v) +{ + return v.empty() ? NULL : &v[0]; +} +/** Get begin pointer of vector (const version) */ +template +inline const T* begin_ptr(const std::vector& v) +{ + return v.empty() ? NULL : &v[0]; +} +/** Get end pointer of vector (non-const version) */ +template +inline T* end_ptr(std::vector& v) +{ + return v.empty() ? NULL : (&v[0] + v.size()); +} +/** Get end pointer of vector (const version) */ +template +inline const T* end_ptr(const std::vector& v) +{ + return v.empty() ? NULL : (&v[0] + v.size()); +} + +/* + * Lowest-level serialization and conversion. + * @note Sizes of these types are verified in the tests + */ +template inline void ser_writedata8(Stream &s, uint8_t obj) +{ + s.write((char*)&obj, 1); +} +template inline void ser_writedata16(Stream &s, uint16_t obj) +{ + obj = htole16(obj); + s.write((char*)&obj, 2); +} +template inline void ser_writedata32(Stream &s, uint32_t obj) +{ + obj = htole32(obj); + s.write((char*)&obj, 4); +} +template inline void ser_writedata64(Stream &s, uint64_t obj) +{ + obj = htole64(obj); + s.write((char*)&obj, 8); +} +template inline uint8_t ser_readdata8(Stream &s) +{ + uint8_t obj; + s.read((char*)&obj, 1); + return obj; +} +template inline uint16_t ser_readdata16(Stream &s) +{ + uint16_t obj; + s.read((char*)&obj, 2); + return le16toh(obj); +} +template inline uint32_t ser_readdata32(Stream &s) +{ + uint32_t obj; + s.read((char*)&obj, 4); + return le32toh(obj); +} +template inline uint64_t ser_readdata64(Stream &s) +{ + uint64_t obj; + s.read((char*)&obj, 8); + return le64toh(obj); +} +inline uint64_t ser_double_to_uint64(double x) +{ + union { double x; uint64_t y; } tmp; + tmp.x = x; + return tmp.y; +} +inline uint32_t ser_float_to_uint32(float x) +{ + union { float x; uint32_t y; } tmp; + tmp.x = x; + return tmp.y; +} +inline double ser_uint64_to_double(uint64_t y) +{ + union { double x; uint64_t y; } tmp; + tmp.y = y; + return tmp.x; +} +inline float ser_uint32_to_float(uint32_t y) +{ + union { float x; uint32_t y; } tmp; + tmp.y = y; + return tmp.x; +} + + +///////////////////////////////////////////////////////////////// +// +// Templates for serializing to anything that looks like a stream, +// i.e. anything that supports .read(char*, size_t) and .write(char*, size_t) +// + +enum +{ + // primary actions + SER_NETWORK = (1 << 0), + SER_DISK = (1 << 1), + SER_GETHASH = (1 << 2), +}; + +#define READWRITE(obj) (::SerReadWrite(s, (obj), nType, nVersion, ser_action)) + +/** + * Implement three methods for serializable objects. These are actually wrappers over + * "SerializationOp" template, which implements the body of each class' serialization + * code. Adding "ADD_SERIALIZE_METHODS" in the body of the class causes these wrappers to be + * added as members. 
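 *
 * Illustrative usage (CExample is a hypothetical type; READWRITE and the
 * LIMITED_STRING wrapper are defined in this header):
 *
 *   class CExample
 *   {
 *   public:
 *       uint32_t nValue;
 *       std::string strLabel;
 *
 *       ADD_SERIALIZE_METHODS;
 *
 *       template <typename Stream, typename Operation>
 *       inline void SerializationOp(Stream& s, Operation ser_action, int nType, int nVersion)
 *       {
 *           READWRITE(nValue);
 *           READWRITE(LIMITED_STRING(strLabel, 64));
 *       }
 *   };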
+ */ +#define ADD_SERIALIZE_METHODS \ + size_t GetSerializeSize(int nType, int nVersion) const { \ + CSizeComputer s(nType, nVersion); \ + NCONST_PTR(this)->SerializationOp(s, CSerActionSerialize(), nType, nVersion);\ + return s.size(); \ + } \ + template \ + void Serialize(Stream& s, int nType, int nVersion) const { \ + NCONST_PTR(this)->SerializationOp(s, CSerActionSerialize(), nType, nVersion);\ + } \ + template \ + void Unserialize(Stream& s, int nType, int nVersion) { \ + SerializationOp(s, CSerActionUnserialize(), nType, nVersion); \ + } + +/* + * Basic Types + */ +inline unsigned int GetSerializeSize(char a, int, int=0) { return 1; } +inline unsigned int GetSerializeSize(int8_t a, int, int=0) { return 1; } +inline unsigned int GetSerializeSize(uint8_t a, int, int=0) { return 1; } +inline unsigned int GetSerializeSize(int16_t a, int, int=0) { return 2; } +inline unsigned int GetSerializeSize(uint16_t a, int, int=0) { return 2; } +inline unsigned int GetSerializeSize(int32_t a, int, int=0) { return 4; } +inline unsigned int GetSerializeSize(uint32_t a, int, int=0) { return 4; } +inline unsigned int GetSerializeSize(int64_t a, int, int=0) { return 8; } +inline unsigned int GetSerializeSize(uint64_t a, int, int=0) { return 8; } +inline unsigned int GetSerializeSize(float a, int, int=0) { return 4; } +inline unsigned int GetSerializeSize(double a, int, int=0) { return 8; } + +template inline void Serialize(Stream& s, char a, int, int=0) { ser_writedata8(s, a); } // TODO Get rid of bare char +template inline void Serialize(Stream& s, int8_t a, int, int=0) { ser_writedata8(s, a); } +template inline void Serialize(Stream& s, uint8_t a, int, int=0) { ser_writedata8(s, a); } +template inline void Serialize(Stream& s, int16_t a, int, int=0) { ser_writedata16(s, a); } +template inline void Serialize(Stream& s, uint16_t a, int, int=0) { ser_writedata16(s, a); } +template inline void Serialize(Stream& s, int32_t a, int, int=0) { ser_writedata32(s, a); } +template inline void Serialize(Stream& s, uint32_t a, int, int=0) { ser_writedata32(s, a); } +template inline void Serialize(Stream& s, int64_t a, int, int=0) { ser_writedata64(s, a); } +template inline void Serialize(Stream& s, uint64_t a, int, int=0) { ser_writedata64(s, a); } +template inline void Serialize(Stream& s, float a, int, int=0) { ser_writedata32(s, ser_float_to_uint32(a)); } +template inline void Serialize(Stream& s, double a, int, int=0) { ser_writedata64(s, ser_double_to_uint64(a)); } + +template inline void Unserialize(Stream& s, char& a, int, int=0) { a = ser_readdata8(s); } // TODO Get rid of bare char +template inline void Unserialize(Stream& s, int8_t& a, int, int=0) { a = ser_readdata8(s); } +template inline void Unserialize(Stream& s, uint8_t& a, int, int=0) { a = ser_readdata8(s); } +template inline void Unserialize(Stream& s, int16_t& a, int, int=0) { a = ser_readdata16(s); } +template inline void Unserialize(Stream& s, uint16_t& a, int, int=0) { a = ser_readdata16(s); } +template inline void Unserialize(Stream& s, int32_t& a, int, int=0) { a = ser_readdata32(s); } +template inline void Unserialize(Stream& s, uint32_t& a, int, int=0) { a = ser_readdata32(s); } +template inline void Unserialize(Stream& s, int64_t& a, int, int=0) { a = ser_readdata64(s); } +template inline void Unserialize(Stream& s, uint64_t& a, int, int=0) { a = ser_readdata64(s); } +template inline void Unserialize(Stream& s, float& a, int, int=0) { a = ser_uint32_to_float(ser_readdata32(s)); } +template inline void Unserialize(Stream& s, double& a, int, 
int=0) { a = ser_uint64_to_double(ser_readdata64(s)); } + +inline unsigned int GetSerializeSize(bool a, int, int=0) { return sizeof(char); } +template inline void Serialize(Stream& s, bool a, int, int=0) { char f=a; ser_writedata8(s, f); } +template inline void Unserialize(Stream& s, bool& a, int, int=0) { char f=ser_readdata8(s); a=f; } + + + + + + +/** + * Compact Size + * size < 253 -- 1 byte + * size <= USHRT_MAX -- 3 bytes (253 + 2 bytes) + * size <= UINT_MAX -- 5 bytes (254 + 4 bytes) + * size > UINT_MAX -- 9 bytes (255 + 8 bytes) + */ +inline unsigned int GetSizeOfCompactSize(uint64_t nSize) +{ + if (nSize < 253) return sizeof(unsigned char); + else if (nSize <= std::numeric_limits::max()) return sizeof(unsigned char) + sizeof(unsigned short); + else if (nSize <= std::numeric_limits::max()) return sizeof(unsigned char) + sizeof(unsigned int); + else return sizeof(unsigned char) + sizeof(uint64_t); +} + +template +void WriteCompactSize(Stream& os, uint64_t nSize) +{ + if (nSize < 253) + { + ser_writedata8(os, nSize); + } + else if (nSize <= std::numeric_limits::max()) + { + ser_writedata8(os, 253); + ser_writedata16(os, nSize); + } + else if (nSize <= std::numeric_limits::max()) + { + ser_writedata8(os, 254); + ser_writedata32(os, nSize); + } + else + { + ser_writedata8(os, 255); + ser_writedata64(os, nSize); + } + return; +} + +template +uint64_t ReadCompactSize(Stream& is) +{ + uint8_t chSize = ser_readdata8(is); + uint64_t nSizeRet = 0; + if (chSize < 253) + { + nSizeRet = chSize; + } + else if (chSize == 253) + { + nSizeRet = ser_readdata16(is); + if (nSizeRet < 253) + throw std::ios_base::failure("non-canonical ReadCompactSize()"); + } + else if (chSize == 254) + { + nSizeRet = ser_readdata32(is); + if (nSizeRet < 0x10000u) + throw std::ios_base::failure("non-canonical ReadCompactSize()"); + } + else + { + nSizeRet = ser_readdata64(is); + if (nSizeRet < 0x100000000ULL) + throw std::ios_base::failure("non-canonical ReadCompactSize()"); + } + if (nSizeRet > (uint64_t)MAX_SIZE) + throw std::ios_base::failure("ReadCompactSize(): size too large"); + return nSizeRet; +} + +/** + * Variable-length integers: bytes are a MSB base-128 encoding of the number. + * The high bit in each byte signifies whether another digit follows. To make + * sure the encoding is one-to-one, one is subtracted from all but the last digit. + * Thus, the byte sequence a[] with length len, where all but the last byte + * has bit 128 set, encodes the number: + * + * (a[len-1] & 0x7F) + sum(i=1..len-1, 128^i*((a[len-i-1] & 0x7F)+1)) + * + * Properties: + * * Very small (0-127: 1 byte, 128-16511: 2 bytes, 16512-2113663: 3 bytes) + * * Every integer has exactly one encoding + * * Encoding does not depend on size of original integer type + * * No redundancy: every (infinite) byte sequence corresponds to a list + * of encoded integers. + * + * 0: [0x00] 256: [0x81 0x00] + * 1: [0x01] 16383: [0xFE 0x7F] + * 127: [0x7F] 16384: [0xFF 0x00] + * 128: [0x80 0x00] 16511: [0x80 0xFF 0x7F] + * 255: [0x80 0x7F] 65535: [0x82 0xFD 0x7F] + * 2^32: [0x8E 0xFE 0xFE 0xFF 0x00] + */ + +template +inline unsigned int GetSizeOfVarInt(I n) +{ + int nRet = 0; + while(true) { + nRet++; + if (n <= 0x7F) + break; + n = (n >> 7) - 1; + } + return nRet; +} + +template +void WriteVarInt(Stream& os, I n) +{ + unsigned char tmp[(sizeof(n)*8+6)/7]; + int len=0; + while(true) { + tmp[len] = (n & 0x7F) | (len ? 
0x80 : 0x00); + if (n <= 0x7F) + break; + n = (n >> 7) - 1; + len++; + } + do { + ser_writedata8(os, tmp[len]); + } while(len--); +} + +template +I ReadVarInt(Stream& is) +{ + I n = 0; + while(true) { + unsigned char chData = ser_readdata8(is); + n = (n << 7) | (chData & 0x7F); + if (chData & 0x80) + n++; + else + return n; + } +} + +#define FLATDATA(obj) REF(CFlatData((char*)&(obj), (char*)&(obj) + sizeof(obj))) +#define VARINT(obj) REF(WrapVarInt(REF(obj))) +#define LIMITED_STRING(obj,n) REF(LimitedString< n >(REF(obj))) + +/** + * Wrapper for serializing arrays and POD. + */ +class CFlatData +{ +protected: + char* pbegin; + char* pend; +public: + CFlatData(void* pbeginIn, void* pendIn) : pbegin((char*)pbeginIn), pend((char*)pendIn) { } + template + explicit CFlatData(std::vector &v) + { + pbegin = (char*)begin_ptr(v); + pend = (char*)end_ptr(v); + } + char* begin() { return pbegin; } + const char* begin() const { return pbegin; } + char* end() { return pend; } + const char* end() const { return pend; } + + unsigned int GetSerializeSize(int, int=0) const + { + return (unsigned int) (pend - pbegin); + } + + template + void Serialize(Stream& s, int, int=0) const + { + s.write(pbegin, pend - pbegin); + } + + template + void Unserialize(Stream& s, int, int=0) + { + s.read(pbegin, pend - pbegin); + } +}; + +template +class CVarInt +{ +protected: + I &n; +public: + CVarInt(I& nIn) : n(nIn) { } + + unsigned int GetSerializeSize(int, int) const { + return GetSizeOfVarInt(n); + } + + template + void Serialize(Stream &s, int, int) const { + WriteVarInt(s, n); + } + + template + void Unserialize(Stream& s, int, int) { + n = ReadVarInt(s); + } +}; + +template +class LimitedString +{ +protected: + std::string& string; +public: + LimitedString(std::string& string) : string(string) {} + + template + void Unserialize(Stream& s, int, int=0) + { + size_t size = ReadCompactSize(s); + if (size > Limit) { + throw std::ios_base::failure("String length limit exceeded"); + } + string.resize(size); + if (size != 0) + s.read((char*)&string[0], size); + } + + template + void Serialize(Stream& s, int, int=0) const + { + WriteCompactSize(s, string.size()); + if (!string.empty()) + s.write((char*)&string[0], string.size()); + } + + unsigned int GetSerializeSize(int, int=0) const + { + return GetSizeOfCompactSize(string.size()) + string.size(); + } +}; + +template +CVarInt WrapVarInt(I& n) { return CVarInt(n); } + +/** + * Forward declarations + */ + +/** + * string + */ +template unsigned int GetSerializeSize(const std::basic_string& str, int, int=0); +template void Serialize(Stream& os, const std::basic_string& str, int, int=0); +template void Unserialize(Stream& is, std::basic_string& str, int, int=0); + +/** + * vector + * vectors of unsigned char are a special case and are intended to be serialized as a single opaque blob. 
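 *
 * Worked example (illustrative): a 300-byte std::vector<unsigned char> is
 * written as its CompactSize prefix followed by the raw bytes, i.e.
 *     fd 2c 01 <300 data bytes>
 * since 0xfd introduces a 2-byte little-endian length and 0x012c == 300.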
+ */ +template unsigned int GetSerializeSize_impl(const std::vector& v, int nType, int nVersion, const unsigned char&); +template unsigned int GetSerializeSize_impl(const std::vector& v, int nType, int nVersion, const V&); +template inline unsigned int GetSerializeSize(const std::vector& v, int nType, int nVersion); +template void Serialize_impl(Stream& os, const std::vector& v, int nType, int nVersion, const unsigned char&); +template void Serialize_impl(Stream& os, const std::vector& v, int nType, int nVersion, const V&); +template inline void Serialize(Stream& os, const std::vector& v, int nType, int nVersion); +template void Unserialize_impl(Stream& is, std::vector& v, int nType, int nVersion, const unsigned char&); +template void Unserialize_impl(Stream& is, std::vector& v, int nType, int nVersion, const V&); +template inline void Unserialize(Stream& is, std::vector& v, int nType, int nVersion); + +/** + * others derived from vector + */ +extern inline unsigned int GetSerializeSize(const CScript& v, int nType, int nVersion); +template void Serialize(Stream& os, const CScript& v, int nType, int nVersion); +template void Unserialize(Stream& is, CScript& v, int nType, int nVersion); + +/** + * pair + */ +template unsigned int GetSerializeSize(const std::pair& item, int nType, int nVersion); +template void Serialize(Stream& os, const std::pair& item, int nType, int nVersion); +template void Unserialize(Stream& is, std::pair& item, int nType, int nVersion); + +/** + * map + */ +template unsigned int GetSerializeSize(const std::map& m, int nType, int nVersion); +template void Serialize(Stream& os, const std::map& m, int nType, int nVersion); +template void Unserialize(Stream& is, std::map& m, int nType, int nVersion); + +/** + * set + */ +template unsigned int GetSerializeSize(const std::set& m, int nType, int nVersion); +template void Serialize(Stream& os, const std::set& m, int nType, int nVersion); +template void Unserialize(Stream& is, std::set& m, int nType, int nVersion); + + + + + +/** + * If none of the specialized versions above matched, default to calling member function. + * "int nType" is changed to "long nType" to keep from getting an ambiguous overload error. + * The compiler will only cast int to long if none of the other templates matched. + * Thanks to Boost serialization for this idea. 
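+ * For example, for a user-defined class that provides its own GetSerializeSize/Serialize/Unserialize members (e.g. through ADD_SERIALIZE_METHODS below), a call such as ::Serialize(os, obj, nType, nVersion) falls through to the templates below and forwards to obj.Serialize(os, nType, nVersion).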
+ */ +template +inline unsigned int GetSerializeSize(const T& a, long nType, int nVersion) +{ + return a.GetSerializeSize((int)nType, nVersion); +} + +template +inline void Serialize(Stream& os, const T& a, long nType, int nVersion) +{ + a.Serialize(os, (int)nType, nVersion); +} + +template +inline void Unserialize(Stream& is, T& a, long nType, int nVersion) +{ + a.Unserialize(is, (int)nType, nVersion); +} + + + + + +/** + * string + */ +template +unsigned int GetSerializeSize(const std::basic_string& str, int, int) +{ + return GetSizeOfCompactSize(str.size()) + str.size() * sizeof(str[0]); +} + +template +void Serialize(Stream& os, const std::basic_string& str, int, int) +{ + WriteCompactSize(os, str.size()); + if (!str.empty()) + os.write((char*)&str[0], str.size() * sizeof(str[0])); +} + +template +void Unserialize(Stream& is, std::basic_string& str, int, int) +{ + unsigned int nSize = ReadCompactSize(is); + str.resize(nSize); + if (nSize != 0) + is.read((char*)&str[0], nSize * sizeof(str[0])); +} + + + +/** + * vector + */ +template +unsigned int GetSerializeSize_impl(const std::vector& v, int nType, int nVersion, const unsigned char&) +{ + size_t sz = v.size() + v.size() * sizeof(T); + return (GetSizeOfCompactSize((uint64_t)sz)); +} + +template +unsigned int GetSerializeSize_impl(const std::vector& v, int nType, int nVersion, const V&) +{ + unsigned int nSize = GetSizeOfCompactSize(v.size()); + for (typename std::vector::const_iterator vi = v.begin(); vi != v.end(); ++vi) + nSize += GetSerializeSize((*vi), nType, nVersion); + return nSize; +} + +template +inline unsigned int GetSerializeSize(const std::vector& v, int nType, int nVersion) +{ + return GetSerializeSize_impl(v, nType, nVersion, T()); +} + + +template +void Serialize_impl(Stream& os, const std::vector& v, int nType, int nVersion, const unsigned char&) +{ + WriteCompactSize(os, v.size()); + if (!v.empty()) + os.write((char*)&v[0], v.size() * sizeof(T)); +} + +template +void Serialize_impl(Stream& os, const std::vector& v, int nType, int nVersion, const V&) +{ + WriteCompactSize(os, v.size()); + for (typename std::vector::const_iterator vi = v.begin(); vi != v.end(); ++vi) + ::Serialize(os, (*vi), nType, nVersion); +} + +template +inline void Serialize(Stream& os, const std::vector& v, int nType, int nVersion) +{ + Serialize_impl(os, v, nType, nVersion, T()); +} + + +template +void Unserialize_impl(Stream& is, std::vector& v, int nType, int nVersion, const unsigned char&) +{ + // Limit size per read so bogus size value won't cause out of memory + v.clear(); + unsigned int nSize = ReadCompactSize(is); + unsigned int i = 0; + while (i < nSize) + { + unsigned int blk = std::min(nSize - i, (unsigned int)(1 + 4999999 / sizeof(T))); + v.resize(i + blk); + is.read((char*)&v[i], blk * sizeof(T)); + i += blk; + } +} + +template +void Unserialize_impl(Stream& is, std::vector& v, int nType, int nVersion, const V&) +{ + v.clear(); + unsigned int nSize = ReadCompactSize(is); + unsigned int i = 0; + unsigned int nMid = 0; + while (nMid < nSize) + { + nMid += 5000000 / sizeof(T); + if (nMid > nSize) + nMid = nSize; + v.resize(nMid); + for (; i < nMid; i++) + Unserialize(is, v[i], nType, nVersion); + } +} + +template +inline void Unserialize(Stream& is, std::vector& v, int nType, int nVersion) +{ + Unserialize_impl(is, v, nType, nVersion, T()); +} + + + +/** + * others derived from vector + */ +inline unsigned int GetSerializeSize(const CScript& v, int nType, int nVersion) +{ + return GetSerializeSize((const std::vector&)v, nType, 
nVersion); +} + +template +void Serialize(Stream& os, const CScript& v, int nType, int nVersion) +{ + Serialize(os, (const std::vector&)v, nType, nVersion); +} + +template +void Unserialize(Stream& is, CScript& v, int nType, int nVersion) +{ + Unserialize(is, (std::vector&)v, nType, nVersion); +} + + + +/** + * pair + */ +template +unsigned int GetSerializeSize(const std::pair& item, int nType, int nVersion) +{ + return GetSerializeSize(item.first, nType, nVersion) + GetSerializeSize(item.second, nType, nVersion); +} + +template +void Serialize(Stream& os, const std::pair& item, int nType, int nVersion) +{ + Serialize(os, item.first, nType, nVersion); + Serialize(os, item.second, nType, nVersion); +} + +template +void Unserialize(Stream& is, std::pair& item, int nType, int nVersion) +{ + Unserialize(is, item.first, nType, nVersion); + Unserialize(is, item.second, nType, nVersion); +} + + + +/** + * map + */ +template +unsigned int GetSerializeSize(const std::map& m, int nType, int nVersion) +{ + unsigned int nSize = GetSizeOfCompactSize(m.size()); + for (typename std::map::const_iterator mi = m.begin(); mi != m.end(); ++mi) + nSize += GetSerializeSize((*mi), nType, nVersion); + return nSize; +} + +template +void Serialize(Stream& os, const std::map& m, int nType, int nVersion) +{ + WriteCompactSize(os, m.size()); + for (typename std::map::const_iterator mi = m.begin(); mi != m.end(); ++mi) + Serialize(os, (*mi), nType, nVersion); +} + +template +void Unserialize(Stream& is, std::map& m, int nType, int nVersion) +{ + m.clear(); + unsigned int nSize = ReadCompactSize(is); + typename std::map::iterator mi = m.begin(); + for (unsigned int i = 0; i < nSize; i++) + { + std::pair item; + Unserialize(is, item, nType, nVersion); + mi = m.insert(mi, item); + } +} + + + +/** + * set + */ +template +unsigned int GetSerializeSize(const std::set& m, int nType, int nVersion) +{ + unsigned int nSize = GetSizeOfCompactSize(m.size()); + for (typename std::set::const_iterator it = m.begin(); it != m.end(); ++it) + nSize += GetSerializeSize((*it), nType, nVersion); + return nSize; +} + +template +void Serialize(Stream& os, const std::set& m, int nType, int nVersion) +{ + WriteCompactSize(os, m.size()); + for (typename std::set::const_iterator it = m.begin(); it != m.end(); ++it) + Serialize(os, (*it), nType, nVersion); +} + +template +void Unserialize(Stream& is, std::set& m, int nType, int nVersion) +{ + m.clear(); + unsigned int nSize = ReadCompactSize(is); + typename std::set::iterator it = m.begin(); + for (unsigned int i = 0; i < nSize; i++) + { + K key; + Unserialize(is, key, nType, nVersion); + it = m.insert(it, key); + } +} + + + +/** + * Support for ADD_SERIALIZE_METHODS and READWRITE macro + */ +struct CSerActionSerialize +{ + bool ForRead() const { return false; } +}; +struct CSerActionUnserialize +{ + bool ForRead() const { return true; } +}; + +template +inline void SerReadWrite(Stream& s, const T& obj, int nType, int nVersion, CSerActionSerialize ser_action) +{ + ::Serialize(s, obj, nType, nVersion); +} + +template +inline void SerReadWrite(Stream& s, T& obj, int nType, int nVersion, CSerActionUnserialize ser_action) +{ + ::Unserialize(s, obj, nType, nVersion); +} + + + + +class CSizeComputer +{ +protected: + size_t nSize; + +public: + int nType; + int nVersion; + + CSizeComputer(int nTypeIn, int nVersionIn) : nSize(0), nType(nTypeIn), nVersion(nVersionIn) {} + + CSizeComputer& write(const char *psz, size_t nSize) + { + this->nSize += nSize; + return *this; + } + + template + CSizeComputer& 
operator<<(const T& obj) + { + ::Serialize(*this, obj, nType, nVersion); + return (*this); + } + + size_t size() const { + return nSize; + } +}; + +#endif // BITCOIN_SERIALIZE_H diff --git a/sha256/cuda_sha256d.cu b/sha256/cuda_sha256d.cu new file mode 100644 index 0000000000..5f6b84530b --- /dev/null +++ b/sha256/cuda_sha256d.cu @@ -0,0 +1,477 @@ +/* + * sha256d CUDA implementation. + * tpruvot 2017 + */ + +#include +#include +#include + +#include +#include + +__constant__ static uint32_t __align__(8) c_midstate76[8]; +__constant__ static uint32_t __align__(8) c_dataEnd80[4]; + +const __constant__ uint32_t __align__(8) c_H256[8] = { + 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, + 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U +}; +__constant__ static uint32_t __align__(8) c_K[64]; +__constant__ static uint32_t __align__(8) c_target[2]; +__device__ uint64_t d_target[1]; + +static uint32_t* d_resNonces[MAX_GPUS] = { 0 }; + +// ------------------------------------------------------------------------------------------------ + +static const uint32_t cpu_H256[8] = { + 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, + 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U +}; + +static const uint32_t cpu_K[64] = { + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + +#define ROTR ROTR32 + +__host__ +static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t in, const uint32_t Kshared) +{ + uint32_t t1,t2; + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); + uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + in[pc] = ssg21 + inx2 
+ ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha256_step1_host(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha256_step1_host(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + for (int i=0; i<3; i++) + { + sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +#define xor3b(a,b,c) (a ^ b ^ c) + +__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22)); +} + +__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25)); +} + +__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3)); +} + +__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10)); +} + +__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c) +{ + uint32_t result; + asm("{\n\t" + ".reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c) + ); + return result; +} + +__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.y), "=r"(result.x) : "l"(v)); + return result; 
+} + +__device__ +static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t in, const uint32_t Kshared) +{ + uint32_t t1,t2; + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ssg2_1(inx1); + uint32_t ssg20 = ssg2_0(inx3); + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha2_step1(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha2_step1(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha2_step1(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha2_step1(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha2_step1(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha2_step1(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha2_step1(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha2_step1(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha2_step1(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha2_step1(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha2_step1(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha2_step1(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha2_step1(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha2_step1(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha2_step1(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + #pragma unroll + for (int i=0; i<3; i++) + { + sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +__device__ +static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + 
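+ // Note: this trimmed final round omits the last two message-schedule steps and only folds g and h back into the state, since the nonce test in the kernel only inspects the last 64 bits of the digest.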
uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d, e,f,g,h, in[ 0], Kshared[ 0]); + sha2_step1(h,a,b,c, d,e,f,g, in[ 1], Kshared[ 1]); + sha2_step1(g,h,a,b, c,d,e,f, in[ 2], Kshared[ 2]); + sha2_step1(f,g,h,a, b,c,d,e, in[ 3], Kshared[ 3]); + sha2_step1(e,f,g,h, a,b,c,d, in[ 4], Kshared[ 4]); + sha2_step1(d,e,f,g, h,a,b,c, in[ 5], Kshared[ 5]); + sha2_step1(c,d,e,f, g,h,a,b, in[ 6], Kshared[ 6]); + sha2_step1(b,c,d,e, f,g,h,a, in[ 7], Kshared[ 7]); + sha2_step1(a,b,c,d, e,f,g,h, in[ 8], Kshared[ 8]); + sha2_step1(h,a,b,c, d,e,f,g, in[ 9], Kshared[ 9]); + sha2_step1(g,h,a,b, c,d,e,f, in[10], Kshared[10]); + sha2_step1(f,g,h,a, b,c,d,e, in[11], Kshared[11]); + sha2_step1(e,f,g,h, a,b,c,d, in[12], Kshared[12]); + sha2_step1(d,e,f,g, h,a,b,c, in[13], Kshared[13]); + sha2_step1(c,d,e,f, g,h,a,b, in[14], Kshared[14]); + sha2_step1(b,c,d,e, f,g,h,a, in[15], Kshared[15]); + + #pragma unroll + for (int i=0; i<2; i++) + { + sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*i]); + sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*i]); + sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*i]); + sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*i]); + sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*i]); + sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*i]); + sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*i]); + sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*i]); + sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*i]); + sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*i]); + sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*i]); + sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*i]); + sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*i]); + sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*i]); + sha2_step2(c,d,e,f, g,h,a,b, in,14, Kshared[30+16*i]); + sha2_step2(b,c,d,e, f,g,h,a, in,15, Kshared[31+16*i]); + } + + sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*2]); + sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*2]); + sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*2]); + sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*2]); + sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*2]); + sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*2]); + sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*2]); + sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*2]); + sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*2]); + sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*2]); + sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*2]); + sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*2]); + sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*2]); + sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*2]); + + state[6] += g; + state[7] += h; +} + +__device__ __forceinline__ +uint64_t cuda_swab32ll(uint64_t x) { + return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); +} + +__global__ +/*__launch_bounds__(256,3)*/ +void sha256d_gpu_hash_shared(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + __shared__ uint32_t s_K[64*4]; + //s_K[thread & 63] = c_K[thread & 63]; + if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x]; + + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + + uint32_t dat[16]; + AS_UINT2(dat) = AS_UINT2(c_dataEnd80); + dat[ 2] = c_dataEnd80[2]; + dat[ 3] = nonce; + dat[ 4] = 0x80000000; + dat[15] = 0x280; + #pragma unroll + for (int i=5; i<15; i++) 
dat[i] = 0; + + uint32_t buf[8]; + #pragma unroll + for (int i=0; i<8; i+=2) AS_UINT2(&buf[i]) = AS_UINT2(&c_midstate76[i]); + //for (int i=0; i<8; i++) buf[i] = c_midstate76[i]; + + sha256_round_body(dat, buf, s_K); + + // second sha256 + + #pragma unroll + for (int i=0; i<8; i++) dat[i] = buf[i]; + dat[8] = 0x80000000; + #pragma unroll + for (int i=9; i<15; i++) dat[i] = 0; + dat[15] = 0x100; + + #pragma unroll + for (int i=0; i<8; i++) buf[i] = c_H256[i]; + + sha256_round_last(dat, buf, s_K); + + // valid nonces + uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]); + if (high <= c_target[0]) { + //printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]); + resNonces[1] = atomicExch(resNonces, nonce); + //d_target[0] = high; + } + } +} + +__host__ +void sha256d_init(int thr_id) +{ + cuda_get_arch(thr_id); + cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t))); +} + +__host__ +void sha256d_free(int thr_id) +{ + if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]); + d_resNonces[thr_id] = NULL; +} + +__host__ +void sha256d_setBlock_80(uint32_t *pdata, uint32_t *ptarget) +{ + uint32_t _ALIGN(64) in[16], buf[8], end[4]; + for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]); + for (int i=0;i<8;i++) buf[i] = cpu_H256[i]; + for (int i=0;i<4;i++) end[i] = cuda_swab32(pdata[16+i]); + sha256_round_body_host(in, buf, cpu_K); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate76, buf, 32, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd80, end, sizeof(end), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice)); +} + +__host__ +void sha256d_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces) +{ + const uint32_t threadsperblock = 256; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t))); + cudaThreadSynchronize(); + sha256d_gpu_hash_shared <<>> (threads, startNonce, d_resNonces[thr_id]); + cudaThreadSynchronize(); + + CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + if (resNonces[0] == resNonces[1]) { + resNonces[1] = UINT32_MAX; + } +} diff --git a/sha256/cuda_sha256q.cu b/sha256/cuda_sha256q.cu new file mode 100644 index 0000000000..80733ac425 --- /dev/null +++ b/sha256/cuda_sha256q.cu @@ -0,0 +1,507 @@ +/* + * sha256(-q) CUDA implementation. 
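+ * sha256q = four chained SHA-256 passes over the 80-byte block header: the host precomputes the midstate of the first 64-byte block (sha256q_setBlock_80), and each GPU thread finishes that hash for its own nonce, then re-hashes the 32-byte digest three more times.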
+ * pyritepirate 2018 + * tpruvot 2017 + */ + +#include +#include +#include + +#include +#include + +__constant__ static uint32_t __align__(8) c_midstate76[8]; +__constant__ static uint32_t __align__(8) c_dataEnd80[4]; + +const __constant__ uint32_t __align__(8) c_H256[8] = { + 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, + 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U +}; +__constant__ static uint32_t __align__(8) c_K[64]; +__constant__ static uint32_t __align__(8) c_target[2]; +__device__ uint64_t d_target[1]; + +static uint32_t* d_resNonces[MAX_GPUS] = { 0 }; + +// ------------------------------------------------------------------------------------------------ + +static const uint32_t cpu_H256[8] = { + 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, + 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U +}; + +static const uint32_t cpu_K[64] = { + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + +#define ROTR ROTR32 + +__host__ +static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t in, const uint32_t Kshared) +{ + uint32_t t1,t2; + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); + uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f 
= state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha256_step1_host(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha256_step1_host(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + for (int i=0; i<3; i++) + { + sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +#define xor3b(a,b,c) (a ^ b ^ c) + +__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22)); +} + +__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25)); +} + +__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3)); +} + +__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10)); +} + +__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c) +{ + uint32_t result; + asm("{\n\t" + ".reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c) + ); + return result; +} + +__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.y), "=r"(result.x) : "l"(v)); + return result; +} + +__device__ +static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t in, const uint32_t Kshared) +{ + uint32_t t1,t2; + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + t1 = h + bsg21 + vxandx + 
Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ssg2_1(inx1); + uint32_t ssg20 = ssg2_0(inx3); + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha2_step1(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha2_step1(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha2_step1(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha2_step1(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha2_step1(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha2_step1(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha2_step1(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha2_step1(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha2_step1(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha2_step1(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha2_step1(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha2_step1(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha2_step1(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha2_step1(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha2_step1(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + #pragma unroll + for (int i=0; i<3; i++) + { + sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +__device__ +static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d, e,f,g,h, in[ 0], Kshared[ 0]); + sha2_step1(h,a,b,c, d,e,f,g, in[ 1], Kshared[ 1]); + sha2_step1(g,h,a,b, c,d,e,f, in[ 2], Kshared[ 2]); + sha2_step1(f,g,h,a, b,c,d,e, in[ 3], Kshared[ 3]); + sha2_step1(e,f,g,h, 
a,b,c,d, in[ 4], Kshared[ 4]); + sha2_step1(d,e,f,g, h,a,b,c, in[ 5], Kshared[ 5]); + sha2_step1(c,d,e,f, g,h,a,b, in[ 6], Kshared[ 6]); + sha2_step1(b,c,d,e, f,g,h,a, in[ 7], Kshared[ 7]); + sha2_step1(a,b,c,d, e,f,g,h, in[ 8], Kshared[ 8]); + sha2_step1(h,a,b,c, d,e,f,g, in[ 9], Kshared[ 9]); + sha2_step1(g,h,a,b, c,d,e,f, in[10], Kshared[10]); + sha2_step1(f,g,h,a, b,c,d,e, in[11], Kshared[11]); + sha2_step1(e,f,g,h, a,b,c,d, in[12], Kshared[12]); + sha2_step1(d,e,f,g, h,a,b,c, in[13], Kshared[13]); + sha2_step1(c,d,e,f, g,h,a,b, in[14], Kshared[14]); + sha2_step1(b,c,d,e, f,g,h,a, in[15], Kshared[15]); + + #pragma unroll + for (int i=0; i<2; i++) + { + sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*i]); + sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*i]); + sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*i]); + sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*i]); + sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*i]); + sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*i]); + sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*i]); + sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*i]); + sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*i]); + sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*i]); + sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*i]); + sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*i]); + sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*i]); + sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*i]); + sha2_step2(c,d,e,f, g,h,a,b, in,14, Kshared[30+16*i]); + sha2_step2(b,c,d,e, f,g,h,a, in,15, Kshared[31+16*i]); + } + + sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*2]); + sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*2]); + sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*2]); + sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*2]); + sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*2]); + sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*2]); + sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*2]); + sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*2]); + sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*2]); + sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*2]); + sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*2]); + sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*2]); + sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*2]); + sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*2]); + + state[6] += g; + state[7] += h; +} + +__device__ __forceinline__ +uint64_t cuda_swab32ll(uint64_t x) { + return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); +} + +__global__ +/*__launch_bounds__(256,3)*/ +void sha256q_gpu_hash_shared(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + __shared__ uint32_t s_K[64*4]; + //s_K[thread & 63] = c_K[thread & 63]; + if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x]; + + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + + uint32_t dat[16]; + AS_UINT2(dat) = AS_UINT2(c_dataEnd80); + dat[ 2] = c_dataEnd80[2]; + dat[ 3] = nonce; + dat[ 4] = 0x80000000; + dat[15] = 0x280; + #pragma unroll + for (int i=5; i<15; i++) dat[i] = 0; + + uint32_t buf[8]; + #pragma unroll + for (int i=0; i<8; i+=2) AS_UINT2(&buf[i]) = AS_UINT2(&c_midstate76[i]); + //for (int i=0; i<8; i++) buf[i] = c_midstate76[i]; + + sha256_round_body(dat, buf, s_K); + + // second sha256 + + #pragma unroll + for (int i=0; i<8; i++) dat[i] = buf[i]; + dat[8] = 0x80000000; + #pragma unroll + for (int i=9; i<15; 
i++) dat[i] = 0; + dat[15] = 0x100; + + #pragma unroll + for (int i=0; i<8; i++) buf[i] = c_H256[i]; + + sha256_round_body(dat, buf, s_K); + + // third sha256 + + #pragma unroll + for (int i=0; i<8; i++) dat[i] = buf[i]; + dat[8] = 0x80000000; + #pragma unroll + for (int i=9; i<15; i++) dat[i] = 0; + dat[15] = 0x100; + + #pragma unroll + for (int i=0; i<8; i++) buf[i] = c_H256[i]; + + sha256_round_body(dat, buf, s_K); + + // last sha256 + + #pragma unroll + for (int i=0; i<8; i++) dat[i] = buf[i]; + dat[8] = 0x80000000; + #pragma unroll + for (int i=9; i<15; i++) dat[i] = 0; + dat[15] = 0x100; + + #pragma unroll + for (int i=0; i<8; i++) buf[i] = c_H256[i]; + + sha256_round_last(dat, buf, s_K); + + + // valid nonces + uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]); + if (high <= c_target[0]) { + //printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]); + resNonces[1] = atomicExch(resNonces, nonce); + //d_target[0] = high; + } + } +} + +__host__ +void sha256q_init(int thr_id) +{ + cuda_get_arch(thr_id); + cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t))); +} + +__host__ +void sha256q_free(int thr_id) +{ + if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]); + d_resNonces[thr_id] = NULL; +} + +__host__ +void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget) +{ + uint32_t _ALIGN(64) in[16], buf[8], end[4]; + for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]); + for (int i=0;i<8;i++) buf[i] = cpu_H256[i]; + for (int i=0;i<4;i++) end[i] = cuda_swab32(pdata[16+i]); + sha256_round_body_host(in, buf, cpu_K); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate76, buf, 32, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd80, end, sizeof(end), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice)); +} + +__host__ +void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces) +{ + const uint32_t threadsperblock = 128; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t))); + cudaThreadSynchronize(); + sha256q_gpu_hash_shared <<>> (threads, startNonce, d_resNonces[thr_id]); + cudaThreadSynchronize(); + + CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + if (resNonces[0] == resNonces[1]) { + resNonces[1] = UINT32_MAX; + } +} diff --git a/sha256/cuda_sha256t.cu b/sha256/cuda_sha256t.cu new file mode 100644 index 0000000000..95689947a8 --- /dev/null +++ b/sha256/cuda_sha256t.cu @@ -0,0 +1,491 @@ +/* + * sha256(-t) CUDA implementation. 
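+ * sha256t = triple SHA-256 over the 80-byte block header: the host precomputes the midstate of the first 64-byte block (sha256t_setBlock_80), and each GPU thread finishes that hash for its own nonce, then re-hashes the 32-byte digest twice more.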
+ * tpruvot 2017 + */ + +#include +#include +#include + +#include +#include + +__constant__ static uint32_t __align__(8) c_midstate76[8]; +__constant__ static uint32_t __align__(8) c_dataEnd80[4]; + +const __constant__ uint32_t __align__(8) c_H256[8] = { + 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, + 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U +}; +__constant__ static uint32_t __align__(8) c_K[64]; +__constant__ static uint32_t __align__(8) c_target[2]; +__device__ uint64_t d_target[1]; + +static uint32_t* d_resNonces[MAX_GPUS] = { 0 }; + +// ------------------------------------------------------------------------------------------------ + +static const uint32_t cpu_H256[8] = { + 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, + 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U +}; + +static const uint32_t cpu_K[64] = { + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + +#define ROTR ROTR32 + +__host__ +static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t in, const uint32_t Kshared) +{ + uint32_t t1,t2; + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); + uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t 
g = state[6]; + uint32_t h = state[7]; + + sha256_step1_host(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha256_step1_host(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + for (int i=0; i<3; i++) + { + sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +#define xor3b(a,b,c) (a ^ b ^ c) + +__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22)); +} + +__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25)); +} + +__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3)); +} + +__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10)); +} + +__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c) +{ + uint32_t result; + asm("{\n\t" + ".reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c) + ); + return result; +} + +__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.y), "=r"(result.x) : "l"(v)); + return result; +} + +__device__ +static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t in, const uint32_t Kshared) +{ + uint32_t t1,t2; + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = 
bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ssg2_1(inx1); + uint32_t ssg20 = ssg2_0(inx3); + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha2_step1(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha2_step1(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha2_step1(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha2_step1(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha2_step1(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha2_step1(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha2_step1(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha2_step1(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha2_step1(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha2_step1(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha2_step1(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha2_step1(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha2_step1(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha2_step1(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha2_step1(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + #pragma unroll + for (int i=0; i<3; i++) + { + sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +__device__ +static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d, e,f,g,h, in[ 0], Kshared[ 0]); + sha2_step1(h,a,b,c, d,e,f,g, in[ 1], Kshared[ 1]); + sha2_step1(g,h,a,b, c,d,e,f, in[ 2], Kshared[ 2]); + sha2_step1(f,g,h,a, b,c,d,e, in[ 3], Kshared[ 3]); + sha2_step1(e,f,g,h, a,b,c,d, in[ 4], 
Kshared[ 4]); + sha2_step1(d,e,f,g, h,a,b,c, in[ 5], Kshared[ 5]); + sha2_step1(c,d,e,f, g,h,a,b, in[ 6], Kshared[ 6]); + sha2_step1(b,c,d,e, f,g,h,a, in[ 7], Kshared[ 7]); + sha2_step1(a,b,c,d, e,f,g,h, in[ 8], Kshared[ 8]); + sha2_step1(h,a,b,c, d,e,f,g, in[ 9], Kshared[ 9]); + sha2_step1(g,h,a,b, c,d,e,f, in[10], Kshared[10]); + sha2_step1(f,g,h,a, b,c,d,e, in[11], Kshared[11]); + sha2_step1(e,f,g,h, a,b,c,d, in[12], Kshared[12]); + sha2_step1(d,e,f,g, h,a,b,c, in[13], Kshared[13]); + sha2_step1(c,d,e,f, g,h,a,b, in[14], Kshared[14]); + sha2_step1(b,c,d,e, f,g,h,a, in[15], Kshared[15]); + + #pragma unroll + for (int i=0; i<2; i++) + { + sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*i]); + sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*i]); + sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*i]); + sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*i]); + sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*i]); + sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*i]); + sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*i]); + sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*i]); + sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*i]); + sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*i]); + sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*i]); + sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*i]); + sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*i]); + sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*i]); + sha2_step2(c,d,e,f, g,h,a,b, in,14, Kshared[30+16*i]); + sha2_step2(b,c,d,e, f,g,h,a, in,15, Kshared[31+16*i]); + } + + sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*2]); + sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*2]); + sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*2]); + sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*2]); + sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*2]); + sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*2]); + sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*2]); + sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*2]); + sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*2]); + sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*2]); + sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*2]); + sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*2]); + sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*2]); + sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*2]); + + state[6] += g; + state[7] += h; +} + +__device__ __forceinline__ +uint64_t cuda_swab32ll(uint64_t x) { + return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); +} + +__global__ +/*__launch_bounds__(256,3)*/ +void sha256t_gpu_hash_shared(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + __shared__ uint32_t s_K[64*4]; + //s_K[thread & 63] = c_K[thread & 63]; + if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x]; + + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + + uint32_t dat[16]; + AS_UINT2(dat) = AS_UINT2(c_dataEnd80); + dat[ 2] = c_dataEnd80[2]; + dat[ 3] = nonce; + dat[ 4] = 0x80000000; + dat[15] = 0x280; + #pragma unroll + for (int i=5; i<15; i++) dat[i] = 0; + + uint32_t buf[8]; + #pragma unroll + for (int i=0; i<8; i+=2) AS_UINT2(&buf[i]) = AS_UINT2(&c_midstate76[i]); + //for (int i=0; i<8; i++) buf[i] = c_midstate76[i]; + + sha256_round_body(dat, buf, s_K); + + // second sha256 + + #pragma unroll + for (int i=0; i<8; i++) dat[i] = buf[i]; + dat[8] = 0x80000000; + #pragma unroll + for (int i=9; i<15; i++) dat[i] = 0; 
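+ // The 32-byte digest fits in a single SHA-256 block: dat[8] = 0x80000000 is the padding bit and dat[15] = 0x100 the 256-bit message length (the header's final block above used 0x280 = 640 bits).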
+ dat[15] = 0x100; + + #pragma unroll + for (int i=0; i<8; i++) buf[i] = c_H256[i]; + + sha256_round_body(dat, buf, s_K); + + // last sha256 + + #pragma unroll + for (int i=0; i<8; i++) dat[i] = buf[i]; + dat[8] = 0x80000000; + #pragma unroll + for (int i=9; i<15; i++) dat[i] = 0; + dat[15] = 0x100; + + #pragma unroll + for (int i=0; i<8; i++) buf[i] = c_H256[i]; + + sha256_round_last(dat, buf, s_K); + + // valid nonces + uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]); + if (high <= c_target[0]) { + //printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]); + resNonces[1] = atomicExch(resNonces, nonce); + //d_target[0] = high; + } + } +} + +__host__ +void sha256t_init(int thr_id) +{ + cuda_get_arch(thr_id); + cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t))); +} + +__host__ +void sha256t_free(int thr_id) +{ + if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]); + d_resNonces[thr_id] = NULL; +} + +__host__ +void sha256t_setBlock_80(uint32_t *pdata, uint32_t *ptarget) +{ + uint32_t _ALIGN(64) in[16], buf[8], end[4]; + for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]); + for (int i=0;i<8;i++) buf[i] = cpu_H256[i]; + for (int i=0;i<4;i++) end[i] = cuda_swab32(pdata[16+i]); + sha256_round_body_host(in, buf, cpu_K); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate76, buf, 32, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd80, end, sizeof(end), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice)); +} + +__host__ +void sha256t_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces) +{ + const uint32_t threadsperblock = 128; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t))); + cudaThreadSynchronize(); + sha256t_gpu_hash_shared <<>> (threads, startNonce, d_resNonces[thr_id]); + cudaThreadSynchronize(); + + CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + if (resNonces[0] == resNonces[1]) { + resNonces[1] = UINT32_MAX; + } +} diff --git a/sha256/sha256d.cu b/sha256/sha256d.cu new file mode 100644 index 0000000000..bab6a7d11b --- /dev/null +++ b/sha256/sha256d.cu @@ -0,0 +1,127 @@ +/** + * SHA256d + * by tpruvot@github - 2017 + */ + +#include +#include +#include + +// CPU Check +extern "C" void sha256d_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[64]; + SHA256_CTX sha256; + + SHA256_Init(&sha256); + SHA256_Update(&sha256, (unsigned char *)input, 80); + SHA256_Final(hash, &sha256); + + SHA256_Init(&sha256); + SHA256_Update(&sha256, hash, 32); + SHA256_Final((unsigned char *)output, &sha256); +} + +static bool init[MAX_GPUS] = { 0 }; +extern void sha256d_init(int thr_id); +extern void sha256d_free(int thr_id); +extern void sha256d_setBlock_80(uint32_t *pdata, uint32_t *ptarget); +extern void sha256d_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces); + +extern "C" int scanhash_sha256d(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t 
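+	// default scan intensity for sha256d: 1U << 25 = 33,554,432 nonces per kernel
+	// launch, clamped just below to whatever nonce range is still left to scan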
throughput = cuda_default_throughput(thr_id, 1U << 25); + if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce)); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x03; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + sha256d_init(thr_id); + + init[thr_id] = true; + } + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + sha256d_setBlock_80(endiandata, ptarget); + + do { + // Hash with CUDA + *hashes_done = pdata[19] - first_nonce + throughput; + + sha256d_hash_80(thr_id, throughput, pdata[19], work->nonces); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + + endiandata[19] = swab32(work->nonces[0]); + sha256d_hash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != UINT32_MAX) { + endiandata[19] = swab32(work->nonces[1]); + sha256d_hash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces++; + bn_set_target_ratio(work, vhash, 1); + } + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; + } + return work->valid_nonces; + } + else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_sha256d(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + sha256d_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/sha256/sha256q.cu b/sha256/sha256q.cu new file mode 100644 index 0000000000..d3efa40eba --- /dev/null +++ b/sha256/sha256q.cu @@ -0,0 +1,136 @@ +/** + * SHA256 4x + * by pyritepirate - 2018 + * by tpruvot@github - 2017 + */ + +#include +#include +#include + +// CPU Check +extern "C" void sha256q_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[64]; + SHA256_CTX sha256; + + SHA256_Init(&sha256); + SHA256_Update(&sha256, (unsigned char *)input, 80); + SHA256_Final(hash, &sha256); + + SHA256_Init(&sha256); + SHA256_Update(&sha256, hash, 32); + SHA256_Final(hash, &sha256); + + SHA256_Init(&sha256); + SHA256_Update(&sha256, hash, 32); + SHA256_Final(hash, &sha256); + + SHA256_Init(&sha256); + SHA256_Update(&sha256, hash, 32); + SHA256_Final((unsigned char *)output, &sha256); +} + +static bool init[MAX_GPUS] = { 0 }; +extern void sha256q_init(int thr_id); +extern void sha256q_free(int thr_id); +extern void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget); +extern void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces); + +extern "C" int scanhash_sha256q(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = 
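+	// nonce word of the block header (pdata[19]), where this scan resumes; the
+	// default intensity below is 1U << 23, lower than sha256d's 1U << 25 because
+	// every candidate here runs four chained SHA-256 passes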
pdata[19]; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 23); + if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce)); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x03; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + sha256q_init(thr_id); + + init[thr_id] = true; + } + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + sha256q_setBlock_80(endiandata, ptarget); + + do { + // Hash with CUDA + *hashes_done = pdata[19] - first_nonce + throughput; + + sha256q_hash_80(thr_id, throughput, pdata[19], work->nonces); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + + endiandata[19] = swab32(work->nonces[0]); + sha256q_hash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != UINT32_MAX) { + endiandata[19] = swab32(work->nonces[1]); + sha256q_hash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces++; + bn_set_target_ratio(work, vhash, 1); + } + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; + } + return work->valid_nonces; + } + else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_sha256q(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + sha256q_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/sha256/sha256t.cu b/sha256/sha256t.cu new file mode 100644 index 0000000000..2defb00898 --- /dev/null +++ b/sha256/sha256t.cu @@ -0,0 +1,131 @@ +/** + * SHA256 3x + * by tpruvot@github - 2017 + */ + +#include +#include +#include + +// CPU Check +extern "C" void sha256t_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[64]; + SHA256_CTX sha256; + + SHA256_Init(&sha256); + SHA256_Update(&sha256, (unsigned char *)input, 80); + SHA256_Final(hash, &sha256); + + SHA256_Init(&sha256); + SHA256_Update(&sha256, hash, 32); + SHA256_Final(hash, &sha256); + + SHA256_Init(&sha256); + SHA256_Update(&sha256, hash, 32); + SHA256_Final((unsigned char *)output, &sha256); +} + +static bool init[MAX_GPUS] = { 0 }; +extern void sha256t_init(int thr_id); +extern void sha256t_free(int thr_id); +extern void sha256t_setBlock_80(uint32_t *pdata, uint32_t *ptarget); +extern void sha256t_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces); + +extern "C" int scanhash_sha256t(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 23); + if (init[thr_id]) 
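+		// once the device is initialized, never launch more threads than there are
+		// nonces left before max_nonce, so the last pass cannot overshoot the range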
throughput = min(throughput, (max_nonce - first_nonce)); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x03; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + sha256t_init(thr_id); + + init[thr_id] = true; + } + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + sha256t_setBlock_80(endiandata, ptarget); + + do { + // Hash with CUDA + *hashes_done = pdata[19] - first_nonce + throughput; + + sha256t_hash_80(thr_id, throughput, pdata[19], work->nonces); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + + endiandata[19] = swab32(work->nonces[0]); + sha256t_hash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != UINT32_MAX) { + endiandata[19] = swab32(work->nonces[1]); + sha256t_hash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces++; + bn_set_target_ratio(work, vhash, 1); + } + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; + } + return work->valid_nonces; + } + else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_sha256t(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + sha256t_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/sia/sia-rpc.cpp b/sia/sia-rpc.cpp new file mode 100644 index 0000000000..4770426107 --- /dev/null +++ b/sia/sia-rpc.cpp @@ -0,0 +1,203 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sia-rpc.h" + +static bool sia_debug_diff = false; + +extern int share_result(int result, int pooln, double sharediff, const char *reason); + +/* compute nbits to get the network diff */ +static void calc_network_diff(struct work *work) +{ + uint32_t nbits = work->data[11]; // unsure if correct + uint32_t bits = (nbits & 0xffffff); + int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 + + uint64_t diffone = 0x0000FFFF00000000ull; + double d = (double)0x0000ffff / (double)bits; + + for (int m=shift; m < 29; m++) d *= 256.0; + for (int m=29; m < shift; m++) d /= 256.0; + if (sia_debug_diff) + applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); + + net_diff = d; +} + +// ---- SIA LONGPOLL -------------------------------------------------------------------------------- + +struct data_buffer { + void *buf; + size_t len; +}; + +static size_t sia_data_cb(const void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct data_buffer *db = (struct data_buffer *)user_data; + size_t len = size * nmemb; + size_t oldlen, newlen; + void *newmem; + static const uchar zero = 0; + + oldlen = db->len; + newlen = 
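+	// grow the accumulation buffer by the size of this curl chunk, plus one byte
+	// for the NUL terminator appended below so the body can be used as a C string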
oldlen + len; + + newmem = realloc(db->buf, newlen + 1); + if (!newmem) + return 0; + + db->buf = newmem; + db->len = newlen; + memcpy((char*)db->buf + oldlen, ptr, len); + memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */ + + return len; +} + +char* sia_getheader(CURL *curl, struct pool_infos *pool) +{ + char curl_err_str[CURL_ERROR_SIZE] = { 0 }; + struct data_buffer all_data = { 0 }; + struct curl_slist *headers = NULL; + char data[256] = { 0 }; + char url[512*3]; + + // nanopool + snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s", //&longpoll + pool->url, pool->user, pool->pass); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_POST, 0); + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, opt_timeout); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + + headers = curl_slist_append(headers, "Accept: application/octet-stream"); + headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr + headers = curl_slist_append(headers, "User-Agent: Sia-Agent"); // required for now +// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); +// headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll"); + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + int rc = curl_easy_perform(curl); + if (rc && strlen(curl_err_str)) { + applog(LOG_WARNING, "%s", curl_err_str); + } + + if (all_data.len >= 112) + cbin2hex(data, (const char*) all_data.buf, 112); + if (opt_protocol || all_data.len != 112) + applog(LOG_DEBUG, "received %d bytes: %s", (int) all_data.len, data); + + curl_slist_free_all(headers); + + return rc == 0 && all_data.len ? 
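+	// when the request succeeded and a body was received, hand back a heap copy
+	// (strdup) of the hex-encoded 112-byte header; otherwise NULL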
strdup(data) : NULL; +} + +bool sia_work_decode(const char *hexdata, struct work *work) +{ + uint8_t target[32]; + if (!work) return false; + + hex2bin((uchar*)target, &hexdata[0], 32); + swab256(work->target, target); + work->targetdiff = target_to_diff(work->target); + + hex2bin((uchar*)work->data, &hexdata[64], 80); + // high 16 bits of the 64 bits nonce + work->data[9] = rand() << 16; + + // use work ntime as job id + cbin2hex(work->job_id, (const char*)&work->data[10], 4); + calc_network_diff(work); + + if (stratum_diff != work->targetdiff) { + stratum_diff = work->targetdiff; + applog(LOG_WARNING, "Pool diff set to %g", stratum_diff); + } + + return true; +} + +bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work) +{ + char curl_err_str[CURL_ERROR_SIZE] = { 0 }; + struct data_buffer all_data = { 0 }; + struct curl_slist *headers = NULL; + char buf[256] = { 0 }; + char url[512*3]; + + if (opt_protocol) + applog_hex(work->data, 80); + //applog_hex(&work->data[8], 16); + //applog_hex(&work->data[10], 4); + + // nanopool + snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s", + pool->url, pool->user, pool->pass); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10); + + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb); + + memcpy(buf, work->data, 80); + curl_easy_setopt(curl, CURLOPT_POST, 1); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, 80); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, (void*) buf); + +// headers = curl_slist_append(headers, "Content-Type: application/octet-stream"); +// headers = curl_slist_append(headers, "Content-Length: 80"); + headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr + headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr + headers = curl_slist_append(headers, "User-Agent: Sia-Agent"); +// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + int res = curl_easy_perform(curl) == 0; + long errcode; + CURLcode c = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &errcode); + if (errcode != 204) { + if (strlen(curl_err_str)) + applog(LOG_ERR, "submit err %ld %s", errcode, curl_err_str); + res = 0; + } + share_result(res, work->pooln, work->sharediff[0], res ? 
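+	// only an HTTP 204 reply counts as accepted; on rejection the raw response
+	// body is forwarded as the reject reason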
NULL : (char*) all_data.buf); + + curl_slist_free_all(headers); + return true; +} + +// ---- END SIA LONGPOLL ---------------------------------------------------------------------------- diff --git a/sia/sia-rpc.h b/sia/sia-rpc.h new file mode 100644 index 0000000000..0b427e4723 --- /dev/null +++ b/sia/sia-rpc.h @@ -0,0 +1,6 @@ +#include + +char* sia_getheader(CURL *curl, struct pool_infos *pool); +bool sia_work_decode(const char *hexdata, struct work *work); +bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work); + diff --git a/sia/sia.cu b/sia/sia.cu new file mode 100644 index 0000000000..4ffdccbb6b --- /dev/null +++ b/sia/sia.cu @@ -0,0 +1,304 @@ +/** + * Blake2-B CUDA Implementation + * + * tpruvot@github July 2016 + * + */ + +#include + +#include +#include + +#include + +#include +#include + +#define TPB 512 +#define NBN 2 + +static uint32_t *d_resNonces[MAX_GPUS]; + +__device__ uint64_t d_data[10]; + +static __constant__ const int8_t blake2b_sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } , + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +}; + +// host mem align +#define A 64 + +extern "C" void sia_blake2b_hash(void *output, const void *input) +{ + uint8_t _ALIGN(A) hash[32]; + blake2b_ctx ctx; + + blake2b_init(&ctx, 32, NULL, 0); + blake2b_update(&ctx, input, 80); + blake2b_final(&ctx, hash); + + memcpy(output, hash, 32); +} + +// ---------------------------------------------------------------- + +__device__ __forceinline__ +static void G(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d, uint64_t const m[16]) +{ + a = a + b + m[ blake2b_sigma[r][2*i] ]; + ((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] ); + c = c + d; + ((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] ); + a = a + b + m[ blake2b_sigma[r][2*i+1] ]; + ((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] ); + c = c + d; + ((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +#define ROUND(r) \ + G(r, 0, v[0], v[4], v[ 8], v[12], m); \ + G(r, 1, v[1], v[5], v[ 9], v[13], m); \ + G(r, 2, v[2], v[6], v[10], v[14], m); \ + G(r, 3, v[3], v[7], v[11], v[15], m); \ + G(r, 4, v[0], v[5], v[10], v[15], m); \ + G(r, 5, v[1], v[6], v[11], v[12], m); \ + G(r, 6, v[2], v[7], v[ 8], v[13], m); \ + G(r, 7, v[3], v[4], v[ 9], v[14], m); + +// simplified for the last round +__device__ __forceinline__ +static void H(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d, uint64_t const m[16]) +{ + a = a + b + m[ blake2b_sigma[r][2*i] ]; + ((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] ); + c = c + d; + ((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] ); + a = a + b + m[ blake2b_sigma[r][2*i+1] ]; + ((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] ); + c = c + d; +} + +// we only check v[0] and v[8] +#define ROUND_F(r) \ + G(r, 0, v[0], v[4], v[ 8], 
v[12], m); \ + G(r, 1, v[1], v[5], v[ 9], v[13], m); \ + G(r, 2, v[2], v[6], v[10], v[14], m); \ + G(r, 3, v[3], v[7], v[11], v[15], m); \ + G(r, 4, v[0], v[5], v[10], v[15], m); \ + G(r, 5, v[1], v[6], v[11], v[12], m); \ + H(r, 6, v[2], v[7], v[ 8], v[13], m); + +__global__ +//__launch_bounds__(128, 8) /* to force 64 regs */ +void sia_blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2) +{ + const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce; + __shared__ uint64_t s_target; + if (!threadIdx.x) s_target = devectorize(target2); + + uint64_t m[16]; + + m[0] = d_data[0]; + m[1] = d_data[1]; + m[2] = d_data[2]; + m[3] = d_data[3]; + m[4] = d_data[4] | nonce; + m[5] = d_data[5]; + m[6] = d_data[6]; + m[7] = d_data[7]; + m[8] = d_data[8]; + m[9] = d_data[9]; + + m[10] = m[11] = 0; + m[12] = m[13] = m[14] = m[15] = 0; + + uint64_t v[16] = { + 0x6a09e667f2bdc928, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade68281, 0x9b05688c2b3e6c1f, 0xe07c265404be4294, 0x5be0cd19137e2179 + }; + + ROUND( 0 ); + ROUND( 1 ); + ROUND( 2 ); + ROUND( 3 ); + ROUND( 4 ); + ROUND( 5 ); + ROUND( 6 ); + ROUND( 7 ); + ROUND( 8 ); + ROUND( 9 ); + ROUND( 10 ); + ROUND_F( 11 ); + + uint64_t h64 = cuda_swab64(0x6a09e667f2bdc928 ^ v[0] ^ v[8]); + if (h64 <= s_target) { + resNonce[1] = resNonce[0]; + resNonce[0] = nonce; + s_target = h64; + } + // if (!nonce) printf("%016lx ", s_target); +} + +__host__ +uint32_t sia_blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce) +{ + uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX }; + uint32_t result = UINT32_MAX; + + dim3 grid((threads + TPB-1)/TPB); + dim3 block(TPB); + + /* Check error on Ctrl+C or kill to prevent segfaults on exit */ + if (cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) + return result; + + sia_blake2b_gpu_hash <<>> (threads, startNonce, d_resNonces[thr_id], target2); + cudaThreadSynchronize(); + + if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + result = resNonces[0]; + secNonce = resNonces[1]; + if (secNonce == result) secNonce = UINT32_MAX; + } + return result; +} + +__host__ +void sia_blake2b_setBlock(uint32_t *data) +{ + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice)); +} + +static bool init[MAX_GPUS] = { 0 }; + +int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(A) hash[8]; + uint32_t _ALIGN(A) vhashcpu[8]; + uint32_t _ALIGN(A) inputdata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[8]; + + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 
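+		// default intensity by GPU generation: 2^28 nonces per launch on SM 5.0+
+		// outside Windows, 2^26 on SM 5.2+ under Windows (set just below),
+		// 2^22 below SM 3.5, and 2^25 otherwise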
28 : 25; + if (device_sm[dev_id] >= 520 && is_windows()) intensity = 26; + if (device_sm[dev_id] < 350) intensity = 22; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonces[thr_id], NBN * sizeof(uint32_t)), -1); + init[thr_id] = true; + } + + memcpy(inputdata, pdata, 80); + inputdata[11] = 0; // nbits + + const uint2 target = make_uint2(ptarget[6], ptarget[7]); + + sia_blake2b_setBlock(inputdata); + + do { + work->nonces[0] = sia_blake2b_hash_cuda(thr_id, throughput, pdata[8], target, work->nonces[1]); + + *hashes_done = pdata[8] - first_nonce + throughput; + + if (work->nonces[0] != UINT32_MAX) + { + work->valid_nonces = 0; + inputdata[8] = work->nonces[0]; + sia_blake2b_hash(hash, inputdata); + if (swab32(hash[0]) <= Htarg) { + // sia hash target is reversed (start of hash) + swab256(vhashcpu, hash); + if (fulltest(vhashcpu, ptarget)) { + work_set_target_ratio(work, vhashcpu); + work->valid_nonces++; + pdata[8] = work->nonces[0] + 1; + } + } else { + gpu_increment_reject(thr_id); + } + + if (work->nonces[1] != UINT32_MAX) { + inputdata[8] = work->nonces[1]; + sia_blake2b_hash(hash, inputdata); + if (swab32(hash[0]) <= Htarg) { + swab256(vhashcpu, hash); + if (fulltest(vhashcpu, ptarget)) { + if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio[0]) { + work->sharediff[1] = work->sharediff[0]; + work->shareratio[1] = work->shareratio[0]; + xchg(work->nonces[1], work->nonces[0]); + work_set_target_ratio(work, vhashcpu); + } else { + bn_set_target_ratio(work, vhashcpu, 1); + } + work->valid_nonces++; + pdata[8] = work->nonces[1] + 1; + } + } else { + gpu_increment_reject(thr_id); + } + } + if (work->valid_nonces) { + return work->valid_nonces; + } + } + + if ((uint64_t) throughput + pdata[8] >= max_nonce) { + pdata[8] = max_nonce; + break; + } + + pdata[8] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[8] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_sia(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_resNonces[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/skein.cu b/skein.cu new file mode 100644 index 0000000000..568839f840 --- /dev/null +++ b/skein.cu @@ -0,0 +1,484 @@ +/** + * SKEIN512 80 + SHA256 64 + * by tpruvot@github - 2015 + */ + +#include "sph/sph_skein.h" + +#include "miner.h" +#include "cuda_helper.h" + +#include + +static uint32_t *d_hash[MAX_GPUS]; +static __thread bool sm5 = true; + +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); +extern void skein512_cpu_setBlock_80(void *pdata); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); + +extern void skeincoin_init(int thr_id); +extern void skeincoin_free(int thr_id); +extern void skeincoin_setBlock_80(int thr_id, void *pdata); +extern uint32_t skeincoin_hash_sm5(int thr_id, uint32_t threads, uint32_t startNounce, int swap, uint64_t target64, uint32_t *secNonce); + +static __device__ uint32_t sha256_hashTable[] = { + 
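+	// SHA-256 initial state H0..H7 (fractional parts of the square roots of the
+	// first eight primes), per FIPS 180-4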
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static __device__ __constant__ uint32_t sha256_constantTable[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +static __device__ __constant__ uint32_t sha256_endingTable[] = { + 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200, + 0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020, + 0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549, + 0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91, + 0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7, + 0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d, + 0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 +}; + +/* Elementary functions used by SHA256 */ +#define SWAB32(x) cuda_swab32(x) +//#define ROTR32(x,n) SPH_ROTR32(x,n) + +#define R(x, n) ((x) >> (n)) +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) +#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) + +#define ADVANCED_SHA2 + +#ifndef ADVANCED_SHA2 + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i) \ + RND(S[(64 - i) & 7], S[(65 - i) & 7], \ + S[(66 - i) & 7], S[(67 - i) & 7], \ + S[(68 - i) & 7], S[(69 - i) & 7], \ + S[(70 - i) & 7], S[(71 - i) & 7], \ + W[i] + sha256_constantTable[i]) + +static __constant__ uint32_t sha256_ending[16] = { + 0x80000000UL, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x200UL +}; +__device__ +void sha256_transform_gpu(uint32_t *state, uint32_t *message) +{ + uint32_t S[8]; + uint32_t W[64]; + uint32_t t0, t1; + + /* Initialize work variables. */ + for (int i = 0; i < 8; i++) { + S[i] = state[i]; + } + + for (int i = 0; i < 16; i++) { + W[i] = message[i]; + } + + for (int i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i + 1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + /* 3. Mix. 
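+	   Each of the 64 RNDr rounds below applies one SHA-256 compression step; the
+	   (64 - i) & 7 index arithmetic renames the eight working variables instead of
+	   copying them, and the unrolled loop lets the compiler resolve the indices at
+	   compile time.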
*/ + #pragma unroll + for (int i = 0; i < 64; i++) { + RNDr(S, W, i); + } + + for (int i = 0; i < 8; i++) + state[i] += S[i]; +} +#endif + +#ifdef ADVANCED_SHA2 +__device__ +void skeincoin_gpu_sha256(uint32_t *message) +{ + uint32_t W1[16]; + uint32_t W2[16]; + + uint32_t regs[8]; + uint32_t hash[8]; + + // Init with Hash-Table + #pragma unroll 8 + for (int k=0; k < 8; k++) { + hash[k] = regs[k] = sha256_hashTable[k]; + } + + #pragma unroll 16 + for (int k = 0; k<16; k++) + W1[k] = SWAB32(message[k]); + + // Progress W1 + #pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + // Progress W2...W3 + + ////// PART 1 + #pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + #pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + + #pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function + #pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 16] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + ////// PART 2 + #pragma unroll 2 + for (int j = 0; j<2; j++) + W1[j] = s1(W2[14 + j]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + + #pragma unroll 5 + for (int j = 2; j<7; j++) + W1[j] = s1(W1[j - 2]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; + + #pragma unroll 8 + for (int j = 7; j<15; j++) + W1[j] = s1(W1[j - 2]) + W1[j - 7] + s0(W2[1 + j]) + W2[j]; + + W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; + + // Round function + #pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 32] + W1[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + ////// PART 3 + #pragma unroll 2 + for (int j = 0; j<2; j++) + W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + + #pragma unroll 5 + for (int j = 2; j<7; j++) + W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; + + #pragma unroll 8 + for (int j = 7; j<15; j++) + W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; + + W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; + + // Round function + #pragma unroll 16 + for (int j = 0; j<16; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 48] + W2[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + #pragma unroll 8 + for (int k = 0; k<8; k++) + hash[k] += regs[k]; + +#if 1 + ///// + ///// Second Pass (ending) + ///// + #pragma unroll 8 + for (int k = 0; k<8; k++) + regs[k] = hash[k]; + + // Progress W1 + #pragma unroll 64 + for (int j = 0; j<64; j++) + { + uint32_t T1, T2; + T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + 
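+			// second compression: the input is the fixed padding block of a 64-byte
+			// message (0x80000000, zeros, length 0x200), so its fully expanded message
+			// schedule is precomputed in sha256_endingTable and each round only adds
+			// the matching K and W constants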
sha256_constantTable[j] + sha256_endingTable[j]; + T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); + + #pragma unroll 7 + for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k]; + regs[0] = T1 + T2; + regs[4] += T1; + } + + #pragma unroll 8 + for (int k = 0; k<8; k++) + hash[k] += regs[k]; + + // Final Hash + #pragma unroll 8 + for (int k = 0; k<8; k++) + message[k] = SWAB32(hash[k]); +#else + // sha256_transform only, require an additional sha256_transform_gpu() call + #pragma unroll 8 + for (int k = 0; k<8; k++) + message[k] = hash[k]; +#endif +} +#endif + +__global__ +void sha2_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *hash = &hashBuffer[thread << 4]; +#ifdef ADVANCED_SHA2 + skeincoin_gpu_sha256(hash); +#else + uint32_t state[16]; + uint32_t msg[16]; + #pragma unroll + for (int i = 0; i < 8; i++) + state[i] = sha256_hashTable[i]; + + #pragma unroll + for (int i = 0; i < 16; i++) + msg[i] = SWAB32(hash[i]); + + sha256_transform_gpu(state, msg); + sha256_transform_gpu(state, sha256_ending); + + #pragma unroll + for (int i = 0; i < 8; i++) + hash[i] = SWAB32(state[i]); +#endif + } +} + +__host__ +void sha2_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHashes) +{ + uint32_t threadsperblock = 128; + dim3 block(threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + + sha2_gpu_hash_64 <<< grid, block >>>(threads, startNounce, d_outputHashes); + + // required once per scan loop to prevent cpu 100% usage (linux) + MyStreamSynchronize(NULL, 0, thr_id); +} + +extern "C" void skeincoinhash(void *output, const void *input) +{ + sph_skein512_context ctx_skein; + SHA256_CTX sha256; + + uint32_t hash[16]; + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, hash); + + SHA256_Init(&sha256); + SHA256_Update(&sha256, (unsigned char *)hash, 64); + SHA256_Final((unsigned char *)hash, &sha256); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + + sm5 = (device_sm[device_map[thr_id]] >= 500); + bool checkSecnonce = (have_stratum || have_longpoll) && !sm5; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); + if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce)); + + uint64_t target64 = 0; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x03; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + + if (sm5) { + skeincoin_init(thr_id); + } else { + cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput); + quark_skein512_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughput); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } + + init[thr_id] = true; + } + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + if (sm5) { + skeincoin_setBlock_80(thr_id, (void*)endiandata); + target64 = 
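+		// most significant 64 bits of the 256-bit target (ptarget words 6..7),
+		// passed to skeincoin_hash_sm5() so candidates can be filtered on the GPU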
((uint64_t*)ptarget)[3]; + } else { + skein512_cpu_setBlock_80((void*)endiandata); + cuda_check_cpu_setTarget(ptarget); + } + + do { + // Hash with CUDA + *hashes_done = pdata[19] - first_nonce + throughput; + + if (sm5) { + /* cuda_skeincoin.cu */ + work->nonces[0] = skeincoin_hash_sm5(thr_id, throughput, pdata[19], 1, target64, &work->nonces[1]); + } else { + /* quark/cuda_skein512.cu */ + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + sha2_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + } + + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + + endiandata[19] = swab32(work->nonces[0]); + skeincoinhash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (checkSecnonce) { + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], work->valid_nonces); + if (work->nonces[1] != 0) { + endiandata[19] = swab32(work->nonces[1]); + skeincoinhash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces++; + bn_set_target_ratio(work, vhash, 1); + } + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; + } + } else { + pdata[19] = work->nonces[0] + 1; // cursor for next scan + } + return work->valid_nonces; + } + else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_skeincoin(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + if (sm5) + skeincoin_free(thr_id); + else { + cudaFree(d_hash[thr_id]); + cuda_check_cpu_free(thr_id); + } + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/skein2.cpp b/skein2.cpp new file mode 100644 index 0000000000..2d57f2d4a8 --- /dev/null +++ b/skein2.cpp @@ -0,0 +1,149 @@ +/** + * SKEIN512 80 + SKEIN512 64 (Woodcoin) + * by tpruvot@github - 2015 + */ +#include + +#include "sph/sph_skein.h" + +#include "miner.h" +#include "cuda_helper.h" + +static uint32_t *d_hash[MAX_GPUS]; + +extern void skein512_cpu_setBlock_80(void *pdata); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); + +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +void skein2hash(void *output, const void *input) +{ + uint32_t _ALIGN(64) hash[16]; + sph_skein512_context ctx_skein; + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hash); + + memcpy(output, (void*) hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + int dev_id = device_map[thr_id]; + uint32_t 
*pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8 + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput); + + quark_skein512_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + skein512_cpu_setBlock_80((void*)endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // Hash with CUDA + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + + endiandata[19] = swab32(work->nonces[0]); + skein2hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + endiandata[19] = swab32(work->nonces[1]); + skein2hash(vhash, endiandata); + work->valid_nonces++; + bn_set_target_ratio(work, vhash, 1); + gpulog(LOG_DEBUG, thr_id, "found second nonce %08x!", endiandata[19]); + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor for next scan + } + return work->valid_nonces; + } + else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +void free_skein2(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/skunk/cuda_skunk.cu b/skunk/cuda_skunk.cu new file mode 100644 index 0000000000..cd4c269f7a --- /dev/null +++ b/skunk/cuda_skunk.cu @@ -0,0 +1,701 @@ +/** + * skein + cube + fugue merged kernel, based on krnlx work + * + * based on alexis78 sib kernels, final touch by tpruvot + */ + +#include +#include +#include "skunk/skein_header.h" +#include + +#define TPB 512 + +/* ************************ */ +static __constant__ uint2 c_buffer[120]; // padded message (80 bytes + 72 bytes midstate + align) + +#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } + +__device__ __forceinline__ +static void rrounds(uint32_t *x){ + #pragma unroll 2 + for (int r = 0; r < 
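+	// 16 rounds of the CubeHash round function over the 32-word state x[0..31];
+	// each round is the add/rotate/swap/xor sequence (rotations by 7 and 11 bits)
+	// spelled out by the quoted spec comments below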
16; r++) { + /* "add x_0jklm into x_1jklmn modulo 2^32 rotate x_0jklm upwards by 7 bits" */ + x[16] = x[16] + x[ 0]; x[ 0] = ROTL32(x[ 0], 7);x[17] = x[17] + x[ 1];x[ 1] = ROTL32(x[ 1], 7); + x[18] = x[18] + x[ 2]; x[ 2] = ROTL32(x[ 2], 7);x[19] = x[19] + x[ 3];x[ 3] = ROTL32(x[ 3], 7); + x[20] = x[20] + x[ 4]; x[ 4] = ROTL32(x[ 4], 7);x[21] = x[21] + x[ 5];x[ 5] = ROTL32(x[ 5], 7); + x[22] = x[22] + x[ 6]; x[ 6] = ROTL32(x[ 6], 7);x[23] = x[23] + x[ 7];x[ 7] = ROTL32(x[ 7], 7); + x[24] = x[24] + x[ 8]; x[ 8] = ROTL32(x[ 8], 7);x[25] = x[25] + x[ 9];x[ 9] = ROTL32(x[ 9], 7); + x[26] = x[26] + x[10]; x[10] = ROTL32(x[10], 7);x[27] = x[27] + x[11];x[11] = ROTL32(x[11], 7); + x[28] = x[28] + x[12]; x[12] = ROTL32(x[12], 7);x[29] = x[29] + x[13];x[13] = ROTL32(x[13], 7); + x[30] = x[30] + x[14]; x[14] = ROTL32(x[14], 7);x[31] = x[31] + x[15];x[15] = ROTL32(x[15], 7); + /* "swap x_00klm with x_01klm" */ + SWAP(x[ 0], x[ 8]);x[ 0] ^= x[16];x[ 8] ^= x[24];SWAP(x[ 1], x[ 9]);x[ 1] ^= x[17];x[ 9] ^= x[25]; + SWAP(x[ 2], x[10]);x[ 2] ^= x[18];x[10] ^= x[26];SWAP(x[ 3], x[11]);x[ 3] ^= x[19];x[11] ^= x[27]; + SWAP(x[ 4], x[12]);x[ 4] ^= x[20];x[12] ^= x[28];SWAP(x[ 5], x[13]);x[ 5] ^= x[21];x[13] ^= x[29]; + SWAP(x[ 6], x[14]);x[ 6] ^= x[22];x[14] ^= x[30];SWAP(x[ 7], x[15]);x[ 7] ^= x[23];x[15] ^= x[31]; + /* "swap x_1jk0m with x_1jk1m" */ + SWAP(x[16], x[18]); SWAP(x[17], x[19]); SWAP(x[20], x[22]); SWAP(x[21], x[23]); + SWAP(x[24], x[26]); SWAP(x[25], x[27]); SWAP(x[28], x[30]); SWAP(x[29], x[31]); + /* "add x_0jklm into x_1jklm modulo 2^32 rotate x_0jklm upwards by 11 bits" */ + x[16] = x[16] + x[ 0]; x[ 0] = ROTL32(x[ 0],11);x[17] = x[17] + x[ 1];x[ 1] = ROTL32(x[ 1],11); + x[18] = x[18] + x[ 2]; x[ 2] = ROTL32(x[ 2],11);x[19] = x[19] + x[ 3];x[ 3] = ROTL32(x[ 3],11); + x[20] = x[20] + x[ 4]; x[ 4] = ROTL32(x[ 4],11);x[21] = x[21] + x[ 5];x[ 5] = ROTL32(x[ 5],11); + x[22] = x[22] + x[ 6]; x[ 6] = ROTL32(x[ 6],11);x[23] = x[23] + x[ 7];x[ 7] = ROTL32(x[ 7],11); + x[24] = x[24] + x[ 8]; x[ 8] = ROTL32(x[ 8],11);x[25] = x[25] + x[ 9];x[ 9] = ROTL32(x[ 9],11); + x[26] = x[26] + x[10]; x[10] = ROTL32(x[10],11);x[27] = x[27] + x[11];x[11] = ROTL32(x[11],11); + x[28] = x[28] + x[12]; x[12] = ROTL32(x[12],11);x[29] = x[29] + x[13];x[13] = ROTL32(x[13],11); + x[30] = x[30] + x[14]; x[14] = ROTL32(x[14],11);x[31] = x[31] + x[15];x[15] = ROTL32(x[15],11); + /* "swap x_0j0lm with x_0j1lm" */ + SWAP(x[ 0], x[ 4]); x[ 0] ^= x[16]; x[ 4] ^= x[20]; SWAP(x[ 1], x[ 5]); x[ 1] ^= x[17]; x[ 5] ^= x[21]; + SWAP(x[ 2], x[ 6]); x[ 2] ^= x[18]; x[ 6] ^= x[22]; SWAP(x[ 3], x[ 7]); x[ 3] ^= x[19]; x[ 7] ^= x[23]; + SWAP(x[ 8], x[12]); x[ 8] ^= x[24]; x[12] ^= x[28]; SWAP(x[ 9], x[13]); x[ 9] ^= x[25]; x[13] ^= x[29]; + SWAP(x[10], x[14]); x[10] ^= x[26]; x[14] ^= x[30]; SWAP(x[11], x[15]); x[11] ^= x[27]; x[15] ^= x[31]; + /* "swap x_1jkl0 with x_1jkl1" */ + SWAP(x[16], x[17]); SWAP(x[18], x[19]); SWAP(x[20], x[21]); SWAP(x[22], x[23]); + SWAP(x[24], x[25]); SWAP(x[26], x[27]); SWAP(x[28], x[29]); SWAP(x[30], x[31]); + } +} + +// fugue +static __constant__ const uint32_t c_S[16] = { + 0x8807a57e, 0xe616af75, 0xc5d3e4db, 0xac9ab027, + 0xd915f117, 0xb6eecc54, 0x06e8020b, 0x4a92efd1, + 0xaac6e2c9, 0xddb21398, 0xcae65838, 0x437f203f, + 0x25ea78e7, 0x951fddd6, 0xda6ed11d, 0xe13e3567 +}; + +static __device__ uint32_t mixtab0[256] = { + 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39, + 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3, 
+ 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed, + 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d, + 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, + 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54, + 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, + 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, + 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, + 0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, + 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, + 0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, + 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, + 0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, + 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9, + 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, + 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c, + 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, + 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516, + 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, + 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b, + 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, + 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, + 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, + 0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, + 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, + 0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, + 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, + 0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, + 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51, + 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb, + 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258 +}; + +__device__ __forceinline__ +uint32_t ROL8X(const uint32_t a){ + return __byte_perm(a, 0, 0x2103); +} +__device__ __forceinline__ +uint32_t ROL16X(const uint32_t a){ + return __byte_perm(a, 0, 0x1032); +} +__device__ __forceinline__ +uint32_t ROR8X(const uint32_t a){ + return __byte_perm(a, 0, 0x0321); +} + +#define mixtab0(x) shared[0][x] +#define mixtab1(x) shared[1][x] +#define mixtab2(x) shared[2][x] +#define mixtab3(x) shared[3][x] + +#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ + x22 ^= x00; \ + x00 = (q); \ + x08 ^= (q); \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ + } + 
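+/*
+ * Fugue-512 primitives used by FUGUE512_3 below. The hash state is 36 words
+ * S[0..35]: TIX4 injects one 32-bit message word and applies the internal
+ * feedback XORs, CMIX36 XORs one column of three state words into two other
+ * columns, and SMIX / SMIX_LDG perform the AES-S-box based "super-mix" of four
+ * state words through the precomputed mixtab tables (SMIX_LDG routes part of
+ * the lookups through __ldg so they use the read-only cache).
+ */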
+#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ + } + +__device__ __forceinline__ +static void SMIX(const uint32_t shared[4][256], uint32_t &x0,uint32_t &x1,uint32_t &x2,uint32_t &x3){ + uint32_t c0 = mixtab0(__byte_perm(x0,0,0x4443)); + uint32_t r1 = mixtab1(__byte_perm(x0,0,0x4442)); + uint32_t r2 = mixtab2(__byte_perm(x0,0,0x4441)); + uint32_t r3 = mixtab3(__byte_perm(x0,0,0x4440)); + c0 = c0 ^ r1 ^ r2 ^ r3; + uint32_t r0 = mixtab0(__byte_perm(x1,0,0x4443)); + uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1,0,0x4442)); + uint32_t tmp = mixtab2(__byte_perm(x1,0,0x4441)); + c1 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x1,0,0x4440)); + c1 ^= tmp; + r3 ^= tmp; + uint32_t c2 = mixtab0(__byte_perm(x2,0,0x4443)); + r0 ^= c2; + tmp = mixtab1(__byte_perm(x2,0,0x4442)); + c2 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x2,0,0x4441)); + c2 ^= tmp; + tmp = mixtab3(__byte_perm(x2,0,0x4440)); + c2 ^= tmp; + r3 ^= tmp; + uint32_t c3 = mixtab0(__byte_perm(x3,0,0x4443)); + r0 ^= c3; + tmp = mixtab1(__byte_perm(x3,0,0x4442)); + c3 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x3,0,0x4441)); + c3 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x3,0,0x4440)); + c3 ^= tmp; + x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF); + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >>24)) & 0x000000FF); + x2 = ((c2 ^ (r0 <<16)) & 0xFF000000) | ((c3 ^ (r1 <<16)) & 0x00FF0000) | ((c0 ^ (r2 >>16)) & 0x0000FF00) | ((c1 ^ (r3 >>16)) & 0x000000FF); + x3 = ((c3 ^ (r0 <<24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); +} + +__device__ +static void SMIX_LDG(const uint32_t shared[4][256], uint32_t &x0,uint32_t &x1,uint32_t &x2,uint32_t &x3){ + uint32_t c0 = __ldg(&mixtab0[__byte_perm(x0,0,0x4443)]); + uint32_t r1 = mixtab1(__byte_perm(x0,0,0x4442)); + uint32_t r2 = mixtab2(__byte_perm(x0,0,0x4441)); + uint32_t r3 = mixtab3(__byte_perm(x0,0,0x4440)); + c0 = c0 ^ r1 ^ r2 ^ r3; + uint32_t r0 = __ldg(&mixtab0[__byte_perm(x1,0,0x4443)]); + uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1,0,0x4442)); + uint32_t tmp = mixtab2(__byte_perm(x1,0,0x4441)); + c1 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x1,0,0x4440)); + c1 ^= tmp; + r3 ^= tmp; + uint32_t c2 = __ldg(&mixtab0[__byte_perm(x2,0,0x4443)]); + r0 ^= c2; + tmp = mixtab1(__byte_perm(x2,0,0x4442)); + c2 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x2,0,0x4441)); + c2 ^= tmp; + tmp = mixtab3(__byte_perm(x2,0,0x4440)); + c2 ^= tmp; + r3 ^= tmp; + uint32_t c3 = __ldg(&mixtab0[__byte_perm(x3,0,0x4443)]); + r0 ^= c3; + tmp = mixtab1(__byte_perm(x3,0,0x4442)); + c3 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x3,0,0x4441)); + c3 ^= tmp; + r2 ^= tmp; + tmp = ROL8X(__ldg(&mixtab0[__byte_perm(x3,0,0x4440)])); + c3 ^= tmp; + x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF); + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >>24)) & 0x000000FF); + x2 = ((c2 ^ (r0 <<16)) & 0xFF000000) | ((c3 ^ (r1 <<16)) & 0x00FF0000) | ((c0 ^ (r2 >>16)) & 0x0000FF00) | ((c1 ^ (r3 >>16)) & 0x000000FF); + x3 = ((c3 ^ (r0 <<24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 
0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); +} + +#define mROR3 { \ + B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \ + S[35] = S[32]; S[34] = S[31]; S[33] = S[30]; S[32] = S[29]; S[31] = S[28]; S[30] = S[27]; S[29] = S[26]; S[28] = S[25]; S[27] = S[24]; \ + S[26] = S[23]; S[25] = S[22]; S[24] = S[21]; S[23] = S[20]; S[22] = S[19]; S[21] = S[18]; S[20] = S[17]; S[19] = S[16]; S[18] = S[15]; \ + S[17] = S[14]; S[16] = S[13]; S[15] = S[12]; S[14] = S[11]; S[13] = S[10]; S[12] = S[ 9]; S[11] = S[ 8]; S[10] = S[ 7]; S[ 9] = S[ 6]; \ + S[ 8] = S[ 5]; S[ 7] = S[ 4]; S[ 6] = S[ 3]; S[ 5] = S[ 2]; S[ 4] = S[ 1]; S[ 3] = S[ 0]; S[ 2] = B[ 8]; S[ 1] = B[ 7]; S[ 0] = B[ 6]; \ + } + +#define mROR8 { \ + B[ 1] = S[28], B[ 2] = S[29], B[ 3] = S[30], B[ 4] = S[31], B[ 5] = S[32], B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \ + S[35] = S[27]; S[34] = S[26]; S[33] = S[25]; S[32] = S[24]; S[31] = S[23]; S[30] = S[22]; S[29] = S[21]; S[28] = S[20]; S[27] = S[19]; \ + S[26] = S[18]; S[25] = S[17]; S[24] = S[16]; S[23] = S[15]; S[22] = S[14]; S[21] = S[13]; S[20] = S[12]; S[19] = S[11]; S[18] = S[10]; \ + S[17] = S[ 9]; S[16] = S[ 8]; S[15] = S[ 7]; S[14] = S[ 6]; S[13] = S[ 5]; S[12] = S[ 4]; S[11] = S[ 3]; S[10] = S[ 2]; S[ 9] = S[ 1]; \ + S[ 8] = S[ 0]; S[ 7] = B[ 8]; S[ 6] = B[ 7]; S[ 5] = B[ 6]; S[ 4] = B[ 5]; S[ 3] = B[ 4]; S[ 2] = B[ 3]; S[ 1] = B[ 2]; S[ 0] = B[ 1]; \ + } + +#define mROR9 { \ + B[ 0] = S[27], B[ 1] = S[28], B[ 2] = S[29], B[ 3] = S[30], B[ 4] = S[31], B[ 5] = S[32], B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \ + S[35] = S[26]; S[34] = S[25]; S[33] = S[24]; S[32] = S[23]; S[31] = S[22]; S[30] = S[21]; S[29] = S[20]; S[28] = S[19]; S[27] = S[18]; \ + S[26] = S[17]; S[25] = S[16]; S[24] = S[15]; S[23] = S[14]; S[22] = S[13]; S[21] = S[12]; S[20] = S[11]; S[19] = S[10]; S[18] = S[ 9]; \ + S[17] = S[ 8]; S[16] = S[ 7]; S[15] = S[ 6]; S[14] = S[ 5]; S[13] = S[ 4]; S[12] = S[ 3]; S[11] = S[ 2]; S[10] = S[ 1]; S[ 9] = S[ 0]; \ + S[ 8] = B[ 8]; S[ 7] = B[ 7]; S[ 6] = B[ 6]; S[ 5] = B[ 5]; S[ 4] = B[ 4]; S[ 3] = B[ 3]; S[ 2] = B[ 2]; S[ 1] = B[ 1]; S[ 0] = B[ 0]; \ + } + +#define FUGUE512_3(x, y, z) { \ + TIX4(x, S[ 0], S[ 1], S[ 4], S[ 7], S[ 8], S[22], S[24], S[27], S[30]); \ + CMIX36(S[33], S[34], S[35], S[ 1], S[ 2], S[ 3], S[15], S[16], S[17]); \ + SMIX_LDG(shared, S[33], S[34], S[35], S[ 0]); \ + CMIX36(S[30], S[31], S[32], S[34], S[35], S[ 0], S[12], S[13], S[14]); \ + SMIX_LDG(shared, S[30], S[31], S[32], S[33]); \ + CMIX36(S[27], S[28], S[29], S[31], S[32], S[33], S[ 9], S[10], S[11]); \ + SMIX(shared, S[27], S[28], S[29], S[30]); \ + CMIX36(S[24], S[25], S[26], S[28], S[29], S[30], S[ 6], S[ 7], S[ 8]); \ + SMIX_LDG(shared, S[24], S[25], S[26], S[27]); \ + \ + TIX4(y, S[24], S[25], S[28], S[31], S[32], S[10], S[12], S[15], S[18]); \ + CMIX36(S[21], S[22], S[23], S[25], S[26], S[27], S[ 3], S[ 4], S[ 5]); \ + SMIX(shared, S[21], S[22], S[23], S[24]); \ + CMIX36(S[18], S[19], S[20], S[22], S[23], S[24], S[ 0], S[ 1], S[ 2]); \ + SMIX_LDG(shared, S[18], S[19], S[20], S[21]); \ + CMIX36(S[15], S[16], S[17], S[19], S[20], S[21], S[33], S[34], S[35]); \ + SMIX_LDG(shared, S[15], S[16], S[17], S[18]); \ + CMIX36(S[12], S[13], S[14], S[16], S[17], S[18], S[30], S[31], S[32]); \ + SMIX(shared, S[12], S[13], S[14], S[15]); \ + \ + TIX4(z, S[12], S[13], S[16], S[19], S[20], S[34], S[ 0], S[ 3], S[ 6]); \ + CMIX36(S[ 9], S[10], S[11], S[13], S[14], S[15], S[27], S[28], S[29]); \ + SMIX_LDG(shared, S[ 9], S[10], S[11], S[12]); \ + CMIX36(S[ 6], S[ 7], S[ 8], S[10], S[11], S[12], S[24], S[25], 
S[26]); \ + SMIX_LDG(shared, S[ 6], S[ 7], S[ 8], S[ 9]); \ + CMIX36(S[ 3], S[ 4], S[ 5], S[ 7], S[ 8], S[ 9], S[21], S[22], S[23]); \ + SMIX_LDG(shared, S[ 3], S[ 4], S[ 5], S[ 6]); \ + CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); \ + SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); \ + } + +__global__ +__launch_bounds__(TPB, 2) +void skunk_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *output64) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + __shared__ uint32_t shared[4][256]; + + if(threadIdx.x<256) { + const uint32_t tmp = __ldg(&mixtab0[threadIdx.x]); + shared[0][threadIdx.x] = tmp; + shared[1][threadIdx.x] = ROR8X(tmp); + shared[2][threadIdx.x] = ROL16X(tmp); + shared[3][threadIdx.x] = ROL8X(tmp); + } + __syncthreads(); + + if (thread < threads) + { + // Skein + uint2 h[9]; + uint2 t0, t1, t2; + + uint32_t nonce = cuda_swab32(startNounce + thread); + uint2 nonce2 = make_uint2(c_buffer[0].x, nonce); + + uint2 p[8]; + p[1] = nonce2; + + h[0] = c_buffer[ 1]; + h[1] = c_buffer[ 2]; + h[2] = c_buffer[ 3]; + h[3] = c_buffer[ 4]; + h[4] = c_buffer[ 5]; + h[5] = c_buffer[ 6]; + h[6] = c_buffer[ 7]; + h[7] = c_buffer[ 8]; + h[8] = c_buffer[ 9]; + + t0 = vectorize(0x50ull); + t1 = vectorize(0xB000000000000000ull); + t2 = t0^t1; + + p[ 1]=nonce2 + h[1]; p[ 0]= c_buffer[10] + p[ 1]; + p[ 2]=c_buffer[11]; + p[ 3]=c_buffer[12]; + p[ 4]=c_buffer[13]; + p[ 5]=c_buffer[14]; + p[ 6]=c_buffer[15]; + p[ 7]=c_buffer[16]; + +// TFBIGMIX8e(); + p[1] = ROL2(p[1], 46) ^ p[0]; + p[2] += p[1]; + p[0] += p[3]; + p[1] = ROL2(p[1], 33) ^ p[2]; + p[3] = c_buffer[17] ^ p[0]; + p[4] += p[1]; + p[6] += p[3]; + p[0] += p[5]; + p[2] += p[7]; + p[1] = ROL2(p[1], 17) ^ p[4]; + p[3] = ROL2(p[3], 49) ^ p[6]; + p[5] = c_buffer[18] ^ p[0]; + p[7] = c_buffer[19] ^ p[2]; + p[6] += p[1]; + p[0] += p[7]; + p[2] += p[5]; + p[4] += p[3]; + p[1] = ROL2(p[1], 44) ^ p[6]; + p[7] = ROL2(p[7], 9) ^ p[0]; + p[5] = ROL2(p[5], 54) ^ p[2]; + p[3] = ROR8(p[3]) ^ p[4]; + + p[ 0]+=h[1]; p[ 1]+=h[2]; p[ 2]+=h[3]; p[ 3]+=h[4]; p[ 4]+=h[5]; p[ 5]+=c_buffer[20]; p[ 7]+=c_buffer[21]; p[ 6]+=c_buffer[22]; + TFBIGMIX8o(); + p[ 0]+=h[2]; p[ 1]+=h[3]; p[ 2]+=h[4]; p[ 3]+=h[5]; p[ 4]+=h[6]; p[ 5]+=c_buffer[22]; p[ 7]+=c_buffer[23]; p[ 6]+=c_buffer[24]; + TFBIGMIX8e(); + p[ 0]+=h[3]; p[ 1]+=h[4]; p[ 2]+=h[5]; p[ 3]+=h[6]; p[ 4]+=h[7]; p[ 5]+=c_buffer[24]; p[ 7]+=c_buffer[25]; p[ 6]+=c_buffer[26]; + TFBIGMIX8o(); + p[ 0]+=h[4]; p[ 1]+=h[5]; p[ 2]+=h[6]; p[ 3]+=h[7]; p[ 4]+=h[8]; p[ 5]+=c_buffer[26]; p[ 7]+=c_buffer[27]; p[ 6]+=c_buffer[28]; + TFBIGMIX8e(); + p[ 0]+=h[5]; p[ 1]+=h[6]; p[ 2]+=h[7]; p[ 3]+=h[8]; p[ 4]+=h[0]; p[ 5]+=c_buffer[28]; p[ 7]+=c_buffer[29]; p[ 6]+=c_buffer[30]; + TFBIGMIX8o(); + p[ 0]+=h[6]; p[ 1]+=h[7]; p[ 2]+=h[8]; p[ 3]+=h[0]; p[ 4]+=h[1]; p[ 5]+=c_buffer[30]; p[ 7]+=c_buffer[31]; p[ 6]+=c_buffer[32]; + TFBIGMIX8e(); + p[ 0]+=h[7]; p[ 1]+=h[8]; p[ 2]+=h[0]; p[ 3]+=h[1]; p[ 4]+=h[2]; p[ 5]+=c_buffer[32]; p[ 7]+=c_buffer[33]; p[ 6]+=c_buffer[34]; + TFBIGMIX8o(); + p[ 0]+=h[8]; p[ 1]+=h[0]; p[ 2]+=h[1]; p[ 3]+=h[2]; p[ 4]+=h[3]; p[ 5]+=c_buffer[34]; p[ 7]+=c_buffer[35]; p[ 6]+=c_buffer[36]; + TFBIGMIX8e(); + p[ 0]+=h[0]; p[ 1]+=h[1]; p[ 2]+=h[2]; p[ 3]+=h[3]; p[ 4]+=h[4]; p[ 5]+=c_buffer[36]; p[ 7]+=c_buffer[37]; p[ 6]+=c_buffer[38]; + TFBIGMIX8o(); + p[ 0]+=h[1]; p[ 1]+=h[2]; p[ 2]+=h[3]; p[ 3]+=h[4]; p[ 4]+=h[5]; p[ 5]+=c_buffer[38]; p[ 7]+=c_buffer[39]; p[ 6]+=c_buffer[40]; + TFBIGMIX8e(); + p[ 0]+=h[2]; p[ 1]+=h[3]; p[ 2]+=h[4]; p[ 3]+=h[5]; p[ 4]+=h[6]; p[ 
5]+=c_buffer[40]; p[ 7]+=c_buffer[41]; p[ 6]+=c_buffer[42]; + TFBIGMIX8o(); + p[ 0]+=h[3]; p[ 1]+=h[4]; p[ 2]+=h[5]; p[ 3]+=h[6]; p[ 4]+=h[7]; p[ 5]+=c_buffer[42]; p[ 7]+=c_buffer[43]; p[ 6]+=c_buffer[44]; + TFBIGMIX8e(); + p[ 0]+=h[4]; p[ 1]+=h[5]; p[ 2]+=h[6]; p[ 3]+=h[7]; p[ 4]+=h[8]; p[ 5]+=c_buffer[44]; p[ 7]+=c_buffer[45]; p[ 6]+=c_buffer[46]; + TFBIGMIX8o(); + p[ 0]+=h[5]; p[ 1]+=h[6]; p[ 2]+=h[7]; p[ 3]+=h[8]; p[ 4]+=h[0]; p[ 5]+=c_buffer[46]; p[ 7]+=c_buffer[47]; p[ 6]+=c_buffer[48]; + TFBIGMIX8e(); + p[ 0]+=h[6]; p[ 1]+=h[7]; p[ 2]+=h[8]; p[ 3]+=h[0]; p[ 4]+=h[1]; p[ 5]+=c_buffer[48]; p[ 7]+=c_buffer[49]; p[ 6]+=c_buffer[50]; + TFBIGMIX8o(); + p[ 0]+=h[7]; p[ 1]+=h[8]; p[ 2]+=h[0]; p[ 3]+=h[1]; p[ 4]+=h[2]; p[ 5]+=c_buffer[50]; p[ 7]+=c_buffer[51]; p[ 6]+=c_buffer[52]; + TFBIGMIX8e(); + p[ 0]+=h[8]; p[ 1]+=h[0]; p[ 2]+=h[1]; p[ 3]+=h[2]; p[ 4]+=h[3]; p[ 5]+=c_buffer[52]; p[ 7]+=c_buffer[53]; p[ 6]+=c_buffer[54]; + TFBIGMIX8o(); + p[ 0]+=h[0]; p[ 1]+=h[1]; p[ 2]+=h[2]; p[ 3]+=h[3]; p[ 4]+=h[4]; p[ 5]+=c_buffer[54]; p[ 7]+=c_buffer[55]; p[ 6]+=c_buffer[56]; + + p[0]^= c_buffer[57]; + p[1]^= nonce2; + + t0 = vectorize(8); // extra + t1 = vectorize(0xFF00000000000000ull); // etype + t2 = t0^t1; + + h[0] = p[ 0]; + h[1] = p[ 1]; + h[2] = p[ 2]; + h[3] = p[ 3]; + h[4] = p[ 4]; + h[5] = p[ 5]; + h[6] = p[ 6]; + h[7] = p[ 7]; + + h[8] = h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7] ^ vectorize(0x1BD11BDAA9FC1A22); + p[ 0] = p[ 1] = p[ 2] = p[ 3] = p[ 4] =p[ 5] =p[ 6] = p[ 7] = vectorize(0); + + #define h0 h[0] + #define h1 h[1] + #define h2 h[2] + #define h3 h[3] + #define h4 h[4] + #define h5 h[5] + #define h6 h[6] + #define h7 h[7] + #define h8 h[8] + + TFBIG_4e_UI2(0); + TFBIG_4o_UI2(1); + TFBIG_4e_UI2(2); + TFBIG_4o_UI2(3); + TFBIG_4e_UI2(4); + TFBIG_4o_UI2(5); + TFBIG_4e_UI2(6); + TFBIG_4o_UI2(7); + TFBIG_4e_UI2(8); + TFBIG_4o_UI2(9); + TFBIG_4e_UI2(10); + TFBIG_4o_UI2(11); + TFBIG_4e_UI2(12); + TFBIG_4o_UI2(13); + TFBIG_4e_UI2(14); + TFBIG_4o_UI2(15); + TFBIG_4e_UI2(16); + TFBIG_4o_UI2(17); + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + // cubehash512 + uint32_t x[32] = { + 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, + 0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695, + 0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537, + 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE, + 0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532, + 0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, + 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576, + 0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44 + }; + +// *(uint2x4*)&x[ 0] ^= *((uint2x4*)&p[0]); + #pragma unroll 4 + for(int i=0;i<4;i++){ + x[i*2] ^= p[i].x; + x[i*2+1] ^= p[i].y; + } + rrounds(x); + +// *(uint2x4*)&x[ 0] ^= *((uint2x4*)&p[4]); + #pragma unroll 4 + for(int i=0;i<4;i++){ + x[i*2] ^= p[i+4].x; + x[i*2+1] ^= p[i+4].y; + } + rrounds(x); + + // Padding Block + x[ 0] ^= 0x80; + rrounds(x); + +// Final(x, (BitSequence*)Hash); + x[31] ^= 1; + + /* "the state is then transformed invertibly through 10r identical rounds" */ + #pragma unroll 10 + for (int i = 0;i < 10;++i) + rrounds(x); + + // fugue512 + uint32_t Hash[16]; + #pragma unroll 16 + for(int i = 0; i < 16; i++) + Hash[i] = cuda_swab32(x[i]); + + uint32_t S[36]; + uint32_t B[ 9]; + + S[ 0] = S[ 1] = S[ 2] = S[ 3] = S[ 4] = S[ 5] = S[ 6] = S[ 7] = S[ 8] = S[ 9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0; + *(uint2x4*)&S[20] = *(uint2x4*)&c_S[ 0]; + *(uint2x4*)&S[28] = *(uint2x4*)&c_S[ 8]; + + FUGUE512_3(Hash[0x0], 
Hash[0x1], Hash[0x2]); + FUGUE512_3(Hash[0x3], Hash[0x4], Hash[0x5]); + FUGUE512_3(Hash[0x6], Hash[0x7], Hash[0x8]); + FUGUE512_3(Hash[0x9], Hash[0xA], Hash[0xB]); + FUGUE512_3(Hash[0xC], Hash[0xD], Hash[0xE]); + FUGUE512_3(Hash[0xF], 0U, 512U); + + //#pragma unroll 16 + for (uint32_t i = 0; i < 32; i+=2){ + mROR3; + CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); + SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); + mROR3; + CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); + SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); + } + //#pragma unroll 13 + for (uint32_t i = 0; i < 13; i ++) { + S[ 4] ^= S[ 0]; S[ 9] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0]; + mROR9; + SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); + S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0]; + mROR9; + SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); + S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[27] ^= S[ 0]; + mROR9; + SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); + S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[28] ^= S[ 0]; + mROR8; + SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); + } + S[ 4] ^= S[ 0]; S[ 9] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0]; + + S[ 0] = cuda_swab32(S[ 1]); S[ 1] = cuda_swab32(S[ 2]); + S[ 2] = cuda_swab32(S[ 3]); S[ 3] = cuda_swab32(S[ 4]); + S[ 4] = cuda_swab32(S[ 9]); S[ 5] = cuda_swab32(S[10]); + S[ 6] = cuda_swab32(S[11]); S[ 7] = cuda_swab32(S[12]); + S[ 8] = cuda_swab32(S[18]); S[ 9] = cuda_swab32(S[19]); + S[10] = cuda_swab32(S[20]); S[11] = cuda_swab32(S[21]); + S[12] = cuda_swab32(S[27]); S[13] = cuda_swab32(S[28]); + S[14] = cuda_swab32(S[29]); S[15] = cuda_swab32(S[30]); + + uint64_t *outpHash = &output64[thread<<3]; + *(uint2x4*)&outpHash[ 0] = *(uint2x4*)&S[ 0]; + *(uint2x4*)&outpHash[ 4] = *(uint2x4*)&S[ 8]; + } +} + +__host__ +void skunk_cuda_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *g_hash) +{ + const dim3 grid((threads + TPB - 1) / TPB); + const dim3 block(TPB); + + uint64_t *d_hash = (uint64_t*) g_hash; + skunk_gpu_hash_80 <<< grid, block >>> (threads, startNounce, d_hash); + + MyStreamSynchronize(NULL, 1, thr_id); +} + +__host__ +void skunk_setBlock_80(int thr_id, void *pdata) +{ + uint64_t message[20]; + memcpy(&message[0], pdata, 80); + + uint64_t p[8]; + uint64_t h[9]; + uint64_t t0, t1, t2; + + h[0] = 0x4903ADFF749C51CEull; + h[1] = 0x0D95DE399746DF03ull; + h[2] = 0x8FD1934127C79BCEull; + h[3] = 0x9A255629FF352CB1ull; + h[4] = 0x5DB62599DF6CA7B0ull; + h[5] = 0xEABE394CA9D5C3F4ull; + h[6] = 0x991112C71A75B523ull; + h[7] = 0xAE18A40B660FCC33ull; + // h[8] = h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7] ^ SPH_C64(0x1BD11BDAA9FC1A22); + h[8] = 0xcab2076d98173ec4ULL; + + t0 = 64; // ptr + t1 = 0x7000000000000000ull; + t2 = 0x7000000000000040ull; + + memcpy(&p[0], &message[0], 64); + + TFBIG_4e_PRE(0); + TFBIG_4o_PRE(1); + TFBIG_4e_PRE(2); + TFBIG_4o_PRE(3); + TFBIG_4e_PRE(4); + TFBIG_4o_PRE(5); + TFBIG_4e_PRE(6); + TFBIG_4o_PRE(7); + TFBIG_4e_PRE(8); + TFBIG_4o_PRE(9); + TFBIG_4e_PRE(10); + TFBIG_4o_PRE(11); + TFBIG_4e_PRE(12); + TFBIG_4o_PRE(13); + TFBIG_4e_PRE(14); + TFBIG_4o_PRE(15); + TFBIG_4e_PRE(16); + TFBIG_4o_PRE(17); + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + message[10] = message[0] ^ p[0]; + message[11] = message[1] ^ p[1]; + message[12] = message[2] ^ p[2]; + message[13] = message[3] ^ p[3]; + message[14] = message[4] ^ p[4]; + message[15] = message[5] ^ p[5]; + message[16] = message[6] ^ p[6]; + message[17] = message[7] ^ p[7]; + 
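+	/* at this point message[10..17] hold the Skein-512 chaining value after the first
+	 * 64-byte block (feed-forward of p[0..7] with the original message words above);
+	 * together with the tweak word stored in message[18] below, this midstate is packed
+	 * into c_buffer so the kernel only has to finish the last block plus the nonce. */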
message[18] = t2; + + uint64_t buffer[128]; + +// buffer[ 0] = message[ 8]; + buffer[ 0] = message[ 9]; + h[0] = buffer[ 1] = message[10]; + h[1] = buffer[ 2] = message[11]; + h[2] = buffer[ 3] = message[12]; + h[3] = buffer[ 4] = message[13]; + h[4] = buffer[ 5] = message[14]; + h[5] = buffer[ 6] = message[15]; + h[6] = buffer[ 7] = message[16]; + h[7] = buffer[ 8] = message[17]; + h[8] = buffer[ 9] = h[0]^h[1]^h[2]^h[3]^h[4]^h[5]^h[6]^h[7]^0x1BD11BDAA9FC1A22ULL; + + t0 = 0x50ull; + t1 = 0xB000000000000000ull; + t2 = t0^t1; + + p[0] = message[ 8] + h[0]; + p[2] = h[2]; p[3] = h[3]; p[4] = h[4]; + p[5] = h[5] + t0; + p[6] = h[6] + t1; + p[7] = h[7]; + + p[2] += p[3]; + p[4] += p[5]; p[6] += p[7]; + + p[3] = ROTL64(p[3], 36) ^ p[2]; + p[5] = ROTL64(p[5], 19) ^ p[4]; + p[7] = ROTL64(p[7], 37) ^ p[6]; + p[4] += p[7]; p[6] += p[5]; + + p[7] = ROTL64(p[7], 27) ^ p[4]; + p[5] = ROTL64(p[5], 14) ^ p[6]; + + buffer[10] = p[ 0]; + buffer[11] = p[ 2]; + buffer[12] = p[ 3]; + buffer[13] = p[ 4]; + buffer[14] = p[ 5]; + buffer[15] = p[ 6]; + buffer[16] = p[ 7]; + buffer[17] = ROTL64(p[3], 42); + buffer[18] = ROTL64(p[5], 36); + buffer[19] = ROTL64(p[7], 39); + + buffer[20] = h[6]+t1; + buffer[21] = h[8]+1; + buffer[22] = h[7]+t2; + buffer[23] = h[0]+2; + buffer[24] = h[8]+t0; + buffer[25] = h[1]+3; + buffer[26] = h[0]+t1; + buffer[27] = h[2]+4; + buffer[28] = h[1]+t2; + buffer[29] = h[3]+5; + buffer[30] = h[2]+t0; + buffer[31] = h[4]+6; + buffer[32] = h[3]+t1; + buffer[33] = h[5]+7; + buffer[34] = h[4]+t2; + buffer[35] = h[6]+8; + buffer[36] = h[5]+t0; + buffer[37] = h[7]+9; + buffer[38] = h[6]+t1; + buffer[39] = h[8]+10; + buffer[40] = h[7]+t2; + buffer[41] = h[0]+11; + buffer[42] = h[8]+t0; + buffer[43] = h[1]+12; + buffer[44] = h[0]+t1; + buffer[45] = h[2]+13; + buffer[46] = h[1]+t2; + buffer[47] = h[3]+14; + buffer[48] = h[2]+t0; + buffer[49] = h[4]+15; + buffer[50] = h[3]+t1; + buffer[51] = h[5]+16; + buffer[52] = h[4]+t2; + buffer[53] = h[6]+17; + buffer[54] = h[5]+t0; + buffer[55] = h[7]+18; + buffer[56] = h[6]+t1; + + buffer[57] = message[ 8]; + + cudaMemcpyToSymbol(c_buffer, buffer, sizeof(c_buffer), 0, cudaMemcpyHostToDevice); + CUDA_LOG_ERROR(); +} + +__host__ +void skunk_cpu_init(int thr_id, uint32_t threads) +{ + cuda_get_arch(thr_id); +} + diff --git a/skunk/cuda_skunk_streebog.cu b/skunk/cuda_skunk_streebog.cu new file mode 100644 index 0000000000..36ec7923c0 --- /dev/null +++ b/skunk/cuda_skunk_streebog.cu @@ -0,0 +1,371 @@ +/* + * Streebog GOST R 34.10-2012 stripped CUDA implementation for final hash + * + * https://tools.ietf.org/html/rfc6986 + * https://en.wikipedia.org/wiki/Streebog + * + * ==========================(LICENSE BEGIN)============================ + * + * @author Tanguy Pruvot - 2017 + * @author Alexis Provos - 2016 + */ + +// Further improved with shared memory partial utilization +// Tested under CUDA7.5 toolkit for cp 5.0/5.2 + +#include +#include +#include +#include + +#include "x11/streebog_arrays.cuh" + +//#define FULL_UNROLL +__device__ __forceinline__ +static void GOST_FS(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state) +{ + return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].x,0,0x44440)] + ^ shared[2][__byte_perm(state[5].x,0,0x44440)] + ^ shared[3][__byte_perm(state[4].x,0,0x44440)] + ^ shared[4][__byte_perm(state[3].x,0,0x44440)] + ^ shared[5][__byte_perm(state[2].x,0,0x44440)] + ^ shared[6][__byte_perm(state[1].x,0,0x44440)] + ^ 
__ldg(&T72[__byte_perm(state[0].x,0,0x44440)]); + + return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44441)] + ^ shared[3][__byte_perm(state[4].x,0,0x44441)] + ^ shared[4][__byte_perm(state[3].x,0,0x44441)] + ^ shared[5][__byte_perm(state[2].x,0,0x44441)] + ^ shared[6][__byte_perm(state[1].x,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]); + + return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44442)] + ^ shared[3][__byte_perm(state[4].x,0,0x44442)] + ^ shared[4][__byte_perm(state[3].x,0,0x44442)] + ^ shared[5][__byte_perm(state[2].x,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44442)]; + + return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)]) + ^ shared[1][__byte_perm(state[6].x,0,0x44443)] + ^ shared[2][__byte_perm(state[5].x,0,0x44443)] + ^ shared[3][__byte_perm(state[4].x,0,0x44443)] + ^ __ldg(&T42[__byte_perm(state[3].x,0,0x44443)]) + ^ shared[5][__byte_perm(state[2].x,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44443)]; + + return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44440)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44440)] + ^ shared[4][__byte_perm(state[3].y,0,0x44440)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]) + ^ shared[5][__byte_perm(state[2].y,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]); + + return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].y,0,0x44441)] + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44441)] + ^ shared[4][__byte_perm(state[3].y,0,0x44441)] + ^ shared[5][__byte_perm(state[2].y,0,0x44441)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]); + + return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44442)] + ^ shared[2][__byte_perm(state[5].y,0,0x44442)] + ^ shared[3][__byte_perm(state[4].y,0,0x44442)] + ^ shared[4][__byte_perm(state[3].y,0,0x44442)] + ^ shared[5][__byte_perm(state[2].y,0,0x44442)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]); + + return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44443)]) + ^ shared[2][__byte_perm(state[5].y,0,0x44443)] + ^ shared[3][__byte_perm(state[4].y,0,0x44443)] + ^ shared[4][__byte_perm(state[3].y,0,0x44443)] + ^ shared[5][__byte_perm(state[2].y,0,0x44443)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]); +} + +__device__ __forceinline__ +static void GOST_FS_LDG(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state) +{ + return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44440)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44440)] + ^ shared[3][__byte_perm(state[4].x,0,0x44440)] + ^ shared[4][__byte_perm(state[3].x,0,0x44440)] + ^ shared[5][__byte_perm(state[2].x,0,0x44440)] + ^ shared[6][__byte_perm(state[1].x,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]); + + return_state[1] 
= __ldg(&T02[__byte_perm(state[7].x,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44441)] + ^ shared[3][__byte_perm(state[4].x,0,0x44441)] + ^ shared[4][__byte_perm(state[3].x,0,0x44441)] + ^ shared[5][__byte_perm(state[2].x,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44441)]; + + return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44442)] + ^ shared[3][__byte_perm(state[4].x,0,0x44442)] + ^ shared[4][__byte_perm(state[3].x,0,0x44442)] + ^ shared[5][__byte_perm(state[2].x,0,0x44442)] + ^ shared[6][__byte_perm(state[1].x,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]); + + return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44443)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44443)] + ^ shared[3][__byte_perm(state[4].x,0,0x44443)] + ^ shared[4][__byte_perm(state[3].x,0,0x44443)] + ^ shared[5][__byte_perm(state[2].x,0,0x44443)] + ^ shared[6][__byte_perm(state[1].x,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]); + + return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44440)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44440)] + ^ shared[4][__byte_perm(state[3].y,0,0x44440)] + ^ shared[5][__byte_perm(state[2].y,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]); + + return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].y,0,0x44441)] + ^ shared[3][__byte_perm(state[4].y,0,0x44441)] + ^ shared[4][__byte_perm(state[3].y,0,0x44441)] + ^ shared[5][__byte_perm(state[2].y,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]); + + return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44442)]) + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44442)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44442)] + ^ shared[4][__byte_perm(state[3].y,0,0x44442)] + ^ shared[5][__byte_perm(state[2].y,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]); + + return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44443)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44443)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44443)] + ^ shared[4][__byte_perm(state[3].y,0,0x44443)] + ^ shared[5][__byte_perm(state[2].y,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]); +} + +__device__ __forceinline__ +static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uint2 *const __restrict__ state) +{ + uint2 t[ 8]; + //#pragma unroll 12 + for(int i=0; i<12; i++){ + GOST_FS(shared,state, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + K[ j] ^= *(uint2*)&CC[i][j]; + + #pragma unroll 8 + for(int j=0;j<8;j++) + state[ j] = t[ j]; + + GOST_FS_LDG(shared,K, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + state[ j]^= t[ j]; + + #pragma unroll 8 + for(int j=0;j<8;j++) + K[ j] = t[ j]; + } +} + +__constant__ uint64_t target64[4]; + +__host__ +void skunk_streebog_set_target(uint32_t* ptarget) +{ 
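+	// The whole 256-bit target is uploaded, but the final kernel below only compares the
+	// most significant word (target64[3]); borderline candidates are re-checked on the CPU.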
+ cudaMemcpyToSymbol(target64, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} + +#define TPB 256 +__global__ +__launch_bounds__(TPB, 2) +void skunk_streebog_gpu_final_64(uint64_t *g_hash, uint32_t* resNonce) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint2 buf[8], t[8], temp[8], K0[8], hash[8]; + + __shared__ uint2 shared[8][256]; + shared[0][threadIdx.x] = __ldg(&T02[threadIdx.x]); + shared[1][threadIdx.x] = __ldg(&T12[threadIdx.x]); + shared[2][threadIdx.x] = __ldg(&T22[threadIdx.x]); + shared[3][threadIdx.x] = __ldg(&T32[threadIdx.x]); + shared[4][threadIdx.x] = __ldg(&T42[threadIdx.x]); + shared[5][threadIdx.x] = __ldg(&T52[threadIdx.x]); + shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]); + shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]); + +// if (thread < threads) +// { + uint64_t* inout = &g_hash[thread<<3]; + *(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]); + *(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]); + + __threadfence_block(); + + K0[0] = vectorize(0x74a5d4ce2efc83b3); + + #pragma unroll 8 + for(uint32_t i=0;i<8;i++){ + buf[ i] = hash[ i] ^ K0[ 0]; + } + //#pragma unroll 12 + for(int i=0; i<12; i++){ + GOST_FS(shared, buf, temp); + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + buf[ j] = temp[ j] ^ *(uint2*)&precomputed_values[i][j]; + } + } + #pragma unroll 8 + for(int j=0;j<8;j++){ + buf[ j]^= hash[ j]; + } + #pragma unroll 8 + for(int j=0;j<8;j++){ + K0[ j] = buf[ j]; + } + + K0[7].y ^= 0x00020000; + + GOST_FS(shared, K0, t); + + #pragma unroll 8 + for(uint32_t i=0;i<8;i++) + K0[ i] = t[ i]; + + t[7].y ^= 0x01000000; + GOST_E12(shared, K0, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + buf[ j] ^= t[ j]; + + buf[7].y ^= 0x01000000; + + GOST_FS(shared, buf,K0); + + buf[7].y ^= 0x00020000; + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++) + t[ j] = K0[ j]; + + t[7].y ^= 0x00020000; + GOST_E12(shared, K0, t); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++) + buf[ j] ^= t[ j]; + + GOST_FS(shared, buf,K0); // K = F(h) + + hash[7]+= vectorize(0x0100000000000000); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++) + t[ j] = K0[ j] ^ hash[ j]; + +// #pragma unroll + for(uint32_t i=0; i<10; i++){ + GOST_FS(shared, t, temp); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + t[ j] = temp[ j]; + K0[ j] = K0[ j] ^ *(uint2*)&CC[ i][ j]; + } + + GOST_FS(shared, K0, temp); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + K0[ j] = temp[ j]; + t[ j]^= temp[ j]; + } + } + + GOST_FS(shared, t, temp); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + t[ j] = temp[ j]; + K0[ j] = K0[ j] ^ *(uint2*)&CC[10][ j]; + } + + GOST_FS(shared, K0, temp); + + #pragma unroll 8 + for(int i=7;i>=0;i--){ + t[i].x = t[i].x ^ temp[i].x; + temp[i].x = temp[i].x ^ ((uint32_t*)&CC[11])[i<<1]; + } + + uint2 last[2]; + +#define T0(x) shared[0][x] +#define T1(x) shared[1][x] +#define T2(x) shared[2][x] +#define T3(x) shared[3][x] +#define T4(x) shared[4][x] +#define T5(x) shared[5][x] +#define T6(x) shared[6][x] +#define T7(x) shared[7][x] + + last[ 0] = T0(__byte_perm(t[7].x,0,0x44443)) ^ T1(__byte_perm(t[6].x,0,0x44443)) + ^ T2(__byte_perm(t[5].x,0,0x44443)) ^ T3(__byte_perm(t[4].x,0,0x44443)) + ^ T4(__byte_perm(t[3].x,0,0x44443)) ^ T5(__byte_perm(t[2].x,0,0x44443)) + ^ T6(__byte_perm(t[1].x,0,0x44443)) ^ T7(__byte_perm(t[0].x,0,0x44443)); + + last[ 1] = T0(__byte_perm(temp[7].x,0,0x44443)) ^ T1(__byte_perm(temp[6].x,0,0x44443)) + ^ T2(__byte_perm(temp[5].x,0,0x44443)) ^ T3(__byte_perm(temp[4].x,0,0x44443)) + ^ T4(__byte_perm(temp[3].x,0,0x44443)) ^ 
T5(__byte_perm(temp[2].x,0,0x44443)) + ^ T6(__byte_perm(temp[1].x,0,0x44443)) ^ T7(__byte_perm(temp[0].x,0,0x44443)); + + if(devectorize(buf[3] ^ hash[3] ^ last[ 0] ^ last[ 1]) <= target64[3]){ + uint32_t tmp = atomicExch(&resNonce[0], thread); + if (tmp != UINT32_MAX) + resNonce[1] = tmp; + } +} + +__host__ +void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce) +{ + dim3 grid((threads + TPB-1) / TPB); + dim3 block(TPB); + + skunk_streebog_gpu_final_64 <<< grid, block >>> ((uint64_t*)d_hash, d_resNonce); +} diff --git a/skunk/skein_header.h b/skunk/skein_header.h new file mode 100644 index 0000000000..460b311f57 --- /dev/null +++ b/skunk/skein_header.h @@ -0,0 +1,385 @@ +/* Elementary defines for SKEIN */ + +/* + * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). + */ + +#define M9_0_0 0 +#define M9_0_1 1 +#define M9_0_2 2 +#define M9_0_3 3 +#define M9_0_4 4 +#define M9_0_5 5 +#define M9_0_6 6 +#define M9_0_7 7 + +#define M9_1_0 1 +#define M9_1_1 2 +#define M9_1_2 3 +#define M9_1_3 4 +#define M9_1_4 5 +#define M9_1_5 6 +#define M9_1_6 7 +#define M9_1_7 8 + +#define M9_2_0 2 +#define M9_2_1 3 +#define M9_2_2 4 +#define M9_2_3 5 +#define M9_2_4 6 +#define M9_2_5 7 +#define M9_2_6 8 +#define M9_2_7 0 + +#define M9_3_0 3 +#define M9_3_1 4 +#define M9_3_2 5 +#define M9_3_3 6 +#define M9_3_4 7 +#define M9_3_5 8 +#define M9_3_6 0 +#define M9_3_7 1 + +#define M9_4_0 4 +#define M9_4_1 5 +#define M9_4_2 6 +#define M9_4_3 7 +#define M9_4_4 8 +#define M9_4_5 0 +#define M9_4_6 1 +#define M9_4_7 2 + +#define M9_5_0 5 +#define M9_5_1 6 +#define M9_5_2 7 +#define M9_5_3 8 +#define M9_5_4 0 +#define M9_5_5 1 +#define M9_5_6 2 +#define M9_5_7 3 + +#define M9_6_0 6 +#define M9_6_1 7 +#define M9_6_2 8 +#define M9_6_3 0 +#define M9_6_4 1 +#define M9_6_5 2 +#define M9_6_6 3 +#define M9_6_7 4 + +#define M9_7_0 7 +#define M9_7_1 8 +#define M9_7_2 0 +#define M9_7_3 1 +#define M9_7_4 2 +#define M9_7_5 3 +#define M9_7_6 4 +#define M9_7_7 5 + +#define M9_8_0 8 +#define M9_8_1 0 +#define M9_8_2 1 +#define M9_8_3 2 +#define M9_8_4 3 +#define M9_8_5 4 +#define M9_8_6 5 +#define M9_8_7 6 + +#define M9_9_0 0 +#define M9_9_1 1 +#define M9_9_2 2 +#define M9_9_3 3 +#define M9_9_4 4 +#define M9_9_5 5 +#define M9_9_6 6 +#define M9_9_7 7 + +#define M9_10_0 1 +#define M9_10_1 2 +#define M9_10_2 3 +#define M9_10_3 4 +#define M9_10_4 5 +#define M9_10_5 6 +#define M9_10_6 7 +#define M9_10_7 8 + +#define M9_11_0 2 +#define M9_11_1 3 +#define M9_11_2 4 +#define M9_11_3 5 +#define M9_11_4 6 +#define M9_11_5 7 +#define M9_11_6 8 +#define M9_11_7 0 + +#define M9_12_0 3 +#define M9_12_1 4 +#define M9_12_2 5 +#define M9_12_3 6 +#define M9_12_4 7 +#define M9_12_5 8 +#define M9_12_6 0 +#define M9_12_7 1 + +#define M9_13_0 4 +#define M9_13_1 5 +#define M9_13_2 6 +#define M9_13_3 7 +#define M9_13_4 8 +#define M9_13_5 0 +#define M9_13_6 1 +#define M9_13_7 2 + +#define M9_14_0 5 +#define M9_14_1 6 +#define M9_14_2 7 +#define M9_14_3 8 +#define M9_14_4 0 +#define M9_14_5 1 +#define M9_14_6 2 +#define M9_14_7 3 + +#define M9_15_0 6 +#define M9_15_1 7 +#define M9_15_2 8 +#define M9_15_3 0 +#define M9_15_4 1 +#define M9_15_5 2 +#define M9_15_6 3 +#define M9_15_7 4 + +#define M9_16_0 7 +#define M9_16_1 8 +#define M9_16_2 0 +#define M9_16_3 1 +#define M9_16_4 2 +#define M9_16_5 3 +#define M9_16_6 4 +#define M9_16_7 5 + +#define M9_17_0 8 +#define M9_17_1 0 +#define M9_17_2 1 +#define M9_17_3 2 +#define M9_17_4 3 +#define M9_17_5 4 +#define M9_17_6 5 +#define M9_17_7 6 + +#define M9_18_0 0 
+#define M9_18_1 1
+#define M9_18_2 2
+#define M9_18_3 3
+#define M9_18_4 4
+#define M9_18_5 5
+#define M9_18_6 6
+#define M9_18_7 7
+
+/*
+ * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1).
+ */
+
+#define M3_0_0 0
+#define M3_0_1 1
+#define M3_1_0 1
+#define M3_1_1 2
+#define M3_2_0 2
+#define M3_2_1 0
+#define M3_3_0 0
+#define M3_3_1 1
+#define M3_4_0 1
+#define M3_4_1 2
+#define M3_5_0 2
+#define M3_5_1 0
+#define M3_6_0 0
+#define M3_6_1 1
+#define M3_7_0 1
+#define M3_7_1 2
+#define M3_8_0 2
+#define M3_8_1 0
+#define M3_9_0 0
+#define M3_9_1 1
+#define M3_10_0 1
+#define M3_10_1 2
+#define M3_11_0 2
+#define M3_11_1 0
+#define M3_12_0 0
+#define M3_12_1 1
+#define M3_13_0 1
+#define M3_13_1 2
+#define M3_14_0 2
+#define M3_14_1 0
+#define M3_15_0 0
+#define M3_15_1 1
+#define M3_16_0 1
+#define M3_16_1 2
+#define M3_17_0 2
+#define M3_17_1 0
+#define M3_18_0 0
+#define M3_18_1 1
+
+#define XCAT(x, y) XCAT_(x, y)
+#define XCAT_(x, y) x ## y
+
+#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
+#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
+
+#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \
+	w0 = (w0 + SKBI(k, s, 0)); \
+	w1 = (w1 + SKBI(k, s, 1)); \
+	w2 = (w2 + SKBI(k, s, 2)); \
+	w3 = (w3 + SKBI(k, s, 3)); \
+	w4 = (w4 + SKBI(k, s, 4)); \
+	w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
+	w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
+	w7 = (w7 + SKBI(k, s, 7) + make_uint2(s,0)); \
+	}
+
+#define TFBIG_MIX(x0, x1, rc) { \
+	x0 = x0 + x1; \
+	x1 = ROL2(x1, rc) ^ x0; \
+	}
+
+#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \
+	TFBIG_MIX(w0, w1, rc0); \
+	TFBIG_MIX(w2, w3, rc1); \
+	TFBIG_MIX(w4, w5, rc2); \
+	TFBIG_MIX(w6, w7, rc3); \
+	}
+
+#define TFBIG_4e(s) { \
+	TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+	TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \
+	TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \
+	TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \
+	TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \
+	}
+
+#define TFBIG_4o(s) { \
+	TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+	TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \
+	TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \
+	TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \
+	TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \
+	}
+
+#define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \
+	k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \
+		^ vectorize(0x1BD11BDAA9FC1A22); \
+	t2 = t0 ^ t1; \
+	}
+
+#define TFBIG_ADDKEY_UI2(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \
+	w0 = (w0 + SKBI(k, s, 0)); \
+	w1 = (w1 + SKBI(k, s, 1)); \
+	w2 = (w2 + SKBI(k, s, 2)); \
+	w3 = (w3 + SKBI(k, s, 3)); \
+	w4 = (w4 + SKBI(k, s, 4)); \
+	w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
+	w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
+	w7 = (w7 + SKBI(k, s, 7) + vectorize(s)); \
+	}
+
+#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \
+	w0 = (w0 + SKBI(k, s, 0)); \
+	w1 = (w1 + SKBI(k, s, 1)); \
+	w2 = (w2 + SKBI(k, s, 2)); \
+	w3 = (w3 + SKBI(k, s, 3)); \
+	w4 = (w4 + SKBI(k, s, 4)); \
+	w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
+	w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
+	w7 = (w7 + SKBI(k, s, 7) + (s)); \
+	}
+
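+/*
+ * Note on the indexing above: XCAT expands its arguments before pasting, so
+ * SKBI(k, s, i) resolves the M9_ table entry ((s + i) mod 9) at preprocessing time
+ * and then pastes it onto k.  A worked expansion (illustrative):
+ *
+ *   SKBI(h, 3, 6)  ->  XCAT(h, M9_3_6)  ->  XCAT(h, 0)  ->  h0
+ *
+ * The kernels that use these macros map h0..h8 onto their key array (e.g. h[0]),
+ * so the whole Threefish key schedule is selected with no runtime indexing.
+ */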
+#define TFBIG_MIX_UI2(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROL2(x1, rc) ^ x0; \ + } + +#define TFBIG_MIX_PRE(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROTL64(x1, rc) ^ x0; \ + } + +#define TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_UI2(w0, w1, rc0); \ + TFBIG_MIX_UI2(w2, w3, rc1); \ + TFBIG_MIX_UI2(w4, w5, rc2); \ + TFBIG_MIX_UI2(w6, w7, rc3); \ + } + +#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_PRE(w0, w1, rc0); \ + TFBIG_MIX_PRE(w2, w3, rc1); \ + TFBIG_MIX_PRE(w4, w5, rc2); \ + TFBIG_MIX_PRE(w6, w7, rc3); \ + } + +#define TFBIG_4e_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ + } + +#define TFBIG_4e_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ + } + +#define TFBIG_4o_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ + } + +#define TFBIG_4o_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ + } + +#define TFBIGMIX8e(){\ + p[ 0]+=p[ 1];p[ 2]+=p[ 3];p[ 4]+=p[ 5];p[ 6]+=p[ 7];p[ 1]=ROL2(p[ 1],46) ^ p[ 0];p[ 3]=ROL2(p[ 3],36) ^ p[ 2];p[ 5]=ROL2(p[ 5],19) ^ p[ 4];p[ 7]=ROL2(p[ 7],37) ^ p[ 6];\ + p[ 2]+=p[ 1];p[ 4]+=p[ 7];p[ 6]+=p[ 5];p[ 0]+=p[ 3];p[ 1]=ROL2(p[ 1],33) ^ p[ 2];p[ 7]=ROL2(p[ 7],27) ^ p[ 4];p[ 5]=ROL2(p[ 5],14) ^ p[ 6];p[ 3]=ROL2(p[ 3],42) ^ p[ 0];\ + p[ 4]+=p[ 1];p[ 6]+=p[ 3];p[ 0]+=p[ 5];p[ 2]+=p[ 7];p[ 1]=ROL2(p[ 1],17) ^ p[ 4];p[ 3]=ROL2(p[ 3],49) ^ p[ 6];p[ 5]=ROL2(p[ 5],36) ^ p[ 0];p[ 7]=ROL2(p[ 7],39) ^ p[ 2];\ + p[ 6]+=p[ 1];p[ 0]+=p[ 7];p[ 2]+=p[ 5];p[ 4]+=p[ 3];p[ 1]=ROL2(p[ 1],44) ^ p[ 6];p[ 7]=ROL2(p[ 7], 9) ^ p[ 0];p[ 5]=ROL2(p[ 5],54) ^ p[ 2];p[ 3]=ROR8(p[ 3]) ^ p[ 4];\ +} +#define TFBIGMIX8o(){\ + p[ 0]+=p[ 1];p[ 2]+=p[ 3];p[ 4]+=p[ 5];p[ 6]+=p[ 7];p[ 1]=ROL2(p[ 1],39) ^ p[ 0];p[ 3]=ROL2(p[ 3],30) ^ p[ 2];p[ 5]=ROL2(p[ 5],34) ^ p[ 4];p[ 7]=ROL24(p[ 7]) ^ p[ 6];\ + p[ 2]+=p[ 1];p[ 4]+=p[ 7];p[ 6]+=p[ 5];p[ 0]+=p[ 3];p[ 1]=ROL2(p[ 1],13) ^ p[ 2];p[ 7]=ROL2(p[ 7],50) ^ p[ 4];p[ 5]=ROL2(p[ 5],10) ^ p[ 6];p[ 3]=ROL2(p[ 3],17) ^ p[ 0];\ + p[ 4]+=p[ 1];p[ 6]+=p[ 3];p[ 0]+=p[ 5];p[ 2]+=p[ 7];p[ 1]=ROL2(p[ 
1],25) ^ p[ 4];p[ 3]=ROL2(p[ 3],29) ^ p[ 6];p[ 5]=ROL2(p[ 5],39) ^ p[ 0];p[ 7]=ROL2(p[ 7],43) ^ p[ 2];\ + p[ 6]+=p[ 1];p[ 0]+=p[ 7];p[ 2]+=p[ 5];p[ 4]+=p[ 3];p[ 1]=ROL8(p[ 1]) ^ p[ 6];p[ 7]=ROL2(p[ 7],35) ^ p[ 0];p[ 5]=ROR8(p[ 5]) ^ p[ 2];p[ 3]=ROL2(p[ 3],22) ^ p[ 4];\ +} + +#define addwBuff(x0,x1,x2,x3,x4){\ + p[ 0]+=h[x0];\ + p[ 1]+=h[x1];\ + p[ 2]+=h[x2];\ + p[ 3]+=h[x3];\ + p[ 4]+=h[x4];\ + p[ 5]+=c_buffer[i++];\ + p[ 7]+=c_buffer[i++];\ + p[ 6]+=c_buffer[i];\ +} + +#define addwCon(x0,x1,x2,x3,x4,x5,x6,x7,y0,y1,y2){\ + p[ 0]+= h[x0];\ + p[ 1]+= h[x1];\ + p[ 2]+= h[x2];\ + p[ 3]+= h[x3];\ + p[ 4]+= h[x4];\ + p[ 5]+= h[x5] + c_t[y0];\ + p[ 6]+= h[x6] + c_t[y1];\ + p[ 7]+= h[x7] + c_add[y2];\ +} + + diff --git a/skunk/skunk.cu b/skunk/skunk.cu new file mode 100644 index 0000000000..c1add50303 --- /dev/null +++ b/skunk/skunk.cu @@ -0,0 +1,211 @@ +/** + * Skunk Algo for Signatum + * (skein, cube, fugue, gost streebog) + * + * tpruvot@github 08 2017 - GPLv3 + */ +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_fugue.h" +#include "sph/sph_streebog.h" +} + +#include "miner.h" +#include "cuda_helper.h" + +//#define WANT_COMPAT_KERNEL + +// compatibility kernels +extern void skein512_cpu_setBlock_80(void *pdata); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); +extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); +extern void streebog_sm3_set_target(uint32_t* ptarget); +extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); + +// krnlx merged kernel (for high-end cards only) +extern void skunk_cpu_init(int thr_id, uint32_t threads); +extern void skunk_streebog_set_target(uint32_t* ptarget); +extern void skunk_setBlock_80(int thr_id, void *pdata); +extern void skunk_cuda_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); + +#include +#include + +#define NBN 2 +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; + +// CPU Hash +extern "C" void skunk_hash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[128] = { 0 }; + + sph_skein512_context ctx_skein; + sph_cubehash512_context ctx_cubehash; + sph_fugue512_context ctx_fugue; + sph_gost512_context ctx_gost; + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, (const void*) hash, 64); + sph_fugue512_close(&ctx_fugue, (void*) hash); + + sph_gost512_init(&ctx_gost); + sph_gost512(&ctx_gost, (const void*) hash, 64); + sph_gost512_close(&ctx_gost, (void*) hash); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ 
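+	/* Outline: byte-swap the 80-byte header into endiandata, upload the block/target
+	 * constants once, then each pass hashes `throughput` nonces on the GPU (merged skunk
+	 * kernels on sm_50+, per-algorithm compat kernels otherwise), copies back up to NBN
+	 * candidate nonces and re-verifies them on the CPU with skunk_hash() before reporting. */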
+ int dev_id = device_map[thr_id]; + + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] > 500) ? 18 : 17; + if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; + if (strstr(device_name[dev_id], "GTX 1080")) intensity = 21; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0xf; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + skunk_cpu_init(thr_id, throughput); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) x13_fugue512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1); + + init[thr_id] = true; + } + + uint32_t _ALIGN(64) h_resNonce[NBN]; + uint32_t _ALIGN(64) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); + if (use_compat_kernels[thr_id]) { + skein512_cpu_setBlock_80(endiandata); + streebog_sm3_set_target(ptarget); + } else { + skunk_setBlock_80(thr_id, endiandata); + skunk_streebog_set_target(ptarget); + } + + do { + int order = 0; + if (use_compat_kernels[thr_id]) { + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + } else { + skunk_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + } + cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (h_resNonce[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + const uint32_t startNounce = pdata[19]; + + be32enc(&endiandata[19], startNounce + h_resNonce[0]); + skunk_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + work->nonces[0] = startNounce + h_resNonce[0]; + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (h_resNonce[1] != UINT32_MAX) + { + uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[1]; + be32enc(&endiandata[19], secNonce); + skunk_hash(vhash, endiandata); + if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) { + work_set_target_ratio(work, vhash); + xchg(work->nonces[1], work->nonces[0]); + } else { + bn_set_target_ratio(work, vhash, work->valid_nonces); + } + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); + gpulog(LOG_WARNING, thr_id, "result does not 
validate on CPU!"); + pdata[19] = startNounce + h_resNonce[0] + 1; + continue; + } + } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_skunk(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + if (use_compat_kernels[thr_id]) + x13_fugue512_cpu_free(thr_id); + + cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/sph/blake.c b/sph/blake.c index 0650b9cf21..5863cf592a 100644 --- a/sph/blake.c +++ b/sph/blake.c @@ -36,6 +36,8 @@ #include "sph_blake.h" +int blake256_rounds = 14; + #ifdef __cplusplus extern "C"{ #endif @@ -548,7 +550,7 @@ static const sph_u64 CB[16] = { M[0xD] = sph_dec32be_aligned(buf + 52); \ M[0xE] = sph_dec32be_aligned(buf + 56); \ M[0xF] = sph_dec32be_aligned(buf + 60); \ - for (r = 0; r < 14; r ++) \ + for (r = 0; r < blake256_rounds; r ++) \ ROUND_S(r); \ H0 ^= S0 ^ V0 ^ V8; \ H1 ^= S1 ^ V1 ^ V9; \ @@ -607,12 +609,14 @@ static const sph_u64 CB[16] = { ROUND_S(5); \ ROUND_S(6); \ ROUND_S(7); \ + if (blake256_rounds == 14) { \ ROUND_S(8); \ ROUND_S(9); \ ROUND_S(0); \ ROUND_S(1); \ ROUND_S(2); \ ROUND_S(3); \ + } \ H0 ^= S0 ^ V0 ^ V8; \ H1 ^= S1 ^ V1 ^ V9; \ H2 ^= S2 ^ V2 ^ VA; \ @@ -1053,6 +1057,13 @@ sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) sph_blake256_init(cc); } +/* see sph_blake.h */ +void +sph_blake256_set_rounds(int rounds) +{ + blake256_rounds = rounds; +} + #if SPH_64 /* see sph_blake.h */ diff --git a/sph/blake2b.c b/sph/blake2b.c new file mode 100644 index 0000000000..f85c977131 --- /dev/null +++ b/sph/blake2b.c @@ -0,0 +1,196 @@ +/* + * Copyright 2009 Colin Percival, 2014 savale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#include +#include +#include + +#include "sph_types.h" +#include "blake2b.h" + +// Cyclic right rotation. + +#ifndef ROTR64 +#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) +#endif + +// Little-endian byte access. 
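+// (B2B_GET64 below rebuilds a 64-bit word from 8 bytes independently of host endianness;
+// combining with XOR rather than OR is equivalent here, and in ROTR64 above, because the
+// shifted pieces never share set bits.)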
+ +#define B2B_GET64(p) \ + (((uint64_t) ((uint8_t *) (p))[0]) ^ \ + (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \ + (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \ + (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \ + (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \ + (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \ + (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \ + (((uint64_t) ((uint8_t *) (p))[7]) << 56)) + +// G Mixing function. + +#define B2B_G(a, b, c, d, x, y) { \ + v[a] = v[a] + v[b] + x; \ + v[d] = ROTR64(v[d] ^ v[a], 32); \ + v[c] = v[c] + v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 24); \ + v[a] = v[a] + v[b] + y; \ + v[d] = ROTR64(v[d] ^ v[a], 16); \ + v[c] = v[c] + v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 63); } + +// Initialization Vector. + +static const uint64_t blake2b_iv[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 +}; + +// Compression function. "last" flag indicates last block. + +static void blake2b_compress(blake2b_ctx *ctx, int last) +{ + const uint8_t sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } + }; + int i; + uint64_t v[16], m[16]; + + for (i = 0; i < 8; i++) { // init work variables + v[i] = ctx->h[i]; + v[i + 8] = blake2b_iv[i]; + } + + v[12] ^= ctx->t[0]; // low 64 bits of offset + v[13] ^= ctx->t[1]; // high 64 bits + if (last) // last block flag set ? + v[14] = ~v[14]; + + for (i = 0; i < 16; i++) // get little-endian words + m[i] = B2B_GET64(&ctx->b[8 * i]); + + for (i = 0; i < 12; i++) { // twelve rounds + B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]); + B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]); + B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]); + B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]); + B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]); + B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]); + B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]); + B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]); + } + + for( i = 0; i < 8; ++i ) + ctx->h[i] ^= v[i] ^ v[i + 8]; +} + +// Initialize the hashing context "ctx" with optional key "key". +// 1 <= outlen <= 64 gives the digest size in bytes. +// Secret key (also <= 64 bytes) is optional (keylen = 0). 
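+// Typical use (illustrative sketch; `data` and `data_len` stand for caller-supplied input):
+//
+//   blake2b_ctx ctx;
+//   uint8_t digest[32];
+//   if (blake2b_init(&ctx, sizeof(digest), NULL, 0) == 0) {
+//       blake2b_update(&ctx, data, data_len);
+//       blake2b_final(&ctx, digest);
+//   }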
+ +int blake2b_init(blake2b_ctx *ctx, size_t outlen, + const void *key, size_t keylen) // (keylen=0: no key) +{ + size_t i; + + if (outlen == 0 || outlen > 64 || keylen > 64) + return -1; // illegal parameters + + for (i = 0; i < 8; i++) // state, "param block" + ctx->h[i] = blake2b_iv[i]; + ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen; + + ctx->t[0] = 0; // input count low word + ctx->t[1] = 0; // input count high word + ctx->c = 0; // pointer within buffer + ctx->outlen = outlen; + + for (i = keylen; i < 128; i++) // zero input block + ctx->b[i] = 0; + if (keylen > 0) { + blake2b_update(ctx, key, keylen); + ctx->c = 128; // at the end + } + + return 0; +} + +// Add "inlen" bytes from "in" into the hash. + +void blake2b_update(blake2b_ctx *ctx, + const void *in, size_t inlen) // data bytes +{ + size_t i; + + for (i = 0; i < inlen; i++) { + if (ctx->c == 128) { // buffer full ? + ctx->t[0] += ctx->c; // add counters + if (ctx->t[0] < ctx->c) // carry overflow ? + ctx->t[1]++; // high word + blake2b_compress(ctx, 0); // compress (not last) + ctx->c = 0; // counter to zero + } + ctx->b[ctx->c++] = ((const uint8_t *) in)[i]; + } +} + +// Generate the message digest (size given in init). +// Result placed in "out". + +void blake2b_final(blake2b_ctx *ctx, void *out) +{ + size_t i; + + ctx->t[0] += ctx->c; // mark last block offset + if (ctx->t[0] < ctx->c) // carry overflow + ctx->t[1]++; // high word + + while (ctx->c < 128) // fill up with zeros + ctx->b[ctx->c++] = 0; + blake2b_compress(ctx, 1); // final block flag = 1 + + // little endian convert and store + for (i = 0; i < ctx->outlen; i++) { + ((uint8_t *) out)[i] = + (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF; + } +} + diff --git a/sph/blake2b.h b/sph/blake2b.h new file mode 100644 index 0000000000..f8652c1808 --- /dev/null +++ b/sph/blake2b.h @@ -0,0 +1,41 @@ +#pragma once +#ifndef __BLAKE2B_H__ +#define __BLAKE2B_H__ + +#include +#include + +#if defined(_MSC_VER) +#include +#define inline __inline +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__((aligned(x))) +#endif + +#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__) +#define NATIVE_LITTLE_ENDIAN +#endif + +// state context +ALIGN(64) typedef struct { + uint8_t b[128]; // input buffer + uint64_t h[8]; // chained state + uint64_t t[2]; // total number of bytes + size_t c; // pointer for b[] + size_t outlen; // digest size +} blake2b_ctx; + +#if defined(__cplusplus) +extern "C" { +#endif + +int blake2b_init(blake2b_ctx *ctx, size_t outlen, const void *key, size_t keylen); +void blake2b_update(blake2b_ctx *ctx, const void *in, size_t inlen); +void blake2b_final(blake2b_ctx *ctx, void *out); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/sph/blake2s.c b/sph/blake2s.c new file mode 100644 index 0000000000..62b5a39571 --- /dev/null +++ b/sph/blake2s.c @@ -0,0 +1,387 @@ +/** + * BLAKE2 reference source code package - reference C implementations + * + * Written in 2012 by Samuel Neves + * + * To the extent possible under law, the author(s) have dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide. This software is distributed without any warranty. + * + * You should have received a copy of the CC0 Public Domain Dedication along with + * this software. If not, see . 
+ */ + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#include "sph_types.h" +#include "blake2s.h" + +static const uint32_t blake2s_IV[8] = +{ + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL +}; + +static const uint8_t blake2s_sigma[10][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , +}; + +static inline int blake2s_set_lastnode( blake2s_state *S ) +{ + S->f[1] = ~0U; + return 0; +} + +static inline int blake2s_clear_lastnode( blake2s_state *S ) +{ + S->f[1] = 0U; + return 0; +} + +/* Some helper functions, not necessarily useful */ +static inline int blake2s_set_lastblock( blake2s_state *S ) +{ + if( S->last_node ) blake2s_set_lastnode( S ); + + S->f[0] = ~0U; + return 0; +} + +static inline int blake2s_clear_lastblock( blake2s_state *S ) +{ + if( S->last_node ) blake2s_clear_lastnode( S ); + + S->f[0] = 0U; + return 0; +} + +static inline int blake2s_increment_counter( blake2s_state *S, const uint32_t inc ) +{ + S->t[0] += inc; + S->t[1] += ( S->t[0] < inc ); + return 0; +} + +// Parameter-related functions +static inline int blake2s_param_set_digest_length( blake2s_param *P, const uint8_t digest_length ) +{ + P->digest_length = digest_length; + return 0; +} + +static inline int blake2s_param_set_fanout( blake2s_param *P, const uint8_t fanout ) +{ + P->fanout = fanout; + return 0; +} + +static inline int blake2s_param_set_max_depth( blake2s_param *P, const uint8_t depth ) +{ + P->depth = depth; + return 0; +} + +static inline int blake2s_param_set_leaf_length( blake2s_param *P, const uint32_t leaf_length ) +{ + store32( &P->leaf_length, leaf_length ); + return 0; +} + +static inline int blake2s_param_set_node_offset( blake2s_param *P, const uint64_t node_offset ) +{ + store48( P->node_offset, node_offset ); + return 0; +} + +static inline int blake2s_param_set_node_depth( blake2s_param *P, const uint8_t node_depth ) +{ + P->node_depth = node_depth; + return 0; +} + +static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_t inner_length ) +{ + P->inner_length = inner_length; + return 0; +} + +static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] ) +{ + memcpy( P->salt, salt, BLAKE2S_SALTBYTES ); + return 0; +} + +static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] ) +{ + memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES ); + return 0; +} + +static inline int blake2s_init0( blake2s_state *S ) +{ + memset( S, 0, sizeof( blake2s_state ) ); + + for( int i = 0; i < 8; ++i ) S->h[i] = blake2s_IV[i]; + + return 0; +} + +/* init2 xors IV with input parameter block */ +int blake2s_init_param( blake2s_state *S, const blake2s_param *P ) +{ + blake2s_init0( S ); + uint32_t *p = ( uint32_t * )( P ); + + /* IV XOR ParamBlock */ + for( size_t i = 0; i < 8; ++i ) + S->h[i] ^= load32( &p[i] ); + + return 0; +} + + 
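+// blake2s_init_param() above folds the serialized parameter block into the IV word by
+// word; blake2s_init() below just fills in the default parameters (digest length, no key,
+// fanout = depth = 1), and blake2s_init_key() additionally absorbs the key as a padded
+// first block.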
+// Sequential blake2s initialization +int blake2s_init( blake2s_state *S, const uint8_t outlen ) +{ + blake2s_param P[1]; + + /* Move interval verification here? */ + if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; + + P->digest_length = outlen; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + store48( &P->node_offset, 0 ); + P->node_depth = 0; + P->inner_length = 0; + // memset(P->reserved, 0, sizeof(P->reserved) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + return blake2s_init_param( S, P ); +} + +int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ) +{ + blake2s_param P[1]; + + if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; + + if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1; + + P->digest_length = outlen; + P->key_length = keylen; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + store48( &P->node_offset, 0 ); + P->node_depth = 0; + P->inner_length = 0; + // memset(P->reserved, 0, sizeof(P->reserved) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + + if( blake2s_init_param( S, P ) < 0 ) return -1; + + { + uint8_t block[BLAKE2S_BLOCKBYTES]; + memset( block, 0, BLAKE2S_BLOCKBYTES ); + memcpy( block, key, keylen ); + blake2s_update( S, block, BLAKE2S_BLOCKBYTES ); + secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */ + } + return 0; +} + +int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ) +{ + uint32_t m[16]; + uint32_t v[16]; + + for( size_t i = 0; i < 16; ++i ) + m[i] = load32( block + i * sizeof( m[i] ) ); + + for( size_t i = 0; i < 8; ++i ) + v[i] = S->h[i]; + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = S->t[0] ^ blake2s_IV[4]; + v[13] = S->t[1] ^ blake2s_IV[5]; + v[14] = S->f[0] ^ blake2s_IV[6]; + v[15] = S->f[1] ^ blake2s_IV[7]; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = SPH_ROTR32(d ^ a, 16); \ + c = c + d; \ + b = SPH_ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = SPH_ROTR32(d ^ a, 8); \ + c = c + d; \ + b = SPH_ROTR32(b ^ c, 7); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ + } while(0) + ROUND( 0 ); + ROUND( 1 ); + ROUND( 2 ); + ROUND( 3 ); + ROUND( 4 ); + ROUND( 5 ); + ROUND( 6 ); + ROUND( 7 ); + ROUND( 8 ); + ROUND( 9 ); + + for( size_t i = 0; i < 8; ++i ) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + +#undef G +#undef ROUND + return 0; +} + + +int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ) +{ + while( inlen > 0 ) + { + size_t left = S->buflen; + size_t fill = 2 * BLAKE2S_BLOCKBYTES - left; + + if( inlen > fill ) + { + memcpy( S->buf + left, in, fill ); // Fill buffer + S->buflen += fill; + blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); + blake2s_compress( S, S->buf ); // Compress + memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left + S->buflen -= BLAKE2S_BLOCKBYTES; + in += fill; + inlen -= fill; + } + else // inlen <= fill + { + memcpy(S->buf + left, in, (size_t) inlen); + S->buflen += (size_t) 
inlen; // Be lazy, do not compress + in += inlen; + inlen -= inlen; + } + } + + return 0; +} + +int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ) +{ + uint8_t buffer[BLAKE2S_OUTBYTES]; + + if( S->buflen > BLAKE2S_BLOCKBYTES ) + { + blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); + blake2s_compress( S, S->buf ); + S->buflen -= BLAKE2S_BLOCKBYTES; + memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen ); + } + + blake2s_increment_counter( S, ( uint32_t )S->buflen ); + blake2s_set_lastblock( S ); + memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */ + blake2s_compress( S, S->buf ); + + for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ + store32( buffer + sizeof( S->h[i] ) * i, S->h[i] ); + + memcpy( out, buffer, outlen ); + return 0; +} + +int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) +{ + blake2s_state S[1]; + + /* Verify parameters */ + if ( NULL == in ) return -1; + + if ( NULL == out ) return -1; + + if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */ + + if( keylen > 0 ) + { + if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1; + } + else + { + if( blake2s_init( S, outlen ) < 0 ) return -1; + } + + blake2s_update( S, ( uint8_t * )in, inlen ); + blake2s_final( S, out, outlen ); + return 0; +} + +#if defined(__cplusplus) +} +#endif + + +#if defined(BLAKE2S_SELFTEST) +#include +#include "blake2-kat.h" /* test data not included */ +int main( int argc, char **argv ) +{ + uint8_t key[BLAKE2S_KEYBYTES]; + uint8_t buf[KAT_LENGTH]; + + for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i ) + key[i] = ( uint8_t )i; + + for( size_t i = 0; i < KAT_LENGTH; ++i ) + buf[i] = ( uint8_t )i; + + for( size_t i = 0; i < KAT_LENGTH; ++i ) + { + uint8_t hash[BLAKE2S_OUTBYTES]; + blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES ); + + if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) ) + { + puts( "error" ); + return -1; + } + } + + puts( "ok" ); + return 0; +} +#endif diff --git a/sph/blake2s.h b/sph/blake2s.h new file mode 100644 index 0000000000..64aa25b546 --- /dev/null +++ b/sph/blake2s.h @@ -0,0 +1,150 @@ +/** + * BLAKE2 reference source code package - reference C implementations + * + * Written in 2012 by Samuel Neves + * + * To the extent possible under law, the author(s) have dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide. This software is distributed without any warranty. + * + * You should have received a copy of the CC0 Public Domain Dedication along with + * this software. If not, see . 
+ */ +#pragma once +#ifndef __BLAKE2_H__ +#define __BLAKE2_H__ + +#include +#include + +#if defined(_MSC_VER) +#include +#define inline __inline +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__((aligned(x))) +#endif + +/* blake2-impl.h */ + +static inline uint32_t load32(const void *src) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + return *(uint32_t *)(src); +#else + const uint8_t *p = (uint8_t *)src; + uint32_t w = *p++; + w |= (uint32_t)(*p++) << 8; + w |= (uint32_t)(*p++) << 16; + w |= (uint32_t)(*p++) << 24; + return w; +#endif +} + +static inline void store32(void *dst, uint32_t w) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + *(uint32_t *)(dst) = w; +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; +#endif +} + +static inline uint64_t load48(const void *src) +{ + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + return w; +} + +static inline void store48(void *dst, uint64_t w) +{ + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; +} + +/* prevents compiler optimizing out memset() */ +static inline void secure_zero_memory(void *v, size_t n) +{ + volatile uint8_t *p = ( volatile uint8_t * )v; + + while( n-- ) *p++ = 0; +} + +/* blake2.h */ + +enum blake2s_constant +{ + BLAKE2S_BLOCKBYTES = 64, + BLAKE2S_OUTBYTES = 32, + BLAKE2S_KEYBYTES = 32, + BLAKE2S_SALTBYTES = 8, + BLAKE2S_PERSONALBYTES = 8 +}; + +#pragma pack(push, 1) +typedef struct __blake2s_param +{ + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint8_t node_offset[6];// 14 + uint8_t node_depth; // 15 + uint8_t inner_length; // 16 + // uint8_t reserved[0]; + uint8_t salt[BLAKE2S_SALTBYTES]; // 24 + uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 +} blake2s_param; + +ALIGN( 64 ) typedef struct __blake2s_state +{ + uint32_t h[8]; + uint32_t t[2]; + uint32_t f[2]; + uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; + size_t buflen; + uint8_t last_node; +} blake2s_state; +#pragma pack(pop) + +#if defined(__cplusplus) +extern "C" { +#endif + + int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ); + + // Streaming API + int blake2s_init( blake2s_state *S, const uint8_t outlen ); + int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int blake2s_init_param( blake2s_state *S, const blake2s_param *P ); + int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ); + int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ); + + // Simple API + int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + + // Direct Hash Mining Helpers + #define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */ + #define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0) + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/sph/haval.c b/sph/haval.c new file mode 100644 index 0000000000..f9a8918760 --- /dev/null +++ b/sph/haval.c @@ -0,0 +1,983 @@ +/* $Id: haval.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * HAVAL 
implementation. + * + * The HAVAL reference paper is of questionable clarity with regards to + * some details such as endianness of bits within a byte, bytes within + * a 32-bit word, or the actual ordering of words within a stream of + * words. This implementation has been made compatible with the reference + * implementation available on: http://labs.calyptix.com/haval.php + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_haval.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAVAL +#define SPH_SMALL_FOOTPRINT_HAVAL 1 +#endif + +/* + * Basic definition from the reference paper. + * +#define F1(x6, x5, x4, x3, x2, x1, x0) \ + (((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ ((x0) & (x1)) ^ (x0)) + * + */ + +#define F1(x6, x5, x4, x3, x2, x1, x0) \ + (((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0)) + +/* + * Basic definition from the reference paper. + * +#define F2(x6, x5, x4, x3, x2, x1, x0) \ + (((x1) & (x2) & (x3)) ^ ((x2) & (x4) & (x5)) ^ ((x1) & (x2)) \ + ^ ((x1) & (x4)) ^ ((x2) & (x6)) ^ ((x3) & (x5)) \ + ^ ((x4) & (x5)) ^ ((x0) & (x2)) ^ (x0)) + * + */ + +#define F2(x6, x5, x4, x3, x2, x1, x0) \ + (((x2) & (((x1) & ~(x3)) ^ ((x4) & (x5)) ^ (x6) ^ (x0))) \ + ^ ((x4) & ((x1) ^ (x5))) ^ ((x3 & (x5)) ^ (x0))) + +/* + * Basic definition from the reference paper. + * +#define F3(x6, x5, x4, x3, x2, x1, x0) \ + (((x1) & (x2) & (x3)) ^ ((x1) & (x4)) ^ ((x2) & (x5)) \ + ^ ((x3) & (x6)) ^ ((x0) & (x3)) ^ (x0)) + * + */ + +#define F3(x6, x5, x4, x3, x2, x1, x0) \ + (((x3) & (((x1) & (x2)) ^ (x6) ^ (x0))) \ + ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ (x0)) + +/* + * Basic definition from the reference paper. + * +#define F4(x6, x5, x4, x3, x2, x1, x0) \ + (((x1) & (x2) & (x3)) ^ ((x2) & (x4) & (x5)) ^ ((x3) & (x4) & (x6)) \ + ^ ((x1) & (x4)) ^ ((x2) & (x6)) ^ ((x3) & (x4)) ^ ((x3) & (x5)) \ + ^ ((x3) & (x6)) ^ ((x4) & (x5)) ^ ((x4) & (x6)) ^ ((x0) & (x4)) ^ (x0)) + * + */ + +#define F4(x6, x5, x4, x3, x2, x1, x0) \ + (((x3) & (((x1) & (x2)) ^ ((x4) | (x6)) ^ (x5))) \ + ^ ((x4) & ((~(x2) & (x5)) ^ (x1) ^ (x6) ^ (x0))) \ + ^ ((x2) & (x6)) ^ (x0)) + +/* + * Basic definition from the reference paper. 
+ * +#define F5(x6, x5, x4, x3, x2, x1, x0) \ + (((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6)) \ + ^ ((x0) & (x1) & (x2) & (x3)) ^ ((x0) & (x5)) ^ (x0)) + * + */ + +#define F5(x6, x5, x4, x3, x2, x1, x0) \ + (((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \ + ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6))) + +/* + * The macros below integrate the phi() permutations, depending on the + * pass and the total number of passes. + */ + +#define FP3_1(x6, x5, x4, x3, x2, x1, x0) \ + F1(x1, x0, x3, x5, x6, x2, x4) +#define FP3_2(x6, x5, x4, x3, x2, x1, x0) \ + F2(x4, x2, x1, x0, x5, x3, x6) +#define FP3_3(x6, x5, x4, x3, x2, x1, x0) \ + F3(x6, x1, x2, x3, x4, x5, x0) + +#define FP4_1(x6, x5, x4, x3, x2, x1, x0) \ + F1(x2, x6, x1, x4, x5, x3, x0) +#define FP4_2(x6, x5, x4, x3, x2, x1, x0) \ + F2(x3, x5, x2, x0, x1, x6, x4) +#define FP4_3(x6, x5, x4, x3, x2, x1, x0) \ + F3(x1, x4, x3, x6, x0, x2, x5) +#define FP4_4(x6, x5, x4, x3, x2, x1, x0) \ + F4(x6, x4, x0, x5, x2, x1, x3) + +#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \ + F1(x3, x4, x1, x0, x5, x2, x6) +#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \ + F2(x6, x2, x1, x0, x3, x4, x5) +#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \ + F3(x2, x6, x0, x4, x3, x1, x5) +#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \ + F4(x1, x5, x3, x2, x0, x4, x6) +#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \ + F5(x2, x5, x0, x6, x4, x3, x1) + +/* + * One step, for "n" passes, pass number "p" (1 <= p <= n), using + * input word number "w" and step constant "c". + */ +#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) do { \ + sph_u32 t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ + (x7) = SPH_T32(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ + + (w) + (c)); \ + } while (0) + +/* + * PASSy(n, in) computes pass number "y", for a total of "n", using the + * one-argument macro "in" to access input words. Current state is assumed + * to be held in variables "s0" to "s7". 
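 *
 * For example, the first step of pass 2 in the five-pass variant,
 *
 *   STEP(5, 2, s7, s6, s5, s4, s3, s2, s1, s0, in( 5), SPH_C32(0x452821E6));
 *
 * expands (up to the do/while wrapper) to
 *
 *   sph_u32 t = FP5_2(s6, s5, s4, s3, s2, s1, s0);
 *   s7 = SPH_T32(SPH_ROTR32(t, 7) + SPH_ROTR32(s7, 11)
 *        + in( 5) + SPH_C32(0x452821E6));
 *
 * i.e. each step combines the non-linear function of seven state words
 * with two rotations, one message word and one round constant, the roles
 * of s0..s7 rotating by one position from step to step.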
+ */ + +#if SPH_SMALL_FOOTPRINT_HAVAL + +#define PASS1(n, in) do { \ + unsigned pass_count; \ + for (pass_count = 0; pass_count < 32; pass_count += 8) { \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \ + in(pass_count + 0), SPH_C32(0x00000000)); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \ + in(pass_count + 1), SPH_C32(0x00000000)); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \ + in(pass_count + 2), SPH_C32(0x00000000)); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \ + in(pass_count + 3), SPH_C32(0x00000000)); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \ + in(pass_count + 4), SPH_C32(0x00000000)); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \ + in(pass_count + 5), SPH_C32(0x00000000)); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \ + in(pass_count + 6), SPH_C32(0x00000000)); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \ + in(pass_count + 7), SPH_C32(0x00000000)); \ + } \ + } while (0) + +#define PASSG(p, n, in) do { \ + unsigned pass_count; \ + for (pass_count = 0; pass_count < 32; pass_count += 8) { \ + STEP(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \ + in(MP ## p[pass_count + 0]), \ + RK ## p[pass_count + 0]); \ + STEP(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \ + in(MP ## p[pass_count + 1]), \ + RK ## p[pass_count + 1]); \ + STEP(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \ + in(MP ## p[pass_count + 2]), \ + RK ## p[pass_count + 2]); \ + STEP(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \ + in(MP ## p[pass_count + 3]), \ + RK ## p[pass_count + 3]); \ + STEP(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \ + in(MP ## p[pass_count + 4]), \ + RK ## p[pass_count + 4]); \ + STEP(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \ + in(MP ## p[pass_count + 5]), \ + RK ## p[pass_count + 5]); \ + STEP(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \ + in(MP ## p[pass_count + 6]), \ + RK ## p[pass_count + 6]); \ + STEP(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \ + in(MP ## p[pass_count + 7]), \ + RK ## p[pass_count + 7]); \ + } \ + } while (0) + +#define PASS2(n, in) PASSG(2, n, in) +#define PASS3(n, in) PASSG(3, n, in) +#define PASS4(n, in) PASSG(4, n, in) +#define PASS5(n, in) PASSG(5, n, in) + +static const unsigned MP2[32] = { + 5, 14, 26, 18, 11, 28, 7, 16, + 0, 23, 20, 22, 1, 10, 4, 8, + 30, 3, 21, 9, 17, 24, 29, 6, + 19, 12, 15, 13, 2, 25, 31, 27 +}; + +static const unsigned MP3[32] = { + 19, 9, 4, 20, 28, 17, 8, 22, + 29, 14, 25, 12, 24, 30, 16, 26, + 31, 15, 7, 3, 1, 0, 18, 27, + 13, 6, 21, 10, 23, 11, 5, 2 +}; + +static const unsigned MP4[32] = { + 24, 4, 0, 14, 2, 7, 28, 23, + 26, 6, 30, 20, 18, 25, 19, 3, + 22, 11, 31, 21, 8, 27, 12, 9, + 1, 29, 5, 15, 17, 10, 16, 13 +}; + +static const unsigned MP5[32] = { + 27, 3, 21, 26, 17, 11, 20, 29, + 19, 0, 12, 7, 13, 8, 31, 10, + 5, 9, 14, 30, 18, 6, 28, 24, + 2, 23, 16, 22, 4, 1, 25, 15 +}; + +static const sph_u32 RK2[32] = { + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917), + SPH_C32(0x9216D5D9), SPH_C32(0x8979FB1B), + SPH_C32(0xD1310BA6), SPH_C32(0x98DFB5AC), + SPH_C32(0x2FFD72DB), SPH_C32(0xD01ADFB7), + SPH_C32(0xB8E1AFED), SPH_C32(0x6A267E96), + SPH_C32(0xBA7C9045), SPH_C32(0xF12C7F99), + SPH_C32(0x24A19947), SPH_C32(0xB3916CF7), + SPH_C32(0x0801F2E2), SPH_C32(0x858EFC16), + SPH_C32(0x636920D8), SPH_C32(0x71574E69), + SPH_C32(0xA458FEA3), SPH_C32(0xF4933D7E), + SPH_C32(0x0D95748F), SPH_C32(0x728EB658), + SPH_C32(0x718BCD58), SPH_C32(0x82154AEE), + SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5) +}; + +static const sph_u32 RK3[32] = { + 
SPH_C32(0x9C30D539), SPH_C32(0x2AF26013), + SPH_C32(0xC5D1B023), SPH_C32(0x286085F0), + SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF), + SPH_C32(0x8E79DCB0), SPH_C32(0x603A180E), + SPH_C32(0x6C9E0E8B), SPH_C32(0xB01E8A3E), + SPH_C32(0xD71577C1), SPH_C32(0xBD314B27), + SPH_C32(0x78AF2FDA), SPH_C32(0x55605C60), + SPH_C32(0xE65525F3), SPH_C32(0xAA55AB94), + SPH_C32(0x57489862), SPH_C32(0x63E81440), + SPH_C32(0x55CA396A), SPH_C32(0x2AAB10B6), + SPH_C32(0xB4CC5C34), SPH_C32(0x1141E8CE), + SPH_C32(0xA15486AF), SPH_C32(0x7C72E993), + SPH_C32(0xB3EE1411), SPH_C32(0x636FBC2A), + SPH_C32(0x2BA9C55D), SPH_C32(0x741831F6), + SPH_C32(0xCE5C3E16), SPH_C32(0x9B87931E), + SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C) +}; + +static const sph_u32 RK4[32] = { + SPH_C32(0x7A325381), SPH_C32(0x28958677), + SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF), + SPH_C32(0xC4BFE81B), SPH_C32(0x66282193), + SPH_C32(0x61D809CC), SPH_C32(0xFB21A991), + SPH_C32(0x487CAC60), SPH_C32(0x5DEC8032), + SPH_C32(0xEF845D5D), SPH_C32(0xE98575B1), + SPH_C32(0xDC262302), SPH_C32(0xEB651B88), + SPH_C32(0x23893E81), SPH_C32(0xD396ACC5), + SPH_C32(0x0F6D6FF3), SPH_C32(0x83F44239), + SPH_C32(0x2E0B4482), SPH_C32(0xA4842004), + SPH_C32(0x69C8F04A), SPH_C32(0x9E1F9B5E), + SPH_C32(0x21C66842), SPH_C32(0xF6E96C9A), + SPH_C32(0x670C9C61), SPH_C32(0xABD388F0), + SPH_C32(0x6A51A0D2), SPH_C32(0xD8542F68), + SPH_C32(0x960FA728), SPH_C32(0xAB5133A3), + SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4) +}; + +static const sph_u32 RK5[32] = { + SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98), + SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176), + SPH_C32(0x66CA593E), SPH_C32(0x82430E88), + SPH_C32(0x8CEE8619), SPH_C32(0x456F9FB4), + SPH_C32(0x7D84A5C3), SPH_C32(0x3B8B5EBE), + SPH_C32(0xE06F75D8), SPH_C32(0x85C12073), + SPH_C32(0x401A449F), SPH_C32(0x56C16AA6), + SPH_C32(0x4ED3AA62), SPH_C32(0x363F7706), + SPH_C32(0x1BFEDF72), SPH_C32(0x429B023D), + SPH_C32(0x37D0D724), SPH_C32(0xD00A1248), + SPH_C32(0xDB0FEAD3), SPH_C32(0x49F1C09B), + SPH_C32(0x075372C9), SPH_C32(0x80991B7B), + SPH_C32(0x25D479D8), SPH_C32(0xF6E8DEF7), + SPH_C32(0xE3FE501A), SPH_C32(0xB6794C3B), + SPH_C32(0x976CE0BD), SPH_C32(0x04C006BA), + SPH_C32(0xC1A94FB6), SPH_C32(0x409F60C4) +}; + +#else + +#define PASS1(n, in) do { \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in( 0), SPH_C32(0x00000000)); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in( 1), SPH_C32(0x00000000)); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in( 2), SPH_C32(0x00000000)); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in( 3), SPH_C32(0x00000000)); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in( 4), SPH_C32(0x00000000)); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in( 5), SPH_C32(0x00000000)); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in( 6), SPH_C32(0x00000000)); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in( 7), SPH_C32(0x00000000)); \ + \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in( 8), SPH_C32(0x00000000)); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in( 9), SPH_C32(0x00000000)); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in(10), SPH_C32(0x00000000)); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in(11), SPH_C32(0x00000000)); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in(12), SPH_C32(0x00000000)); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in(13), SPH_C32(0x00000000)); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in(14), SPH_C32(0x00000000)); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in(15), SPH_C32(0x00000000)); \ + \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in(16), SPH_C32(0x00000000)); \ + 
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in(17), SPH_C32(0x00000000)); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in(18), SPH_C32(0x00000000)); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in(19), SPH_C32(0x00000000)); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in(20), SPH_C32(0x00000000)); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in(21), SPH_C32(0x00000000)); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in(22), SPH_C32(0x00000000)); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in(23), SPH_C32(0x00000000)); \ + \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in(24), SPH_C32(0x00000000)); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in(25), SPH_C32(0x00000000)); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in(26), SPH_C32(0x00000000)); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in(27), SPH_C32(0x00000000)); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in(28), SPH_C32(0x00000000)); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in(29), SPH_C32(0x00000000)); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in(30), SPH_C32(0x00000000)); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in(31), SPH_C32(0x00000000)); \ + } while (0) + +#define PASS2(n, in) do { \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in( 5), SPH_C32(0x452821E6)); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in(14), SPH_C32(0x38D01377)); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(26), SPH_C32(0xBE5466CF)); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in(18), SPH_C32(0x34E90C6C)); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in(11), SPH_C32(0xC0AC29B7)); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(28), SPH_C32(0xC97C50DD)); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in( 7), SPH_C32(0x3F84D5B5)); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in(16), SPH_C32(0xB5470917)); \ + \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in( 0), SPH_C32(0x9216D5D9)); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in(23), SPH_C32(0x8979FB1B)); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(20), SPH_C32(0xD1310BA6)); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in(22), SPH_C32(0x98DFB5AC)); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in( 1), SPH_C32(0x2FFD72DB)); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(10), SPH_C32(0xD01ADFB7)); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in( 4), SPH_C32(0xB8E1AFED)); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in( 8), SPH_C32(0x6A267E96)); \ + \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in(30), SPH_C32(0xBA7C9045)); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in( 3), SPH_C32(0xF12C7F99)); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(21), SPH_C32(0x24A19947)); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in( 9), SPH_C32(0xB3916CF7)); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in(17), SPH_C32(0x0801F2E2)); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(24), SPH_C32(0x858EFC16)); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in(29), SPH_C32(0x636920D8)); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in( 6), SPH_C32(0x71574E69)); \ + \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in(19), SPH_C32(0xA458FEA3)); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in(12), SPH_C32(0xF4933D7E)); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in(15), SPH_C32(0x0D95748F)); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in(13), SPH_C32(0x728EB658)); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in( 2), SPH_C32(0x718BCD58)); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in(25), SPH_C32(0x82154AEE)); \ + STEP(n, 2, s1, s0, s7, s6, 
s5, s4, s3, s2, in(31), SPH_C32(0x7B54A41D)); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in(27), SPH_C32(0xC25A59B5)); \ + } while (0) + +#define PASS3(n, in) do { \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(19), SPH_C32(0x9C30D539)); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in( 9), SPH_C32(0x2AF26013)); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in( 4), SPH_C32(0xC5D1B023)); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in(20), SPH_C32(0x286085F0)); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in(28), SPH_C32(0xCA417918)); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in(17), SPH_C32(0xB8DB38EF)); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in( 8), SPH_C32(0x8E79DCB0)); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in(22), SPH_C32(0x603A180E)); \ + \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(29), SPH_C32(0x6C9E0E8B)); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in(14), SPH_C32(0xB01E8A3E)); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in(25), SPH_C32(0xD71577C1)); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in(12), SPH_C32(0xBD314B27)); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in(24), SPH_C32(0x78AF2FDA)); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in(30), SPH_C32(0x55605C60)); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in(16), SPH_C32(0xE65525F3)); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in(26), SPH_C32(0xAA55AB94)); \ + \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(31), SPH_C32(0x57489862)); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in(15), SPH_C32(0x63E81440)); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in( 7), SPH_C32(0x55CA396A)); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in( 3), SPH_C32(0x2AAB10B6)); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in( 1), SPH_C32(0xB4CC5C34)); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in( 0), SPH_C32(0x1141E8CE)); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in(18), SPH_C32(0xA15486AF)); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in(27), SPH_C32(0x7C72E993)); \ + \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in(13), SPH_C32(0xB3EE1411)); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in( 6), SPH_C32(0x636FBC2A)); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in(21), SPH_C32(0x2BA9C55D)); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in(10), SPH_C32(0x741831F6)); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in(23), SPH_C32(0xCE5C3E16)); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in(11), SPH_C32(0x9B87931E)); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in( 5), SPH_C32(0xAFD6BA33)); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in( 2), SPH_C32(0x6C24CF5C)); \ + } while (0) + +#define PASS4(n, in) do { \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in(24), SPH_C32(0x7A325381)); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in( 4), SPH_C32(0x28958677)); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in( 0), SPH_C32(0x3B8F4898)); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(14), SPH_C32(0x6B4BB9AF)); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in( 2), SPH_C32(0xC4BFE81B)); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in( 7), SPH_C32(0x66282193)); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(28), SPH_C32(0x61D809CC)); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in(23), SPH_C32(0xFB21A991)); \ + \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in(26), SPH_C32(0x487CAC60)); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in( 6), SPH_C32(0x5DEC8032)); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in(30), SPH_C32(0xEF845D5D)); \ + STEP(n, 4, 
s4, s3, s2, s1, s0, s7, s6, s5, in(20), SPH_C32(0xE98575B1)); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in(18), SPH_C32(0xDC262302)); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in(25), SPH_C32(0xEB651B88)); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(19), SPH_C32(0x23893E81)); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in( 3), SPH_C32(0xD396ACC5)); \ + \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in(22), SPH_C32(0x0F6D6FF3)); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in(11), SPH_C32(0x83F44239)); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in(31), SPH_C32(0x2E0B4482)); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(21), SPH_C32(0xA4842004)); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in( 8), SPH_C32(0x69C8F04A)); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in(27), SPH_C32(0x9E1F9B5E)); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(12), SPH_C32(0x21C66842)); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in( 9), SPH_C32(0xF6E96C9A)); \ + \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in( 1), SPH_C32(0x670C9C61)); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in(29), SPH_C32(0xABD388F0)); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in( 5), SPH_C32(0x6A51A0D2)); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in(15), SPH_C32(0xD8542F68)); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in(17), SPH_C32(0x960FA728)); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in(10), SPH_C32(0xAB5133A3)); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in(16), SPH_C32(0x6EEF0B6C)); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in(13), SPH_C32(0x137A3BE4)); \ + } while (0) + +#define PASS5(n, in) do { \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in(27), SPH_C32(0xBA3BF050)); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in( 3), SPH_C32(0x7EFB2A98)); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(21), SPH_C32(0xA1F1651D)); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in(26), SPH_C32(0x39AF0176)); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in(17), SPH_C32(0x66CA593E)); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in(11), SPH_C32(0x82430E88)); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(20), SPH_C32(0x8CEE8619)); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(29), SPH_C32(0x456F9FB4)); \ + \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in(19), SPH_C32(0x7D84A5C3)); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in( 0), SPH_C32(0x3B8B5EBE)); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(12), SPH_C32(0xE06F75D8)); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in( 7), SPH_C32(0x85C12073)); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in(13), SPH_C32(0x401A449F)); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in( 8), SPH_C32(0x56C16AA6)); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(31), SPH_C32(0x4ED3AA62)); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(10), SPH_C32(0x363F7706)); \ + \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in( 5), SPH_C32(0x1BFEDF72)); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in( 9), SPH_C32(0x429B023D)); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(14), SPH_C32(0x37D0D724)); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in(30), SPH_C32(0xD00A1248)); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in(18), SPH_C32(0xDB0FEAD3)); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in( 6), SPH_C32(0x49F1C09B)); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(28), SPH_C32(0x075372C9)); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(24), SPH_C32(0x80991B7B)); \ + \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, 
s1, s0, in( 2), SPH_C32(0x25D479D8)); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in(23), SPH_C32(0xF6E8DEF7)); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in(16), SPH_C32(0xE3FE501A)); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in(22), SPH_C32(0xB6794C3B)); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in( 4), SPH_C32(0x976CE0BD)); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in( 1), SPH_C32(0x04C006BA)); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in(25), SPH_C32(0xC1A94FB6)); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in(15), SPH_C32(0x409F60C4)); \ + } while (0) + +#endif + +#define SAVE_STATE \ + sph_u32 u0, u1, u2, u3, u4, u5, u6, u7; \ + do { \ + u0 = s0; \ + u1 = s1; \ + u2 = s2; \ + u3 = s3; \ + u4 = s4; \ + u5 = s5; \ + u6 = s6; \ + u7 = s7; \ + } while (0) + +#define UPDATE_STATE do { \ + s0 = SPH_T32(s0 + u0); \ + s1 = SPH_T32(s1 + u1); \ + s2 = SPH_T32(s2 + u2); \ + s3 = SPH_T32(s3 + u3); \ + s4 = SPH_T32(s4 + u4); \ + s5 = SPH_T32(s5 + u5); \ + s6 = SPH_T32(s6 + u6); \ + s7 = SPH_T32(s7 + u7); \ + } while (0) + +/* + * COREn(in) performs the core HAVAL computation for "n" passes, using + * the one-argument macro "in" to access the input words. Running state + * is held in variable "s0" to "s7". + */ + +#define CORE3(in) do { \ + SAVE_STATE; \ + PASS1(3, in); \ + PASS2(3, in); \ + PASS3(3, in); \ + UPDATE_STATE; \ + } while (0) + +#define CORE4(in) do { \ + SAVE_STATE; \ + PASS1(4, in); \ + PASS2(4, in); \ + PASS3(4, in); \ + PASS4(4, in); \ + UPDATE_STATE; \ + } while (0) + +#define CORE5(in) do { \ + SAVE_STATE; \ + PASS1(5, in); \ + PASS2(5, in); \ + PASS3(5, in); \ + PASS4(5, in); \ + PASS5(5, in); \ + UPDATE_STATE; \ + } while (0) + +/* + * DSTATE declares the state variables "s0" to "s7". + */ +#define DSTATE sph_u32 s0, s1, s2, s3, s4, s5, s6, s7 + +/* + * RSTATE fills the state variables from the context "sc". + */ +#define RSTATE do { \ + s0 = sc->s0; \ + s1 = sc->s1; \ + s2 = sc->s2; \ + s3 = sc->s3; \ + s4 = sc->s4; \ + s5 = sc->s5; \ + s6 = sc->s6; \ + s7 = sc->s7; \ + } while (0) + +/* + * WSTATE updates the context "sc" from the state variables. + */ +#define WSTATE do { \ + sc->s0 = s0; \ + sc->s1 = s1; \ + sc->s2 = s2; \ + sc->s3 = s3; \ + sc->s4 = s4; \ + sc->s5 = s5; \ + sc->s6 = s6; \ + sc->s7 = s7; \ + } while (0) + +/* + * Initialize a context. "olen" is the output length, in 32-bit words + * (between 4 and 8, inclusive). "passes" is the number of passes + * (3, 4 or 5). + */ +static void +haval_init(sph_haval_context *sc, unsigned olen, unsigned passes) +{ + sc->s0 = SPH_C32(0x243F6A88); + sc->s1 = SPH_C32(0x85A308D3); + sc->s2 = SPH_C32(0x13198A2E); + sc->s3 = SPH_C32(0x03707344); + sc->s4 = SPH_C32(0xA4093822); + sc->s5 = SPH_C32(0x299F31D0); + sc->s6 = SPH_C32(0x082EFA98); + sc->s7 = SPH_C32(0xEC4E6C89); + sc->olen = olen; + sc->passes = passes; +#if SPH_64 + sc->count = 0; +#else + sc->count_high = 0; + sc->count_low = 0; +#endif + +} + +/* + * IN_PREPARE(data) contains declarations and code to prepare for + * reading input words pointed to by "data". + * INW(i) reads the word number "i" (from 0 to 31). 
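 *
 * Two variants follow: with SPH_LITTLE_FAST, INW(i) decodes word "i"
 * straight from the input buffer each time it is used (cheap on
 * little-endian hosts), otherwise IN_PREPARE() decodes the whole
 * 128-byte block once into a local X_var[32] array.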
+ */ +#if SPH_LITTLE_FAST +#define IN_PREPARE(indata) const unsigned char *const load_ptr = \ + (const unsigned char *)(indata) +#define INW(i) sph_dec32le_aligned(load_ptr + 4 * (i)) +#else +#define IN_PREPARE(indata) \ + sph_u32 X_var[32]; \ + int load_index; \ + \ + for (load_index = 0; load_index < 32; load_index ++) \ + X_var[load_index] = sph_dec32le_aligned( \ + (const unsigned char *)(indata) + 4 * load_index) +#define INW(i) X_var[i] +#endif + +/* + * Mixing operation used for 128-bit output tailoring. This function + * takes the byte 0 from a0, byte 1 from a1, byte 2 from a2 and byte 3 + * from a3, and combines them into a 32-bit word, which is then rotated + * to the left by n bits. + */ +static SPH_INLINE sph_u32 +mix128(sph_u32 a0, sph_u32 a1, sph_u32 a2, sph_u32 a3, int n) +{ + sph_u32 tmp; + + tmp = (a0 & SPH_C32(0x000000FF)) + | (a1 & SPH_C32(0x0000FF00)) + | (a2 & SPH_C32(0x00FF0000)) + | (a3 & SPH_C32(0xFF000000)); + if (n > 0) + tmp = SPH_ROTL32(tmp, n); + return tmp; +} + +/* + * Mixing operation used to compute output word 0 for 160-bit output. + */ +static SPH_INLINE sph_u32 +mix160_0(sph_u32 x5, sph_u32 x6, sph_u32 x7) +{ + sph_u32 tmp; + + tmp = (x5 & SPH_C32(0x01F80000)) + | (x6 & SPH_C32(0xFE000000)) + | (x7 & SPH_C32(0x0000003F)); + return SPH_ROTL32(tmp, 13); +} + +/* + * Mixing operation used to compute output word 1 for 160-bit output. + */ +static SPH_INLINE sph_u32 +mix160_1(sph_u32 x5, sph_u32 x6, sph_u32 x7) +{ + sph_u32 tmp; + + tmp = (x5 & SPH_C32(0xFE000000)) + | (x6 & SPH_C32(0x0000003F)) + | (x7 & SPH_C32(0x00000FC0)); + return SPH_ROTL32(tmp, 7); +} + +/* + * Mixing operation used to compute output word 2 for 160-bit output. + */ +static SPH_INLINE sph_u32 +mix160_2(sph_u32 x5, sph_u32 x6, sph_u32 x7) +{ + sph_u32 tmp; + + tmp = (x5 & SPH_C32(0x0000003F)) + | (x6 & SPH_C32(0x00000FC0)) + | (x7 & SPH_C32(0x0007F000)); + return tmp; +} + +/* + * Mixing operation used to compute output word 3 for 160-bit output. + */ +static SPH_INLINE sph_u32 +mix160_3(sph_u32 x5, sph_u32 x6, sph_u32 x7) +{ + sph_u32 tmp; + + tmp = (x5 & SPH_C32(0x00000FC0)) + | (x6 & SPH_C32(0x0007F000)) + | (x7 & SPH_C32(0x01F80000)); + return tmp >> 6; +} + +/* + * Mixing operation used to compute output word 4 for 160-bit output. + */ +static SPH_INLINE sph_u32 +mix160_4(sph_u32 x5, sph_u32 x6, sph_u32 x7) +{ + sph_u32 tmp; + + tmp = (x5 & SPH_C32(0x0007F000)) + | (x6 & SPH_C32(0x01F80000)) + | (x7 & SPH_C32(0xFE000000)); + return tmp >> 12; +} + +/* + * Mixing operation used to compute output word 0 for 192-bit output. + */ +static SPH_INLINE sph_u32 +mix192_0(sph_u32 x6, sph_u32 x7) +{ + sph_u32 tmp; + + tmp = (x6 & SPH_C32(0xFC000000)) | (x7 & SPH_C32(0x0000001F)); + return SPH_ROTL32(tmp, 6); +} + +/* + * Mixing operation used to compute output word 1 for 192-bit output. + */ +static SPH_INLINE sph_u32 +mix192_1(sph_u32 x6, sph_u32 x7) +{ + return (x6 & SPH_C32(0x0000001F)) | (x7 & SPH_C32(0x000003E0)); +} + +/* + * Mixing operation used to compute output word 2 for 192-bit output. + */ +static SPH_INLINE sph_u32 +mix192_2(sph_u32 x6, sph_u32 x7) +{ + return ((x6 & SPH_C32(0x000003E0)) | (x7 & SPH_C32(0x0000FC00))) >> 5; +} + +/* + * Mixing operation used to compute output word 3 for 192-bit output. + */ +static SPH_INLINE sph_u32 +mix192_3(sph_u32 x6, sph_u32 x7) +{ + return ((x6 & SPH_C32(0x0000FC00)) | (x7 & SPH_C32(0x001F0000))) >> 10; +} + +/* + * Mixing operation used to compute output word 4 for 192-bit output. 
+ */ +static SPH_INLINE sph_u32 +mix192_4(sph_u32 x6, sph_u32 x7) +{ + return ((x6 & SPH_C32(0x001F0000)) | (x7 & SPH_C32(0x03E00000))) >> 16; +} + +/* + * Mixing operation used to compute output word 5 for 192-bit output. + */ +static SPH_INLINE sph_u32 +mix192_5(sph_u32 x6, sph_u32 x7) +{ + return ((x6 & SPH_C32(0x03E00000)) | (x7 & SPH_C32(0xFC000000))) >> 21; +} + +/* + * Write out HAVAL output. The output length is tailored to the requested + * length. + */ +static void +haval_out(sph_haval_context *sc, void *dst) +{ + DSTATE; + unsigned char *buf; + + buf = dst; + RSTATE; + switch (sc->olen) { + case 4: + sph_enc32le(buf, SPH_T32(s0 + mix128(s7, s4, s5, s6, 24))); + sph_enc32le(buf + 4, SPH_T32(s1 + mix128(s6, s7, s4, s5, 16))); + sph_enc32le(buf + 8, SPH_T32(s2 + mix128(s5, s6, s7, s4, 8))); + sph_enc32le(buf + 12, SPH_T32(s3 + mix128(s4, s5, s6, s7, 0))); + break; + case 5: + sph_enc32le(buf, SPH_T32(s0 + mix160_0(s5, s6, s7))); + sph_enc32le(buf + 4, SPH_T32(s1 + mix160_1(s5, s6, s7))); + sph_enc32le(buf + 8, SPH_T32(s2 + mix160_2(s5, s6, s7))); + sph_enc32le(buf + 12, SPH_T32(s3 + mix160_3(s5, s6, s7))); + sph_enc32le(buf + 16, SPH_T32(s4 + mix160_4(s5, s6, s7))); + break; + case 6: + sph_enc32le(buf, SPH_T32(s0 + mix192_0(s6, s7))); + sph_enc32le(buf + 4, SPH_T32(s1 + mix192_1(s6, s7))); + sph_enc32le(buf + 8, SPH_T32(s2 + mix192_2(s6, s7))); + sph_enc32le(buf + 12, SPH_T32(s3 + mix192_3(s6, s7))); + sph_enc32le(buf + 16, SPH_T32(s4 + mix192_4(s6, s7))); + sph_enc32le(buf + 20, SPH_T32(s5 + mix192_5(s6, s7))); + break; + case 7: + sph_enc32le(buf, SPH_T32(s0 + ((s7 >> 27) & 0x1F))); + sph_enc32le(buf + 4, SPH_T32(s1 + ((s7 >> 22) & 0x1F))); + sph_enc32le(buf + 8, SPH_T32(s2 + ((s7 >> 18) & 0x0F))); + sph_enc32le(buf + 12, SPH_T32(s3 + ((s7 >> 13) & 0x1F))); + sph_enc32le(buf + 16, SPH_T32(s4 + ((s7 >> 9) & 0x0F))); + sph_enc32le(buf + 20, SPH_T32(s5 + ((s7 >> 4) & 0x1F))); + sph_enc32le(buf + 24, SPH_T32(s6 + ((s7 ) & 0x0F))); + break; + case 8: + sph_enc32le(buf, s0); + sph_enc32le(buf + 4, s1); + sph_enc32le(buf + 8, s2); + sph_enc32le(buf + 12, s3); + sph_enc32le(buf + 16, s4); + sph_enc32le(buf + 20, s5); + sph_enc32le(buf + 24, s6); + sph_enc32le(buf + 28, s7); + break; + } +} + +/* + * The main core functions inline the code with the COREx() macros. We + * use a helper file, included three times, which avoids code copying. 
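 * Each inclusion below instantiates haval3(), haval4() or haval5()
 * together with the matching havalN_close() routine; the API() macro
 * further down then wraps those into the public sph_havalXXX_N entry
 * points.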
+ */ + +#undef PASSES +#define PASSES 3 +#include "haval_helper.c" + +#undef PASSES +#define PASSES 4 +#include "haval_helper.c" + +#undef PASSES +#define PASSES 5 +#include "haval_helper.c" + +/* ====================================================================== */ + +#define API(xxx, y) \ +void \ +sph_haval ## xxx ## _ ## y ## _init(void *cc) \ +{ \ + haval_init(cc, xxx >> 5, y); \ +} \ + \ +void \ +sph_haval ## xxx ## _ ## y (void *cc, const void *data, size_t len) \ +{ \ + haval ## y(cc, data, len); \ +} \ + \ +void \ +sph_haval ## xxx ## _ ## y ## _close(void *cc, void *dst) \ +{ \ + haval ## y ## _close(cc, 0, 0, dst); \ +} \ + \ +void \ +sph_haval ## xxx ## _ ## y ## addbits_and_close( \ + void *cc, unsigned ub, unsigned n, void *dst) \ +{ \ + haval ## y ## _close(cc, ub, n, dst); \ +} + +API(128, 3) +API(128, 4) +API(128, 5) +API(160, 3) +API(160, 4) +API(160, 5) +API(192, 3) +API(192, 4) +API(192, 5) +API(224, 3) +API(224, 4) +API(224, 5) +API(256, 3) +API(256, 4) +API(256, 5) + +#define RVAL do { \ + s0 = val[0]; \ + s1 = val[1]; \ + s2 = val[2]; \ + s3 = val[3]; \ + s4 = val[4]; \ + s5 = val[5]; \ + s6 = val[6]; \ + s7 = val[7]; \ + } while (0) + +#define WVAL do { \ + val[0] = s0; \ + val[1] = s1; \ + val[2] = s2; \ + val[3] = s3; \ + val[4] = s4; \ + val[5] = s5; \ + val[6] = s6; \ + val[7] = s7; \ + } while (0) + +#define INMSG(i) msg[i] + +/* see sph_haval.h */ +void +sph_haval_3_comp(const sph_u32 msg[32], sph_u32 val[8]) +{ + DSTATE; + + RVAL; + CORE3(INMSG); + WVAL; +} + +/* see sph_haval.h */ +void +sph_haval_4_comp(const sph_u32 msg[32], sph_u32 val[8]) +{ + DSTATE; + + RVAL; + CORE4(INMSG); + WVAL; +} + +/* see sph_haval.h */ +void +sph_haval_5_comp(const sph_u32 msg[32], sph_u32 val[8]) +{ + DSTATE; + + RVAL; + CORE5(INMSG); + WVAL; +} + +#ifdef __cplusplus +} +#endif diff --git a/sph/haval_helper.c b/sph/haval_helper.c new file mode 100644 index 0000000000..ce0a9fd685 --- /dev/null +++ b/sph/haval_helper.c @@ -0,0 +1,190 @@ +/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */ +/* + * Helper code, included (three times !) by HAVAL implementation. + * + * TODO: try to merge this with md_helper.c. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#undef SPH_XCAT +#define SPH_XCAT(a, b) SPH_XCAT_(a, b) +#undef SPH_XCAT_ +#define SPH_XCAT_(a, b) a ## b + +static void +#ifdef SPH_UPTR +SPH_XCAT(SPH_XCAT(haval, PASSES), _short) +#else +SPH_XCAT(haval, PASSES) +#endif +(sph_haval_context *sc, const void *data, size_t len) +{ + unsigned current; + +#if SPH_64 + current = (unsigned)sc->count & 127U; +#else + current = (unsigned)sc->count_low & 127U; +#endif + while (len > 0) { + unsigned clen; +#if !SPH_64 + sph_u32 clow, clow2; +#endif + + clen = 128U - current; + if (clen > len) + clen = len; + memcpy(sc->buf + current, data, clen); + data = (const unsigned char *)data + clen; + current += clen; + len -= clen; + if (current == 128U) { + DSTATE; + IN_PREPARE(sc->buf); + RSTATE; + SPH_XCAT(CORE, PASSES)(INW); + WSTATE; + current = 0; + } +#if SPH_64 + sc->count += clen; +#else + clow = sc->count_low; + clow2 = SPH_T32(clow + clen); + sc->count_low = clow2; + if (clow2 < clow) + sc->count_high ++; +#endif + } +} + +#ifdef SPH_UPTR +static void +SPH_XCAT(haval, PASSES)(sph_haval_context *sc, const void *data, size_t len) +{ + unsigned current; + size_t orig_len; +#if !SPH_64 + sph_u32 clow, clow2; +#endif + DSTATE; + + if (len < 256U) { + SPH_XCAT(SPH_XCAT(haval, PASSES), _short)(sc, data, len); + return; + } +#if SPH_64 + current = (unsigned)sc->count & 127U; +#else + current = (unsigned)sc->count_low & 127U; +#endif + if (current > 0) { + unsigned clen; + clen = 128U - current; + SPH_XCAT(SPH_XCAT(haval, PASSES), _short)(sc, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + } +#if !SPH_UNALIGNED + if (((SPH_UPTR)data & 3U) != 0) { + SPH_XCAT(SPH_XCAT(haval, PASSES), _short)(sc, data, len); + return; + } +#endif + orig_len = len; + RSTATE; + while (len >= 128U) { + IN_PREPARE(data); + SPH_XCAT(CORE, PASSES)(INW); + data = (const unsigned char *)data + 128U; + len -= 128U; + } + WSTATE; + if (len > 0) + memcpy(sc->buf, data, len); +#if SPH_64 + sc->count += (sph_u64)orig_len; +#else + clow = sc->count_low; + clow2 = SPH_T32(clow + orig_len); + sc->count_low = clow2; + if (clow2 < clow) + sc->count_high ++; + orig_len >>= 12; + orig_len >>= 10; + orig_len >>= 10; + sc->count_high += orig_len; +#endif +} +#endif + +static void +SPH_XCAT(SPH_XCAT(haval, PASSES), _close)(sph_haval_context *sc, + unsigned ub, unsigned n, void *dst) +{ + unsigned current; + DSTATE; + +#if SPH_64 + current = (unsigned)sc->count & 127U; +#else + current = (unsigned)sc->count_low & 127U; +#endif + sc->buf[current ++] = (0x01 << n) | ((ub & 0xFF) >> (8 - n)); + RSTATE; + if (current > 118U) { + memset(sc->buf + current, 0, 128U - current); + + do { + IN_PREPARE(sc->buf); + SPH_XCAT(CORE, PASSES)(INW); + } while (0); + current = 0; + } + memset(sc->buf + current, 0, 118U - current); + sc->buf[118] = 0x01 | (PASSES << 3); + sc->buf[119] = sc->olen << 3; +#if SPH_64 + sph_enc64le_aligned(sc->buf + 120, SPH_T64(sc->count << 3)); +#else + sph_enc32le_aligned(sc->buf + 120, SPH_T32(sc->count_low << 3)); + sph_enc32le_aligned(sc->buf + 124, + SPH_T32((sc->count_high << 3) | (sc->count_low >> 29))); +#endif + + do { + IN_PREPARE(sc->buf); + SPH_XCAT(CORE, PASSES)(INW); + } while (0); + WSTATE; + + haval_out(sc, dst); + haval_init(sc, sc->olen, sc->passes); +} diff --git a/sph/md_helper.c b/sph/md_helper.c new file mode 100644 index 0000000000..5384f03f73 --- /dev/null +++ b/sph/md_helper.c @@ -0,0 +1,346 @@ +/* $Id: md_helper.c 216 
2010-06-08 09:46:57Z tp $ */ +/* + * This file contains some functions which implement the external data + * handling and padding for Merkle-Damgard hash functions which follow + * the conventions set out by MD4 (little-endian) or SHA-1 (big-endian). + * + * API: this file is meant to be included, not compiled as a stand-alone + * file. Some macros must be defined: + * RFUN name for the round function + * HASH "short name" for the hash function + * BE32 defined for big-endian, 32-bit based (e.g. SHA-1) + * LE32 defined for little-endian, 32-bit based (e.g. MD5) + * BE64 defined for big-endian, 64-bit based (e.g. SHA-512) + * LE64 defined for little-endian, 64-bit based (no example yet) + * PW01 if defined, append 0x01 instead of 0x80 (for Tiger) + * BLEN if defined, length of a message block (in bytes) + * PLW1 if defined, length is defined on one 64-bit word only (for Tiger) + * PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL) + * SVAL if defined, reference to the context state information + * + * BLEN is used when a message block is not 16 (32-bit or 64-bit) words: + * this is used for instance for Tiger, which works on 64-bit words but + * uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are + * ignored if 32-bit words are used; if 64-bit words are used and PLW1 is + * set, then only one word (64 bits) will be used to encode the input + * message length (in bits), otherwise two words will be used (as in + * SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but + * not PLW1), four 64-bit words will be used to encode the message length + * (in bits). Note that regardless of those settings, only 64-bit message + * lengths are supported (in bits): messages longer than 2 Exabytes will be + * improperly hashed (this is unlikely to happen soon: 2 Exabytes is about + * 2 millions Terabytes, which is huge). + * + * If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close() + * function. This is used for Tiger2, which is identical to Tiger except + * when it comes to the padding (Tiger2 uses the standard 0x80 byte instead + * of the 0x01 from original Tiger). + * + * The RFUN function is invoked with two arguments, the first pointing to + * aligned data (as a "const void *"), the second being state information + * from the context structure. By default, this state information is the + * "val" field from the context, and this field is assumed to be an array + * of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64). + * from the context structure. The "val" field can have any type, except + * for the output encoding which assumes that it is an array of "sph_u32" + * values. By defining NO_OUTPUT, this last step is deactivated; the + * includer code is then responsible for writing out the hash result. When + * NO_OUTPUT is defined, the third parameter to the "close()" function is + * ignored. 
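 *
 * As a rough usage sketch (the names "my_round" and "myhash" below are
 * placeholders, not taken from any file in this tree), a little-endian
 * 32-bit hash would typically be assembled as:
 *
 *   #define RFUN   my_round     // compression function: my_round(buf, val)
 *   #define HASH   myhash
 *   #define LE32   1
 *   #include "md_helper.c"
 *
 * which produces sph_myhash() plus the static myhash_close() and
 * myhash_addbits_and_close() helpers that the corresponding public
 * sph_myhash_close() wrapper can call with the desired output word count.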
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#undef SPH_XCAT +#define SPH_XCAT(a, b) SPH_XCAT_(a, b) +#undef SPH_XCAT_ +#define SPH_XCAT_(a, b) a ## b + +#undef SPH_BLEN +#undef SPH_WLEN +#if defined BE64 || defined LE64 +#define SPH_BLEN 128U +#define SPH_WLEN 8U +#else +#define SPH_BLEN 64U +#define SPH_WLEN 4U +#endif + +#ifdef BLEN +#undef SPH_BLEN +#define SPH_BLEN BLEN +#endif + +#undef SPH_MAXPAD +#if defined PLW1 +#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN) +#elif defined PLW4 +#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2)) +#else +#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1)) +#endif + +#undef SPH_VAL +#undef SPH_NO_OUTPUT +#ifdef SVAL +#define SPH_VAL SVAL +#define SPH_NO_OUTPUT 1 +#else +#define SPH_VAL sc->val +#endif + +#ifndef CLOSE_ONLY + +#ifdef SPH_UPTR +static void +SPH_XCAT(HASH, _short)(void *cc, const void *data, size_t len) +#else +void +SPH_XCAT(sph_, HASH)(void *cc, const void *data, size_t len) +#endif +{ + SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc; + unsigned current; + + sc = cc; +#if SPH_64 + current = (unsigned)sc->count & (SPH_BLEN - 1U); +#else + current = (unsigned)sc->count_low & (SPH_BLEN - 1U); +#endif + while (len > 0) { + unsigned clen; +#if !SPH_64 + sph_u32 clow, clow2; +#endif + + clen = SPH_BLEN - current; + if (clen > len) + clen = len; + memcpy(sc->buf + current, data, clen); + data = (const unsigned char *)data + clen; + current += clen; + len -= clen; + if (current == SPH_BLEN) { + RFUN(sc->buf, SPH_VAL); + current = 0; + } +#if SPH_64 + sc->count += clen; +#else + clow = sc->count_low; + clow2 = SPH_T32(clow + clen); + sc->count_low = clow2; + if (clow2 < clow) + sc->count_high ++; +#endif + } +} + +#ifdef SPH_UPTR +void +SPH_XCAT(sph_, HASH)(void *cc, const void *data, size_t len) +{ + SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc; + unsigned current; + size_t orig_len; +#if !SPH_64 + sph_u32 clow, clow2; +#endif + + if (len < (2 * SPH_BLEN)) { + SPH_XCAT(HASH, _short)(cc, data, len); + return; + } + sc = cc; +#if SPH_64 + current = (unsigned)sc->count & (SPH_BLEN - 1U); +#else + current = (unsigned)sc->count_low & (SPH_BLEN - 1U); +#endif + if (current > 0) { + unsigned t; + + t = SPH_BLEN - current; + 
SPH_XCAT(HASH, _short)(cc, data, t); + data = (const unsigned char *)data + t; + len -= t; + } +#if !SPH_UNALIGNED + if (((SPH_UPTR)data & (SPH_WLEN - 1U)) != 0) { + SPH_XCAT(HASH, _short)(cc, data, len); + return; + } +#endif + orig_len = len; + while (len >= SPH_BLEN) { + RFUN(data, SPH_VAL); + len -= SPH_BLEN; + data = (const unsigned char *)data + SPH_BLEN; + } + if (len > 0) + memcpy(sc->buf, data, len); +#if SPH_64 + sc->count += (sph_u64)orig_len; +#else + clow = sc->count_low; + clow2 = SPH_T32(clow + orig_len); + sc->count_low = clow2; + if (clow2 < clow) + sc->count_high ++; + /* + * This code handles the improbable situation where "size_t" is + * greater than 32 bits, and yet we do not have a 64-bit type. + */ + orig_len >>= 12; + orig_len >>= 10; + orig_len >>= 10; + sc->count_high += orig_len; +#endif +} +#endif + +#endif + +/* + * Perform padding and produce result. The context is NOT reinitialized + * by this function. + */ +static void +SPH_XCAT(HASH, _addbits_and_close)(void *cc, + unsigned ub, unsigned n, void *dst, unsigned rnum) +{ + SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc; + unsigned current, u; +#if !SPH_64 + sph_u32 low, high; +#endif + + sc = cc; +#if SPH_64 + current = (unsigned)sc->count & (SPH_BLEN - 1U); +#else + current = (unsigned)sc->count_low & (SPH_BLEN - 1U); +#endif +#ifdef PW01 + sc->buf[current ++] = (0x100 | (ub & 0xFF)) >> (8 - n); +#else + { + unsigned z; + + z = 0x80 >> n; + sc->buf[current ++] = ((ub & -z) | z) & 0xFF; + } +#endif + if (current > SPH_MAXPAD) { + memset(sc->buf + current, 0, SPH_BLEN - current); + RFUN(sc->buf, SPH_VAL); + memset(sc->buf, 0, SPH_MAXPAD); + } else { + memset(sc->buf + current, 0, SPH_MAXPAD - current); + } +#if defined BE64 +#if defined PLW1 + sph_enc64be_aligned(sc->buf + SPH_MAXPAD, + SPH_T64(sc->count << 3) + (sph_u64)n); +#elif defined PLW4 + memset(sc->buf + SPH_MAXPAD, 0, 2 * SPH_WLEN); + sph_enc64be_aligned(sc->buf + SPH_MAXPAD + 2 * SPH_WLEN, + sc->count >> 61); + sph_enc64be_aligned(sc->buf + SPH_MAXPAD + 3 * SPH_WLEN, + SPH_T64(sc->count << 3) + (sph_u64)n); +#else + sph_enc64be_aligned(sc->buf + SPH_MAXPAD, sc->count >> 61); + sph_enc64be_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN, + SPH_T64(sc->count << 3) + (sph_u64)n); +#endif +#elif defined LE64 +#if defined PLW1 + sph_enc64le_aligned(sc->buf + SPH_MAXPAD, + SPH_T64(sc->count << 3) + (sph_u64)n); +#elif defined PLW1 + sph_enc64le_aligned(sc->buf + SPH_MAXPAD, + SPH_T64(sc->count << 3) + (sph_u64)n); + sph_enc64le_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN, sc->count >> 61); + memset(sc->buf + SPH_MAXPAD + 2 * SPH_WLEN, 0, 2 * SPH_WLEN); +#else + sph_enc64le_aligned(sc->buf + SPH_MAXPAD, + SPH_T64(sc->count << 3) + (sph_u64)n); + sph_enc64le_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN, sc->count >> 61); +#endif +#else +#if SPH_64 +#ifdef BE32 + sph_enc64be_aligned(sc->buf + SPH_MAXPAD, + SPH_T64(sc->count << 3) + (sph_u64)n); +#else + sph_enc64le_aligned(sc->buf + SPH_MAXPAD, + SPH_T64(sc->count << 3) + (sph_u64)n); +#endif +#else + low = sc->count_low; + high = SPH_T32((sc->count_high << 3) | (low >> 29)); + low = SPH_T32(low << 3) + (sph_u32)n; +#ifdef BE32 + sph_enc32be(sc->buf + SPH_MAXPAD, high); + sph_enc32be(sc->buf + SPH_MAXPAD + SPH_WLEN, low); +#else + sph_enc32le(sc->buf + SPH_MAXPAD, low); + sph_enc32le(sc->buf + SPH_MAXPAD + SPH_WLEN, high); +#endif +#endif +#endif + RFUN(sc->buf, SPH_VAL); +#ifdef SPH_NO_OUTPUT + (void)dst; + (void)rnum; + (void)u; +#else + for (u = 0; u < rnum; u ++) { +#if defined BE64 + sph_enc64be((unsigned char *)dst + 8 * 
u, sc->val[u]); +#elif defined LE64 + sph_enc64le((unsigned char *)dst + 8 * u, sc->val[u]); +#elif defined BE32 + sph_enc32be((unsigned char *)dst + 4 * u, sc->val[u]); +#else + sph_enc32le((unsigned char *)dst + 4 * u, sc->val[u]); +#endif + } +#endif +} + +static void +SPH_XCAT(HASH, _close)(void *cc, void *dst, unsigned rnum) +{ + SPH_XCAT(HASH, _addbits_and_close)(cc, 0, 0, dst, rnum); +} diff --git a/sph/ripemd.c b/sph/ripemd.c new file mode 100644 index 0000000000..dd12b1fa2a --- /dev/null +++ b/sph/ripemd.c @@ -0,0 +1,833 @@ +/* $Id: ripemd.c 216 2010-06-08 09:46:57Z tp $ */ +/* + * RIPEMD-160 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_ripemd.h" + +/* + * Round functions for RIPEMD (original). + */ +#define F(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define G(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) + +static const sph_u32 oIV[5] = { + SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), + SPH_C32(0x98BADCFE), SPH_C32(0x10325476) +}; + +/* + * Round functions for RIPEMD-128 and RIPEMD-160. + */ +#define F1(x, y, z) ((x) ^ (y) ^ (z)) +#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define F3(x, y, z) (((x) | ~(y)) ^ (z)) +#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) +#define F5(x, y, z) ((x) ^ ((y) | ~(z))) + +static const sph_u32 IV[5] = { + SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), SPH_C32(0x98BADCFE), + SPH_C32(0x10325476), SPH_C32(0xC3D2E1F0) +}; + +#define ROTL SPH_ROTL32 + +/* ===================================================================== */ +/* + * RIPEMD (original hash, deprecated). 
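A note on the md_helper.c finalization completed just above: _addbits_and_close writes the 0x80 padding marker, zero-fills up to the length field, stores the bit count in the block type's endianness, runs RFUN one last time, and only then serializes the state words. The standalone sketch below is not part of the diff; the helper name and the 64-byte-block / 64-bit-counter layout are assumptions chosen for illustration, and it covers only a byte-aligned message whose final block still has room for the counter.

#include <stdint.h>
#include <string.h>

/*
 * Sketch of the padding layout _addbits_and_close produces for a byte-aligned
 * message on a hash with 64-byte blocks and a 64-bit length counter.
 * Assumes used <= 55 so the 0x80 marker and the counter fit in this block;
 * otherwise an extra all-padding block would be processed first.
 */
static void md_finalize_block(uint8_t block[64], size_t used,
                              uint64_t total_len_bytes, int big_endian)
{
    uint64_t bits = total_len_bytes << 3;      /* the counter stores bits, not bytes */

    block[used] = 0x80;                        /* the single appended "1" bit */
    memset(block + used + 1, 0, 56 - (used + 1));
    for (int i = 0; i < 8; i++) {              /* 64-bit counter at bytes 56..63 */
        int shift = big_endian ? 8 * (7 - i) : 8 * i;
        block[56 + i] = (uint8_t)(bits >> shift);
    }
}

The little-endian hashes in this change set (RIPEMD, Shabal via its own scheme aside) and the big-endian ones (SHA-2) differ here only in the byte order of that counter; the 64-bit-word hashes place a wider counter in the same trailing position.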
+ */ + +#define FF1(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + F(B, C, D) + (X)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define GG1(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + G(B, C, D) \ + + (X) + SPH_C32(0x5A827999)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define HH1(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + H(B, C, D) \ + + (X) + SPH_C32(0x6ED9EBA1)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define FF2(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + F(B, C, D) \ + + (X) + SPH_C32(0x50A28BE6)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define GG2(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + G(B, C, D) + (X)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define HH2(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + H(B, C, D) \ + + (X) + SPH_C32(0x5C4DD124)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define RIPEMD_ROUND_BODY(in, h) do { \ + sph_u32 A1, B1, C1, D1; \ + sph_u32 A2, B2, C2, D2; \ + sph_u32 tmp; \ + \ + A1 = A2 = (h)[0]; \ + B1 = B2 = (h)[1]; \ + C1 = C2 = (h)[2]; \ + D1 = D2 = (h)[3]; \ + \ + FF1(A1, B1, C1, D1, in( 0), 11); \ + FF1(D1, A1, B1, C1, in( 1), 14); \ + FF1(C1, D1, A1, B1, in( 2), 15); \ + FF1(B1, C1, D1, A1, in( 3), 12); \ + FF1(A1, B1, C1, D1, in( 4), 5); \ + FF1(D1, A1, B1, C1, in( 5), 8); \ + FF1(C1, D1, A1, B1, in( 6), 7); \ + FF1(B1, C1, D1, A1, in( 7), 9); \ + FF1(A1, B1, C1, D1, in( 8), 11); \ + FF1(D1, A1, B1, C1, in( 9), 13); \ + FF1(C1, D1, A1, B1, in(10), 14); \ + FF1(B1, C1, D1, A1, in(11), 15); \ + FF1(A1, B1, C1, D1, in(12), 6); \ + FF1(D1, A1, B1, C1, in(13), 7); \ + FF1(C1, D1, A1, B1, in(14), 9); \ + FF1(B1, C1, D1, A1, in(15), 8); \ + \ + GG1(A1, B1, C1, D1, in( 7), 7); \ + GG1(D1, A1, B1, C1, in( 4), 6); \ + GG1(C1, D1, A1, B1, in(13), 8); \ + GG1(B1, C1, D1, A1, in( 1), 13); \ + GG1(A1, B1, C1, D1, in(10), 11); \ + GG1(D1, A1, B1, C1, in( 6), 9); \ + GG1(C1, D1, A1, B1, in(15), 7); \ + GG1(B1, C1, D1, A1, in( 3), 15); \ + GG1(A1, B1, C1, D1, in(12), 7); \ + GG1(D1, A1, B1, C1, in( 0), 12); \ + GG1(C1, D1, A1, B1, in( 9), 15); \ + GG1(B1, C1, D1, A1, in( 5), 9); \ + GG1(A1, B1, C1, D1, in(14), 7); \ + GG1(D1, A1, B1, C1, in( 2), 11); \ + GG1(C1, D1, A1, B1, in(11), 13); \ + GG1(B1, C1, D1, A1, in( 8), 12); \ + \ + HH1(A1, B1, C1, D1, in( 3), 11); \ + HH1(D1, A1, B1, C1, in(10), 13); \ + HH1(C1, D1, A1, B1, in( 2), 14); \ + HH1(B1, C1, D1, A1, in( 4), 7); \ + HH1(A1, B1, C1, D1, in( 9), 14); \ + HH1(D1, A1, B1, C1, in(15), 9); \ + HH1(C1, D1, A1, B1, in( 8), 13); \ + HH1(B1, C1, D1, A1, in( 1), 15); \ + HH1(A1, B1, C1, D1, in(14), 6); \ + HH1(D1, A1, B1, C1, in( 7), 8); \ + HH1(C1, D1, A1, B1, in( 0), 13); \ + HH1(B1, C1, D1, A1, in( 6), 6); \ + HH1(A1, B1, C1, D1, in(11), 12); \ + HH1(D1, A1, B1, C1, in(13), 5); \ + HH1(C1, D1, A1, B1, in( 5), 7); \ + HH1(B1, C1, D1, A1, in(12), 5); \ + \ + FF2(A2, B2, C2, D2, in( 0), 11); \ + FF2(D2, A2, B2, C2, in( 1), 14); \ + FF2(C2, D2, A2, B2, in( 2), 15); \ + FF2(B2, C2, D2, A2, in( 3), 12); \ + FF2(A2, B2, C2, D2, in( 4), 5); \ + FF2(D2, A2, B2, C2, in( 5), 8); \ + FF2(C2, D2, A2, B2, in( 6), 7); \ + FF2(B2, C2, D2, A2, in( 7), 9); \ + FF2(A2, B2, C2, D2, in( 8), 11); \ + FF2(D2, A2, B2, C2, in( 9), 13); \ + FF2(C2, D2, A2, B2, in(10), 14); \ + FF2(B2, C2, D2, A2, in(11), 15); \ + FF2(A2, B2, C2, D2, in(12), 6); \ + FF2(D2, A2, B2, C2, in(13), 7); \ + FF2(C2, D2, A2, B2, in(14), 9); \ + FF2(B2, C2, D2, A2, in(15), 8); \ + \ + GG2(A2, B2, C2, D2, in( 7), 7); \ + GG2(D2, A2, B2, C2, in( 4), 6); \ + GG2(C2, D2, A2, B2, 
in(13), 8); \ + GG2(B2, C2, D2, A2, in( 1), 13); \ + GG2(A2, B2, C2, D2, in(10), 11); \ + GG2(D2, A2, B2, C2, in( 6), 9); \ + GG2(C2, D2, A2, B2, in(15), 7); \ + GG2(B2, C2, D2, A2, in( 3), 15); \ + GG2(A2, B2, C2, D2, in(12), 7); \ + GG2(D2, A2, B2, C2, in( 0), 12); \ + GG2(C2, D2, A2, B2, in( 9), 15); \ + GG2(B2, C2, D2, A2, in( 5), 9); \ + GG2(A2, B2, C2, D2, in(14), 7); \ + GG2(D2, A2, B2, C2, in( 2), 11); \ + GG2(C2, D2, A2, B2, in(11), 13); \ + GG2(B2, C2, D2, A2, in( 8), 12); \ + \ + HH2(A2, B2, C2, D2, in( 3), 11); \ + HH2(D2, A2, B2, C2, in(10), 13); \ + HH2(C2, D2, A2, B2, in( 2), 14); \ + HH2(B2, C2, D2, A2, in( 4), 7); \ + HH2(A2, B2, C2, D2, in( 9), 14); \ + HH2(D2, A2, B2, C2, in(15), 9); \ + HH2(C2, D2, A2, B2, in( 8), 13); \ + HH2(B2, C2, D2, A2, in( 1), 15); \ + HH2(A2, B2, C2, D2, in(14), 6); \ + HH2(D2, A2, B2, C2, in( 7), 8); \ + HH2(C2, D2, A2, B2, in( 0), 13); \ + HH2(B2, C2, D2, A2, in( 6), 6); \ + HH2(A2, B2, C2, D2, in(11), 12); \ + HH2(D2, A2, B2, C2, in(13), 5); \ + HH2(C2, D2, A2, B2, in( 5), 7); \ + HH2(B2, C2, D2, A2, in(12), 5); \ + \ + tmp = SPH_T32((h)[1] + C1 + D2); \ + (h)[1] = SPH_T32((h)[2] + D1 + A2); \ + (h)[2] = SPH_T32((h)[3] + A1 + B2); \ + (h)[3] = SPH_T32((h)[0] + B1 + C2); \ + (h)[0] = tmp; \ + } while (0) + +/* + * One round of RIPEMD. The data must be aligned for 32-bit access. + */ +static void +ripemd_round(const unsigned char *data, sph_u32 r[5]) +{ +#if SPH_LITTLE_FAST + +#define RIPEMD_IN(x) sph_dec32le_aligned(data + (4 * (x))) + +#else + + sph_u32 X_var[16]; + int i; + + for (i = 0; i < 16; i ++) + X_var[i] = sph_dec32le_aligned(data + 4 * i); +#define RIPEMD_IN(x) X_var[x] + +#endif + RIPEMD_ROUND_BODY(RIPEMD_IN, r); +#undef RIPEMD_IN +} + +/* see sph_ripemd.h */ +void +sph_ripemd_init(void *cc) +{ + sph_ripemd_context *sc; + + sc = cc; + memcpy(sc->val, oIV, sizeof sc->val); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +#define RFUN ripemd_round +#define HASH ripemd +#define LE32 1 +#include "md_helper.c" +#undef RFUN +#undef HASH +#undef LE32 + +/* see sph_ripemd.h */ +void +sph_ripemd_close(void *cc, void *dst) +{ + ripemd_close(cc, dst, 4); + sph_ripemd_init(cc); +} + +/* see sph_ripemd.h */ +void +sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]) +{ +#define RIPEMD_IN(x) msg[x] + RIPEMD_ROUND_BODY(RIPEMD_IN, val); +#undef RIPEMD_IN +} + +/* ===================================================================== */ +/* + * RIPEMD-128. + */ + +/* + * Round constants for RIPEMD-128. + */ +#define sK11 SPH_C32(0x00000000) +#define sK12 SPH_C32(0x5A827999) +#define sK13 SPH_C32(0x6ED9EBA1) +#define sK14 SPH_C32(0x8F1BBCDC) + +#define sK21 SPH_C32(0x50A28BE6) +#define sK22 SPH_C32(0x5C4DD124) +#define sK23 SPH_C32(0x6D703EF3) +#define sK24 SPH_C32(0x00000000) + +#define sRR(a, b, c, d, f, s, r, k) do { \ + a = ROTL(SPH_T32(a + f(b, c, d) + r + k), s); \ + } while (0) + +#define sROUND1(a, b, c, d, f, s, r, k) \ + sRR(a ## 1, b ## 1, c ## 1, d ## 1, f, s, r, sK1 ## k) + +#define sROUND2(a, b, c, d, f, s, r, k) \ + sRR(a ## 2, b ## 2, c ## 2, d ## 2, f, s, r, sK2 ## k) + +/* + * This macro defines the body for a RIPEMD-128 compression function + * implementation. The "in" parameter should evaluate, when applied to a + * numerical input parameter from 0 to 15, to an expression which yields + * the corresponding input block. The "h" parameter should evaluate to + * an array or pointer expression designating the array of 4 words which + * contains the input and output of the compression function. 
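Stepping back to the #define RFUN / #define HASH / #include "md_helper.c" block above: md_helper.c is a template that gets included once per digest, and SPH_XCAT pastes the HASH token into every generated identifier, which is how sph_ripemd, ripemd_close and the other wrappers come into existence without being written out by hand. A minimal standalone demo of the same token-pasting trick (identifiers below are illustrative, not from sphlib):

#include <stdio.h>

/* XCAT(HASH, _close) expands to demo_close when HASH is defined as demo,
 * mirroring what SPH_XCAT does inside md_helper.c. */
#define XCAT_(a, b) a ## b
#define XCAT(a, b)  XCAT_(a, b)

#define HASH demo
static void XCAT(HASH, _close)(void)   /* expands to: static void demo_close(void) */
{
    puts("demo_close called");
}
#undef HASH

int main(void)
{
    demo_close();   /* the generated identifier is an ordinary function name */
    return 0;
}

Because LE32 is also defined before the include, the template additionally selects the little-endian length and output encoding seen in the close routine earlier.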
+ */ + +#define RIPEMD128_ROUND_BODY(in, h) do { \ + sph_u32 A1, B1, C1, D1; \ + sph_u32 A2, B2, C2, D2; \ + sph_u32 tmp; \ + \ + A1 = A2 = (h)[0]; \ + B1 = B2 = (h)[1]; \ + C1 = C2 = (h)[2]; \ + D1 = D2 = (h)[3]; \ + \ + sROUND1(A, B, C, D, F1, 11, in( 0), 1); \ + sROUND1(D, A, B, C, F1, 14, in( 1), 1); \ + sROUND1(C, D, A, B, F1, 15, in( 2), 1); \ + sROUND1(B, C, D, A, F1, 12, in( 3), 1); \ + sROUND1(A, B, C, D, F1, 5, in( 4), 1); \ + sROUND1(D, A, B, C, F1, 8, in( 5), 1); \ + sROUND1(C, D, A, B, F1, 7, in( 6), 1); \ + sROUND1(B, C, D, A, F1, 9, in( 7), 1); \ + sROUND1(A, B, C, D, F1, 11, in( 8), 1); \ + sROUND1(D, A, B, C, F1, 13, in( 9), 1); \ + sROUND1(C, D, A, B, F1, 14, in(10), 1); \ + sROUND1(B, C, D, A, F1, 15, in(11), 1); \ + sROUND1(A, B, C, D, F1, 6, in(12), 1); \ + sROUND1(D, A, B, C, F1, 7, in(13), 1); \ + sROUND1(C, D, A, B, F1, 9, in(14), 1); \ + sROUND1(B, C, D, A, F1, 8, in(15), 1); \ + \ + sROUND1(A, B, C, D, F2, 7, in( 7), 2); \ + sROUND1(D, A, B, C, F2, 6, in( 4), 2); \ + sROUND1(C, D, A, B, F2, 8, in(13), 2); \ + sROUND1(B, C, D, A, F2, 13, in( 1), 2); \ + sROUND1(A, B, C, D, F2, 11, in(10), 2); \ + sROUND1(D, A, B, C, F2, 9, in( 6), 2); \ + sROUND1(C, D, A, B, F2, 7, in(15), 2); \ + sROUND1(B, C, D, A, F2, 15, in( 3), 2); \ + sROUND1(A, B, C, D, F2, 7, in(12), 2); \ + sROUND1(D, A, B, C, F2, 12, in( 0), 2); \ + sROUND1(C, D, A, B, F2, 15, in( 9), 2); \ + sROUND1(B, C, D, A, F2, 9, in( 5), 2); \ + sROUND1(A, B, C, D, F2, 11, in( 2), 2); \ + sROUND1(D, A, B, C, F2, 7, in(14), 2); \ + sROUND1(C, D, A, B, F2, 13, in(11), 2); \ + sROUND1(B, C, D, A, F2, 12, in( 8), 2); \ + \ + sROUND1(A, B, C, D, F3, 11, in( 3), 3); \ + sROUND1(D, A, B, C, F3, 13, in(10), 3); \ + sROUND1(C, D, A, B, F3, 6, in(14), 3); \ + sROUND1(B, C, D, A, F3, 7, in( 4), 3); \ + sROUND1(A, B, C, D, F3, 14, in( 9), 3); \ + sROUND1(D, A, B, C, F3, 9, in(15), 3); \ + sROUND1(C, D, A, B, F3, 13, in( 8), 3); \ + sROUND1(B, C, D, A, F3, 15, in( 1), 3); \ + sROUND1(A, B, C, D, F3, 14, in( 2), 3); \ + sROUND1(D, A, B, C, F3, 8, in( 7), 3); \ + sROUND1(C, D, A, B, F3, 13, in( 0), 3); \ + sROUND1(B, C, D, A, F3, 6, in( 6), 3); \ + sROUND1(A, B, C, D, F3, 5, in(13), 3); \ + sROUND1(D, A, B, C, F3, 12, in(11), 3); \ + sROUND1(C, D, A, B, F3, 7, in( 5), 3); \ + sROUND1(B, C, D, A, F3, 5, in(12), 3); \ + \ + sROUND1(A, B, C, D, F4, 11, in( 1), 4); \ + sROUND1(D, A, B, C, F4, 12, in( 9), 4); \ + sROUND1(C, D, A, B, F4, 14, in(11), 4); \ + sROUND1(B, C, D, A, F4, 15, in(10), 4); \ + sROUND1(A, B, C, D, F4, 14, in( 0), 4); \ + sROUND1(D, A, B, C, F4, 15, in( 8), 4); \ + sROUND1(C, D, A, B, F4, 9, in(12), 4); \ + sROUND1(B, C, D, A, F4, 8, in( 4), 4); \ + sROUND1(A, B, C, D, F4, 9, in(13), 4); \ + sROUND1(D, A, B, C, F4, 14, in( 3), 4); \ + sROUND1(C, D, A, B, F4, 5, in( 7), 4); \ + sROUND1(B, C, D, A, F4, 6, in(15), 4); \ + sROUND1(A, B, C, D, F4, 8, in(14), 4); \ + sROUND1(D, A, B, C, F4, 6, in( 5), 4); \ + sROUND1(C, D, A, B, F4, 5, in( 6), 4); \ + sROUND1(B, C, D, A, F4, 12, in( 2), 4); \ + \ + sROUND2(A, B, C, D, F4, 8, in( 5), 1); \ + sROUND2(D, A, B, C, F4, 9, in(14), 1); \ + sROUND2(C, D, A, B, F4, 9, in( 7), 1); \ + sROUND2(B, C, D, A, F4, 11, in( 0), 1); \ + sROUND2(A, B, C, D, F4, 13, in( 9), 1); \ + sROUND2(D, A, B, C, F4, 15, in( 2), 1); \ + sROUND2(C, D, A, B, F4, 15, in(11), 1); \ + sROUND2(B, C, D, A, F4, 5, in( 4), 1); \ + sROUND2(A, B, C, D, F4, 7, in(13), 1); \ + sROUND2(D, A, B, C, F4, 7, in( 6), 1); \ + sROUND2(C, D, A, B, F4, 8, in(15), 1); \ + sROUND2(B, C, D, A, F4, 11, in( 8), 1); \ + sROUND2(A, 
B, C, D, F4, 14, in( 1), 1); \ + sROUND2(D, A, B, C, F4, 14, in(10), 1); \ + sROUND2(C, D, A, B, F4, 12, in( 3), 1); \ + sROUND2(B, C, D, A, F4, 6, in(12), 1); \ + \ + sROUND2(A, B, C, D, F3, 9, in( 6), 2); \ + sROUND2(D, A, B, C, F3, 13, in(11), 2); \ + sROUND2(C, D, A, B, F3, 15, in( 3), 2); \ + sROUND2(B, C, D, A, F3, 7, in( 7), 2); \ + sROUND2(A, B, C, D, F3, 12, in( 0), 2); \ + sROUND2(D, A, B, C, F3, 8, in(13), 2); \ + sROUND2(C, D, A, B, F3, 9, in( 5), 2); \ + sROUND2(B, C, D, A, F3, 11, in(10), 2); \ + sROUND2(A, B, C, D, F3, 7, in(14), 2); \ + sROUND2(D, A, B, C, F3, 7, in(15), 2); \ + sROUND2(C, D, A, B, F3, 12, in( 8), 2); \ + sROUND2(B, C, D, A, F3, 7, in(12), 2); \ + sROUND2(A, B, C, D, F3, 6, in( 4), 2); \ + sROUND2(D, A, B, C, F3, 15, in( 9), 2); \ + sROUND2(C, D, A, B, F3, 13, in( 1), 2); \ + sROUND2(B, C, D, A, F3, 11, in( 2), 2); \ + \ + sROUND2(A, B, C, D, F2, 9, in(15), 3); \ + sROUND2(D, A, B, C, F2, 7, in( 5), 3); \ + sROUND2(C, D, A, B, F2, 15, in( 1), 3); \ + sROUND2(B, C, D, A, F2, 11, in( 3), 3); \ + sROUND2(A, B, C, D, F2, 8, in( 7), 3); \ + sROUND2(D, A, B, C, F2, 6, in(14), 3); \ + sROUND2(C, D, A, B, F2, 6, in( 6), 3); \ + sROUND2(B, C, D, A, F2, 14, in( 9), 3); \ + sROUND2(A, B, C, D, F2, 12, in(11), 3); \ + sROUND2(D, A, B, C, F2, 13, in( 8), 3); \ + sROUND2(C, D, A, B, F2, 5, in(12), 3); \ + sROUND2(B, C, D, A, F2, 14, in( 2), 3); \ + sROUND2(A, B, C, D, F2, 13, in(10), 3); \ + sROUND2(D, A, B, C, F2, 13, in( 0), 3); \ + sROUND2(C, D, A, B, F2, 7, in( 4), 3); \ + sROUND2(B, C, D, A, F2, 5, in(13), 3); \ + \ + sROUND2(A, B, C, D, F1, 15, in( 8), 4); \ + sROUND2(D, A, B, C, F1, 5, in( 6), 4); \ + sROUND2(C, D, A, B, F1, 8, in( 4), 4); \ + sROUND2(B, C, D, A, F1, 11, in( 1), 4); \ + sROUND2(A, B, C, D, F1, 14, in( 3), 4); \ + sROUND2(D, A, B, C, F1, 14, in(11), 4); \ + sROUND2(C, D, A, B, F1, 6, in(15), 4); \ + sROUND2(B, C, D, A, F1, 14, in( 0), 4); \ + sROUND2(A, B, C, D, F1, 6, in( 5), 4); \ + sROUND2(D, A, B, C, F1, 9, in(12), 4); \ + sROUND2(C, D, A, B, F1, 12, in( 2), 4); \ + sROUND2(B, C, D, A, F1, 9, in(13), 4); \ + sROUND2(A, B, C, D, F1, 12, in( 9), 4); \ + sROUND2(D, A, B, C, F1, 5, in( 7), 4); \ + sROUND2(C, D, A, B, F1, 15, in(10), 4); \ + sROUND2(B, C, D, A, F1, 8, in(14), 4); \ + \ + tmp = SPH_T32((h)[1] + C1 + D2); \ + (h)[1] = SPH_T32((h)[2] + D1 + A2); \ + (h)[2] = SPH_T32((h)[3] + A1 + B2); \ + (h)[3] = SPH_T32((h)[0] + B1 + C2); \ + (h)[0] = tmp; \ + } while (0) + +/* + * One round of RIPEMD-128. The data must be aligned for 32-bit access. 
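The ripemd128_round routine that follows feeds this macro with sixteen 32-bit words decoded little-endian from the 64-byte block, either straight from the buffer (when SPH_LITTLE_FAST says in-place reads are cheap) or through the X_var copy. For reference, a portable decode equivalent in effect to sph_dec32le_aligned on such input; this is a sketch, not the sphlib implementation:

#include <stdint.h>

/* Assemble one little-endian 32-bit word from four bytes. */
static uint32_t dec32le(const unsigned char *p)
{
    return (uint32_t)p[0]
         | ((uint32_t)p[1] << 8)
         | ((uint32_t)p[2] << 16)
         | ((uint32_t)p[3] << 24);
}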
+ */ +static void +ripemd128_round(const unsigned char *data, sph_u32 r[5]) +{ +#if SPH_LITTLE_FAST + +#define RIPEMD128_IN(x) sph_dec32le_aligned(data + (4 * (x))) + +#else + + sph_u32 X_var[16]; + int i; + + for (i = 0; i < 16; i ++) + X_var[i] = sph_dec32le_aligned(data + 4 * i); +#define RIPEMD128_IN(x) X_var[x] + +#endif + RIPEMD128_ROUND_BODY(RIPEMD128_IN, r); +#undef RIPEMD128_IN +} + +/* see sph_ripemd.h */ +void +sph_ripemd128_init(void *cc) +{ + sph_ripemd128_context *sc; + + sc = cc; + memcpy(sc->val, IV, sizeof sc->val); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +#define RFUN ripemd128_round +#define HASH ripemd128 +#define LE32 1 +#include "md_helper.c" +#undef RFUN +#undef HASH +#undef LE32 + +/* see sph_ripemd.h */ +void +sph_ripemd128_close(void *cc, void *dst) +{ + ripemd128_close(cc, dst, 4); + sph_ripemd128_init(cc); +} + +/* see sph_ripemd.h */ +void +sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]) +{ +#define RIPEMD128_IN(x) msg[x] + RIPEMD128_ROUND_BODY(RIPEMD128_IN, val); +#undef RIPEMD128_IN +} + +/* ===================================================================== */ +/* + * RIPEMD-160. + */ + +/* + * Round constants for RIPEMD-160. + */ +#define K11 SPH_C32(0x00000000) +#define K12 SPH_C32(0x5A827999) +#define K13 SPH_C32(0x6ED9EBA1) +#define K14 SPH_C32(0x8F1BBCDC) +#define K15 SPH_C32(0xA953FD4E) + +#define K21 SPH_C32(0x50A28BE6) +#define K22 SPH_C32(0x5C4DD124) +#define K23 SPH_C32(0x6D703EF3) +#define K24 SPH_C32(0x7A6D76E9) +#define K25 SPH_C32(0x00000000) + +#define RR(a, b, c, d, e, f, s, r, k) do { \ + a = SPH_T32(ROTL(SPH_T32(a + f(b, c, d) + r + k), s) + e); \ + c = ROTL(c, 10); \ + } while (0) + +#define ROUND1(a, b, c, d, e, f, s, r, k) \ + RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k) + +#define ROUND2(a, b, c, d, e, f, s, r, k) \ + RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) + +/* + * This macro defines the body for a RIPEMD-160 compression function + * implementation. The "in" parameter should evaluate, when applied to a + * numerical input parameter from 0 to 15, to an expression which yields + * the corresponding input block. The "h" parameter should evaluate to + * an array or pointer expression designating the array of 5 words which + * contains the input and output of the compression function. 
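The RIPEMD160_ROUND_BODY macro defined next runs two independent five-word lines over the same sixteen message words and merges them at the end. Each RR step is easier to read written out as a function; the sketch below restates it with hypothetical helper names, with the round function f(b, c, d) evaluated by the caller:

#include <stdint.h>

/* Left rotate, valid for 1 <= n <= 31. */
static uint32_t rotl32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* One RIPEMD-160 step, the same math as the RR macro:
 * a' = rotl(a + f(b,c,d) + msg_word + round_const, s) + e, and c is rotated by 10. */
static void rmd160_step(uint32_t *a, uint32_t *c, uint32_t e,
                        uint32_t f_bcd, unsigned s, uint32_t x, uint32_t k)
{
    *a = rotl32(*a + f_bcd + x + k, s) + e;
    *c = rotl32(*c, 10);
}

Compared with RIPEMD-128's sRR above, the step adds the fifth chaining word e and the extra rotate of c by 10.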
+ */ + +#define RIPEMD160_ROUND_BODY(in, h) do { \ + sph_u32 A1, B1, C1, D1, E1; \ + sph_u32 A2, B2, C2, D2, E2; \ + sph_u32 tmp; \ + \ + A1 = A2 = (h)[0]; \ + B1 = B2 = (h)[1]; \ + C1 = C2 = (h)[2]; \ + D1 = D2 = (h)[3]; \ + E1 = E2 = (h)[4]; \ + \ + ROUND1(A, B, C, D, E, F1, 11, in( 0), 1); \ + ROUND1(E, A, B, C, D, F1, 14, in( 1), 1); \ + ROUND1(D, E, A, B, C, F1, 15, in( 2), 1); \ + ROUND1(C, D, E, A, B, F1, 12, in( 3), 1); \ + ROUND1(B, C, D, E, A, F1, 5, in( 4), 1); \ + ROUND1(A, B, C, D, E, F1, 8, in( 5), 1); \ + ROUND1(E, A, B, C, D, F1, 7, in( 6), 1); \ + ROUND1(D, E, A, B, C, F1, 9, in( 7), 1); \ + ROUND1(C, D, E, A, B, F1, 11, in( 8), 1); \ + ROUND1(B, C, D, E, A, F1, 13, in( 9), 1); \ + ROUND1(A, B, C, D, E, F1, 14, in(10), 1); \ + ROUND1(E, A, B, C, D, F1, 15, in(11), 1); \ + ROUND1(D, E, A, B, C, F1, 6, in(12), 1); \ + ROUND1(C, D, E, A, B, F1, 7, in(13), 1); \ + ROUND1(B, C, D, E, A, F1, 9, in(14), 1); \ + ROUND1(A, B, C, D, E, F1, 8, in(15), 1); \ + \ + ROUND1(E, A, B, C, D, F2, 7, in( 7), 2); \ + ROUND1(D, E, A, B, C, F2, 6, in( 4), 2); \ + ROUND1(C, D, E, A, B, F2, 8, in(13), 2); \ + ROUND1(B, C, D, E, A, F2, 13, in( 1), 2); \ + ROUND1(A, B, C, D, E, F2, 11, in(10), 2); \ + ROUND1(E, A, B, C, D, F2, 9, in( 6), 2); \ + ROUND1(D, E, A, B, C, F2, 7, in(15), 2); \ + ROUND1(C, D, E, A, B, F2, 15, in( 3), 2); \ + ROUND1(B, C, D, E, A, F2, 7, in(12), 2); \ + ROUND1(A, B, C, D, E, F2, 12, in( 0), 2); \ + ROUND1(E, A, B, C, D, F2, 15, in( 9), 2); \ + ROUND1(D, E, A, B, C, F2, 9, in( 5), 2); \ + ROUND1(C, D, E, A, B, F2, 11, in( 2), 2); \ + ROUND1(B, C, D, E, A, F2, 7, in(14), 2); \ + ROUND1(A, B, C, D, E, F2, 13, in(11), 2); \ + ROUND1(E, A, B, C, D, F2, 12, in( 8), 2); \ + \ + ROUND1(D, E, A, B, C, F3, 11, in( 3), 3); \ + ROUND1(C, D, E, A, B, F3, 13, in(10), 3); \ + ROUND1(B, C, D, E, A, F3, 6, in(14), 3); \ + ROUND1(A, B, C, D, E, F3, 7, in( 4), 3); \ + ROUND1(E, A, B, C, D, F3, 14, in( 9), 3); \ + ROUND1(D, E, A, B, C, F3, 9, in(15), 3); \ + ROUND1(C, D, E, A, B, F3, 13, in( 8), 3); \ + ROUND1(B, C, D, E, A, F3, 15, in( 1), 3); \ + ROUND1(A, B, C, D, E, F3, 14, in( 2), 3); \ + ROUND1(E, A, B, C, D, F3, 8, in( 7), 3); \ + ROUND1(D, E, A, B, C, F3, 13, in( 0), 3); \ + ROUND1(C, D, E, A, B, F3, 6, in( 6), 3); \ + ROUND1(B, C, D, E, A, F3, 5, in(13), 3); \ + ROUND1(A, B, C, D, E, F3, 12, in(11), 3); \ + ROUND1(E, A, B, C, D, F3, 7, in( 5), 3); \ + ROUND1(D, E, A, B, C, F3, 5, in(12), 3); \ + \ + ROUND1(C, D, E, A, B, F4, 11, in( 1), 4); \ + ROUND1(B, C, D, E, A, F4, 12, in( 9), 4); \ + ROUND1(A, B, C, D, E, F4, 14, in(11), 4); \ + ROUND1(E, A, B, C, D, F4, 15, in(10), 4); \ + ROUND1(D, E, A, B, C, F4, 14, in( 0), 4); \ + ROUND1(C, D, E, A, B, F4, 15, in( 8), 4); \ + ROUND1(B, C, D, E, A, F4, 9, in(12), 4); \ + ROUND1(A, B, C, D, E, F4, 8, in( 4), 4); \ + ROUND1(E, A, B, C, D, F4, 9, in(13), 4); \ + ROUND1(D, E, A, B, C, F4, 14, in( 3), 4); \ + ROUND1(C, D, E, A, B, F4, 5, in( 7), 4); \ + ROUND1(B, C, D, E, A, F4, 6, in(15), 4); \ + ROUND1(A, B, C, D, E, F4, 8, in(14), 4); \ + ROUND1(E, A, B, C, D, F4, 6, in( 5), 4); \ + ROUND1(D, E, A, B, C, F4, 5, in( 6), 4); \ + ROUND1(C, D, E, A, B, F4, 12, in( 2), 4); \ + \ + ROUND1(B, C, D, E, A, F5, 9, in( 4), 5); \ + ROUND1(A, B, C, D, E, F5, 15, in( 0), 5); \ + ROUND1(E, A, B, C, D, F5, 5, in( 5), 5); \ + ROUND1(D, E, A, B, C, F5, 11, in( 9), 5); \ + ROUND1(C, D, E, A, B, F5, 6, in( 7), 5); \ + ROUND1(B, C, D, E, A, F5, 8, in(12), 5); \ + ROUND1(A, B, C, D, E, F5, 13, in( 2), 5); \ + ROUND1(E, A, B, C, D, F5, 12, in(10), 5); \ + ROUND1(D, 
E, A, B, C, F5, 5, in(14), 5); \ + ROUND1(C, D, E, A, B, F5, 12, in( 1), 5); \ + ROUND1(B, C, D, E, A, F5, 13, in( 3), 5); \ + ROUND1(A, B, C, D, E, F5, 14, in( 8), 5); \ + ROUND1(E, A, B, C, D, F5, 11, in(11), 5); \ + ROUND1(D, E, A, B, C, F5, 8, in( 6), 5); \ + ROUND1(C, D, E, A, B, F5, 5, in(15), 5); \ + ROUND1(B, C, D, E, A, F5, 6, in(13), 5); \ + \ + ROUND2(A, B, C, D, E, F5, 8, in( 5), 1); \ + ROUND2(E, A, B, C, D, F5, 9, in(14), 1); \ + ROUND2(D, E, A, B, C, F5, 9, in( 7), 1); \ + ROUND2(C, D, E, A, B, F5, 11, in( 0), 1); \ + ROUND2(B, C, D, E, A, F5, 13, in( 9), 1); \ + ROUND2(A, B, C, D, E, F5, 15, in( 2), 1); \ + ROUND2(E, A, B, C, D, F5, 15, in(11), 1); \ + ROUND2(D, E, A, B, C, F5, 5, in( 4), 1); \ + ROUND2(C, D, E, A, B, F5, 7, in(13), 1); \ + ROUND2(B, C, D, E, A, F5, 7, in( 6), 1); \ + ROUND2(A, B, C, D, E, F5, 8, in(15), 1); \ + ROUND2(E, A, B, C, D, F5, 11, in( 8), 1); \ + ROUND2(D, E, A, B, C, F5, 14, in( 1), 1); \ + ROUND2(C, D, E, A, B, F5, 14, in(10), 1); \ + ROUND2(B, C, D, E, A, F5, 12, in( 3), 1); \ + ROUND2(A, B, C, D, E, F5, 6, in(12), 1); \ + \ + ROUND2(E, A, B, C, D, F4, 9, in( 6), 2); \ + ROUND2(D, E, A, B, C, F4, 13, in(11), 2); \ + ROUND2(C, D, E, A, B, F4, 15, in( 3), 2); \ + ROUND2(B, C, D, E, A, F4, 7, in( 7), 2); \ + ROUND2(A, B, C, D, E, F4, 12, in( 0), 2); \ + ROUND2(E, A, B, C, D, F4, 8, in(13), 2); \ + ROUND2(D, E, A, B, C, F4, 9, in( 5), 2); \ + ROUND2(C, D, E, A, B, F4, 11, in(10), 2); \ + ROUND2(B, C, D, E, A, F4, 7, in(14), 2); \ + ROUND2(A, B, C, D, E, F4, 7, in(15), 2); \ + ROUND2(E, A, B, C, D, F4, 12, in( 8), 2); \ + ROUND2(D, E, A, B, C, F4, 7, in(12), 2); \ + ROUND2(C, D, E, A, B, F4, 6, in( 4), 2); \ + ROUND2(B, C, D, E, A, F4, 15, in( 9), 2); \ + ROUND2(A, B, C, D, E, F4, 13, in( 1), 2); \ + ROUND2(E, A, B, C, D, F4, 11, in( 2), 2); \ + \ + ROUND2(D, E, A, B, C, F3, 9, in(15), 3); \ + ROUND2(C, D, E, A, B, F3, 7, in( 5), 3); \ + ROUND2(B, C, D, E, A, F3, 15, in( 1), 3); \ + ROUND2(A, B, C, D, E, F3, 11, in( 3), 3); \ + ROUND2(E, A, B, C, D, F3, 8, in( 7), 3); \ + ROUND2(D, E, A, B, C, F3, 6, in(14), 3); \ + ROUND2(C, D, E, A, B, F3, 6, in( 6), 3); \ + ROUND2(B, C, D, E, A, F3, 14, in( 9), 3); \ + ROUND2(A, B, C, D, E, F3, 12, in(11), 3); \ + ROUND2(E, A, B, C, D, F3, 13, in( 8), 3); \ + ROUND2(D, E, A, B, C, F3, 5, in(12), 3); \ + ROUND2(C, D, E, A, B, F3, 14, in( 2), 3); \ + ROUND2(B, C, D, E, A, F3, 13, in(10), 3); \ + ROUND2(A, B, C, D, E, F3, 13, in( 0), 3); \ + ROUND2(E, A, B, C, D, F3, 7, in( 4), 3); \ + ROUND2(D, E, A, B, C, F3, 5, in(13), 3); \ + \ + ROUND2(C, D, E, A, B, F2, 15, in( 8), 4); \ + ROUND2(B, C, D, E, A, F2, 5, in( 6), 4); \ + ROUND2(A, B, C, D, E, F2, 8, in( 4), 4); \ + ROUND2(E, A, B, C, D, F2, 11, in( 1), 4); \ + ROUND2(D, E, A, B, C, F2, 14, in( 3), 4); \ + ROUND2(C, D, E, A, B, F2, 14, in(11), 4); \ + ROUND2(B, C, D, E, A, F2, 6, in(15), 4); \ + ROUND2(A, B, C, D, E, F2, 14, in( 0), 4); \ + ROUND2(E, A, B, C, D, F2, 6, in( 5), 4); \ + ROUND2(D, E, A, B, C, F2, 9, in(12), 4); \ + ROUND2(C, D, E, A, B, F2, 12, in( 2), 4); \ + ROUND2(B, C, D, E, A, F2, 9, in(13), 4); \ + ROUND2(A, B, C, D, E, F2, 12, in( 9), 4); \ + ROUND2(E, A, B, C, D, F2, 5, in( 7), 4); \ + ROUND2(D, E, A, B, C, F2, 15, in(10), 4); \ + ROUND2(C, D, E, A, B, F2, 8, in(14), 4); \ + \ + ROUND2(B, C, D, E, A, F1, 8, in(12), 5); \ + ROUND2(A, B, C, D, E, F1, 5, in(15), 5); \ + ROUND2(E, A, B, C, D, F1, 12, in(10), 5); \ + ROUND2(D, E, A, B, C, F1, 9, in( 4), 5); \ + ROUND2(C, D, E, A, B, F1, 12, in( 1), 5); \ + ROUND2(B, C, D, E, A, F1, 5, in( 5), 5); 
\ + ROUND2(A, B, C, D, E, F1, 14, in( 8), 5); \ + ROUND2(E, A, B, C, D, F1, 6, in( 7), 5); \ + ROUND2(D, E, A, B, C, F1, 8, in( 6), 5); \ + ROUND2(C, D, E, A, B, F1, 13, in( 2), 5); \ + ROUND2(B, C, D, E, A, F1, 6, in(13), 5); \ + ROUND2(A, B, C, D, E, F1, 5, in(14), 5); \ + ROUND2(E, A, B, C, D, F1, 15, in( 0), 5); \ + ROUND2(D, E, A, B, C, F1, 13, in( 3), 5); \ + ROUND2(C, D, E, A, B, F1, 11, in( 9), 5); \ + ROUND2(B, C, D, E, A, F1, 11, in(11), 5); \ + \ + tmp = SPH_T32((h)[1] + C1 + D2); \ + (h)[1] = SPH_T32((h)[2] + D1 + E2); \ + (h)[2] = SPH_T32((h)[3] + E1 + A2); \ + (h)[3] = SPH_T32((h)[4] + A1 + B2); \ + (h)[4] = SPH_T32((h)[0] + B1 + C2); \ + (h)[0] = tmp; \ + } while (0) + +/* + * One round of RIPEMD-160. The data must be aligned for 32-bit access. + */ +static void +ripemd160_round(const unsigned char *data, sph_u32 r[5]) +{ +#if SPH_LITTLE_FAST + +#define RIPEMD160_IN(x) sph_dec32le_aligned(data + (4 * (x))) + +#else + + sph_u32 X_var[16]; + int i; + + for (i = 0; i < 16; i ++) + X_var[i] = sph_dec32le_aligned(data + 4 * i); +#define RIPEMD160_IN(x) X_var[x] + +#endif + RIPEMD160_ROUND_BODY(RIPEMD160_IN, r); +#undef RIPEMD160_IN +} + +/* see sph_ripemd.h */ +void +sph_ripemd160_init(void *cc) +{ + sph_ripemd160_context *sc; + + sc = cc; + memcpy(sc->val, IV, sizeof sc->val); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +#define RFUN ripemd160_round +#define HASH ripemd160 +#define LE32 1 +#include "md_helper.c" +#undef RFUN +#undef HASH +#undef LE32 + +/* see sph_ripemd.h */ +void +sph_ripemd160_close(void *cc, void *dst) +{ + ripemd160_close(cc, dst, 5); + sph_ripemd160_init(cc); +} + +/* see sph_ripemd.h */ +void +sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5]) +{ +#define RIPEMD160_IN(x) msg[x] + RIPEMD160_ROUND_BODY(RIPEMD160_IN, val); +#undef RIPEMD160_IN +} diff --git a/sha2.c b/sph/sha2.c similarity index 96% rename from sha2.c rename to sph/sha2.c index 8c5df1e742..354fcaa11f 100644 --- a/sha2.c +++ b/sph/sha2.c @@ -8,7 +8,6 @@ * any later version. See COPYING for more details. 
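With sph/ripemd.c in place, callers get the usual sphlib init/update/close trio for RIPEMD-160. A usage sketch, assuming the matching sph_ripemd.h header is available on the include path:

#include <stdio.h>
#include <string.h>
#include "sph_ripemd.h"

int main(void)
{
    unsigned char digest[20];               /* RIPEMD-160 output is 160 bits */
    const char *msg = "abc";
    sph_ripemd160_context ctx;

    sph_ripemd160_init(&ctx);
    sph_ripemd160(&ctx, msg, strlen(msg));  /* streaming update, may be called repeatedly */
    sph_ripemd160_close(&ctx, digest);      /* pad, emit 20 bytes, re-init the context */

    for (int i = 0; i < 20; i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}

For the input "abc" this should print the familiar RIPEMD-160 vector 8eb208f7e05d987a9b044a8e98c6b087f15a0bfc; note that the _close wrapper above also re-initializes the context, so the same context can be reused immediately.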
*/ -#include "cpuminer-config.h" #include "miner.h" #include @@ -468,8 +467,8 @@ static inline void sha256d_ms(uint32_t *hash, uint32_t *W, void sha256d_ms_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate, const uint32_t *prehash); -static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) +static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, + const uint32_t *ptarget uint32_t max_nonce, unsigned long *hashes_done) { uint32_t data[4 * 64] __attribute__((aligned(128))); uint32_t hash[4 * 8] __attribute__((aligned(32))); @@ -508,6 +507,7 @@ static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, pdata[19] = data[4 * 3 + i]; sha256d_80_swap(hash, pdata); if (fulltest(hash, ptarget)) { + work_set_target_ratio(work, hash); *hashes_done = n - first_nonce + 1; return 1; } @@ -581,13 +581,15 @@ static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, #endif /* HAVE_SHA256_8WAY */ -int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) +#if 0 +int scanhash_sha256d(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { - uint32_t data[64] /* __attribute__((aligned(128))) */; - uint32_t hash[8] /* __attribute__((aligned(32))) */; - uint32_t midstate[8] /* __attribute__((aligned(32))) */; - uint32_t prehash[8] /* __attribute__((aligned(32))) */; + uint32_t _ALIGN(128) data[64]; + uint32_t hash[8]; + uint32_t midstate[8]; + uint32_t prehash[8]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t n = pdata[19] - 1; const uint32_t first_nonce = pdata[19]; const uint32_t Htarg = ptarget[7]; @@ -628,3 +630,5 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, pdata[19] = n; return 0; } + +#endif diff --git a/sph/sha2big.c b/sph/sha2big.c new file mode 100644 index 0000000000..bd3a895c89 --- /dev/null +++ b/sph/sha2big.c @@ -0,0 +1,256 @@ +/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */ +/* + * SHA-384 / SHA-512 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_sha2.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_64 + +#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) +#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) + +#define ROTR64 SPH_ROTR64 + +#define BSG5_0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)) +#define BSG5_1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41)) +#define SSG5_0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7)) +#define SSG5_1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6)) + +static const sph_u64 K512[80] = { + SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), + SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), + SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), + SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118), + SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE), + SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2), + SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1), + SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694), + SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3), + SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65), + SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483), + SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5), + SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210), + SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4), + SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725), + SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70), + SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926), + SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF), + SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8), + SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B), + SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001), + SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30), + SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910), + SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8), + SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53), + SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8), + SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB), + SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3), + SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60), + SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC), + SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9), + SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B), + SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207), + SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178), + SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6), + SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B), + SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493), + SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C), + SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A), + SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) +}; + +static const sph_u64 H384[8] = { + SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507), + SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939), + SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511), + SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4) +}; + +static const sph_u64 H512[8] = { + SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), + SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), + SPH_C64(0x510E527FADE682D1), 
SPH_C64(0x9B05688C2B3E6C1F), + SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) +}; + +/* + * This macro defines the body for a SHA-384 / SHA-512 compression function + * implementation. The "in" parameter should evaluate, when applied to a + * numerical input parameter from 0 to 15, to an expression which yields + * the corresponding input block. The "r" parameter should evaluate to + * an array or pointer expression designating the array of 8 words which + * contains the input and output of the compression function. + * + * SHA-512 is hard for the compiler. If the loop is completely unrolled, + * then the code will be quite huge (possibly more than 100 kB), and the + * performance will be degraded due to cache misses on the code. We + * unroll only eight steps, which avoids all needless copies when + * 64-bit registers are swapped. + */ + +#define SHA3_STEP(A, B, C, D, E, F, G, H, i) do { \ + sph_u64 T1, T2; \ + T1 = SPH_T64(H + BSG5_1(E) + CH(E, F, G) + K512[i] + W[i]); \ + T2 = SPH_T64(BSG5_0(A) + MAJ(A, B, C)); \ + D = SPH_T64(D + T1); \ + H = SPH_T64(T1 + T2); \ + } while (0) + +#define SHA3_ROUND_BODY(in, r) do { \ + int i; \ + sph_u64 A, B, C, D, E, F, G, H; \ + sph_u64 W[80]; \ + \ + for (i = 0; i < 16; i ++) \ + W[i] = in(i); \ + \ + for (i = 16; i < 80; i ++) \ + W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7] \ + + SSG5_0(W[i - 15]) + W[i - 16]); \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + D = (r)[3]; \ + E = (r)[4]; \ + F = (r)[5]; \ + G = (r)[6]; \ + H = (r)[7]; \ + for (i = 0; i < 80; i += 8) { \ + SHA3_STEP(A, B, C, D, E, F, G, H, i + 0); \ + SHA3_STEP(H, A, B, C, D, E, F, G, i + 1); \ + SHA3_STEP(G, H, A, B, C, D, E, F, i + 2); \ + SHA3_STEP(F, G, H, A, B, C, D, E, i + 3); \ + SHA3_STEP(E, F, G, H, A, B, C, D, i + 4); \ + SHA3_STEP(D, E, F, G, H, A, B, C, i + 5); \ + SHA3_STEP(C, D, E, F, G, H, A, B, i + 6); \ + SHA3_STEP(B, C, D, E, F, G, H, A, i + 7); \ + } \ + (r)[0] = SPH_T64((r)[0] + A); \ + (r)[1] = SPH_T64((r)[1] + B); \ + (r)[2] = SPH_T64((r)[2] + C); \ + (r)[3] = SPH_T64((r)[3] + D); \ + (r)[4] = SPH_T64((r)[4] + E); \ + (r)[5] = SPH_T64((r)[5] + F); \ + (r)[6] = SPH_T64((r)[6] + G); \ + (r)[7] = SPH_T64((r)[7] + H); \ + } while (0) + +/* + * One round of SHA-384 / SHA-512. The data must be aligned for 64-bit access. 
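As the comment above explains, SHA3_ROUND_BODY unrolls eight steps and rotates the argument names passed to SHA3_STEP so the eight 64-bit working variables never have to be shuffled between registers. For comparison, the conventional formulation shifts the variables each iteration; the sketch below restates the same round logic as a plain loop, with the caller supplying the 80-entry schedule W and the round-constant table (K512 above):

#include <stdint.h>

#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#define CH(x, y, z)  ((((y) ^ (z)) & (x)) ^ (z))
#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z)))
#define BSG0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
#define BSG1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))

/* Conventional (non-unrolled) SHA-512 round loop: same math as SHA3_ROUND_BODY,
 * but the eight working variables are shifted down each step instead of having
 * their names rotated. W must already hold the expanded message schedule. */
static void sha512_rounds(uint64_t r[8], const uint64_t W[80], const uint64_t K[80])
{
    uint64_t a = r[0], b = r[1], c = r[2], d = r[3];
    uint64_t e = r[4], f = r[5], g = r[6], h = r[7];

    for (int i = 0; i < 80; i++) {
        uint64_t T1 = h + BSG1(e) + CH(e, f, g) + K[i] + W[i];
        uint64_t T2 = BSG0(a) + MAJ(a, b, c);
        h = g; g = f; f = e; e = d + T1;
        d = c; c = b; b = a; a = T1 + T2;
    }
    r[0] += a; r[1] += b; r[2] += c; r[3] += d;
    r[4] += e; r[5] += f; r[6] += g; r[7] += h;
}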
+ */ +static void +sha3_round(const unsigned char *data, sph_u64 r[8]) +{ +#define SHA3_IN(x) sph_dec64be_aligned(data + (8 * (x))) + SHA3_ROUND_BODY(SHA3_IN, r); +#undef SHA3_IN +} + +/* see sph_sha3.h */ +void +sph_sha384_init(void *cc) +{ + sph_sha384_context *sc; + + sc = cc; + memcpy(sc->val, H384, sizeof H384); + sc->count = 0; +} + +/* see sph_sha3.h */ +void +sph_sha512_init(void *cc) +{ + sph_sha512_context *sc; + + sc = cc; + memcpy(sc->val, H512, sizeof H512); + sc->count = 0; +} + +#define RFUN sha3_round +#define HASH sha384 +#define BE64 1 +#include "md_helper.c" + +/* see sph_sha3.h */ +void +sph_sha384_close(void *cc, void *dst) +{ + sha384_close(cc, dst, 6); + sph_sha384_init(cc); +} + +/* see sph_sha3.h */ +void +sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + sha384_addbits_and_close(cc, ub, n, dst, 6); + sph_sha384_init(cc); +} + +/* see sph_sha3.h */ +void +sph_sha512_close(void *cc, void *dst) +{ + sha384_close(cc, dst, 8); + sph_sha512_init(cc); +} + +/* see sph_sha3.h */ +void +sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + sha384_addbits_and_close(cc, ub, n, dst, 8); + sph_sha512_init(cc); +} + +/* see sph_sha3.h */ +void +sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8]) +{ +#define SHA3_IN(x) msg[x] + SHA3_ROUND_BODY(SHA3_IN, val); +#undef SHA3_IN +} + +#endif +#ifdef __cplusplus +} +#endif + diff --git a/sph/shabal.c b/sph/shabal.c new file mode 100644 index 0000000000..06d368ce54 --- /dev/null +++ b/sph/shabal.c @@ -0,0 +1,810 @@ +/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ +/* + * Shabal implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_shabal.h" +#ifdef __cplusplus +extern "C"{ +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +/* + * Part of this code was automatically generated (the part between + * the "BEGIN" and "END" markers). + */ + +#define sM 16 + +#define C32 SPH_C32 +#define T32 SPH_T32 + +#define O1 13 +#define O2 9 +#define O3 6 + +/* + * We copy the state into local variables, so that the compiler knows + * that it can optimize them at will. + */ + +/* BEGIN -- automatically generated code. 
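Before the Shabal state macros, a usage sketch for the SHA-384/512 code added in sha2big.c above. It assumes sph_sha2.h declares the standard sphlib trio; the streaming update sph_sha384 is the one generated here by md_helper.c, and SHA-512 differs only in its own _init/_close pair and output size:

#include <stdio.h>
#include <string.h>
#include "sph_sha2.h"

int main(void)
{
    unsigned char digest[48];               /* SHA-384 output is 384 bits */
    const char *msg = "abc";
    sph_sha384_context ctx;

    sph_sha384_init(&ctx);
    sph_sha384(&ctx, msg, strlen(msg));     /* update routine generated by md_helper.c */
    sph_sha384_close(&ctx, digest);         /* rnum = 6, i.e. 6 * 8 = 48 output bytes */

    for (int i = 0; i < 48; i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}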
*/ + +#define DECL_STATE \ + sph_u32 A00, A01, A02, A03, A04, A05, A06, A07, \ + A08, A09, A0A, A0B; \ + sph_u32 B0, B1, B2, B3, B4, B5, B6, B7, \ + B8, B9, BA, BB, BC, BD, BE, BF; \ + sph_u32 C0, C1, C2, C3, C4, C5, C6, C7, \ + C8, C9, CA, CB, CC, CD, CE, CF; \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, \ + M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 Wlow, Whigh; + +#define READ_STATE(state) do { \ + A00 = (state)->A[0]; \ + A01 = (state)->A[1]; \ + A02 = (state)->A[2]; \ + A03 = (state)->A[3]; \ + A04 = (state)->A[4]; \ + A05 = (state)->A[5]; \ + A06 = (state)->A[6]; \ + A07 = (state)->A[7]; \ + A08 = (state)->A[8]; \ + A09 = (state)->A[9]; \ + A0A = (state)->A[10]; \ + A0B = (state)->A[11]; \ + B0 = (state)->B[0]; \ + B1 = (state)->B[1]; \ + B2 = (state)->B[2]; \ + B3 = (state)->B[3]; \ + B4 = (state)->B[4]; \ + B5 = (state)->B[5]; \ + B6 = (state)->B[6]; \ + B7 = (state)->B[7]; \ + B8 = (state)->B[8]; \ + B9 = (state)->B[9]; \ + BA = (state)->B[10]; \ + BB = (state)->B[11]; \ + BC = (state)->B[12]; \ + BD = (state)->B[13]; \ + BE = (state)->B[14]; \ + BF = (state)->B[15]; \ + C0 = (state)->C[0]; \ + C1 = (state)->C[1]; \ + C2 = (state)->C[2]; \ + C3 = (state)->C[3]; \ + C4 = (state)->C[4]; \ + C5 = (state)->C[5]; \ + C6 = (state)->C[6]; \ + C7 = (state)->C[7]; \ + C8 = (state)->C[8]; \ + C9 = (state)->C[9]; \ + CA = (state)->C[10]; \ + CB = (state)->C[11]; \ + CC = (state)->C[12]; \ + CD = (state)->C[13]; \ + CE = (state)->C[14]; \ + CF = (state)->C[15]; \ + Wlow = (state)->Wlow; \ + Whigh = (state)->Whigh; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->A[0] = A00; \ + (state)->A[1] = A01; \ + (state)->A[2] = A02; \ + (state)->A[3] = A03; \ + (state)->A[4] = A04; \ + (state)->A[5] = A05; \ + (state)->A[6] = A06; \ + (state)->A[7] = A07; \ + (state)->A[8] = A08; \ + (state)->A[9] = A09; \ + (state)->A[10] = A0A; \ + (state)->A[11] = A0B; \ + (state)->B[0] = B0; \ + (state)->B[1] = B1; \ + (state)->B[2] = B2; \ + (state)->B[3] = B3; \ + (state)->B[4] = B4; \ + (state)->B[5] = B5; \ + (state)->B[6] = B6; \ + (state)->B[7] = B7; \ + (state)->B[8] = B8; \ + (state)->B[9] = B9; \ + (state)->B[10] = BA; \ + (state)->B[11] = BB; \ + (state)->B[12] = BC; \ + (state)->B[13] = BD; \ + (state)->B[14] = BE; \ + (state)->B[15] = BF; \ + (state)->C[0] = C0; \ + (state)->C[1] = C1; \ + (state)->C[2] = C2; \ + (state)->C[3] = C3; \ + (state)->C[4] = C4; \ + (state)->C[5] = C5; \ + (state)->C[6] = C6; \ + (state)->C[7] = C7; \ + (state)->C[8] = C8; \ + (state)->C[9] = C9; \ + (state)->C[10] = CA; \ + (state)->C[11] = CB; \ + (state)->C[12] = CC; \ + (state)->C[13] = CD; \ + (state)->C[14] = CE; \ + (state)->C[15] = CF; \ + (state)->Wlow = Wlow; \ + (state)->Whigh = Whigh; \ + } while (0) + +#define DECODE_BLOCK do { \ + M0 = sph_dec32le_aligned(buf + 0); \ + M1 = sph_dec32le_aligned(buf + 4); \ + M2 = sph_dec32le_aligned(buf + 8); \ + M3 = sph_dec32le_aligned(buf + 12); \ + M4 = sph_dec32le_aligned(buf + 16); \ + M5 = sph_dec32le_aligned(buf + 20); \ + M6 = sph_dec32le_aligned(buf + 24); \ + M7 = sph_dec32le_aligned(buf + 28); \ + M8 = sph_dec32le_aligned(buf + 32); \ + M9 = sph_dec32le_aligned(buf + 36); \ + MA = sph_dec32le_aligned(buf + 40); \ + MB = sph_dec32le_aligned(buf + 44); \ + MC = sph_dec32le_aligned(buf + 48); \ + MD = sph_dec32le_aligned(buf + 52); \ + ME = sph_dec32le_aligned(buf + 56); \ + MF = sph_dec32le_aligned(buf + 60); \ + } while (0) + +#define INPUT_BLOCK_ADD do { \ + B0 = T32(B0 + M0); \ + B1 = T32(B1 + M1); \ + B2 = T32(B2 + M2); \ + B3 = T32(B3 + M3); \ + B4 = 
T32(B4 + M4); \ + B5 = T32(B5 + M5); \ + B6 = T32(B6 + M6); \ + B7 = T32(B7 + M7); \ + B8 = T32(B8 + M8); \ + B9 = T32(B9 + M9); \ + BA = T32(BA + MA); \ + BB = T32(BB + MB); \ + BC = T32(BC + MC); \ + BD = T32(BD + MD); \ + BE = T32(BE + ME); \ + BF = T32(BF + MF); \ + } while (0) + +#define INPUT_BLOCK_SUB do { \ + C0 = T32(C0 - M0); \ + C1 = T32(C1 - M1); \ + C2 = T32(C2 - M2); \ + C3 = T32(C3 - M3); \ + C4 = T32(C4 - M4); \ + C5 = T32(C5 - M5); \ + C6 = T32(C6 - M6); \ + C7 = T32(C7 - M7); \ + C8 = T32(C8 - M8); \ + C9 = T32(C9 - M9); \ + CA = T32(CA - MA); \ + CB = T32(CB - MB); \ + CC = T32(CC - MC); \ + CD = T32(CD - MD); \ + CE = T32(CE - ME); \ + CF = T32(CF - MF); \ + } while (0) + +#define XOR_W do { \ + A00 ^= Wlow; \ + A01 ^= Whigh; \ + } while (0) + +#define SWAP(v1, v2) do { \ + sph_u32 tmp = (v1); \ + (v1) = (v2); \ + (v2) = tmp; \ + } while (0) + +#define SWAP_BC do { \ + SWAP(B0, C0); \ + SWAP(B1, C1); \ + SWAP(B2, C2); \ + SWAP(B3, C3); \ + SWAP(B4, C4); \ + SWAP(B5, C5); \ + SWAP(B6, C6); \ + SWAP(B7, C7); \ + SWAP(B8, C8); \ + SWAP(B9, C9); \ + SWAP(BA, CA); \ + SWAP(BB, CB); \ + SWAP(BC, CC); \ + SWAP(BD, CD); \ + SWAP(BE, CE); \ + SWAP(BF, CF); \ + } while (0) + +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ + xa0 = T32((xa0 \ + ^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \ + ^ xc) * 3U) \ + ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ + xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \ + } while (0) + +#define PERM_STEP_0 do { \ + PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_1 do { \ + PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_2 do { \ + PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A05, A04, B9, B6, 
B2, BF, CF, M9); \ + PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define APPLY_P do { \ + B0 = T32(B0 << 17) | (B0 >> 15); \ + B1 = T32(B1 << 17) | (B1 >> 15); \ + B2 = T32(B2 << 17) | (B2 >> 15); \ + B3 = T32(B3 << 17) | (B3 >> 15); \ + B4 = T32(B4 << 17) | (B4 >> 15); \ + B5 = T32(B5 << 17) | (B5 >> 15); \ + B6 = T32(B6 << 17) | (B6 >> 15); \ + B7 = T32(B7 << 17) | (B7 >> 15); \ + B8 = T32(B8 << 17) | (B8 >> 15); \ + B9 = T32(B9 << 17) | (B9 >> 15); \ + BA = T32(BA << 17) | (BA >> 15); \ + BB = T32(BB << 17) | (BB >> 15); \ + BC = T32(BC << 17) | (BC >> 15); \ + BD = T32(BD << 17) | (BD >> 15); \ + BE = T32(BE << 17) | (BE >> 15); \ + BF = T32(BF << 17) | (BF >> 15); \ + PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + A0B = T32(A0B + C6); \ + A0A = T32(A0A + C5); \ + A09 = T32(A09 + C4); \ + A08 = T32(A08 + C3); \ + A07 = T32(A07 + C2); \ + A06 = T32(A06 + C1); \ + A05 = T32(A05 + C0); \ + A04 = T32(A04 + CF); \ + A03 = T32(A03 + CE); \ + A02 = T32(A02 + CD); \ + A01 = T32(A01 + CC); \ + A00 = T32(A00 + CB); \ + A0B = T32(A0B + CA); \ + A0A = T32(A0A + C9); \ + A09 = T32(A09 + C8); \ + A08 = T32(A08 + C7); \ + A07 = T32(A07 + C6); \ + A06 = T32(A06 + C5); \ + A05 = T32(A05 + C4); \ + A04 = T32(A04 + C3); \ + A03 = T32(A03 + C2); \ + A02 = T32(A02 + C1); \ + A01 = T32(A01 + C0); \ + A00 = T32(A00 + CF); \ + A0B = T32(A0B + CE); \ + A0A = T32(A0A + CD); \ + A09 = T32(A09 + CC); \ + A08 = T32(A08 + CB); \ + A07 = T32(A07 + CA); \ + A06 = T32(A06 + C9); \ + A05 = T32(A05 + C8); \ + A04 = T32(A04 + C7); \ + A03 = T32(A03 + C6); \ + A02 = T32(A02 + C5); \ + A01 = T32(A01 + C4); \ + A00 = T32(A00 + C3); \ + } while (0) + +#define INCR_W do { \ + if ((Wlow = T32(Wlow + 1)) == 0) \ + Whigh = T32(Whigh + 1); \ + } while (0) +#if 0 +static const sph_u32 A_init_192[] = { + C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E), + C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465), + C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9) +}; + +static const sph_u32 B_init_192[] = { + C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824), + C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7), + C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319), + C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C) +}; + +static const sph_u32 C_init_192[] = { + C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B), + C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640), + C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3), + C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669) +}; + +static const sph_u32 A_init_224[] = { + C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B), + C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F), + C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061) +}; + +static const sph_u32 B_init_224[] = { + C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498), + C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5), + C32(0xFEEE44B6), C32(0x8A1E9573), C32(0x7B81111A), C32(0xCBC139F0), + C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C) +}; + +static const sph_u32 C_init_224[] = { + C32(0xE4B573A1), 
C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD), + C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18), + C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2), + C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83) +}; + +static const sph_u32 A_init_256[] = { + C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191), + C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C), + C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A) +}; + +static const sph_u32 B_init_256[] = { + C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F), + C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002), + C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890), + C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5) +}; + +static const sph_u32 C_init_256[] = { + C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55), + C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433), + C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F), + C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60) +}; + +static const sph_u32 A_init_384[] = { + C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83), + C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF), + C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D) +}; + +static const sph_u32 B_init_384[] = { + C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F), + C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641), + C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8), + C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36) +}; + +static const sph_u32 C_init_384[] = { + C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399), + C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261), + C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C), + C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70) +}; +#endif +static const sph_u32 A_init_512[] = { + C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), + C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), + C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) +}; + +static const sph_u32 B_init_512[] = { + C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), + C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), + C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), + C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) +}; + +static const sph_u32 C_init_512[] = { + C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), + C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), + C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), + C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) +}; + +/* END -- automatically generated code. */ + +static void +shabal_init(void *cc, unsigned size) +{ + /* + * We have precomputed initial states for all the supported + * output bit lengths. 
+ */ + const sph_u32 *A_init, *B_init, *C_init; + sph_shabal_context *sc; + + switch (size) { +#if 0 + case 192: + A_init = A_init_192; + B_init = B_init_192; + C_init = C_init_192; + break; + case 224: + A_init = A_init_224; + B_init = B_init_224; + C_init = C_init_224; + break; + case 256: + A_init = A_init_256; + B_init = B_init_256; + C_init = C_init_256; + break; + case 384: + A_init = A_init_384; + B_init = B_init_384; + C_init = C_init_384; + break; +#endif + case 512: + A_init = A_init_512; + B_init = B_init_512; + C_init = C_init_512; + break; + default: + return; + } + sc = cc; + memcpy(sc->A, A_init, sizeof sc->A); + memcpy(sc->B, B_init, sizeof sc->B); + memcpy(sc->C, C_init, sizeof sc->C); + sc->Wlow = 1; + sc->Whigh = 0; + sc->ptr = 0; +} + +static void +shabal_core(void *cc, const unsigned char *data, size_t len) +{ + sph_shabal_context *sc; + unsigned char *buf; + size_t ptr; + DECL_STATE + + sc = cc; + buf = sc->buf; + ptr = sc->ptr; + + /* + * We do not want to copy the state to local variables if the + * amount of data is less than what is needed to complete the + * current block. Note that it is anyway suboptimal to call + * this method many times for small chunks of data. + */ + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + DECODE_BLOCK; + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + INCR_W; + ptr = 0; + } + } + WRITE_STATE(sc); + sc->ptr = ptr; +} + +static void +shabal_close(void *cc, unsigned ub, unsigned n, void *dst, unsigned size_words) +{ + sph_shabal_context *sc; + unsigned char *buf; + size_t ptr; + int i; + unsigned z; + union { + unsigned char tmp_out[64]; + sph_u32 dummy; + } u; + size_t out_len; + DECL_STATE + + sc = cc; + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr + 1, 0, (sizeof sc->buf) - (ptr + 1)); + READ_STATE(sc); + DECODE_BLOCK; + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + for (i = 0; i < 3; i ++) { + SWAP_BC; + XOR_W; + APPLY_P; + } + + /* + * We just use our local variables; no need to go through + * the state structure. In order to share some code, we + * emit the relevant words into a temporary buffer, which + * we finally copy into the destination array. 
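The switch that follows copies only the trailing size_words words of B into the caller's buffer, and the 512-bit wrappers a little further down are the only variant left enabled for this miner. A usage sketch for that variant, assuming sph_shabal.h provides the usual context typedef as in stock sphlib:

#include <stdio.h>
#include <string.h>
#include "sph_shabal.h"

int main(void)
{
    unsigned char digest[64];                 /* Shabal-512 output is 512 bits */
    const char *msg = "abc";
    sph_shabal512_context ctx;

    sph_shabal512_init(&ctx);
    sph_shabal512(&ctx, msg, strlen(msg));    /* buffered 64-byte-block update */
    sph_shabal512_close(&ctx, digest);        /* three extra P rounds, output B, re-init */

    for (int i = 0; i < 64; i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}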
+ */ + switch (size_words) { + case 16: + sph_enc32le_aligned(u.tmp_out + 0, B0); + sph_enc32le_aligned(u.tmp_out + 4, B1); + sph_enc32le_aligned(u.tmp_out + 8, B2); + sph_enc32le_aligned(u.tmp_out + 12, B3); + /* fall through */ + case 12: + sph_enc32le_aligned(u.tmp_out + 16, B4); + sph_enc32le_aligned(u.tmp_out + 20, B5); + sph_enc32le_aligned(u.tmp_out + 24, B6); + sph_enc32le_aligned(u.tmp_out + 28, B7); + /* fall through */ + case 8: + sph_enc32le_aligned(u.tmp_out + 32, B8); + /* fall through */ + case 7: + sph_enc32le_aligned(u.tmp_out + 36, B9); + /* fall through */ + case 6: + sph_enc32le_aligned(u.tmp_out + 40, BA); + sph_enc32le_aligned(u.tmp_out + 44, BB); + sph_enc32le_aligned(u.tmp_out + 48, BC); + sph_enc32le_aligned(u.tmp_out + 52, BD); + sph_enc32le_aligned(u.tmp_out + 56, BE); + sph_enc32le_aligned(u.tmp_out + 60, BF); + break; + default: + return; + } + out_len = size_words << 2; + memcpy(dst, u.tmp_out + (sizeof u.tmp_out) - out_len, out_len); + shabal_init(sc, size_words << 5); +} + +#if 0 +/* see sph_shabal.h */ +void +sph_shabal192_init(void *cc) +{ + shabal_init(cc, 192); +} + +/* see sph_shabal.h */ +void +sph_shabal192(void *cc, const void *data, size_t len) +{ + shabal_core(cc, data, len); +} + +/* see sph_shabal.h */ +void +sph_shabal192_close(void *cc, void *dst) +{ + shabal_close(cc, 0, 0, dst, 6); +} + +/* see sph_shabal.h */ +void +sph_shabal192_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shabal_close(cc, ub, n, dst, 6); +} + +/* see sph_shabal.h */ +void +sph_shabal224_init(void *cc) +{ + shabal_init(cc, 224); +} + +/* see sph_shabal.h */ +void +sph_shabal224(void *cc, const void *data, size_t len) +{ + shabal_core(cc, data, len); +} + +/* see sph_shabal.h */ +void +sph_shabal224_close(void *cc, void *dst) +{ + shabal_close(cc, 0, 0, dst, 7); +} + +/* see sph_shabal.h */ +void +sph_shabal224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shabal_close(cc, ub, n, dst, 7); +} + +/* see sph_shabal.h */ +void +sph_shabal256_init(void *cc) +{ + shabal_init(cc, 256); +} + +/* see sph_shabal.h */ +void +sph_shabal256(void *cc, const void *data, size_t len) +{ + shabal_core(cc, data, len); +} + +/* see sph_shabal.h */ +void +sph_shabal256_close(void *cc, void *dst) +{ + shabal_close(cc, 0, 0, dst, 8); +} + +/* see sph_shabal.h */ +void +sph_shabal256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shabal_close(cc, ub, n, dst, 8); +} + +/* see sph_shabal.h */ +void +sph_shabal384_init(void *cc) +{ + shabal_init(cc, 384); +} + +/* see sph_shabal.h */ +void +sph_shabal384(void *cc, const void *data, size_t len) +{ + shabal_core(cc, data, len); +} + +/* see sph_shabal.h */ +void +sph_shabal384_close(void *cc, void *dst) +{ + shabal_close(cc, 0, 0, dst, 12); +} + +/* see sph_shabal.h */ +void +sph_shabal384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shabal_close(cc, ub, n, dst, 12); +} + +#endif + +/* see sph_shabal.h */ +void +sph_shabal512_init(void *cc) +{ + shabal_init(cc, 512); +} + +/* see sph_shabal.h */ +void +sph_shabal512(void *cc, const void *data, size_t len) +{ + shabal_core(cc, data, len); +} + +/* see sph_shabal.h */ +void +sph_shabal512_close(void *cc, void *dst) +{ + shabal_close(cc, 0, 0, dst, 16); +} + +/* see sph_shabal.h */ +void +sph_shabal512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shabal_close(cc, ub, n, dst, 16); +} +#ifdef __cplusplus +} +#endif diff --git a/sph/skein.c b/sph/skein.c index 2fcfae5383..7e47e35221 100644 --- 
a/sph/skein.c +++ b/sph/skein.c @@ -35,6 +35,11 @@ #include "sph_skein.h" +#ifdef __cplusplus +extern "C"{ +#endif + + #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN #define SPH_SMALL_FOOTPRINT_SKEIN 1 #endif @@ -1242,3 +1247,8 @@ sph_skein512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) } #endif + + +#ifdef __cplusplus +} +#endif diff --git a/sph/sph_blake.h b/sph/sph_blake.h index d8d794399d..2c2b3da292 100644 --- a/sph/sph_blake.h +++ b/sph/sph_blake.h @@ -181,6 +181,11 @@ void sph_blake224_close(void *cc, void *dst); void sph_blake224_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +/** + * Switch for the number of rounds (old blake was 8) + */ +extern int blake256_rounds; + /** * Initialize a BLAKE-256 context. This process performs no memory allocation. * @@ -226,6 +231,11 @@ void sph_blake256_close(void *cc, void *dst); void sph_blake256_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +/** + * Allow blakecoin and blake variants + */ +void sph_blake256_set_rounds(int rounds); + #if SPH_64 /** diff --git a/sph/sph_haval.h b/sph/sph_haval.h new file mode 100644 index 0000000000..409daaf417 --- /dev/null +++ b/sph/sph_haval.h @@ -0,0 +1,976 @@ +/* $Id: sph_haval.h 218 2010-06-08 17:06:34Z tp $ */ +/** + * HAVAL interface. + * + * HAVAL is actually a family of 15 hash functions, depending on whether + * the internal computation uses 3, 4 or 5 passes, and on the output + * length, which is 128, 160, 192, 224 or 256 bits. This implementation + * provides interface functions for all 15, which internally map to + * three cores (depending on the number of passes). Note that output + * lengths other than 256 bits are not obtained by a simple truncation + * of a longer result; the requested length is encoded within the + * padding data. + * + * HAVAL was published in: Yuliang Zheng, Josef Pieprzyk and Jennifer + * Seberry: "HAVAL -- a one-way hashing algorithm with variable length + * of output", Advances in Cryptology -- AUSCRYPT'92, Lecture Notes in + * Computer Science, Vol.718, pp.83-104, Springer-Verlag, 1993. + * + * This paper, and a reference implementation, are available on the + * Calyptix web site: http://labs.calyptix.com/haval.php + * + * The HAVAL reference paper is quite unclear on the data encoding + * details, i.e. endianness (both byte order within a 32-bit word, and + * word order within a message block). This implementation has been + * made compatible with the reference implementation referenced above. + * + * @warning A collision for HAVAL-128/3 (HAVAL with three passes and + * 128-bit output) has been published; this function is thus considered + * as cryptographically broken. The status for other variants is unclear; + * use only with care. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_haval.h + * @author Thomas Pornin + */ + +#ifndef SPH_HAVAL_H__ +#define SPH_HAVAL_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for HAVAL-128/3. + */ +#define SPH_SIZE_haval128_3 128 + +/** + * Output size (in bits) for HAVAL-128/4. + */ +#define SPH_SIZE_haval128_4 128 + +/** + * Output size (in bits) for HAVAL-128/5. + */ +#define SPH_SIZE_haval128_5 128 + +/** + * Output size (in bits) for HAVAL-160/3. + */ +#define SPH_SIZE_haval160_3 160 + +/** + * Output size (in bits) for HAVAL-160/4. + */ +#define SPH_SIZE_haval160_4 160 + +/** + * Output size (in bits) for HAVAL-160/5. + */ +#define SPH_SIZE_haval160_5 160 + +/** + * Output size (in bits) for HAVAL-192/3. + */ +#define SPH_SIZE_haval192_3 192 + +/** + * Output size (in bits) for HAVAL-192/4. + */ +#define SPH_SIZE_haval192_4 192 + +/** + * Output size (in bits) for HAVAL-192/5. + */ +#define SPH_SIZE_haval192_5 192 + +/** + * Output size (in bits) for HAVAL-224/3. + */ +#define SPH_SIZE_haval224_3 224 + +/** + * Output size (in bits) for HAVAL-224/4. + */ +#define SPH_SIZE_haval224_4 224 + +/** + * Output size (in bits) for HAVAL-224/5. + */ +#define SPH_SIZE_haval224_5 224 + +/** + * Output size (in bits) for HAVAL-256/3. + */ +#define SPH_SIZE_haval256_3 256 + +/** + * Output size (in bits) for HAVAL-256/4. + */ +#define SPH_SIZE_haval256_4 256 + +/** + * Output size (in bits) for HAVAL-256/5. + */ +#define SPH_SIZE_haval256_5 256 + +/** + * This structure is a context for HAVAL computations: it contains the + * intermediate values and some data from the last entered block. Once + * a HAVAL computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running HAVAL computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + sph_u32 s0, s1, s2, s3, s4, s5, s6, s7; + unsigned olen, passes; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_haval_context; + +/** + * Type for a HAVAL-128/3 context (identical to the common context). + */ +typedef sph_haval_context sph_haval128_3_context; + +/** + * Type for a HAVAL-128/4 context (identical to the common context). + */ +typedef sph_haval_context sph_haval128_4_context; + +/** + * Type for a HAVAL-128/5 context (identical to the common context). + */ +typedef sph_haval_context sph_haval128_5_context; + +/** + * Type for a HAVAL-160/3 context (identical to the common context). + */ +typedef sph_haval_context sph_haval160_3_context; + +/** + * Type for a HAVAL-160/4 context (identical to the common context). + */ +typedef sph_haval_context sph_haval160_4_context; + +/** + * Type for a HAVAL-160/5 context (identical to the common context). 
+ */ +typedef sph_haval_context sph_haval160_5_context; + +/** + * Type for a HAVAL-192/3 context (identical to the common context). + */ +typedef sph_haval_context sph_haval192_3_context; + +/** + * Type for a HAVAL-192/4 context (identical to the common context). + */ +typedef sph_haval_context sph_haval192_4_context; + +/** + * Type for a HAVAL-192/5 context (identical to the common context). + */ +typedef sph_haval_context sph_haval192_5_context; + +/** + * Type for a HAVAL-224/3 context (identical to the common context). + */ +typedef sph_haval_context sph_haval224_3_context; + +/** + * Type for a HAVAL-224/4 context (identical to the common context). + */ +typedef sph_haval_context sph_haval224_4_context; + +/** + * Type for a HAVAL-224/5 context (identical to the common context). + */ +typedef sph_haval_context sph_haval224_5_context; + +/** + * Type for a HAVAL-256/3 context (identical to the common context). + */ +typedef sph_haval_context sph_haval256_3_context; + +/** + * Type for a HAVAL-256/4 context (identical to the common context). + */ +typedef sph_haval_context sph_haval256_4_context; + +/** + * Type for a HAVAL-256/5 context (identical to the common context). + */ +typedef sph_haval_context sph_haval256_5_context; + +/** + * Initialize the context for HAVAL-128/3. + * + * @param cc context to initialize (pointer to a + * sph_haval128_3_context structure) + */ +void sph_haval128_3_init(void *cc); + +/** + * Process some data bytes for HAVAL-128/3. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-128/3 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval128_3(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-128/3 computation. The output buffer must be wide + * enough to accomodate the result (16 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-128/3 context + * @param dst the output buffer + */ +void sph_haval128_3_close(void *cc, void *dst); + +/** + * Close a HAVAL-128/3 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (16 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-128/3 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval128_3_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-128/4. + * + * @param cc context to initialize (pointer to a + * sph_haval128_4_context structure) + */ +void sph_haval128_4_init(void *cc); + +/** + * Process some data bytes for HAVAL-128/4. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-128/4 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval128_4(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-128/4 computation. The output buffer must be wide + * enough to accomodate the result (16 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-128/4 context + * @param dst the output buffer + */ +void sph_haval128_4_close(void *cc, void *dst); + +/** + * Close a HAVAL-128/4 computation. 
Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (16 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-128/4 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval128_4_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-128/5. + * + * @param cc context to initialize (pointer to a + * sph_haval128_5_context structure) + */ +void sph_haval128_5_init(void *cc); + +/** + * Process some data bytes for HAVAL-128/5. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-128/5 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval128_5(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-128/5 computation. The output buffer must be wide + * enough to accomodate the result (16 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-128/5 context + * @param dst the output buffer + */ +void sph_haval128_5_close(void *cc, void *dst); + +/** + * Close a HAVAL-128/5 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (16 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-128/5 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval128_5_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-160/3. + * + * @param cc context to initialize (pointer to a + * sph_haval160_3_context structure) + */ +void sph_haval160_3_init(void *cc); + +/** + * Process some data bytes for HAVAL-160/3. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-160/3 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval160_3(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-160/3 computation. The output buffer must be wide + * enough to accomodate the result (20 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-160/3 context + * @param dst the output buffer + */ +void sph_haval160_3_close(void *cc, void *dst); + +/** + * Close a HAVAL-160/3 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (20 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-160/3 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval160_3_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-160/4. 
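+ *
+ * Illustrative sketch (added for clarity; not part of the original sphlib
+ * header): every HAVAL variant declared in this file follows the same
+ * init / update / close pattern, e.g. for HAVAL-160/4 and its 20-byte
+ * digest:
+ *
+ *     sph_haval160_4_context ctx;
+ *     unsigned char out[20];
+ *     sph_haval160_4_init(&ctx);
+ *     sph_haval160_4(&ctx, "abc", 3);
+ *     sph_haval160_4_close(&ctx, out);
+ *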
+ * + * @param cc context to initialize (pointer to a + * sph_haval160_4_context structure) + */ +void sph_haval160_4_init(void *cc); + +/** + * Process some data bytes for HAVAL-160/4. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-160/4 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval160_4(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-160/4 computation. The output buffer must be wide + * enough to accomodate the result (20 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-160/4 context + * @param dst the output buffer + */ +void sph_haval160_4_close(void *cc, void *dst); + +/** + * Close a HAVAL-160/4 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (20 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-160/4 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval160_3_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-160/5. + * + * @param cc context to initialize (pointer to a + * sph_haval160_5_context structure) + */ +void sph_haval160_5_init(void *cc); + +/** + * Process some data bytes for HAVAL-160/5. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-160/5 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval160_5(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-160/5 computation. The output buffer must be wide + * enough to accomodate the result (20 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-160/5 context + * @param dst the output buffer + */ +void sph_haval160_5_close(void *cc, void *dst); + +/** + * Close a HAVAL-160/5 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (20 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-160/5 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval160_5_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-192/3. + * + * @param cc context to initialize (pointer to a + * sph_haval192_3_context structure) + */ +void sph_haval192_3_init(void *cc); + +/** + * Process some data bytes for HAVAL-192/3. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-192/3 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval192_3(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-192/3 computation. The output buffer must be wide + * enough to accomodate the result (24 bytes). The context is automatically + * reinitialized. 
+ * + * @param cc the HAVAL-192/3 context + * @param dst the output buffer + */ +void sph_haval192_3_close(void *cc, void *dst); + +/** + * Close a HAVAL-192/3 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (24 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-192/3 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval192_3_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-192/4. + * + * @param cc context to initialize (pointer to a + * sph_haval192_4_context structure) + */ +void sph_haval192_4_init(void *cc); + +/** + * Process some data bytes for HAVAL-192/4. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-192/4 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval192_4(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-192/4 computation. The output buffer must be wide + * enough to accomodate the result (24 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-192/4 context + * @param dst the output buffer + */ +void sph_haval192_4_close(void *cc, void *dst); + +/** + * Close a HAVAL-192/4 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (24 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-192/4 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval192_4_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-192/5. + * + * @param cc context to initialize (pointer to a + * sph_haval192_5_context structure) + */ +void sph_haval192_5_init(void *cc); + +/** + * Process some data bytes for HAVAL-192/5. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-192/5 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval192_5(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-192/5 computation. The output buffer must be wide + * enough to accomodate the result (24 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-192/5 context + * @param dst the output buffer + */ +void sph_haval192_5_close(void *cc, void *dst); + +/** + * Close a HAVAL-192/5 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (24 + * bytes). The context is automatically reinitialized. 
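+ *
+ * Hedged sketch (not from the original header; ctx and out stand for a
+ * context that has already been fed the message and a 24-byte output
+ * buffer): appending the three extra bits 1,1,1 before closing would be
+ *
+ *     sph_haval192_5_addbits_and_close(&ctx, 0xE0, 3, out);
+ *
+ * since 0xE0 carries those bits in its upper positions (128, 64, 32).
+ *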
+ * + * @param cc the HAVAL-192/5 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval192_5_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-224/3. + * + * @param cc context to initialize (pointer to a + * sph_haval224_3_context structure) + */ +void sph_haval224_3_init(void *cc); + +/** + * Process some data bytes for HAVAL-224/3. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-224/3 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval224_3(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-224/3 computation. The output buffer must be wide + * enough to accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-224/3 context + * @param dst the output buffer + */ +void sph_haval224_3_close(void *cc, void *dst); + +/** + * Close a HAVAL-224/3 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (28 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-224/3 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval224_3_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-224/4. + * + * @param cc context to initialize (pointer to a + * sph_haval224_4_context structure) + */ +void sph_haval224_4_init(void *cc); + +/** + * Process some data bytes for HAVAL-224/4. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-224/4 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval224_4(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-224/4 computation. The output buffer must be wide + * enough to accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-224/4 context + * @param dst the output buffer + */ +void sph_haval224_4_close(void *cc, void *dst); + +/** + * Close a HAVAL-224/4 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (28 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-224/4 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval224_4_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-224/5. + * + * @param cc context to initialize (pointer to a + * sph_haval224_5_context structure) + */ +void sph_haval224_5_init(void *cc); + +/** + * Process some data bytes for HAVAL-224/5. If len is 0, + * then this function does nothing. 
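+ *
+ * Input may be supplied across several calls before closing; a sketch
+ * (illustrative only, part1/part2 and their lengths are hypothetical):
+ *
+ *     sph_haval224_5_context ctx;
+ *     unsigned char out[28];
+ *     sph_haval224_5_init(&ctx);
+ *     sph_haval224_5(&ctx, part1, part1_len);
+ *     sph_haval224_5(&ctx, part2, part2_len);
+ *     sph_haval224_5_close(&ctx, out);
+ *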
+ * + * @param cc the HAVAL-224/5 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval224_5(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-224/5 computation. The output buffer must be wide + * enough to accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-224/5 context + * @param dst the output buffer + */ +void sph_haval224_5_close(void *cc, void *dst); + +/** + * Close a HAVAL-224/5 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (28 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-224/5 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval224_5_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-256/3. + * + * @param cc context to initialize (pointer to a + * sph_haval256_3_context structure) + */ +void sph_haval256_3_init(void *cc); + +/** + * Process some data bytes for HAVAL-256/3. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-256/3 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval256_3(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-256/3 computation. The output buffer must be wide + * enough to accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-256/3 context + * @param dst the output buffer + */ +void sph_haval256_3_close(void *cc, void *dst); + +/** + * Close a HAVAL-256/3 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (32 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-256/3 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval256_3_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-256/4. + * + * @param cc context to initialize (pointer to a + * sph_haval256_4_context structure) + */ +void sph_haval256_4_init(void *cc); + +/** + * Process some data bytes for HAVAL-256/4. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-256/4 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval256_4(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-256/4 computation. The output buffer must be wide + * enough to accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-256/4 context + * @param dst the output buffer + */ +void sph_haval256_4_close(void *cc, void *dst); + +/** + * Close a HAVAL-256/4 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. 
the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (32 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-256/4 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval256_4_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Initialize the context for HAVAL-256/5. + * + * @param cc context to initialize (pointer to a + * sph_haval256_5_context structure) + */ +void sph_haval256_5_init(void *cc); + +/** + * Process some data bytes for HAVAL-256/5. If len is 0, + * then this function does nothing. + * + * @param cc the HAVAL-256/5 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_haval256_5(void *cc, const void *data, size_t len); + +/** + * Close a HAVAL-256/5 computation. The output buffer must be wide + * enough to accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the HAVAL-256/5 context + * @param dst the output buffer + */ +void sph_haval256_5_close(void *cc, void *dst); + +/** + * Close a HAVAL-256/5 computation. Up to 7 extra input bits may be added + * to the input message; these are the n upper bits of + * the ub byte (i.e. the first extra bit has value 128 in + * ub, the second extra bit has value 64, and so on). Other + * bits in ub are ignored. + * + * The output buffer must be wide enough to accomodate the result (32 + * bytes). The context is automatically reinitialized. + * + * @param cc the HAVAL-256/5 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the output buffer + */ +void sph_haval256_5_addbits_and_close(void *cc, + unsigned ub, unsigned n, void *dst); + +/** + * Apply the HAVAL compression function on the provided data. The + * msg parameter contains the 32 32-bit input blocks, + * as numerical values (hence after the little-endian decoding). The + * val parameter contains the 8 32-bit input blocks for + * the compression function; the output is written in place in this + * array. This function uses three internal passes. + * + * @param msg the message block (32 values) + * @param val the function 256-bit input and output + */ +void sph_haval_3_comp(const sph_u32 msg[32], sph_u32 val[8]); + +/** + * Apply the HAVAL compression function on the provided data. The + * msg parameter contains the 32 32-bit input blocks, + * as numerical values (hence after the little-endian decoding). The + * val parameter contains the 8 32-bit input blocks for + * the compression function; the output is written in place in this + * array. This function uses four internal passes. + * + * @param msg the message block (32 values) + * @param val the function 256-bit input and output + */ +void sph_haval_4_comp(const sph_u32 msg[32], sph_u32 val[8]); + +/** + * Apply the HAVAL compression function on the provided data. The + * msg parameter contains the 32 32-bit input blocks, + * as numerical values (hence after the little-endian decoding). The + * val parameter contains the 8 32-bit input blocks for + * the compression function; the output is written in place in this + * array. This function uses five internal passes. 
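+ *
+ * Minimal sketch of a call (added for illustration, not part of the
+ * original header; block is a hypothetical pointer to the 128-byte
+ * message block, and val[8] is assumed to already hold the current
+ * 256-bit state):
+ *
+ *     sph_u32 msg[32];
+ *     int i;
+ *     for (i = 0; i < 32; i ++)
+ *         msg[i] = sph_dec32le_aligned(block + 4 * i);
+ *     sph_haval_5_comp(msg, val);
+ *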
+ * + * @param msg the message block (32 values) + * @param val the function 256-bit input and output + */ +void sph_haval_5_comp(const sph_u32 msg[32], sph_u32 val[8]); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/sph/sph_ripemd.h b/sph/sph_ripemd.h new file mode 100644 index 0000000000..f1f0982f4f --- /dev/null +++ b/sph/sph_ripemd.h @@ -0,0 +1,273 @@ +/* $Id: sph_ripemd.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * RIPEMD, RIPEMD-128 and RIPEMD-160 interface. + * + * RIPEMD was first described in: Research and Development in Advanced + * Communication Technologies in Europe, "RIPE Integrity Primitives: + * Final Report of RACE Integrity Primitives Evaluation (R1040)", RACE, + * June 1992. + * + * A new, strengthened version, dubbed RIPEMD-160, was published in: H. + * Dobbertin, A. Bosselaers, and B. Preneel, "RIPEMD-160, a strengthened + * version of RIPEMD", Fast Software Encryption - FSE'96, LNCS 1039, + * Springer (1996), pp. 71--82. + * + * This article describes both RIPEMD-160, with a 160-bit output, and a + * reduced version called RIPEMD-128, which has a 128-bit output. RIPEMD-128 + * was meant as a "drop-in" replacement for any hash function with 128-bit + * output, especially the original RIPEMD. + * + * @warning Collisions, and an efficient method to build other collisions, + * have been published for the original RIPEMD, which is thus considered as + * cryptographically broken. It is also very rarely encountered, and there + * seems to exist no free description or implementation of RIPEMD (except + * the sphlib code, of course). As of january 2007, RIPEMD-128 and RIPEMD-160 + * seem as secure as their output length allows. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_ripemd.h + * @author Thomas Pornin + */ + +#ifndef SPH_RIPEMD_H__ +#define SPH_RIPEMD_H__ + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for RIPEMD. + */ +#define SPH_SIZE_ripemd 128 + +/** + * Output size (in bits) for RIPEMD-128. + */ +#define SPH_SIZE_ripemd128 128 + +/** + * Output size (in bits) for RIPEMD-160. + */ +#define SPH_SIZE_ripemd160 160 + +/** + * This structure is a context for RIPEMD computations: it contains the + * intermediate values and some data from the last entered block. 
Once + * a RIPEMD computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running RIPEMD computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + sph_u32 val[4]; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_ripemd_context; + +/** + * Initialize a RIPEMD context. This process performs no memory allocation. + * + * @param cc the RIPEMD context (pointer to + * a sph_ripemd_context) + */ +void sph_ripemd_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the RIPEMD context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_ripemd(void *cc, const void *data, size_t len); + +/** + * Terminate the current RIPEMD computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (16 bytes). The context is automatically + * reinitialized. + * + * @param cc the RIPEMD context + * @param dst the destination buffer + */ +void sph_ripemd_close(void *cc, void *dst); + +/** + * Apply the RIPEMD compression function on the provided data. The + * msg parameter contains the 16 32-bit input blocks, + * as numerical values (hence after the little-endian decoding). The + * val parameter contains the 5 32-bit input blocks for + * the compression function; the output is written in place in this + * array. + * + * @param msg the message block (16 values) + * @param val the function 128-bit input and output + */ +void sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]); + +/* ===================================================================== */ + +/** + * This structure is a context for RIPEMD-128 computations: it contains the + * intermediate values and some data from the last entered block. Once + * a RIPEMD-128 computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running RIPEMD-128 computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + sph_u32 val[4]; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_ripemd128_context; + +/** + * Initialize a RIPEMD-128 context. This process performs no memory allocation. + * + * @param cc the RIPEMD-128 context (pointer to + * a sph_ripemd128_context) + */ +void sph_ripemd128_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the RIPEMD-128 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_ripemd128(void *cc, const void *data, size_t len); + +/** + * Terminate the current RIPEMD-128 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (16 bytes). The context is automatically + * reinitialized. + * + * @param cc the RIPEMD-128 context + * @param dst the destination buffer + */ +void sph_ripemd128_close(void *cc, void *dst); + +/** + * Apply the RIPEMD-128 compression function on the provided data. 
The + * msg parameter contains the 16 32-bit input blocks, + * as numerical values (hence after the little-endian decoding). The + * val parameter contains the 5 32-bit input blocks for + * the compression function; the output is written in place in this + * array. + * + * @param msg the message block (16 values) + * @param val the function 128-bit input and output + */ +void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]); + +/* ===================================================================== */ + +/** + * This structure is a context for RIPEMD-160 computations: it contains the + * intermediate values and some data from the last entered block. Once + * a RIPEMD-160 computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running RIPEMD-160 computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + sph_u32 val[5]; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_ripemd160_context; + +/** + * Initialize a RIPEMD-160 context. This process performs no memory allocation. + * + * @param cc the RIPEMD-160 context (pointer to + * a sph_ripemd160_context) + */ +void sph_ripemd160_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the RIPEMD-160 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_ripemd160(void *cc, const void *data, size_t len); + +/** + * Terminate the current RIPEMD-160 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (20 bytes). The context is automatically + * reinitialized. + * + * @param cc the RIPEMD-160 context + * @param dst the destination buffer + */ +void sph_ripemd160_close(void *cc, void *dst); + +/** + * Apply the RIPEMD-160 compression function on the provided data. The + * msg parameter contains the 16 32-bit input blocks, + * as numerical values (hence after the little-endian decoding). The + * val parameter contains the 5 32-bit input blocks for + * the compression function; the output is written in place in this + * array. + * + * @param msg the message block (16 values) + * @param val the function 160-bit input and output + */ +void sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5]); + +#endif diff --git a/sph/sph_sha2.c b/sph/sph_sha2.c new file mode 100644 index 0000000000..aab2c5518c --- /dev/null +++ b/sph/sph_sha2.c @@ -0,0 +1,691 @@ +/* $Id: sha2.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SHA-224 / SHA-256 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_sha2.h" + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHA2 +#define SPH_SMALL_FOOTPRINT_SHA2 1 +#endif + +#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) +#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) + +#define ROTR SPH_ROTR32 + +#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define BSG2_1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define SSG2_0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SPH_T32((x) >> 3)) +#define SSG2_1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SPH_T32((x) >> 10)) + +static const sph_u32 H224[8] = { + SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), SPH_C32(0x3070DD17), + SPH_C32(0xF70E5939), SPH_C32(0xFFC00B31), SPH_C32(0x68581511), + SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4) +}; + +static const sph_u32 H256[8] = { + SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372), + SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C), + SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) +}; + +/* + * The SHA2_ROUND_BODY defines the body for a SHA-224 / SHA-256 + * compression function implementation. The "in" parameter should + * evaluate, when applied to a numerical input parameter from 0 to 15, + * to an expression which yields the corresponding input block. The "r" + * parameter should evaluate to an array or pointer expression + * designating the array of 8 words which contains the input and output + * of the compression function. 
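+ *
+ * As an assumed usage sketch (mirroring how such sphlib cores are
+ * typically written, not a quote of the code below): "in" can be a
+ * macro decoding big-endian words from the current 64-byte block and
+ * "r" the 8-word state array, e.g. with data pointing at the block:
+ *
+ *     #define SHA2_IN(x)   sph_dec32be_aligned(data + 4 * (x))
+ *     SHA2_ROUND_BODY(SHA2_IN, r);
+ *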
+ */ + +#if SPH_SMALL_FOOTPRINT_SHA2 + +static const sph_u32 K[64] = { + SPH_C32(0x428A2F98), SPH_C32(0x71374491), + SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5), + SPH_C32(0x3956C25B), SPH_C32(0x59F111F1), + SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5), + SPH_C32(0xD807AA98), SPH_C32(0x12835B01), + SPH_C32(0x243185BE), SPH_C32(0x550C7DC3), + SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE), + SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174), + SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786), + SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC), + SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA), + SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA), + SPH_C32(0x983E5152), SPH_C32(0xA831C66D), + SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7), + SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147), + SPH_C32(0x06CA6351), SPH_C32(0x14292967), + SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138), + SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13), + SPH_C32(0x650A7354), SPH_C32(0x766A0ABB), + SPH_C32(0x81C2C92E), SPH_C32(0x92722C85), + SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B), + SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3), + SPH_C32(0xD192E819), SPH_C32(0xD6990624), + SPH_C32(0xF40E3585), SPH_C32(0x106AA070), + SPH_C32(0x19A4C116), SPH_C32(0x1E376C08), + SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5), + SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A), + SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3), + SPH_C32(0x748F82EE), SPH_C32(0x78A5636F), + SPH_C32(0x84C87814), SPH_C32(0x8CC70208), + SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB), + SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) +}; + +#define SHA2_MEXP1(in, pc) do { \ + W[pc] = in(pc); \ + } while (0) + +#define SHA2_MEXP2(in, pc) do { \ + W[(pc) & 0x0F] = SPH_T32(SSG2_1(W[((pc) - 2) & 0x0F]) \ + + W[((pc) - 7) & 0x0F] \ + + SSG2_0(W[((pc) - 15) & 0x0F]) + W[(pc) & 0x0F]); \ + } while (0) + +#define SHA2_STEPn(n, a, b, c, d, e, f, g, h, in, pc) do { \ + sph_u32 t1, t2; \ + SHA2_MEXP ## n(in, pc); \ + t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \ + + K[pcount + (pc)] + W[(pc) & 0x0F]); \ + t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \ + d = SPH_T32(d + t1); \ + h = SPH_T32(t1 + t2); \ + } while (0) + +#define SHA2_STEP1(a, b, c, d, e, f, g, h, in, pc) \ + SHA2_STEPn(1, a, b, c, d, e, f, g, h, in, pc) +#define SHA2_STEP2(a, b, c, d, e, f, g, h, in, pc) \ + SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc) + +#define SHA2_ROUND_BODY(in, r) do { \ + sph_u32 A, B, C, D, E, F, G, H; \ + sph_u32 W[16]; \ + unsigned pcount; \ + \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + D = (r)[3]; \ + E = (r)[4]; \ + F = (r)[5]; \ + G = (r)[6]; \ + H = (r)[7]; \ + pcount = 0; \ + SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \ + SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \ + SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \ + SHA2_STEP1(F, G, H, A, B, C, D, E, in, 3); \ + SHA2_STEP1(E, F, G, H, A, B, C, D, in, 4); \ + SHA2_STEP1(D, E, F, G, H, A, B, C, in, 5); \ + SHA2_STEP1(C, D, E, F, G, H, A, B, in, 6); \ + SHA2_STEP1(B, C, D, E, F, G, H, A, in, 7); \ + SHA2_STEP1(A, B, C, D, E, F, G, H, in, 8); \ + SHA2_STEP1(H, A, B, C, D, E, F, G, in, 9); \ + SHA2_STEP1(G, H, A, B, C, D, E, F, in, 10); \ + SHA2_STEP1(F, G, H, A, B, C, D, E, in, 11); \ + SHA2_STEP1(E, F, G, H, A, B, C, D, in, 12); \ + SHA2_STEP1(D, E, F, G, H, A, B, C, in, 13); \ + SHA2_STEP1(C, D, E, F, G, H, A, B, in, 14); \ + SHA2_STEP1(B, C, D, E, F, G, H, A, in, 15); \ + for (pcount = 16; pcount < 64; pcount += 16) { \ + SHA2_STEP2(A, B, C, D, E, F, G, H, in, 0); \ + SHA2_STEP2(H, A, B, C, D, E, F, G, in, 1); \ + SHA2_STEP2(G, H, A, B, C, D, E, F, in, 2); \ + SHA2_STEP2(F, G, H, A, B, C, D, E, in, 3); \ + SHA2_STEP2(E, F, G, H, A, B, C, 
D, in, 4); \ + SHA2_STEP2(D, E, F, G, H, A, B, C, in, 5); \ + SHA2_STEP2(C, D, E, F, G, H, A, B, in, 6); \ + SHA2_STEP2(B, C, D, E, F, G, H, A, in, 7); \ + SHA2_STEP2(A, B, C, D, E, F, G, H, in, 8); \ + SHA2_STEP2(H, A, B, C, D, E, F, G, in, 9); \ + SHA2_STEP2(G, H, A, B, C, D, E, F, in, 10); \ + SHA2_STEP2(F, G, H, A, B, C, D, E, in, 11); \ + SHA2_STEP2(E, F, G, H, A, B, C, D, in, 12); \ + SHA2_STEP2(D, E, F, G, H, A, B, C, in, 13); \ + SHA2_STEP2(C, D, E, F, G, H, A, B, in, 14); \ + SHA2_STEP2(B, C, D, E, F, G, H, A, in, 15); \ + } \ + (r)[0] = SPH_T32((r)[0] + A); \ + (r)[1] = SPH_T32((r)[1] + B); \ + (r)[2] = SPH_T32((r)[2] + C); \ + (r)[3] = SPH_T32((r)[3] + D); \ + (r)[4] = SPH_T32((r)[4] + E); \ + (r)[5] = SPH_T32((r)[5] + F); \ + (r)[6] = SPH_T32((r)[6] + G); \ + (r)[7] = SPH_T32((r)[7] + H); \ + } while (0) + +#else + +#define SHA2_ROUND_BODY(in, r) do { \ + sph_u32 A, B, C, D, E, F, G, H, T1, T2; \ + sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \ + sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \ + int i; \ + \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + D = (r)[3]; \ + E = (r)[4]; \ + F = (r)[5]; \ + G = (r)[6]; \ + H = (r)[7]; \ + W00 = in(0); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x428A2F98) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = in(1); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x71374491) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = in(2); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xB5C0FBCF) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = in(3); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xE9B5DBA5) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = in(4); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x3956C25B) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = in(5); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x59F111F1) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = in(6); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x923F82A4) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = in(7); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xAB1C5ED5) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = in(8); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0xD807AA98) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = in(9); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x12835B01) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = in(10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x243185BE) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = in(11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x550C7DC3) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = in(12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x72BE5D74) + 
W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = in(13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x80DEB1FE) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = in(14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x9BDC06A7) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = in(15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xC19BF174) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0xE49B69C1) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xEFBE4786) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x0FC19DC6) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x240CA1CC) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x2DE92C6F) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x4A7484AA) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x5CB0A9DC) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x76F988DA) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x983E5152) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xA831C66D) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xB00327C8) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xBF597FC7) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + 
CH(A, B, C) \ + + SPH_C32(0xC6E00BF3) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xD5A79147) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x06CA6351) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x14292967) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x27B70A85) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x2E1B2138) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x4D2C6DFC) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x53380D13) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x650A7354) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x766A0ABB) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x81C2C92E) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x92722C85) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0xA2BFE8A1) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xA81A664B) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xC24B8B70) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xC76C51A3) + W11); \ + T2 = SPH_T32(BSG2_0(F) + 
MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0xD192E819) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xD6990624) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0xF40E3585) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x106AA070) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x19A4C116) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x1E376C08) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x2748774C) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x34B0BCB5) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x391C0CB3) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x4ED8AA4A) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x5B9CCA4F) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x682E6FF3) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x748F82EE) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x78A5636F) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x84C87814) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 
= SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x8CC70208) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x90BEFFFA) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xA4506CEB) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0xBEF9A3F7) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xC67178F2) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + (r)[0] = SPH_T32((r)[0] + A); \ + (r)[1] = SPH_T32((r)[1] + B); \ + (r)[2] = SPH_T32((r)[2] + C); \ + (r)[3] = SPH_T32((r)[3] + D); \ + (r)[4] = SPH_T32((r)[4] + E); \ + (r)[5] = SPH_T32((r)[5] + F); \ + (r)[6] = SPH_T32((r)[6] + G); \ + (r)[7] = SPH_T32((r)[7] + H); \ + } while (0) + +#endif + +/* + * One round of SHA-224 / SHA-256. The data must be aligned for 32-bit access. + */ +static void +sha2_round(const unsigned char *data, sph_u32 r[8]) +{ +#define SHA2_IN(x) sph_dec32be_aligned(data + (4 * (x))) + SHA2_ROUND_BODY(SHA2_IN, r); +#undef SHA2_IN +} + +/* see sph_sha2.h */ +void +sph_sha224_init(void *cc) +{ + sph_sha224_context *sc; + + sc = cc; + memcpy(sc->val, H224, sizeof H224); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +/* see sph_sha2.h */ +void +sph_sha256_init(void *cc) +{ + sph_sha256_context *sc; + + sc = cc; + memcpy(sc->val, H256, sizeof H256); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +#define RFUN sha2_round +#define HASH sha224 +#define BE32 1 +#include "md_helper.c" + +/* see sph_sha2.h */ +void +sph_sha224_close(void *cc, void *dst) +{ + sha224_close(cc, dst, 7); + sph_sha224_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + sha224_addbits_and_close(cc, ub, n, dst, 7); + sph_sha224_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha256_close(void *cc, void *dst) +{ + sha224_close(cc, dst, 8); + sph_sha256_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + sha224_addbits_and_close(cc, ub, n, dst, 8); + sph_sha256_init(cc); +} + +/* see sph_sha2.h */ +void +sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]) +{ +#define SHA2_IN(x) msg[x] + SHA2_ROUND_BODY(SHA2_IN, val); +#undef SHA2_IN +} diff --git a/sph/sph_sha2.h b/sph/sph_sha2.h new file mode 100644 index 0000000000..c47b0f3698 --- /dev/null +++ b/sph/sph_sha2.h @@ -0,0 +1,378 @@ +/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * SHA-224, SHA-256, SHA-384 and SHA-512 interface. + * + * SHA-256 has been published in FIPS 180-2, now amended with a change + * notice to include SHA-224 as well (which is a simple variation on + * SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. 
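The unrolled SHA2_ROUND_BODY macro above avoids the usual end-of-round register shuffle by renaming A..H at every step. For reference, a minimal sketch (not part of the patch; rotr32 and sha256_round_step are hypothetical helper names) of the textbook single round that each macro step expands to:

#include <stdint.h>

/* rotate right by n bits (0 < n < 32) */
static uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* One SHA-256 round: s[0..7] holds A..H, k is the round constant, w the schedule word. */
static void sha256_round_step(uint32_t s[8], uint32_t k, uint32_t w)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
	uint32_t bsg1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);  /* BSG2_1(E) */
	uint32_t ch   = (e & f) ^ (~e & g);                            /* CH(E,F,G) */
	uint32_t t1   = h + bsg1 + ch + k + w;
	uint32_t bsg0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);  /* BSG2_0(A) */
	uint32_t maj  = (a & b) ^ (a & c) ^ (b & c);                   /* MAJ(A,B,C) */
	uint32_t t2   = bsg0 + maj;
	s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
	s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

The feed-forward at the end of the macro ((r)[i] += A..H) then adds the working variables back into the chaining value, as specified in FIPS 180-2.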
FIPS + * standards can be found at: + * http://csrc.nist.gov/publications/fips/ + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_sha2.h + * @author Thomas Pornin + */ + +#ifndef SPH_SHA2_H__ +#define SPH_SHA2_H__ + +#include +#include "sph_types.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +/** + * Output size (in bits) for SHA-224. + */ +#define SPH_SIZE_sha224 224 + +/** + * Output size (in bits) for SHA-256. + */ +#define SPH_SIZE_sha256 256 + +/** + * This structure is a context for SHA-224 computations: it contains the + * intermediate values and some data from the last entered block. Once + * a SHA-224 computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running SHA-224 computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + sph_u32 val[8]; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_sha224_context; + +/** + * This structure is a context for SHA-256 computations. It is identical + * to the SHA-224 context. However, a context is initialized for SHA-224 + * or SHA-256, but not both (the internal IV is not the + * same). + */ +typedef sph_sha224_context sph_sha256_context; + +/** + * Initialize a SHA-224 context. This process performs no memory allocation. + * + * @param cc the SHA-224 context (pointer to + * a sph_sha224_context) + */ +void sph_sha224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHA-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_sha224(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHA-224 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. 
+ * + * @param cc the SHA-224 context + * @param dst the destination buffer + */ +void sph_sha224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHA-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Apply the SHA-224 compression function on the provided data. The + * msg parameter contains the 16 32-bit input blocks, + * as numerical values (hence after the big-endian decoding). The + * val parameter contains the 8 32-bit input blocks for + * the compression function; the output is written in place in this + * array. + * + * @param msg the message block (16 values) + * @param val the function 256-bit input and output + */ +void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]); + +/** + * Initialize a SHA-256 context. This process performs no memory allocation. + * + * @param cc the SHA-256 context (pointer to + * a sph_sha256_context) + */ +void sph_sha256_init(void *cc); + +#ifdef DOXYGEN_IGNORE +/** + * Process some data bytes, for SHA-256. This function is identical to + * sha_224() + * + * @param cc the SHA-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_sha256(void *cc, const void *data, size_t len); +#endif + +#ifndef DOXYGEN_IGNORE +#define sph_sha256 sph_sha224 +#endif + +/** + * Terminate the current SHA-256 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHA-256 context + * @param dst the destination buffer + */ +void sph_sha256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHA-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef DOXYGEN_IGNORE +/** + * Apply the SHA-256 compression function on the provided data. This + * function is identical to sha224_comp(). + * + * @param msg the message block (16 values) + * @param val the function 256-bit input and output + */ +void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]); +#endif + +#ifndef DOXYGEN_IGNORE +#define sph_sha256_comp sph_sha224_comp +#endif + +#if SPH_64 + +/** + * Output size (in bits) for SHA-384. + */ +#define SPH_SIZE_sha384 384 + +/** + * Output size (in bits) for SHA-512. 
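The SHA-224/SHA-256 declarations above follow the usual sph init/update/close pattern, with sph_sha256 defined as an alias of sph_sha224 (same update routine; only the IV loaded at init time and the output length differ). A hypothetical usage sketch, not part of the patch:

static void example_sha256(const void *msg, size_t len, unsigned char digest[32])
{
	sph_sha256_context ctx;
	sph_sha256_init(&ctx);            /* load the SHA-256 IV */
	sph_sha256(&ctx, msg, len);       /* may be called repeatedly to stream data */
	sph_sha256_close(&ctx, digest);   /* pad, write 32 bytes, re-initialize ctx */
}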
+ */ +#define SPH_SIZE_sha512 512 + +/** + * This structure is a context for SHA-384 computations: it contains the + * intermediate values and some data from the last entered block. Once + * a SHA-384 computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running SHA-384 computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + sph_u64 val[8]; + sph_u64 count; +#endif +} sph_sha384_context; + +/** + * Initialize a SHA-384 context. This process performs no memory allocation. + * + * @param cc the SHA-384 context (pointer to + * a sph_sha384_context) + */ +void sph_sha384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHA-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_sha384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHA-384 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHA-384 context + * @param dst the destination buffer + */ +void sph_sha384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHA-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Apply the SHA-384 compression function on the provided data. The + * msg parameter contains the 16 64-bit input blocks, + * as numerical values (hence after the big-endian decoding). The + * val parameter contains the 8 64-bit input blocks for + * the compression function; the output is written in place in this + * array. + * + * @param msg the message block (16 values) + * @param val the function 512-bit input and output + */ +void sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8]); + +/** + * This structure is a context for SHA-512 computations. It is identical + * to the SHA-384 context. However, a context is initialized for SHA-384 + * or SHA-512, but not both (the internal IV is not the + * same). + */ +typedef sph_sha384_context sph_sha512_context; + +/** + * Initialize a SHA-512 context. This process performs no memory allocation. + * + * @param cc the SHA-512 context (pointer to + * a sph_sha512_context) + */ +void sph_sha512_init(void *cc); + +#ifdef DOXYGEN_IGNORE +/** + * Process some data bytes, for SHA-512. This function is identical to + * sph_sha384(). 
+ * + * @param cc the SHA-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_sha512(void *cc, const void *data, size_t len); +#endif + +#ifndef DOXYGEN_IGNORE +#define sph_sha512 sph_sha384 +#endif + +/** + * Terminate the current SHA-512 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHA-512 context + * @param dst the destination buffer + */ +void sph_sha512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHA-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef DOXYGEN_IGNORE +/** + * Apply the SHA-512 compression function. This function is identical to + * sph_sha384_comp(). + * + * @param msg the message block (16 values) + * @param val the function 512-bit input and output + */ +void sph_sha512_comp(const sph_u64 msg[16], sph_u64 val[8]); +#endif + +#ifndef DOXYGEN_IGNORE +#define sph_sha512_comp sph_sha384_comp +#endif + +#endif + +#endif +#ifdef __cplusplus +} +#endif + diff --git a/sph/sph_shabal.h b/sph/sph_shabal.h new file mode 100644 index 0000000000..08aa5f73cb --- /dev/null +++ b/sph/sph_shabal.h @@ -0,0 +1,344 @@ +/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */ +/** + * Shabal interface. Shabal is a family of functions which differ by + * their output size; this implementation defines Shabal for output + * sizes 192, 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @file sph_shabal.h + * @author Thomas Pornin + */ + +#ifndef SPH_SHABAL_H__ +#define SPH_SHABAL_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for Shabal-192. + */ +#define SPH_SIZE_shabal192 192 + +/** + * Output size (in bits) for Shabal-224. + */ +#define SPH_SIZE_shabal224 224 + +/** + * Output size (in bits) for Shabal-256. + */ +#define SPH_SIZE_shabal256 256 + +/** + * Output size (in bits) for Shabal-384. + */ +#define SPH_SIZE_shabal384 384 + +/** + * Output size (in bits) for Shabal-512. + */ +#define SPH_SIZE_shabal512 512 + +/** + * This structure is a context for Shabal computations: it contains the + * intermediate values and some data from the last entered block. Once + * a Shabal computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running Shabal computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 A[12], B[16], C[16]; + sph_u32 Whigh, Wlow; +#endif +} sph_shabal_context; + +/** + * Type for a Shabal-192 context (identical to the common context). + */ +typedef sph_shabal_context sph_shabal192_context; + +/** + * Type for a Shabal-224 context (identical to the common context). + */ +typedef sph_shabal_context sph_shabal224_context; + +/** + * Type for a Shabal-256 context (identical to the common context). + */ +typedef sph_shabal_context sph_shabal256_context; + +/** + * Type for a Shabal-384 context (identical to the common context). + */ +typedef sph_shabal_context sph_shabal384_context; + +/** + * Type for a Shabal-512 context (identical to the common context). + */ +typedef sph_shabal_context sph_shabal512_context; + +/** + * Initialize a Shabal-192 context. This process performs no memory allocation. + * + * @param cc the Shabal-192 context (pointer to a + * sph_shabal192_context) + */ +void sph_shabal192_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Shabal-192 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shabal192(void *cc, const void *data, size_t len); + +/** + * Terminate the current Shabal-192 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (24 bytes). The context is automatically + * reinitialized. + * + * @param cc the Shabal-192 context + * @param dst the destination buffer + */ +void sph_shabal192_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (24 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
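Every *_addbits_and_close function in these headers uses the same convention: the n extra bits are taken MSB-first from bit positions 7 down to 8-n of ub. As an illustration (an assumption for this example, not code from the patch), appending the three bits 1, 0, 1 after the bytes "abc" to a Shabal-192 stream looks like:

static void example_addbits(unsigned char digest24[24])
{
	sph_shabal192_context cc;
	sph_shabal192_init(&cc);
	sph_shabal192(&cc, "abc", 3);
	/* bits 1,0,1 go to bit positions 7,6,5 of ub, so ub = 0xA0 and n = 3 */
	sph_shabal192_addbits_and_close(&cc, 0xA0, 3, digest24);
}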
+ * + * @param cc the Shabal-192 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shabal192_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Shabal-224 context. This process performs no memory allocation. + * + * @param cc the Shabal-224 context (pointer to a + * sph_shabal224_context) + */ +void sph_shabal224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Shabal-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shabal224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Shabal-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Shabal-224 context + * @param dst the destination buffer + */ +void sph_shabal224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Shabal-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shabal224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Shabal-256 context. This process performs no memory allocation. + * + * @param cc the Shabal-256 context (pointer to a + * sph_shabal256_context) + */ +void sph_shabal256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Shabal-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shabal256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Shabal-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Shabal-256 context + * @param dst the destination buffer + */ +void sph_shabal256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Shabal-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shabal256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Shabal-384 context. This process performs no memory allocation. 
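As the context comments above note, a running Shabal computation can be forked simply by copying the structure. A minimal sketch (hypothetical helper, not part of the patch) that finalizes a prefix digest without disturbing the live stream:

#include <string.h>

static void example_shabal_fork(const sph_shabal256_context *running,
                                unsigned char prefix_digest[32])
{
	sph_shabal256_context snapshot;
	memcpy(&snapshot, running, sizeof snapshot);    /* clone the full state */
	sph_shabal256_close(&snapshot, prefix_digest);  /* finalize only the copy */
}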
+ * + * @param cc the Shabal-384 context (pointer to a + * sph_shabal384_context) + */ +void sph_shabal384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Shabal-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shabal384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Shabal-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Shabal-384 context + * @param dst the destination buffer + */ +void sph_shabal384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Shabal-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shabal384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Shabal-512 context. This process performs no memory allocation. + * + * @param cc the Shabal-512 context (pointer to a + * sph_shabal512_context) + */ +void sph_shabal512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Shabal-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shabal512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Shabal-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Shabal-512 context + * @param dst the destination buffer + */ +void sph_shabal512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. 
+ * + * @param cc the Shabal-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shabal512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_skein.h b/sph/sph_skein.h index 85559843f9..bddbc86fa5 100644 --- a/sph/sph_skein.h +++ b/sph/sph_skein.h @@ -41,6 +41,10 @@ #ifndef SPH_SKEIN_H__ #define SPH_SKEIN_H__ +#ifdef __cplusplus +extern "C"{ +#endif + #include #include "sph_types.h" @@ -287,4 +291,8 @@ void sph_skein512_addbits_and_close( #endif +#ifdef __cplusplus +} +#endif + #endif diff --git a/sph/sph_streebog.h b/sph/sph_streebog.h new file mode 100644 index 0000000000..aa8f69d0df --- /dev/null +++ b/sph/sph_streebog.h @@ -0,0 +1,185 @@ +/* $Id: sph_gost.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * GOST interface. This is the interface for GOST R 12 with the + * recommended parameters for SHA-3, with output lengths 256 + * and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_gost.h + * @author Mish + */ + +#ifndef SPH_GOST_H__ +#define SPH_GOST_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for GOST-256. + */ +#define SPH_SIZE_gost256 256 + +/** + * Output size (in bits) for GOST-512. + */ +#define SPH_SIZE_gost512 512 + +/** + * This structure is a context for Keccak computations: it contains the + * intermediate values and some data from the last entered block. Once a + * GOST computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running GOST computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ + +/** + * This structure is a context for Gost-256 computations. + */ + +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[3][8]; +#endif +} sph_gost256_context; + +/** + * This structure is a context for Gost-512 computations. 
+ */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[5][8]; +#endif +} sph_gost512_context; + + +/** + * Initialize a GOST-256 context. This process performs no memory allocation. + * + * @param cc the GOST-256 context (pointer to a + * sph_gost256_context) + */ +void sph_gost256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Gost-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_gost256(void *cc, const void *data, size_t len); + +/** + * Terminate the current GOST-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the GOST-256 context + * @param dst the destination buffer + */ +void sph_gost256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the GOST-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_gost256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Gost-512 context. This process performs no memory allocation. + * + * @param cc the GOST-512 context (pointer to a + * sph_gost512_context) + */ +void sph_gost512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the GOST-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_gost512(void *cc, const void *data, size_t len); + +/** + * Terminate the current GOST-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the GOST-512 context + * @param dst the destination buffer + */ +void sph_gost512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the GOST-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_gost512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_whirlpool.h b/sph/sph_whirlpool.h new file mode 100644 index 0000000000..493b7d7918 --- /dev/null +++ b/sph/sph_whirlpool.h @@ -0,0 +1,216 @@ +/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * WHIRLPOOL interface. 
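The GOST R 12 (Streebog) interface declared above mirrors the other sph hashes; per the comment in streebog.c it is used by the sib (SibCoin) algorithm. A hedged usage sketch of the 512-bit variant (hypothetical function name, not part of the patch):

static void example_gost512(const void *msg, size_t len, unsigned char digest[64])
{
	sph_gost512_context ctx;
	sph_gost512_init(&ctx);
	sph_gost512(&ctx, msg, len);
	sph_gost512_close(&ctx, digest);
}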
+ * + * WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original + * version, published in 2000, studied by NESSIE), "WHIRLPOOL-1" + * (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current + * version, 2003, with a new diffusion matrix, also described as "plain + * WHIRLPOOL"). All three variants are implemented here. + * + * The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L. + * M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open + * NESSIE Workshop, Leuven, Belgium, November 13--14, 2000. + * + * The current WHIRLPOOL specification and a reference implementation + * can be found on the WHIRLPOOL web page: + * http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_whirlpool.h + * @author Thomas Pornin + */ + +#ifndef SPH_WHIRLPOOL_H__ +#define SPH_WHIRLPOOL_H__ + +#include +#include "sph_types.h" +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_64 + +/** + * Output size (in bits) for WHIRLPOOL. + */ +#define SPH_SIZE_whirlpool 512 + +/** + * Output size (in bits) for WHIRLPOOL-0. + */ +#define SPH_SIZE_whirlpool0 512 + +/** + * Output size (in bits) for WHIRLPOOL-1. + */ +#define SPH_SIZE_whirlpool1 512 + +/** + * This structure is a context for WHIRLPOOL computations: it contains the + * intermediate values and some data from the last entered block. Once + * a WHIRLPOOL computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running WHIRLPOOL computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + sph_u64 state[8]; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_whirlpool_context; + +/** + * Initialize a WHIRLPOOL context. This process performs no memory allocation. + * + * @param cc the WHIRLPOOL context (pointer to a + * sph_whirlpool_context) + */ +void sph_whirlpool_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). This function applies the + * plain WHIRLPOOL algorithm. 
+ * + * @param cc the WHIRLPOOL context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_whirlpool(void *cc, const void *data, size_t len); + +/** + * Terminate the current WHIRLPOOL computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the WHIRLPOOL context + * @param dst the destination buffer + */ +void sph_whirlpool_close(void *cc, void *dst); + +/** + * WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL. + */ +typedef sph_whirlpool_context sph_whirlpool0_context; + +#ifdef DOXYGEN_IGNORE +/** + * Initialize a WHIRLPOOL-0 context. This function is identical to + * sph_whirlpool_init(). + * + * @param cc the WHIRLPOOL context (pointer to a + * sph_whirlpool0_context) + */ +void sph_whirlpool0_init(void *cc); +#endif + +#ifndef DOXYGEN_IGNORE +#define sph_whirlpool0_init sph_whirlpool_init +#endif + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). This function applies the + * WHIRLPOOL-0 algorithm. + * + * @param cc the WHIRLPOOL context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_whirlpool0(void *cc, const void *data, size_t len); + +/** + * Terminate the current WHIRLPOOL-0 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the WHIRLPOOL-0 context + * @param dst the destination buffer + */ +void sph_whirlpool0_close(void *cc, void *dst); + +/** + * WHIRLPOOL-1 uses the same structure than plain WHIRLPOOL. + */ +typedef sph_whirlpool_context sph_whirlpool1_context; + +#ifdef DOXYGEN_IGNORE +/** + * Initialize a WHIRLPOOL-1 context. This function is identical to + * sph_whirlpool_init(). + * + * @param cc the WHIRLPOOL context (pointer to a + * sph_whirlpool1_context) + */ +void sph_whirlpool1_init(void *cc); +#endif + +#ifndef DOXYGEN_IGNORE +#define sph_whirlpool1_init sph_whirlpool_init +#endif + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). This function applies the + * WHIRLPOOL-1 algorithm. + * + * @param cc the WHIRLPOOL context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_whirlpool1(void *cc, const void *data, size_t len); + +/** + * Terminate the current WHIRLPOOL-1 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (64 bytes). The context is automatically + * reinitialized. 
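As the declarations above show, the three WHIRLPOOL variants share a single context layout and a single init routine (the *_init names are macro aliases); only the update function chosen selects WHIRLPOOL-0, WHIRLPOOL-1 or plain WHIRLPOOL. A hypothetical sketch, not part of the patch:

static void example_whirlpool1(const void *msg, size_t len, unsigned char digest[64])
{
	sph_whirlpool1_context ctx;          /* same struct as sph_whirlpool_context */
	sph_whirlpool1_init(&ctx);           /* alias of sph_whirlpool_init */
	sph_whirlpool1(&ctx, msg, len);      /* WHIRLPOOL-1 variant of the update */
	sph_whirlpool1_close(&ctx, digest);  /* 64-byte digest */
}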
+ * + * @param cc the WHIRLPOOL-1 context + * @param dst the destination buffer + */ +void sph_whirlpool1_close(void *cc, void *dst); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/sph/streebog.c b/sph/streebog.c new file mode 100644 index 0000000000..dbae4e54b7 --- /dev/null +++ b/sph/streebog.c @@ -0,0 +1,1045 @@ +/* Streebog GOST hash function for sib algo SibCoin */ + +#include +#include +#include +#include + +#include "sph_streebog.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +//-------------------------------------------------------------------------------------------- +// +// Streebog 512 implementation +// +//-------------------------------------------------------------------------------------------- + + +// Tables for function F +static const sph_u64 TG[8][256] = {{ + 0xE6F87E5C5B711FD0,0x258377800924FA16,0xC849E07E852EA4A8,0x5B4686A18F06C16A, + 0x0B32E9A2D77B416E,0xABDA37A467815C66,0xF61796A81A686676,0xF5DC0B706391954B, + 0x4862F38DB7E64BF1,0xFF5C629A68BD85C5,0xCB827DA6FCD75795,0x66D36DAF69B9F089, + 0x356C9F74483D83B0,0x7CBCECB1238C99A1,0x36A702AC31C4708D,0x9EB6A8D02FBCDFD6, + 0x8B19FA51E5B3AE37,0x9CCFB5408A127D0B,0xBC0C78B508208F5A,0xE533E3842288ECED, + 0xCEC2C7D377C15FD2,0xEC7817B6505D0F5E,0xB94CC2C08336871D,0x8C205DB4CB0B04AD, + 0x763C855B28A0892F,0x588D1B79F6FF3257,0x3FECF69E4311933E,0x0FC0D39F803A18C9, + 0xEE010A26F5F3AD83,0x10EFE8F4411979A6,0x5DCDA10C7DE93A10,0x4A1BEE1D1248E92C, + 0x53BFF2DB21847339,0xB4F50CCFA6A23D09,0x5FB4BC9CD84798CD,0xE88A2D8B071C56F9, + 0x7F7771695A756A9C,0xC5F02E71A0BA1EBC,0xA663F9AB4215E672,0x2EB19E22DE5FBB78, + 0x0DB9CE0F2594BA14,0x82520E6397664D84,0x2F031E6A0208EA98,0x5C7F2144A1BE6BF0, + 0x7A37CB1CD16362DB,0x83E08E2B4B311C64,0xCF70479BAB960E32,0x856BA986B9DEE71E, + 0xB5478C877AF56CE9,0xB8FE42885F61D6FD,0x1BDD0156966238C8,0x622157923EF8A92E, + 0xFC97FF42114476F8,0x9D7D350856452CEB,0x4C90C9B0E0A71256,0x2308502DFBCB016C, + 0x2D7A03FAA7A64845,0xF46E8B38BFC6C4AB,0xBDBEF8FDD477DEBA,0x3AAC4CEBC8079B79, + 0xF09CB105E8879D0C,0x27FA6A10AC8A58CB,0x8960E7C1401D0CEA,0x1A6F811E4A356928, + 0x90C4FB0773D196FF,0x43501A2F609D0A9F,0xF7A516E0C63F3796,0x1CE4A6B3B8DA9252, + 0x1324752C38E08A9B,0xA5A864733BEC154F,0x2BF124575549B33F,0xD766DB15440DC5C7, + 0xA7D179E39E42B792,0xDADF151A61997FD3,0x86A0345EC0271423,0x38D5517B6DA939A4, + 0x6518F077104003B4,0x02791D90A5AEA2DD,0x88D267899C4A5D0A,0x930F66DF0A2865C2, + 0x4EE9D4204509B08B,0x325538916685292A,0x412907BFC533A842,0xB27E2B62544DC673, + 0x6C5304456295E007,0x5AF406E95351908A,0x1F2F3B6BC123616F,0xC37B09DC5255E5C6, + 0x3967D133B1FE6844,0x298839C7F0E711E2,0x409B87F71964F9A2,0xE938ADC3DB4B0719, + 0x0C0B4E47F9C3EBF4,0x5534D576D36B8843,0x4610A05AEB8B02D8,0x20C3CDF58232F251, + 0x6DE1840DBEC2B1E7,0xA0E8DE06B0FA1D08,0x7B854B540D34333B,0x42E29A67BCCA5B7F, + 0xD8A6088AC437DD0E,0xC63BB3A9D943ED81,0x21714DBD5E65A3B1,0x6761EDE7B5EEA169, + 0x2431F7C8D573ABF6,0xD51FC685E1A3671A,0x5E063CD40410C92D,0x283AB98F2CB04002, + 0x8FEBC06CB2F2F790,0x17D64F116FA1D33C,0xE07359F1A99EE4AA,0x784ED68C74CDC006, + 0x6E2A19D5C73B42DA,0x8712B4161C7045C3,0x371582E4ED93216D,0xACE390414939F6FC, + 0x7EC5F12186223B7C,0xC0B094042BAC16FB,0xF9D745379A527EBF,0x737C3F2EA3B68168, + 0x33E7B8D9BAD278CA,0xA9A32A34C22FFEBB,0xE48163CCFEDFBD0D,0x8E5940246EA5A670, + 0x51C6EF4B842AD1E4,0x22BAD065279C508C,0xD91488C218608CEE,0x319EA5491F7CDA17, + 0xD394E128134C9C60,0x094BF43272D5E3B3,0x9BF612A5A4AAD791,0xCCBBDA43D26FFD0F, + 
0x34DE1F3C946AD250,0x4F5B5468995EE16B,0xDF9FAF6FEA8F7794,0x2648EA5870DD092B, + 0xBFC7E56D71D97C67,0xDDE6B2FF4F21D549,0x3C276B463AE86003,0x91767B4FAF86C71F, + 0x68A13E7835D4B9A0,0xB68C115F030C9FD4,0x141DD2C916582001,0x983D8F7DDD5324AC, + 0x64AA703FCC175254,0xC2C989948E02B426,0x3E5E76D69F46C2DE,0x50746F03587D8004, + 0x45DB3D829272F1E5,0x60584A029B560BF3,0xFBAE58A73FFCDC62,0xA15A5E4E6CAD4CE8, + 0x4BA96E55CE1FB8CC,0x08F9747AAE82B253,0xC102144CF7FB471B,0x9F042898F3EB8E36, + 0x068B27ADF2EFFB7A,0xEDCA97FE8C0A5EBE,0x778E0513F4F7D8CF,0x302C2501C32B8BF7, + 0x8D92DDFC175C554D,0xF865C57F46052F5F,0xEAF3301BA2B2F424,0xAA68B7ECBBD60D86, + 0x998F0F350104754C,0x0000000000000000,0xF12E314D34D0CCEC,0x710522BE061823B5, + 0xAF280D9930C005C1,0x97FD5CE25D693C65,0x19A41CC633CC9A15,0x95844172F8C79EB8, + 0xDC5432B7937684A9,0x9436C13A2490CF58,0x802B13F332C8EF59,0xC442AE397CED4F5C, + 0xFA1CD8EFE3AB8D82,0xF2E5AC954D293FD1,0x6AD823E8907A1B7D,0x4D2249F83CF043B6, + 0x03CB9DD879F9F33D,0xDE2D2F2736D82674,0x2A43A41F891EE2DF,0x6F98999D1B6C133A, + 0xD4AD46CD3DF436FA,0xBB35DF50269825C0,0x964FDCAA813E6D85,0xEB41B0537EE5A5C4, + 0x0540BA758B160847,0xA41AE43BE7BB44AF,0xE3B8C429D0671797,0x819993BBEE9FBEB9, + 0xAE9A8DD1EC975421,0xF3572CDD917E6E31,0x6393D7DAE2AFF8CE,0x47A2201237DC5338, + 0xA32343DEC903EE35,0x79FC56C4A89A91E6,0x01B28048DC5751E0,0x1296F564E4B7DB7B, + 0x75F7188351597A12,0xDB6D9552BDCE2E33,0x1E9DBB231D74308F,0x520D7293FDD322D9, + 0xE20A44610C304677,0xFEEEE2D2B4EAD425,0xCA30FDEE20800675,0x61EACA4A47015A13, + 0xE74AFE1487264E30,0x2CC883B27BF119A5,0x1664CF59B3F682DC,0xA811AA7C1E78AF5B, + 0x1D5626FB648DC3B2,0xB73E9117DF5BCE34,0xD05F7CF06AB56F5D,0xFD257F0ACD132718, + 0x574DC8E676C52A9E,0x0739A7E52EB8AA9A,0x5486553E0F3CD9A3,0x56FF48AEAA927B7E, + 0xBE756525AD8E2D87,0x7D0E6CF9FFDBC841,0x3B1ECCA31450CA99,0x6913BE30E983E840, + 0xAD511009956EA71C,0xB1B5B6BA2DB4354E,0x4469BDCA4E25A005,0x15AF5281CA0F71E1, + 0x744598CB8D0E2BF2,0x593F9B312AA863B7,0xEFB38A6E29A4FC63,0x6B6AA3A04C2D4A9D, + 0x3D95EB0EE6BF31E3,0xA291C3961554BFD5,0x18169C8EEF9BCBF5,0x115D68BC9D4E2846, + 0xBA875F18FACF7420,0xD1EDFCB8B6E23EBD,0xB00736F2F1E364AE,0x84D929CE6589B6FE, + 0x70B7A2F6DA4F7255,0x0E7253D75C6D4929,0x04F23A3D574159A7,0x0A8069EA0B2C108E, + 0x49D073C56BB11A11,0x8AAB7A1939E4FFD7,0xCD095A0B0E38ACEF,0xC9FB60365979F548, + 0x92BDE697D67F3422,0xC78933E10514BC61,0xE1C1D9B975C9B54A,0xD2266160CF1BCD80, + 0x9A4492ED78FD8671,0xB3CCAB2A881A9793,0x72CEBF667FE1D088,0xD6D45B5D985A9427 +},{ + 0xC811A8058C3F55DE,0x65F5B43196B50619,0xF74F96B1D6706E43,0x859D1E8BCB43D336, + 0x5AAB8A85CCFA3D84,0xF9C7BF99C295FCFD,0xA21FD5A1DE4B630F,0xCDB3EF763B8B456D, + 0x803F59F87CF7C385,0xB27C73BE5F31913C,0x98E3AC6633B04821,0xBF61674C26B8F818, + 0x0FFBC995C4C130C8,0xAAA0862010761A98,0x6057F342210116AA,0xF63C760C0654CC35, + 0x2DDB45CC667D9042,0xBCF45A964BD40382,0x68E8A0C3EF3C6F3D,0xA7BD92D269FF73BC, + 0x290AE20201ED2287,0xB7DE34CDE885818F,0xD901EEA7DD61059B,0xD6FA273219A03553, + 0xD56F1AE874CCCEC9,0xEA31245C2E83F554,0x7034555DA07BE499,0xCE26D2AC56E7BEF7, + 0xFD161857A5054E38,0x6A0E7DA4527436D1,0x5BD86A381CDE9FF2,0xCAF7756231770C32, + 0xB09AAED9E279C8D0,0x5DEF1091C60674DB,0x111046A2515E5045,0x23536CE4729802FC, + 0xC50CBCF7F5B63CFA,0x73A16887CD171F03,0x7D2941AFD9F28DBD,0x3F5E3EB45A4F3B9D, + 0x84EEFE361B677140,0x3DB8E3D3E7076271,0x1A3A28F9F20FD248,0x7EBC7C75B49E7627, + 0x74E5F293C7EB565C,0x18DCF59E4F478BA4,0x0C6EF44FA9ADCB52,0xC699812D98DAC760, + 0x788B06DC6E469D0E,0xFC65F8EA7521EC4E,0x30A5F7219E8E0B55,0x2BEC3F65BCA57B6B, + 
0xDDD04969BAF1B75E,0x99904CDBE394EA57,0x14B201D1E6EA40F6,0xBBB0C08241284ADD, + 0x50F20463BF8F1DFF,0xE8D7F93B93CBACB8,0x4D8CB68E477C86E8,0xC1DD1B3992268E3F, + 0x7C5AA11209D62FCB,0x2F3D98ABDB35C9AE,0x671369562BFD5FF5,0x15C1E16C36CEE280, + 0x1D7EB2EDF8F39B17,0xDA94D37DB00DFE01,0x877BC3EC760B8ADA,0xCB8495DFE153AE44, + 0x05A24773B7B410B3,0x12857B783C32ABDF,0x8EB770D06812513B,0x536739B9D2E3E665, + 0x584D57E271B26468,0xD789C78FC9849725,0xA935BBFA7D1AE102,0x8B1537A3DFA64188, + 0xD0CD5D9BC378DE7A,0x4AC82C9A4D80CFB7,0x42777F1B83BDB620,0x72D2883A1D33BD75, + 0x5E7A2D4BAB6A8F41,0xF4DAAB6BBB1C95D9,0x905CFFE7FD8D31B6,0x83AA6422119B381F, + 0xC0AEFB8442022C49,0xA0F908C663033AE3,0xA428AF0804938826,0xADE41C341A8A53C7, + 0xAE7121EE77E6A85D,0xC47F5C4A25929E8C,0xB538E9AA55CDD863,0x06377AA9DAD8EB29, + 0xA18AE87BB3279895,0x6EDFDA6A35E48414,0x6B7D9D19825094A7,0xD41CFA55A4E86CBF, + 0xE5CAEDC9EA42C59C,0xA36C351C0E6FC179,0x5181E4DE6FABBF89,0xFFF0C530184D17D4, + 0x9D41EB1584045892,0x1C0D525028D73961,0xF178EC180CA8856A,0x9A0571018EF811CD, + 0x4091A27C3EF5EFCC,0x19AF15239F6329D2,0x347450EFF91EB990,0xE11B4A078DD27759, + 0xB9561DE5FC601331,0x912F1F5A2DA993C0,0x1654DCB65BA2191A,0x3E2DDE098A6B99EB, + 0x8A66D71E0F82E3FE,0x8C51ADB7D55A08D7,0x4533E50F8941FF7F,0x02E6DD67BD4859EC, + 0xE068AABA5DF6D52F,0xC24826E3FF4A75A5,0x6C39070D88ACDDF8,0x6486548C4691A46F, + 0xD1BEBD26135C7C0C,0xB30F93038F15334A,0x82D9849FC1BF9A69,0x9C320BA85420FAE4, + 0xFA528243AFF90767,0x9ED4D6CFE968A308,0xB825FD582C44B147,0x9B7691BC5EDCB3BB, + 0xC7EA619048FE6516,0x1063A61F817AF233,0x47D538683409A693,0x63C2CE984C6DED30, + 0x2A9FDFD86C81D91D,0x7B1E3B06032A6694,0x666089EBFBD9FD83,0x0A598EE67375207B, + 0x07449A140AFC495F,0x2CA8A571B6593234,0x1F986F8A45BBC2FB,0x381AA4A050B372C2, + 0x5423A3ADD81FAF3A,0x17273C0B8B86BB6C,0xFE83258DC869B5A2,0x287902BFD1C980F1, + 0xF5A94BD66B3837AF,0x88800A79B2CABA12,0x55504310083B0D4C,0xDF36940E07B9EEB2, + 0x04D1A7CE6790B2C5,0x612413FFF125B4DC,0x26F12B97C52C124F,0x86082351A62F28AC, + 0xEF93632F9937E5E7,0x3507B052293A1BE6,0xE72C30AE570A9C70,0xD3586041AE1425E0, + 0xDE4574B3D79D4CC4,0x92BA228040C5685A,0xF00B0CA5DC8C271C,0xBE1287F1F69C5A6E, + 0xF39E317FB1E0DC86,0x495D114020EC342D,0x699B407E3F18CD4B,0xDCA3A9D46AD51528, + 0x0D1D14F279896924,0x0000000000000000,0x593EB75FA196C61E,0x2E4E78160B116BD8, + 0x6D4AE7B058887F8E,0xE65FD013872E3E06,0x7A6DDBBBD30EC4E2,0xAC97FC89CAAEF1B1, + 0x09CCB33C1E19DBE1,0x89F3EAC462EE1864,0x7770CF49AA87ADC6,0x56C57ECA6557F6D6, + 0x03953DDA6D6CFB9A,0x36928D884456E07C,0x1EEB8F37959F608D,0x31D6179C4EAAA923, + 0x6FAC3AD7E5C02662,0x43049FA653991456,0xABD3669DC052B8EE,0xAF02C153A7C20A2B, + 0x3CCB036E3723C007,0x93C9C23D90E1CA2C,0xC33BC65E2F6ED7D3,0x4CFF56339758249E, + 0xB1E94E64325D6AA6,0x37E16D359472420A,0x79F8E661BE623F78,0x5214D90402C74413, + 0x482EF1FDF0C8965B,0x13F69BC5EC1609A9,0x0E88292814E592BE,0x4E198B542A107D72, + 0xCCC00FCBEBAFE71B,0x1B49C844222B703E,0x2564164DA840E9D5,0x20C6513E1FF4F966, + 0xBAC3203F910CE8AB,0xF2EDD1C261C47EF0,0x814CB945ACD361F3,0x95FEB8944A392105, + 0x5C9CF02C1622D6AD,0x971865F3F77178E9,0xBD87BA2B9BF0A1F4,0x444005B259655D09, + 0xED75BE48247FBC0B,0x7596122E17CFF42A,0xB44B091785E97A15,0x966B854E2755DA9F, + 0xEEE0839249134791,0x32432A4623C652B9,0xA8465B47AD3E4374,0xF8B45F2412B15E8B, + 0x2417F6F078644BA3,0xFB2162FE7FDDA511,0x4BBBCC279DA46DC1,0x0173E0BDD024A276, + 0x22208C59A2BCA08A,0x8FC4906DB836F34D,0xE4B90D743A6667EA,0x7147B5E0705F46EF, + 0x2782CB2A1508B039,0xEC065EF5F45B1E7D,0x21B5B183CFD05B10,0xDBE733C060295C77, + 0x9FA73672394C017E,0xCF55321186C31C81,0xD8720E1A0D45A7ED,0x3B8F997A3DDF8958, + 
0x3AFC79C7EDFB2B2E,0xE9A4198643EF0ECE,0x5F09CDF67B4E2D37,0x4F6A6BE9FA34DF04, + 0xB6ADD47038A123F9,0x8D224D0A057EAAA1,0xC96248B85C1BF7A8,0xE3FD9760309A2EB5, + 0x0B2A6E5BA351820D,0xEB42C4E1FEA75722,0x948D58299A1D8373,0x7FCF9CC864BAD451, + 0xA55B4FB5D4B72A50,0x08BF5381CE3D7997,0x46A6D8D5E42D04E5,0xD22B80FC7E308796, + 0x57B69E77B57354A0,0x3969441D8097D0B4,0x3330CAFBF3E2F0CF,0xE28E77DDE0BE8CC3, + 0x62B12E259C494F46,0xA6CE726FB9DBD1CA,0x41E242C1EED14DBA,0x76032FF47AA30FB0 +},{ + 0x45B268A93ACDE4CC,0xAF7F0BE884549D08,0x048354B3C1468263,0x925435C2C80EFED2, + 0xEE4E37F27FDFFBA7,0x167A33920C60F14D,0xFB123B52EA03E584,0x4A0CAB53FDBB9007, + 0x9DEAF6380F788A19,0xCB48EC558F0CB32A,0xB59DC4B2D6FEF7E0,0xDCDBCA22F4F3ECB6, + 0x11DF5813549A9C40,0xE33FDEDF568ACED3,0xA0C1C8124322E9C3,0x07A56B8158FA6D0D, + 0x77279579B1E1F3DD,0xD9B18B74422AC004,0xB8EC2D9FFFABC294,0xF4ACF8A82D75914F, + 0x7BBF69B1EF2B6878,0xC4F62FAF487AC7E1,0x76CE809CC67E5D0C,0x6711D88F92E4C14C, + 0x627B99D9243DEDFE,0x234AA5C3DFB68B51,0x909B1F15262DBF6D,0x4F66EA054B62BCB5, + 0x1AE2CF5A52AA6AE8,0xBEA053FBD0CE0148,0xED6808C0E66314C9,0x43FE16CD15A82710, + 0xCD049231A06970F6,0xE7BC8A6C97CC4CB0,0x337CE835FCB3B9C0,0x65DEF2587CC780F3, + 0x52214EDE4132BB50,0x95F15E4390F493DF,0x870839625DD2E0F1,0x41313C1AFB8B66AF, + 0x91720AF051B211BC,0x477D427ED4EEA573,0x2E3B4CEEF6E3BE25,0x82627834EB0BCC43, + 0x9C03E3DD78E724C8,0x2877328AD9867DF9,0x14B51945E243B0F2,0x574B0F88F7EB97E2, + 0x88B6FA989AA4943A,0x19C4F068CB168586,0x50EE6409AF11FAEF,0x7DF317D5C04EABA4, + 0x7A567C5498B4C6A9,0xB6BBFB804F42188E,0x3CC22BCF3BC5CD0B,0xD04336EAAA397713, + 0xF02FAC1BEC33132C,0x2506DBA7F0D3488D,0xD7E65D6BF2C31A1E,0x5EB9B2161FF820F5, + 0x842E0650C46E0F9F,0x716BEB1D9E843001,0xA933758CAB315ED4,0x3FE414FDA2792265, + 0x27C9F1701EF00932,0x73A4C1CA70A771BE,0x94184BA6E76B3D0E,0x40D829FF8C14C87E, + 0x0FBEC3FAC77674CB,0x3616A9634A6A9572,0x8F139119C25EF937,0xF545ED4D5AEA3F9E, + 0xE802499650BA387B,0x6437E7BD0B582E22,0xE6559F89E053E261,0x80AD52E305288DFC, + 0x6DC55A23E34B9935,0xDE14E0F51AD0AD09,0xC6390578A659865E,0x96D7617109487CB1, + 0xE2D6CB3A21156002,0x01E915E5779FAED1,0xADB0213F6A77DCB7,0x9880B76EB9A1A6AB, + 0x5D9F8D248644CF9B,0xFD5E4536C5662658,0xF1C6B9FE9BACBDFD,0xEACD6341BE9979C4, + 0xEFA7221708405576,0x510771ECD88E543E,0xC2BA51CB671F043D,0x0AD482AC71AF5879, + 0xFE787A045CDAC936,0xB238AF338E049AED,0xBD866CC94972EE26,0x615DA6EBBD810290, + 0x3295FDD08B2C1711,0xF834046073BF0AEA,0xF3099329758FFC42,0x1CAEB13E7DCFA934, + 0xBA2307481188832B,0x24EFCE42874CE65C,0x0E57D61FB0E9DA1A,0xB3D1BAD6F99B343C, + 0xC0757B1C893C4582,0x2B510DB8403A9297,0x5C7698C1F1DB614A,0x3E0D0118D5E68CB4, + 0xD60F488E855CB4CF,0xAE961E0DF3CB33D9,0x3A8E55AB14A00ED7,0x42170328623789C1, + 0x838B6DD19C946292,0x895FEF7DED3B3AEB,0xCFCBB8E64E4A3149,0x064C7E642F65C3DC, + 0x3D2B3E2A4C5A63DA,0x5BD3F340A9210C47,0xB474D157A1615931,0xAC5934DA1DE87266, + 0x6EE365117AF7765B,0xC86ED36716B05C44,0x9BA6885C201D49C5,0xB905387A88346C45, + 0x131072C4BAB9DDFF,0xBF49461EA751AF99,0xD52977BC1CE05BA1,0xB0F785E46027DB52, + 0x546D30BA6E57788C,0x305AD707650F56AE,0xC987C682612FF295,0xA5AB8944F5FBC571, + 0x7ED528E759F244CA,0x8DDCBBCE2C7DB888,0xAA154ABE328DB1BA,0x1E619BE993ECE88B, + 0x09F2BD9EE813B717,0x7401AA4B285D1CB3,0x21858F143195CAEE,0x48C381841398D1B8, + 0xFCB750D3B2F98889,0x39A86A998D1CE1B9,0x1F888E0CE473465A,0x7899568376978716, + 0x02CF2AD7EE2341BF,0x85C713B5B3F1A14E,0xFF916FE12B4567E7,0x7C1A0230B7D10575, + 0x0C98FCC85ECA9BA5,0xA3E7F720DA9E06AD,0x6A6031A2BBB1F438,0x973E74947ED7D260, + 
0x2CF4663918C0FF9A,0x5F50A7F368678E24,0x34D983B4A449D4CD,0x68AF1B755592B587, + 0x7F3C3D022E6DEA1B,0xABFC5F5B45121F6B,0x0D71E92D29553574,0xDFFDF5106D4F03D8, + 0x081BA87B9F8C19C6,0xDB7EA1A3AC0981BB,0xBBCA12AD66172DFA,0x79704366010829C7, + 0x179326777BFF5F9C,0x0000000000000000,0xEB2476A4C906D715,0x724DD42F0738DF6F, + 0xB752EE6538DDB65F,0x37FFBC863DF53BA3,0x8EFA84FCB5C157E6,0xE9EB5C73272596AA, + 0x1B0BDABF2535C439,0x86E12C872A4D4E20,0x9969A28BCE3E087A,0xFAFB2EB79D9C4B55, + 0x056A4156B6D92CB2,0x5A3AE6A5DEBEA296,0x22A3B026A8292580,0x53C85B3B36AD1581, + 0xB11E900117B87583,0xC51F3A4A3FE56930,0xE019E1EDCF3621BD,0xEC811D2591FCBA18, + 0x445B7D4C4D524A1D,0xA8DA6069DCAEF005,0x58F5CC72309DE329,0xD4C062596B7FF570, + 0xCE22AD0339D59F98,0x591CD99747024DF8,0x8B90C5AA03187B54,0xF663D27FC356D0F0, + 0xD8589E9135B56ED5,0x35309651D3D67A1C,0x12F96721CD26732E,0xD28C1C3D441A36AC, + 0x492A946164077F69,0x2D1D73DC6F5F514B,0x6F0A70F40D68D88A,0x60B4B30ECA1EAC41, + 0xD36509D83385987D,0x0B3D97490630F6A8,0x9ECCC90A96C46577,0xA20EE2C5AD01A87C, + 0xE49AB55E0E70A3DE,0xA4429CA182646BA0,0xDA97B446DB962F6A,0xCCED87D4D7F6DE27, + 0x2AB8185D37A53C46,0x9F25DCEFE15BCBA6,0xC19C6EF9FEA3EB53,0xA764A3931BD884CE, + 0x2FD2590B817C10F4,0x56A21A6D80743933,0xE573A0BB79EF0D0F,0x155C0CA095DC1E23, + 0x6C2C4FC694D437E4,0x10364DF623053291,0xDD32DFC7836C4267,0x03263F3299BCEF6E, + 0x66F8CD6AE57B6F9D,0x8C35AE2B5BE21659,0x31B3C2E21290F87F,0x93BD2027BF915003, + 0x69460E90220D1B56,0x299E276FAE19D328,0x63928C3C53A2432F,0x7082FEF8E91B9ED0, + 0xBC6F792C3EED40F7,0x4C40D537D2DE53DB,0x75E8BFAE5FC2B262,0x4DA9C0D2A541FD0A, + 0x4E8FFFE03CFD1264,0x2620E495696FA7E3,0xE1F0F408B8A98F6C,0xD1AA230FDDA6D9C2, + 0xC7D0109DD1C6288F,0x8A79D04F7487D585,0x4694579BA3710BA2,0x38417F7CFA834F68, + 0x1D47A4DB0A5007E5,0x206C9AF1460A643F,0xA128DDF734BD4712,0x8144470672B7232D, + 0xF2E086CC02105293,0x182DE58DBC892B57,0xCAA1F9B0F8931DFB,0x6B892447CC2E5AE9, + 0xF9DD11850420A43B,0x4BE5BEB68A243ED6,0x5584255F19C8D65D,0x3B67404E633FA006, + 0xA68DB6766C472A1F,0xF78AC79AB4C97E21,0xC353442E1080AAEC,0x9A4F9DB95782E714 +},{ + 0x05BA7BC82C9B3220,0x31A54665F8B65E4F,0xB1B651F77547F4D4,0x8BFA0D857BA46682, + 0x85A96C5AA16A98BB,0x990FAEF908EB79C9,0xA15E37A247F4A62D,0x76857DCD5D27741E, + 0xF8C50B800A1820BC,0xBE65DCB201F7A2B4,0x666D1B986F9426E7,0x4CC921BF53C4E648, + 0x95410A0F93D9CA42,0x20CDCCAA647BA4EF,0x429A4060890A1871,0x0C4EA4F69B32B38B, + 0xCCDA362DDE354CD3,0x96DC23BC7C5B2FA9,0xC309BB68AA851AB3,0xD26131A73648E013, + 0x021DC52941FC4DB2,0xCD5ADAB7704BE48A,0xA77965D984ED71E6,0x32386FD61734BBA4, + 0xE82D6DD538AB7245,0x5C2147EA6177B4B1,0x5DA1AB70CF091CE8,0xAC907FCE72B8BDFF, + 0x57C85DFD972278A8,0xA4E44C6A6B6F940D,0x3851995B4F1FDFE4,0x62578CCAED71BC9E, + 0xD9882BB0C01D2C0A,0x917B9D5D113C503B,0xA2C31E11A87643C6,0xE463C923A399C1CE, + 0xF71686C57EA876DC,0x87B4A973E096D509,0xAF0D567D9D3A5814,0xB40C2A3F59DCC6F4, + 0x3602F88495D121DD,0xD3E1DD3D9836484A,0xF945E71AA46688E5,0x7518547EB2A591F5, + 0x9366587450C01D89,0x9EA81018658C065B,0x4F54080CBC4603A3,0x2D0384C65137BF3D, + 0xDC325078EC861E2A,0xEA30A8FC79573FF7,0x214D2030CA050CB6,0x65F0322B8016C30C, + 0x69BE96DD1B247087,0xDB95EE9981E161B8,0xD1FC1814D9CA05F8,0x820ED2BBCC0DE729, + 0x63D76050430F14C7,0x3BCCB0E8A09D3A0F,0x8E40764D573F54A2,0x39D175C1E16177BD, + 0x12F5A37C734F1F4B,0xAB37C12F1FDFC26D,0x5648B167395CD0F1,0x6C04ED1537BF42A7, + 0xED97161D14304065,0x7D6C67DAAB72B807,0xEC17FA87BA4EE83C,0xDFAF79CB0304FBC1, + 0x733F060571BC463E,0x78D61C1287E98A27,0xD07CF48E77B4ADA1,0xB9C262536C90DD26, + 
0xE2449B5860801605,0x8FC09AD7F941FCFB,0xFAD8CEA94BE46D0E,0xA343F28B0608EB9F, + 0x9B126BD04917347B,0x9A92874AE7699C22,0x1B017C42C4E69EE0,0x3A4C5C720EE39256, + 0x4B6E9F5E3EA399DA,0x6BA353F45AD83D35,0xE7FEE0904C1B2425,0x22D009832587E95D, + 0x842980C00F1430E2,0xC6B3C0A0861E2893,0x087433A419D729F2,0x341F3DADD42D6C6F, + 0xEE0A3FAEFBB2A58E,0x4AEE73C490DD3183,0xAAB72DB5B1A16A34,0xA92A04065E238FDF, + 0x7B4B35A1686B6FCC,0x6A23BF6EF4A6956C,0x191CB96B851AD352,0x55D598D4D6DE351A, + 0xC9604DE5F2AE7EF3,0x1CA6C2A3A981E172,0xDE2F9551AD7A5398,0x3025AAFF56C8F616, + 0x15521D9D1E2860D9,0x506FE31CFA45073A,0x189C55F12B647B0B,0x0180EC9AAE7EA859, + 0x7CEC8B40050C105E,0x2350E5198BF94104,0xEF8AD33455CC0DD7,0x07A7BEE16D677F92, + 0xE5E325B90DE76997,0x5A061591A26E637A,0xB611EF1618208B46,0x09F4DF3EB7A981AB, + 0x1EBB078AE87DACC0,0xB791038CB65E231F,0x0FD38D4574B05660,0x67EDF702C1EA8EBE, + 0xBA5F4BE0831238CD,0xE3C477C2CEFEBE5C,0x0DCE486C354C1BD2,0x8C5DB36416C31910, + 0x26EA9ED1A7627324,0x039D29B3EF82E5EB,0x9F28FC82CBF2AE02,0xA8AAE89CF05D2786, + 0x431AACFA2774B028,0xCF471F9E31B7A938,0x581BD0B8E3922EC8,0xBC78199B400BEF06, + 0x90FB71C7BF42F862,0x1F3BEB1046030499,0x683E7A47B55AD8DE,0x988F4263A695D190, + 0xD808C72A6E638453,0x0627527BC319D7CB,0xEBB04466D72997AE,0xE67E0C0AE2658C7C, + 0x14D2F107B056C880,0x7122C32C30400B8C,0x8A7AE11FD5DACEDB,0xA0DEDB38E98A0E74, + 0xAD109354DCC615A6,0x0BE91A17F655CC19,0x8DDD5FFEB8BDB149,0xBFE53028AF890AED, + 0xD65BA6F5B4AD7A6A,0x7956F0882997227E,0x10E8665532B352F9,0x0E5361DFDACEFE39, + 0xCEC7F3049FC90161,0xFF62B561677F5F2E,0x975CCF26D22587F0,0x51EF0F86543BAF63, + 0x2F1E41EF10CBF28F,0x52722635BBB94A88,0xAE8DBAE73344F04D,0x410769D36688FD9A, + 0xB3AB94DE34BBB966,0x801317928DF1AA9B,0xA564A0F0C5113C54,0xF131D4BEBDB1A117, + 0x7F71A2F3EA8EF5B5,0x40878549C8F655C3,0x7EF14E6944F05DEC,0xD44663DCF55137D8, + 0xF2ACFD0D523344FC,0x0000000000000000,0x5FBC6E598EF5515A,0x16CF342EF1AA8532, + 0xB036BD6DDB395C8D,0x13754FE6DD31B712,0xBBDFA77A2D6C9094,0x89E7C8AC3A582B30, + 0x3C6B0E09CDFA459D,0xC4AE0589C7E26521,0x49735A777F5FD468,0xCAFD64561D2C9B18, + 0xDA1502032F9FC9E1,0x8867243694268369,0x3782141E3BAF8984,0x9CB5D53124704BE9, + 0xD7DB4A6F1AD3D233,0xA6F989432A93D9BF,0x9D3539AB8A0EE3B0,0x53F2CAAF15C7E2D1, + 0x6E19283C76430F15,0x3DEBE2936384EDC4,0x5E3C82C3208BF903,0x33B8834CB94A13FD, + 0x6470DEB12E686B55,0x359FD1377A53C436,0x61CAA57902F35975,0x043A975282E59A79, + 0xFD7F70482683129C,0xC52EE913699CCD78,0x28B9FF0E7DAC8D1D,0x5455744E78A09D43, + 0xCB7D88CCB3523341,0x44BD121B4A13CFBA,0x4D49CD25FDBA4E11,0x3E76CB208C06082F, + 0x3FF627BA2278A076,0xC28957F204FBB2EA,0x453DFE81E46D67E3,0x94C1E6953DA7621B, + 0x2C83685CFF491764,0xF32C1197FC4DECA5,0x2B24D6BD922E68F6,0xB22B78449AC5113F, + 0x48F3B6EDD1217C31,0x2E9EAD75BEB55AD6,0x174FD8B45FD42D6B,0x4ED4E4961238ABFA, + 0x92E6B4EEFEBEB5D0,0x46A0D7320BEF8208,0x47203BA8A5912A51,0x24F75BF8E69E3E96, + 0xF0B1382413CF094E,0xFEE259FBC901F777,0x276A724B091CDB7D,0xBDF8F501EE75475F, + 0x599B3C224DEC8691,0x6D84018F99C1EAFE,0x7498B8E41CDB39AC,0xE0595E71217C5BB7, + 0x2AA43A273C50C0AF,0xF50B43EC3F543B6E,0x838E3E2162734F70,0xC09492DB4507FF58, + 0x72BFEA9FDFC2EE67,0x11688ACF9CCDFAA0,0x1A8190D86A9836B9,0x7ACBD93BC615C795, + 0xC7332C3A286080CA,0x863445E94EE87D50,0xF6966A5FD0D6DE85,0xE9AD814F96D5DA1C, + 0x70A22FB69E3EA3D5,0x0A69F68D582B6440,0xB8428EC9C2EE757F,0x604A49E3AC8DF12C, + 0x5B86F90B0C10CB23,0xE1D9B2EB8F02F3EE,0x29391394D3D22544,0xC8E0A17F5CD0D6AA, + 0xB58CC6A5F7A26EAD,0x8193FB08238F02C2,0xD5C68F465B2F9F81,0xFCFF9CD288FDBAC5, + 0x77059157F359DC47,0x1D262E3907FF492B,0xFB582233E59AC557,0xDDB2BCE242F8B673, + 
0x2577B76248E096CF,0x6F99C4A6D83DA74C,0xC1147E41EB795701,0xF48BAF76912A9337 +},{ + 0x3EF29D249B2C0A19,0xE9E16322B6F8622F,0x5536994047757F7A,0x9F4D56D5A47B0B33, + 0x822567466AA1174C,0xB8F5057DEB082FB2,0xCC48C10BF4475F53,0x373088D4275DEC3A, + 0x968F4325180AED10,0x173D232CF7016151,0xAE4ED09F946FCC13,0xFD4B4741C4539873, + 0x1B5B3F0DD9933765,0x2FFCB0967B644052,0xE02376D20A89840C,0xA3AE3A70329B18D7, + 0x419CBD2335DE8526,0xFAFEBF115B7C3199,0x0397074F85AA9B0D,0xC58AD4FB4836B970, + 0xBEC60BE3FC4104A8,0x1EFF36DC4B708772,0x131FDC33ED8453B6,0x0844E33E341764D3, + 0x0FF11B6EAB38CD39,0x64351F0A7761B85A,0x3B5694F509CFBA0E,0x30857084B87245D0, + 0x47AFB3BD2297AE3C,0xF2BA5C2F6F6B554A,0x74BDC4761F4F70E1,0xCFDFC64471EDC45E, + 0xE610784C1DC0AF16,0x7ACA29D63C113F28,0x2DED411776A859AF,0xAC5F211E99A3D5EE, + 0xD484F949A87EF33B,0x3CE36CA596E013E4,0xD120F0983A9D432C,0x6BC40464DC597563, + 0x69D5F5E5D1956C9E,0x9AE95F043698BB24,0xC9ECC8DA66A4EF44,0xD69508C8A5B2EAC6, + 0xC40C2235C0503B80,0x38C193BA8C652103,0x1CEEC75D46BC9E8F,0xD331011937515AD1, + 0xD8E2E56886ECA50F,0xB137108D5779C991,0x709F3B6905CA4206,0x4FEB50831680CAEF, + 0xEC456AF3241BD238,0x58D673AFE181ABBE,0x242F54E7CAD9BF8C,0x0211F1810DCC19FD, + 0x90BC4DBB0F43C60A,0x9518446A9DA0761D,0xA1BFCBF13F57012A,0x2BDE4F8961E172B5, + 0x27B853A84F732481,0xB0B1E643DF1F4B61,0x18CC38425C39AC68,0xD2B7F7D7BF37D821, + 0x3103864A3014C720,0x14AA246372ABFA5C,0x6E600DB54EBAC574,0x394765740403A3F3, + 0x09C215F0BC71E623,0x2A58B947E987F045,0x7B4CDF18B477BDD8,0x9709B5EB906C6FE0, + 0x73083C268060D90B,0xFEDC400E41F9037E,0x284948C6E44BE9B8,0x728ECAE808065BFB, + 0x06330E9E17492B1A,0x5950856169E7294E,0xBAE4F4FCE6C4364F,0xCA7BCF95E30E7449, + 0x7D7FD186A33E96C2,0x52836110D85AD690,0x4DFAA1021B4CD312,0x913ABB75872544FA, + 0xDD46ECB9140F1518,0x3D659A6B1E869114,0xC23F2CABD719109A,0xD713FE062DD46836, + 0xD0A60656B2FBC1DC,0x221C5A79DD909496,0xEFD26DBCA1B14935,0x0E77EDA0235E4FC9, + 0xCBFD395B6B68F6B9,0x0DE0EAEFA6F4D4C4,0x0422FF1F1A8532E7,0xF969B85EDED6AA94, + 0x7F6E2007AEF28F3F,0x3AD0623B81A938FE,0x6624EE8B7AADA1A7,0xB682E8DDC856607B, + 0xA78CC56F281E2A30,0xC79B257A45FAA08D,0x5B4174E0642B30B3,0x5F638BFF7EAE0254, + 0x4BC9AF9C0C05F808,0xCE59308AF98B46AE,0x8FC58DA9CC55C388,0x803496C7676D0EB1, + 0xF33CAAE1E70DD7BA,0xBB6202326EA2B4BF,0xD5020F87201871CB,0x9D5CA754A9B712CE, + 0x841669D87DE83C56,0x8A6184785EB6739F,0x420BBA6CB0741E2B,0xF12D5B60EAC1CE47, + 0x76AC35F71283691C,0x2C6BB7D9FECEDB5F,0xFCCDB18F4C351A83,0x1F79C012C3160582, + 0xF0ABADAE62A74CB7,0xE1A5801C82EF06FC,0x67A21845F2CB2357,0x5114665F5DF04D9D, + 0xBF40FD2D74278658,0xA0393D3FB73183DA,0x05A409D192E3B017,0xA9FB28CF0B4065F9, + 0x25A9A22942BF3D7C,0xDB75E22703463E02,0xB326E10C5AB5D06C,0xE7968E8295A62DE6, + 0xB973F3B3636EAD42,0xDF571D3819C30CE5,0xEE549B7229D7CBC5,0x12992AFD65E2D146, + 0xF8EF4E9056B02864,0xB7041E134030E28B,0xC02EDD2ADAD50967,0x932B4AF48AE95D07, + 0x6FE6FB7BC6DC4784,0x239AACB755F61666,0x401A4BEDBDB807D6,0x485EA8D389AF6305, + 0xA41BC220ADB4B13D,0x753B32B89729F211,0x997E584BB3322029,0x1D683193CEDA1C7F, + 0xFF5AB6C0C99F818E,0x16BBD5E27F67E3A1,0xA59D34EE25D233CD,0x98F8AE853B54A2D9, + 0x6DF70AFACB105E79,0x795D2E99B9BBA425,0x8E437B6744334178,0x0186F6CE886682F0, + 0xEBF092A3BB347BD2,0xBCD7FA62F18D1D55,0xADD9D7D011C5571E,0x0BD3E471B1BDFFDE, + 0xAA6C2F808EEAFEF4,0x5EE57D31F6C880A4,0xF50FA47FF044FCA0,0x1ADDC9C351F5B595, + 0xEA76646D3352F922,0x0000000000000000,0x85909F16F58EBEA6,0x46294573AAF12CCC, + 0x0A5512BF39DB7D2E,0x78DBD85731DD26D5,0x29CFBE086C2D6B48,0x218B5D36583A0F9B, + 
0x152CD2ADFACD78AC,0x83A39188E2C795BC,0xC3B9DA655F7F926A,0x9ECBA01B2C1D89C3, + 0x07B5F8509F2FA9EA,0x7EE8D6C926940DCF,0x36B67E1AAF3B6ECA,0x86079859702425AB, + 0xFB7849DFD31AB369,0x4C7C57CC932A51E2,0xD96413A60E8A27FF,0x263EA566C715A671, + 0x6C71FC344376DC89,0x4A4F595284637AF8,0xDAF314E98B20BCF2,0x572768C14AB96687, + 0x1088DB7C682EC8BB,0x887075F9537A6A62,0x2E7A4658F302C2A2,0x619116DBE582084D, + 0xA87DDE018326E709,0xDCC01A779C6997E8,0xEDC39C3DAC7D50C8,0xA60A33A1A078A8C0, + 0xC1A82BE452B38B97,0x3F746BEA134A88E9,0xA228CCBEBAFD9A27,0xABEAD94E068C7C04, + 0xF48952B178227E50,0x5CF48CB0FB049959,0x6017E0156DE48ABD,0x4438B4F2A73D3531, + 0x8C528AE649FF5885,0xB515EF924DFCFB76,0x0C661C212E925634,0xB493195CC59A7986, + 0x9CDA519A21D1903E,0x32948105B5BE5C2D,0x194ACE8CD45F2E98,0x438D4CA238129CDB, + 0x9B6FA9CABEFE39D4,0x81B26009EF0B8C41,0xDED1EBF691A58E15,0x4E6DA64D9EE6481F, + 0x54B06F8ECF13FD8A,0x49D85E1D01C9E1F5,0xAFC826511C094EE3,0xF698A33075EE67AD, + 0x5AC7822EEC4DB243,0x8DD47C28C199DA75,0x89F68337DB1CE892,0xCDCE37C57C21DDA3, + 0x530597DE503C5460,0x6A42F2AA543FF793,0x5D727A7E73621BA9,0xE232875307459DF1, + 0x56A19E0FC2DFE477,0xC61DD3B4CD9C227D,0xE5877F03986A341B,0x949EB2A415C6F4ED, + 0x6206119460289340,0x6380E75AE84E11B0,0x8BE772B6D6D0F16F,0x50929091D596CF6D, + 0xE86795EC3E9EE0DF,0x7CF927482B581432,0xC86A3E14EEC26DB4,0x7119CDA78DACC0F6, + 0xE40189CD100CB6EB,0x92ADBC3A028FDFF7,0xB2A017C2D2D3529C,0x200DABF8D05C8D6B, + 0x34A78F9BA2F77737,0xE3B4719D8F231F01,0x45BE423C2F5BB7C1,0xF71E55FEFD88E55D, + 0x6853032B59F3EE6E,0x65B3E9C4FF073AAA,0x772AC3399AE5EBEC,0x87816E97F842A75B, + 0x110E2DB2E0484A4B,0x331277CB3DD8DEDD,0xBD510CAC79EB9FA5,0x352179552A91F5C7 +},{ + 0x8AB0A96846E06A6D,0x43C7E80B4BF0B33A,0x08C9B3546B161EE5,0x39F1C235EBA990BE, + 0xC1BEF2376606C7B2,0x2C209233614569AA,0xEB01523B6FC3289A,0x946953AB935ACEDD, + 0x272838F63E13340E,0x8B0455ECA12BA052,0x77A1B2C4978FF8A2,0xA55122CA13E54086, + 0x2276135862D3F1CD,0xDB8DDFDE08B76CFE,0x5D1E12C89E4A178A,0x0E56816B03969867, + 0xEE5F79953303ED59,0xAFED748BAB78D71D,0x6D929F2DF93E53EE,0xF5D8A8F8BA798C2A, + 0xF619B1698E39CF6B,0x95DDAF2F749104E2,0xEC2A9C80E0886427,0xCE5C8FD8825B95EA, + 0xC4E0D9993AC60271,0x4699C3A5173076F9,0x3D1B151F50A29F42,0x9ED505EA2BC75946, + 0x34665ACFDC7F4B98,0x61B1FB53292342F7,0xC721C0080E864130,0x8693CD1696FD7B74, + 0x872731927136B14B,0xD3446C8A63A1721B,0x669A35E8A6680E4A,0xCAB658F239509A16, + 0xA4E5DE4EF42E8AB9,0x37A7435EE83F08D9,0x134E6239E26C7F96,0x82791A3C2DF67488, + 0x3F6EF00A8329163C,0x8E5A7E42FDEB6591,0x5CAAEE4C7981DDB5,0x19F234785AF1E80D, + 0x255DDDE3ED98BD70,0x50898A32A99CCCAC,0x28CA4519DA4E6656,0xAE59880F4CB31D22, + 0x0D9798FA37D6DB26,0x32F968F0B4FFCD1A,0xA00F09644F258545,0xFA3AD5175E24DE72, + 0xF46C547C5DB24615,0x713E80FBFF0F7E20,0x7843CF2B73D2AAFA,0xBD17EA36AEDF62B4, + 0xFD111BACD16F92CF,0x4ABAA7DBC72D67E0,0xB3416B5DAD49FAD3,0xBCA316B24914A88B, + 0x15D150068AECF914,0xE27C1DEBE31EFC40,0x4FE48C759BEDA223,0x7EDCFD141B522C78, + 0x4E5070F17C26681C,0xE696CAC15815F3BC,0x35D2A64B3BB481A7,0x800CFF29FE7DFDF6, + 0x1ED9FAC3D5BAA4B0,0x6C2663A91EF599D1,0x03C1199134404341,0xF7AD4DED69F20554, + 0xCD9D9649B61BD6AB,0xC8C3BDE7EADB1368,0xD131899FB02AFB65,0x1D18E352E1FAE7F1, + 0xDA39235AEF7CA6C1,0xA1BBF5E0A8EE4F7A,0x91377805CF9A0B1E,0x3138716180BF8E5B, + 0xD9F83ACBDB3CE580,0x0275E515D38B897E,0x472D3F21F0FBBCC6,0x2D946EB7868EA395, + 0xBA3C248D21942E09,0xE7223645BFDE3983,0xFF64FEB902E41BB1,0xC97741630D10D957, + 0xC3CB1722B58D4ECC,0xA27AEC719CAE0C3B,0x99FECB51A48C15FB,0x1465AC826D27332B, + 
0xE1BD047AD75EBF01,0x79F733AF941960C5,0x672EC96C41A3C475,0xC27FEBA6524684F3, + 0x64EFD0FD75E38734,0xED9E60040743AE18,0xFB8E2993B9EF144D,0x38453EB10C625A81, + 0x6978480742355C12,0x48CF42CE14A6EE9E,0x1CAC1FD606312DCE,0x7B82D6BA4792E9BB, + 0x9D141C7B1F871A07,0x5616B80DC11C4A2E,0xB849C198F21FA777,0x7CA91801C8D9A506, + 0xB1348E487EC273AD,0x41B20D1E987B3A44,0x7460AB55A3CFBBE3,0x84E628034576F20A, + 0x1B87D16D897A6173,0x0FE27DEFE45D5258,0x83CDE6B8CA3DBEB7,0x0C23647ED01D1119, + 0x7A362A3EA0592384,0xB61F40F3F1893F10,0x75D457D1440471DC,0x4558DA34237035B8, + 0xDCA6116587FC2043,0x8D9B67D3C9AB26D0,0x2B0B5C88EE0E2517,0x6FE77A382AB5DA90, + 0x269CC472D9D8FE31,0x63C41E46FAA8CB89,0xB7ABBC771642F52F,0x7D1DE4852F126F39, + 0xA8C6BA3024339BA0,0x600507D7CEE888C8,0x8FEE82C61A20AFAE,0x57A2448926D78011, + 0xFCA5E72836A458F0,0x072BCEBB8F4B4CBD,0x497BBE4AF36D24A1,0x3CAFE99BB769557D, + 0x12FA9EBD05A7B5A9,0xE8C04BAA5B836BDB,0x4273148FAC3B7905,0x908384812851C121, + 0xE557D3506C55B0FD,0x72FF996ACB4F3D61,0x3EDA0C8E64E2DC03,0xF0868356E6B949E9, + 0x04EAD72ABB0B0FFC,0x17A4B5135967706A,0xE3C8E16F04D5367F,0xF84F30028DAF570C, + 0x1846C8FCBD3A2232,0x5B8120F7F6CA9108,0xD46FA231ECEA3EA6,0x334D947453340725, + 0x58403966C28AD249,0xBED6F3A79A9F21F5,0x68CCB483A5FE962D,0xD085751B57E1315A, + 0xFED0023DE52FD18E,0x4B0E5B5F20E6ADDF,0x1A332DE96EB1AB4C,0xA3CE10F57B65C604, + 0x108F7BA8D62C3CD7,0xAB07A3A11073D8E1,0x6B0DAD1291BED56C,0xF2F366433532C097, + 0x2E557726B2CEE0D4,0x0000000000000000,0xCB02A476DE9B5029,0xE4E32FD48B9E7AC2, + 0x734B65EE2C84F75E,0x6E5386BCCD7E10AF,0x01B4FC84E7CBCA3F,0xCFE8735C65905FD5, + 0x3613BFDA0FF4C2E6,0x113B872C31E7F6E8,0x2FE18BA255052AEB,0xE974B72EBC48A1E4, + 0x0ABC5641B89D979B,0xB46AA5E62202B66E,0x44EC26B0C4BBFF87,0xA6903B5B27A503C7, + 0x7F680190FC99E647,0x97A84A3AA71A8D9C,0xDD12EDE16037EA7C,0xC554251DDD0DC84E, + 0x88C54C7D956BE313,0x4D91696048662B5D,0xB08072CC9909B992,0xB5DE5962C5C97C51, + 0x81B803AD19B637C9,0xB2F597D94A8230EC,0x0B08AAC55F565DA4,0xF1327FD2017283D6, + 0xAD98919E78F35E63,0x6AB9519676751F53,0x24E921670A53774F,0xB9FD3D1C15D46D48, + 0x92F66194FBDA485F,0x5A35DC7311015B37,0xDED3F4705477A93D,0xC00A0EB381CD0D8D, + 0xBB88D809C65FE436,0x16104997BEACBA55,0x21B70AC95693B28C,0x59F4C5E225411876, + 0xD5DB5EB50B21F499,0x55D7A19CF55C096F,0xA97246B4C3F8519F,0x8552D487A2BD3835, + 0x54635D181297C350,0x23C2EFDC85183BF2,0x9F61F96ECC0C9379,0x534893A39DDC8FED, + 0x5EDF0B59AA0A54CB,0xAC2C6D1A9F38945C,0xD7AEBBA0D8AA7DE7,0x2ABFA00C09C5EF28, + 0xD84CC64F3CF72FBF,0x2003F64DB15878B3,0xA724C7DFC06EC9F8,0x069F323F68808682, + 0xCC296ACD51D01C94,0x055E2BAE5CC0C5C3,0x6270E2C21D6301B6,0x3B842720382219C0, + 0xD2F0900E846AB824,0x52FC6F277A1745D2,0xC6953C8CE94D8B0F,0xE009F8FE3095753E, + 0x655B2C7992284D0B,0x984A37D54347DFC4,0xEAB5AEBF8808E2A5,0x9A3FD2C090CC56BA, + 0x9CA0E0FFF84CD038,0x4C2595E4AFADE162,0xDF6708F4B3BC6302,0xBF620F237D54EBCA, + 0x93429D101C118260,0x097D4FD08CDDD4DA,0x8C2F9B572E60ECEF,0x708A7C7F18C4B41F, + 0x3A30DBA4DFE9D3FF,0x4006F19A7FB0F07B,0x5F6BF7DD4DC19EF4,0x1F6D064732716E8F, + 0xF9FBCC866A649D33,0x308C8DE567744464,0x8971B0F972A0292C,0xD61A47243F61B7D8, + 0xEFEB8511D4C82766,0x961CB6BE40D147A3,0xAAB35F25F7B812DE,0x76154E407044329D, + 0x513D76B64E570693,0xF3479AC7D2F90AA8,0x9B8B2E4477079C85,0x297EB99D3D85AC69 +},{ + 0x7E37E62DFC7D40C3,0x776F25A4EE939E5B,0xE045C850DD8FB5AD,0x86ED5BA711FF1952, + 0xE91D0BD9CF616B35,0x37E0AB256E408FFB,0x9607F6C031025A7A,0x0B02F5E116D23C9D, + 0xF3D8486BFB50650C,0x621CFF27C40875F5,0x7D40CB71FA5FD34A,0x6DAA6616DAA29062, + 
0x9F5F354923EC84E2,0xEC847C3DC507C3B3,0x025A3668043CE205,0xA8BF9E6C4DAC0B19, + 0xFA808BE2E9BEBB94,0xB5B99C5277C74FA3,0x78D9BC95F0397BCC,0xE332E50CDBAD2624, + 0xC74FCE129332797E,0x1729ECEB2EA709AB,0xC2D6B9F69954D1F8,0x5D898CBFBAB8551A, + 0x859A76FB17DD8ADB,0x1BE85886362F7FB5,0xF6413F8FF136CD8A,0xD3110FA5BBB7E35C, + 0x0A2FEED514CC4D11,0xE83010EDCD7F1AB9,0xA1E75DE55F42D581,0xEEDE4A55C13B21B6, + 0xF2F5535FF94E1480,0x0CC1B46D1888761E,0xBCE15FDB6529913B,0x2D25E8975A7181C2, + 0x71817F1CE2D7A554,0x2E52C5CB5C53124B,0xF9F7A6BEEF9C281D,0x9E722E7D21F2F56E, + 0xCE170D9B81DCA7E6,0x0E9B82051CB4941B,0x1E712F623C49D733,0x21E45CFA42F9F7DC, + 0xCB8E7A7F8BBA0F60,0x8E98831A010FB646,0x474CCF0D8E895B23,0xA99285584FB27A95, + 0x8CC2B57205335443,0x42D5B8E984EFF3A5,0x012D1B34021E718C,0x57A6626AAE74180B, + 0xFF19FC06E3D81312,0x35BA9D4D6A7C6DFE,0xC9D44C178F86ED65,0x506523E6A02E5288, + 0x03772D5C06229389,0x8B01F4FE0B691EC0,0xF8DABD8AED825991,0x4C4E3AEC985B67BE, + 0xB10DF0827FBF96A9,0x6A69279AD4F8DAE1,0xE78689DCD3D5FF2E,0x812E1A2B1FA553D1, + 0xFBAD90D6EBA0CA18,0x1AC543B234310E39,0x1604F7DF2CB97827,0xA6241C6951189F02, + 0x753513CCEAAF7C5E,0x64F2A59FC84C4EFA,0x247D2B1E489F5F5A,0xDB64D718AB474C48, + 0x79F4A7A1F2270A40,0x1573DA832A9BEBAE,0x3497867968621C72,0x514838D2A2302304, + 0xF0AF6537FD72F685,0x1D06023E3A6B44BA,0x678588C3CE6EDD73,0x66A893F7CC70ACFF, + 0xD4D24E29B5EDA9DF,0x3856321470EA6A6C,0x07C3418C0E5A4A83,0x2BCBB22F5635BACD, + 0x04B46CD00878D90A,0x06EE5AB80C443B0F,0x3B211F4876C8F9E5,0x0958C38912EEDE98, + 0xD14B39CDBF8B0159,0x397B292072F41BE0,0x87C0409313E168DE,0xAD26E98847CAA39F, + 0x4E140C849C6785BB,0xD5FF551DB7F3D853,0xA0CA46D15D5CA40D,0xCD6020C787FE346F, + 0x84B76DCF15C3FB57,0xDEFDA0FCA121E4CE,0x4B8D7B6096012D3D,0x9AC642AD298A2C64, + 0x0875D8BD10F0AF14,0xB357C6EA7B8374AC,0x4D6321D89A451632,0xEDA96709C719B23F, + 0xF76C24BBF328BC06,0xC662D526912C08F2,0x3CE25EC47892B366,0xB978283F6F4F39BD, + 0xC08C8F9E9D6833FD,0x4F3917B09E79F437,0x593DE06FB2C08C10,0xD6887841B1D14BDA, + 0x19B26EEE32139DB0,0xB494876675D93E2F,0x825937771987C058,0x90E9AC783D466175, + 0xF1827E03FF6C8709,0x945DC0A8353EB87F,0x4516F9658AB5B926,0x3F9573987EB020EF, + 0xB855330B6D514831,0x2AE6A91B542BCB41,0x6331E413C6160479,0x408F8E8180D311A0, + 0xEFF35161C325503A,0xD06622F9BD9570D5,0x8876D9A20D4B8D49,0xA5533135573A0C8B, + 0xE168D364DF91C421,0xF41B09E7F50A2F8F,0x12B09B0F24C1A12D,0xDA49CC2CA9593DC4, + 0x1F5C34563E57A6BF,0x54D14F36A8568B82,0xAF7CDFE043F6419A,0xEA6A2685C943F8BC, + 0xE5DCBFB4D7E91D2B,0xB27ADDDE799D0520,0x6B443CAED6E6AB6D,0x7BAE91C9F61BE845, + 0x3EB868AC7CAE5163,0x11C7B65322E332A4,0xD23C1491B9A992D0,0x8FB5982E0311C7CA, + 0x70AC6428E0C9D4D8,0x895BC2960F55FCC5,0x76423E90EC8DEFD7,0x6FF0507EDE9E7267, + 0x3DCF45F07A8CC2EA,0x4AA06054941F5CB1,0x5810FB5BB0DEFD9C,0x5EFEA1E3BC9AC693, + 0x6EDD4B4ADC8003EB,0x741808F8E8B10DD2,0x145EC1B728859A22,0x28BC9F7350172944, + 0x270A06424EBDCCD3,0x972AEDF4331C2BF6,0x059977E40A66A886,0x2550302A4A812ED6, + 0xDD8A8DA0A7037747,0xC515F87A970E9B7B,0x3023EAA9601AC578,0xB7E3AA3A73FBADA6, + 0x0FB699311EAAE597,0x0000000000000000,0x310EF19D6204B4F4,0x229371A644DB6455, + 0x0DECAF591A960792,0x5CA4978BB8A62496,0x1C2B190A38753536,0x41A295B582CD602C, + 0x3279DCC16426277D,0xC1A194AA9F764271,0x139D803B26DFD0A1,0xAE51C4D441E83016, + 0xD813FA44AD65DFC1,0xAC0BF2BC45D4D213,0x23BE6A9246C515D9,0x49D74D08923DCF38, + 0x9D05032127D066E7,0x2F7FDEFF5E4D63C7,0xA47E2A0155247D07,0x99B16FF12FA8BFED, + 0x4661D4398C972AAF,0xDFD0BBC8A33F9542,0xDCA79694A51D06CB,0xB020EBB67DA1E725, + 0xBA0F0563696DAA34,0xE4F1A480D5F76CA7,0xC438E34E9510EAF7,0x939E81243B64F2FC, + 
0x8DEFAE46072D25CF,0x2C08F3A3586FF04E,0xD7A56375B3CF3A56,0x20C947CE40E78650, + 0x43F8A3DD86F18229,0x568B795EAC6A6987,0x8003011F1DBB225D,0xF53612D3F7145E03, + 0x189F75DA300DEC3C,0x9570DB9C3720C9F3,0xBB221E576B73DBB8,0x72F65240E4F536DD, + 0x443BE25188ABC8AA,0xE21FFE38D9B357A8,0xFD43CA6EE7E4F117,0xCAA3614B89A47EEC, + 0xFE34E732E1C6629E,0x83742C431B99B1D4,0xCF3A16AF83C2D66A,0xAAE5A8044990E91C, + 0x26271D764CA3BD5F,0x91C4B74C3F5810F9,0x7C6DD045F841A2C6,0x7F1AFD19FE63314F, + 0xC8F957238D989CE9,0xA709075D5306EE8E,0x55FC5402AA48FA0E,0x48FA563C9023BEB4, + 0x65DFBEABCA523F76,0x6C877D22D8BCE1EE,0xCC4D3BF385E045E3,0xBEBB69B36115733E, + 0x10EAAD6720FD4328,0xB6CEB10E71E5DC2A,0xBDCC44EF6737E0B7,0x523F158EA412B08D, + 0x989C74C52DB6CE61,0x9BEB59992B945DE8,0x8A2CEFCA09776F4C,0xA3BD6B8D5B7E3784, + 0xEB473DB1CB5D8930,0xC3FBA2C29B4AA074,0x9C28181525CE176B,0x683311F2D0C438E4, + 0x5FD3BAD7BE84B71F,0xFC6ED15AE5FA809B,0x36CDB0116C5EFE77,0x29918447520958C8, + 0xA29070B959604608,0x53120EBAA60CC101,0x3A0C047C74D68869,0x691E0AC6D2DA4968, + 0x73DB4974E6EB4751,0x7A838AFDF40599C9,0x5A4ACD33B4E21F99,0x6046C94FC03497F0, + 0xE6AB92E8D1CB8EA2,0x3354C7F5663856F1,0xD93EE170AF7BAE4D,0x616BD27BC22AE67C, + 0x92B39A10397A8370,0xABC8B3304B8E9890,0xBF967287630B02B2,0x5B67D607B6FC6E15 +},{ + 0xD031C397CE553FE6,0x16BA5B01B006B525,0xA89BADE6296E70C8,0x6A1F525D77D3435B, + 0x6E103570573DFA0B,0x660EFB2A17FC95AB,0x76327A9E97634BF6,0x4BAD9D6462458BF5, + 0xF1830CAEDBC3F748,0xC5C8F542669131FF,0x95044A1CDC48B0CB,0x892962DF3CF8B866, + 0xB0B9E208E930C135,0xA14FB3F0611A767C,0x8D2605F21C160136,0xD6B71922FECC549E, + 0x37089438A5907D8B,0x0B5DA38E5803D49C,0x5A5BCC9CEA6F3CBC,0xEDAE246D3B73FFE5, + 0xD2B87E0FDE22EDCE,0x5E54ABB1CA8185EC,0x1DE7F88FE80561B9,0xAD5E1A870135A08C, + 0x2F2ADBD665CECC76,0x5780B5A782F58358,0x3EDC8A2EEDE47B3F,0xC9D95C3506BEE70F, + 0x83BE111D6C4E05EE,0xA603B90959367410,0x103C81B4809FDE5D,0x2C69B6027D0C774A, + 0x399080D7D5C87953,0x09D41E16487406B4,0xCDD63B1826505E5F,0xF99DC2F49B0298E8, + 0x9CD0540A943CB67F,0xBCA84B7F891F17C5,0x723D1DB3B78DF2A6,0x78AA6E71E73B4F2E, + 0x1433E699A071670D,0x84F21BE454620782,0x98DF3327B4D20F2F,0xF049DCE2D3769E5C, + 0xDB6C60199656EB7A,0x648746B2078B4783,0x32CD23598DCBADCF,0x1EA4955BF0C7DA85, + 0xE9A143401B9D46B5,0xFD92A5D9BBEC21B8,0xC8138C790E0B8E1B,0x2EE00B9A6D7BA562, + 0xF85712B893B7F1FC,0xEB28FED80BEA949D,0x564A65EB8A40EA4C,0x6C9988E8474A2823, + 0x4535898B121D8F2D,0xABD8C03231ACCBF4,0xBA2E91CAB9867CBD,0x7960BE3DEF8E263A, + 0x0C11A977602FD6F0,0xCB50E1AD16C93527,0xEAE22E94035FFD89,0x2866D12F5DE2CE1A, + 0xFF1B1841AB9BF390,0x9F9339DE8CFE0D43,0x964727C8C48A0BF7,0x524502C6AAAE531C, + 0x9B9C5EF3AC10B413,0x4FA2FA4942AB32A5,0x3F165A62E551122B,0xC74148DA76E6E3D7, + 0x924840E5E464B2A7,0xD372AE43D69784DA,0x233B72A105E11A86,0xA48A04914941A638, + 0xB4B68525C9DE7865,0xDDEABAACA6CF8002,0x0A9773C250B6BD88,0xC284FFBB5EBD3393, + 0x8BA0DF472C8F6A4E,0x2AEF6CB74D951C32,0x427983722A318D41,0x73F7CDFFBF389BB2, + 0x074C0AF9382C026C,0x8A6A0F0B243A035A,0x6FDAE53C5F88931F,0xC68B98967E538AC3, + 0x44FF59C71AA8E639,0xE2FCE0CE439E9229,0xA20CDE2479D8CD40,0x19E89FA2C8EBD8E9, + 0xF446BBCFF398270C,0x43B3533E2284E455,0xD82F0DCD8E945046,0x51066F12B26CE820, + 0xE73957AF6BC5426D,0x081ECE5A40C16FA0,0x3B193D4FC5BFAB7B,0x7FE66488DF174D42, + 0x0E9814EF705804D8,0x8137AC857C39D7C6,0xB1733244E185A821,0x695C3F896F11F867, + 0xF6CF0657E3EFF524,0x1AABF276D02963D5,0x2DA3664E75B91E5E,0x0289BD981077D228, + 0x90C1FD7DF413608F,0x3C5537B6FD93A917,0xAA12107E3919A2E0,0x0686DAB530996B78, + 
0xDAA6B0559EE3826E,0xC34E2FF756085A87,0x6D5358A44FFF4137,0xFC587595B35948AC, + 0x7CA5095CC7D5F67E,0xFB147F6C8B754AC0,0xBFEB26AB91DDACF9,0x6896EFC567A49173, + 0xCA9A31E11E7C5C33,0xBBE44186B13315A9,0x0DDB793B689ABFE4,0x70B4A02BA7FA208E, + 0xE47A3A7B7307F951,0x8CECD5BE14A36822,0xEEED49B923B144D9,0x17708B4DB8B3DC31, + 0x6088219F2765FED3,0xB3FA8FDCF1F27A09,0x910B2D31FCA6099B,0x0F52C4A378ED6DCC, + 0x50CCBF5EBAD98134,0x6BD582117F662A4F,0x94CE9A50D4FDD9DF,0x2B25BCFB45207526, + 0x67C42B661F49FCBF,0x492420FC723259DD,0x03436DD418C2BB3C,0x1F6E4517F872B391, + 0xA08563BC69AF1F68,0xD43EA4BAEEBB86B6,0x01CAD04C08B56914,0xAC94CACB0980C998, + 0x54C3D8739A373864,0x26FEC5C02DBACAC2,0xDEA9D778BE0D3B3E,0x040F672D20EEB950, + 0xE5B0EA377BB29045,0xF30AB136CBB42560,0x62019C0737122CFB,0xE86B930C13282FA1, + 0xCC1CEB542EE5374B,0x538FD28AA21B3A08,0x1B61223AD89C0AC1,0x36C24474AD25149F, + 0x7A23D3E9F74C9D06,0xBE21F6E79968C5ED,0xCF5F868036278C77,0xF705D61BEB5A9C30, + 0x4D2B47D152DCE08D,0x5F9E7BFDC234ECF8,0x247778583DCD18EA,0x867BA67C4415D5AA, + 0x4CE1979D5A698999,0x0000000000000000,0xEC64F42133C696F1,0xB57C5569C16B1171, + 0xC1C7926F467F88AF,0x654D96FE0F3E2E97,0x15F936D5A8C40E19,0xB8A72C52A9F1AE95, + 0xA9517DAA21DB19DC,0x58D27104FA18EE94,0x5918A148F2AD8780,0x5CDD1629DAF657C4, + 0x8274C15164FB6CFA,0xD1FB13DBC6E056F2,0x7D6FD910CF609F6A,0xB63F38BDD9A9AA4D, + 0x3D9FE7FAF526C003,0x74BBC706871499DE,0xDF630734B6B8522A,0x3AD3ED03CD0AC26F, + 0xFADEAF2083C023D4,0xC00D42234ECAE1BB,0x8538CBA85CD76E96,0xC402250E6E2458EB, + 0x47BC3413026A5D05,0xAFD7A71F114272A4,0x978DF784CC3F62E3,0xB96DFC1EA144C781, + 0x21B2CF391596C8AE,0x318E4E8D950916F3,0xCE9556CC3E92E563,0x385A509BDD7D1047, + 0x358129A0B5E7AFA3,0xE6F387E363702B79,0xE0755D5653E94001,0x7BE903A5FFF9F412, + 0x12B53C2C90E80C75,0x3307F315857EC4DB,0x8FAFB86A0C61D31E,0xD9E5DD8186213952, + 0x77F8AAD29FD622E2,0x25BDA814357871FE,0x7571174A8FA1F0CA,0x137FEC60985D6561, + 0x30449EC19DBC7FE7,0xA540D4DD41F4CF2C,0xDC206AE0AE7AE916,0x5B911CD0E2DA55A8, + 0xB2305F90F947131D,0x344BF9ECBD52C6B7,0x5D17C665D2433ED0,0x18224FEEC05EB1FD, + 0x9E59E992844B6457,0x9A568EBFA4A5DD07,0xA3C60E68716DA454,0x7E2CB4C4D7A22456, + 0x87B176304CA0BCBE,0x413AEEA632F3367D,0x9915E36BBC67663B,0x40F03EEA3A465F69, + 0x1C2D28C3E0B008AD,0x4E682A054A1E5BB1,0x05C5B761285BD044,0xE1BF8D1A5B5C2915, + 0xF2C0617AC3014C74,0xB7F5E8F1D11CC359,0x63CB4C4B3FA745EF,0x9D1A84469C89DF6B, + 0xE33630824B2BFB3D,0xD5F474F6E60EEFA2,0xF58C6B83FB2D4E18,0x4676E45F0ADF3411, + 0x20781F751D23A1BA,0xBD629B3381AA7ED1,0xAE1D775319F71BB0,0xFED1C80DA32E9A84, + 0x5509083F92825170,0x29AC01635557A70E,0xA7C9694551831D04,0x8E65682604D4BA0A, + 0x11F651F8882AB749,0xD77DC96EF6793D8A,0xEF2799F52B042DCD,0x48EEF0B07A8730C9, + 0x22F1A2ED0D547392,0x6142F1D32FD097C7,0x4A674D286AF0E2E1,0x80FD7CC9748CBED2, + 0x717E7067AF4F499A,0x938290A9ECD1DBB3,0x88E3B293344DD172,0x2734158C250FA3D6 +}}; + +// Constant values for KeySchedule function +const unsigned char C[12][64] = {{ + 0xB1,0x08,0x5B,0xDA,0x1E,0xCA,0xDA,0xE9,0xEB,0xCB,0x2F,0x81,0xC0,0x65,0x7C,0x1F, + 0x2F,0x6A,0x76,0x43,0x2E,0x45,0xD0,0x16,0x71,0x4E,0xB8,0x8D,0x75,0x85,0xC4,0xFC, + 0x4B,0x7C,0xE0,0x91,0x92,0x67,0x69,0x01,0xA2,0x42,0x2A,0x08,0xA4,0x60,0xD3,0x15, + 0x05,0x76,0x74,0x36,0xCC,0x74,0x4D,0x23,0xDD,0x80,0x65,0x59,0xF2,0xA6,0x45,0x07 +},{ + 0x6F,0xA3,0xB5,0x8A,0xA9,0x9D,0x2F,0x1A,0x4F,0xE3,0x9D,0x46,0x0F,0x70,0xB5,0xD7, + 0xF3,0xFE,0xEA,0x72,0x0A,0x23,0x2B,0x98,0x61,0xD5,0x5E,0x0F,0x16,0xB5,0x01,0x31, + 0x9A,0xB5,0x17,0x6B,0x12,0xD6,0x99,0x58,0x5C,0xB5,0x61,0xC2,0xDB,0x0A,0xA7,0xCA, + 
0x55,0xDD,0xA2,0x1B,0xD7,0xCB,0xCD,0x56,0xE6,0x79,0x04,0x70,0x21,0xB1,0x9B,0xB7 +},{ + 0xF5,0x74,0xDC,0xAC,0x2B,0xCE,0x2F,0xC7,0x0A,0x39,0xFC,0x28,0x6A,0x3D,0x84,0x35, + 0x06,0xF1,0x5E,0x5F,0x52,0x9C,0x1F,0x8B,0xF2,0xEA,0x75,0x14,0xB1,0x29,0x7B,0x7B, + 0xD3,0xE2,0x0F,0xE4,0x90,0x35,0x9E,0xB1,0xC1,0xC9,0x3A,0x37,0x60,0x62,0xDB,0x09, + 0xC2,0xB6,0xF4,0x43,0x86,0x7A,0xDB,0x31,0x99,0x1E,0x96,0xF5,0x0A,0xBA,0x0A,0xB2 +},{ + 0xEF,0x1F,0xDF,0xB3,0xE8,0x15,0x66,0xD2,0xF9,0x48,0xE1,0xA0,0x5D,0x71,0xE4,0xDD, + 0x48,0x8E,0x85,0x7E,0x33,0x5C,0x3C,0x7D,0x9D,0x72,0x1C,0xAD,0x68,0x5E,0x35,0x3F, + 0xA9,0xD7,0x2C,0x82,0xED,0x03,0xD6,0x75,0xD8,0xB7,0x13,0x33,0x93,0x52,0x03,0xBE, + 0x34,0x53,0xEA,0xA1,0x93,0xE8,0x37,0xF1,0x22,0x0C,0xBE,0xBC,0x84,0xE3,0xD1,0x2E +},{ + 0x4B,0xEA,0x6B,0xAC,0xAD,0x47,0x47,0x99,0x9A,0x3F,0x41,0x0C,0x6C,0xA9,0x23,0x63, + 0x7F,0x15,0x1C,0x1F,0x16,0x86,0x10,0x4A,0x35,0x9E,0x35,0xD7,0x80,0x0F,0xFF,0xBD, + 0xBF,0xCD,0x17,0x47,0x25,0x3A,0xF5,0xA3,0xDF,0xFF,0x00,0xB7,0x23,0x27,0x1A,0x16, + 0x7A,0x56,0xA2,0x7E,0xA9,0xEA,0x63,0xF5,0x60,0x17,0x58,0xFD,0x7C,0x6C,0xFE,0x57 +},{ + 0xAE,0x4F,0xAE,0xAE,0x1D,0x3A,0xD3,0xD9,0x6F,0xA4,0xC3,0x3B,0x7A,0x30,0x39,0xC0, + 0x2D,0x66,0xC4,0xF9,0x51,0x42,0xA4,0x6C,0x18,0x7F,0x9A,0xB4,0x9A,0xF0,0x8E,0xC6, + 0xCF,0xFA,0xA6,0xB7,0x1C,0x9A,0xB7,0xB4,0x0A,0xF2,0x1F,0x66,0xC2,0xBE,0xC6,0xB6, + 0xBF,0x71,0xC5,0x72,0x36,0x90,0x4F,0x35,0xFA,0x68,0x40,0x7A,0x46,0x64,0x7D,0x6E +},{ + 0xF4,0xC7,0x0E,0x16,0xEE,0xAA,0xC5,0xEC,0x51,0xAC,0x86,0xFE,0xBF,0x24,0x09,0x54, + 0x39,0x9E,0xC6,0xC7,0xE6,0xBF,0x87,0xC9,0xD3,0x47,0x3E,0x33,0x19,0x7A,0x93,0xC9, + 0x09,0x92,0xAB,0xC5,0x2D,0x82,0x2C,0x37,0x06,0x47,0x69,0x83,0x28,0x4A,0x05,0x04, + 0x35,0x17,0x45,0x4C,0xA2,0x3C,0x4A,0xF3,0x88,0x86,0x56,0x4D,0x3A,0x14,0xD4,0x93 +},{ + 0x9B,0x1F,0x5B,0x42,0x4D,0x93,0xC9,0xA7,0x03,0xE7,0xAA,0x02,0x0C,0x6E,0x41,0x41, + 0x4E,0xB7,0xF8,0x71,0x9C,0x36,0xDE,0x1E,0x89,0xB4,0x44,0x3B,0x4D,0xDB,0xC4,0x9A, + 0xF4,0x89,0x2B,0xCB,0x92,0x9B,0x06,0x90,0x69,0xD1,0x8D,0x2B,0xD1,0xA5,0xC4,0x2F, + 0x36,0xAC,0xC2,0x35,0x59,0x51,0xA8,0xD9,0xA4,0x7F,0x0D,0xD4,0xBF,0x02,0xE7,0x1E +},{ + 0x37,0x8F,0x5A,0x54,0x16,0x31,0x22,0x9B,0x94,0x4C,0x9A,0xD8,0xEC,0x16,0x5F,0xDE, + 0x3A,0x7D,0x3A,0x1B,0x25,0x89,0x42,0x24,0x3C,0xD9,0x55,0xB7,0xE0,0x0D,0x09,0x84, + 0x80,0x0A,0x44,0x0B,0xDB,0xB2,0xCE,0xB1,0x7B,0x2B,0x8A,0x9A,0xA6,0x07,0x9C,0x54, + 0x0E,0x38,0xDC,0x92,0xCB,0x1F,0x2A,0x60,0x72,0x61,0x44,0x51,0x83,0x23,0x5A,0xDB +},{ + 0xAB,0xBE,0xDE,0xA6,0x80,0x05,0x6F,0x52,0x38,0x2A,0xE5,0x48,0xB2,0xE4,0xF3,0xF3, + 0x89,0x41,0xE7,0x1C,0xFF,0x8A,0x78,0xDB,0x1F,0xFF,0xE1,0x8A,0x1B,0x33,0x61,0x03, + 0x9F,0xE7,0x67,0x02,0xAF,0x69,0x33,0x4B,0x7A,0x1E,0x6C,0x30,0x3B,0x76,0x52,0xF4, + 0x36,0x98,0xFA,0xD1,0x15,0x3B,0xB6,0xC3,0x74,0xB4,0xC7,0xFB,0x98,0x45,0x9C,0xED +},{ + 0x7B,0xCD,0x9E,0xD0,0xEF,0xC8,0x89,0xFB,0x30,0x02,0xC6,0xCD,0x63,0x5A,0xFE,0x94, + 0xD8,0xFA,0x6B,0xBB,0xEB,0xAB,0x07,0x61,0x20,0x01,0x80,0x21,0x14,0x84,0x66,0x79, + 0x8A,0x1D,0x71,0xEF,0xEA,0x48,0xB9,0xCA,0xEF,0xBA,0xCD,0x1D,0x7D,0x47,0x6E,0x98, + 0xDE,0xA2,0x59,0x4A,0xC0,0x6F,0xD8,0x5D,0x6B,0xCA,0xA4,0xCD,0x81,0xF3,0x2D,0x1B +},{ + 0x37,0x8E,0xE7,0x67,0xF1,0x16,0x31,0xBA,0xD2,0x13,0x80,0xB0,0x04,0x49,0xB1,0x7A, + 0xCD,0xA4,0x3C,0x32,0xBC,0xDF,0x1D,0x77,0xF8,0x20,0x12,0xD4,0x30,0x21,0x9F,0x9B, + 0x5D,0x80,0xEF,0x9D,0x18,0x91,0xCC,0x86,0xE7,0x1D,0xA4,0xAA,0x88,0xE1,0x28,0x52, + 0xFA,0xF4,0x17,0xD5,0xD9,0xB2,0x1B,0x99,0x48,0xBC,0x92,0x4A,0xF1,0x1B,0xD7,0x20 +}}; + + +static void AddModulo512(const void *a,const void *b,void *c) +{ + const unsigned char *A=a, *B=b; + 
unsigned char *C=c; + int t = 0; +#ifdef FULL_UNROLL +#define ADDBYTE_8(i) t = A[i] + B[i] + (t >> 8); C[i] = t & 0xFF; + + ADDBYTE_8(63) + ADDBYTE_8(62) + ADDBYTE_8(61) + ADDBYTE_8(60) + ADDBYTE_8(59) + ADDBYTE_8(58) + ADDBYTE_8(57) + ADDBYTE_8(56) + ADDBYTE_8(55) + ADDBYTE_8(54) + ADDBYTE_8(53) + ADDBYTE_8(52) + ADDBYTE_8(51) + ADDBYTE_8(50) + ADDBYTE_8(49) + ADDBYTE_8(48) + ADDBYTE_8(47) + ADDBYTE_8(46) + ADDBYTE_8(45) + ADDBYTE_8(44) + ADDBYTE_8(43) + ADDBYTE_8(42) + ADDBYTE_8(41) + ADDBYTE_8(40) + ADDBYTE_8(39) + ADDBYTE_8(38) + ADDBYTE_8(37) + ADDBYTE_8(36) + ADDBYTE_8(35) + ADDBYTE_8(34) + ADDBYTE_8(33) + ADDBYTE_8(32) + ADDBYTE_8(31) + ADDBYTE_8(30) + ADDBYTE_8(29) + ADDBYTE_8(28) + ADDBYTE_8(27) + ADDBYTE_8(26) + ADDBYTE_8(25) + ADDBYTE_8(24) + ADDBYTE_8(23) + ADDBYTE_8(22) + ADDBYTE_8(21) + ADDBYTE_8(20) + ADDBYTE_8(19) + ADDBYTE_8(18) + ADDBYTE_8(17) + ADDBYTE_8(16) + ADDBYTE_8(15) + ADDBYTE_8(14) + ADDBYTE_8(13) + ADDBYTE_8(12) + ADDBYTE_8(11) + ADDBYTE_8(10) + ADDBYTE_8(9) + ADDBYTE_8(8) + ADDBYTE_8(7) + ADDBYTE_8(6) + ADDBYTE_8(5) + ADDBYTE_8(4) + ADDBYTE_8(3) + ADDBYTE_8(2) + ADDBYTE_8(1) + ADDBYTE_8(0) + +#else + int i = 0; + + for(i=63;i>=0;i--) + { + t = A[i] + B[i] + (t >> 8); + C[i] = t & 0xFF; + } +#endif +} + +static void AddXor512(const void *a,const void *b,void *c) +{ + const unsigned long long *A=a, *B=b; + unsigned long long *C=c; +#ifdef FULL_UNROLL + C[0] = A[0] ^ B[0]; + C[1] = A[1] ^ B[1]; + C[2] = A[2] ^ B[2]; + C[3] = A[3] ^ B[3]; + C[4] = A[4] ^ B[4]; + C[5] = A[5] ^ B[5]; + C[6] = A[6] ^ B[6]; + C[7] = A[7] ^ B[7]; +#else + int i = 0; + + for(i=0; i<8; i++) { + C[i] = A[i] ^ B[i]; + } +#endif +} + +static void F(unsigned char *state) +{ + unsigned long long return_state[8]; + register unsigned long long r = 0; + r ^= TG[0][state[56]]; + r ^= TG[1][state[48]]; + r ^= TG[2][state[40]]; + r ^= TG[3][state[32]]; + r ^= TG[4][state[24]]; + r ^= TG[5][state[16]]; + r ^= TG[6][state[8]]; + r ^= TG[7][state[0]]; + return_state[0] = r; + r = 0; + + r ^= TG[0][state[57]]; + r ^= TG[1][state[49]]; + r ^= TG[2][state[41]]; + r ^= TG[3][state[33]]; + r ^= TG[4][state[25]]; + r ^= TG[5][state[17]]; + r ^= TG[6][state[9]]; + r ^= TG[7][state[1]]; + return_state[1] = r; + r = 0; + + r ^= TG[0][state[58]]; + r ^= TG[1][state[50]]; + r ^= TG[2][state[42]]; + r ^= TG[3][state[34]]; + r ^= TG[4][state[26]]; + r ^= TG[5][state[18]]; + r ^= TG[6][state[10]]; + r ^= TG[7][state[2]]; + return_state[2] = r; + r = 0; + + r ^= TG[0][state[59]]; + r ^= TG[1][state[51]]; + r ^= TG[2][state[43]]; + r ^= TG[3][state[35]]; + r ^= TG[4][state[27]]; + r ^= TG[5][state[19]]; + r ^= TG[6][state[11]]; + r ^= TG[7][state[3]]; + return_state[3] = r; + r = 0; + + r ^= TG[0][state[60]]; + r ^= TG[1][state[52]]; + r ^= TG[2][state[44]]; + r ^= TG[3][state[36]]; + r ^= TG[4][state[28]]; + r ^= TG[5][state[20]]; + r ^= TG[6][state[12]]; + r ^= TG[7][state[4]]; + return_state[4] = r; + r = 0; + + r ^= TG[0][state[61]]; + r ^= TG[1][state[53]]; + r ^= TG[2][state[45]]; + r ^= TG[3][state[37]]; + r ^= TG[4][state[29]]; + r ^= TG[5][state[21]]; + r ^= TG[6][state[13]]; + r ^= TG[7][state[5]]; + return_state[5] = r; + r = 0; + + r ^= TG[0][state[62]]; + r ^= TG[1][state[54]]; + r ^= TG[2][state[46]]; + r ^= TG[3][state[38]]; + r ^= TG[4][state[30]]; + r ^= TG[5][state[22]]; + r ^= TG[6][state[14]]; + r ^= TG[7][state[6]]; + return_state[6] = r; + r = 0; + + r ^= TG[0][state[63]]; + r ^= TG[1][state[55]]; + r ^= TG[2][state[47]]; + r ^= TG[3][state[39]]; + r ^= TG[4][state[31]]; + r ^= TG[5][state[23]]; 
+ r ^= TG[6][state[15]]; + r ^= TG[7][state[7]]; + return_state[7] = r; + + memcpy(state,(unsigned char*)return_state,64); +} + +#define KeySchedule(K,i) AddXor512(K,C[i],K); F(K); + +static void E(unsigned char *K,const unsigned char *m, unsigned char *state) +{ +#ifdef FULL_UNROLL + AddXor512(m,K,state); + + F(state); + KeySchedule(K,0); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,1); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,2); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,3); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,4); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,5); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,6); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,7); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,8); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,9); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,10); + AddXor512(state,K,state); + + F(state); + KeySchedule(K,11); + AddXor512(state,K,state); +#else + int i = 0; + + AddXor512(m,K,state); + + for(i=0;i<12;i++) { + F(state); + KeySchedule(K,i); + AddXor512(state,K,state); + } +#endif +} + +static void g_N(const unsigned char *N,unsigned char *h,const unsigned char *m) +{ + unsigned char t[64], K[64]; + + AddXor512(N,h,K); + + F(K); + + E(K,m,t); + + AddXor512(t,h,t); + AddXor512(t,m,h); +} + +static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long long length,unsigned char *out) +{ + unsigned char v512[64] = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x00 + }; + unsigned char v0[64] = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + }; + unsigned char Sigma[64] = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + }; + unsigned char N[64] = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + }; + unsigned char m[64], *hash = IV; + unsigned long long len = length; + + // Stage 2 + while (len >= 512) + { + memcpy(m, message + len/8 - 63 - ( (len & 0x7) == 0 ), 64); + + g_N(N,hash,m); + AddModulo512(N,v512,N); + AddModulo512(Sigma,m,Sigma); + len -= 512; + } + + memset(m,0,64); + memcpy(m + 63 - len/8 + ( (len & 0x7) == 0 ), message, len/8 + 1 - ( (len & 0x7) == 0 )); + + // Stage 3 + m[ 63 - len/8 ] |= (1 << (len & 0x7)); + + g_N(N,hash,m); + v512[63] = len & 0xFF; + v512[62] = (unsigned char) (len >> 8); + AddModulo512(N,v512,N); + + 
AddModulo512(Sigma,m,Sigma); + + g_N(v0,hash,N); + g_N(v0,hash,Sigma); + + memcpy(out, hash, 64); +} + +static void hash_512(const unsigned char *message, unsigned long long length, unsigned char *out) +{ + unsigned char IV[64] = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + }; + + hash_X(IV,message,length,out); +} + +static void hash_256(const unsigned char *message, unsigned long long length, unsigned char *out) +{ + unsigned char IV[64] = { + 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, + 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, + 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01, + 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01 + }; + unsigned char hash[64]; + + hash_X(IV,message,length,hash); + + memcpy(out,hash,32); +} + + + + + +/* see sph_gost.h */ +void +sph_gost256_init(void *cc) +{ + //gost_init(cc, 256); +} + +/* see sph_gost.h */ +void +sph_gost256(void *cc, const void *data, size_t len) +{ + hash_256(data, 8*len, cc); +} + +/* see sph_gost.h */ +void +sph_gost256_close(void *cc, void *dst) +{ + //sph_gost256_addbits_and_close(cc, 0, 0, dst); + memcpy(dst, cc, 32); +} + +/* see sph_gost.h */ +void +sph_gost256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + //gost_close32(cc, ub, n, dst); +} + +/* see sph_gost.h */ +void +sph_gost512_init(void *cc) +{ + //gost_init(cc, 512); +} + +/* see sph_gost.h */ +void +sph_gost512(void *cc, const void *data, size_t len) +{ + hash_512(data, 8*len, cc); +} + +/* see sph_gost.h */ +void +sph_gost512_close(void *cc, void *dst) +{ + //sph_gost512_addbits_and_close(cc, 0, 0, dst); + memcpy(dst, cc, 64); +} + +/* see sph_gost.h */ +void +sph_gost512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + //gost_close64(cc, ub, n, dst); +} + + +#ifdef __cplusplus +} +#endif diff --git a/sph/whirlpool.c b/sph/whirlpool.c new file mode 100644 index 0000000000..07ff50cbd1 --- /dev/null +++ b/sph/whirlpool.c @@ -0,0 +1,3480 @@ +/* $Id: whirlpool.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * WHIRLPOOL implementation. + * + * Internally, we use little-endian convention, on the assumption that + * architectures which favour big-endian encoding are: + * 1. rarer + * 2. in decreasing numbers + * 3. able to decode little-endian data efficiently anyway + * + * The most common big-endian architecture is Sparc, and Ultrasparc CPU + * include special opcodes to perform little-endian accesses, which we use + * (see sph_types.h). Most modern CPU designs can work with both endianness + * and architecture designer now favour little-endian (basically, x86 has + * won the endianness war). + * + * TODO: implement a 32-bit version. Not only such a version would be handy + * for non-64-bit-able architectures, but it may also use smaller tables, + * at the expense of more lookups and XORs. 
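+ * A minimal sketch of one way to shrink these tables, assuming SPH_ROTL64
+ * (the 64-bit rotate from sph_types.h) is available; WP_TABLE is only a
+ * placeholder name. plain_T1..plain_T7 below are byte-wise rotations of
+ * plain_T0 (for example, plain_T1[0] = 0x3078C018601818D8 is plain_T0[0] =
+ * 0xD83078C018601818 rotated left by 8 bits), so a small-footprint build
+ * could keep only plain_T0 and derive the other lookups on the fly:
+ *
+ *     #define WP_TABLE(n, b)   SPH_ROTL64(plain_T0[(b)], 8 * (n))
+ *
+ * This trades 7 x 2 KB of constant data per table set for one extra rotate
+ * per lookup, the kind of size-versus-work trade-off noted above.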
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_whirlpool.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_WHIRLPOOL +#define SPH_SMALL_FOOTPRINT_WHIRLPOOL 1 +#endif + +/* ====================================================================== */ +/* + * Constants for plain WHIRLPOOL (current version). + */ + +static const sph_u64 plain_T0[256] = { + SPH_C64(0xD83078C018601818), SPH_C64(0x2646AF05238C2323), + SPH_C64(0xB891F97EC63FC6C6), SPH_C64(0xFBCD6F13E887E8E8), + SPH_C64(0xCB13A14C87268787), SPH_C64(0x116D62A9B8DAB8B8), + SPH_C64(0x0902050801040101), SPH_C64(0x0D9E6E424F214F4F), + SPH_C64(0x9B6CEEAD36D83636), SPH_C64(0xFF510459A6A2A6A6), + SPH_C64(0x0CB9BDDED26FD2D2), SPH_C64(0x0EF706FBF5F3F5F5), + SPH_C64(0x96F280EF79F97979), SPH_C64(0x30DECE5F6FA16F6F), + SPH_C64(0x6D3FEFFC917E9191), SPH_C64(0xF8A407AA52555252), + SPH_C64(0x47C0FD27609D6060), SPH_C64(0x35657689BCCABCBC), + SPH_C64(0x372BCDAC9B569B9B), SPH_C64(0x8A018C048E028E8E), + SPH_C64(0xD25B1571A3B6A3A3), SPH_C64(0x6C183C600C300C0C), + SPH_C64(0x84F68AFF7BF17B7B), SPH_C64(0x806AE1B535D43535), + SPH_C64(0xF53A69E81D741D1D), SPH_C64(0xB3DD4753E0A7E0E0), + SPH_C64(0x21B3ACF6D77BD7D7), SPH_C64(0x9C99ED5EC22FC2C2), + SPH_C64(0x435C966D2EB82E2E), SPH_C64(0x29967A624B314B4B), + SPH_C64(0x5DE121A3FEDFFEFE), SPH_C64(0xD5AE168257415757), + SPH_C64(0xBD2A41A815541515), SPH_C64(0xE8EEB69F77C17777), + SPH_C64(0x926EEBA537DC3737), SPH_C64(0x9ED7567BE5B3E5E5), + SPH_C64(0x1323D98C9F469F9F), SPH_C64(0x23FD17D3F0E7F0F0), + SPH_C64(0x20947F6A4A354A4A), SPH_C64(0x44A9959EDA4FDADA), + SPH_C64(0xA2B025FA587D5858), SPH_C64(0xCF8FCA06C903C9C9), + SPH_C64(0x7C528D5529A42929), SPH_C64(0x5A1422500A280A0A), + SPH_C64(0x507F4FE1B1FEB1B1), SPH_C64(0xC95D1A69A0BAA0A0), + SPH_C64(0x14D6DA7F6BB16B6B), SPH_C64(0xD917AB5C852E8585), + SPH_C64(0x3C677381BDCEBDBD), SPH_C64(0x8FBA34D25D695D5D), + SPH_C64(0x9020508010401010), SPH_C64(0x07F503F3F4F7F4F4), + SPH_C64(0xDD8BC016CB0BCBCB), SPH_C64(0xD37CC6ED3EF83E3E), + SPH_C64(0x2D0A112805140505), SPH_C64(0x78CEE61F67816767), + SPH_C64(0x97D55373E4B7E4E4), SPH_C64(0x024EBB25279C2727), + SPH_C64(0x7382583241194141), SPH_C64(0xA70B9D2C8B168B8B), + 
SPH_C64(0xF6530151A7A6A7A7), SPH_C64(0xB2FA94CF7DE97D7D), + SPH_C64(0x4937FBDC956E9595), SPH_C64(0x56AD9F8ED847D8D8), + SPH_C64(0x70EB308BFBCBFBFB), SPH_C64(0xCDC17123EE9FEEEE), + SPH_C64(0xBBF891C77CED7C7C), SPH_C64(0x71CCE31766856666), + SPH_C64(0x7BA78EA6DD53DDDD), SPH_C64(0xAF2E4BB8175C1717), + SPH_C64(0x458E460247014747), SPH_C64(0x1A21DC849E429E9E), + SPH_C64(0xD489C51ECA0FCACA), SPH_C64(0x585A99752DB42D2D), + SPH_C64(0x2E637991BFC6BFBF), SPH_C64(0x3F0E1B38071C0707), + SPH_C64(0xAC472301AD8EADAD), SPH_C64(0xB0B42FEA5A755A5A), + SPH_C64(0xEF1BB56C83368383), SPH_C64(0xB666FF8533CC3333), + SPH_C64(0x5CC6F23F63916363), SPH_C64(0x12040A1002080202), + SPH_C64(0x93493839AA92AAAA), SPH_C64(0xDEE2A8AF71D97171), + SPH_C64(0xC68DCF0EC807C8C8), SPH_C64(0xD1327DC819641919), + SPH_C64(0x3B92707249394949), SPH_C64(0x5FAF9A86D943D9D9), + SPH_C64(0x31F91DC3F2EFF2F2), SPH_C64(0xA8DB484BE3ABE3E3), + SPH_C64(0xB9B62AE25B715B5B), SPH_C64(0xBC0D9234881A8888), + SPH_C64(0x3E29C8A49A529A9A), SPH_C64(0x0B4CBE2D26982626), + SPH_C64(0xBF64FA8D32C83232), SPH_C64(0x597D4AE9B0FAB0B0), + SPH_C64(0xF2CF6A1BE983E9E9), SPH_C64(0x771E33780F3C0F0F), + SPH_C64(0x33B7A6E6D573D5D5), SPH_C64(0xF41DBA74803A8080), + SPH_C64(0x27617C99BEC2BEBE), SPH_C64(0xEB87DE26CD13CDCD), + SPH_C64(0x8968E4BD34D03434), SPH_C64(0x3290757A483D4848), + SPH_C64(0x54E324ABFFDBFFFF), SPH_C64(0x8DF48FF77AF57A7A), + SPH_C64(0x643DEAF4907A9090), SPH_C64(0x9DBE3EC25F615F5F), + SPH_C64(0x3D40A01D20802020), SPH_C64(0x0FD0D56768BD6868), + SPH_C64(0xCA3472D01A681A1A), SPH_C64(0xB7412C19AE82AEAE), + SPH_C64(0x7D755EC9B4EAB4B4), SPH_C64(0xCEA8199A544D5454), + SPH_C64(0x7F3BE5EC93769393), SPH_C64(0x2F44AA0D22882222), + SPH_C64(0x63C8E907648D6464), SPH_C64(0x2AFF12DBF1E3F1F1), + SPH_C64(0xCCE6A2BF73D17373), SPH_C64(0x82245A9012481212), + SPH_C64(0x7A805D3A401D4040), SPH_C64(0x4810284008200808), + SPH_C64(0x959BE856C32BC3C3), SPH_C64(0xDFC57B33EC97ECEC), + SPH_C64(0x4DAB9096DB4BDBDB), SPH_C64(0xC05F1F61A1BEA1A1), + SPH_C64(0x9107831C8D0E8D8D), SPH_C64(0xC87AC9F53DF43D3D), + SPH_C64(0x5B33F1CC97669797), SPH_C64(0x0000000000000000), + SPH_C64(0xF983D436CF1BCFCF), SPH_C64(0x6E5687452BAC2B2B), + SPH_C64(0xE1ECB39776C57676), SPH_C64(0xE619B06482328282), + SPH_C64(0x28B1A9FED67FD6D6), SPH_C64(0xC33677D81B6C1B1B), + SPH_C64(0x74775BC1B5EEB5B5), SPH_C64(0xBE432911AF86AFAF), + SPH_C64(0x1DD4DF776AB56A6A), SPH_C64(0xEAA00DBA505D5050), + SPH_C64(0x578A4C1245094545), SPH_C64(0x38FB18CBF3EBF3F3), + SPH_C64(0xAD60F09D30C03030), SPH_C64(0xC4C3742BEF9BEFEF), + SPH_C64(0xDA7EC3E53FFC3F3F), SPH_C64(0xC7AA1C9255495555), + SPH_C64(0xDB591079A2B2A2A2), SPH_C64(0xE9C96503EA8FEAEA), + SPH_C64(0x6ACAEC0F65896565), SPH_C64(0x036968B9BAD2BABA), + SPH_C64(0x4A5E93652FBC2F2F), SPH_C64(0x8E9DE74EC027C0C0), + SPH_C64(0x60A181BEDE5FDEDE), SPH_C64(0xFC386CE01C701C1C), + SPH_C64(0x46E72EBBFDD3FDFD), SPH_C64(0x1F9A64524D294D4D), + SPH_C64(0x7639E0E492729292), SPH_C64(0xFAEABC8F75C97575), + SPH_C64(0x360C1E3006180606), SPH_C64(0xAE0998248A128A8A), + SPH_C64(0x4B7940F9B2F2B2B2), SPH_C64(0x85D15963E6BFE6E6), + SPH_C64(0x7E1C36700E380E0E), SPH_C64(0xE73E63F81F7C1F1F), + SPH_C64(0x55C4F73762956262), SPH_C64(0x3AB5A3EED477D4D4), + SPH_C64(0x814D3229A89AA8A8), SPH_C64(0x5231F4C496629696), + SPH_C64(0x62EF3A9BF9C3F9F9), SPH_C64(0xA397F666C533C5C5), + SPH_C64(0x104AB13525942525), SPH_C64(0xABB220F259795959), + SPH_C64(0xD015AE54842A8484), SPH_C64(0xC5E4A7B772D57272), + SPH_C64(0xEC72DDD539E43939), SPH_C64(0x1698615A4C2D4C4C), + SPH_C64(0x94BC3BCA5E655E5E), SPH_C64(0x9FF085E778FD7878), + 
SPH_C64(0xE570D8DD38E03838), SPH_C64(0x980586148C0A8C8C), + SPH_C64(0x17BFB2C6D163D1D1), SPH_C64(0xE4570B41A5AEA5A5), + SPH_C64(0xA1D94D43E2AFE2E2), SPH_C64(0x4EC2F82F61996161), + SPH_C64(0x427B45F1B3F6B3B3), SPH_C64(0x3442A51521842121), + SPH_C64(0x0825D6949C4A9C9C), SPH_C64(0xEE3C66F01E781E1E), + SPH_C64(0x6186522243114343), SPH_C64(0xB193FC76C73BC7C7), + SPH_C64(0x4FE52BB3FCD7FCFC), SPH_C64(0x2408142004100404), + SPH_C64(0xE3A208B251595151), SPH_C64(0x252FC7BC995E9999), + SPH_C64(0x22DAC44F6DA96D6D), SPH_C64(0x651A39680D340D0D), + SPH_C64(0x79E93583FACFFAFA), SPH_C64(0x69A384B6DF5BDFDF), + SPH_C64(0xA9FC9BD77EE57E7E), SPH_C64(0x1948B43D24902424), + SPH_C64(0xFE76D7C53BEC3B3B), SPH_C64(0x9A4B3D31AB96ABAB), + SPH_C64(0xF081D13ECE1FCECE), SPH_C64(0x9922558811441111), + SPH_C64(0x8303890C8F068F8F), SPH_C64(0x049C6B4A4E254E4E), + SPH_C64(0x667351D1B7E6B7B7), SPH_C64(0xE0CB600BEB8BEBEB), + SPH_C64(0xC178CCFD3CF03C3C), SPH_C64(0xFD1FBF7C813E8181), + SPH_C64(0x4035FED4946A9494), SPH_C64(0x1CF30CEBF7FBF7F7), + SPH_C64(0x186F67A1B9DEB9B9), SPH_C64(0x8B265F98134C1313), + SPH_C64(0x51589C7D2CB02C2C), SPH_C64(0x05BBB8D6D36BD3D3), + SPH_C64(0x8CD35C6BE7BBE7E7), SPH_C64(0x39DCCB576EA56E6E), + SPH_C64(0xAA95F36EC437C4C4), SPH_C64(0x1B060F18030C0303), + SPH_C64(0xDCAC138A56455656), SPH_C64(0x5E88491A440D4444), + SPH_C64(0xA0FE9EDF7FE17F7F), SPH_C64(0x884F3721A99EA9A9), + SPH_C64(0x6754824D2AA82A2A), SPH_C64(0x0A6B6DB1BBD6BBBB), + SPH_C64(0x879FE246C123C1C1), SPH_C64(0xF1A602A253515353), + SPH_C64(0x72A58BAEDC57DCDC), SPH_C64(0x531627580B2C0B0B), + SPH_C64(0x0127D39C9D4E9D9D), SPH_C64(0x2BD8C1476CAD6C6C), + SPH_C64(0xA462F59531C43131), SPH_C64(0xF3E8B98774CD7474), + SPH_C64(0x15F109E3F6FFF6F6), SPH_C64(0x4C8C430A46054646), + SPH_C64(0xA5452609AC8AACAC), SPH_C64(0xB50F973C891E8989), + SPH_C64(0xB42844A014501414), SPH_C64(0xBADF425BE1A3E1E1), + SPH_C64(0xA62C4EB016581616), SPH_C64(0xF774D2CD3AE83A3A), + SPH_C64(0x06D2D06F69B96969), SPH_C64(0x41122D4809240909), + SPH_C64(0xD7E0ADA770DD7070), SPH_C64(0x6F7154D9B6E2B6B6), + SPH_C64(0x1EBDB7CED067D0D0), SPH_C64(0xD6C77E3BED93EDED), + SPH_C64(0xE285DB2ECC17CCCC), SPH_C64(0x6884572A42154242), + SPH_C64(0x2C2DC2B4985A9898), SPH_C64(0xED550E49A4AAA4A4), + SPH_C64(0x7550885D28A02828), SPH_C64(0x86B831DA5C6D5C5C), + SPH_C64(0x6BED3F93F8C7F8F8), SPH_C64(0xC211A44486228686) +}; + +#if !SPH_SMALL_FOOTPRINT_WHIRLPOOL + +static const sph_u64 plain_T1[256] = { + SPH_C64(0x3078C018601818D8), SPH_C64(0x46AF05238C232326), + SPH_C64(0x91F97EC63FC6C6B8), SPH_C64(0xCD6F13E887E8E8FB), + SPH_C64(0x13A14C87268787CB), SPH_C64(0x6D62A9B8DAB8B811), + SPH_C64(0x0205080104010109), SPH_C64(0x9E6E424F214F4F0D), + SPH_C64(0x6CEEAD36D836369B), SPH_C64(0x510459A6A2A6A6FF), + SPH_C64(0xB9BDDED26FD2D20C), SPH_C64(0xF706FBF5F3F5F50E), + SPH_C64(0xF280EF79F9797996), SPH_C64(0xDECE5F6FA16F6F30), + SPH_C64(0x3FEFFC917E91916D), SPH_C64(0xA407AA52555252F8), + SPH_C64(0xC0FD27609D606047), SPH_C64(0x657689BCCABCBC35), + SPH_C64(0x2BCDAC9B569B9B37), SPH_C64(0x018C048E028E8E8A), + SPH_C64(0x5B1571A3B6A3A3D2), SPH_C64(0x183C600C300C0C6C), + SPH_C64(0xF68AFF7BF17B7B84), SPH_C64(0x6AE1B535D4353580), + SPH_C64(0x3A69E81D741D1DF5), SPH_C64(0xDD4753E0A7E0E0B3), + SPH_C64(0xB3ACF6D77BD7D721), SPH_C64(0x99ED5EC22FC2C29C), + SPH_C64(0x5C966D2EB82E2E43), SPH_C64(0x967A624B314B4B29), + SPH_C64(0xE121A3FEDFFEFE5D), SPH_C64(0xAE168257415757D5), + SPH_C64(0x2A41A815541515BD), SPH_C64(0xEEB69F77C17777E8), + SPH_C64(0x6EEBA537DC373792), SPH_C64(0xD7567BE5B3E5E59E), + SPH_C64(0x23D98C9F469F9F13), 
SPH_C64(0xFD17D3F0E7F0F023), + SPH_C64(0x947F6A4A354A4A20), SPH_C64(0xA9959EDA4FDADA44), + SPH_C64(0xB025FA587D5858A2), SPH_C64(0x8FCA06C903C9C9CF), + SPH_C64(0x528D5529A429297C), SPH_C64(0x1422500A280A0A5A), + SPH_C64(0x7F4FE1B1FEB1B150), SPH_C64(0x5D1A69A0BAA0A0C9), + SPH_C64(0xD6DA7F6BB16B6B14), SPH_C64(0x17AB5C852E8585D9), + SPH_C64(0x677381BDCEBDBD3C), SPH_C64(0xBA34D25D695D5D8F), + SPH_C64(0x2050801040101090), SPH_C64(0xF503F3F4F7F4F407), + SPH_C64(0x8BC016CB0BCBCBDD), SPH_C64(0x7CC6ED3EF83E3ED3), + SPH_C64(0x0A1128051405052D), SPH_C64(0xCEE61F6781676778), + SPH_C64(0xD55373E4B7E4E497), SPH_C64(0x4EBB25279C272702), + SPH_C64(0x8258324119414173), SPH_C64(0x0B9D2C8B168B8BA7), + SPH_C64(0x530151A7A6A7A7F6), SPH_C64(0xFA94CF7DE97D7DB2), + SPH_C64(0x37FBDC956E959549), SPH_C64(0xAD9F8ED847D8D856), + SPH_C64(0xEB308BFBCBFBFB70), SPH_C64(0xC17123EE9FEEEECD), + SPH_C64(0xF891C77CED7C7CBB), SPH_C64(0xCCE3176685666671), + SPH_C64(0xA78EA6DD53DDDD7B), SPH_C64(0x2E4BB8175C1717AF), + SPH_C64(0x8E46024701474745), SPH_C64(0x21DC849E429E9E1A), + SPH_C64(0x89C51ECA0FCACAD4), SPH_C64(0x5A99752DB42D2D58), + SPH_C64(0x637991BFC6BFBF2E), SPH_C64(0x0E1B38071C07073F), + SPH_C64(0x472301AD8EADADAC), SPH_C64(0xB42FEA5A755A5AB0), + SPH_C64(0x1BB56C83368383EF), SPH_C64(0x66FF8533CC3333B6), + SPH_C64(0xC6F23F639163635C), SPH_C64(0x040A100208020212), + SPH_C64(0x493839AA92AAAA93), SPH_C64(0xE2A8AF71D97171DE), + SPH_C64(0x8DCF0EC807C8C8C6), SPH_C64(0x327DC819641919D1), + SPH_C64(0x927072493949493B), SPH_C64(0xAF9A86D943D9D95F), + SPH_C64(0xF91DC3F2EFF2F231), SPH_C64(0xDB484BE3ABE3E3A8), + SPH_C64(0xB62AE25B715B5BB9), SPH_C64(0x0D9234881A8888BC), + SPH_C64(0x29C8A49A529A9A3E), SPH_C64(0x4CBE2D269826260B), + SPH_C64(0x64FA8D32C83232BF), SPH_C64(0x7D4AE9B0FAB0B059), + SPH_C64(0xCF6A1BE983E9E9F2), SPH_C64(0x1E33780F3C0F0F77), + SPH_C64(0xB7A6E6D573D5D533), SPH_C64(0x1DBA74803A8080F4), + SPH_C64(0x617C99BEC2BEBE27), SPH_C64(0x87DE26CD13CDCDEB), + SPH_C64(0x68E4BD34D0343489), SPH_C64(0x90757A483D484832), + SPH_C64(0xE324ABFFDBFFFF54), SPH_C64(0xF48FF77AF57A7A8D), + SPH_C64(0x3DEAF4907A909064), SPH_C64(0xBE3EC25F615F5F9D), + SPH_C64(0x40A01D208020203D), SPH_C64(0xD0D56768BD68680F), + SPH_C64(0x3472D01A681A1ACA), SPH_C64(0x412C19AE82AEAEB7), + SPH_C64(0x755EC9B4EAB4B47D), SPH_C64(0xA8199A544D5454CE), + SPH_C64(0x3BE5EC937693937F), SPH_C64(0x44AA0D228822222F), + SPH_C64(0xC8E907648D646463), SPH_C64(0xFF12DBF1E3F1F12A), + SPH_C64(0xE6A2BF73D17373CC), SPH_C64(0x245A901248121282), + SPH_C64(0x805D3A401D40407A), SPH_C64(0x1028400820080848), + SPH_C64(0x9BE856C32BC3C395), SPH_C64(0xC57B33EC97ECECDF), + SPH_C64(0xAB9096DB4BDBDB4D), SPH_C64(0x5F1F61A1BEA1A1C0), + SPH_C64(0x07831C8D0E8D8D91), SPH_C64(0x7AC9F53DF43D3DC8), + SPH_C64(0x33F1CC976697975B), SPH_C64(0x0000000000000000), + SPH_C64(0x83D436CF1BCFCFF9), SPH_C64(0x5687452BAC2B2B6E), + SPH_C64(0xECB39776C57676E1), SPH_C64(0x19B06482328282E6), + SPH_C64(0xB1A9FED67FD6D628), SPH_C64(0x3677D81B6C1B1BC3), + SPH_C64(0x775BC1B5EEB5B574), SPH_C64(0x432911AF86AFAFBE), + SPH_C64(0xD4DF776AB56A6A1D), SPH_C64(0xA00DBA505D5050EA), + SPH_C64(0x8A4C124509454557), SPH_C64(0xFB18CBF3EBF3F338), + SPH_C64(0x60F09D30C03030AD), SPH_C64(0xC3742BEF9BEFEFC4), + SPH_C64(0x7EC3E53FFC3F3FDA), SPH_C64(0xAA1C9255495555C7), + SPH_C64(0x591079A2B2A2A2DB), SPH_C64(0xC96503EA8FEAEAE9), + SPH_C64(0xCAEC0F658965656A), SPH_C64(0x6968B9BAD2BABA03), + SPH_C64(0x5E93652FBC2F2F4A), SPH_C64(0x9DE74EC027C0C08E), + SPH_C64(0xA181BEDE5FDEDE60), SPH_C64(0x386CE01C701C1CFC), + SPH_C64(0xE72EBBFDD3FDFD46), 
SPH_C64(0x9A64524D294D4D1F), + SPH_C64(0x39E0E49272929276), SPH_C64(0xEABC8F75C97575FA), + SPH_C64(0x0C1E300618060636), SPH_C64(0x0998248A128A8AAE), + SPH_C64(0x7940F9B2F2B2B24B), SPH_C64(0xD15963E6BFE6E685), + SPH_C64(0x1C36700E380E0E7E), SPH_C64(0x3E63F81F7C1F1FE7), + SPH_C64(0xC4F7376295626255), SPH_C64(0xB5A3EED477D4D43A), + SPH_C64(0x4D3229A89AA8A881), SPH_C64(0x31F4C49662969652), + SPH_C64(0xEF3A9BF9C3F9F962), SPH_C64(0x97F666C533C5C5A3), + SPH_C64(0x4AB1352594252510), SPH_C64(0xB220F259795959AB), + SPH_C64(0x15AE54842A8484D0), SPH_C64(0xE4A7B772D57272C5), + SPH_C64(0x72DDD539E43939EC), SPH_C64(0x98615A4C2D4C4C16), + SPH_C64(0xBC3BCA5E655E5E94), SPH_C64(0xF085E778FD78789F), + SPH_C64(0x70D8DD38E03838E5), SPH_C64(0x0586148C0A8C8C98), + SPH_C64(0xBFB2C6D163D1D117), SPH_C64(0x570B41A5AEA5A5E4), + SPH_C64(0xD94D43E2AFE2E2A1), SPH_C64(0xC2F82F619961614E), + SPH_C64(0x7B45F1B3F6B3B342), SPH_C64(0x42A5152184212134), + SPH_C64(0x25D6949C4A9C9C08), SPH_C64(0x3C66F01E781E1EEE), + SPH_C64(0x8652224311434361), SPH_C64(0x93FC76C73BC7C7B1), + SPH_C64(0xE52BB3FCD7FCFC4F), SPH_C64(0x0814200410040424), + SPH_C64(0xA208B251595151E3), SPH_C64(0x2FC7BC995E999925), + SPH_C64(0xDAC44F6DA96D6D22), SPH_C64(0x1A39680D340D0D65), + SPH_C64(0xE93583FACFFAFA79), SPH_C64(0xA384B6DF5BDFDF69), + SPH_C64(0xFC9BD77EE57E7EA9), SPH_C64(0x48B43D2490242419), + SPH_C64(0x76D7C53BEC3B3BFE), SPH_C64(0x4B3D31AB96ABAB9A), + SPH_C64(0x81D13ECE1FCECEF0), SPH_C64(0x2255881144111199), + SPH_C64(0x03890C8F068F8F83), SPH_C64(0x9C6B4A4E254E4E04), + SPH_C64(0x7351D1B7E6B7B766), SPH_C64(0xCB600BEB8BEBEBE0), + SPH_C64(0x78CCFD3CF03C3CC1), SPH_C64(0x1FBF7C813E8181FD), + SPH_C64(0x35FED4946A949440), SPH_C64(0xF30CEBF7FBF7F71C), + SPH_C64(0x6F67A1B9DEB9B918), SPH_C64(0x265F98134C13138B), + SPH_C64(0x589C7D2CB02C2C51), SPH_C64(0xBBB8D6D36BD3D305), + SPH_C64(0xD35C6BE7BBE7E78C), SPH_C64(0xDCCB576EA56E6E39), + SPH_C64(0x95F36EC437C4C4AA), SPH_C64(0x060F18030C03031B), + SPH_C64(0xAC138A56455656DC), SPH_C64(0x88491A440D44445E), + SPH_C64(0xFE9EDF7FE17F7FA0), SPH_C64(0x4F3721A99EA9A988), + SPH_C64(0x54824D2AA82A2A67), SPH_C64(0x6B6DB1BBD6BBBB0A), + SPH_C64(0x9FE246C123C1C187), SPH_C64(0xA602A253515353F1), + SPH_C64(0xA58BAEDC57DCDC72), SPH_C64(0x1627580B2C0B0B53), + SPH_C64(0x27D39C9D4E9D9D01), SPH_C64(0xD8C1476CAD6C6C2B), + SPH_C64(0x62F59531C43131A4), SPH_C64(0xE8B98774CD7474F3), + SPH_C64(0xF109E3F6FFF6F615), SPH_C64(0x8C430A460546464C), + SPH_C64(0x452609AC8AACACA5), SPH_C64(0x0F973C891E8989B5), + SPH_C64(0x2844A014501414B4), SPH_C64(0xDF425BE1A3E1E1BA), + SPH_C64(0x2C4EB016581616A6), SPH_C64(0x74D2CD3AE83A3AF7), + SPH_C64(0xD2D06F69B9696906), SPH_C64(0x122D480924090941), + SPH_C64(0xE0ADA770DD7070D7), SPH_C64(0x7154D9B6E2B6B66F), + SPH_C64(0xBDB7CED067D0D01E), SPH_C64(0xC77E3BED93EDEDD6), + SPH_C64(0x85DB2ECC17CCCCE2), SPH_C64(0x84572A4215424268), + SPH_C64(0x2DC2B4985A98982C), SPH_C64(0x550E49A4AAA4A4ED), + SPH_C64(0x50885D28A0282875), SPH_C64(0xB831DA5C6D5C5C86), + SPH_C64(0xED3F93F8C7F8F86B), SPH_C64(0x11A44486228686C2) +}; + +static const sph_u64 plain_T2[256] = { + SPH_C64(0x78C018601818D830), SPH_C64(0xAF05238C23232646), + SPH_C64(0xF97EC63FC6C6B891), SPH_C64(0x6F13E887E8E8FBCD), + SPH_C64(0xA14C87268787CB13), SPH_C64(0x62A9B8DAB8B8116D), + SPH_C64(0x0508010401010902), SPH_C64(0x6E424F214F4F0D9E), + SPH_C64(0xEEAD36D836369B6C), SPH_C64(0x0459A6A2A6A6FF51), + SPH_C64(0xBDDED26FD2D20CB9), SPH_C64(0x06FBF5F3F5F50EF7), + SPH_C64(0x80EF79F9797996F2), SPH_C64(0xCE5F6FA16F6F30DE), + SPH_C64(0xEFFC917E91916D3F), SPH_C64(0x07AA52555252F8A4), 
+ SPH_C64(0xFD27609D606047C0), SPH_C64(0x7689BCCABCBC3565), + SPH_C64(0xCDAC9B569B9B372B), SPH_C64(0x8C048E028E8E8A01), + SPH_C64(0x1571A3B6A3A3D25B), SPH_C64(0x3C600C300C0C6C18), + SPH_C64(0x8AFF7BF17B7B84F6), SPH_C64(0xE1B535D43535806A), + SPH_C64(0x69E81D741D1DF53A), SPH_C64(0x4753E0A7E0E0B3DD), + SPH_C64(0xACF6D77BD7D721B3), SPH_C64(0xED5EC22FC2C29C99), + SPH_C64(0x966D2EB82E2E435C), SPH_C64(0x7A624B314B4B2996), + SPH_C64(0x21A3FEDFFEFE5DE1), SPH_C64(0x168257415757D5AE), + SPH_C64(0x41A815541515BD2A), SPH_C64(0xB69F77C17777E8EE), + SPH_C64(0xEBA537DC3737926E), SPH_C64(0x567BE5B3E5E59ED7), + SPH_C64(0xD98C9F469F9F1323), SPH_C64(0x17D3F0E7F0F023FD), + SPH_C64(0x7F6A4A354A4A2094), SPH_C64(0x959EDA4FDADA44A9), + SPH_C64(0x25FA587D5858A2B0), SPH_C64(0xCA06C903C9C9CF8F), + SPH_C64(0x8D5529A429297C52), SPH_C64(0x22500A280A0A5A14), + SPH_C64(0x4FE1B1FEB1B1507F), SPH_C64(0x1A69A0BAA0A0C95D), + SPH_C64(0xDA7F6BB16B6B14D6), SPH_C64(0xAB5C852E8585D917), + SPH_C64(0x7381BDCEBDBD3C67), SPH_C64(0x34D25D695D5D8FBA), + SPH_C64(0x5080104010109020), SPH_C64(0x03F3F4F7F4F407F5), + SPH_C64(0xC016CB0BCBCBDD8B), SPH_C64(0xC6ED3EF83E3ED37C), + SPH_C64(0x1128051405052D0A), SPH_C64(0xE61F6781676778CE), + SPH_C64(0x5373E4B7E4E497D5), SPH_C64(0xBB25279C2727024E), + SPH_C64(0x5832411941417382), SPH_C64(0x9D2C8B168B8BA70B), + SPH_C64(0x0151A7A6A7A7F653), SPH_C64(0x94CF7DE97D7DB2FA), + SPH_C64(0xFBDC956E95954937), SPH_C64(0x9F8ED847D8D856AD), + SPH_C64(0x308BFBCBFBFB70EB), SPH_C64(0x7123EE9FEEEECDC1), + SPH_C64(0x91C77CED7C7CBBF8), SPH_C64(0xE3176685666671CC), + SPH_C64(0x8EA6DD53DDDD7BA7), SPH_C64(0x4BB8175C1717AF2E), + SPH_C64(0x460247014747458E), SPH_C64(0xDC849E429E9E1A21), + SPH_C64(0xC51ECA0FCACAD489), SPH_C64(0x99752DB42D2D585A), + SPH_C64(0x7991BFC6BFBF2E63), SPH_C64(0x1B38071C07073F0E), + SPH_C64(0x2301AD8EADADAC47), SPH_C64(0x2FEA5A755A5AB0B4), + SPH_C64(0xB56C83368383EF1B), SPH_C64(0xFF8533CC3333B666), + SPH_C64(0xF23F639163635CC6), SPH_C64(0x0A10020802021204), + SPH_C64(0x3839AA92AAAA9349), SPH_C64(0xA8AF71D97171DEE2), + SPH_C64(0xCF0EC807C8C8C68D), SPH_C64(0x7DC819641919D132), + SPH_C64(0x7072493949493B92), SPH_C64(0x9A86D943D9D95FAF), + SPH_C64(0x1DC3F2EFF2F231F9), SPH_C64(0x484BE3ABE3E3A8DB), + SPH_C64(0x2AE25B715B5BB9B6), SPH_C64(0x9234881A8888BC0D), + SPH_C64(0xC8A49A529A9A3E29), SPH_C64(0xBE2D269826260B4C), + SPH_C64(0xFA8D32C83232BF64), SPH_C64(0x4AE9B0FAB0B0597D), + SPH_C64(0x6A1BE983E9E9F2CF), SPH_C64(0x33780F3C0F0F771E), + SPH_C64(0xA6E6D573D5D533B7), SPH_C64(0xBA74803A8080F41D), + SPH_C64(0x7C99BEC2BEBE2761), SPH_C64(0xDE26CD13CDCDEB87), + SPH_C64(0xE4BD34D034348968), SPH_C64(0x757A483D48483290), + SPH_C64(0x24ABFFDBFFFF54E3), SPH_C64(0x8FF77AF57A7A8DF4), + SPH_C64(0xEAF4907A9090643D), SPH_C64(0x3EC25F615F5F9DBE), + SPH_C64(0xA01D208020203D40), SPH_C64(0xD56768BD68680FD0), + SPH_C64(0x72D01A681A1ACA34), SPH_C64(0x2C19AE82AEAEB741), + SPH_C64(0x5EC9B4EAB4B47D75), SPH_C64(0x199A544D5454CEA8), + SPH_C64(0xE5EC937693937F3B), SPH_C64(0xAA0D228822222F44), + SPH_C64(0xE907648D646463C8), SPH_C64(0x12DBF1E3F1F12AFF), + SPH_C64(0xA2BF73D17373CCE6), SPH_C64(0x5A90124812128224), + SPH_C64(0x5D3A401D40407A80), SPH_C64(0x2840082008084810), + SPH_C64(0xE856C32BC3C3959B), SPH_C64(0x7B33EC97ECECDFC5), + SPH_C64(0x9096DB4BDBDB4DAB), SPH_C64(0x1F61A1BEA1A1C05F), + SPH_C64(0x831C8D0E8D8D9107), SPH_C64(0xC9F53DF43D3DC87A), + SPH_C64(0xF1CC976697975B33), SPH_C64(0x0000000000000000), + SPH_C64(0xD436CF1BCFCFF983), SPH_C64(0x87452BAC2B2B6E56), + SPH_C64(0xB39776C57676E1EC), SPH_C64(0xB06482328282E619), + 
SPH_C64(0xA9FED67FD6D628B1), SPH_C64(0x77D81B6C1B1BC336), + SPH_C64(0x5BC1B5EEB5B57477), SPH_C64(0x2911AF86AFAFBE43), + SPH_C64(0xDF776AB56A6A1DD4), SPH_C64(0x0DBA505D5050EAA0), + SPH_C64(0x4C1245094545578A), SPH_C64(0x18CBF3EBF3F338FB), + SPH_C64(0xF09D30C03030AD60), SPH_C64(0x742BEF9BEFEFC4C3), + SPH_C64(0xC3E53FFC3F3FDA7E), SPH_C64(0x1C9255495555C7AA), + SPH_C64(0x1079A2B2A2A2DB59), SPH_C64(0x6503EA8FEAEAE9C9), + SPH_C64(0xEC0F658965656ACA), SPH_C64(0x68B9BAD2BABA0369), + SPH_C64(0x93652FBC2F2F4A5E), SPH_C64(0xE74EC027C0C08E9D), + SPH_C64(0x81BEDE5FDEDE60A1), SPH_C64(0x6CE01C701C1CFC38), + SPH_C64(0x2EBBFDD3FDFD46E7), SPH_C64(0x64524D294D4D1F9A), + SPH_C64(0xE0E4927292927639), SPH_C64(0xBC8F75C97575FAEA), + SPH_C64(0x1E3006180606360C), SPH_C64(0x98248A128A8AAE09), + SPH_C64(0x40F9B2F2B2B24B79), SPH_C64(0x5963E6BFE6E685D1), + SPH_C64(0x36700E380E0E7E1C), SPH_C64(0x63F81F7C1F1FE73E), + SPH_C64(0xF7376295626255C4), SPH_C64(0xA3EED477D4D43AB5), + SPH_C64(0x3229A89AA8A8814D), SPH_C64(0xF4C4966296965231), + SPH_C64(0x3A9BF9C3F9F962EF), SPH_C64(0xF666C533C5C5A397), + SPH_C64(0xB13525942525104A), SPH_C64(0x20F259795959ABB2), + SPH_C64(0xAE54842A8484D015), SPH_C64(0xA7B772D57272C5E4), + SPH_C64(0xDDD539E43939EC72), SPH_C64(0x615A4C2D4C4C1698), + SPH_C64(0x3BCA5E655E5E94BC), SPH_C64(0x85E778FD78789FF0), + SPH_C64(0xD8DD38E03838E570), SPH_C64(0x86148C0A8C8C9805), + SPH_C64(0xB2C6D163D1D117BF), SPH_C64(0x0B41A5AEA5A5E457), + SPH_C64(0x4D43E2AFE2E2A1D9), SPH_C64(0xF82F619961614EC2), + SPH_C64(0x45F1B3F6B3B3427B), SPH_C64(0xA515218421213442), + SPH_C64(0xD6949C4A9C9C0825), SPH_C64(0x66F01E781E1EEE3C), + SPH_C64(0x5222431143436186), SPH_C64(0xFC76C73BC7C7B193), + SPH_C64(0x2BB3FCD7FCFC4FE5), SPH_C64(0x1420041004042408), + SPH_C64(0x08B251595151E3A2), SPH_C64(0xC7BC995E9999252F), + SPH_C64(0xC44F6DA96D6D22DA), SPH_C64(0x39680D340D0D651A), + SPH_C64(0x3583FACFFAFA79E9), SPH_C64(0x84B6DF5BDFDF69A3), + SPH_C64(0x9BD77EE57E7EA9FC), SPH_C64(0xB43D249024241948), + SPH_C64(0xD7C53BEC3B3BFE76), SPH_C64(0x3D31AB96ABAB9A4B), + SPH_C64(0xD13ECE1FCECEF081), SPH_C64(0x5588114411119922), + SPH_C64(0x890C8F068F8F8303), SPH_C64(0x6B4A4E254E4E049C), + SPH_C64(0x51D1B7E6B7B76673), SPH_C64(0x600BEB8BEBEBE0CB), + SPH_C64(0xCCFD3CF03C3CC178), SPH_C64(0xBF7C813E8181FD1F), + SPH_C64(0xFED4946A94944035), SPH_C64(0x0CEBF7FBF7F71CF3), + SPH_C64(0x67A1B9DEB9B9186F), SPH_C64(0x5F98134C13138B26), + SPH_C64(0x9C7D2CB02C2C5158), SPH_C64(0xB8D6D36BD3D305BB), + SPH_C64(0x5C6BE7BBE7E78CD3), SPH_C64(0xCB576EA56E6E39DC), + SPH_C64(0xF36EC437C4C4AA95), SPH_C64(0x0F18030C03031B06), + SPH_C64(0x138A56455656DCAC), SPH_C64(0x491A440D44445E88), + SPH_C64(0x9EDF7FE17F7FA0FE), SPH_C64(0x3721A99EA9A9884F), + SPH_C64(0x824D2AA82A2A6754), SPH_C64(0x6DB1BBD6BBBB0A6B), + SPH_C64(0xE246C123C1C1879F), SPH_C64(0x02A253515353F1A6), + SPH_C64(0x8BAEDC57DCDC72A5), SPH_C64(0x27580B2C0B0B5316), + SPH_C64(0xD39C9D4E9D9D0127), SPH_C64(0xC1476CAD6C6C2BD8), + SPH_C64(0xF59531C43131A462), SPH_C64(0xB98774CD7474F3E8), + SPH_C64(0x09E3F6FFF6F615F1), SPH_C64(0x430A460546464C8C), + SPH_C64(0x2609AC8AACACA545), SPH_C64(0x973C891E8989B50F), + SPH_C64(0x44A014501414B428), SPH_C64(0x425BE1A3E1E1BADF), + SPH_C64(0x4EB016581616A62C), SPH_C64(0xD2CD3AE83A3AF774), + SPH_C64(0xD06F69B9696906D2), SPH_C64(0x2D48092409094112), + SPH_C64(0xADA770DD7070D7E0), SPH_C64(0x54D9B6E2B6B66F71), + SPH_C64(0xB7CED067D0D01EBD), SPH_C64(0x7E3BED93EDEDD6C7), + SPH_C64(0xDB2ECC17CCCCE285), SPH_C64(0x572A421542426884), + SPH_C64(0xC2B4985A98982C2D), SPH_C64(0x0E49A4AAA4A4ED55), + 
SPH_C64(0x885D28A028287550), SPH_C64(0x31DA5C6D5C5C86B8), + SPH_C64(0x3F93F8C7F8F86BED), SPH_C64(0xA44486228686C211) +}; + +static const sph_u64 plain_T3[256] = { + SPH_C64(0xC018601818D83078), SPH_C64(0x05238C23232646AF), + SPH_C64(0x7EC63FC6C6B891F9), SPH_C64(0x13E887E8E8FBCD6F), + SPH_C64(0x4C87268787CB13A1), SPH_C64(0xA9B8DAB8B8116D62), + SPH_C64(0x0801040101090205), SPH_C64(0x424F214F4F0D9E6E), + SPH_C64(0xAD36D836369B6CEE), SPH_C64(0x59A6A2A6A6FF5104), + SPH_C64(0xDED26FD2D20CB9BD), SPH_C64(0xFBF5F3F5F50EF706), + SPH_C64(0xEF79F9797996F280), SPH_C64(0x5F6FA16F6F30DECE), + SPH_C64(0xFC917E91916D3FEF), SPH_C64(0xAA52555252F8A407), + SPH_C64(0x27609D606047C0FD), SPH_C64(0x89BCCABCBC356576), + SPH_C64(0xAC9B569B9B372BCD), SPH_C64(0x048E028E8E8A018C), + SPH_C64(0x71A3B6A3A3D25B15), SPH_C64(0x600C300C0C6C183C), + SPH_C64(0xFF7BF17B7B84F68A), SPH_C64(0xB535D43535806AE1), + SPH_C64(0xE81D741D1DF53A69), SPH_C64(0x53E0A7E0E0B3DD47), + SPH_C64(0xF6D77BD7D721B3AC), SPH_C64(0x5EC22FC2C29C99ED), + SPH_C64(0x6D2EB82E2E435C96), SPH_C64(0x624B314B4B29967A), + SPH_C64(0xA3FEDFFEFE5DE121), SPH_C64(0x8257415757D5AE16), + SPH_C64(0xA815541515BD2A41), SPH_C64(0x9F77C17777E8EEB6), + SPH_C64(0xA537DC3737926EEB), SPH_C64(0x7BE5B3E5E59ED756), + SPH_C64(0x8C9F469F9F1323D9), SPH_C64(0xD3F0E7F0F023FD17), + SPH_C64(0x6A4A354A4A20947F), SPH_C64(0x9EDA4FDADA44A995), + SPH_C64(0xFA587D5858A2B025), SPH_C64(0x06C903C9C9CF8FCA), + SPH_C64(0x5529A429297C528D), SPH_C64(0x500A280A0A5A1422), + SPH_C64(0xE1B1FEB1B1507F4F), SPH_C64(0x69A0BAA0A0C95D1A), + SPH_C64(0x7F6BB16B6B14D6DA), SPH_C64(0x5C852E8585D917AB), + SPH_C64(0x81BDCEBDBD3C6773), SPH_C64(0xD25D695D5D8FBA34), + SPH_C64(0x8010401010902050), SPH_C64(0xF3F4F7F4F407F503), + SPH_C64(0x16CB0BCBCBDD8BC0), SPH_C64(0xED3EF83E3ED37CC6), + SPH_C64(0x28051405052D0A11), SPH_C64(0x1F6781676778CEE6), + SPH_C64(0x73E4B7E4E497D553), SPH_C64(0x25279C2727024EBB), + SPH_C64(0x3241194141738258), SPH_C64(0x2C8B168B8BA70B9D), + SPH_C64(0x51A7A6A7A7F65301), SPH_C64(0xCF7DE97D7DB2FA94), + SPH_C64(0xDC956E95954937FB), SPH_C64(0x8ED847D8D856AD9F), + SPH_C64(0x8BFBCBFBFB70EB30), SPH_C64(0x23EE9FEEEECDC171), + SPH_C64(0xC77CED7C7CBBF891), SPH_C64(0x176685666671CCE3), + SPH_C64(0xA6DD53DDDD7BA78E), SPH_C64(0xB8175C1717AF2E4B), + SPH_C64(0x0247014747458E46), SPH_C64(0x849E429E9E1A21DC), + SPH_C64(0x1ECA0FCACAD489C5), SPH_C64(0x752DB42D2D585A99), + SPH_C64(0x91BFC6BFBF2E6379), SPH_C64(0x38071C07073F0E1B), + SPH_C64(0x01AD8EADADAC4723), SPH_C64(0xEA5A755A5AB0B42F), + SPH_C64(0x6C83368383EF1BB5), SPH_C64(0x8533CC3333B666FF), + SPH_C64(0x3F639163635CC6F2), SPH_C64(0x100208020212040A), + SPH_C64(0x39AA92AAAA934938), SPH_C64(0xAF71D97171DEE2A8), + SPH_C64(0x0EC807C8C8C68DCF), SPH_C64(0xC819641919D1327D), + SPH_C64(0x72493949493B9270), SPH_C64(0x86D943D9D95FAF9A), + SPH_C64(0xC3F2EFF2F231F91D), SPH_C64(0x4BE3ABE3E3A8DB48), + SPH_C64(0xE25B715B5BB9B62A), SPH_C64(0x34881A8888BC0D92), + SPH_C64(0xA49A529A9A3E29C8), SPH_C64(0x2D269826260B4CBE), + SPH_C64(0x8D32C83232BF64FA), SPH_C64(0xE9B0FAB0B0597D4A), + SPH_C64(0x1BE983E9E9F2CF6A), SPH_C64(0x780F3C0F0F771E33), + SPH_C64(0xE6D573D5D533B7A6), SPH_C64(0x74803A8080F41DBA), + SPH_C64(0x99BEC2BEBE27617C), SPH_C64(0x26CD13CDCDEB87DE), + SPH_C64(0xBD34D034348968E4), SPH_C64(0x7A483D4848329075), + SPH_C64(0xABFFDBFFFF54E324), SPH_C64(0xF77AF57A7A8DF48F), + SPH_C64(0xF4907A9090643DEA), SPH_C64(0xC25F615F5F9DBE3E), + SPH_C64(0x1D208020203D40A0), SPH_C64(0x6768BD68680FD0D5), + SPH_C64(0xD01A681A1ACA3472), SPH_C64(0x19AE82AEAEB7412C), + SPH_C64(0xC9B4EAB4B47D755E), 
SPH_C64(0x9A544D5454CEA819), + SPH_C64(0xEC937693937F3BE5), SPH_C64(0x0D228822222F44AA), + SPH_C64(0x07648D646463C8E9), SPH_C64(0xDBF1E3F1F12AFF12), + SPH_C64(0xBF73D17373CCE6A2), SPH_C64(0x901248121282245A), + SPH_C64(0x3A401D40407A805D), SPH_C64(0x4008200808481028), + SPH_C64(0x56C32BC3C3959BE8), SPH_C64(0x33EC97ECECDFC57B), + SPH_C64(0x96DB4BDBDB4DAB90), SPH_C64(0x61A1BEA1A1C05F1F), + SPH_C64(0x1C8D0E8D8D910783), SPH_C64(0xF53DF43D3DC87AC9), + SPH_C64(0xCC976697975B33F1), SPH_C64(0x0000000000000000), + SPH_C64(0x36CF1BCFCFF983D4), SPH_C64(0x452BAC2B2B6E5687), + SPH_C64(0x9776C57676E1ECB3), SPH_C64(0x6482328282E619B0), + SPH_C64(0xFED67FD6D628B1A9), SPH_C64(0xD81B6C1B1BC33677), + SPH_C64(0xC1B5EEB5B574775B), SPH_C64(0x11AF86AFAFBE4329), + SPH_C64(0x776AB56A6A1DD4DF), SPH_C64(0xBA505D5050EAA00D), + SPH_C64(0x1245094545578A4C), SPH_C64(0xCBF3EBF3F338FB18), + SPH_C64(0x9D30C03030AD60F0), SPH_C64(0x2BEF9BEFEFC4C374), + SPH_C64(0xE53FFC3F3FDA7EC3), SPH_C64(0x9255495555C7AA1C), + SPH_C64(0x79A2B2A2A2DB5910), SPH_C64(0x03EA8FEAEAE9C965), + SPH_C64(0x0F658965656ACAEC), SPH_C64(0xB9BAD2BABA036968), + SPH_C64(0x652FBC2F2F4A5E93), SPH_C64(0x4EC027C0C08E9DE7), + SPH_C64(0xBEDE5FDEDE60A181), SPH_C64(0xE01C701C1CFC386C), + SPH_C64(0xBBFDD3FDFD46E72E), SPH_C64(0x524D294D4D1F9A64), + SPH_C64(0xE4927292927639E0), SPH_C64(0x8F75C97575FAEABC), + SPH_C64(0x3006180606360C1E), SPH_C64(0x248A128A8AAE0998), + SPH_C64(0xF9B2F2B2B24B7940), SPH_C64(0x63E6BFE6E685D159), + SPH_C64(0x700E380E0E7E1C36), SPH_C64(0xF81F7C1F1FE73E63), + SPH_C64(0x376295626255C4F7), SPH_C64(0xEED477D4D43AB5A3), + SPH_C64(0x29A89AA8A8814D32), SPH_C64(0xC4966296965231F4), + SPH_C64(0x9BF9C3F9F962EF3A), SPH_C64(0x66C533C5C5A397F6), + SPH_C64(0x3525942525104AB1), SPH_C64(0xF259795959ABB220), + SPH_C64(0x54842A8484D015AE), SPH_C64(0xB772D57272C5E4A7), + SPH_C64(0xD539E43939EC72DD), SPH_C64(0x5A4C2D4C4C169861), + SPH_C64(0xCA5E655E5E94BC3B), SPH_C64(0xE778FD78789FF085), + SPH_C64(0xDD38E03838E570D8), SPH_C64(0x148C0A8C8C980586), + SPH_C64(0xC6D163D1D117BFB2), SPH_C64(0x41A5AEA5A5E4570B), + SPH_C64(0x43E2AFE2E2A1D94D), SPH_C64(0x2F619961614EC2F8), + SPH_C64(0xF1B3F6B3B3427B45), SPH_C64(0x15218421213442A5), + SPH_C64(0x949C4A9C9C0825D6), SPH_C64(0xF01E781E1EEE3C66), + SPH_C64(0x2243114343618652), SPH_C64(0x76C73BC7C7B193FC), + SPH_C64(0xB3FCD7FCFC4FE52B), SPH_C64(0x2004100404240814), + SPH_C64(0xB251595151E3A208), SPH_C64(0xBC995E9999252FC7), + SPH_C64(0x4F6DA96D6D22DAC4), SPH_C64(0x680D340D0D651A39), + SPH_C64(0x83FACFFAFA79E935), SPH_C64(0xB6DF5BDFDF69A384), + SPH_C64(0xD77EE57E7EA9FC9B), SPH_C64(0x3D249024241948B4), + SPH_C64(0xC53BEC3B3BFE76D7), SPH_C64(0x31AB96ABAB9A4B3D), + SPH_C64(0x3ECE1FCECEF081D1), SPH_C64(0x8811441111992255), + SPH_C64(0x0C8F068F8F830389), SPH_C64(0x4A4E254E4E049C6B), + SPH_C64(0xD1B7E6B7B7667351), SPH_C64(0x0BEB8BEBEBE0CB60), + SPH_C64(0xFD3CF03C3CC178CC), SPH_C64(0x7C813E8181FD1FBF), + SPH_C64(0xD4946A94944035FE), SPH_C64(0xEBF7FBF7F71CF30C), + SPH_C64(0xA1B9DEB9B9186F67), SPH_C64(0x98134C13138B265F), + SPH_C64(0x7D2CB02C2C51589C), SPH_C64(0xD6D36BD3D305BBB8), + SPH_C64(0x6BE7BBE7E78CD35C), SPH_C64(0x576EA56E6E39DCCB), + SPH_C64(0x6EC437C4C4AA95F3), SPH_C64(0x18030C03031B060F), + SPH_C64(0x8A56455656DCAC13), SPH_C64(0x1A440D44445E8849), + SPH_C64(0xDF7FE17F7FA0FE9E), SPH_C64(0x21A99EA9A9884F37), + SPH_C64(0x4D2AA82A2A675482), SPH_C64(0xB1BBD6BBBB0A6B6D), + SPH_C64(0x46C123C1C1879FE2), SPH_C64(0xA253515353F1A602), + SPH_C64(0xAEDC57DCDC72A58B), SPH_C64(0x580B2C0B0B531627), + SPH_C64(0x9C9D4E9D9D0127D3), 
SPH_C64(0x476CAD6C6C2BD8C1), + SPH_C64(0x9531C43131A462F5), SPH_C64(0x8774CD7474F3E8B9), + SPH_C64(0xE3F6FFF6F615F109), SPH_C64(0x0A460546464C8C43), + SPH_C64(0x09AC8AACACA54526), SPH_C64(0x3C891E8989B50F97), + SPH_C64(0xA014501414B42844), SPH_C64(0x5BE1A3E1E1BADF42), + SPH_C64(0xB016581616A62C4E), SPH_C64(0xCD3AE83A3AF774D2), + SPH_C64(0x6F69B9696906D2D0), SPH_C64(0x480924090941122D), + SPH_C64(0xA770DD7070D7E0AD), SPH_C64(0xD9B6E2B6B66F7154), + SPH_C64(0xCED067D0D01EBDB7), SPH_C64(0x3BED93EDEDD6C77E), + SPH_C64(0x2ECC17CCCCE285DB), SPH_C64(0x2A42154242688457), + SPH_C64(0xB4985A98982C2DC2), SPH_C64(0x49A4AAA4A4ED550E), + SPH_C64(0x5D28A02828755088), SPH_C64(0xDA5C6D5C5C86B831), + SPH_C64(0x93F8C7F8F86BED3F), SPH_C64(0x4486228686C211A4) +}; + +static const sph_u64 plain_T4[256] = { + SPH_C64(0x18601818D83078C0), SPH_C64(0x238C23232646AF05), + SPH_C64(0xC63FC6C6B891F97E), SPH_C64(0xE887E8E8FBCD6F13), + SPH_C64(0x87268787CB13A14C), SPH_C64(0xB8DAB8B8116D62A9), + SPH_C64(0x0104010109020508), SPH_C64(0x4F214F4F0D9E6E42), + SPH_C64(0x36D836369B6CEEAD), SPH_C64(0xA6A2A6A6FF510459), + SPH_C64(0xD26FD2D20CB9BDDE), SPH_C64(0xF5F3F5F50EF706FB), + SPH_C64(0x79F9797996F280EF), SPH_C64(0x6FA16F6F30DECE5F), + SPH_C64(0x917E91916D3FEFFC), SPH_C64(0x52555252F8A407AA), + SPH_C64(0x609D606047C0FD27), SPH_C64(0xBCCABCBC35657689), + SPH_C64(0x9B569B9B372BCDAC), SPH_C64(0x8E028E8E8A018C04), + SPH_C64(0xA3B6A3A3D25B1571), SPH_C64(0x0C300C0C6C183C60), + SPH_C64(0x7BF17B7B84F68AFF), SPH_C64(0x35D43535806AE1B5), + SPH_C64(0x1D741D1DF53A69E8), SPH_C64(0xE0A7E0E0B3DD4753), + SPH_C64(0xD77BD7D721B3ACF6), SPH_C64(0xC22FC2C29C99ED5E), + SPH_C64(0x2EB82E2E435C966D), SPH_C64(0x4B314B4B29967A62), + SPH_C64(0xFEDFFEFE5DE121A3), SPH_C64(0x57415757D5AE1682), + SPH_C64(0x15541515BD2A41A8), SPH_C64(0x77C17777E8EEB69F), + SPH_C64(0x37DC3737926EEBA5), SPH_C64(0xE5B3E5E59ED7567B), + SPH_C64(0x9F469F9F1323D98C), SPH_C64(0xF0E7F0F023FD17D3), + SPH_C64(0x4A354A4A20947F6A), SPH_C64(0xDA4FDADA44A9959E), + SPH_C64(0x587D5858A2B025FA), SPH_C64(0xC903C9C9CF8FCA06), + SPH_C64(0x29A429297C528D55), SPH_C64(0x0A280A0A5A142250), + SPH_C64(0xB1FEB1B1507F4FE1), SPH_C64(0xA0BAA0A0C95D1A69), + SPH_C64(0x6BB16B6B14D6DA7F), SPH_C64(0x852E8585D917AB5C), + SPH_C64(0xBDCEBDBD3C677381), SPH_C64(0x5D695D5D8FBA34D2), + SPH_C64(0x1040101090205080), SPH_C64(0xF4F7F4F407F503F3), + SPH_C64(0xCB0BCBCBDD8BC016), SPH_C64(0x3EF83E3ED37CC6ED), + SPH_C64(0x051405052D0A1128), SPH_C64(0x6781676778CEE61F), + SPH_C64(0xE4B7E4E497D55373), SPH_C64(0x279C2727024EBB25), + SPH_C64(0x4119414173825832), SPH_C64(0x8B168B8BA70B9D2C), + SPH_C64(0xA7A6A7A7F6530151), SPH_C64(0x7DE97D7DB2FA94CF), + SPH_C64(0x956E95954937FBDC), SPH_C64(0xD847D8D856AD9F8E), + SPH_C64(0xFBCBFBFB70EB308B), SPH_C64(0xEE9FEEEECDC17123), + SPH_C64(0x7CED7C7CBBF891C7), SPH_C64(0x6685666671CCE317), + SPH_C64(0xDD53DDDD7BA78EA6), SPH_C64(0x175C1717AF2E4BB8), + SPH_C64(0x47014747458E4602), SPH_C64(0x9E429E9E1A21DC84), + SPH_C64(0xCA0FCACAD489C51E), SPH_C64(0x2DB42D2D585A9975), + SPH_C64(0xBFC6BFBF2E637991), SPH_C64(0x071C07073F0E1B38), + SPH_C64(0xAD8EADADAC472301), SPH_C64(0x5A755A5AB0B42FEA), + SPH_C64(0x83368383EF1BB56C), SPH_C64(0x33CC3333B666FF85), + SPH_C64(0x639163635CC6F23F), SPH_C64(0x0208020212040A10), + SPH_C64(0xAA92AAAA93493839), SPH_C64(0x71D97171DEE2A8AF), + SPH_C64(0xC807C8C8C68DCF0E), SPH_C64(0x19641919D1327DC8), + SPH_C64(0x493949493B927072), SPH_C64(0xD943D9D95FAF9A86), + SPH_C64(0xF2EFF2F231F91DC3), SPH_C64(0xE3ABE3E3A8DB484B), + SPH_C64(0x5B715B5BB9B62AE2), SPH_C64(0x881A8888BC0D9234), 
+ SPH_C64(0x9A529A9A3E29C8A4), SPH_C64(0x269826260B4CBE2D), + SPH_C64(0x32C83232BF64FA8D), SPH_C64(0xB0FAB0B0597D4AE9), + SPH_C64(0xE983E9E9F2CF6A1B), SPH_C64(0x0F3C0F0F771E3378), + SPH_C64(0xD573D5D533B7A6E6), SPH_C64(0x803A8080F41DBA74), + SPH_C64(0xBEC2BEBE27617C99), SPH_C64(0xCD13CDCDEB87DE26), + SPH_C64(0x34D034348968E4BD), SPH_C64(0x483D48483290757A), + SPH_C64(0xFFDBFFFF54E324AB), SPH_C64(0x7AF57A7A8DF48FF7), + SPH_C64(0x907A9090643DEAF4), SPH_C64(0x5F615F5F9DBE3EC2), + SPH_C64(0x208020203D40A01D), SPH_C64(0x68BD68680FD0D567), + SPH_C64(0x1A681A1ACA3472D0), SPH_C64(0xAE82AEAEB7412C19), + SPH_C64(0xB4EAB4B47D755EC9), SPH_C64(0x544D5454CEA8199A), + SPH_C64(0x937693937F3BE5EC), SPH_C64(0x228822222F44AA0D), + SPH_C64(0x648D646463C8E907), SPH_C64(0xF1E3F1F12AFF12DB), + SPH_C64(0x73D17373CCE6A2BF), SPH_C64(0x1248121282245A90), + SPH_C64(0x401D40407A805D3A), SPH_C64(0x0820080848102840), + SPH_C64(0xC32BC3C3959BE856), SPH_C64(0xEC97ECECDFC57B33), + SPH_C64(0xDB4BDBDB4DAB9096), SPH_C64(0xA1BEA1A1C05F1F61), + SPH_C64(0x8D0E8D8D9107831C), SPH_C64(0x3DF43D3DC87AC9F5), + SPH_C64(0x976697975B33F1CC), SPH_C64(0x0000000000000000), + SPH_C64(0xCF1BCFCFF983D436), SPH_C64(0x2BAC2B2B6E568745), + SPH_C64(0x76C57676E1ECB397), SPH_C64(0x82328282E619B064), + SPH_C64(0xD67FD6D628B1A9FE), SPH_C64(0x1B6C1B1BC33677D8), + SPH_C64(0xB5EEB5B574775BC1), SPH_C64(0xAF86AFAFBE432911), + SPH_C64(0x6AB56A6A1DD4DF77), SPH_C64(0x505D5050EAA00DBA), + SPH_C64(0x45094545578A4C12), SPH_C64(0xF3EBF3F338FB18CB), + SPH_C64(0x30C03030AD60F09D), SPH_C64(0xEF9BEFEFC4C3742B), + SPH_C64(0x3FFC3F3FDA7EC3E5), SPH_C64(0x55495555C7AA1C92), + SPH_C64(0xA2B2A2A2DB591079), SPH_C64(0xEA8FEAEAE9C96503), + SPH_C64(0x658965656ACAEC0F), SPH_C64(0xBAD2BABA036968B9), + SPH_C64(0x2FBC2F2F4A5E9365), SPH_C64(0xC027C0C08E9DE74E), + SPH_C64(0xDE5FDEDE60A181BE), SPH_C64(0x1C701C1CFC386CE0), + SPH_C64(0xFDD3FDFD46E72EBB), SPH_C64(0x4D294D4D1F9A6452), + SPH_C64(0x927292927639E0E4), SPH_C64(0x75C97575FAEABC8F), + SPH_C64(0x06180606360C1E30), SPH_C64(0x8A128A8AAE099824), + SPH_C64(0xB2F2B2B24B7940F9), SPH_C64(0xE6BFE6E685D15963), + SPH_C64(0x0E380E0E7E1C3670), SPH_C64(0x1F7C1F1FE73E63F8), + SPH_C64(0x6295626255C4F737), SPH_C64(0xD477D4D43AB5A3EE), + SPH_C64(0xA89AA8A8814D3229), SPH_C64(0x966296965231F4C4), + SPH_C64(0xF9C3F9F962EF3A9B), SPH_C64(0xC533C5C5A397F666), + SPH_C64(0x25942525104AB135), SPH_C64(0x59795959ABB220F2), + SPH_C64(0x842A8484D015AE54), SPH_C64(0x72D57272C5E4A7B7), + SPH_C64(0x39E43939EC72DDD5), SPH_C64(0x4C2D4C4C1698615A), + SPH_C64(0x5E655E5E94BC3BCA), SPH_C64(0x78FD78789FF085E7), + SPH_C64(0x38E03838E570D8DD), SPH_C64(0x8C0A8C8C98058614), + SPH_C64(0xD163D1D117BFB2C6), SPH_C64(0xA5AEA5A5E4570B41), + SPH_C64(0xE2AFE2E2A1D94D43), SPH_C64(0x619961614EC2F82F), + SPH_C64(0xB3F6B3B3427B45F1), SPH_C64(0x218421213442A515), + SPH_C64(0x9C4A9C9C0825D694), SPH_C64(0x1E781E1EEE3C66F0), + SPH_C64(0x4311434361865222), SPH_C64(0xC73BC7C7B193FC76), + SPH_C64(0xFCD7FCFC4FE52BB3), SPH_C64(0x0410040424081420), + SPH_C64(0x51595151E3A208B2), SPH_C64(0x995E9999252FC7BC), + SPH_C64(0x6DA96D6D22DAC44F), SPH_C64(0x0D340D0D651A3968), + SPH_C64(0xFACFFAFA79E93583), SPH_C64(0xDF5BDFDF69A384B6), + SPH_C64(0x7EE57E7EA9FC9BD7), SPH_C64(0x249024241948B43D), + SPH_C64(0x3BEC3B3BFE76D7C5), SPH_C64(0xAB96ABAB9A4B3D31), + SPH_C64(0xCE1FCECEF081D13E), SPH_C64(0x1144111199225588), + SPH_C64(0x8F068F8F8303890C), SPH_C64(0x4E254E4E049C6B4A), + SPH_C64(0xB7E6B7B7667351D1), SPH_C64(0xEB8BEBEBE0CB600B), + SPH_C64(0x3CF03C3CC178CCFD), SPH_C64(0x813E8181FD1FBF7C), + 
SPH_C64(0x946A94944035FED4), SPH_C64(0xF7FBF7F71CF30CEB), + SPH_C64(0xB9DEB9B9186F67A1), SPH_C64(0x134C13138B265F98), + SPH_C64(0x2CB02C2C51589C7D), SPH_C64(0xD36BD3D305BBB8D6), + SPH_C64(0xE7BBE7E78CD35C6B), SPH_C64(0x6EA56E6E39DCCB57), + SPH_C64(0xC437C4C4AA95F36E), SPH_C64(0x030C03031B060F18), + SPH_C64(0x56455656DCAC138A), SPH_C64(0x440D44445E88491A), + SPH_C64(0x7FE17F7FA0FE9EDF), SPH_C64(0xA99EA9A9884F3721), + SPH_C64(0x2AA82A2A6754824D), SPH_C64(0xBBD6BBBB0A6B6DB1), + SPH_C64(0xC123C1C1879FE246), SPH_C64(0x53515353F1A602A2), + SPH_C64(0xDC57DCDC72A58BAE), SPH_C64(0x0B2C0B0B53162758), + SPH_C64(0x9D4E9D9D0127D39C), SPH_C64(0x6CAD6C6C2BD8C147), + SPH_C64(0x31C43131A462F595), SPH_C64(0x74CD7474F3E8B987), + SPH_C64(0xF6FFF6F615F109E3), SPH_C64(0x460546464C8C430A), + SPH_C64(0xAC8AACACA5452609), SPH_C64(0x891E8989B50F973C), + SPH_C64(0x14501414B42844A0), SPH_C64(0xE1A3E1E1BADF425B), + SPH_C64(0x16581616A62C4EB0), SPH_C64(0x3AE83A3AF774D2CD), + SPH_C64(0x69B9696906D2D06F), SPH_C64(0x0924090941122D48), + SPH_C64(0x70DD7070D7E0ADA7), SPH_C64(0xB6E2B6B66F7154D9), + SPH_C64(0xD067D0D01EBDB7CE), SPH_C64(0xED93EDEDD6C77E3B), + SPH_C64(0xCC17CCCCE285DB2E), SPH_C64(0x421542426884572A), + SPH_C64(0x985A98982C2DC2B4), SPH_C64(0xA4AAA4A4ED550E49), + SPH_C64(0x28A028287550885D), SPH_C64(0x5C6D5C5C86B831DA), + SPH_C64(0xF8C7F8F86BED3F93), SPH_C64(0x86228686C211A444) +}; + +static const sph_u64 plain_T5[256] = { + SPH_C64(0x601818D83078C018), SPH_C64(0x8C23232646AF0523), + SPH_C64(0x3FC6C6B891F97EC6), SPH_C64(0x87E8E8FBCD6F13E8), + SPH_C64(0x268787CB13A14C87), SPH_C64(0xDAB8B8116D62A9B8), + SPH_C64(0x0401010902050801), SPH_C64(0x214F4F0D9E6E424F), + SPH_C64(0xD836369B6CEEAD36), SPH_C64(0xA2A6A6FF510459A6), + SPH_C64(0x6FD2D20CB9BDDED2), SPH_C64(0xF3F5F50EF706FBF5), + SPH_C64(0xF9797996F280EF79), SPH_C64(0xA16F6F30DECE5F6F), + SPH_C64(0x7E91916D3FEFFC91), SPH_C64(0x555252F8A407AA52), + SPH_C64(0x9D606047C0FD2760), SPH_C64(0xCABCBC35657689BC), + SPH_C64(0x569B9B372BCDAC9B), SPH_C64(0x028E8E8A018C048E), + SPH_C64(0xB6A3A3D25B1571A3), SPH_C64(0x300C0C6C183C600C), + SPH_C64(0xF17B7B84F68AFF7B), SPH_C64(0xD43535806AE1B535), + SPH_C64(0x741D1DF53A69E81D), SPH_C64(0xA7E0E0B3DD4753E0), + SPH_C64(0x7BD7D721B3ACF6D7), SPH_C64(0x2FC2C29C99ED5EC2), + SPH_C64(0xB82E2E435C966D2E), SPH_C64(0x314B4B29967A624B), + SPH_C64(0xDFFEFE5DE121A3FE), SPH_C64(0x415757D5AE168257), + SPH_C64(0x541515BD2A41A815), SPH_C64(0xC17777E8EEB69F77), + SPH_C64(0xDC3737926EEBA537), SPH_C64(0xB3E5E59ED7567BE5), + SPH_C64(0x469F9F1323D98C9F), SPH_C64(0xE7F0F023FD17D3F0), + SPH_C64(0x354A4A20947F6A4A), SPH_C64(0x4FDADA44A9959EDA), + SPH_C64(0x7D5858A2B025FA58), SPH_C64(0x03C9C9CF8FCA06C9), + SPH_C64(0xA429297C528D5529), SPH_C64(0x280A0A5A1422500A), + SPH_C64(0xFEB1B1507F4FE1B1), SPH_C64(0xBAA0A0C95D1A69A0), + SPH_C64(0xB16B6B14D6DA7F6B), SPH_C64(0x2E8585D917AB5C85), + SPH_C64(0xCEBDBD3C677381BD), SPH_C64(0x695D5D8FBA34D25D), + SPH_C64(0x4010109020508010), SPH_C64(0xF7F4F407F503F3F4), + SPH_C64(0x0BCBCBDD8BC016CB), SPH_C64(0xF83E3ED37CC6ED3E), + SPH_C64(0x1405052D0A112805), SPH_C64(0x81676778CEE61F67), + SPH_C64(0xB7E4E497D55373E4), SPH_C64(0x9C2727024EBB2527), + SPH_C64(0x1941417382583241), SPH_C64(0x168B8BA70B9D2C8B), + SPH_C64(0xA6A7A7F6530151A7), SPH_C64(0xE97D7DB2FA94CF7D), + SPH_C64(0x6E95954937FBDC95), SPH_C64(0x47D8D856AD9F8ED8), + SPH_C64(0xCBFBFB70EB308BFB), SPH_C64(0x9FEEEECDC17123EE), + SPH_C64(0xED7C7CBBF891C77C), SPH_C64(0x85666671CCE31766), + SPH_C64(0x53DDDD7BA78EA6DD), SPH_C64(0x5C1717AF2E4BB817), + SPH_C64(0x014747458E460247), 
SPH_C64(0x429E9E1A21DC849E), + SPH_C64(0x0FCACAD489C51ECA), SPH_C64(0xB42D2D585A99752D), + SPH_C64(0xC6BFBF2E637991BF), SPH_C64(0x1C07073F0E1B3807), + SPH_C64(0x8EADADAC472301AD), SPH_C64(0x755A5AB0B42FEA5A), + SPH_C64(0x368383EF1BB56C83), SPH_C64(0xCC3333B666FF8533), + SPH_C64(0x9163635CC6F23F63), SPH_C64(0x08020212040A1002), + SPH_C64(0x92AAAA93493839AA), SPH_C64(0xD97171DEE2A8AF71), + SPH_C64(0x07C8C8C68DCF0EC8), SPH_C64(0x641919D1327DC819), + SPH_C64(0x3949493B92707249), SPH_C64(0x43D9D95FAF9A86D9), + SPH_C64(0xEFF2F231F91DC3F2), SPH_C64(0xABE3E3A8DB484BE3), + SPH_C64(0x715B5BB9B62AE25B), SPH_C64(0x1A8888BC0D923488), + SPH_C64(0x529A9A3E29C8A49A), SPH_C64(0x9826260B4CBE2D26), + SPH_C64(0xC83232BF64FA8D32), SPH_C64(0xFAB0B0597D4AE9B0), + SPH_C64(0x83E9E9F2CF6A1BE9), SPH_C64(0x3C0F0F771E33780F), + SPH_C64(0x73D5D533B7A6E6D5), SPH_C64(0x3A8080F41DBA7480), + SPH_C64(0xC2BEBE27617C99BE), SPH_C64(0x13CDCDEB87DE26CD), + SPH_C64(0xD034348968E4BD34), SPH_C64(0x3D48483290757A48), + SPH_C64(0xDBFFFF54E324ABFF), SPH_C64(0xF57A7A8DF48FF77A), + SPH_C64(0x7A9090643DEAF490), SPH_C64(0x615F5F9DBE3EC25F), + SPH_C64(0x8020203D40A01D20), SPH_C64(0xBD68680FD0D56768), + SPH_C64(0x681A1ACA3472D01A), SPH_C64(0x82AEAEB7412C19AE), + SPH_C64(0xEAB4B47D755EC9B4), SPH_C64(0x4D5454CEA8199A54), + SPH_C64(0x7693937F3BE5EC93), SPH_C64(0x8822222F44AA0D22), + SPH_C64(0x8D646463C8E90764), SPH_C64(0xE3F1F12AFF12DBF1), + SPH_C64(0xD17373CCE6A2BF73), SPH_C64(0x48121282245A9012), + SPH_C64(0x1D40407A805D3A40), SPH_C64(0x2008084810284008), + SPH_C64(0x2BC3C3959BE856C3), SPH_C64(0x97ECECDFC57B33EC), + SPH_C64(0x4BDBDB4DAB9096DB), SPH_C64(0xBEA1A1C05F1F61A1), + SPH_C64(0x0E8D8D9107831C8D), SPH_C64(0xF43D3DC87AC9F53D), + SPH_C64(0x6697975B33F1CC97), SPH_C64(0x0000000000000000), + SPH_C64(0x1BCFCFF983D436CF), SPH_C64(0xAC2B2B6E5687452B), + SPH_C64(0xC57676E1ECB39776), SPH_C64(0x328282E619B06482), + SPH_C64(0x7FD6D628B1A9FED6), SPH_C64(0x6C1B1BC33677D81B), + SPH_C64(0xEEB5B574775BC1B5), SPH_C64(0x86AFAFBE432911AF), + SPH_C64(0xB56A6A1DD4DF776A), SPH_C64(0x5D5050EAA00DBA50), + SPH_C64(0x094545578A4C1245), SPH_C64(0xEBF3F338FB18CBF3), + SPH_C64(0xC03030AD60F09D30), SPH_C64(0x9BEFEFC4C3742BEF), + SPH_C64(0xFC3F3FDA7EC3E53F), SPH_C64(0x495555C7AA1C9255), + SPH_C64(0xB2A2A2DB591079A2), SPH_C64(0x8FEAEAE9C96503EA), + SPH_C64(0x8965656ACAEC0F65), SPH_C64(0xD2BABA036968B9BA), + SPH_C64(0xBC2F2F4A5E93652F), SPH_C64(0x27C0C08E9DE74EC0), + SPH_C64(0x5FDEDE60A181BEDE), SPH_C64(0x701C1CFC386CE01C), + SPH_C64(0xD3FDFD46E72EBBFD), SPH_C64(0x294D4D1F9A64524D), + SPH_C64(0x7292927639E0E492), SPH_C64(0xC97575FAEABC8F75), + SPH_C64(0x180606360C1E3006), SPH_C64(0x128A8AAE0998248A), + SPH_C64(0xF2B2B24B7940F9B2), SPH_C64(0xBFE6E685D15963E6), + SPH_C64(0x380E0E7E1C36700E), SPH_C64(0x7C1F1FE73E63F81F), + SPH_C64(0x95626255C4F73762), SPH_C64(0x77D4D43AB5A3EED4), + SPH_C64(0x9AA8A8814D3229A8), SPH_C64(0x6296965231F4C496), + SPH_C64(0xC3F9F962EF3A9BF9), SPH_C64(0x33C5C5A397F666C5), + SPH_C64(0x942525104AB13525), SPH_C64(0x795959ABB220F259), + SPH_C64(0x2A8484D015AE5484), SPH_C64(0xD57272C5E4A7B772), + SPH_C64(0xE43939EC72DDD539), SPH_C64(0x2D4C4C1698615A4C), + SPH_C64(0x655E5E94BC3BCA5E), SPH_C64(0xFD78789FF085E778), + SPH_C64(0xE03838E570D8DD38), SPH_C64(0x0A8C8C980586148C), + SPH_C64(0x63D1D117BFB2C6D1), SPH_C64(0xAEA5A5E4570B41A5), + SPH_C64(0xAFE2E2A1D94D43E2), SPH_C64(0x9961614EC2F82F61), + SPH_C64(0xF6B3B3427B45F1B3), SPH_C64(0x8421213442A51521), + SPH_C64(0x4A9C9C0825D6949C), SPH_C64(0x781E1EEE3C66F01E), + SPH_C64(0x1143436186522243), 
SPH_C64(0x3BC7C7B193FC76C7), + SPH_C64(0xD7FCFC4FE52BB3FC), SPH_C64(0x1004042408142004), + SPH_C64(0x595151E3A208B251), SPH_C64(0x5E9999252FC7BC99), + SPH_C64(0xA96D6D22DAC44F6D), SPH_C64(0x340D0D651A39680D), + SPH_C64(0xCFFAFA79E93583FA), SPH_C64(0x5BDFDF69A384B6DF), + SPH_C64(0xE57E7EA9FC9BD77E), SPH_C64(0x9024241948B43D24), + SPH_C64(0xEC3B3BFE76D7C53B), SPH_C64(0x96ABAB9A4B3D31AB), + SPH_C64(0x1FCECEF081D13ECE), SPH_C64(0x4411119922558811), + SPH_C64(0x068F8F8303890C8F), SPH_C64(0x254E4E049C6B4A4E), + SPH_C64(0xE6B7B7667351D1B7), SPH_C64(0x8BEBEBE0CB600BEB), + SPH_C64(0xF03C3CC178CCFD3C), SPH_C64(0x3E8181FD1FBF7C81), + SPH_C64(0x6A94944035FED494), SPH_C64(0xFBF7F71CF30CEBF7), + SPH_C64(0xDEB9B9186F67A1B9), SPH_C64(0x4C13138B265F9813), + SPH_C64(0xB02C2C51589C7D2C), SPH_C64(0x6BD3D305BBB8D6D3), + SPH_C64(0xBBE7E78CD35C6BE7), SPH_C64(0xA56E6E39DCCB576E), + SPH_C64(0x37C4C4AA95F36EC4), SPH_C64(0x0C03031B060F1803), + SPH_C64(0x455656DCAC138A56), SPH_C64(0x0D44445E88491A44), + SPH_C64(0xE17F7FA0FE9EDF7F), SPH_C64(0x9EA9A9884F3721A9), + SPH_C64(0xA82A2A6754824D2A), SPH_C64(0xD6BBBB0A6B6DB1BB), + SPH_C64(0x23C1C1879FE246C1), SPH_C64(0x515353F1A602A253), + SPH_C64(0x57DCDC72A58BAEDC), SPH_C64(0x2C0B0B531627580B), + SPH_C64(0x4E9D9D0127D39C9D), SPH_C64(0xAD6C6C2BD8C1476C), + SPH_C64(0xC43131A462F59531), SPH_C64(0xCD7474F3E8B98774), + SPH_C64(0xFFF6F615F109E3F6), SPH_C64(0x0546464C8C430A46), + SPH_C64(0x8AACACA5452609AC), SPH_C64(0x1E8989B50F973C89), + SPH_C64(0x501414B42844A014), SPH_C64(0xA3E1E1BADF425BE1), + SPH_C64(0x581616A62C4EB016), SPH_C64(0xE83A3AF774D2CD3A), + SPH_C64(0xB9696906D2D06F69), SPH_C64(0x24090941122D4809), + SPH_C64(0xDD7070D7E0ADA770), SPH_C64(0xE2B6B66F7154D9B6), + SPH_C64(0x67D0D01EBDB7CED0), SPH_C64(0x93EDEDD6C77E3BED), + SPH_C64(0x17CCCCE285DB2ECC), SPH_C64(0x1542426884572A42), + SPH_C64(0x5A98982C2DC2B498), SPH_C64(0xAAA4A4ED550E49A4), + SPH_C64(0xA028287550885D28), SPH_C64(0x6D5C5C86B831DA5C), + SPH_C64(0xC7F8F86BED3F93F8), SPH_C64(0x228686C211A44486) +}; + +static const sph_u64 plain_T6[256] = { + SPH_C64(0x1818D83078C01860), SPH_C64(0x23232646AF05238C), + SPH_C64(0xC6C6B891F97EC63F), SPH_C64(0xE8E8FBCD6F13E887), + SPH_C64(0x8787CB13A14C8726), SPH_C64(0xB8B8116D62A9B8DA), + SPH_C64(0x0101090205080104), SPH_C64(0x4F4F0D9E6E424F21), + SPH_C64(0x36369B6CEEAD36D8), SPH_C64(0xA6A6FF510459A6A2), + SPH_C64(0xD2D20CB9BDDED26F), SPH_C64(0xF5F50EF706FBF5F3), + SPH_C64(0x797996F280EF79F9), SPH_C64(0x6F6F30DECE5F6FA1), + SPH_C64(0x91916D3FEFFC917E), SPH_C64(0x5252F8A407AA5255), + SPH_C64(0x606047C0FD27609D), SPH_C64(0xBCBC35657689BCCA), + SPH_C64(0x9B9B372BCDAC9B56), SPH_C64(0x8E8E8A018C048E02), + SPH_C64(0xA3A3D25B1571A3B6), SPH_C64(0x0C0C6C183C600C30), + SPH_C64(0x7B7B84F68AFF7BF1), SPH_C64(0x3535806AE1B535D4), + SPH_C64(0x1D1DF53A69E81D74), SPH_C64(0xE0E0B3DD4753E0A7), + SPH_C64(0xD7D721B3ACF6D77B), SPH_C64(0xC2C29C99ED5EC22F), + SPH_C64(0x2E2E435C966D2EB8), SPH_C64(0x4B4B29967A624B31), + SPH_C64(0xFEFE5DE121A3FEDF), SPH_C64(0x5757D5AE16825741), + SPH_C64(0x1515BD2A41A81554), SPH_C64(0x7777E8EEB69F77C1), + SPH_C64(0x3737926EEBA537DC), SPH_C64(0xE5E59ED7567BE5B3), + SPH_C64(0x9F9F1323D98C9F46), SPH_C64(0xF0F023FD17D3F0E7), + SPH_C64(0x4A4A20947F6A4A35), SPH_C64(0xDADA44A9959EDA4F), + SPH_C64(0x5858A2B025FA587D), SPH_C64(0xC9C9CF8FCA06C903), + SPH_C64(0x29297C528D5529A4), SPH_C64(0x0A0A5A1422500A28), + SPH_C64(0xB1B1507F4FE1B1FE), SPH_C64(0xA0A0C95D1A69A0BA), + SPH_C64(0x6B6B14D6DA7F6BB1), SPH_C64(0x8585D917AB5C852E), + SPH_C64(0xBDBD3C677381BDCE), SPH_C64(0x5D5D8FBA34D25D69), 
+ SPH_C64(0x1010902050801040), SPH_C64(0xF4F407F503F3F4F7), + SPH_C64(0xCBCBDD8BC016CB0B), SPH_C64(0x3E3ED37CC6ED3EF8), + SPH_C64(0x05052D0A11280514), SPH_C64(0x676778CEE61F6781), + SPH_C64(0xE4E497D55373E4B7), SPH_C64(0x2727024EBB25279C), + SPH_C64(0x4141738258324119), SPH_C64(0x8B8BA70B9D2C8B16), + SPH_C64(0xA7A7F6530151A7A6), SPH_C64(0x7D7DB2FA94CF7DE9), + SPH_C64(0x95954937FBDC956E), SPH_C64(0xD8D856AD9F8ED847), + SPH_C64(0xFBFB70EB308BFBCB), SPH_C64(0xEEEECDC17123EE9F), + SPH_C64(0x7C7CBBF891C77CED), SPH_C64(0x666671CCE3176685), + SPH_C64(0xDDDD7BA78EA6DD53), SPH_C64(0x1717AF2E4BB8175C), + SPH_C64(0x4747458E46024701), SPH_C64(0x9E9E1A21DC849E42), + SPH_C64(0xCACAD489C51ECA0F), SPH_C64(0x2D2D585A99752DB4), + SPH_C64(0xBFBF2E637991BFC6), SPH_C64(0x07073F0E1B38071C), + SPH_C64(0xADADAC472301AD8E), SPH_C64(0x5A5AB0B42FEA5A75), + SPH_C64(0x8383EF1BB56C8336), SPH_C64(0x3333B666FF8533CC), + SPH_C64(0x63635CC6F23F6391), SPH_C64(0x020212040A100208), + SPH_C64(0xAAAA93493839AA92), SPH_C64(0x7171DEE2A8AF71D9), + SPH_C64(0xC8C8C68DCF0EC807), SPH_C64(0x1919D1327DC81964), + SPH_C64(0x49493B9270724939), SPH_C64(0xD9D95FAF9A86D943), + SPH_C64(0xF2F231F91DC3F2EF), SPH_C64(0xE3E3A8DB484BE3AB), + SPH_C64(0x5B5BB9B62AE25B71), SPH_C64(0x8888BC0D9234881A), + SPH_C64(0x9A9A3E29C8A49A52), SPH_C64(0x26260B4CBE2D2698), + SPH_C64(0x3232BF64FA8D32C8), SPH_C64(0xB0B0597D4AE9B0FA), + SPH_C64(0xE9E9F2CF6A1BE983), SPH_C64(0x0F0F771E33780F3C), + SPH_C64(0xD5D533B7A6E6D573), SPH_C64(0x8080F41DBA74803A), + SPH_C64(0xBEBE27617C99BEC2), SPH_C64(0xCDCDEB87DE26CD13), + SPH_C64(0x34348968E4BD34D0), SPH_C64(0x48483290757A483D), + SPH_C64(0xFFFF54E324ABFFDB), SPH_C64(0x7A7A8DF48FF77AF5), + SPH_C64(0x9090643DEAF4907A), SPH_C64(0x5F5F9DBE3EC25F61), + SPH_C64(0x20203D40A01D2080), SPH_C64(0x68680FD0D56768BD), + SPH_C64(0x1A1ACA3472D01A68), SPH_C64(0xAEAEB7412C19AE82), + SPH_C64(0xB4B47D755EC9B4EA), SPH_C64(0x5454CEA8199A544D), + SPH_C64(0x93937F3BE5EC9376), SPH_C64(0x22222F44AA0D2288), + SPH_C64(0x646463C8E907648D), SPH_C64(0xF1F12AFF12DBF1E3), + SPH_C64(0x7373CCE6A2BF73D1), SPH_C64(0x121282245A901248), + SPH_C64(0x40407A805D3A401D), SPH_C64(0x0808481028400820), + SPH_C64(0xC3C3959BE856C32B), SPH_C64(0xECECDFC57B33EC97), + SPH_C64(0xDBDB4DAB9096DB4B), SPH_C64(0xA1A1C05F1F61A1BE), + SPH_C64(0x8D8D9107831C8D0E), SPH_C64(0x3D3DC87AC9F53DF4), + SPH_C64(0x97975B33F1CC9766), SPH_C64(0x0000000000000000), + SPH_C64(0xCFCFF983D436CF1B), SPH_C64(0x2B2B6E5687452BAC), + SPH_C64(0x7676E1ECB39776C5), SPH_C64(0x8282E619B0648232), + SPH_C64(0xD6D628B1A9FED67F), SPH_C64(0x1B1BC33677D81B6C), + SPH_C64(0xB5B574775BC1B5EE), SPH_C64(0xAFAFBE432911AF86), + SPH_C64(0x6A6A1DD4DF776AB5), SPH_C64(0x5050EAA00DBA505D), + SPH_C64(0x4545578A4C124509), SPH_C64(0xF3F338FB18CBF3EB), + SPH_C64(0x3030AD60F09D30C0), SPH_C64(0xEFEFC4C3742BEF9B), + SPH_C64(0x3F3FDA7EC3E53FFC), SPH_C64(0x5555C7AA1C925549), + SPH_C64(0xA2A2DB591079A2B2), SPH_C64(0xEAEAE9C96503EA8F), + SPH_C64(0x65656ACAEC0F6589), SPH_C64(0xBABA036968B9BAD2), + SPH_C64(0x2F2F4A5E93652FBC), SPH_C64(0xC0C08E9DE74EC027), + SPH_C64(0xDEDE60A181BEDE5F), SPH_C64(0x1C1CFC386CE01C70), + SPH_C64(0xFDFD46E72EBBFDD3), SPH_C64(0x4D4D1F9A64524D29), + SPH_C64(0x92927639E0E49272), SPH_C64(0x7575FAEABC8F75C9), + SPH_C64(0x0606360C1E300618), SPH_C64(0x8A8AAE0998248A12), + SPH_C64(0xB2B24B7940F9B2F2), SPH_C64(0xE6E685D15963E6BF), + SPH_C64(0x0E0E7E1C36700E38), SPH_C64(0x1F1FE73E63F81F7C), + SPH_C64(0x626255C4F7376295), SPH_C64(0xD4D43AB5A3EED477), + SPH_C64(0xA8A8814D3229A89A), SPH_C64(0x96965231F4C49662), + 
SPH_C64(0xF9F962EF3A9BF9C3), SPH_C64(0xC5C5A397F666C533), + SPH_C64(0x2525104AB1352594), SPH_C64(0x5959ABB220F25979), + SPH_C64(0x8484D015AE54842A), SPH_C64(0x7272C5E4A7B772D5), + SPH_C64(0x3939EC72DDD539E4), SPH_C64(0x4C4C1698615A4C2D), + SPH_C64(0x5E5E94BC3BCA5E65), SPH_C64(0x78789FF085E778FD), + SPH_C64(0x3838E570D8DD38E0), SPH_C64(0x8C8C980586148C0A), + SPH_C64(0xD1D117BFB2C6D163), SPH_C64(0xA5A5E4570B41A5AE), + SPH_C64(0xE2E2A1D94D43E2AF), SPH_C64(0x61614EC2F82F6199), + SPH_C64(0xB3B3427B45F1B3F6), SPH_C64(0x21213442A5152184), + SPH_C64(0x9C9C0825D6949C4A), SPH_C64(0x1E1EEE3C66F01E78), + SPH_C64(0x4343618652224311), SPH_C64(0xC7C7B193FC76C73B), + SPH_C64(0xFCFC4FE52BB3FCD7), SPH_C64(0x0404240814200410), + SPH_C64(0x5151E3A208B25159), SPH_C64(0x9999252FC7BC995E), + SPH_C64(0x6D6D22DAC44F6DA9), SPH_C64(0x0D0D651A39680D34), + SPH_C64(0xFAFA79E93583FACF), SPH_C64(0xDFDF69A384B6DF5B), + SPH_C64(0x7E7EA9FC9BD77EE5), SPH_C64(0x24241948B43D2490), + SPH_C64(0x3B3BFE76D7C53BEC), SPH_C64(0xABAB9A4B3D31AB96), + SPH_C64(0xCECEF081D13ECE1F), SPH_C64(0x1111992255881144), + SPH_C64(0x8F8F8303890C8F06), SPH_C64(0x4E4E049C6B4A4E25), + SPH_C64(0xB7B7667351D1B7E6), SPH_C64(0xEBEBE0CB600BEB8B), + SPH_C64(0x3C3CC178CCFD3CF0), SPH_C64(0x8181FD1FBF7C813E), + SPH_C64(0x94944035FED4946A), SPH_C64(0xF7F71CF30CEBF7FB), + SPH_C64(0xB9B9186F67A1B9DE), SPH_C64(0x13138B265F98134C), + SPH_C64(0x2C2C51589C7D2CB0), SPH_C64(0xD3D305BBB8D6D36B), + SPH_C64(0xE7E78CD35C6BE7BB), SPH_C64(0x6E6E39DCCB576EA5), + SPH_C64(0xC4C4AA95F36EC437), SPH_C64(0x03031B060F18030C), + SPH_C64(0x5656DCAC138A5645), SPH_C64(0x44445E88491A440D), + SPH_C64(0x7F7FA0FE9EDF7FE1), SPH_C64(0xA9A9884F3721A99E), + SPH_C64(0x2A2A6754824D2AA8), SPH_C64(0xBBBB0A6B6DB1BBD6), + SPH_C64(0xC1C1879FE246C123), SPH_C64(0x5353F1A602A25351), + SPH_C64(0xDCDC72A58BAEDC57), SPH_C64(0x0B0B531627580B2C), + SPH_C64(0x9D9D0127D39C9D4E), SPH_C64(0x6C6C2BD8C1476CAD), + SPH_C64(0x3131A462F59531C4), SPH_C64(0x7474F3E8B98774CD), + SPH_C64(0xF6F615F109E3F6FF), SPH_C64(0x46464C8C430A4605), + SPH_C64(0xACACA5452609AC8A), SPH_C64(0x8989B50F973C891E), + SPH_C64(0x1414B42844A01450), SPH_C64(0xE1E1BADF425BE1A3), + SPH_C64(0x1616A62C4EB01658), SPH_C64(0x3A3AF774D2CD3AE8), + SPH_C64(0x696906D2D06F69B9), SPH_C64(0x090941122D480924), + SPH_C64(0x7070D7E0ADA770DD), SPH_C64(0xB6B66F7154D9B6E2), + SPH_C64(0xD0D01EBDB7CED067), SPH_C64(0xEDEDD6C77E3BED93), + SPH_C64(0xCCCCE285DB2ECC17), SPH_C64(0x42426884572A4215), + SPH_C64(0x98982C2DC2B4985A), SPH_C64(0xA4A4ED550E49A4AA), + SPH_C64(0x28287550885D28A0), SPH_C64(0x5C5C86B831DA5C6D), + SPH_C64(0xF8F86BED3F93F8C7), SPH_C64(0x8686C211A4448622) +}; + +static const sph_u64 plain_T7[256] = { + SPH_C64(0x18D83078C0186018), SPH_C64(0x232646AF05238C23), + SPH_C64(0xC6B891F97EC63FC6), SPH_C64(0xE8FBCD6F13E887E8), + SPH_C64(0x87CB13A14C872687), SPH_C64(0xB8116D62A9B8DAB8), + SPH_C64(0x0109020508010401), SPH_C64(0x4F0D9E6E424F214F), + SPH_C64(0x369B6CEEAD36D836), SPH_C64(0xA6FF510459A6A2A6), + SPH_C64(0xD20CB9BDDED26FD2), SPH_C64(0xF50EF706FBF5F3F5), + SPH_C64(0x7996F280EF79F979), SPH_C64(0x6F30DECE5F6FA16F), + SPH_C64(0x916D3FEFFC917E91), SPH_C64(0x52F8A407AA525552), + SPH_C64(0x6047C0FD27609D60), SPH_C64(0xBC35657689BCCABC), + SPH_C64(0x9B372BCDAC9B569B), SPH_C64(0x8E8A018C048E028E), + SPH_C64(0xA3D25B1571A3B6A3), SPH_C64(0x0C6C183C600C300C), + SPH_C64(0x7B84F68AFF7BF17B), SPH_C64(0x35806AE1B535D435), + SPH_C64(0x1DF53A69E81D741D), SPH_C64(0xE0B3DD4753E0A7E0), + SPH_C64(0xD721B3ACF6D77BD7), SPH_C64(0xC29C99ED5EC22FC2), + SPH_C64(0x2E435C966D2EB82E), 
SPH_C64(0x4B29967A624B314B), + SPH_C64(0xFE5DE121A3FEDFFE), SPH_C64(0x57D5AE1682574157), + SPH_C64(0x15BD2A41A8155415), SPH_C64(0x77E8EEB69F77C177), + SPH_C64(0x37926EEBA537DC37), SPH_C64(0xE59ED7567BE5B3E5), + SPH_C64(0x9F1323D98C9F469F), SPH_C64(0xF023FD17D3F0E7F0), + SPH_C64(0x4A20947F6A4A354A), SPH_C64(0xDA44A9959EDA4FDA), + SPH_C64(0x58A2B025FA587D58), SPH_C64(0xC9CF8FCA06C903C9), + SPH_C64(0x297C528D5529A429), SPH_C64(0x0A5A1422500A280A), + SPH_C64(0xB1507F4FE1B1FEB1), SPH_C64(0xA0C95D1A69A0BAA0), + SPH_C64(0x6B14D6DA7F6BB16B), SPH_C64(0x85D917AB5C852E85), + SPH_C64(0xBD3C677381BDCEBD), SPH_C64(0x5D8FBA34D25D695D), + SPH_C64(0x1090205080104010), SPH_C64(0xF407F503F3F4F7F4), + SPH_C64(0xCBDD8BC016CB0BCB), SPH_C64(0x3ED37CC6ED3EF83E), + SPH_C64(0x052D0A1128051405), SPH_C64(0x6778CEE61F678167), + SPH_C64(0xE497D55373E4B7E4), SPH_C64(0x27024EBB25279C27), + SPH_C64(0x4173825832411941), SPH_C64(0x8BA70B9D2C8B168B), + SPH_C64(0xA7F6530151A7A6A7), SPH_C64(0x7DB2FA94CF7DE97D), + SPH_C64(0x954937FBDC956E95), SPH_C64(0xD856AD9F8ED847D8), + SPH_C64(0xFB70EB308BFBCBFB), SPH_C64(0xEECDC17123EE9FEE), + SPH_C64(0x7CBBF891C77CED7C), SPH_C64(0x6671CCE317668566), + SPH_C64(0xDD7BA78EA6DD53DD), SPH_C64(0x17AF2E4BB8175C17), + SPH_C64(0x47458E4602470147), SPH_C64(0x9E1A21DC849E429E), + SPH_C64(0xCAD489C51ECA0FCA), SPH_C64(0x2D585A99752DB42D), + SPH_C64(0xBF2E637991BFC6BF), SPH_C64(0x073F0E1B38071C07), + SPH_C64(0xADAC472301AD8EAD), SPH_C64(0x5AB0B42FEA5A755A), + SPH_C64(0x83EF1BB56C833683), SPH_C64(0x33B666FF8533CC33), + SPH_C64(0x635CC6F23F639163), SPH_C64(0x0212040A10020802), + SPH_C64(0xAA93493839AA92AA), SPH_C64(0x71DEE2A8AF71D971), + SPH_C64(0xC8C68DCF0EC807C8), SPH_C64(0x19D1327DC8196419), + SPH_C64(0x493B927072493949), SPH_C64(0xD95FAF9A86D943D9), + SPH_C64(0xF231F91DC3F2EFF2), SPH_C64(0xE3A8DB484BE3ABE3), + SPH_C64(0x5BB9B62AE25B715B), SPH_C64(0x88BC0D9234881A88), + SPH_C64(0x9A3E29C8A49A529A), SPH_C64(0x260B4CBE2D269826), + SPH_C64(0x32BF64FA8D32C832), SPH_C64(0xB0597D4AE9B0FAB0), + SPH_C64(0xE9F2CF6A1BE983E9), SPH_C64(0x0F771E33780F3C0F), + SPH_C64(0xD533B7A6E6D573D5), SPH_C64(0x80F41DBA74803A80), + SPH_C64(0xBE27617C99BEC2BE), SPH_C64(0xCDEB87DE26CD13CD), + SPH_C64(0x348968E4BD34D034), SPH_C64(0x483290757A483D48), + SPH_C64(0xFF54E324ABFFDBFF), SPH_C64(0x7A8DF48FF77AF57A), + SPH_C64(0x90643DEAF4907A90), SPH_C64(0x5F9DBE3EC25F615F), + SPH_C64(0x203D40A01D208020), SPH_C64(0x680FD0D56768BD68), + SPH_C64(0x1ACA3472D01A681A), SPH_C64(0xAEB7412C19AE82AE), + SPH_C64(0xB47D755EC9B4EAB4), SPH_C64(0x54CEA8199A544D54), + SPH_C64(0x937F3BE5EC937693), SPH_C64(0x222F44AA0D228822), + SPH_C64(0x6463C8E907648D64), SPH_C64(0xF12AFF12DBF1E3F1), + SPH_C64(0x73CCE6A2BF73D173), SPH_C64(0x1282245A90124812), + SPH_C64(0x407A805D3A401D40), SPH_C64(0x0848102840082008), + SPH_C64(0xC3959BE856C32BC3), SPH_C64(0xECDFC57B33EC97EC), + SPH_C64(0xDB4DAB9096DB4BDB), SPH_C64(0xA1C05F1F61A1BEA1), + SPH_C64(0x8D9107831C8D0E8D), SPH_C64(0x3DC87AC9F53DF43D), + SPH_C64(0x975B33F1CC976697), SPH_C64(0x0000000000000000), + SPH_C64(0xCFF983D436CF1BCF), SPH_C64(0x2B6E5687452BAC2B), + SPH_C64(0x76E1ECB39776C576), SPH_C64(0x82E619B064823282), + SPH_C64(0xD628B1A9FED67FD6), SPH_C64(0x1BC33677D81B6C1B), + SPH_C64(0xB574775BC1B5EEB5), SPH_C64(0xAFBE432911AF86AF), + SPH_C64(0x6A1DD4DF776AB56A), SPH_C64(0x50EAA00DBA505D50), + SPH_C64(0x45578A4C12450945), SPH_C64(0xF338FB18CBF3EBF3), + SPH_C64(0x30AD60F09D30C030), SPH_C64(0xEFC4C3742BEF9BEF), + SPH_C64(0x3FDA7EC3E53FFC3F), SPH_C64(0x55C7AA1C92554955), + SPH_C64(0xA2DB591079A2B2A2), 
SPH_C64(0xEAE9C96503EA8FEA), + SPH_C64(0x656ACAEC0F658965), SPH_C64(0xBA036968B9BAD2BA), + SPH_C64(0x2F4A5E93652FBC2F), SPH_C64(0xC08E9DE74EC027C0), + SPH_C64(0xDE60A181BEDE5FDE), SPH_C64(0x1CFC386CE01C701C), + SPH_C64(0xFD46E72EBBFDD3FD), SPH_C64(0x4D1F9A64524D294D), + SPH_C64(0x927639E0E4927292), SPH_C64(0x75FAEABC8F75C975), + SPH_C64(0x06360C1E30061806), SPH_C64(0x8AAE0998248A128A), + SPH_C64(0xB24B7940F9B2F2B2), SPH_C64(0xE685D15963E6BFE6), + SPH_C64(0x0E7E1C36700E380E), SPH_C64(0x1FE73E63F81F7C1F), + SPH_C64(0x6255C4F737629562), SPH_C64(0xD43AB5A3EED477D4), + SPH_C64(0xA8814D3229A89AA8), SPH_C64(0x965231F4C4966296), + SPH_C64(0xF962EF3A9BF9C3F9), SPH_C64(0xC5A397F666C533C5), + SPH_C64(0x25104AB135259425), SPH_C64(0x59ABB220F2597959), + SPH_C64(0x84D015AE54842A84), SPH_C64(0x72C5E4A7B772D572), + SPH_C64(0x39EC72DDD539E439), SPH_C64(0x4C1698615A4C2D4C), + SPH_C64(0x5E94BC3BCA5E655E), SPH_C64(0x789FF085E778FD78), + SPH_C64(0x38E570D8DD38E038), SPH_C64(0x8C980586148C0A8C), + SPH_C64(0xD117BFB2C6D163D1), SPH_C64(0xA5E4570B41A5AEA5), + SPH_C64(0xE2A1D94D43E2AFE2), SPH_C64(0x614EC2F82F619961), + SPH_C64(0xB3427B45F1B3F6B3), SPH_C64(0x213442A515218421), + SPH_C64(0x9C0825D6949C4A9C), SPH_C64(0x1EEE3C66F01E781E), + SPH_C64(0x4361865222431143), SPH_C64(0xC7B193FC76C73BC7), + SPH_C64(0xFC4FE52BB3FCD7FC), SPH_C64(0x0424081420041004), + SPH_C64(0x51E3A208B2515951), SPH_C64(0x99252FC7BC995E99), + SPH_C64(0x6D22DAC44F6DA96D), SPH_C64(0x0D651A39680D340D), + SPH_C64(0xFA79E93583FACFFA), SPH_C64(0xDF69A384B6DF5BDF), + SPH_C64(0x7EA9FC9BD77EE57E), SPH_C64(0x241948B43D249024), + SPH_C64(0x3BFE76D7C53BEC3B), SPH_C64(0xAB9A4B3D31AB96AB), + SPH_C64(0xCEF081D13ECE1FCE), SPH_C64(0x1199225588114411), + SPH_C64(0x8F8303890C8F068F), SPH_C64(0x4E049C6B4A4E254E), + SPH_C64(0xB7667351D1B7E6B7), SPH_C64(0xEBE0CB600BEB8BEB), + SPH_C64(0x3CC178CCFD3CF03C), SPH_C64(0x81FD1FBF7C813E81), + SPH_C64(0x944035FED4946A94), SPH_C64(0xF71CF30CEBF7FBF7), + SPH_C64(0xB9186F67A1B9DEB9), SPH_C64(0x138B265F98134C13), + SPH_C64(0x2C51589C7D2CB02C), SPH_C64(0xD305BBB8D6D36BD3), + SPH_C64(0xE78CD35C6BE7BBE7), SPH_C64(0x6E39DCCB576EA56E), + SPH_C64(0xC4AA95F36EC437C4), SPH_C64(0x031B060F18030C03), + SPH_C64(0x56DCAC138A564556), SPH_C64(0x445E88491A440D44), + SPH_C64(0x7FA0FE9EDF7FE17F), SPH_C64(0xA9884F3721A99EA9), + SPH_C64(0x2A6754824D2AA82A), SPH_C64(0xBB0A6B6DB1BBD6BB), + SPH_C64(0xC1879FE246C123C1), SPH_C64(0x53F1A602A2535153), + SPH_C64(0xDC72A58BAEDC57DC), SPH_C64(0x0B531627580B2C0B), + SPH_C64(0x9D0127D39C9D4E9D), SPH_C64(0x6C2BD8C1476CAD6C), + SPH_C64(0x31A462F59531C431), SPH_C64(0x74F3E8B98774CD74), + SPH_C64(0xF615F109E3F6FFF6), SPH_C64(0x464C8C430A460546), + SPH_C64(0xACA5452609AC8AAC), SPH_C64(0x89B50F973C891E89), + SPH_C64(0x14B42844A0145014), SPH_C64(0xE1BADF425BE1A3E1), + SPH_C64(0x16A62C4EB0165816), SPH_C64(0x3AF774D2CD3AE83A), + SPH_C64(0x6906D2D06F69B969), SPH_C64(0x0941122D48092409), + SPH_C64(0x70D7E0ADA770DD70), SPH_C64(0xB66F7154D9B6E2B6), + SPH_C64(0xD01EBDB7CED067D0), SPH_C64(0xEDD6C77E3BED93ED), + SPH_C64(0xCCE285DB2ECC17CC), SPH_C64(0x426884572A421542), + SPH_C64(0x982C2DC2B4985A98), SPH_C64(0xA4ED550E49A4AAA4), + SPH_C64(0x287550885D28A028), SPH_C64(0x5C86B831DA5C6D5C), + SPH_C64(0xF86BED3F93F8C7F8), SPH_C64(0x86C211A444862286) +}; + +#endif + +/* + * Round constants. 
+ */ +static const sph_u64 plain_RC[10] = { + SPH_C64(0x4F01B887E8C62318), + SPH_C64(0x52916F79F5D2A636), + SPH_C64(0x357B0CA38E9BBC60), + SPH_C64(0x57FE4B2EC2D7E01D), + SPH_C64(0xDA4AF09FE5377715), + SPH_C64(0x856BA0B10A29C958), + SPH_C64(0x67053ECBF4105DBD), + SPH_C64(0xD8957DA78B4127E4), + SPH_C64(0x9E4717DD667CEEFB), + SPH_C64(0x33835AAD07BF2DCA) +}; + +/* ====================================================================== */ +/* + * Constants for plain WHIRLPOOL-0 (first version). + */ + +static const sph_u64 old0_T0[256] = { + SPH_C64(0xD50F67D568B86868), SPH_C64(0xB71ECEB7D06DD0D0), + SPH_C64(0x60E00B60EB20EBEB), SPH_C64(0x876E45872B7D2B2B), + SPH_C64(0x75327A7548D84848), SPH_C64(0xD3019CD39DBA9D9D), + SPH_C64(0xDF1D77DF6ABE6A6A), SPH_C64(0x53977353E431E4E4), + SPH_C64(0x48A84B48E338E3E3), SPH_C64(0x15D27115A3F8A3A3), + SPH_C64(0x13DC8A1356FA5656), SPH_C64(0xBFFD7CBF819E8181), + SPH_C64(0x94B2CF947D877D7D), SPH_C64(0x122ADB12F10EF1F1), + SPH_C64(0xABD95CAB85928585), SPH_C64(0xDC1A84DC9EBF9E9E), + SPH_C64(0x9C517D9C2C742C2C), SPH_C64(0x8C8A048C8E8F8E8E), + SPH_C64(0x859FE78578887878), SPH_C64(0xC5D41EC5CA43CACA), + SPH_C64(0x4BAFB84B17391717), SPH_C64(0x37882137A9E6A9A9), + SPH_C64(0xF84E2FF861A36161), SPH_C64(0xA633E6A6D562D5D5), + SPH_C64(0x348FD2345DE75D5D), SPH_C64(0x275358270B1D0B0B), + SPH_C64(0x869814868C898C8C), SPH_C64(0xCCC1FDCC3C443C3C), + SPH_C64(0xB6E89FB677997777), SPH_C64(0x08E3B20851F35151), + SPH_C64(0xAA2F0DAA22662222), SPH_C64(0x57682A5742C64242), + SPH_C64(0xC3DAE5C33F413F3F), SPH_C64(0x19CE9A1954FC5454), + SPH_C64(0x5873325841C34141), SPH_C64(0xBAF474BA809D8080), + SPH_C64(0xDBE22EDBCC49CCCC), SPH_C64(0xA4C244A486978686), + SPH_C64(0x4542F145B3C8B3B3), SPH_C64(0x78D8C07818281818), + SPH_C64(0x96436D962E722E2E), SPH_C64(0x16D5821657F95757), + SPH_C64(0x1E36301E060A0606), SPH_C64(0xF75537F762A66262), + SPH_C64(0x0307F303F401F4F4), SPH_C64(0xEE9BADEE365A3636), + SPH_C64(0xB217C6B2D16ED1D1), SPH_C64(0xDA147FDA6BBD6B6B), + SPH_C64(0x77C3D8771B2D1B1B), SPH_C64(0xEC6A0FEC65AF6565), + SPH_C64(0xBCFA8FBC759F7575), SPH_C64(0x5090805010301010), + SPH_C64(0x95449E95DA73DADA), SPH_C64(0x703B727049DB4949), + SPH_C64(0xBE0B2DBE266A2626), SPH_C64(0x3A629B3AF916F9F9), + SPH_C64(0xC0DD16C0CB40CBCB), SPH_C64(0xE37117E366AA6666), + SPH_C64(0x5C8C6B5CE734E7E7), SPH_C64(0x6803B968BAD3BABA), + SPH_C64(0x2CB7192CAEEFAEAE), SPH_C64(0x0DEABA0D50F05050), + SPH_C64(0x07F8AA0752F65252), SPH_C64(0x3D9A313DABE0ABAB), + SPH_C64(0x112D2811050F0505), SPH_C64(0x1723D317F00DF0F0), + SPH_C64(0x396568390D170D0D), SPH_C64(0xA2CCBFA273957373), + SPH_C64(0xD7FEC5D73B4D3B3B), SPH_C64(0x14242014040C0404), + SPH_C64(0xA03D1DA020602020), SPH_C64(0x215DA321FE1FFEFE), + SPH_C64(0x8E7BA68EDD7ADDDD), SPH_C64(0x060EFB06F502F5F5), + SPH_C64(0x5E7DC95EB4C1B4B4), SPH_C64(0x3E9DC23E5FE15F5F), + SPH_C64(0x225A50220A1E0A0A), SPH_C64(0x5B74C15BB5C2B5B5), + SPH_C64(0xE78E4EE7C05DC0C0), SPH_C64(0x1AC9691AA0FDA0A0), + SPH_C64(0xA8DEAFA871937171), SPH_C64(0x0BE4410BA5F2A5A5), + SPH_C64(0x995875992D772D2D), SPH_C64(0xFD4727FD60A06060), + SPH_C64(0xA7C5B7A772967272), SPH_C64(0xE57FECE593A89393), + SPH_C64(0xDDECD5DD394B3939), SPH_C64(0x2848402808180808), + SPH_C64(0xB5EF6CB583988383), SPH_C64(0xA53415A521632121), + SPH_C64(0x3186DA315CE45C5C), SPH_C64(0xA1CB4CA187948787), + SPH_C64(0x4F50E14FB1CEB1B1), SPH_C64(0x47B35347E03DE0E0), + SPH_C64(0x0000000000000000), SPH_C64(0xE89556E8C358C3C3), + SPH_C64(0x5A82905A12361212), SPH_C64(0xEF6DFCEF91AE9191), + SPH_C64(0x98AE24988A838A8A), SPH_C64(0x0A12100A02060202), + 
SPH_C64(0x6CFCE06C1C241C1C), SPH_C64(0x59856359E637E6E6), + SPH_C64(0x4C57124C45CF4545), SPH_C64(0xED9C5EEDC25BC2C2), + SPH_C64(0xF3AA6EF3C451C4C4), SPH_C64(0x2E46BB2EFD1AFDFD), + SPH_C64(0x792E9179BFDCBFBF), SPH_C64(0x495E1A4944CC4444), + SPH_C64(0x1FC0611FA1FEA1A1), SPH_C64(0x61165A614CD44C4C), + SPH_C64(0xFFB685FF33553333), SPH_C64(0xF6A366F6C552C5C5), + SPH_C64(0xAED054AE84918484), SPH_C64(0xAF2605AF23652323), + SPH_C64(0x91BBC7917C847C7C), SPH_C64(0x4A59E94AB0CDB0B0), + SPH_C64(0xB11035B1256F2525), SPH_C64(0x41BDA841153F1515), + SPH_C64(0xE180B5E1355F3535), SPH_C64(0xD0066FD069BB6969), + SPH_C64(0x2454AB24FF1CFFFF), SPH_C64(0xFE40D4FE94A19494), + SPH_C64(0x641F52644DD74D4D), SPH_C64(0xADD7A7AD70907070), + SPH_C64(0x10DB7910A2FBA2A2), SPH_C64(0x29BE1129AFECAFAF), + SPH_C64(0xDEEB26DECD4ACDCD), SPH_C64(0xA928FEA9D667D6D6), + SPH_C64(0xC12B47C16CB46C6C), SPH_C64(0x5166D151B7C4B7B7), + SPH_C64(0x3F6B933FF815F8F8), SPH_C64(0x2D41482D091B0909), + SPH_C64(0x1838CB18F308F3F3), SPH_C64(0xE6781FE667A96767), + SPH_C64(0x0EED490EA4F1A4A4), SPH_C64(0x65E90365EA23EAEA), + SPH_C64(0x7BDF337BEC29ECEC), SPH_C64(0x546FD954B6C7B6B6), + SPH_C64(0xA33AEEA3D461D4D4), SPH_C64(0xBD0CDEBDD26BD2D2), + SPH_C64(0x44B4A044143C1414), SPH_C64(0x66EEF0661E221E1E), + SPH_C64(0x42BA5B42E13EE1E1), SPH_C64(0xB4193DB4246C2424), + SPH_C64(0xD8E5DDD838483838), SPH_C64(0xF9B87EF9C657C6C6), + SPH_C64(0x904D9690DB70DBDB), SPH_C64(0x7A29627A4BDD4B4B), + SPH_C64(0x8F8DF78F7A8E7A7A), SPH_C64(0xD2F7CDD23A4E3A3A), + SPH_C64(0x8160BE81DE7FDEDE), SPH_C64(0x3B94CA3B5EE25E5E), + SPH_C64(0x8469B684DF7CDFDF), SPH_C64(0xFB49DCFB95A29595), + SPH_C64(0x2B4FB32BFC19FCFC), SPH_C64(0x38933938AAE3AAAA), + SPH_C64(0xAC21F6ACD764D7D7), SPH_C64(0xD1F03ED1CE4FCECE), + SPH_C64(0x1B3F381B07090707), SPH_C64(0x337778330F110F0F), + SPH_C64(0xC9C8F5C93D473D3D), SPH_C64(0x25A2FA2558E85858), + SPH_C64(0xC83EA4C89AB39A9A), SPH_C64(0xC22CB4C298B59898), + SPH_C64(0xD60894D69CB99C9C), SPH_C64(0x1D31C31DF20BF2F2), + SPH_C64(0x01F65101A7F4A7A7), SPH_C64(0x5599885511331111), + SPH_C64(0x9BA9D79B7E827E7E), SPH_C64(0x9DA72C9D8B808B8B), + SPH_C64(0x5261225243C54343), SPH_C64(0x0F1B180F03050303), + SPH_C64(0x4DA1434DE23BE2E2), SPH_C64(0x8B72AE8BDC79DCDC), + SPH_C64(0x569E7B56E532E5E5), SPH_C64(0x404BF940B2CBB2B2), + SPH_C64(0x6B044A6B4ED24E4E), SPH_C64(0xFCB176FCC754C7C7), + SPH_C64(0xC4224FC46DB76D6D), SPH_C64(0x6AF21B6AE926E9E9), + SPH_C64(0xBB0225BB27692727), SPH_C64(0x5D7A3A5D40C04040), + SPH_C64(0x9F568E9FD875D8D8), SPH_C64(0xEB92A5EB37593737), + SPH_C64(0xE076E4E092AB9292), SPH_C64(0x89830C898F8C8F8F), + SPH_C64(0x0509080501030101), SPH_C64(0x69F5E8691D271D1D), + SPH_C64(0x02F1A20253F55353), SPH_C64(0xC6D3EDC63E423E3E), + SPH_C64(0x20ABF22059EB5959), SPH_C64(0xE28746E2C15EC1C1), + SPH_C64(0x6E0D426E4FD14F4F), SPH_C64(0xFABF8DFA32563232), + SPH_C64(0x4EA6B04E163A1616), SPH_C64(0x35798335FA13FAFA), + SPH_C64(0xB9F387B9749C7474), SPH_C64(0x30708B30FB10FBFB), + SPH_C64(0xF25C3FF263A56363), SPH_C64(0xD9138CD99FBC9F9F), + SPH_C64(0xE489BDE4345C3434), SPH_C64(0x72CAD0721A2E1A1A), + SPH_C64(0x82674D822A7E2A2A), SPH_C64(0x2FB0EA2F5AEE5A5A), + SPH_C64(0x83911C838D8A8D8D), SPH_C64(0xCACF06CAC946C9C9), + SPH_C64(0xD4F936D4CF4CCFCF), SPH_C64(0x0915E309F607F6F6), + SPH_C64(0xEA64F4EA90AD9090), SPH_C64(0x88755D8828782828), + SPH_C64(0x92BC349288858888), SPH_C64(0xCD37ACCD9BB09B9B), + SPH_C64(0xF5A495F531533131), SPH_C64(0x367E70360E120E0E), + SPH_C64(0x733C8173BDDABDBD), SPH_C64(0x7F206A7F4ADE4A4A), + SPH_C64(0x6FFB136FE825E8E8), SPH_C64(0xF452C4F496A79696), + 
SPH_C64(0x04FF5904A6F7A6A6), SPH_C64(0x3C6C603C0C140C0C), + SPH_C64(0xCFC60ECFC845C8C8), SPH_C64(0x8096EF80798B7979), + SPH_C64(0x76358976BCD9BCBC), SPH_C64(0x7C27997CBEDFBEBE), + SPH_C64(0x74C42B74EF2CEFEF), SPH_C64(0xCB3957CB6EB26E6E), + SPH_C64(0x434C0A4346CA4646), SPH_C64(0xF15BCCF197A49797), + SPH_C64(0x2AB9E22A5BED5B5B), SPH_C64(0x7ED63B7EED2AEDED), + SPH_C64(0x7DD1C87D192B1919), SPH_C64(0x9A5F869AD976D9D9), + SPH_C64(0x26A50926ACE9ACAC), SPH_C64(0xC725BCC799B69999), + SPH_C64(0x32812932A8E5A8A8), SPH_C64(0x8D7C558D297B2929), + SPH_C64(0xE96307E964AC6464), SPH_C64(0x63E7F8631F211F1F), + SPH_C64(0x23AC0123ADEAADAD), SPH_C64(0x1CC7921C55FF5555), + SPH_C64(0x5F8B985F13351313), SPH_C64(0x6D0AB16DBBD0BBBB), + SPH_C64(0x0C1CEB0CF704F7F7), SPH_C64(0xCE305FCE6FB16F6F), + SPH_C64(0x6718A167B9D6B9B9), SPH_C64(0x4645024647C94747), + SPH_C64(0x934A65932F712F2F), SPH_C64(0x71CD2371EE2FEEEE), + SPH_C64(0x6211A962B8D5B8B8), SPH_C64(0x8A84FF8A7B8D7B7B), + SPH_C64(0x97B53C9789868989), SPH_C64(0xF0AD9DF030503030), + SPH_C64(0xB805D6B8D368D3D3), SPH_C64(0x9EA0DF9E7F817F7F), + SPH_C64(0xB3E197B3769A7676), SPH_C64(0xB0E664B0829B8282) +}; + +#if !SPH_SMALL_FOOTPRINT_WHIRLPOOL + +static const sph_u64 old0_T1[256] = { + SPH_C64(0x0F67D568B86868D5), SPH_C64(0x1ECEB7D06DD0D0B7), + SPH_C64(0xE00B60EB20EBEB60), SPH_C64(0x6E45872B7D2B2B87), + SPH_C64(0x327A7548D8484875), SPH_C64(0x019CD39DBA9D9DD3), + SPH_C64(0x1D77DF6ABE6A6ADF), SPH_C64(0x977353E431E4E453), + SPH_C64(0xA84B48E338E3E348), SPH_C64(0xD27115A3F8A3A315), + SPH_C64(0xDC8A1356FA565613), SPH_C64(0xFD7CBF819E8181BF), + SPH_C64(0xB2CF947D877D7D94), SPH_C64(0x2ADB12F10EF1F112), + SPH_C64(0xD95CAB85928585AB), SPH_C64(0x1A84DC9EBF9E9EDC), + SPH_C64(0x517D9C2C742C2C9C), SPH_C64(0x8A048C8E8F8E8E8C), + SPH_C64(0x9FE7857888787885), SPH_C64(0xD41EC5CA43CACAC5), + SPH_C64(0xAFB84B173917174B), SPH_C64(0x882137A9E6A9A937), + SPH_C64(0x4E2FF861A36161F8), SPH_C64(0x33E6A6D562D5D5A6), + SPH_C64(0x8FD2345DE75D5D34), SPH_C64(0x5358270B1D0B0B27), + SPH_C64(0x9814868C898C8C86), SPH_C64(0xC1FDCC3C443C3CCC), + SPH_C64(0xE89FB677997777B6), SPH_C64(0xE3B20851F3515108), + SPH_C64(0x2F0DAA22662222AA), SPH_C64(0x682A5742C6424257), + SPH_C64(0xDAE5C33F413F3FC3), SPH_C64(0xCE9A1954FC545419), + SPH_C64(0x73325841C3414158), SPH_C64(0xF474BA809D8080BA), + SPH_C64(0xE22EDBCC49CCCCDB), SPH_C64(0xC244A486978686A4), + SPH_C64(0x42F145B3C8B3B345), SPH_C64(0xD8C0781828181878), + SPH_C64(0x436D962E722E2E96), SPH_C64(0xD5821657F9575716), + SPH_C64(0x36301E060A06061E), SPH_C64(0x5537F762A66262F7), + SPH_C64(0x07F303F401F4F403), SPH_C64(0x9BADEE365A3636EE), + SPH_C64(0x17C6B2D16ED1D1B2), SPH_C64(0x147FDA6BBD6B6BDA), + SPH_C64(0xC3D8771B2D1B1B77), SPH_C64(0x6A0FEC65AF6565EC), + SPH_C64(0xFA8FBC759F7575BC), SPH_C64(0x9080501030101050), + SPH_C64(0x449E95DA73DADA95), SPH_C64(0x3B727049DB494970), + SPH_C64(0x0B2DBE266A2626BE), SPH_C64(0x629B3AF916F9F93A), + SPH_C64(0xDD16C0CB40CBCBC0), SPH_C64(0x7117E366AA6666E3), + SPH_C64(0x8C6B5CE734E7E75C), SPH_C64(0x03B968BAD3BABA68), + SPH_C64(0xB7192CAEEFAEAE2C), SPH_C64(0xEABA0D50F050500D), + SPH_C64(0xF8AA0752F6525207), SPH_C64(0x9A313DABE0ABAB3D), + SPH_C64(0x2D2811050F050511), SPH_C64(0x23D317F00DF0F017), + SPH_C64(0x6568390D170D0D39), SPH_C64(0xCCBFA273957373A2), + SPH_C64(0xFEC5D73B4D3B3BD7), SPH_C64(0x242014040C040414), + SPH_C64(0x3D1DA020602020A0), SPH_C64(0x5DA321FE1FFEFE21), + SPH_C64(0x7BA68EDD7ADDDD8E), SPH_C64(0x0EFB06F502F5F506), + SPH_C64(0x7DC95EB4C1B4B45E), SPH_C64(0x9DC23E5FE15F5F3E), + SPH_C64(0x5A50220A1E0A0A22), 
SPH_C64(0x74C15BB5C2B5B55B), + SPH_C64(0x8E4EE7C05DC0C0E7), SPH_C64(0xC9691AA0FDA0A01A), + SPH_C64(0xDEAFA871937171A8), SPH_C64(0xE4410BA5F2A5A50B), + SPH_C64(0x5875992D772D2D99), SPH_C64(0x4727FD60A06060FD), + SPH_C64(0xC5B7A772967272A7), SPH_C64(0x7FECE593A89393E5), + SPH_C64(0xECD5DD394B3939DD), SPH_C64(0x4840280818080828), + SPH_C64(0xEF6CB583988383B5), SPH_C64(0x3415A521632121A5), + SPH_C64(0x86DA315CE45C5C31), SPH_C64(0xCB4CA187948787A1), + SPH_C64(0x50E14FB1CEB1B14F), SPH_C64(0xB35347E03DE0E047), + SPH_C64(0x0000000000000000), SPH_C64(0x9556E8C358C3C3E8), + SPH_C64(0x82905A123612125A), SPH_C64(0x6DFCEF91AE9191EF), + SPH_C64(0xAE24988A838A8A98), SPH_C64(0x12100A020602020A), + SPH_C64(0xFCE06C1C241C1C6C), SPH_C64(0x856359E637E6E659), + SPH_C64(0x57124C45CF45454C), SPH_C64(0x9C5EEDC25BC2C2ED), + SPH_C64(0xAA6EF3C451C4C4F3), SPH_C64(0x46BB2EFD1AFDFD2E), + SPH_C64(0x2E9179BFDCBFBF79), SPH_C64(0x5E1A4944CC444449), + SPH_C64(0xC0611FA1FEA1A11F), SPH_C64(0x165A614CD44C4C61), + SPH_C64(0xB685FF33553333FF), SPH_C64(0xA366F6C552C5C5F6), + SPH_C64(0xD054AE84918484AE), SPH_C64(0x2605AF23652323AF), + SPH_C64(0xBBC7917C847C7C91), SPH_C64(0x59E94AB0CDB0B04A), + SPH_C64(0x1035B1256F2525B1), SPH_C64(0xBDA841153F151541), + SPH_C64(0x80B5E1355F3535E1), SPH_C64(0x066FD069BB6969D0), + SPH_C64(0x54AB24FF1CFFFF24), SPH_C64(0x40D4FE94A19494FE), + SPH_C64(0x1F52644DD74D4D64), SPH_C64(0xD7A7AD70907070AD), + SPH_C64(0xDB7910A2FBA2A210), SPH_C64(0xBE1129AFECAFAF29), + SPH_C64(0xEB26DECD4ACDCDDE), SPH_C64(0x28FEA9D667D6D6A9), + SPH_C64(0x2B47C16CB46C6CC1), SPH_C64(0x66D151B7C4B7B751), + SPH_C64(0x6B933FF815F8F83F), SPH_C64(0x41482D091B09092D), + SPH_C64(0x38CB18F308F3F318), SPH_C64(0x781FE667A96767E6), + SPH_C64(0xED490EA4F1A4A40E), SPH_C64(0xE90365EA23EAEA65), + SPH_C64(0xDF337BEC29ECEC7B), SPH_C64(0x6FD954B6C7B6B654), + SPH_C64(0x3AEEA3D461D4D4A3), SPH_C64(0x0CDEBDD26BD2D2BD), + SPH_C64(0xB4A044143C141444), SPH_C64(0xEEF0661E221E1E66), + SPH_C64(0xBA5B42E13EE1E142), SPH_C64(0x193DB4246C2424B4), + SPH_C64(0xE5DDD838483838D8), SPH_C64(0xB87EF9C657C6C6F9), + SPH_C64(0x4D9690DB70DBDB90), SPH_C64(0x29627A4BDD4B4B7A), + SPH_C64(0x8DF78F7A8E7A7A8F), SPH_C64(0xF7CDD23A4E3A3AD2), + SPH_C64(0x60BE81DE7FDEDE81), SPH_C64(0x94CA3B5EE25E5E3B), + SPH_C64(0x69B684DF7CDFDF84), SPH_C64(0x49DCFB95A29595FB), + SPH_C64(0x4FB32BFC19FCFC2B), SPH_C64(0x933938AAE3AAAA38), + SPH_C64(0x21F6ACD764D7D7AC), SPH_C64(0xF03ED1CE4FCECED1), + SPH_C64(0x3F381B070907071B), SPH_C64(0x7778330F110F0F33), + SPH_C64(0xC8F5C93D473D3DC9), SPH_C64(0xA2FA2558E8585825), + SPH_C64(0x3EA4C89AB39A9AC8), SPH_C64(0x2CB4C298B59898C2), + SPH_C64(0x0894D69CB99C9CD6), SPH_C64(0x31C31DF20BF2F21D), + SPH_C64(0xF65101A7F4A7A701), SPH_C64(0x9988551133111155), + SPH_C64(0xA9D79B7E827E7E9B), SPH_C64(0xA72C9D8B808B8B9D), + SPH_C64(0x61225243C5434352), SPH_C64(0x1B180F030503030F), + SPH_C64(0xA1434DE23BE2E24D), SPH_C64(0x72AE8BDC79DCDC8B), + SPH_C64(0x9E7B56E532E5E556), SPH_C64(0x4BF940B2CBB2B240), + SPH_C64(0x044A6B4ED24E4E6B), SPH_C64(0xB176FCC754C7C7FC), + SPH_C64(0x224FC46DB76D6DC4), SPH_C64(0xF21B6AE926E9E96A), + SPH_C64(0x0225BB27692727BB), SPH_C64(0x7A3A5D40C040405D), + SPH_C64(0x568E9FD875D8D89F), SPH_C64(0x92A5EB37593737EB), + SPH_C64(0x76E4E092AB9292E0), SPH_C64(0x830C898F8C8F8F89), + SPH_C64(0x0908050103010105), SPH_C64(0xF5E8691D271D1D69), + SPH_C64(0xF1A20253F5535302), SPH_C64(0xD3EDC63E423E3EC6), + SPH_C64(0xABF22059EB595920), SPH_C64(0x8746E2C15EC1C1E2), + SPH_C64(0x0D426E4FD14F4F6E), SPH_C64(0xBF8DFA32563232FA), + SPH_C64(0xA6B04E163A16164E), 
SPH_C64(0x798335FA13FAFA35), + SPH_C64(0xF387B9749C7474B9), SPH_C64(0x708B30FB10FBFB30), + SPH_C64(0x5C3FF263A56363F2), SPH_C64(0x138CD99FBC9F9FD9), + SPH_C64(0x89BDE4345C3434E4), SPH_C64(0xCAD0721A2E1A1A72), + SPH_C64(0x674D822A7E2A2A82), SPH_C64(0xB0EA2F5AEE5A5A2F), + SPH_C64(0x911C838D8A8D8D83), SPH_C64(0xCF06CAC946C9C9CA), + SPH_C64(0xF936D4CF4CCFCFD4), SPH_C64(0x15E309F607F6F609), + SPH_C64(0x64F4EA90AD9090EA), SPH_C64(0x755D882878282888), + SPH_C64(0xBC34928885888892), SPH_C64(0x37ACCD9BB09B9BCD), + SPH_C64(0xA495F531533131F5), SPH_C64(0x7E70360E120E0E36), + SPH_C64(0x3C8173BDDABDBD73), SPH_C64(0x206A7F4ADE4A4A7F), + SPH_C64(0xFB136FE825E8E86F), SPH_C64(0x52C4F496A79696F4), + SPH_C64(0xFF5904A6F7A6A604), SPH_C64(0x6C603C0C140C0C3C), + SPH_C64(0xC60ECFC845C8C8CF), SPH_C64(0x96EF80798B797980), + SPH_C64(0x358976BCD9BCBC76), SPH_C64(0x27997CBEDFBEBE7C), + SPH_C64(0xC42B74EF2CEFEF74), SPH_C64(0x3957CB6EB26E6ECB), + SPH_C64(0x4C0A4346CA464643), SPH_C64(0x5BCCF197A49797F1), + SPH_C64(0xB9E22A5BED5B5B2A), SPH_C64(0xD63B7EED2AEDED7E), + SPH_C64(0xD1C87D192B19197D), SPH_C64(0x5F869AD976D9D99A), + SPH_C64(0xA50926ACE9ACAC26), SPH_C64(0x25BCC799B69999C7), + SPH_C64(0x812932A8E5A8A832), SPH_C64(0x7C558D297B29298D), + SPH_C64(0x6307E964AC6464E9), SPH_C64(0xE7F8631F211F1F63), + SPH_C64(0xAC0123ADEAADAD23), SPH_C64(0xC7921C55FF55551C), + SPH_C64(0x8B985F133513135F), SPH_C64(0x0AB16DBBD0BBBB6D), + SPH_C64(0x1CEB0CF704F7F70C), SPH_C64(0x305FCE6FB16F6FCE), + SPH_C64(0x18A167B9D6B9B967), SPH_C64(0x45024647C9474746), + SPH_C64(0x4A65932F712F2F93), SPH_C64(0xCD2371EE2FEEEE71), + SPH_C64(0x11A962B8D5B8B862), SPH_C64(0x84FF8A7B8D7B7B8A), + SPH_C64(0xB53C978986898997), SPH_C64(0xAD9DF030503030F0), + SPH_C64(0x05D6B8D368D3D3B8), SPH_C64(0xA0DF9E7F817F7F9E), + SPH_C64(0xE197B3769A7676B3), SPH_C64(0xE664B0829B8282B0) +}; + +static const sph_u64 old0_T2[256] = { + SPH_C64(0x67D568B86868D50F), SPH_C64(0xCEB7D06DD0D0B71E), + SPH_C64(0x0B60EB20EBEB60E0), SPH_C64(0x45872B7D2B2B876E), + SPH_C64(0x7A7548D848487532), SPH_C64(0x9CD39DBA9D9DD301), + SPH_C64(0x77DF6ABE6A6ADF1D), SPH_C64(0x7353E431E4E45397), + SPH_C64(0x4B48E338E3E348A8), SPH_C64(0x7115A3F8A3A315D2), + SPH_C64(0x8A1356FA565613DC), SPH_C64(0x7CBF819E8181BFFD), + SPH_C64(0xCF947D877D7D94B2), SPH_C64(0xDB12F10EF1F1122A), + SPH_C64(0x5CAB85928585ABD9), SPH_C64(0x84DC9EBF9E9EDC1A), + SPH_C64(0x7D9C2C742C2C9C51), SPH_C64(0x048C8E8F8E8E8C8A), + SPH_C64(0xE78578887878859F), SPH_C64(0x1EC5CA43CACAC5D4), + SPH_C64(0xB84B173917174BAF), SPH_C64(0x2137A9E6A9A93788), + SPH_C64(0x2FF861A36161F84E), SPH_C64(0xE6A6D562D5D5A633), + SPH_C64(0xD2345DE75D5D348F), SPH_C64(0x58270B1D0B0B2753), + SPH_C64(0x14868C898C8C8698), SPH_C64(0xFDCC3C443C3CCCC1), + SPH_C64(0x9FB677997777B6E8), SPH_C64(0xB20851F3515108E3), + SPH_C64(0x0DAA22662222AA2F), SPH_C64(0x2A5742C642425768), + SPH_C64(0xE5C33F413F3FC3DA), SPH_C64(0x9A1954FC545419CE), + SPH_C64(0x325841C341415873), SPH_C64(0x74BA809D8080BAF4), + SPH_C64(0x2EDBCC49CCCCDBE2), SPH_C64(0x44A486978686A4C2), + SPH_C64(0xF145B3C8B3B34542), SPH_C64(0xC0781828181878D8), + SPH_C64(0x6D962E722E2E9643), SPH_C64(0x821657F9575716D5), + SPH_C64(0x301E060A06061E36), SPH_C64(0x37F762A66262F755), + SPH_C64(0xF303F401F4F40307), SPH_C64(0xADEE365A3636EE9B), + SPH_C64(0xC6B2D16ED1D1B217), SPH_C64(0x7FDA6BBD6B6BDA14), + SPH_C64(0xD8771B2D1B1B77C3), SPH_C64(0x0FEC65AF6565EC6A), + SPH_C64(0x8FBC759F7575BCFA), SPH_C64(0x8050103010105090), + SPH_C64(0x9E95DA73DADA9544), SPH_C64(0x727049DB4949703B), + SPH_C64(0x2DBE266A2626BE0B), SPH_C64(0x9B3AF916F9F93A62), + 
SPH_C64(0x16C0CB40CBCBC0DD), SPH_C64(0x17E366AA6666E371), + SPH_C64(0x6B5CE734E7E75C8C), SPH_C64(0xB968BAD3BABA6803), + SPH_C64(0x192CAEEFAEAE2CB7), SPH_C64(0xBA0D50F050500DEA), + SPH_C64(0xAA0752F6525207F8), SPH_C64(0x313DABE0ABAB3D9A), + SPH_C64(0x2811050F0505112D), SPH_C64(0xD317F00DF0F01723), + SPH_C64(0x68390D170D0D3965), SPH_C64(0xBFA273957373A2CC), + SPH_C64(0xC5D73B4D3B3BD7FE), SPH_C64(0x2014040C04041424), + SPH_C64(0x1DA020602020A03D), SPH_C64(0xA321FE1FFEFE215D), + SPH_C64(0xA68EDD7ADDDD8E7B), SPH_C64(0xFB06F502F5F5060E), + SPH_C64(0xC95EB4C1B4B45E7D), SPH_C64(0xC23E5FE15F5F3E9D), + SPH_C64(0x50220A1E0A0A225A), SPH_C64(0xC15BB5C2B5B55B74), + SPH_C64(0x4EE7C05DC0C0E78E), SPH_C64(0x691AA0FDA0A01AC9), + SPH_C64(0xAFA871937171A8DE), SPH_C64(0x410BA5F2A5A50BE4), + SPH_C64(0x75992D772D2D9958), SPH_C64(0x27FD60A06060FD47), + SPH_C64(0xB7A772967272A7C5), SPH_C64(0xECE593A89393E57F), + SPH_C64(0xD5DD394B3939DDEC), SPH_C64(0x4028081808082848), + SPH_C64(0x6CB583988383B5EF), SPH_C64(0x15A521632121A534), + SPH_C64(0xDA315CE45C5C3186), SPH_C64(0x4CA187948787A1CB), + SPH_C64(0xE14FB1CEB1B14F50), SPH_C64(0x5347E03DE0E047B3), + SPH_C64(0x0000000000000000), SPH_C64(0x56E8C358C3C3E895), + SPH_C64(0x905A123612125A82), SPH_C64(0xFCEF91AE9191EF6D), + SPH_C64(0x24988A838A8A98AE), SPH_C64(0x100A020602020A12), + SPH_C64(0xE06C1C241C1C6CFC), SPH_C64(0x6359E637E6E65985), + SPH_C64(0x124C45CF45454C57), SPH_C64(0x5EEDC25BC2C2ED9C), + SPH_C64(0x6EF3C451C4C4F3AA), SPH_C64(0xBB2EFD1AFDFD2E46), + SPH_C64(0x9179BFDCBFBF792E), SPH_C64(0x1A4944CC4444495E), + SPH_C64(0x611FA1FEA1A11FC0), SPH_C64(0x5A614CD44C4C6116), + SPH_C64(0x85FF33553333FFB6), SPH_C64(0x66F6C552C5C5F6A3), + SPH_C64(0x54AE84918484AED0), SPH_C64(0x05AF23652323AF26), + SPH_C64(0xC7917C847C7C91BB), SPH_C64(0xE94AB0CDB0B04A59), + SPH_C64(0x35B1256F2525B110), SPH_C64(0xA841153F151541BD), + SPH_C64(0xB5E1355F3535E180), SPH_C64(0x6FD069BB6969D006), + SPH_C64(0xAB24FF1CFFFF2454), SPH_C64(0xD4FE94A19494FE40), + SPH_C64(0x52644DD74D4D641F), SPH_C64(0xA7AD70907070ADD7), + SPH_C64(0x7910A2FBA2A210DB), SPH_C64(0x1129AFECAFAF29BE), + SPH_C64(0x26DECD4ACDCDDEEB), SPH_C64(0xFEA9D667D6D6A928), + SPH_C64(0x47C16CB46C6CC12B), SPH_C64(0xD151B7C4B7B75166), + SPH_C64(0x933FF815F8F83F6B), SPH_C64(0x482D091B09092D41), + SPH_C64(0xCB18F308F3F31838), SPH_C64(0x1FE667A96767E678), + SPH_C64(0x490EA4F1A4A40EED), SPH_C64(0x0365EA23EAEA65E9), + SPH_C64(0x337BEC29ECEC7BDF), SPH_C64(0xD954B6C7B6B6546F), + SPH_C64(0xEEA3D461D4D4A33A), SPH_C64(0xDEBDD26BD2D2BD0C), + SPH_C64(0xA044143C141444B4), SPH_C64(0xF0661E221E1E66EE), + SPH_C64(0x5B42E13EE1E142BA), SPH_C64(0x3DB4246C2424B419), + SPH_C64(0xDDD838483838D8E5), SPH_C64(0x7EF9C657C6C6F9B8), + SPH_C64(0x9690DB70DBDB904D), SPH_C64(0x627A4BDD4B4B7A29), + SPH_C64(0xF78F7A8E7A7A8F8D), SPH_C64(0xCDD23A4E3A3AD2F7), + SPH_C64(0xBE81DE7FDEDE8160), SPH_C64(0xCA3B5EE25E5E3B94), + SPH_C64(0xB684DF7CDFDF8469), SPH_C64(0xDCFB95A29595FB49), + SPH_C64(0xB32BFC19FCFC2B4F), SPH_C64(0x3938AAE3AAAA3893), + SPH_C64(0xF6ACD764D7D7AC21), SPH_C64(0x3ED1CE4FCECED1F0), + SPH_C64(0x381B070907071B3F), SPH_C64(0x78330F110F0F3377), + SPH_C64(0xF5C93D473D3DC9C8), SPH_C64(0xFA2558E8585825A2), + SPH_C64(0xA4C89AB39A9AC83E), SPH_C64(0xB4C298B59898C22C), + SPH_C64(0x94D69CB99C9CD608), SPH_C64(0xC31DF20BF2F21D31), + SPH_C64(0x5101A7F4A7A701F6), SPH_C64(0x8855113311115599), + SPH_C64(0xD79B7E827E7E9BA9), SPH_C64(0x2C9D8B808B8B9DA7), + SPH_C64(0x225243C543435261), SPH_C64(0x180F030503030F1B), + SPH_C64(0x434DE23BE2E24DA1), SPH_C64(0xAE8BDC79DCDC8B72), + 
SPH_C64(0x7B56E532E5E5569E), SPH_C64(0xF940B2CBB2B2404B), + SPH_C64(0x4A6B4ED24E4E6B04), SPH_C64(0x76FCC754C7C7FCB1), + SPH_C64(0x4FC46DB76D6DC422), SPH_C64(0x1B6AE926E9E96AF2), + SPH_C64(0x25BB27692727BB02), SPH_C64(0x3A5D40C040405D7A), + SPH_C64(0x8E9FD875D8D89F56), SPH_C64(0xA5EB37593737EB92), + SPH_C64(0xE4E092AB9292E076), SPH_C64(0x0C898F8C8F8F8983), + SPH_C64(0x0805010301010509), SPH_C64(0xE8691D271D1D69F5), + SPH_C64(0xA20253F5535302F1), SPH_C64(0xEDC63E423E3EC6D3), + SPH_C64(0xF22059EB595920AB), SPH_C64(0x46E2C15EC1C1E287), + SPH_C64(0x426E4FD14F4F6E0D), SPH_C64(0x8DFA32563232FABF), + SPH_C64(0xB04E163A16164EA6), SPH_C64(0x8335FA13FAFA3579), + SPH_C64(0x87B9749C7474B9F3), SPH_C64(0x8B30FB10FBFB3070), + SPH_C64(0x3FF263A56363F25C), SPH_C64(0x8CD99FBC9F9FD913), + SPH_C64(0xBDE4345C3434E489), SPH_C64(0xD0721A2E1A1A72CA), + SPH_C64(0x4D822A7E2A2A8267), SPH_C64(0xEA2F5AEE5A5A2FB0), + SPH_C64(0x1C838D8A8D8D8391), SPH_C64(0x06CAC946C9C9CACF), + SPH_C64(0x36D4CF4CCFCFD4F9), SPH_C64(0xE309F607F6F60915), + SPH_C64(0xF4EA90AD9090EA64), SPH_C64(0x5D88287828288875), + SPH_C64(0x34928885888892BC), SPH_C64(0xACCD9BB09B9BCD37), + SPH_C64(0x95F531533131F5A4), SPH_C64(0x70360E120E0E367E), + SPH_C64(0x8173BDDABDBD733C), SPH_C64(0x6A7F4ADE4A4A7F20), + SPH_C64(0x136FE825E8E86FFB), SPH_C64(0xC4F496A79696F452), + SPH_C64(0x5904A6F7A6A604FF), SPH_C64(0x603C0C140C0C3C6C), + SPH_C64(0x0ECFC845C8C8CFC6), SPH_C64(0xEF80798B79798096), + SPH_C64(0x8976BCD9BCBC7635), SPH_C64(0x997CBEDFBEBE7C27), + SPH_C64(0x2B74EF2CEFEF74C4), SPH_C64(0x57CB6EB26E6ECB39), + SPH_C64(0x0A4346CA4646434C), SPH_C64(0xCCF197A49797F15B), + SPH_C64(0xE22A5BED5B5B2AB9), SPH_C64(0x3B7EED2AEDED7ED6), + SPH_C64(0xC87D192B19197DD1), SPH_C64(0x869AD976D9D99A5F), + SPH_C64(0x0926ACE9ACAC26A5), SPH_C64(0xBCC799B69999C725), + SPH_C64(0x2932A8E5A8A83281), SPH_C64(0x558D297B29298D7C), + SPH_C64(0x07E964AC6464E963), SPH_C64(0xF8631F211F1F63E7), + SPH_C64(0x0123ADEAADAD23AC), SPH_C64(0x921C55FF55551CC7), + SPH_C64(0x985F133513135F8B), SPH_C64(0xB16DBBD0BBBB6D0A), + SPH_C64(0xEB0CF704F7F70C1C), SPH_C64(0x5FCE6FB16F6FCE30), + SPH_C64(0xA167B9D6B9B96718), SPH_C64(0x024647C947474645), + SPH_C64(0x65932F712F2F934A), SPH_C64(0x2371EE2FEEEE71CD), + SPH_C64(0xA962B8D5B8B86211), SPH_C64(0xFF8A7B8D7B7B8A84), + SPH_C64(0x3C978986898997B5), SPH_C64(0x9DF030503030F0AD), + SPH_C64(0xD6B8D368D3D3B805), SPH_C64(0xDF9E7F817F7F9EA0), + SPH_C64(0x97B3769A7676B3E1), SPH_C64(0x64B0829B8282B0E6) +}; + +static const sph_u64 old0_T3[256] = { + SPH_C64(0xD568B86868D50F67), SPH_C64(0xB7D06DD0D0B71ECE), + SPH_C64(0x60EB20EBEB60E00B), SPH_C64(0x872B7D2B2B876E45), + SPH_C64(0x7548D8484875327A), SPH_C64(0xD39DBA9D9DD3019C), + SPH_C64(0xDF6ABE6A6ADF1D77), SPH_C64(0x53E431E4E4539773), + SPH_C64(0x48E338E3E348A84B), SPH_C64(0x15A3F8A3A315D271), + SPH_C64(0x1356FA565613DC8A), SPH_C64(0xBF819E8181BFFD7C), + SPH_C64(0x947D877D7D94B2CF), SPH_C64(0x12F10EF1F1122ADB), + SPH_C64(0xAB85928585ABD95C), SPH_C64(0xDC9EBF9E9EDC1A84), + SPH_C64(0x9C2C742C2C9C517D), SPH_C64(0x8C8E8F8E8E8C8A04), + SPH_C64(0x8578887878859FE7), SPH_C64(0xC5CA43CACAC5D41E), + SPH_C64(0x4B173917174BAFB8), SPH_C64(0x37A9E6A9A9378821), + SPH_C64(0xF861A36161F84E2F), SPH_C64(0xA6D562D5D5A633E6), + SPH_C64(0x345DE75D5D348FD2), SPH_C64(0x270B1D0B0B275358), + SPH_C64(0x868C898C8C869814), SPH_C64(0xCC3C443C3CCCC1FD), + SPH_C64(0xB677997777B6E89F), SPH_C64(0x0851F3515108E3B2), + SPH_C64(0xAA22662222AA2F0D), SPH_C64(0x5742C6424257682A), + SPH_C64(0xC33F413F3FC3DAE5), SPH_C64(0x1954FC545419CE9A), + SPH_C64(0x5841C34141587332), 
SPH_C64(0xBA809D8080BAF474), + SPH_C64(0xDBCC49CCCCDBE22E), SPH_C64(0xA486978686A4C244), + SPH_C64(0x45B3C8B3B34542F1), SPH_C64(0x781828181878D8C0), + SPH_C64(0x962E722E2E96436D), SPH_C64(0x1657F9575716D582), + SPH_C64(0x1E060A06061E3630), SPH_C64(0xF762A66262F75537), + SPH_C64(0x03F401F4F40307F3), SPH_C64(0xEE365A3636EE9BAD), + SPH_C64(0xB2D16ED1D1B217C6), SPH_C64(0xDA6BBD6B6BDA147F), + SPH_C64(0x771B2D1B1B77C3D8), SPH_C64(0xEC65AF6565EC6A0F), + SPH_C64(0xBC759F7575BCFA8F), SPH_C64(0x5010301010509080), + SPH_C64(0x95DA73DADA95449E), SPH_C64(0x7049DB4949703B72), + SPH_C64(0xBE266A2626BE0B2D), SPH_C64(0x3AF916F9F93A629B), + SPH_C64(0xC0CB40CBCBC0DD16), SPH_C64(0xE366AA6666E37117), + SPH_C64(0x5CE734E7E75C8C6B), SPH_C64(0x68BAD3BABA6803B9), + SPH_C64(0x2CAEEFAEAE2CB719), SPH_C64(0x0D50F050500DEABA), + SPH_C64(0x0752F6525207F8AA), SPH_C64(0x3DABE0ABAB3D9A31), + SPH_C64(0x11050F0505112D28), SPH_C64(0x17F00DF0F01723D3), + SPH_C64(0x390D170D0D396568), SPH_C64(0xA273957373A2CCBF), + SPH_C64(0xD73B4D3B3BD7FEC5), SPH_C64(0x14040C0404142420), + SPH_C64(0xA020602020A03D1D), SPH_C64(0x21FE1FFEFE215DA3), + SPH_C64(0x8EDD7ADDDD8E7BA6), SPH_C64(0x06F502F5F5060EFB), + SPH_C64(0x5EB4C1B4B45E7DC9), SPH_C64(0x3E5FE15F5F3E9DC2), + SPH_C64(0x220A1E0A0A225A50), SPH_C64(0x5BB5C2B5B55B74C1), + SPH_C64(0xE7C05DC0C0E78E4E), SPH_C64(0x1AA0FDA0A01AC969), + SPH_C64(0xA871937171A8DEAF), SPH_C64(0x0BA5F2A5A50BE441), + SPH_C64(0x992D772D2D995875), SPH_C64(0xFD60A06060FD4727), + SPH_C64(0xA772967272A7C5B7), SPH_C64(0xE593A89393E57FEC), + SPH_C64(0xDD394B3939DDECD5), SPH_C64(0x2808180808284840), + SPH_C64(0xB583988383B5EF6C), SPH_C64(0xA521632121A53415), + SPH_C64(0x315CE45C5C3186DA), SPH_C64(0xA187948787A1CB4C), + SPH_C64(0x4FB1CEB1B14F50E1), SPH_C64(0x47E03DE0E047B353), + SPH_C64(0x0000000000000000), SPH_C64(0xE8C358C3C3E89556), + SPH_C64(0x5A123612125A8290), SPH_C64(0xEF91AE9191EF6DFC), + SPH_C64(0x988A838A8A98AE24), SPH_C64(0x0A020602020A1210), + SPH_C64(0x6C1C241C1C6CFCE0), SPH_C64(0x59E637E6E6598563), + SPH_C64(0x4C45CF45454C5712), SPH_C64(0xEDC25BC2C2ED9C5E), + SPH_C64(0xF3C451C4C4F3AA6E), SPH_C64(0x2EFD1AFDFD2E46BB), + SPH_C64(0x79BFDCBFBF792E91), SPH_C64(0x4944CC4444495E1A), + SPH_C64(0x1FA1FEA1A11FC061), SPH_C64(0x614CD44C4C61165A), + SPH_C64(0xFF33553333FFB685), SPH_C64(0xF6C552C5C5F6A366), + SPH_C64(0xAE84918484AED054), SPH_C64(0xAF23652323AF2605), + SPH_C64(0x917C847C7C91BBC7), SPH_C64(0x4AB0CDB0B04A59E9), + SPH_C64(0xB1256F2525B11035), SPH_C64(0x41153F151541BDA8), + SPH_C64(0xE1355F3535E180B5), SPH_C64(0xD069BB6969D0066F), + SPH_C64(0x24FF1CFFFF2454AB), SPH_C64(0xFE94A19494FE40D4), + SPH_C64(0x644DD74D4D641F52), SPH_C64(0xAD70907070ADD7A7), + SPH_C64(0x10A2FBA2A210DB79), SPH_C64(0x29AFECAFAF29BE11), + SPH_C64(0xDECD4ACDCDDEEB26), SPH_C64(0xA9D667D6D6A928FE), + SPH_C64(0xC16CB46C6CC12B47), SPH_C64(0x51B7C4B7B75166D1), + SPH_C64(0x3FF815F8F83F6B93), SPH_C64(0x2D091B09092D4148), + SPH_C64(0x18F308F3F31838CB), SPH_C64(0xE667A96767E6781F), + SPH_C64(0x0EA4F1A4A40EED49), SPH_C64(0x65EA23EAEA65E903), + SPH_C64(0x7BEC29ECEC7BDF33), SPH_C64(0x54B6C7B6B6546FD9), + SPH_C64(0xA3D461D4D4A33AEE), SPH_C64(0xBDD26BD2D2BD0CDE), + SPH_C64(0x44143C141444B4A0), SPH_C64(0x661E221E1E66EEF0), + SPH_C64(0x42E13EE1E142BA5B), SPH_C64(0xB4246C2424B4193D), + SPH_C64(0xD838483838D8E5DD), SPH_C64(0xF9C657C6C6F9B87E), + SPH_C64(0x90DB70DBDB904D96), SPH_C64(0x7A4BDD4B4B7A2962), + SPH_C64(0x8F7A8E7A7A8F8DF7), SPH_C64(0xD23A4E3A3AD2F7CD), + SPH_C64(0x81DE7FDEDE8160BE), SPH_C64(0x3B5EE25E5E3B94CA), + SPH_C64(0x84DF7CDFDF8469B6), 
SPH_C64(0xFB95A29595FB49DC), + SPH_C64(0x2BFC19FCFC2B4FB3), SPH_C64(0x38AAE3AAAA389339), + SPH_C64(0xACD764D7D7AC21F6), SPH_C64(0xD1CE4FCECED1F03E), + SPH_C64(0x1B070907071B3F38), SPH_C64(0x330F110F0F337778), + SPH_C64(0xC93D473D3DC9C8F5), SPH_C64(0x2558E8585825A2FA), + SPH_C64(0xC89AB39A9AC83EA4), SPH_C64(0xC298B59898C22CB4), + SPH_C64(0xD69CB99C9CD60894), SPH_C64(0x1DF20BF2F21D31C3), + SPH_C64(0x01A7F4A7A701F651), SPH_C64(0x5511331111559988), + SPH_C64(0x9B7E827E7E9BA9D7), SPH_C64(0x9D8B808B8B9DA72C), + SPH_C64(0x5243C54343526122), SPH_C64(0x0F030503030F1B18), + SPH_C64(0x4DE23BE2E24DA143), SPH_C64(0x8BDC79DCDC8B72AE), + SPH_C64(0x56E532E5E5569E7B), SPH_C64(0x40B2CBB2B2404BF9), + SPH_C64(0x6B4ED24E4E6B044A), SPH_C64(0xFCC754C7C7FCB176), + SPH_C64(0xC46DB76D6DC4224F), SPH_C64(0x6AE926E9E96AF21B), + SPH_C64(0xBB27692727BB0225), SPH_C64(0x5D40C040405D7A3A), + SPH_C64(0x9FD875D8D89F568E), SPH_C64(0xEB37593737EB92A5), + SPH_C64(0xE092AB9292E076E4), SPH_C64(0x898F8C8F8F89830C), + SPH_C64(0x0501030101050908), SPH_C64(0x691D271D1D69F5E8), + SPH_C64(0x0253F5535302F1A2), SPH_C64(0xC63E423E3EC6D3ED), + SPH_C64(0x2059EB595920ABF2), SPH_C64(0xE2C15EC1C1E28746), + SPH_C64(0x6E4FD14F4F6E0D42), SPH_C64(0xFA32563232FABF8D), + SPH_C64(0x4E163A16164EA6B0), SPH_C64(0x35FA13FAFA357983), + SPH_C64(0xB9749C7474B9F387), SPH_C64(0x30FB10FBFB30708B), + SPH_C64(0xF263A56363F25C3F), SPH_C64(0xD99FBC9F9FD9138C), + SPH_C64(0xE4345C3434E489BD), SPH_C64(0x721A2E1A1A72CAD0), + SPH_C64(0x822A7E2A2A82674D), SPH_C64(0x2F5AEE5A5A2FB0EA), + SPH_C64(0x838D8A8D8D83911C), SPH_C64(0xCAC946C9C9CACF06), + SPH_C64(0xD4CF4CCFCFD4F936), SPH_C64(0x09F607F6F60915E3), + SPH_C64(0xEA90AD9090EA64F4), SPH_C64(0x882878282888755D), + SPH_C64(0x928885888892BC34), SPH_C64(0xCD9BB09B9BCD37AC), + SPH_C64(0xF531533131F5A495), SPH_C64(0x360E120E0E367E70), + SPH_C64(0x73BDDABDBD733C81), SPH_C64(0x7F4ADE4A4A7F206A), + SPH_C64(0x6FE825E8E86FFB13), SPH_C64(0xF496A79696F452C4), + SPH_C64(0x04A6F7A6A604FF59), SPH_C64(0x3C0C140C0C3C6C60), + SPH_C64(0xCFC845C8C8CFC60E), SPH_C64(0x80798B79798096EF), + SPH_C64(0x76BCD9BCBC763589), SPH_C64(0x7CBEDFBEBE7C2799), + SPH_C64(0x74EF2CEFEF74C42B), SPH_C64(0xCB6EB26E6ECB3957), + SPH_C64(0x4346CA4646434C0A), SPH_C64(0xF197A49797F15BCC), + SPH_C64(0x2A5BED5B5B2AB9E2), SPH_C64(0x7EED2AEDED7ED63B), + SPH_C64(0x7D192B19197DD1C8), SPH_C64(0x9AD976D9D99A5F86), + SPH_C64(0x26ACE9ACAC26A509), SPH_C64(0xC799B69999C725BC), + SPH_C64(0x32A8E5A8A8328129), SPH_C64(0x8D297B29298D7C55), + SPH_C64(0xE964AC6464E96307), SPH_C64(0x631F211F1F63E7F8), + SPH_C64(0x23ADEAADAD23AC01), SPH_C64(0x1C55FF55551CC792), + SPH_C64(0x5F133513135F8B98), SPH_C64(0x6DBBD0BBBB6D0AB1), + SPH_C64(0x0CF704F7F70C1CEB), SPH_C64(0xCE6FB16F6FCE305F), + SPH_C64(0x67B9D6B9B96718A1), SPH_C64(0x4647C94747464502), + SPH_C64(0x932F712F2F934A65), SPH_C64(0x71EE2FEEEE71CD23), + SPH_C64(0x62B8D5B8B86211A9), SPH_C64(0x8A7B8D7B7B8A84FF), + SPH_C64(0x978986898997B53C), SPH_C64(0xF030503030F0AD9D), + SPH_C64(0xB8D368D3D3B805D6), SPH_C64(0x9E7F817F7F9EA0DF), + SPH_C64(0xB3769A7676B3E197), SPH_C64(0xB0829B8282B0E664) +}; + +static const sph_u64 old0_T4[256] = { + SPH_C64(0x68B86868D50F67D5), SPH_C64(0xD06DD0D0B71ECEB7), + SPH_C64(0xEB20EBEB60E00B60), SPH_C64(0x2B7D2B2B876E4587), + SPH_C64(0x48D8484875327A75), SPH_C64(0x9DBA9D9DD3019CD3), + SPH_C64(0x6ABE6A6ADF1D77DF), SPH_C64(0xE431E4E453977353), + SPH_C64(0xE338E3E348A84B48), SPH_C64(0xA3F8A3A315D27115), + SPH_C64(0x56FA565613DC8A13), SPH_C64(0x819E8181BFFD7CBF), + SPH_C64(0x7D877D7D94B2CF94), SPH_C64(0xF10EF1F1122ADB12), + 
SPH_C64(0x85928585ABD95CAB), SPH_C64(0x9EBF9E9EDC1A84DC), + SPH_C64(0x2C742C2C9C517D9C), SPH_C64(0x8E8F8E8E8C8A048C), + SPH_C64(0x78887878859FE785), SPH_C64(0xCA43CACAC5D41EC5), + SPH_C64(0x173917174BAFB84B), SPH_C64(0xA9E6A9A937882137), + SPH_C64(0x61A36161F84E2FF8), SPH_C64(0xD562D5D5A633E6A6), + SPH_C64(0x5DE75D5D348FD234), SPH_C64(0x0B1D0B0B27535827), + SPH_C64(0x8C898C8C86981486), SPH_C64(0x3C443C3CCCC1FDCC), + SPH_C64(0x77997777B6E89FB6), SPH_C64(0x51F3515108E3B208), + SPH_C64(0x22662222AA2F0DAA), SPH_C64(0x42C6424257682A57), + SPH_C64(0x3F413F3FC3DAE5C3), SPH_C64(0x54FC545419CE9A19), + SPH_C64(0x41C3414158733258), SPH_C64(0x809D8080BAF474BA), + SPH_C64(0xCC49CCCCDBE22EDB), SPH_C64(0x86978686A4C244A4), + SPH_C64(0xB3C8B3B34542F145), SPH_C64(0x1828181878D8C078), + SPH_C64(0x2E722E2E96436D96), SPH_C64(0x57F9575716D58216), + SPH_C64(0x060A06061E36301E), SPH_C64(0x62A66262F75537F7), + SPH_C64(0xF401F4F40307F303), SPH_C64(0x365A3636EE9BADEE), + SPH_C64(0xD16ED1D1B217C6B2), SPH_C64(0x6BBD6B6BDA147FDA), + SPH_C64(0x1B2D1B1B77C3D877), SPH_C64(0x65AF6565EC6A0FEC), + SPH_C64(0x759F7575BCFA8FBC), SPH_C64(0x1030101050908050), + SPH_C64(0xDA73DADA95449E95), SPH_C64(0x49DB4949703B7270), + SPH_C64(0x266A2626BE0B2DBE), SPH_C64(0xF916F9F93A629B3A), + SPH_C64(0xCB40CBCBC0DD16C0), SPH_C64(0x66AA6666E37117E3), + SPH_C64(0xE734E7E75C8C6B5C), SPH_C64(0xBAD3BABA6803B968), + SPH_C64(0xAEEFAEAE2CB7192C), SPH_C64(0x50F050500DEABA0D), + SPH_C64(0x52F6525207F8AA07), SPH_C64(0xABE0ABAB3D9A313D), + SPH_C64(0x050F0505112D2811), SPH_C64(0xF00DF0F01723D317), + SPH_C64(0x0D170D0D39656839), SPH_C64(0x73957373A2CCBFA2), + SPH_C64(0x3B4D3B3BD7FEC5D7), SPH_C64(0x040C040414242014), + SPH_C64(0x20602020A03D1DA0), SPH_C64(0xFE1FFEFE215DA321), + SPH_C64(0xDD7ADDDD8E7BA68E), SPH_C64(0xF502F5F5060EFB06), + SPH_C64(0xB4C1B4B45E7DC95E), SPH_C64(0x5FE15F5F3E9DC23E), + SPH_C64(0x0A1E0A0A225A5022), SPH_C64(0xB5C2B5B55B74C15B), + SPH_C64(0xC05DC0C0E78E4EE7), SPH_C64(0xA0FDA0A01AC9691A), + SPH_C64(0x71937171A8DEAFA8), SPH_C64(0xA5F2A5A50BE4410B), + SPH_C64(0x2D772D2D99587599), SPH_C64(0x60A06060FD4727FD), + SPH_C64(0x72967272A7C5B7A7), SPH_C64(0x93A89393E57FECE5), + SPH_C64(0x394B3939DDECD5DD), SPH_C64(0x0818080828484028), + SPH_C64(0x83988383B5EF6CB5), SPH_C64(0x21632121A53415A5), + SPH_C64(0x5CE45C5C3186DA31), SPH_C64(0x87948787A1CB4CA1), + SPH_C64(0xB1CEB1B14F50E14F), SPH_C64(0xE03DE0E047B35347), + SPH_C64(0x0000000000000000), SPH_C64(0xC358C3C3E89556E8), + SPH_C64(0x123612125A82905A), SPH_C64(0x91AE9191EF6DFCEF), + SPH_C64(0x8A838A8A98AE2498), SPH_C64(0x020602020A12100A), + SPH_C64(0x1C241C1C6CFCE06C), SPH_C64(0xE637E6E659856359), + SPH_C64(0x45CF45454C57124C), SPH_C64(0xC25BC2C2ED9C5EED), + SPH_C64(0xC451C4C4F3AA6EF3), SPH_C64(0xFD1AFDFD2E46BB2E), + SPH_C64(0xBFDCBFBF792E9179), SPH_C64(0x44CC4444495E1A49), + SPH_C64(0xA1FEA1A11FC0611F), SPH_C64(0x4CD44C4C61165A61), + SPH_C64(0x33553333FFB685FF), SPH_C64(0xC552C5C5F6A366F6), + SPH_C64(0x84918484AED054AE), SPH_C64(0x23652323AF2605AF), + SPH_C64(0x7C847C7C91BBC791), SPH_C64(0xB0CDB0B04A59E94A), + SPH_C64(0x256F2525B11035B1), SPH_C64(0x153F151541BDA841), + SPH_C64(0x355F3535E180B5E1), SPH_C64(0x69BB6969D0066FD0), + SPH_C64(0xFF1CFFFF2454AB24), SPH_C64(0x94A19494FE40D4FE), + SPH_C64(0x4DD74D4D641F5264), SPH_C64(0x70907070ADD7A7AD), + SPH_C64(0xA2FBA2A210DB7910), SPH_C64(0xAFECAFAF29BE1129), + SPH_C64(0xCD4ACDCDDEEB26DE), SPH_C64(0xD667D6D6A928FEA9), + SPH_C64(0x6CB46C6CC12B47C1), SPH_C64(0xB7C4B7B75166D151), + SPH_C64(0xF815F8F83F6B933F), SPH_C64(0x091B09092D41482D), + 
SPH_C64(0xF308F3F31838CB18), SPH_C64(0x67A96767E6781FE6), + SPH_C64(0xA4F1A4A40EED490E), SPH_C64(0xEA23EAEA65E90365), + SPH_C64(0xEC29ECEC7BDF337B), SPH_C64(0xB6C7B6B6546FD954), + SPH_C64(0xD461D4D4A33AEEA3), SPH_C64(0xD26BD2D2BD0CDEBD), + SPH_C64(0x143C141444B4A044), SPH_C64(0x1E221E1E66EEF066), + SPH_C64(0xE13EE1E142BA5B42), SPH_C64(0x246C2424B4193DB4), + SPH_C64(0x38483838D8E5DDD8), SPH_C64(0xC657C6C6F9B87EF9), + SPH_C64(0xDB70DBDB904D9690), SPH_C64(0x4BDD4B4B7A29627A), + SPH_C64(0x7A8E7A7A8F8DF78F), SPH_C64(0x3A4E3A3AD2F7CDD2), + SPH_C64(0xDE7FDEDE8160BE81), SPH_C64(0x5EE25E5E3B94CA3B), + SPH_C64(0xDF7CDFDF8469B684), SPH_C64(0x95A29595FB49DCFB), + SPH_C64(0xFC19FCFC2B4FB32B), SPH_C64(0xAAE3AAAA38933938), + SPH_C64(0xD764D7D7AC21F6AC), SPH_C64(0xCE4FCECED1F03ED1), + SPH_C64(0x070907071B3F381B), SPH_C64(0x0F110F0F33777833), + SPH_C64(0x3D473D3DC9C8F5C9), SPH_C64(0x58E8585825A2FA25), + SPH_C64(0x9AB39A9AC83EA4C8), SPH_C64(0x98B59898C22CB4C2), + SPH_C64(0x9CB99C9CD60894D6), SPH_C64(0xF20BF2F21D31C31D), + SPH_C64(0xA7F4A7A701F65101), SPH_C64(0x1133111155998855), + SPH_C64(0x7E827E7E9BA9D79B), SPH_C64(0x8B808B8B9DA72C9D), + SPH_C64(0x43C5434352612252), SPH_C64(0x030503030F1B180F), + SPH_C64(0xE23BE2E24DA1434D), SPH_C64(0xDC79DCDC8B72AE8B), + SPH_C64(0xE532E5E5569E7B56), SPH_C64(0xB2CBB2B2404BF940), + SPH_C64(0x4ED24E4E6B044A6B), SPH_C64(0xC754C7C7FCB176FC), + SPH_C64(0x6DB76D6DC4224FC4), SPH_C64(0xE926E9E96AF21B6A), + SPH_C64(0x27692727BB0225BB), SPH_C64(0x40C040405D7A3A5D), + SPH_C64(0xD875D8D89F568E9F), SPH_C64(0x37593737EB92A5EB), + SPH_C64(0x92AB9292E076E4E0), SPH_C64(0x8F8C8F8F89830C89), + SPH_C64(0x0103010105090805), SPH_C64(0x1D271D1D69F5E869), + SPH_C64(0x53F5535302F1A202), SPH_C64(0x3E423E3EC6D3EDC6), + SPH_C64(0x59EB595920ABF220), SPH_C64(0xC15EC1C1E28746E2), + SPH_C64(0x4FD14F4F6E0D426E), SPH_C64(0x32563232FABF8DFA), + SPH_C64(0x163A16164EA6B04E), SPH_C64(0xFA13FAFA35798335), + SPH_C64(0x749C7474B9F387B9), SPH_C64(0xFB10FBFB30708B30), + SPH_C64(0x63A56363F25C3FF2), SPH_C64(0x9FBC9F9FD9138CD9), + SPH_C64(0x345C3434E489BDE4), SPH_C64(0x1A2E1A1A72CAD072), + SPH_C64(0x2A7E2A2A82674D82), SPH_C64(0x5AEE5A5A2FB0EA2F), + SPH_C64(0x8D8A8D8D83911C83), SPH_C64(0xC946C9C9CACF06CA), + SPH_C64(0xCF4CCFCFD4F936D4), SPH_C64(0xF607F6F60915E309), + SPH_C64(0x90AD9090EA64F4EA), SPH_C64(0x2878282888755D88), + SPH_C64(0x8885888892BC3492), SPH_C64(0x9BB09B9BCD37ACCD), + SPH_C64(0x31533131F5A495F5), SPH_C64(0x0E120E0E367E7036), + SPH_C64(0xBDDABDBD733C8173), SPH_C64(0x4ADE4A4A7F206A7F), + SPH_C64(0xE825E8E86FFB136F), SPH_C64(0x96A79696F452C4F4), + SPH_C64(0xA6F7A6A604FF5904), SPH_C64(0x0C140C0C3C6C603C), + SPH_C64(0xC845C8C8CFC60ECF), SPH_C64(0x798B79798096EF80), + SPH_C64(0xBCD9BCBC76358976), SPH_C64(0xBEDFBEBE7C27997C), + SPH_C64(0xEF2CEFEF74C42B74), SPH_C64(0x6EB26E6ECB3957CB), + SPH_C64(0x46CA4646434C0A43), SPH_C64(0x97A49797F15BCCF1), + SPH_C64(0x5BED5B5B2AB9E22A), SPH_C64(0xED2AEDED7ED63B7E), + SPH_C64(0x192B19197DD1C87D), SPH_C64(0xD976D9D99A5F869A), + SPH_C64(0xACE9ACAC26A50926), SPH_C64(0x99B69999C725BCC7), + SPH_C64(0xA8E5A8A832812932), SPH_C64(0x297B29298D7C558D), + SPH_C64(0x64AC6464E96307E9), SPH_C64(0x1F211F1F63E7F863), + SPH_C64(0xADEAADAD23AC0123), SPH_C64(0x55FF55551CC7921C), + SPH_C64(0x133513135F8B985F), SPH_C64(0xBBD0BBBB6D0AB16D), + SPH_C64(0xF704F7F70C1CEB0C), SPH_C64(0x6FB16F6FCE305FCE), + SPH_C64(0xB9D6B9B96718A167), SPH_C64(0x47C9474746450246), + SPH_C64(0x2F712F2F934A6593), SPH_C64(0xEE2FEEEE71CD2371), + SPH_C64(0xB8D5B8B86211A962), SPH_C64(0x7B8D7B7B8A84FF8A), + 
SPH_C64(0x8986898997B53C97), SPH_C64(0x30503030F0AD9DF0), + SPH_C64(0xD368D3D3B805D6B8), SPH_C64(0x7F817F7F9EA0DF9E), + SPH_C64(0x769A7676B3E197B3), SPH_C64(0x829B8282B0E664B0) +}; + +static const sph_u64 old0_T5[256] = { + SPH_C64(0xB86868D50F67D568), SPH_C64(0x6DD0D0B71ECEB7D0), + SPH_C64(0x20EBEB60E00B60EB), SPH_C64(0x7D2B2B876E45872B), + SPH_C64(0xD8484875327A7548), SPH_C64(0xBA9D9DD3019CD39D), + SPH_C64(0xBE6A6ADF1D77DF6A), SPH_C64(0x31E4E453977353E4), + SPH_C64(0x38E3E348A84B48E3), SPH_C64(0xF8A3A315D27115A3), + SPH_C64(0xFA565613DC8A1356), SPH_C64(0x9E8181BFFD7CBF81), + SPH_C64(0x877D7D94B2CF947D), SPH_C64(0x0EF1F1122ADB12F1), + SPH_C64(0x928585ABD95CAB85), SPH_C64(0xBF9E9EDC1A84DC9E), + SPH_C64(0x742C2C9C517D9C2C), SPH_C64(0x8F8E8E8C8A048C8E), + SPH_C64(0x887878859FE78578), SPH_C64(0x43CACAC5D41EC5CA), + SPH_C64(0x3917174BAFB84B17), SPH_C64(0xE6A9A937882137A9), + SPH_C64(0xA36161F84E2FF861), SPH_C64(0x62D5D5A633E6A6D5), + SPH_C64(0xE75D5D348FD2345D), SPH_C64(0x1D0B0B275358270B), + SPH_C64(0x898C8C869814868C), SPH_C64(0x443C3CCCC1FDCC3C), + SPH_C64(0x997777B6E89FB677), SPH_C64(0xF3515108E3B20851), + SPH_C64(0x662222AA2F0DAA22), SPH_C64(0xC6424257682A5742), + SPH_C64(0x413F3FC3DAE5C33F), SPH_C64(0xFC545419CE9A1954), + SPH_C64(0xC341415873325841), SPH_C64(0x9D8080BAF474BA80), + SPH_C64(0x49CCCCDBE22EDBCC), SPH_C64(0x978686A4C244A486), + SPH_C64(0xC8B3B34542F145B3), SPH_C64(0x28181878D8C07818), + SPH_C64(0x722E2E96436D962E), SPH_C64(0xF9575716D5821657), + SPH_C64(0x0A06061E36301E06), SPH_C64(0xA66262F75537F762), + SPH_C64(0x01F4F40307F303F4), SPH_C64(0x5A3636EE9BADEE36), + SPH_C64(0x6ED1D1B217C6B2D1), SPH_C64(0xBD6B6BDA147FDA6B), + SPH_C64(0x2D1B1B77C3D8771B), SPH_C64(0xAF6565EC6A0FEC65), + SPH_C64(0x9F7575BCFA8FBC75), SPH_C64(0x3010105090805010), + SPH_C64(0x73DADA95449E95DA), SPH_C64(0xDB4949703B727049), + SPH_C64(0x6A2626BE0B2DBE26), SPH_C64(0x16F9F93A629B3AF9), + SPH_C64(0x40CBCBC0DD16C0CB), SPH_C64(0xAA6666E37117E366), + SPH_C64(0x34E7E75C8C6B5CE7), SPH_C64(0xD3BABA6803B968BA), + SPH_C64(0xEFAEAE2CB7192CAE), SPH_C64(0xF050500DEABA0D50), + SPH_C64(0xF6525207F8AA0752), SPH_C64(0xE0ABAB3D9A313DAB), + SPH_C64(0x0F0505112D281105), SPH_C64(0x0DF0F01723D317F0), + SPH_C64(0x170D0D396568390D), SPH_C64(0x957373A2CCBFA273), + SPH_C64(0x4D3B3BD7FEC5D73B), SPH_C64(0x0C04041424201404), + SPH_C64(0x602020A03D1DA020), SPH_C64(0x1FFEFE215DA321FE), + SPH_C64(0x7ADDDD8E7BA68EDD), SPH_C64(0x02F5F5060EFB06F5), + SPH_C64(0xC1B4B45E7DC95EB4), SPH_C64(0xE15F5F3E9DC23E5F), + SPH_C64(0x1E0A0A225A50220A), SPH_C64(0xC2B5B55B74C15BB5), + SPH_C64(0x5DC0C0E78E4EE7C0), SPH_C64(0xFDA0A01AC9691AA0), + SPH_C64(0x937171A8DEAFA871), SPH_C64(0xF2A5A50BE4410BA5), + SPH_C64(0x772D2D995875992D), SPH_C64(0xA06060FD4727FD60), + SPH_C64(0x967272A7C5B7A772), SPH_C64(0xA89393E57FECE593), + SPH_C64(0x4B3939DDECD5DD39), SPH_C64(0x1808082848402808), + SPH_C64(0x988383B5EF6CB583), SPH_C64(0x632121A53415A521), + SPH_C64(0xE45C5C3186DA315C), SPH_C64(0x948787A1CB4CA187), + SPH_C64(0xCEB1B14F50E14FB1), SPH_C64(0x3DE0E047B35347E0), + SPH_C64(0x0000000000000000), SPH_C64(0x58C3C3E89556E8C3), + SPH_C64(0x3612125A82905A12), SPH_C64(0xAE9191EF6DFCEF91), + SPH_C64(0x838A8A98AE24988A), SPH_C64(0x0602020A12100A02), + SPH_C64(0x241C1C6CFCE06C1C), SPH_C64(0x37E6E659856359E6), + SPH_C64(0xCF45454C57124C45), SPH_C64(0x5BC2C2ED9C5EEDC2), + SPH_C64(0x51C4C4F3AA6EF3C4), SPH_C64(0x1AFDFD2E46BB2EFD), + SPH_C64(0xDCBFBF792E9179BF), SPH_C64(0xCC4444495E1A4944), + SPH_C64(0xFEA1A11FC0611FA1), SPH_C64(0xD44C4C61165A614C), + SPH_C64(0x553333FFB685FF33), 
SPH_C64(0x52C5C5F6A366F6C5), + SPH_C64(0x918484AED054AE84), SPH_C64(0x652323AF2605AF23), + SPH_C64(0x847C7C91BBC7917C), SPH_C64(0xCDB0B04A59E94AB0), + SPH_C64(0x6F2525B11035B125), SPH_C64(0x3F151541BDA84115), + SPH_C64(0x5F3535E180B5E135), SPH_C64(0xBB6969D0066FD069), + SPH_C64(0x1CFFFF2454AB24FF), SPH_C64(0xA19494FE40D4FE94), + SPH_C64(0xD74D4D641F52644D), SPH_C64(0x907070ADD7A7AD70), + SPH_C64(0xFBA2A210DB7910A2), SPH_C64(0xECAFAF29BE1129AF), + SPH_C64(0x4ACDCDDEEB26DECD), SPH_C64(0x67D6D6A928FEA9D6), + SPH_C64(0xB46C6CC12B47C16C), SPH_C64(0xC4B7B75166D151B7), + SPH_C64(0x15F8F83F6B933FF8), SPH_C64(0x1B09092D41482D09), + SPH_C64(0x08F3F31838CB18F3), SPH_C64(0xA96767E6781FE667), + SPH_C64(0xF1A4A40EED490EA4), SPH_C64(0x23EAEA65E90365EA), + SPH_C64(0x29ECEC7BDF337BEC), SPH_C64(0xC7B6B6546FD954B6), + SPH_C64(0x61D4D4A33AEEA3D4), SPH_C64(0x6BD2D2BD0CDEBDD2), + SPH_C64(0x3C141444B4A04414), SPH_C64(0x221E1E66EEF0661E), + SPH_C64(0x3EE1E142BA5B42E1), SPH_C64(0x6C2424B4193DB424), + SPH_C64(0x483838D8E5DDD838), SPH_C64(0x57C6C6F9B87EF9C6), + SPH_C64(0x70DBDB904D9690DB), SPH_C64(0xDD4B4B7A29627A4B), + SPH_C64(0x8E7A7A8F8DF78F7A), SPH_C64(0x4E3A3AD2F7CDD23A), + SPH_C64(0x7FDEDE8160BE81DE), SPH_C64(0xE25E5E3B94CA3B5E), + SPH_C64(0x7CDFDF8469B684DF), SPH_C64(0xA29595FB49DCFB95), + SPH_C64(0x19FCFC2B4FB32BFC), SPH_C64(0xE3AAAA38933938AA), + SPH_C64(0x64D7D7AC21F6ACD7), SPH_C64(0x4FCECED1F03ED1CE), + SPH_C64(0x0907071B3F381B07), SPH_C64(0x110F0F337778330F), + SPH_C64(0x473D3DC9C8F5C93D), SPH_C64(0xE8585825A2FA2558), + SPH_C64(0xB39A9AC83EA4C89A), SPH_C64(0xB59898C22CB4C298), + SPH_C64(0xB99C9CD60894D69C), SPH_C64(0x0BF2F21D31C31DF2), + SPH_C64(0xF4A7A701F65101A7), SPH_C64(0x3311115599885511), + SPH_C64(0x827E7E9BA9D79B7E), SPH_C64(0x808B8B9DA72C9D8B), + SPH_C64(0xC543435261225243), SPH_C64(0x0503030F1B180F03), + SPH_C64(0x3BE2E24DA1434DE2), SPH_C64(0x79DCDC8B72AE8BDC), + SPH_C64(0x32E5E5569E7B56E5), SPH_C64(0xCBB2B2404BF940B2), + SPH_C64(0xD24E4E6B044A6B4E), SPH_C64(0x54C7C7FCB176FCC7), + SPH_C64(0xB76D6DC4224FC46D), SPH_C64(0x26E9E96AF21B6AE9), + SPH_C64(0x692727BB0225BB27), SPH_C64(0xC040405D7A3A5D40), + SPH_C64(0x75D8D89F568E9FD8), SPH_C64(0x593737EB92A5EB37), + SPH_C64(0xAB9292E076E4E092), SPH_C64(0x8C8F8F89830C898F), + SPH_C64(0x0301010509080501), SPH_C64(0x271D1D69F5E8691D), + SPH_C64(0xF5535302F1A20253), SPH_C64(0x423E3EC6D3EDC63E), + SPH_C64(0xEB595920ABF22059), SPH_C64(0x5EC1C1E28746E2C1), + SPH_C64(0xD14F4F6E0D426E4F), SPH_C64(0x563232FABF8DFA32), + SPH_C64(0x3A16164EA6B04E16), SPH_C64(0x13FAFA35798335FA), + SPH_C64(0x9C7474B9F387B974), SPH_C64(0x10FBFB30708B30FB), + SPH_C64(0xA56363F25C3FF263), SPH_C64(0xBC9F9FD9138CD99F), + SPH_C64(0x5C3434E489BDE434), SPH_C64(0x2E1A1A72CAD0721A), + SPH_C64(0x7E2A2A82674D822A), SPH_C64(0xEE5A5A2FB0EA2F5A), + SPH_C64(0x8A8D8D83911C838D), SPH_C64(0x46C9C9CACF06CAC9), + SPH_C64(0x4CCFCFD4F936D4CF), SPH_C64(0x07F6F60915E309F6), + SPH_C64(0xAD9090EA64F4EA90), SPH_C64(0x78282888755D8828), + SPH_C64(0x85888892BC349288), SPH_C64(0xB09B9BCD37ACCD9B), + SPH_C64(0x533131F5A495F531), SPH_C64(0x120E0E367E70360E), + SPH_C64(0xDABDBD733C8173BD), SPH_C64(0xDE4A4A7F206A7F4A), + SPH_C64(0x25E8E86FFB136FE8), SPH_C64(0xA79696F452C4F496), + SPH_C64(0xF7A6A604FF5904A6), SPH_C64(0x140C0C3C6C603C0C), + SPH_C64(0x45C8C8CFC60ECFC8), SPH_C64(0x8B79798096EF8079), + SPH_C64(0xD9BCBC76358976BC), SPH_C64(0xDFBEBE7C27997CBE), + SPH_C64(0x2CEFEF74C42B74EF), SPH_C64(0xB26E6ECB3957CB6E), + SPH_C64(0xCA4646434C0A4346), SPH_C64(0xA49797F15BCCF197), + SPH_C64(0xED5B5B2AB9E22A5B), 
SPH_C64(0x2AEDED7ED63B7EED), + SPH_C64(0x2B19197DD1C87D19), SPH_C64(0x76D9D99A5F869AD9), + SPH_C64(0xE9ACAC26A50926AC), SPH_C64(0xB69999C725BCC799), + SPH_C64(0xE5A8A832812932A8), SPH_C64(0x7B29298D7C558D29), + SPH_C64(0xAC6464E96307E964), SPH_C64(0x211F1F63E7F8631F), + SPH_C64(0xEAADAD23AC0123AD), SPH_C64(0xFF55551CC7921C55), + SPH_C64(0x3513135F8B985F13), SPH_C64(0xD0BBBB6D0AB16DBB), + SPH_C64(0x04F7F70C1CEB0CF7), SPH_C64(0xB16F6FCE305FCE6F), + SPH_C64(0xD6B9B96718A167B9), SPH_C64(0xC947474645024647), + SPH_C64(0x712F2F934A65932F), SPH_C64(0x2FEEEE71CD2371EE), + SPH_C64(0xD5B8B86211A962B8), SPH_C64(0x8D7B7B8A84FF8A7B), + SPH_C64(0x86898997B53C9789), SPH_C64(0x503030F0AD9DF030), + SPH_C64(0x68D3D3B805D6B8D3), SPH_C64(0x817F7F9EA0DF9E7F), + SPH_C64(0x9A7676B3E197B376), SPH_C64(0x9B8282B0E664B082) +}; + +static const sph_u64 old0_T6[256] = { + SPH_C64(0x6868D50F67D568B8), SPH_C64(0xD0D0B71ECEB7D06D), + SPH_C64(0xEBEB60E00B60EB20), SPH_C64(0x2B2B876E45872B7D), + SPH_C64(0x484875327A7548D8), SPH_C64(0x9D9DD3019CD39DBA), + SPH_C64(0x6A6ADF1D77DF6ABE), SPH_C64(0xE4E453977353E431), + SPH_C64(0xE3E348A84B48E338), SPH_C64(0xA3A315D27115A3F8), + SPH_C64(0x565613DC8A1356FA), SPH_C64(0x8181BFFD7CBF819E), + SPH_C64(0x7D7D94B2CF947D87), SPH_C64(0xF1F1122ADB12F10E), + SPH_C64(0x8585ABD95CAB8592), SPH_C64(0x9E9EDC1A84DC9EBF), + SPH_C64(0x2C2C9C517D9C2C74), SPH_C64(0x8E8E8C8A048C8E8F), + SPH_C64(0x7878859FE7857888), SPH_C64(0xCACAC5D41EC5CA43), + SPH_C64(0x17174BAFB84B1739), SPH_C64(0xA9A937882137A9E6), + SPH_C64(0x6161F84E2FF861A3), SPH_C64(0xD5D5A633E6A6D562), + SPH_C64(0x5D5D348FD2345DE7), SPH_C64(0x0B0B275358270B1D), + SPH_C64(0x8C8C869814868C89), SPH_C64(0x3C3CCCC1FDCC3C44), + SPH_C64(0x7777B6E89FB67799), SPH_C64(0x515108E3B20851F3), + SPH_C64(0x2222AA2F0DAA2266), SPH_C64(0x424257682A5742C6), + SPH_C64(0x3F3FC3DAE5C33F41), SPH_C64(0x545419CE9A1954FC), + SPH_C64(0x41415873325841C3), SPH_C64(0x8080BAF474BA809D), + SPH_C64(0xCCCCDBE22EDBCC49), SPH_C64(0x8686A4C244A48697), + SPH_C64(0xB3B34542F145B3C8), SPH_C64(0x181878D8C0781828), + SPH_C64(0x2E2E96436D962E72), SPH_C64(0x575716D5821657F9), + SPH_C64(0x06061E36301E060A), SPH_C64(0x6262F75537F762A6), + SPH_C64(0xF4F40307F303F401), SPH_C64(0x3636EE9BADEE365A), + SPH_C64(0xD1D1B217C6B2D16E), SPH_C64(0x6B6BDA147FDA6BBD), + SPH_C64(0x1B1B77C3D8771B2D), SPH_C64(0x6565EC6A0FEC65AF), + SPH_C64(0x7575BCFA8FBC759F), SPH_C64(0x1010509080501030), + SPH_C64(0xDADA95449E95DA73), SPH_C64(0x4949703B727049DB), + SPH_C64(0x2626BE0B2DBE266A), SPH_C64(0xF9F93A629B3AF916), + SPH_C64(0xCBCBC0DD16C0CB40), SPH_C64(0x6666E37117E366AA), + SPH_C64(0xE7E75C8C6B5CE734), SPH_C64(0xBABA6803B968BAD3), + SPH_C64(0xAEAE2CB7192CAEEF), SPH_C64(0x50500DEABA0D50F0), + SPH_C64(0x525207F8AA0752F6), SPH_C64(0xABAB3D9A313DABE0), + SPH_C64(0x0505112D2811050F), SPH_C64(0xF0F01723D317F00D), + SPH_C64(0x0D0D396568390D17), SPH_C64(0x7373A2CCBFA27395), + SPH_C64(0x3B3BD7FEC5D73B4D), SPH_C64(0x040414242014040C), + SPH_C64(0x2020A03D1DA02060), SPH_C64(0xFEFE215DA321FE1F), + SPH_C64(0xDDDD8E7BA68EDD7A), SPH_C64(0xF5F5060EFB06F502), + SPH_C64(0xB4B45E7DC95EB4C1), SPH_C64(0x5F5F3E9DC23E5FE1), + SPH_C64(0x0A0A225A50220A1E), SPH_C64(0xB5B55B74C15BB5C2), + SPH_C64(0xC0C0E78E4EE7C05D), SPH_C64(0xA0A01AC9691AA0FD), + SPH_C64(0x7171A8DEAFA87193), SPH_C64(0xA5A50BE4410BA5F2), + SPH_C64(0x2D2D995875992D77), SPH_C64(0x6060FD4727FD60A0), + SPH_C64(0x7272A7C5B7A77296), SPH_C64(0x9393E57FECE593A8), + SPH_C64(0x3939DDECD5DD394B), SPH_C64(0x0808284840280818), + SPH_C64(0x8383B5EF6CB58398), SPH_C64(0x2121A53415A52163), + 
SPH_C64(0x5C5C3186DA315CE4), SPH_C64(0x8787A1CB4CA18794), + SPH_C64(0xB1B14F50E14FB1CE), SPH_C64(0xE0E047B35347E03D), + SPH_C64(0x0000000000000000), SPH_C64(0xC3C3E89556E8C358), + SPH_C64(0x12125A82905A1236), SPH_C64(0x9191EF6DFCEF91AE), + SPH_C64(0x8A8A98AE24988A83), SPH_C64(0x02020A12100A0206), + SPH_C64(0x1C1C6CFCE06C1C24), SPH_C64(0xE6E659856359E637), + SPH_C64(0x45454C57124C45CF), SPH_C64(0xC2C2ED9C5EEDC25B), + SPH_C64(0xC4C4F3AA6EF3C451), SPH_C64(0xFDFD2E46BB2EFD1A), + SPH_C64(0xBFBF792E9179BFDC), SPH_C64(0x4444495E1A4944CC), + SPH_C64(0xA1A11FC0611FA1FE), SPH_C64(0x4C4C61165A614CD4), + SPH_C64(0x3333FFB685FF3355), SPH_C64(0xC5C5F6A366F6C552), + SPH_C64(0x8484AED054AE8491), SPH_C64(0x2323AF2605AF2365), + SPH_C64(0x7C7C91BBC7917C84), SPH_C64(0xB0B04A59E94AB0CD), + SPH_C64(0x2525B11035B1256F), SPH_C64(0x151541BDA841153F), + SPH_C64(0x3535E180B5E1355F), SPH_C64(0x6969D0066FD069BB), + SPH_C64(0xFFFF2454AB24FF1C), SPH_C64(0x9494FE40D4FE94A1), + SPH_C64(0x4D4D641F52644DD7), SPH_C64(0x7070ADD7A7AD7090), + SPH_C64(0xA2A210DB7910A2FB), SPH_C64(0xAFAF29BE1129AFEC), + SPH_C64(0xCDCDDEEB26DECD4A), SPH_C64(0xD6D6A928FEA9D667), + SPH_C64(0x6C6CC12B47C16CB4), SPH_C64(0xB7B75166D151B7C4), + SPH_C64(0xF8F83F6B933FF815), SPH_C64(0x09092D41482D091B), + SPH_C64(0xF3F31838CB18F308), SPH_C64(0x6767E6781FE667A9), + SPH_C64(0xA4A40EED490EA4F1), SPH_C64(0xEAEA65E90365EA23), + SPH_C64(0xECEC7BDF337BEC29), SPH_C64(0xB6B6546FD954B6C7), + SPH_C64(0xD4D4A33AEEA3D461), SPH_C64(0xD2D2BD0CDEBDD26B), + SPH_C64(0x141444B4A044143C), SPH_C64(0x1E1E66EEF0661E22), + SPH_C64(0xE1E142BA5B42E13E), SPH_C64(0x2424B4193DB4246C), + SPH_C64(0x3838D8E5DDD83848), SPH_C64(0xC6C6F9B87EF9C657), + SPH_C64(0xDBDB904D9690DB70), SPH_C64(0x4B4B7A29627A4BDD), + SPH_C64(0x7A7A8F8DF78F7A8E), SPH_C64(0x3A3AD2F7CDD23A4E), + SPH_C64(0xDEDE8160BE81DE7F), SPH_C64(0x5E5E3B94CA3B5EE2), + SPH_C64(0xDFDF8469B684DF7C), SPH_C64(0x9595FB49DCFB95A2), + SPH_C64(0xFCFC2B4FB32BFC19), SPH_C64(0xAAAA38933938AAE3), + SPH_C64(0xD7D7AC21F6ACD764), SPH_C64(0xCECED1F03ED1CE4F), + SPH_C64(0x07071B3F381B0709), SPH_C64(0x0F0F337778330F11), + SPH_C64(0x3D3DC9C8F5C93D47), SPH_C64(0x585825A2FA2558E8), + SPH_C64(0x9A9AC83EA4C89AB3), SPH_C64(0x9898C22CB4C298B5), + SPH_C64(0x9C9CD60894D69CB9), SPH_C64(0xF2F21D31C31DF20B), + SPH_C64(0xA7A701F65101A7F4), SPH_C64(0x1111559988551133), + SPH_C64(0x7E7E9BA9D79B7E82), SPH_C64(0x8B8B9DA72C9D8B80), + SPH_C64(0x43435261225243C5), SPH_C64(0x03030F1B180F0305), + SPH_C64(0xE2E24DA1434DE23B), SPH_C64(0xDCDC8B72AE8BDC79), + SPH_C64(0xE5E5569E7B56E532), SPH_C64(0xB2B2404BF940B2CB), + SPH_C64(0x4E4E6B044A6B4ED2), SPH_C64(0xC7C7FCB176FCC754), + SPH_C64(0x6D6DC4224FC46DB7), SPH_C64(0xE9E96AF21B6AE926), + SPH_C64(0x2727BB0225BB2769), SPH_C64(0x40405D7A3A5D40C0), + SPH_C64(0xD8D89F568E9FD875), SPH_C64(0x3737EB92A5EB3759), + SPH_C64(0x9292E076E4E092AB), SPH_C64(0x8F8F89830C898F8C), + SPH_C64(0x0101050908050103), SPH_C64(0x1D1D69F5E8691D27), + SPH_C64(0x535302F1A20253F5), SPH_C64(0x3E3EC6D3EDC63E42), + SPH_C64(0x595920ABF22059EB), SPH_C64(0xC1C1E28746E2C15E), + SPH_C64(0x4F4F6E0D426E4FD1), SPH_C64(0x3232FABF8DFA3256), + SPH_C64(0x16164EA6B04E163A), SPH_C64(0xFAFA35798335FA13), + SPH_C64(0x7474B9F387B9749C), SPH_C64(0xFBFB30708B30FB10), + SPH_C64(0x6363F25C3FF263A5), SPH_C64(0x9F9FD9138CD99FBC), + SPH_C64(0x3434E489BDE4345C), SPH_C64(0x1A1A72CAD0721A2E), + SPH_C64(0x2A2A82674D822A7E), SPH_C64(0x5A5A2FB0EA2F5AEE), + SPH_C64(0x8D8D83911C838D8A), SPH_C64(0xC9C9CACF06CAC946), + SPH_C64(0xCFCFD4F936D4CF4C), SPH_C64(0xF6F60915E309F607), + 
SPH_C64(0x9090EA64F4EA90AD), SPH_C64(0x282888755D882878), + SPH_C64(0x888892BC34928885), SPH_C64(0x9B9BCD37ACCD9BB0), + SPH_C64(0x3131F5A495F53153), SPH_C64(0x0E0E367E70360E12), + SPH_C64(0xBDBD733C8173BDDA), SPH_C64(0x4A4A7F206A7F4ADE), + SPH_C64(0xE8E86FFB136FE825), SPH_C64(0x9696F452C4F496A7), + SPH_C64(0xA6A604FF5904A6F7), SPH_C64(0x0C0C3C6C603C0C14), + SPH_C64(0xC8C8CFC60ECFC845), SPH_C64(0x79798096EF80798B), + SPH_C64(0xBCBC76358976BCD9), SPH_C64(0xBEBE7C27997CBEDF), + SPH_C64(0xEFEF74C42B74EF2C), SPH_C64(0x6E6ECB3957CB6EB2), + SPH_C64(0x4646434C0A4346CA), SPH_C64(0x9797F15BCCF197A4), + SPH_C64(0x5B5B2AB9E22A5BED), SPH_C64(0xEDED7ED63B7EED2A), + SPH_C64(0x19197DD1C87D192B), SPH_C64(0xD9D99A5F869AD976), + SPH_C64(0xACAC26A50926ACE9), SPH_C64(0x9999C725BCC799B6), + SPH_C64(0xA8A832812932A8E5), SPH_C64(0x29298D7C558D297B), + SPH_C64(0x6464E96307E964AC), SPH_C64(0x1F1F63E7F8631F21), + SPH_C64(0xADAD23AC0123ADEA), SPH_C64(0x55551CC7921C55FF), + SPH_C64(0x13135F8B985F1335), SPH_C64(0xBBBB6D0AB16DBBD0), + SPH_C64(0xF7F70C1CEB0CF704), SPH_C64(0x6F6FCE305FCE6FB1), + SPH_C64(0xB9B96718A167B9D6), SPH_C64(0x47474645024647C9), + SPH_C64(0x2F2F934A65932F71), SPH_C64(0xEEEE71CD2371EE2F), + SPH_C64(0xB8B86211A962B8D5), SPH_C64(0x7B7B8A84FF8A7B8D), + SPH_C64(0x898997B53C978986), SPH_C64(0x3030F0AD9DF03050), + SPH_C64(0xD3D3B805D6B8D368), SPH_C64(0x7F7F9EA0DF9E7F81), + SPH_C64(0x7676B3E197B3769A), SPH_C64(0x8282B0E664B0829B) +}; + +static const sph_u64 old0_T7[256] = { + SPH_C64(0x68D50F67D568B868), SPH_C64(0xD0B71ECEB7D06DD0), + SPH_C64(0xEB60E00B60EB20EB), SPH_C64(0x2B876E45872B7D2B), + SPH_C64(0x4875327A7548D848), SPH_C64(0x9DD3019CD39DBA9D), + SPH_C64(0x6ADF1D77DF6ABE6A), SPH_C64(0xE453977353E431E4), + SPH_C64(0xE348A84B48E338E3), SPH_C64(0xA315D27115A3F8A3), + SPH_C64(0x5613DC8A1356FA56), SPH_C64(0x81BFFD7CBF819E81), + SPH_C64(0x7D94B2CF947D877D), SPH_C64(0xF1122ADB12F10EF1), + SPH_C64(0x85ABD95CAB859285), SPH_C64(0x9EDC1A84DC9EBF9E), + SPH_C64(0x2C9C517D9C2C742C), SPH_C64(0x8E8C8A048C8E8F8E), + SPH_C64(0x78859FE785788878), SPH_C64(0xCAC5D41EC5CA43CA), + SPH_C64(0x174BAFB84B173917), SPH_C64(0xA937882137A9E6A9), + SPH_C64(0x61F84E2FF861A361), SPH_C64(0xD5A633E6A6D562D5), + SPH_C64(0x5D348FD2345DE75D), SPH_C64(0x0B275358270B1D0B), + SPH_C64(0x8C869814868C898C), SPH_C64(0x3CCCC1FDCC3C443C), + SPH_C64(0x77B6E89FB6779977), SPH_C64(0x5108E3B20851F351), + SPH_C64(0x22AA2F0DAA226622), SPH_C64(0x4257682A5742C642), + SPH_C64(0x3FC3DAE5C33F413F), SPH_C64(0x5419CE9A1954FC54), + SPH_C64(0x415873325841C341), SPH_C64(0x80BAF474BA809D80), + SPH_C64(0xCCDBE22EDBCC49CC), SPH_C64(0x86A4C244A4869786), + SPH_C64(0xB34542F145B3C8B3), SPH_C64(0x1878D8C078182818), + SPH_C64(0x2E96436D962E722E), SPH_C64(0x5716D5821657F957), + SPH_C64(0x061E36301E060A06), SPH_C64(0x62F75537F762A662), + SPH_C64(0xF40307F303F401F4), SPH_C64(0x36EE9BADEE365A36), + SPH_C64(0xD1B217C6B2D16ED1), SPH_C64(0x6BDA147FDA6BBD6B), + SPH_C64(0x1B77C3D8771B2D1B), SPH_C64(0x65EC6A0FEC65AF65), + SPH_C64(0x75BCFA8FBC759F75), SPH_C64(0x1050908050103010), + SPH_C64(0xDA95449E95DA73DA), SPH_C64(0x49703B727049DB49), + SPH_C64(0x26BE0B2DBE266A26), SPH_C64(0xF93A629B3AF916F9), + SPH_C64(0xCBC0DD16C0CB40CB), SPH_C64(0x66E37117E366AA66), + SPH_C64(0xE75C8C6B5CE734E7), SPH_C64(0xBA6803B968BAD3BA), + SPH_C64(0xAE2CB7192CAEEFAE), SPH_C64(0x500DEABA0D50F050), + SPH_C64(0x5207F8AA0752F652), SPH_C64(0xAB3D9A313DABE0AB), + SPH_C64(0x05112D2811050F05), SPH_C64(0xF01723D317F00DF0), + SPH_C64(0x0D396568390D170D), SPH_C64(0x73A2CCBFA2739573), + SPH_C64(0x3BD7FEC5D73B4D3B), 
SPH_C64(0x0414242014040C04), + SPH_C64(0x20A03D1DA0206020), SPH_C64(0xFE215DA321FE1FFE), + SPH_C64(0xDD8E7BA68EDD7ADD), SPH_C64(0xF5060EFB06F502F5), + SPH_C64(0xB45E7DC95EB4C1B4), SPH_C64(0x5F3E9DC23E5FE15F), + SPH_C64(0x0A225A50220A1E0A), SPH_C64(0xB55B74C15BB5C2B5), + SPH_C64(0xC0E78E4EE7C05DC0), SPH_C64(0xA01AC9691AA0FDA0), + SPH_C64(0x71A8DEAFA8719371), SPH_C64(0xA50BE4410BA5F2A5), + SPH_C64(0x2D995875992D772D), SPH_C64(0x60FD4727FD60A060), + SPH_C64(0x72A7C5B7A7729672), SPH_C64(0x93E57FECE593A893), + SPH_C64(0x39DDECD5DD394B39), SPH_C64(0x0828484028081808), + SPH_C64(0x83B5EF6CB5839883), SPH_C64(0x21A53415A5216321), + SPH_C64(0x5C3186DA315CE45C), SPH_C64(0x87A1CB4CA1879487), + SPH_C64(0xB14F50E14FB1CEB1), SPH_C64(0xE047B35347E03DE0), + SPH_C64(0x0000000000000000), SPH_C64(0xC3E89556E8C358C3), + SPH_C64(0x125A82905A123612), SPH_C64(0x91EF6DFCEF91AE91), + SPH_C64(0x8A98AE24988A838A), SPH_C64(0x020A12100A020602), + SPH_C64(0x1C6CFCE06C1C241C), SPH_C64(0xE659856359E637E6), + SPH_C64(0x454C57124C45CF45), SPH_C64(0xC2ED9C5EEDC25BC2), + SPH_C64(0xC4F3AA6EF3C451C4), SPH_C64(0xFD2E46BB2EFD1AFD), + SPH_C64(0xBF792E9179BFDCBF), SPH_C64(0x44495E1A4944CC44), + SPH_C64(0xA11FC0611FA1FEA1), SPH_C64(0x4C61165A614CD44C), + SPH_C64(0x33FFB685FF335533), SPH_C64(0xC5F6A366F6C552C5), + SPH_C64(0x84AED054AE849184), SPH_C64(0x23AF2605AF236523), + SPH_C64(0x7C91BBC7917C847C), SPH_C64(0xB04A59E94AB0CDB0), + SPH_C64(0x25B11035B1256F25), SPH_C64(0x1541BDA841153F15), + SPH_C64(0x35E180B5E1355F35), SPH_C64(0x69D0066FD069BB69), + SPH_C64(0xFF2454AB24FF1CFF), SPH_C64(0x94FE40D4FE94A194), + SPH_C64(0x4D641F52644DD74D), SPH_C64(0x70ADD7A7AD709070), + SPH_C64(0xA210DB7910A2FBA2), SPH_C64(0xAF29BE1129AFECAF), + SPH_C64(0xCDDEEB26DECD4ACD), SPH_C64(0xD6A928FEA9D667D6), + SPH_C64(0x6CC12B47C16CB46C), SPH_C64(0xB75166D151B7C4B7), + SPH_C64(0xF83F6B933FF815F8), SPH_C64(0x092D41482D091B09), + SPH_C64(0xF31838CB18F308F3), SPH_C64(0x67E6781FE667A967), + SPH_C64(0xA40EED490EA4F1A4), SPH_C64(0xEA65E90365EA23EA), + SPH_C64(0xEC7BDF337BEC29EC), SPH_C64(0xB6546FD954B6C7B6), + SPH_C64(0xD4A33AEEA3D461D4), SPH_C64(0xD2BD0CDEBDD26BD2), + SPH_C64(0x1444B4A044143C14), SPH_C64(0x1E66EEF0661E221E), + SPH_C64(0xE142BA5B42E13EE1), SPH_C64(0x24B4193DB4246C24), + SPH_C64(0x38D8E5DDD8384838), SPH_C64(0xC6F9B87EF9C657C6), + SPH_C64(0xDB904D9690DB70DB), SPH_C64(0x4B7A29627A4BDD4B), + SPH_C64(0x7A8F8DF78F7A8E7A), SPH_C64(0x3AD2F7CDD23A4E3A), + SPH_C64(0xDE8160BE81DE7FDE), SPH_C64(0x5E3B94CA3B5EE25E), + SPH_C64(0xDF8469B684DF7CDF), SPH_C64(0x95FB49DCFB95A295), + SPH_C64(0xFC2B4FB32BFC19FC), SPH_C64(0xAA38933938AAE3AA), + SPH_C64(0xD7AC21F6ACD764D7), SPH_C64(0xCED1F03ED1CE4FCE), + SPH_C64(0x071B3F381B070907), SPH_C64(0x0F337778330F110F), + SPH_C64(0x3DC9C8F5C93D473D), SPH_C64(0x5825A2FA2558E858), + SPH_C64(0x9AC83EA4C89AB39A), SPH_C64(0x98C22CB4C298B598), + SPH_C64(0x9CD60894D69CB99C), SPH_C64(0xF21D31C31DF20BF2), + SPH_C64(0xA701F65101A7F4A7), SPH_C64(0x1155998855113311), + SPH_C64(0x7E9BA9D79B7E827E), SPH_C64(0x8B9DA72C9D8B808B), + SPH_C64(0x435261225243C543), SPH_C64(0x030F1B180F030503), + SPH_C64(0xE24DA1434DE23BE2), SPH_C64(0xDC8B72AE8BDC79DC), + SPH_C64(0xE5569E7B56E532E5), SPH_C64(0xB2404BF940B2CBB2), + SPH_C64(0x4E6B044A6B4ED24E), SPH_C64(0xC7FCB176FCC754C7), + SPH_C64(0x6DC4224FC46DB76D), SPH_C64(0xE96AF21B6AE926E9), + SPH_C64(0x27BB0225BB276927), SPH_C64(0x405D7A3A5D40C040), + SPH_C64(0xD89F568E9FD875D8), SPH_C64(0x37EB92A5EB375937), + SPH_C64(0x92E076E4E092AB92), SPH_C64(0x8F89830C898F8C8F), + SPH_C64(0x0105090805010301), 
SPH_C64(0x1D69F5E8691D271D), + SPH_C64(0x5302F1A20253F553), SPH_C64(0x3EC6D3EDC63E423E), + SPH_C64(0x5920ABF22059EB59), SPH_C64(0xC1E28746E2C15EC1), + SPH_C64(0x4F6E0D426E4FD14F), SPH_C64(0x32FABF8DFA325632), + SPH_C64(0x164EA6B04E163A16), SPH_C64(0xFA35798335FA13FA), + SPH_C64(0x74B9F387B9749C74), SPH_C64(0xFB30708B30FB10FB), + SPH_C64(0x63F25C3FF263A563), SPH_C64(0x9FD9138CD99FBC9F), + SPH_C64(0x34E489BDE4345C34), SPH_C64(0x1A72CAD0721A2E1A), + SPH_C64(0x2A82674D822A7E2A), SPH_C64(0x5A2FB0EA2F5AEE5A), + SPH_C64(0x8D83911C838D8A8D), SPH_C64(0xC9CACF06CAC946C9), + SPH_C64(0xCFD4F936D4CF4CCF), SPH_C64(0xF60915E309F607F6), + SPH_C64(0x90EA64F4EA90AD90), SPH_C64(0x2888755D88287828), + SPH_C64(0x8892BC3492888588), SPH_C64(0x9BCD37ACCD9BB09B), + SPH_C64(0x31F5A495F5315331), SPH_C64(0x0E367E70360E120E), + SPH_C64(0xBD733C8173BDDABD), SPH_C64(0x4A7F206A7F4ADE4A), + SPH_C64(0xE86FFB136FE825E8), SPH_C64(0x96F452C4F496A796), + SPH_C64(0xA604FF5904A6F7A6), SPH_C64(0x0C3C6C603C0C140C), + SPH_C64(0xC8CFC60ECFC845C8), SPH_C64(0x798096EF80798B79), + SPH_C64(0xBC76358976BCD9BC), SPH_C64(0xBE7C27997CBEDFBE), + SPH_C64(0xEF74C42B74EF2CEF), SPH_C64(0x6ECB3957CB6EB26E), + SPH_C64(0x46434C0A4346CA46), SPH_C64(0x97F15BCCF197A497), + SPH_C64(0x5B2AB9E22A5BED5B), SPH_C64(0xED7ED63B7EED2AED), + SPH_C64(0x197DD1C87D192B19), SPH_C64(0xD99A5F869AD976D9), + SPH_C64(0xAC26A50926ACE9AC), SPH_C64(0x99C725BCC799B699), + SPH_C64(0xA832812932A8E5A8), SPH_C64(0x298D7C558D297B29), + SPH_C64(0x64E96307E964AC64), SPH_C64(0x1F63E7F8631F211F), + SPH_C64(0xAD23AC0123ADEAAD), SPH_C64(0x551CC7921C55FF55), + SPH_C64(0x135F8B985F133513), SPH_C64(0xBB6D0AB16DBBD0BB), + SPH_C64(0xF70C1CEB0CF704F7), SPH_C64(0x6FCE305FCE6FB16F), + SPH_C64(0xB96718A167B9D6B9), SPH_C64(0x474645024647C947), + SPH_C64(0x2F934A65932F712F), SPH_C64(0xEE71CD2371EE2FEE), + SPH_C64(0xB86211A962B8D5B8), SPH_C64(0x7B8A84FF8A7B8D7B), + SPH_C64(0x8997B53C97898689), SPH_C64(0x30F0AD9DF0305030), + SPH_C64(0xD3B805D6B8D368D3), SPH_C64(0x7F9EA0DF9E7F817F), + SPH_C64(0x76B3E197B3769A76), SPH_C64(0x82B0E664B0829B82) +}; + +#endif + +static const sph_u64 old0_RC[10] = { + SPH_C64(0xE46A9D482BEBD068), + SPH_C64(0x9E85F17D8156A3E3), + SPH_C64(0xD561A917CA788E2C), + SPH_C64(0x422251773C8C0B5D), + SPH_C64(0x18B386CC8041543F), + SPH_C64(0x6BD136F46206572E), + SPH_C64(0xF92649DA1075651B), + SPH_C64(0xAB5250AEBAE766CB), + SPH_C64(0xFE20043B730DF005), + SPH_C64(0xA0C0B50A5FB4F5DD) +}; + +/* ====================================================================== */ +/* + * Constants for plain WHIRLPOOL-1 (second version). 
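+ * (Observation, not from the original sphlib comments: each rotated table in + * this file appears to satisfy Tn[i] == T0[i] rotated left by 8*n bits, e.g. + * SPH_ROTL64(T0[i], 8 * n), for both the old0_* and old1_* sets; this is + * presumably why the SPH_SMALL_FOOTPRINT_WHIRLPOOL build keeps only the T0 + * tables and derives the rotations at run time.)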
+ */ + +static const sph_u64 old1_T0[256] = { + SPH_C64(0x78D8C07818281818), SPH_C64(0xAF2605AF23652323), + SPH_C64(0xF9B87EF9C657C6C6), SPH_C64(0x6FFB136FE825E8E8), + SPH_C64(0xA1CB4CA187948787), SPH_C64(0x6211A962B8D5B8B8), + SPH_C64(0x0509080501030101), SPH_C64(0x6E0D426E4FD14F4F), + SPH_C64(0xEE9BADEE365A3636), SPH_C64(0x04FF5904A6F7A6A6), + SPH_C64(0xBD0CDEBDD26BD2D2), SPH_C64(0x060EFB06F502F5F5), + SPH_C64(0x8096EF80798B7979), SPH_C64(0xCE305FCE6FB16F6F), + SPH_C64(0xEF6DFCEF91AE9191), SPH_C64(0x07F8AA0752F65252), + SPH_C64(0xFD4727FD60A06060), SPH_C64(0x76358976BCD9BCBC), + SPH_C64(0xCD37ACCD9BB09B9B), SPH_C64(0x8C8A048C8E8F8E8E), + SPH_C64(0x15D27115A3F8A3A3), SPH_C64(0x3C6C603C0C140C0C), + SPH_C64(0x8A84FF8A7B8D7B7B), SPH_C64(0xE180B5E1355F3535), + SPH_C64(0x69F5E8691D271D1D), SPH_C64(0x47B35347E03DE0E0), + SPH_C64(0xAC21F6ACD764D7D7), SPH_C64(0xED9C5EEDC25BC2C2), + SPH_C64(0x96436D962E722E2E), SPH_C64(0x7A29627A4BDD4B4B), + SPH_C64(0x215DA321FE1FFEFE), SPH_C64(0x16D5821657F95757), + SPH_C64(0x41BDA841153F1515), SPH_C64(0xB6E89FB677997777), + SPH_C64(0xEB92A5EB37593737), SPH_C64(0x569E7B56E532E5E5), + SPH_C64(0xD9138CD99FBC9F9F), SPH_C64(0x1723D317F00DF0F0), + SPH_C64(0x7F206A7F4ADE4A4A), SPH_C64(0x95449E95DA73DADA), + SPH_C64(0x25A2FA2558E85858), SPH_C64(0xCACF06CAC946C9C9), + SPH_C64(0x8D7C558D297B2929), SPH_C64(0x225A50220A1E0A0A), + SPH_C64(0x4F50E14FB1CEB1B1), SPH_C64(0x1AC9691AA0FDA0A0), + SPH_C64(0xDA147FDA6BBD6B6B), SPH_C64(0xABD95CAB85928585), + SPH_C64(0x733C8173BDDABDBD), SPH_C64(0x348FD2345DE75D5D), + SPH_C64(0x5090805010301010), SPH_C64(0x0307F303F401F4F4), + SPH_C64(0xC0DD16C0CB40CBCB), SPH_C64(0xC6D3EDC63E423E3E), + SPH_C64(0x112D2811050F0505), SPH_C64(0xE6781FE667A96767), + SPH_C64(0x53977353E431E4E4), SPH_C64(0xBB0225BB27692727), + SPH_C64(0x5873325841C34141), SPH_C64(0x9DA72C9D8B808B8B), + SPH_C64(0x01F65101A7F4A7A7), SPH_C64(0x94B2CF947D877D7D), + SPH_C64(0xFB49DCFB95A29595), SPH_C64(0x9F568E9FD875D8D8), + SPH_C64(0x30708B30FB10FBFB), SPH_C64(0x71CD2371EE2FEEEE), + SPH_C64(0x91BBC7917C847C7C), SPH_C64(0xE37117E366AA6666), + SPH_C64(0x8E7BA68EDD7ADDDD), SPH_C64(0x4BAFB84B17391717), + SPH_C64(0x4645024647C94747), SPH_C64(0xDC1A84DC9EBF9E9E), + SPH_C64(0xC5D41EC5CA43CACA), SPH_C64(0x995875992D772D2D), + SPH_C64(0x792E9179BFDCBFBF), SPH_C64(0x1B3F381B07090707), + SPH_C64(0x23AC0123ADEAADAD), SPH_C64(0x2FB0EA2F5AEE5A5A), + SPH_C64(0xB5EF6CB583988383), SPH_C64(0xFFB685FF33553333), + SPH_C64(0xF25C3FF263A56363), SPH_C64(0x0A12100A02060202), + SPH_C64(0x38933938AAE3AAAA), SPH_C64(0xA8DEAFA871937171), + SPH_C64(0xCFC60ECFC845C8C8), SPH_C64(0x7DD1C87D192B1919), + SPH_C64(0x703B727049DB4949), SPH_C64(0x9A5F869AD976D9D9), + SPH_C64(0x1D31C31DF20BF2F2), SPH_C64(0x48A84B48E338E3E3), + SPH_C64(0x2AB9E22A5BED5B5B), SPH_C64(0x92BC349288858888), + SPH_C64(0xC83EA4C89AB39A9A), SPH_C64(0xBE0B2DBE266A2626), + SPH_C64(0xFABF8DFA32563232), SPH_C64(0x4A59E94AB0CDB0B0), + SPH_C64(0x6AF21B6AE926E9E9), SPH_C64(0x337778330F110F0F), + SPH_C64(0xA633E6A6D562D5D5), SPH_C64(0xBAF474BA809D8080), + SPH_C64(0x7C27997CBEDFBEBE), SPH_C64(0xDEEB26DECD4ACDCD), + SPH_C64(0xE489BDE4345C3434), SPH_C64(0x75327A7548D84848), + SPH_C64(0x2454AB24FF1CFFFF), SPH_C64(0x8F8DF78F7A8E7A7A), + SPH_C64(0xEA64F4EA90AD9090), SPH_C64(0x3E9DC23E5FE15F5F), + SPH_C64(0xA03D1DA020602020), SPH_C64(0xD50F67D568B86868), + SPH_C64(0x72CAD0721A2E1A1A), SPH_C64(0x2CB7192CAEEFAEAE), + SPH_C64(0x5E7DC95EB4C1B4B4), SPH_C64(0x19CE9A1954FC5454), + SPH_C64(0xE57FECE593A89393), SPH_C64(0xAA2F0DAA22662222), + 
SPH_C64(0xE96307E964AC6464), SPH_C64(0x122ADB12F10EF1F1), + SPH_C64(0xA2CCBFA273957373), SPH_C64(0x5A82905A12361212), + SPH_C64(0x5D7A3A5D40C04040), SPH_C64(0x2848402808180808), + SPH_C64(0xE89556E8C358C3C3), SPH_C64(0x7BDF337BEC29ECEC), + SPH_C64(0x904D9690DB70DBDB), SPH_C64(0x1FC0611FA1FEA1A1), + SPH_C64(0x83911C838D8A8D8D), SPH_C64(0xC9C8F5C93D473D3D), + SPH_C64(0xF15BCCF197A49797), SPH_C64(0x0000000000000000), + SPH_C64(0xD4F936D4CF4CCFCF), SPH_C64(0x876E45872B7D2B2B), + SPH_C64(0xB3E197B3769A7676), SPH_C64(0xB0E664B0829B8282), + SPH_C64(0xA928FEA9D667D6D6), SPH_C64(0x77C3D8771B2D1B1B), + SPH_C64(0x5B74C15BB5C2B5B5), SPH_C64(0x29BE1129AFECAFAF), + SPH_C64(0xDF1D77DF6ABE6A6A), SPH_C64(0x0DEABA0D50F05050), + SPH_C64(0x4C57124C45CF4545), SPH_C64(0x1838CB18F308F3F3), + SPH_C64(0xF0AD9DF030503030), SPH_C64(0x74C42B74EF2CEFEF), + SPH_C64(0xC3DAE5C33F413F3F), SPH_C64(0x1CC7921C55FF5555), + SPH_C64(0x10DB7910A2FBA2A2), SPH_C64(0x65E90365EA23EAEA), + SPH_C64(0xEC6A0FEC65AF6565), SPH_C64(0x6803B968BAD3BABA), + SPH_C64(0x934A65932F712F2F), SPH_C64(0xE78E4EE7C05DC0C0), + SPH_C64(0x8160BE81DE7FDEDE), SPH_C64(0x6CFCE06C1C241C1C), + SPH_C64(0x2E46BB2EFD1AFDFD), SPH_C64(0x641F52644DD74D4D), + SPH_C64(0xE076E4E092AB9292), SPH_C64(0xBCFA8FBC759F7575), + SPH_C64(0x1E36301E060A0606), SPH_C64(0x98AE24988A838A8A), + SPH_C64(0x404BF940B2CBB2B2), SPH_C64(0x59856359E637E6E6), + SPH_C64(0x367E70360E120E0E), SPH_C64(0x63E7F8631F211F1F), + SPH_C64(0xF75537F762A66262), SPH_C64(0xA33AEEA3D461D4D4), + SPH_C64(0x32812932A8E5A8A8), SPH_C64(0xF452C4F496A79696), + SPH_C64(0x3A629B3AF916F9F9), SPH_C64(0xF6A366F6C552C5C5), + SPH_C64(0xB11035B1256F2525), SPH_C64(0x20ABF22059EB5959), + SPH_C64(0xAED054AE84918484), SPH_C64(0xA7C5B7A772967272), + SPH_C64(0xDDECD5DD394B3939), SPH_C64(0x61165A614CD44C4C), + SPH_C64(0x3B94CA3B5EE25E5E), SPH_C64(0x859FE78578887878), + SPH_C64(0xD8E5DDD838483838), SPH_C64(0x869814868C898C8C), + SPH_C64(0xB217C6B2D16ED1D1), SPH_C64(0x0BE4410BA5F2A5A5), + SPH_C64(0x4DA1434DE23BE2E2), SPH_C64(0xF84E2FF861A36161), + SPH_C64(0x4542F145B3C8B3B3), SPH_C64(0xA53415A521632121), + SPH_C64(0xD60894D69CB99C9C), SPH_C64(0x66EEF0661E221E1E), + SPH_C64(0x5261225243C54343), SPH_C64(0xFCB176FCC754C7C7), + SPH_C64(0x2B4FB32BFC19FCFC), SPH_C64(0x14242014040C0404), + SPH_C64(0x08E3B20851F35151), SPH_C64(0xC725BCC799B69999), + SPH_C64(0xC4224FC46DB76D6D), SPH_C64(0x396568390D170D0D), + SPH_C64(0x35798335FA13FAFA), SPH_C64(0x8469B684DF7CDFDF), + SPH_C64(0x9BA9D79B7E827E7E), SPH_C64(0xB4193DB4246C2424), + SPH_C64(0xD7FEC5D73B4D3B3B), SPH_C64(0x3D9A313DABE0ABAB), + SPH_C64(0xD1F03ED1CE4FCECE), SPH_C64(0x5599885511331111), + SPH_C64(0x89830C898F8C8F8F), SPH_C64(0x6B044A6B4ED24E4E), + SPH_C64(0x5166D151B7C4B7B7), SPH_C64(0x60E00B60EB20EBEB), + SPH_C64(0xCCC1FDCC3C443C3C), SPH_C64(0xBFFD7CBF819E8181), + SPH_C64(0xFE40D4FE94A19494), SPH_C64(0x0C1CEB0CF704F7F7), + SPH_C64(0x6718A167B9D6B9B9), SPH_C64(0x5F8B985F13351313), + SPH_C64(0x9C517D9C2C742C2C), SPH_C64(0xB805D6B8D368D3D3), + SPH_C64(0x5C8C6B5CE734E7E7), SPH_C64(0xCB3957CB6EB26E6E), + SPH_C64(0xF3AA6EF3C451C4C4), SPH_C64(0x0F1B180F03050303), + SPH_C64(0x13DC8A1356FA5656), SPH_C64(0x495E1A4944CC4444), + SPH_C64(0x9EA0DF9E7F817F7F), SPH_C64(0x37882137A9E6A9A9), + SPH_C64(0x82674D822A7E2A2A), SPH_C64(0x6D0AB16DBBD0BBBB), + SPH_C64(0xE28746E2C15EC1C1), SPH_C64(0x02F1A20253F55353), + SPH_C64(0x8B72AE8BDC79DCDC), SPH_C64(0x275358270B1D0B0B), + SPH_C64(0xD3019CD39DBA9D9D), SPH_C64(0xC12B47C16CB46C6C), + SPH_C64(0xF5A495F531533131), SPH_C64(0xB9F387B9749C7474), + 
SPH_C64(0x0915E309F607F6F6), SPH_C64(0x434C0A4346CA4646), + SPH_C64(0x26A50926ACE9ACAC), SPH_C64(0x97B53C9789868989), + SPH_C64(0x44B4A044143C1414), SPH_C64(0x42BA5B42E13EE1E1), + SPH_C64(0x4EA6B04E163A1616), SPH_C64(0xD2F7CDD23A4E3A3A), + SPH_C64(0xD0066FD069BB6969), SPH_C64(0x2D41482D091B0909), + SPH_C64(0xADD7A7AD70907070), SPH_C64(0x546FD954B6C7B6B6), + SPH_C64(0xB71ECEB7D06DD0D0), SPH_C64(0x7ED63B7EED2AEDED), + SPH_C64(0xDBE22EDBCC49CCCC), SPH_C64(0x57682A5742C64242), + SPH_C64(0xC22CB4C298B59898), SPH_C64(0x0EED490EA4F1A4A4), + SPH_C64(0x88755D8828782828), SPH_C64(0x3186DA315CE45C5C), + SPH_C64(0x3F6B933FF815F8F8), SPH_C64(0xA4C244A486978686) +}; + +#if !SPH_SMALL_FOOTPRINT_WHIRLPOOL + +static const sph_u64 old1_T1[256] = { + SPH_C64(0xD8C0781828181878), SPH_C64(0x2605AF23652323AF), + SPH_C64(0xB87EF9C657C6C6F9), SPH_C64(0xFB136FE825E8E86F), + SPH_C64(0xCB4CA187948787A1), SPH_C64(0x11A962B8D5B8B862), + SPH_C64(0x0908050103010105), SPH_C64(0x0D426E4FD14F4F6E), + SPH_C64(0x9BADEE365A3636EE), SPH_C64(0xFF5904A6F7A6A604), + SPH_C64(0x0CDEBDD26BD2D2BD), SPH_C64(0x0EFB06F502F5F506), + SPH_C64(0x96EF80798B797980), SPH_C64(0x305FCE6FB16F6FCE), + SPH_C64(0x6DFCEF91AE9191EF), SPH_C64(0xF8AA0752F6525207), + SPH_C64(0x4727FD60A06060FD), SPH_C64(0x358976BCD9BCBC76), + SPH_C64(0x37ACCD9BB09B9BCD), SPH_C64(0x8A048C8E8F8E8E8C), + SPH_C64(0xD27115A3F8A3A315), SPH_C64(0x6C603C0C140C0C3C), + SPH_C64(0x84FF8A7B8D7B7B8A), SPH_C64(0x80B5E1355F3535E1), + SPH_C64(0xF5E8691D271D1D69), SPH_C64(0xB35347E03DE0E047), + SPH_C64(0x21F6ACD764D7D7AC), SPH_C64(0x9C5EEDC25BC2C2ED), + SPH_C64(0x436D962E722E2E96), SPH_C64(0x29627A4BDD4B4B7A), + SPH_C64(0x5DA321FE1FFEFE21), SPH_C64(0xD5821657F9575716), + SPH_C64(0xBDA841153F151541), SPH_C64(0xE89FB677997777B6), + SPH_C64(0x92A5EB37593737EB), SPH_C64(0x9E7B56E532E5E556), + SPH_C64(0x138CD99FBC9F9FD9), SPH_C64(0x23D317F00DF0F017), + SPH_C64(0x206A7F4ADE4A4A7F), SPH_C64(0x449E95DA73DADA95), + SPH_C64(0xA2FA2558E8585825), SPH_C64(0xCF06CAC946C9C9CA), + SPH_C64(0x7C558D297B29298D), SPH_C64(0x5A50220A1E0A0A22), + SPH_C64(0x50E14FB1CEB1B14F), SPH_C64(0xC9691AA0FDA0A01A), + SPH_C64(0x147FDA6BBD6B6BDA), SPH_C64(0xD95CAB85928585AB), + SPH_C64(0x3C8173BDDABDBD73), SPH_C64(0x8FD2345DE75D5D34), + SPH_C64(0x9080501030101050), SPH_C64(0x07F303F401F4F403), + SPH_C64(0xDD16C0CB40CBCBC0), SPH_C64(0xD3EDC63E423E3EC6), + SPH_C64(0x2D2811050F050511), SPH_C64(0x781FE667A96767E6), + SPH_C64(0x977353E431E4E453), SPH_C64(0x0225BB27692727BB), + SPH_C64(0x73325841C3414158), SPH_C64(0xA72C9D8B808B8B9D), + SPH_C64(0xF65101A7F4A7A701), SPH_C64(0xB2CF947D877D7D94), + SPH_C64(0x49DCFB95A29595FB), SPH_C64(0x568E9FD875D8D89F), + SPH_C64(0x708B30FB10FBFB30), SPH_C64(0xCD2371EE2FEEEE71), + SPH_C64(0xBBC7917C847C7C91), SPH_C64(0x7117E366AA6666E3), + SPH_C64(0x7BA68EDD7ADDDD8E), SPH_C64(0xAFB84B173917174B), + SPH_C64(0x45024647C9474746), SPH_C64(0x1A84DC9EBF9E9EDC), + SPH_C64(0xD41EC5CA43CACAC5), SPH_C64(0x5875992D772D2D99), + SPH_C64(0x2E9179BFDCBFBF79), SPH_C64(0x3F381B070907071B), + SPH_C64(0xAC0123ADEAADAD23), SPH_C64(0xB0EA2F5AEE5A5A2F), + SPH_C64(0xEF6CB583988383B5), SPH_C64(0xB685FF33553333FF), + SPH_C64(0x5C3FF263A56363F2), SPH_C64(0x12100A020602020A), + SPH_C64(0x933938AAE3AAAA38), SPH_C64(0xDEAFA871937171A8), + SPH_C64(0xC60ECFC845C8C8CF), SPH_C64(0xD1C87D192B19197D), + SPH_C64(0x3B727049DB494970), SPH_C64(0x5F869AD976D9D99A), + SPH_C64(0x31C31DF20BF2F21D), SPH_C64(0xA84B48E338E3E348), + SPH_C64(0xB9E22A5BED5B5B2A), SPH_C64(0xBC34928885888892), + SPH_C64(0x3EA4C89AB39A9AC8), 
SPH_C64(0x0B2DBE266A2626BE), + SPH_C64(0xBF8DFA32563232FA), SPH_C64(0x59E94AB0CDB0B04A), + SPH_C64(0xF21B6AE926E9E96A), SPH_C64(0x7778330F110F0F33), + SPH_C64(0x33E6A6D562D5D5A6), SPH_C64(0xF474BA809D8080BA), + SPH_C64(0x27997CBEDFBEBE7C), SPH_C64(0xEB26DECD4ACDCDDE), + SPH_C64(0x89BDE4345C3434E4), SPH_C64(0x327A7548D8484875), + SPH_C64(0x54AB24FF1CFFFF24), SPH_C64(0x8DF78F7A8E7A7A8F), + SPH_C64(0x64F4EA90AD9090EA), SPH_C64(0x9DC23E5FE15F5F3E), + SPH_C64(0x3D1DA020602020A0), SPH_C64(0x0F67D568B86868D5), + SPH_C64(0xCAD0721A2E1A1A72), SPH_C64(0xB7192CAEEFAEAE2C), + SPH_C64(0x7DC95EB4C1B4B45E), SPH_C64(0xCE9A1954FC545419), + SPH_C64(0x7FECE593A89393E5), SPH_C64(0x2F0DAA22662222AA), + SPH_C64(0x6307E964AC6464E9), SPH_C64(0x2ADB12F10EF1F112), + SPH_C64(0xCCBFA273957373A2), SPH_C64(0x82905A123612125A), + SPH_C64(0x7A3A5D40C040405D), SPH_C64(0x4840280818080828), + SPH_C64(0x9556E8C358C3C3E8), SPH_C64(0xDF337BEC29ECEC7B), + SPH_C64(0x4D9690DB70DBDB90), SPH_C64(0xC0611FA1FEA1A11F), + SPH_C64(0x911C838D8A8D8D83), SPH_C64(0xC8F5C93D473D3DC9), + SPH_C64(0x5BCCF197A49797F1), SPH_C64(0x0000000000000000), + SPH_C64(0xF936D4CF4CCFCFD4), SPH_C64(0x6E45872B7D2B2B87), + SPH_C64(0xE197B3769A7676B3), SPH_C64(0xE664B0829B8282B0), + SPH_C64(0x28FEA9D667D6D6A9), SPH_C64(0xC3D8771B2D1B1B77), + SPH_C64(0x74C15BB5C2B5B55B), SPH_C64(0xBE1129AFECAFAF29), + SPH_C64(0x1D77DF6ABE6A6ADF), SPH_C64(0xEABA0D50F050500D), + SPH_C64(0x57124C45CF45454C), SPH_C64(0x38CB18F308F3F318), + SPH_C64(0xAD9DF030503030F0), SPH_C64(0xC42B74EF2CEFEF74), + SPH_C64(0xDAE5C33F413F3FC3), SPH_C64(0xC7921C55FF55551C), + SPH_C64(0xDB7910A2FBA2A210), SPH_C64(0xE90365EA23EAEA65), + SPH_C64(0x6A0FEC65AF6565EC), SPH_C64(0x03B968BAD3BABA68), + SPH_C64(0x4A65932F712F2F93), SPH_C64(0x8E4EE7C05DC0C0E7), + SPH_C64(0x60BE81DE7FDEDE81), SPH_C64(0xFCE06C1C241C1C6C), + SPH_C64(0x46BB2EFD1AFDFD2E), SPH_C64(0x1F52644DD74D4D64), + SPH_C64(0x76E4E092AB9292E0), SPH_C64(0xFA8FBC759F7575BC), + SPH_C64(0x36301E060A06061E), SPH_C64(0xAE24988A838A8A98), + SPH_C64(0x4BF940B2CBB2B240), SPH_C64(0x856359E637E6E659), + SPH_C64(0x7E70360E120E0E36), SPH_C64(0xE7F8631F211F1F63), + SPH_C64(0x5537F762A66262F7), SPH_C64(0x3AEEA3D461D4D4A3), + SPH_C64(0x812932A8E5A8A832), SPH_C64(0x52C4F496A79696F4), + SPH_C64(0x629B3AF916F9F93A), SPH_C64(0xA366F6C552C5C5F6), + SPH_C64(0x1035B1256F2525B1), SPH_C64(0xABF22059EB595920), + SPH_C64(0xD054AE84918484AE), SPH_C64(0xC5B7A772967272A7), + SPH_C64(0xECD5DD394B3939DD), SPH_C64(0x165A614CD44C4C61), + SPH_C64(0x94CA3B5EE25E5E3B), SPH_C64(0x9FE7857888787885), + SPH_C64(0xE5DDD838483838D8), SPH_C64(0x9814868C898C8C86), + SPH_C64(0x17C6B2D16ED1D1B2), SPH_C64(0xE4410BA5F2A5A50B), + SPH_C64(0xA1434DE23BE2E24D), SPH_C64(0x4E2FF861A36161F8), + SPH_C64(0x42F145B3C8B3B345), SPH_C64(0x3415A521632121A5), + SPH_C64(0x0894D69CB99C9CD6), SPH_C64(0xEEF0661E221E1E66), + SPH_C64(0x61225243C5434352), SPH_C64(0xB176FCC754C7C7FC), + SPH_C64(0x4FB32BFC19FCFC2B), SPH_C64(0x242014040C040414), + SPH_C64(0xE3B20851F3515108), SPH_C64(0x25BCC799B69999C7), + SPH_C64(0x224FC46DB76D6DC4), SPH_C64(0x6568390D170D0D39), + SPH_C64(0x798335FA13FAFA35), SPH_C64(0x69B684DF7CDFDF84), + SPH_C64(0xA9D79B7E827E7E9B), SPH_C64(0x193DB4246C2424B4), + SPH_C64(0xFEC5D73B4D3B3BD7), SPH_C64(0x9A313DABE0ABAB3D), + SPH_C64(0xF03ED1CE4FCECED1), SPH_C64(0x9988551133111155), + SPH_C64(0x830C898F8C8F8F89), SPH_C64(0x044A6B4ED24E4E6B), + SPH_C64(0x66D151B7C4B7B751), SPH_C64(0xE00B60EB20EBEB60), + SPH_C64(0xC1FDCC3C443C3CCC), SPH_C64(0xFD7CBF819E8181BF), + SPH_C64(0x40D4FE94A19494FE), 
SPH_C64(0x1CEB0CF704F7F70C), + SPH_C64(0x18A167B9D6B9B967), SPH_C64(0x8B985F133513135F), + SPH_C64(0x517D9C2C742C2C9C), SPH_C64(0x05D6B8D368D3D3B8), + SPH_C64(0x8C6B5CE734E7E75C), SPH_C64(0x3957CB6EB26E6ECB), + SPH_C64(0xAA6EF3C451C4C4F3), SPH_C64(0x1B180F030503030F), + SPH_C64(0xDC8A1356FA565613), SPH_C64(0x5E1A4944CC444449), + SPH_C64(0xA0DF9E7F817F7F9E), SPH_C64(0x882137A9E6A9A937), + SPH_C64(0x674D822A7E2A2A82), SPH_C64(0x0AB16DBBD0BBBB6D), + SPH_C64(0x8746E2C15EC1C1E2), SPH_C64(0xF1A20253F5535302), + SPH_C64(0x72AE8BDC79DCDC8B), SPH_C64(0x5358270B1D0B0B27), + SPH_C64(0x019CD39DBA9D9DD3), SPH_C64(0x2B47C16CB46C6CC1), + SPH_C64(0xA495F531533131F5), SPH_C64(0xF387B9749C7474B9), + SPH_C64(0x15E309F607F6F609), SPH_C64(0x4C0A4346CA464643), + SPH_C64(0xA50926ACE9ACAC26), SPH_C64(0xB53C978986898997), + SPH_C64(0xB4A044143C141444), SPH_C64(0xBA5B42E13EE1E142), + SPH_C64(0xA6B04E163A16164E), SPH_C64(0xF7CDD23A4E3A3AD2), + SPH_C64(0x066FD069BB6969D0), SPH_C64(0x41482D091B09092D), + SPH_C64(0xD7A7AD70907070AD), SPH_C64(0x6FD954B6C7B6B654), + SPH_C64(0x1ECEB7D06DD0D0B7), SPH_C64(0xD63B7EED2AEDED7E), + SPH_C64(0xE22EDBCC49CCCCDB), SPH_C64(0x682A5742C6424257), + SPH_C64(0x2CB4C298B59898C2), SPH_C64(0xED490EA4F1A4A40E), + SPH_C64(0x755D882878282888), SPH_C64(0x86DA315CE45C5C31), + SPH_C64(0x6B933FF815F8F83F), SPH_C64(0xC244A486978686A4) +}; + +static const sph_u64 old1_T2[256] = { + SPH_C64(0xC0781828181878D8), SPH_C64(0x05AF23652323AF26), + SPH_C64(0x7EF9C657C6C6F9B8), SPH_C64(0x136FE825E8E86FFB), + SPH_C64(0x4CA187948787A1CB), SPH_C64(0xA962B8D5B8B86211), + SPH_C64(0x0805010301010509), SPH_C64(0x426E4FD14F4F6E0D), + SPH_C64(0xADEE365A3636EE9B), SPH_C64(0x5904A6F7A6A604FF), + SPH_C64(0xDEBDD26BD2D2BD0C), SPH_C64(0xFB06F502F5F5060E), + SPH_C64(0xEF80798B79798096), SPH_C64(0x5FCE6FB16F6FCE30), + SPH_C64(0xFCEF91AE9191EF6D), SPH_C64(0xAA0752F6525207F8), + SPH_C64(0x27FD60A06060FD47), SPH_C64(0x8976BCD9BCBC7635), + SPH_C64(0xACCD9BB09B9BCD37), SPH_C64(0x048C8E8F8E8E8C8A), + SPH_C64(0x7115A3F8A3A315D2), SPH_C64(0x603C0C140C0C3C6C), + SPH_C64(0xFF8A7B8D7B7B8A84), SPH_C64(0xB5E1355F3535E180), + SPH_C64(0xE8691D271D1D69F5), SPH_C64(0x5347E03DE0E047B3), + SPH_C64(0xF6ACD764D7D7AC21), SPH_C64(0x5EEDC25BC2C2ED9C), + SPH_C64(0x6D962E722E2E9643), SPH_C64(0x627A4BDD4B4B7A29), + SPH_C64(0xA321FE1FFEFE215D), SPH_C64(0x821657F9575716D5), + SPH_C64(0xA841153F151541BD), SPH_C64(0x9FB677997777B6E8), + SPH_C64(0xA5EB37593737EB92), SPH_C64(0x7B56E532E5E5569E), + SPH_C64(0x8CD99FBC9F9FD913), SPH_C64(0xD317F00DF0F01723), + SPH_C64(0x6A7F4ADE4A4A7F20), SPH_C64(0x9E95DA73DADA9544), + SPH_C64(0xFA2558E8585825A2), SPH_C64(0x06CAC946C9C9CACF), + SPH_C64(0x558D297B29298D7C), SPH_C64(0x50220A1E0A0A225A), + SPH_C64(0xE14FB1CEB1B14F50), SPH_C64(0x691AA0FDA0A01AC9), + SPH_C64(0x7FDA6BBD6B6BDA14), SPH_C64(0x5CAB85928585ABD9), + SPH_C64(0x8173BDDABDBD733C), SPH_C64(0xD2345DE75D5D348F), + SPH_C64(0x8050103010105090), SPH_C64(0xF303F401F4F40307), + SPH_C64(0x16C0CB40CBCBC0DD), SPH_C64(0xEDC63E423E3EC6D3), + SPH_C64(0x2811050F0505112D), SPH_C64(0x1FE667A96767E678), + SPH_C64(0x7353E431E4E45397), SPH_C64(0x25BB27692727BB02), + SPH_C64(0x325841C341415873), SPH_C64(0x2C9D8B808B8B9DA7), + SPH_C64(0x5101A7F4A7A701F6), SPH_C64(0xCF947D877D7D94B2), + SPH_C64(0xDCFB95A29595FB49), SPH_C64(0x8E9FD875D8D89F56), + SPH_C64(0x8B30FB10FBFB3070), SPH_C64(0x2371EE2FEEEE71CD), + SPH_C64(0xC7917C847C7C91BB), SPH_C64(0x17E366AA6666E371), + SPH_C64(0xA68EDD7ADDDD8E7B), SPH_C64(0xB84B173917174BAF), + SPH_C64(0x024647C947474645), SPH_C64(0x84DC9EBF9E9EDC1A), + 
SPH_C64(0x1EC5CA43CACAC5D4), SPH_C64(0x75992D772D2D9958), + SPH_C64(0x9179BFDCBFBF792E), SPH_C64(0x381B070907071B3F), + SPH_C64(0x0123ADEAADAD23AC), SPH_C64(0xEA2F5AEE5A5A2FB0), + SPH_C64(0x6CB583988383B5EF), SPH_C64(0x85FF33553333FFB6), + SPH_C64(0x3FF263A56363F25C), SPH_C64(0x100A020602020A12), + SPH_C64(0x3938AAE3AAAA3893), SPH_C64(0xAFA871937171A8DE), + SPH_C64(0x0ECFC845C8C8CFC6), SPH_C64(0xC87D192B19197DD1), + SPH_C64(0x727049DB4949703B), SPH_C64(0x869AD976D9D99A5F), + SPH_C64(0xC31DF20BF2F21D31), SPH_C64(0x4B48E338E3E348A8), + SPH_C64(0xE22A5BED5B5B2AB9), SPH_C64(0x34928885888892BC), + SPH_C64(0xA4C89AB39A9AC83E), SPH_C64(0x2DBE266A2626BE0B), + SPH_C64(0x8DFA32563232FABF), SPH_C64(0xE94AB0CDB0B04A59), + SPH_C64(0x1B6AE926E9E96AF2), SPH_C64(0x78330F110F0F3377), + SPH_C64(0xE6A6D562D5D5A633), SPH_C64(0x74BA809D8080BAF4), + SPH_C64(0x997CBEDFBEBE7C27), SPH_C64(0x26DECD4ACDCDDEEB), + SPH_C64(0xBDE4345C3434E489), SPH_C64(0x7A7548D848487532), + SPH_C64(0xAB24FF1CFFFF2454), SPH_C64(0xF78F7A8E7A7A8F8D), + SPH_C64(0xF4EA90AD9090EA64), SPH_C64(0xC23E5FE15F5F3E9D), + SPH_C64(0x1DA020602020A03D), SPH_C64(0x67D568B86868D50F), + SPH_C64(0xD0721A2E1A1A72CA), SPH_C64(0x192CAEEFAEAE2CB7), + SPH_C64(0xC95EB4C1B4B45E7D), SPH_C64(0x9A1954FC545419CE), + SPH_C64(0xECE593A89393E57F), SPH_C64(0x0DAA22662222AA2F), + SPH_C64(0x07E964AC6464E963), SPH_C64(0xDB12F10EF1F1122A), + SPH_C64(0xBFA273957373A2CC), SPH_C64(0x905A123612125A82), + SPH_C64(0x3A5D40C040405D7A), SPH_C64(0x4028081808082848), + SPH_C64(0x56E8C358C3C3E895), SPH_C64(0x337BEC29ECEC7BDF), + SPH_C64(0x9690DB70DBDB904D), SPH_C64(0x611FA1FEA1A11FC0), + SPH_C64(0x1C838D8A8D8D8391), SPH_C64(0xF5C93D473D3DC9C8), + SPH_C64(0xCCF197A49797F15B), SPH_C64(0x0000000000000000), + SPH_C64(0x36D4CF4CCFCFD4F9), SPH_C64(0x45872B7D2B2B876E), + SPH_C64(0x97B3769A7676B3E1), SPH_C64(0x64B0829B8282B0E6), + SPH_C64(0xFEA9D667D6D6A928), SPH_C64(0xD8771B2D1B1B77C3), + SPH_C64(0xC15BB5C2B5B55B74), SPH_C64(0x1129AFECAFAF29BE), + SPH_C64(0x77DF6ABE6A6ADF1D), SPH_C64(0xBA0D50F050500DEA), + SPH_C64(0x124C45CF45454C57), SPH_C64(0xCB18F308F3F31838), + SPH_C64(0x9DF030503030F0AD), SPH_C64(0x2B74EF2CEFEF74C4), + SPH_C64(0xE5C33F413F3FC3DA), SPH_C64(0x921C55FF55551CC7), + SPH_C64(0x7910A2FBA2A210DB), SPH_C64(0x0365EA23EAEA65E9), + SPH_C64(0x0FEC65AF6565EC6A), SPH_C64(0xB968BAD3BABA6803), + SPH_C64(0x65932F712F2F934A), SPH_C64(0x4EE7C05DC0C0E78E), + SPH_C64(0xBE81DE7FDEDE8160), SPH_C64(0xE06C1C241C1C6CFC), + SPH_C64(0xBB2EFD1AFDFD2E46), SPH_C64(0x52644DD74D4D641F), + SPH_C64(0xE4E092AB9292E076), SPH_C64(0x8FBC759F7575BCFA), + SPH_C64(0x301E060A06061E36), SPH_C64(0x24988A838A8A98AE), + SPH_C64(0xF940B2CBB2B2404B), SPH_C64(0x6359E637E6E65985), + SPH_C64(0x70360E120E0E367E), SPH_C64(0xF8631F211F1F63E7), + SPH_C64(0x37F762A66262F755), SPH_C64(0xEEA3D461D4D4A33A), + SPH_C64(0x2932A8E5A8A83281), SPH_C64(0xC4F496A79696F452), + SPH_C64(0x9B3AF916F9F93A62), SPH_C64(0x66F6C552C5C5F6A3), + SPH_C64(0x35B1256F2525B110), SPH_C64(0xF22059EB595920AB), + SPH_C64(0x54AE84918484AED0), SPH_C64(0xB7A772967272A7C5), + SPH_C64(0xD5DD394B3939DDEC), SPH_C64(0x5A614CD44C4C6116), + SPH_C64(0xCA3B5EE25E5E3B94), SPH_C64(0xE78578887878859F), + SPH_C64(0xDDD838483838D8E5), SPH_C64(0x14868C898C8C8698), + SPH_C64(0xC6B2D16ED1D1B217), SPH_C64(0x410BA5F2A5A50BE4), + SPH_C64(0x434DE23BE2E24DA1), SPH_C64(0x2FF861A36161F84E), + SPH_C64(0xF145B3C8B3B34542), SPH_C64(0x15A521632121A534), + SPH_C64(0x94D69CB99C9CD608), SPH_C64(0xF0661E221E1E66EE), + SPH_C64(0x225243C543435261), SPH_C64(0x76FCC754C7C7FCB1), + 
SPH_C64(0xB32BFC19FCFC2B4F), SPH_C64(0x2014040C04041424), + SPH_C64(0xB20851F3515108E3), SPH_C64(0xBCC799B69999C725), + SPH_C64(0x4FC46DB76D6DC422), SPH_C64(0x68390D170D0D3965), + SPH_C64(0x8335FA13FAFA3579), SPH_C64(0xB684DF7CDFDF8469), + SPH_C64(0xD79B7E827E7E9BA9), SPH_C64(0x3DB4246C2424B419), + SPH_C64(0xC5D73B4D3B3BD7FE), SPH_C64(0x313DABE0ABAB3D9A), + SPH_C64(0x3ED1CE4FCECED1F0), SPH_C64(0x8855113311115599), + SPH_C64(0x0C898F8C8F8F8983), SPH_C64(0x4A6B4ED24E4E6B04), + SPH_C64(0xD151B7C4B7B75166), SPH_C64(0x0B60EB20EBEB60E0), + SPH_C64(0xFDCC3C443C3CCCC1), SPH_C64(0x7CBF819E8181BFFD), + SPH_C64(0xD4FE94A19494FE40), SPH_C64(0xEB0CF704F7F70C1C), + SPH_C64(0xA167B9D6B9B96718), SPH_C64(0x985F133513135F8B), + SPH_C64(0x7D9C2C742C2C9C51), SPH_C64(0xD6B8D368D3D3B805), + SPH_C64(0x6B5CE734E7E75C8C), SPH_C64(0x57CB6EB26E6ECB39), + SPH_C64(0x6EF3C451C4C4F3AA), SPH_C64(0x180F030503030F1B), + SPH_C64(0x8A1356FA565613DC), SPH_C64(0x1A4944CC4444495E), + SPH_C64(0xDF9E7F817F7F9EA0), SPH_C64(0x2137A9E6A9A93788), + SPH_C64(0x4D822A7E2A2A8267), SPH_C64(0xB16DBBD0BBBB6D0A), + SPH_C64(0x46E2C15EC1C1E287), SPH_C64(0xA20253F5535302F1), + SPH_C64(0xAE8BDC79DCDC8B72), SPH_C64(0x58270B1D0B0B2753), + SPH_C64(0x9CD39DBA9D9DD301), SPH_C64(0x47C16CB46C6CC12B), + SPH_C64(0x95F531533131F5A4), SPH_C64(0x87B9749C7474B9F3), + SPH_C64(0xE309F607F6F60915), SPH_C64(0x0A4346CA4646434C), + SPH_C64(0x0926ACE9ACAC26A5), SPH_C64(0x3C978986898997B5), + SPH_C64(0xA044143C141444B4), SPH_C64(0x5B42E13EE1E142BA), + SPH_C64(0xB04E163A16164EA6), SPH_C64(0xCDD23A4E3A3AD2F7), + SPH_C64(0x6FD069BB6969D006), SPH_C64(0x482D091B09092D41), + SPH_C64(0xA7AD70907070ADD7), SPH_C64(0xD954B6C7B6B6546F), + SPH_C64(0xCEB7D06DD0D0B71E), SPH_C64(0x3B7EED2AEDED7ED6), + SPH_C64(0x2EDBCC49CCCCDBE2), SPH_C64(0x2A5742C642425768), + SPH_C64(0xB4C298B59898C22C), SPH_C64(0x490EA4F1A4A40EED), + SPH_C64(0x5D88287828288875), SPH_C64(0xDA315CE45C5C3186), + SPH_C64(0x933FF815F8F83F6B), SPH_C64(0x44A486978686A4C2) +}; + +static const sph_u64 old1_T3[256] = { + SPH_C64(0x781828181878D8C0), SPH_C64(0xAF23652323AF2605), + SPH_C64(0xF9C657C6C6F9B87E), SPH_C64(0x6FE825E8E86FFB13), + SPH_C64(0xA187948787A1CB4C), SPH_C64(0x62B8D5B8B86211A9), + SPH_C64(0x0501030101050908), SPH_C64(0x6E4FD14F4F6E0D42), + SPH_C64(0xEE365A3636EE9BAD), SPH_C64(0x04A6F7A6A604FF59), + SPH_C64(0xBDD26BD2D2BD0CDE), SPH_C64(0x06F502F5F5060EFB), + SPH_C64(0x80798B79798096EF), SPH_C64(0xCE6FB16F6FCE305F), + SPH_C64(0xEF91AE9191EF6DFC), SPH_C64(0x0752F6525207F8AA), + SPH_C64(0xFD60A06060FD4727), SPH_C64(0x76BCD9BCBC763589), + SPH_C64(0xCD9BB09B9BCD37AC), SPH_C64(0x8C8E8F8E8E8C8A04), + SPH_C64(0x15A3F8A3A315D271), SPH_C64(0x3C0C140C0C3C6C60), + SPH_C64(0x8A7B8D7B7B8A84FF), SPH_C64(0xE1355F3535E180B5), + SPH_C64(0x691D271D1D69F5E8), SPH_C64(0x47E03DE0E047B353), + SPH_C64(0xACD764D7D7AC21F6), SPH_C64(0xEDC25BC2C2ED9C5E), + SPH_C64(0x962E722E2E96436D), SPH_C64(0x7A4BDD4B4B7A2962), + SPH_C64(0x21FE1FFEFE215DA3), SPH_C64(0x1657F9575716D582), + SPH_C64(0x41153F151541BDA8), SPH_C64(0xB677997777B6E89F), + SPH_C64(0xEB37593737EB92A5), SPH_C64(0x56E532E5E5569E7B), + SPH_C64(0xD99FBC9F9FD9138C), SPH_C64(0x17F00DF0F01723D3), + SPH_C64(0x7F4ADE4A4A7F206A), SPH_C64(0x95DA73DADA95449E), + SPH_C64(0x2558E8585825A2FA), SPH_C64(0xCAC946C9C9CACF06), + SPH_C64(0x8D297B29298D7C55), SPH_C64(0x220A1E0A0A225A50), + SPH_C64(0x4FB1CEB1B14F50E1), SPH_C64(0x1AA0FDA0A01AC969), + SPH_C64(0xDA6BBD6B6BDA147F), SPH_C64(0xAB85928585ABD95C), + SPH_C64(0x73BDDABDBD733C81), SPH_C64(0x345DE75D5D348FD2), + SPH_C64(0x5010301010509080), 
SPH_C64(0x03F401F4F40307F3), + SPH_C64(0xC0CB40CBCBC0DD16), SPH_C64(0xC63E423E3EC6D3ED), + SPH_C64(0x11050F0505112D28), SPH_C64(0xE667A96767E6781F), + SPH_C64(0x53E431E4E4539773), SPH_C64(0xBB27692727BB0225), + SPH_C64(0x5841C34141587332), SPH_C64(0x9D8B808B8B9DA72C), + SPH_C64(0x01A7F4A7A701F651), SPH_C64(0x947D877D7D94B2CF), + SPH_C64(0xFB95A29595FB49DC), SPH_C64(0x9FD875D8D89F568E), + SPH_C64(0x30FB10FBFB30708B), SPH_C64(0x71EE2FEEEE71CD23), + SPH_C64(0x917C847C7C91BBC7), SPH_C64(0xE366AA6666E37117), + SPH_C64(0x8EDD7ADDDD8E7BA6), SPH_C64(0x4B173917174BAFB8), + SPH_C64(0x4647C94747464502), SPH_C64(0xDC9EBF9E9EDC1A84), + SPH_C64(0xC5CA43CACAC5D41E), SPH_C64(0x992D772D2D995875), + SPH_C64(0x79BFDCBFBF792E91), SPH_C64(0x1B070907071B3F38), + SPH_C64(0x23ADEAADAD23AC01), SPH_C64(0x2F5AEE5A5A2FB0EA), + SPH_C64(0xB583988383B5EF6C), SPH_C64(0xFF33553333FFB685), + SPH_C64(0xF263A56363F25C3F), SPH_C64(0x0A020602020A1210), + SPH_C64(0x38AAE3AAAA389339), SPH_C64(0xA871937171A8DEAF), + SPH_C64(0xCFC845C8C8CFC60E), SPH_C64(0x7D192B19197DD1C8), + SPH_C64(0x7049DB4949703B72), SPH_C64(0x9AD976D9D99A5F86), + SPH_C64(0x1DF20BF2F21D31C3), SPH_C64(0x48E338E3E348A84B), + SPH_C64(0x2A5BED5B5B2AB9E2), SPH_C64(0x928885888892BC34), + SPH_C64(0xC89AB39A9AC83EA4), SPH_C64(0xBE266A2626BE0B2D), + SPH_C64(0xFA32563232FABF8D), SPH_C64(0x4AB0CDB0B04A59E9), + SPH_C64(0x6AE926E9E96AF21B), SPH_C64(0x330F110F0F337778), + SPH_C64(0xA6D562D5D5A633E6), SPH_C64(0xBA809D8080BAF474), + SPH_C64(0x7CBEDFBEBE7C2799), SPH_C64(0xDECD4ACDCDDEEB26), + SPH_C64(0xE4345C3434E489BD), SPH_C64(0x7548D8484875327A), + SPH_C64(0x24FF1CFFFF2454AB), SPH_C64(0x8F7A8E7A7A8F8DF7), + SPH_C64(0xEA90AD9090EA64F4), SPH_C64(0x3E5FE15F5F3E9DC2), + SPH_C64(0xA020602020A03D1D), SPH_C64(0xD568B86868D50F67), + SPH_C64(0x721A2E1A1A72CAD0), SPH_C64(0x2CAEEFAEAE2CB719), + SPH_C64(0x5EB4C1B4B45E7DC9), SPH_C64(0x1954FC545419CE9A), + SPH_C64(0xE593A89393E57FEC), SPH_C64(0xAA22662222AA2F0D), + SPH_C64(0xE964AC6464E96307), SPH_C64(0x12F10EF1F1122ADB), + SPH_C64(0xA273957373A2CCBF), SPH_C64(0x5A123612125A8290), + SPH_C64(0x5D40C040405D7A3A), SPH_C64(0x2808180808284840), + SPH_C64(0xE8C358C3C3E89556), SPH_C64(0x7BEC29ECEC7BDF33), + SPH_C64(0x90DB70DBDB904D96), SPH_C64(0x1FA1FEA1A11FC061), + SPH_C64(0x838D8A8D8D83911C), SPH_C64(0xC93D473D3DC9C8F5), + SPH_C64(0xF197A49797F15BCC), SPH_C64(0x0000000000000000), + SPH_C64(0xD4CF4CCFCFD4F936), SPH_C64(0x872B7D2B2B876E45), + SPH_C64(0xB3769A7676B3E197), SPH_C64(0xB0829B8282B0E664), + SPH_C64(0xA9D667D6D6A928FE), SPH_C64(0x771B2D1B1B77C3D8), + SPH_C64(0x5BB5C2B5B55B74C1), SPH_C64(0x29AFECAFAF29BE11), + SPH_C64(0xDF6ABE6A6ADF1D77), SPH_C64(0x0D50F050500DEABA), + SPH_C64(0x4C45CF45454C5712), SPH_C64(0x18F308F3F31838CB), + SPH_C64(0xF030503030F0AD9D), SPH_C64(0x74EF2CEFEF74C42B), + SPH_C64(0xC33F413F3FC3DAE5), SPH_C64(0x1C55FF55551CC792), + SPH_C64(0x10A2FBA2A210DB79), SPH_C64(0x65EA23EAEA65E903), + SPH_C64(0xEC65AF6565EC6A0F), SPH_C64(0x68BAD3BABA6803B9), + SPH_C64(0x932F712F2F934A65), SPH_C64(0xE7C05DC0C0E78E4E), + SPH_C64(0x81DE7FDEDE8160BE), SPH_C64(0x6C1C241C1C6CFCE0), + SPH_C64(0x2EFD1AFDFD2E46BB), SPH_C64(0x644DD74D4D641F52), + SPH_C64(0xE092AB9292E076E4), SPH_C64(0xBC759F7575BCFA8F), + SPH_C64(0x1E060A06061E3630), SPH_C64(0x988A838A8A98AE24), + SPH_C64(0x40B2CBB2B2404BF9), SPH_C64(0x59E637E6E6598563), + SPH_C64(0x360E120E0E367E70), SPH_C64(0x631F211F1F63E7F8), + SPH_C64(0xF762A66262F75537), SPH_C64(0xA3D461D4D4A33AEE), + SPH_C64(0x32A8E5A8A8328129), SPH_C64(0xF496A79696F452C4), + SPH_C64(0x3AF916F9F93A629B), 
SPH_C64(0xF6C552C5C5F6A366), + SPH_C64(0xB1256F2525B11035), SPH_C64(0x2059EB595920ABF2), + SPH_C64(0xAE84918484AED054), SPH_C64(0xA772967272A7C5B7), + SPH_C64(0xDD394B3939DDECD5), SPH_C64(0x614CD44C4C61165A), + SPH_C64(0x3B5EE25E5E3B94CA), SPH_C64(0x8578887878859FE7), + SPH_C64(0xD838483838D8E5DD), SPH_C64(0x868C898C8C869814), + SPH_C64(0xB2D16ED1D1B217C6), SPH_C64(0x0BA5F2A5A50BE441), + SPH_C64(0x4DE23BE2E24DA143), SPH_C64(0xF861A36161F84E2F), + SPH_C64(0x45B3C8B3B34542F1), SPH_C64(0xA521632121A53415), + SPH_C64(0xD69CB99C9CD60894), SPH_C64(0x661E221E1E66EEF0), + SPH_C64(0x5243C54343526122), SPH_C64(0xFCC754C7C7FCB176), + SPH_C64(0x2BFC19FCFC2B4FB3), SPH_C64(0x14040C0404142420), + SPH_C64(0x0851F3515108E3B2), SPH_C64(0xC799B69999C725BC), + SPH_C64(0xC46DB76D6DC4224F), SPH_C64(0x390D170D0D396568), + SPH_C64(0x35FA13FAFA357983), SPH_C64(0x84DF7CDFDF8469B6), + SPH_C64(0x9B7E827E7E9BA9D7), SPH_C64(0xB4246C2424B4193D), + SPH_C64(0xD73B4D3B3BD7FEC5), SPH_C64(0x3DABE0ABAB3D9A31), + SPH_C64(0xD1CE4FCECED1F03E), SPH_C64(0x5511331111559988), + SPH_C64(0x898F8C8F8F89830C), SPH_C64(0x6B4ED24E4E6B044A), + SPH_C64(0x51B7C4B7B75166D1), SPH_C64(0x60EB20EBEB60E00B), + SPH_C64(0xCC3C443C3CCCC1FD), SPH_C64(0xBF819E8181BFFD7C), + SPH_C64(0xFE94A19494FE40D4), SPH_C64(0x0CF704F7F70C1CEB), + SPH_C64(0x67B9D6B9B96718A1), SPH_C64(0x5F133513135F8B98), + SPH_C64(0x9C2C742C2C9C517D), SPH_C64(0xB8D368D3D3B805D6), + SPH_C64(0x5CE734E7E75C8C6B), SPH_C64(0xCB6EB26E6ECB3957), + SPH_C64(0xF3C451C4C4F3AA6E), SPH_C64(0x0F030503030F1B18), + SPH_C64(0x1356FA565613DC8A), SPH_C64(0x4944CC4444495E1A), + SPH_C64(0x9E7F817F7F9EA0DF), SPH_C64(0x37A9E6A9A9378821), + SPH_C64(0x822A7E2A2A82674D), SPH_C64(0x6DBBD0BBBB6D0AB1), + SPH_C64(0xE2C15EC1C1E28746), SPH_C64(0x0253F5535302F1A2), + SPH_C64(0x8BDC79DCDC8B72AE), SPH_C64(0x270B1D0B0B275358), + SPH_C64(0xD39DBA9D9DD3019C), SPH_C64(0xC16CB46C6CC12B47), + SPH_C64(0xF531533131F5A495), SPH_C64(0xB9749C7474B9F387), + SPH_C64(0x09F607F6F60915E3), SPH_C64(0x4346CA4646434C0A), + SPH_C64(0x26ACE9ACAC26A509), SPH_C64(0x978986898997B53C), + SPH_C64(0x44143C141444B4A0), SPH_C64(0x42E13EE1E142BA5B), + SPH_C64(0x4E163A16164EA6B0), SPH_C64(0xD23A4E3A3AD2F7CD), + SPH_C64(0xD069BB6969D0066F), SPH_C64(0x2D091B09092D4148), + SPH_C64(0xAD70907070ADD7A7), SPH_C64(0x54B6C7B6B6546FD9), + SPH_C64(0xB7D06DD0D0B71ECE), SPH_C64(0x7EED2AEDED7ED63B), + SPH_C64(0xDBCC49CCCCDBE22E), SPH_C64(0x5742C6424257682A), + SPH_C64(0xC298B59898C22CB4), SPH_C64(0x0EA4F1A4A40EED49), + SPH_C64(0x882878282888755D), SPH_C64(0x315CE45C5C3186DA), + SPH_C64(0x3FF815F8F83F6B93), SPH_C64(0xA486978686A4C244) +}; + +static const sph_u64 old1_T4[256] = { + SPH_C64(0x1828181878D8C078), SPH_C64(0x23652323AF2605AF), + SPH_C64(0xC657C6C6F9B87EF9), SPH_C64(0xE825E8E86FFB136F), + SPH_C64(0x87948787A1CB4CA1), SPH_C64(0xB8D5B8B86211A962), + SPH_C64(0x0103010105090805), SPH_C64(0x4FD14F4F6E0D426E), + SPH_C64(0x365A3636EE9BADEE), SPH_C64(0xA6F7A6A604FF5904), + SPH_C64(0xD26BD2D2BD0CDEBD), SPH_C64(0xF502F5F5060EFB06), + SPH_C64(0x798B79798096EF80), SPH_C64(0x6FB16F6FCE305FCE), + SPH_C64(0x91AE9191EF6DFCEF), SPH_C64(0x52F6525207F8AA07), + SPH_C64(0x60A06060FD4727FD), SPH_C64(0xBCD9BCBC76358976), + SPH_C64(0x9BB09B9BCD37ACCD), SPH_C64(0x8E8F8E8E8C8A048C), + SPH_C64(0xA3F8A3A315D27115), SPH_C64(0x0C140C0C3C6C603C), + SPH_C64(0x7B8D7B7B8A84FF8A), SPH_C64(0x355F3535E180B5E1), + SPH_C64(0x1D271D1D69F5E869), SPH_C64(0xE03DE0E047B35347), + SPH_C64(0xD764D7D7AC21F6AC), SPH_C64(0xC25BC2C2ED9C5EED), + SPH_C64(0x2E722E2E96436D96), SPH_C64(0x4BDD4B4B7A29627A), + 
SPH_C64(0xFE1FFEFE215DA321), SPH_C64(0x57F9575716D58216), + SPH_C64(0x153F151541BDA841), SPH_C64(0x77997777B6E89FB6), + SPH_C64(0x37593737EB92A5EB), SPH_C64(0xE532E5E5569E7B56), + SPH_C64(0x9FBC9F9FD9138CD9), SPH_C64(0xF00DF0F01723D317), + SPH_C64(0x4ADE4A4A7F206A7F), SPH_C64(0xDA73DADA95449E95), + SPH_C64(0x58E8585825A2FA25), SPH_C64(0xC946C9C9CACF06CA), + SPH_C64(0x297B29298D7C558D), SPH_C64(0x0A1E0A0A225A5022), + SPH_C64(0xB1CEB1B14F50E14F), SPH_C64(0xA0FDA0A01AC9691A), + SPH_C64(0x6BBD6B6BDA147FDA), SPH_C64(0x85928585ABD95CAB), + SPH_C64(0xBDDABDBD733C8173), SPH_C64(0x5DE75D5D348FD234), + SPH_C64(0x1030101050908050), SPH_C64(0xF401F4F40307F303), + SPH_C64(0xCB40CBCBC0DD16C0), SPH_C64(0x3E423E3EC6D3EDC6), + SPH_C64(0x050F0505112D2811), SPH_C64(0x67A96767E6781FE6), + SPH_C64(0xE431E4E453977353), SPH_C64(0x27692727BB0225BB), + SPH_C64(0x41C3414158733258), SPH_C64(0x8B808B8B9DA72C9D), + SPH_C64(0xA7F4A7A701F65101), SPH_C64(0x7D877D7D94B2CF94), + SPH_C64(0x95A29595FB49DCFB), SPH_C64(0xD875D8D89F568E9F), + SPH_C64(0xFB10FBFB30708B30), SPH_C64(0xEE2FEEEE71CD2371), + SPH_C64(0x7C847C7C91BBC791), SPH_C64(0x66AA6666E37117E3), + SPH_C64(0xDD7ADDDD8E7BA68E), SPH_C64(0x173917174BAFB84B), + SPH_C64(0x47C9474746450246), SPH_C64(0x9EBF9E9EDC1A84DC), + SPH_C64(0xCA43CACAC5D41EC5), SPH_C64(0x2D772D2D99587599), + SPH_C64(0xBFDCBFBF792E9179), SPH_C64(0x070907071B3F381B), + SPH_C64(0xADEAADAD23AC0123), SPH_C64(0x5AEE5A5A2FB0EA2F), + SPH_C64(0x83988383B5EF6CB5), SPH_C64(0x33553333FFB685FF), + SPH_C64(0x63A56363F25C3FF2), SPH_C64(0x020602020A12100A), + SPH_C64(0xAAE3AAAA38933938), SPH_C64(0x71937171A8DEAFA8), + SPH_C64(0xC845C8C8CFC60ECF), SPH_C64(0x192B19197DD1C87D), + SPH_C64(0x49DB4949703B7270), SPH_C64(0xD976D9D99A5F869A), + SPH_C64(0xF20BF2F21D31C31D), SPH_C64(0xE338E3E348A84B48), + SPH_C64(0x5BED5B5B2AB9E22A), SPH_C64(0x8885888892BC3492), + SPH_C64(0x9AB39A9AC83EA4C8), SPH_C64(0x266A2626BE0B2DBE), + SPH_C64(0x32563232FABF8DFA), SPH_C64(0xB0CDB0B04A59E94A), + SPH_C64(0xE926E9E96AF21B6A), SPH_C64(0x0F110F0F33777833), + SPH_C64(0xD562D5D5A633E6A6), SPH_C64(0x809D8080BAF474BA), + SPH_C64(0xBEDFBEBE7C27997C), SPH_C64(0xCD4ACDCDDEEB26DE), + SPH_C64(0x345C3434E489BDE4), SPH_C64(0x48D8484875327A75), + SPH_C64(0xFF1CFFFF2454AB24), SPH_C64(0x7A8E7A7A8F8DF78F), + SPH_C64(0x90AD9090EA64F4EA), SPH_C64(0x5FE15F5F3E9DC23E), + SPH_C64(0x20602020A03D1DA0), SPH_C64(0x68B86868D50F67D5), + SPH_C64(0x1A2E1A1A72CAD072), SPH_C64(0xAEEFAEAE2CB7192C), + SPH_C64(0xB4C1B4B45E7DC95E), SPH_C64(0x54FC545419CE9A19), + SPH_C64(0x93A89393E57FECE5), SPH_C64(0x22662222AA2F0DAA), + SPH_C64(0x64AC6464E96307E9), SPH_C64(0xF10EF1F1122ADB12), + SPH_C64(0x73957373A2CCBFA2), SPH_C64(0x123612125A82905A), + SPH_C64(0x40C040405D7A3A5D), SPH_C64(0x0818080828484028), + SPH_C64(0xC358C3C3E89556E8), SPH_C64(0xEC29ECEC7BDF337B), + SPH_C64(0xDB70DBDB904D9690), SPH_C64(0xA1FEA1A11FC0611F), + SPH_C64(0x8D8A8D8D83911C83), SPH_C64(0x3D473D3DC9C8F5C9), + SPH_C64(0x97A49797F15BCCF1), SPH_C64(0x0000000000000000), + SPH_C64(0xCF4CCFCFD4F936D4), SPH_C64(0x2B7D2B2B876E4587), + SPH_C64(0x769A7676B3E197B3), SPH_C64(0x829B8282B0E664B0), + SPH_C64(0xD667D6D6A928FEA9), SPH_C64(0x1B2D1B1B77C3D877), + SPH_C64(0xB5C2B5B55B74C15B), SPH_C64(0xAFECAFAF29BE1129), + SPH_C64(0x6ABE6A6ADF1D77DF), SPH_C64(0x50F050500DEABA0D), + SPH_C64(0x45CF45454C57124C), SPH_C64(0xF308F3F31838CB18), + SPH_C64(0x30503030F0AD9DF0), SPH_C64(0xEF2CEFEF74C42B74), + SPH_C64(0x3F413F3FC3DAE5C3), SPH_C64(0x55FF55551CC7921C), + SPH_C64(0xA2FBA2A210DB7910), SPH_C64(0xEA23EAEA65E90365), + 
SPH_C64(0x65AF6565EC6A0FEC), SPH_C64(0xBAD3BABA6803B968), + SPH_C64(0x2F712F2F934A6593), SPH_C64(0xC05DC0C0E78E4EE7), + SPH_C64(0xDE7FDEDE8160BE81), SPH_C64(0x1C241C1C6CFCE06C), + SPH_C64(0xFD1AFDFD2E46BB2E), SPH_C64(0x4DD74D4D641F5264), + SPH_C64(0x92AB9292E076E4E0), SPH_C64(0x759F7575BCFA8FBC), + SPH_C64(0x060A06061E36301E), SPH_C64(0x8A838A8A98AE2498), + SPH_C64(0xB2CBB2B2404BF940), SPH_C64(0xE637E6E659856359), + SPH_C64(0x0E120E0E367E7036), SPH_C64(0x1F211F1F63E7F863), + SPH_C64(0x62A66262F75537F7), SPH_C64(0xD461D4D4A33AEEA3), + SPH_C64(0xA8E5A8A832812932), SPH_C64(0x96A79696F452C4F4), + SPH_C64(0xF916F9F93A629B3A), SPH_C64(0xC552C5C5F6A366F6), + SPH_C64(0x256F2525B11035B1), SPH_C64(0x59EB595920ABF220), + SPH_C64(0x84918484AED054AE), SPH_C64(0x72967272A7C5B7A7), + SPH_C64(0x394B3939DDECD5DD), SPH_C64(0x4CD44C4C61165A61), + SPH_C64(0x5EE25E5E3B94CA3B), SPH_C64(0x78887878859FE785), + SPH_C64(0x38483838D8E5DDD8), SPH_C64(0x8C898C8C86981486), + SPH_C64(0xD16ED1D1B217C6B2), SPH_C64(0xA5F2A5A50BE4410B), + SPH_C64(0xE23BE2E24DA1434D), SPH_C64(0x61A36161F84E2FF8), + SPH_C64(0xB3C8B3B34542F145), SPH_C64(0x21632121A53415A5), + SPH_C64(0x9CB99C9CD60894D6), SPH_C64(0x1E221E1E66EEF066), + SPH_C64(0x43C5434352612252), SPH_C64(0xC754C7C7FCB176FC), + SPH_C64(0xFC19FCFC2B4FB32B), SPH_C64(0x040C040414242014), + SPH_C64(0x51F3515108E3B208), SPH_C64(0x99B69999C725BCC7), + SPH_C64(0x6DB76D6DC4224FC4), SPH_C64(0x0D170D0D39656839), + SPH_C64(0xFA13FAFA35798335), SPH_C64(0xDF7CDFDF8469B684), + SPH_C64(0x7E827E7E9BA9D79B), SPH_C64(0x246C2424B4193DB4), + SPH_C64(0x3B4D3B3BD7FEC5D7), SPH_C64(0xABE0ABAB3D9A313D), + SPH_C64(0xCE4FCECED1F03ED1), SPH_C64(0x1133111155998855), + SPH_C64(0x8F8C8F8F89830C89), SPH_C64(0x4ED24E4E6B044A6B), + SPH_C64(0xB7C4B7B75166D151), SPH_C64(0xEB20EBEB60E00B60), + SPH_C64(0x3C443C3CCCC1FDCC), SPH_C64(0x819E8181BFFD7CBF), + SPH_C64(0x94A19494FE40D4FE), SPH_C64(0xF704F7F70C1CEB0C), + SPH_C64(0xB9D6B9B96718A167), SPH_C64(0x133513135F8B985F), + SPH_C64(0x2C742C2C9C517D9C), SPH_C64(0xD368D3D3B805D6B8), + SPH_C64(0xE734E7E75C8C6B5C), SPH_C64(0x6EB26E6ECB3957CB), + SPH_C64(0xC451C4C4F3AA6EF3), SPH_C64(0x030503030F1B180F), + SPH_C64(0x56FA565613DC8A13), SPH_C64(0x44CC4444495E1A49), + SPH_C64(0x7F817F7F9EA0DF9E), SPH_C64(0xA9E6A9A937882137), + SPH_C64(0x2A7E2A2A82674D82), SPH_C64(0xBBD0BBBB6D0AB16D), + SPH_C64(0xC15EC1C1E28746E2), SPH_C64(0x53F5535302F1A202), + SPH_C64(0xDC79DCDC8B72AE8B), SPH_C64(0x0B1D0B0B27535827), + SPH_C64(0x9DBA9D9DD3019CD3), SPH_C64(0x6CB46C6CC12B47C1), + SPH_C64(0x31533131F5A495F5), SPH_C64(0x749C7474B9F387B9), + SPH_C64(0xF607F6F60915E309), SPH_C64(0x46CA4646434C0A43), + SPH_C64(0xACE9ACAC26A50926), SPH_C64(0x8986898997B53C97), + SPH_C64(0x143C141444B4A044), SPH_C64(0xE13EE1E142BA5B42), + SPH_C64(0x163A16164EA6B04E), SPH_C64(0x3A4E3A3AD2F7CDD2), + SPH_C64(0x69BB6969D0066FD0), SPH_C64(0x091B09092D41482D), + SPH_C64(0x70907070ADD7A7AD), SPH_C64(0xB6C7B6B6546FD954), + SPH_C64(0xD06DD0D0B71ECEB7), SPH_C64(0xED2AEDED7ED63B7E), + SPH_C64(0xCC49CCCCDBE22EDB), SPH_C64(0x42C6424257682A57), + SPH_C64(0x98B59898C22CB4C2), SPH_C64(0xA4F1A4A40EED490E), + SPH_C64(0x2878282888755D88), SPH_C64(0x5CE45C5C3186DA31), + SPH_C64(0xF815F8F83F6B933F), SPH_C64(0x86978686A4C244A4) +}; + +static const sph_u64 old1_T5[256] = { + SPH_C64(0x28181878D8C07818), SPH_C64(0x652323AF2605AF23), + SPH_C64(0x57C6C6F9B87EF9C6), SPH_C64(0x25E8E86FFB136FE8), + SPH_C64(0x948787A1CB4CA187), SPH_C64(0xD5B8B86211A962B8), + SPH_C64(0x0301010509080501), SPH_C64(0xD14F4F6E0D426E4F), + SPH_C64(0x5A3636EE9BADEE36), 
SPH_C64(0xF7A6A604FF5904A6), + SPH_C64(0x6BD2D2BD0CDEBDD2), SPH_C64(0x02F5F5060EFB06F5), + SPH_C64(0x8B79798096EF8079), SPH_C64(0xB16F6FCE305FCE6F), + SPH_C64(0xAE9191EF6DFCEF91), SPH_C64(0xF6525207F8AA0752), + SPH_C64(0xA06060FD4727FD60), SPH_C64(0xD9BCBC76358976BC), + SPH_C64(0xB09B9BCD37ACCD9B), SPH_C64(0x8F8E8E8C8A048C8E), + SPH_C64(0xF8A3A315D27115A3), SPH_C64(0x140C0C3C6C603C0C), + SPH_C64(0x8D7B7B8A84FF8A7B), SPH_C64(0x5F3535E180B5E135), + SPH_C64(0x271D1D69F5E8691D), SPH_C64(0x3DE0E047B35347E0), + SPH_C64(0x64D7D7AC21F6ACD7), SPH_C64(0x5BC2C2ED9C5EEDC2), + SPH_C64(0x722E2E96436D962E), SPH_C64(0xDD4B4B7A29627A4B), + SPH_C64(0x1FFEFE215DA321FE), SPH_C64(0xF9575716D5821657), + SPH_C64(0x3F151541BDA84115), SPH_C64(0x997777B6E89FB677), + SPH_C64(0x593737EB92A5EB37), SPH_C64(0x32E5E5569E7B56E5), + SPH_C64(0xBC9F9FD9138CD99F), SPH_C64(0x0DF0F01723D317F0), + SPH_C64(0xDE4A4A7F206A7F4A), SPH_C64(0x73DADA95449E95DA), + SPH_C64(0xE8585825A2FA2558), SPH_C64(0x46C9C9CACF06CAC9), + SPH_C64(0x7B29298D7C558D29), SPH_C64(0x1E0A0A225A50220A), + SPH_C64(0xCEB1B14F50E14FB1), SPH_C64(0xFDA0A01AC9691AA0), + SPH_C64(0xBD6B6BDA147FDA6B), SPH_C64(0x928585ABD95CAB85), + SPH_C64(0xDABDBD733C8173BD), SPH_C64(0xE75D5D348FD2345D), + SPH_C64(0x3010105090805010), SPH_C64(0x01F4F40307F303F4), + SPH_C64(0x40CBCBC0DD16C0CB), SPH_C64(0x423E3EC6D3EDC63E), + SPH_C64(0x0F0505112D281105), SPH_C64(0xA96767E6781FE667), + SPH_C64(0x31E4E453977353E4), SPH_C64(0x692727BB0225BB27), + SPH_C64(0xC341415873325841), SPH_C64(0x808B8B9DA72C9D8B), + SPH_C64(0xF4A7A701F65101A7), SPH_C64(0x877D7D94B2CF947D), + SPH_C64(0xA29595FB49DCFB95), SPH_C64(0x75D8D89F568E9FD8), + SPH_C64(0x10FBFB30708B30FB), SPH_C64(0x2FEEEE71CD2371EE), + SPH_C64(0x847C7C91BBC7917C), SPH_C64(0xAA6666E37117E366), + SPH_C64(0x7ADDDD8E7BA68EDD), SPH_C64(0x3917174BAFB84B17), + SPH_C64(0xC947474645024647), SPH_C64(0xBF9E9EDC1A84DC9E), + SPH_C64(0x43CACAC5D41EC5CA), SPH_C64(0x772D2D995875992D), + SPH_C64(0xDCBFBF792E9179BF), SPH_C64(0x0907071B3F381B07), + SPH_C64(0xEAADAD23AC0123AD), SPH_C64(0xEE5A5A2FB0EA2F5A), + SPH_C64(0x988383B5EF6CB583), SPH_C64(0x553333FFB685FF33), + SPH_C64(0xA56363F25C3FF263), SPH_C64(0x0602020A12100A02), + SPH_C64(0xE3AAAA38933938AA), SPH_C64(0x937171A8DEAFA871), + SPH_C64(0x45C8C8CFC60ECFC8), SPH_C64(0x2B19197DD1C87D19), + SPH_C64(0xDB4949703B727049), SPH_C64(0x76D9D99A5F869AD9), + SPH_C64(0x0BF2F21D31C31DF2), SPH_C64(0x38E3E348A84B48E3), + SPH_C64(0xED5B5B2AB9E22A5B), SPH_C64(0x85888892BC349288), + SPH_C64(0xB39A9AC83EA4C89A), SPH_C64(0x6A2626BE0B2DBE26), + SPH_C64(0x563232FABF8DFA32), SPH_C64(0xCDB0B04A59E94AB0), + SPH_C64(0x26E9E96AF21B6AE9), SPH_C64(0x110F0F337778330F), + SPH_C64(0x62D5D5A633E6A6D5), SPH_C64(0x9D8080BAF474BA80), + SPH_C64(0xDFBEBE7C27997CBE), SPH_C64(0x4ACDCDDEEB26DECD), + SPH_C64(0x5C3434E489BDE434), SPH_C64(0xD8484875327A7548), + SPH_C64(0x1CFFFF2454AB24FF), SPH_C64(0x8E7A7A8F8DF78F7A), + SPH_C64(0xAD9090EA64F4EA90), SPH_C64(0xE15F5F3E9DC23E5F), + SPH_C64(0x602020A03D1DA020), SPH_C64(0xB86868D50F67D568), + SPH_C64(0x2E1A1A72CAD0721A), SPH_C64(0xEFAEAE2CB7192CAE), + SPH_C64(0xC1B4B45E7DC95EB4), SPH_C64(0xFC545419CE9A1954), + SPH_C64(0xA89393E57FECE593), SPH_C64(0x662222AA2F0DAA22), + SPH_C64(0xAC6464E96307E964), SPH_C64(0x0EF1F1122ADB12F1), + SPH_C64(0x957373A2CCBFA273), SPH_C64(0x3612125A82905A12), + SPH_C64(0xC040405D7A3A5D40), SPH_C64(0x1808082848402808), + SPH_C64(0x58C3C3E89556E8C3), SPH_C64(0x29ECEC7BDF337BEC), + SPH_C64(0x70DBDB904D9690DB), SPH_C64(0xFEA1A11FC0611FA1), + SPH_C64(0x8A8D8D83911C838D), 
SPH_C64(0x473D3DC9C8F5C93D), + SPH_C64(0xA49797F15BCCF197), SPH_C64(0x0000000000000000), + SPH_C64(0x4CCFCFD4F936D4CF), SPH_C64(0x7D2B2B876E45872B), + SPH_C64(0x9A7676B3E197B376), SPH_C64(0x9B8282B0E664B082), + SPH_C64(0x67D6D6A928FEA9D6), SPH_C64(0x2D1B1B77C3D8771B), + SPH_C64(0xC2B5B55B74C15BB5), SPH_C64(0xECAFAF29BE1129AF), + SPH_C64(0xBE6A6ADF1D77DF6A), SPH_C64(0xF050500DEABA0D50), + SPH_C64(0xCF45454C57124C45), SPH_C64(0x08F3F31838CB18F3), + SPH_C64(0x503030F0AD9DF030), SPH_C64(0x2CEFEF74C42B74EF), + SPH_C64(0x413F3FC3DAE5C33F), SPH_C64(0xFF55551CC7921C55), + SPH_C64(0xFBA2A210DB7910A2), SPH_C64(0x23EAEA65E90365EA), + SPH_C64(0xAF6565EC6A0FEC65), SPH_C64(0xD3BABA6803B968BA), + SPH_C64(0x712F2F934A65932F), SPH_C64(0x5DC0C0E78E4EE7C0), + SPH_C64(0x7FDEDE8160BE81DE), SPH_C64(0x241C1C6CFCE06C1C), + SPH_C64(0x1AFDFD2E46BB2EFD), SPH_C64(0xD74D4D641F52644D), + SPH_C64(0xAB9292E076E4E092), SPH_C64(0x9F7575BCFA8FBC75), + SPH_C64(0x0A06061E36301E06), SPH_C64(0x838A8A98AE24988A), + SPH_C64(0xCBB2B2404BF940B2), SPH_C64(0x37E6E659856359E6), + SPH_C64(0x120E0E367E70360E), SPH_C64(0x211F1F63E7F8631F), + SPH_C64(0xA66262F75537F762), SPH_C64(0x61D4D4A33AEEA3D4), + SPH_C64(0xE5A8A832812932A8), SPH_C64(0xA79696F452C4F496), + SPH_C64(0x16F9F93A629B3AF9), SPH_C64(0x52C5C5F6A366F6C5), + SPH_C64(0x6F2525B11035B125), SPH_C64(0xEB595920ABF22059), + SPH_C64(0x918484AED054AE84), SPH_C64(0x967272A7C5B7A772), + SPH_C64(0x4B3939DDECD5DD39), SPH_C64(0xD44C4C61165A614C), + SPH_C64(0xE25E5E3B94CA3B5E), SPH_C64(0x887878859FE78578), + SPH_C64(0x483838D8E5DDD838), SPH_C64(0x898C8C869814868C), + SPH_C64(0x6ED1D1B217C6B2D1), SPH_C64(0xF2A5A50BE4410BA5), + SPH_C64(0x3BE2E24DA1434DE2), SPH_C64(0xA36161F84E2FF861), + SPH_C64(0xC8B3B34542F145B3), SPH_C64(0x632121A53415A521), + SPH_C64(0xB99C9CD60894D69C), SPH_C64(0x221E1E66EEF0661E), + SPH_C64(0xC543435261225243), SPH_C64(0x54C7C7FCB176FCC7), + SPH_C64(0x19FCFC2B4FB32BFC), SPH_C64(0x0C04041424201404), + SPH_C64(0xF3515108E3B20851), SPH_C64(0xB69999C725BCC799), + SPH_C64(0xB76D6DC4224FC46D), SPH_C64(0x170D0D396568390D), + SPH_C64(0x13FAFA35798335FA), SPH_C64(0x7CDFDF8469B684DF), + SPH_C64(0x827E7E9BA9D79B7E), SPH_C64(0x6C2424B4193DB424), + SPH_C64(0x4D3B3BD7FEC5D73B), SPH_C64(0xE0ABAB3D9A313DAB), + SPH_C64(0x4FCECED1F03ED1CE), SPH_C64(0x3311115599885511), + SPH_C64(0x8C8F8F89830C898F), SPH_C64(0xD24E4E6B044A6B4E), + SPH_C64(0xC4B7B75166D151B7), SPH_C64(0x20EBEB60E00B60EB), + SPH_C64(0x443C3CCCC1FDCC3C), SPH_C64(0x9E8181BFFD7CBF81), + SPH_C64(0xA19494FE40D4FE94), SPH_C64(0x04F7F70C1CEB0CF7), + SPH_C64(0xD6B9B96718A167B9), SPH_C64(0x3513135F8B985F13), + SPH_C64(0x742C2C9C517D9C2C), SPH_C64(0x68D3D3B805D6B8D3), + SPH_C64(0x34E7E75C8C6B5CE7), SPH_C64(0xB26E6ECB3957CB6E), + SPH_C64(0x51C4C4F3AA6EF3C4), SPH_C64(0x0503030F1B180F03), + SPH_C64(0xFA565613DC8A1356), SPH_C64(0xCC4444495E1A4944), + SPH_C64(0x817F7F9EA0DF9E7F), SPH_C64(0xE6A9A937882137A9), + SPH_C64(0x7E2A2A82674D822A), SPH_C64(0xD0BBBB6D0AB16DBB), + SPH_C64(0x5EC1C1E28746E2C1), SPH_C64(0xF5535302F1A20253), + SPH_C64(0x79DCDC8B72AE8BDC), SPH_C64(0x1D0B0B275358270B), + SPH_C64(0xBA9D9DD3019CD39D), SPH_C64(0xB46C6CC12B47C16C), + SPH_C64(0x533131F5A495F531), SPH_C64(0x9C7474B9F387B974), + SPH_C64(0x07F6F60915E309F6), SPH_C64(0xCA4646434C0A4346), + SPH_C64(0xE9ACAC26A50926AC), SPH_C64(0x86898997B53C9789), + SPH_C64(0x3C141444B4A04414), SPH_C64(0x3EE1E142BA5B42E1), + SPH_C64(0x3A16164EA6B04E16), SPH_C64(0x4E3A3AD2F7CDD23A), + SPH_C64(0xBB6969D0066FD069), SPH_C64(0x1B09092D41482D09), + SPH_C64(0x907070ADD7A7AD70), 
SPH_C64(0xC7B6B6546FD954B6), + SPH_C64(0x6DD0D0B71ECEB7D0), SPH_C64(0x2AEDED7ED63B7EED), + SPH_C64(0x49CCCCDBE22EDBCC), SPH_C64(0xC6424257682A5742), + SPH_C64(0xB59898C22CB4C298), SPH_C64(0xF1A4A40EED490EA4), + SPH_C64(0x78282888755D8828), SPH_C64(0xE45C5C3186DA315C), + SPH_C64(0x15F8F83F6B933FF8), SPH_C64(0x978686A4C244A486) +}; + +static const sph_u64 old1_T6[256] = { + SPH_C64(0x181878D8C0781828), SPH_C64(0x2323AF2605AF2365), + SPH_C64(0xC6C6F9B87EF9C657), SPH_C64(0xE8E86FFB136FE825), + SPH_C64(0x8787A1CB4CA18794), SPH_C64(0xB8B86211A962B8D5), + SPH_C64(0x0101050908050103), SPH_C64(0x4F4F6E0D426E4FD1), + SPH_C64(0x3636EE9BADEE365A), SPH_C64(0xA6A604FF5904A6F7), + SPH_C64(0xD2D2BD0CDEBDD26B), SPH_C64(0xF5F5060EFB06F502), + SPH_C64(0x79798096EF80798B), SPH_C64(0x6F6FCE305FCE6FB1), + SPH_C64(0x9191EF6DFCEF91AE), SPH_C64(0x525207F8AA0752F6), + SPH_C64(0x6060FD4727FD60A0), SPH_C64(0xBCBC76358976BCD9), + SPH_C64(0x9B9BCD37ACCD9BB0), SPH_C64(0x8E8E8C8A048C8E8F), + SPH_C64(0xA3A315D27115A3F8), SPH_C64(0x0C0C3C6C603C0C14), + SPH_C64(0x7B7B8A84FF8A7B8D), SPH_C64(0x3535E180B5E1355F), + SPH_C64(0x1D1D69F5E8691D27), SPH_C64(0xE0E047B35347E03D), + SPH_C64(0xD7D7AC21F6ACD764), SPH_C64(0xC2C2ED9C5EEDC25B), + SPH_C64(0x2E2E96436D962E72), SPH_C64(0x4B4B7A29627A4BDD), + SPH_C64(0xFEFE215DA321FE1F), SPH_C64(0x575716D5821657F9), + SPH_C64(0x151541BDA841153F), SPH_C64(0x7777B6E89FB67799), + SPH_C64(0x3737EB92A5EB3759), SPH_C64(0xE5E5569E7B56E532), + SPH_C64(0x9F9FD9138CD99FBC), SPH_C64(0xF0F01723D317F00D), + SPH_C64(0x4A4A7F206A7F4ADE), SPH_C64(0xDADA95449E95DA73), + SPH_C64(0x585825A2FA2558E8), SPH_C64(0xC9C9CACF06CAC946), + SPH_C64(0x29298D7C558D297B), SPH_C64(0x0A0A225A50220A1E), + SPH_C64(0xB1B14F50E14FB1CE), SPH_C64(0xA0A01AC9691AA0FD), + SPH_C64(0x6B6BDA147FDA6BBD), SPH_C64(0x8585ABD95CAB8592), + SPH_C64(0xBDBD733C8173BDDA), SPH_C64(0x5D5D348FD2345DE7), + SPH_C64(0x1010509080501030), SPH_C64(0xF4F40307F303F401), + SPH_C64(0xCBCBC0DD16C0CB40), SPH_C64(0x3E3EC6D3EDC63E42), + SPH_C64(0x0505112D2811050F), SPH_C64(0x6767E6781FE667A9), + SPH_C64(0xE4E453977353E431), SPH_C64(0x2727BB0225BB2769), + SPH_C64(0x41415873325841C3), SPH_C64(0x8B8B9DA72C9D8B80), + SPH_C64(0xA7A701F65101A7F4), SPH_C64(0x7D7D94B2CF947D87), + SPH_C64(0x9595FB49DCFB95A2), SPH_C64(0xD8D89F568E9FD875), + SPH_C64(0xFBFB30708B30FB10), SPH_C64(0xEEEE71CD2371EE2F), + SPH_C64(0x7C7C91BBC7917C84), SPH_C64(0x6666E37117E366AA), + SPH_C64(0xDDDD8E7BA68EDD7A), SPH_C64(0x17174BAFB84B1739), + SPH_C64(0x47474645024647C9), SPH_C64(0x9E9EDC1A84DC9EBF), + SPH_C64(0xCACAC5D41EC5CA43), SPH_C64(0x2D2D995875992D77), + SPH_C64(0xBFBF792E9179BFDC), SPH_C64(0x07071B3F381B0709), + SPH_C64(0xADAD23AC0123ADEA), SPH_C64(0x5A5A2FB0EA2F5AEE), + SPH_C64(0x8383B5EF6CB58398), SPH_C64(0x3333FFB685FF3355), + SPH_C64(0x6363F25C3FF263A5), SPH_C64(0x02020A12100A0206), + SPH_C64(0xAAAA38933938AAE3), SPH_C64(0x7171A8DEAFA87193), + SPH_C64(0xC8C8CFC60ECFC845), SPH_C64(0x19197DD1C87D192B), + SPH_C64(0x4949703B727049DB), SPH_C64(0xD9D99A5F869AD976), + SPH_C64(0xF2F21D31C31DF20B), SPH_C64(0xE3E348A84B48E338), + SPH_C64(0x5B5B2AB9E22A5BED), SPH_C64(0x888892BC34928885), + SPH_C64(0x9A9AC83EA4C89AB3), SPH_C64(0x2626BE0B2DBE266A), + SPH_C64(0x3232FABF8DFA3256), SPH_C64(0xB0B04A59E94AB0CD), + SPH_C64(0xE9E96AF21B6AE926), SPH_C64(0x0F0F337778330F11), + SPH_C64(0xD5D5A633E6A6D562), SPH_C64(0x8080BAF474BA809D), + SPH_C64(0xBEBE7C27997CBEDF), SPH_C64(0xCDCDDEEB26DECD4A), + SPH_C64(0x3434E489BDE4345C), SPH_C64(0x484875327A7548D8), + SPH_C64(0xFFFF2454AB24FF1C), SPH_C64(0x7A7A8F8DF78F7A8E), + 
SPH_C64(0x9090EA64F4EA90AD), SPH_C64(0x5F5F3E9DC23E5FE1), + SPH_C64(0x2020A03D1DA02060), SPH_C64(0x6868D50F67D568B8), + SPH_C64(0x1A1A72CAD0721A2E), SPH_C64(0xAEAE2CB7192CAEEF), + SPH_C64(0xB4B45E7DC95EB4C1), SPH_C64(0x545419CE9A1954FC), + SPH_C64(0x9393E57FECE593A8), SPH_C64(0x2222AA2F0DAA2266), + SPH_C64(0x6464E96307E964AC), SPH_C64(0xF1F1122ADB12F10E), + SPH_C64(0x7373A2CCBFA27395), SPH_C64(0x12125A82905A1236), + SPH_C64(0x40405D7A3A5D40C0), SPH_C64(0x0808284840280818), + SPH_C64(0xC3C3E89556E8C358), SPH_C64(0xECEC7BDF337BEC29), + SPH_C64(0xDBDB904D9690DB70), SPH_C64(0xA1A11FC0611FA1FE), + SPH_C64(0x8D8D83911C838D8A), SPH_C64(0x3D3DC9C8F5C93D47), + SPH_C64(0x9797F15BCCF197A4), SPH_C64(0x0000000000000000), + SPH_C64(0xCFCFD4F936D4CF4C), SPH_C64(0x2B2B876E45872B7D), + SPH_C64(0x7676B3E197B3769A), SPH_C64(0x8282B0E664B0829B), + SPH_C64(0xD6D6A928FEA9D667), SPH_C64(0x1B1B77C3D8771B2D), + SPH_C64(0xB5B55B74C15BB5C2), SPH_C64(0xAFAF29BE1129AFEC), + SPH_C64(0x6A6ADF1D77DF6ABE), SPH_C64(0x50500DEABA0D50F0), + SPH_C64(0x45454C57124C45CF), SPH_C64(0xF3F31838CB18F308), + SPH_C64(0x3030F0AD9DF03050), SPH_C64(0xEFEF74C42B74EF2C), + SPH_C64(0x3F3FC3DAE5C33F41), SPH_C64(0x55551CC7921C55FF), + SPH_C64(0xA2A210DB7910A2FB), SPH_C64(0xEAEA65E90365EA23), + SPH_C64(0x6565EC6A0FEC65AF), SPH_C64(0xBABA6803B968BAD3), + SPH_C64(0x2F2F934A65932F71), SPH_C64(0xC0C0E78E4EE7C05D), + SPH_C64(0xDEDE8160BE81DE7F), SPH_C64(0x1C1C6CFCE06C1C24), + SPH_C64(0xFDFD2E46BB2EFD1A), SPH_C64(0x4D4D641F52644DD7), + SPH_C64(0x9292E076E4E092AB), SPH_C64(0x7575BCFA8FBC759F), + SPH_C64(0x06061E36301E060A), SPH_C64(0x8A8A98AE24988A83), + SPH_C64(0xB2B2404BF940B2CB), SPH_C64(0xE6E659856359E637), + SPH_C64(0x0E0E367E70360E12), SPH_C64(0x1F1F63E7F8631F21), + SPH_C64(0x6262F75537F762A6), SPH_C64(0xD4D4A33AEEA3D461), + SPH_C64(0xA8A832812932A8E5), SPH_C64(0x9696F452C4F496A7), + SPH_C64(0xF9F93A629B3AF916), SPH_C64(0xC5C5F6A366F6C552), + SPH_C64(0x2525B11035B1256F), SPH_C64(0x595920ABF22059EB), + SPH_C64(0x8484AED054AE8491), SPH_C64(0x7272A7C5B7A77296), + SPH_C64(0x3939DDECD5DD394B), SPH_C64(0x4C4C61165A614CD4), + SPH_C64(0x5E5E3B94CA3B5EE2), SPH_C64(0x7878859FE7857888), + SPH_C64(0x3838D8E5DDD83848), SPH_C64(0x8C8C869814868C89), + SPH_C64(0xD1D1B217C6B2D16E), SPH_C64(0xA5A50BE4410BA5F2), + SPH_C64(0xE2E24DA1434DE23B), SPH_C64(0x6161F84E2FF861A3), + SPH_C64(0xB3B34542F145B3C8), SPH_C64(0x2121A53415A52163), + SPH_C64(0x9C9CD60894D69CB9), SPH_C64(0x1E1E66EEF0661E22), + SPH_C64(0x43435261225243C5), SPH_C64(0xC7C7FCB176FCC754), + SPH_C64(0xFCFC2B4FB32BFC19), SPH_C64(0x040414242014040C), + SPH_C64(0x515108E3B20851F3), SPH_C64(0x9999C725BCC799B6), + SPH_C64(0x6D6DC4224FC46DB7), SPH_C64(0x0D0D396568390D17), + SPH_C64(0xFAFA35798335FA13), SPH_C64(0xDFDF8469B684DF7C), + SPH_C64(0x7E7E9BA9D79B7E82), SPH_C64(0x2424B4193DB4246C), + SPH_C64(0x3B3BD7FEC5D73B4D), SPH_C64(0xABAB3D9A313DABE0), + SPH_C64(0xCECED1F03ED1CE4F), SPH_C64(0x1111559988551133), + SPH_C64(0x8F8F89830C898F8C), SPH_C64(0x4E4E6B044A6B4ED2), + SPH_C64(0xB7B75166D151B7C4), SPH_C64(0xEBEB60E00B60EB20), + SPH_C64(0x3C3CCCC1FDCC3C44), SPH_C64(0x8181BFFD7CBF819E), + SPH_C64(0x9494FE40D4FE94A1), SPH_C64(0xF7F70C1CEB0CF704), + SPH_C64(0xB9B96718A167B9D6), SPH_C64(0x13135F8B985F1335), + SPH_C64(0x2C2C9C517D9C2C74), SPH_C64(0xD3D3B805D6B8D368), + SPH_C64(0xE7E75C8C6B5CE734), SPH_C64(0x6E6ECB3957CB6EB2), + SPH_C64(0xC4C4F3AA6EF3C451), SPH_C64(0x03030F1B180F0305), + SPH_C64(0x565613DC8A1356FA), SPH_C64(0x4444495E1A4944CC), + SPH_C64(0x7F7F9EA0DF9E7F81), SPH_C64(0xA9A937882137A9E6), + 
SPH_C64(0x2A2A82674D822A7E), SPH_C64(0xBBBB6D0AB16DBBD0), + SPH_C64(0xC1C1E28746E2C15E), SPH_C64(0x535302F1A20253F5), + SPH_C64(0xDCDC8B72AE8BDC79), SPH_C64(0x0B0B275358270B1D), + SPH_C64(0x9D9DD3019CD39DBA), SPH_C64(0x6C6CC12B47C16CB4), + SPH_C64(0x3131F5A495F53153), SPH_C64(0x7474B9F387B9749C), + SPH_C64(0xF6F60915E309F607), SPH_C64(0x4646434C0A4346CA), + SPH_C64(0xACAC26A50926ACE9), SPH_C64(0x898997B53C978986), + SPH_C64(0x141444B4A044143C), SPH_C64(0xE1E142BA5B42E13E), + SPH_C64(0x16164EA6B04E163A), SPH_C64(0x3A3AD2F7CDD23A4E), + SPH_C64(0x6969D0066FD069BB), SPH_C64(0x09092D41482D091B), + SPH_C64(0x7070ADD7A7AD7090), SPH_C64(0xB6B6546FD954B6C7), + SPH_C64(0xD0D0B71ECEB7D06D), SPH_C64(0xEDED7ED63B7EED2A), + SPH_C64(0xCCCCDBE22EDBCC49), SPH_C64(0x424257682A5742C6), + SPH_C64(0x9898C22CB4C298B5), SPH_C64(0xA4A40EED490EA4F1), + SPH_C64(0x282888755D882878), SPH_C64(0x5C5C3186DA315CE4), + SPH_C64(0xF8F83F6B933FF815), SPH_C64(0x8686A4C244A48697) +}; + +static const sph_u64 old1_T7[256] = { + SPH_C64(0x1878D8C078182818), SPH_C64(0x23AF2605AF236523), + SPH_C64(0xC6F9B87EF9C657C6), SPH_C64(0xE86FFB136FE825E8), + SPH_C64(0x87A1CB4CA1879487), SPH_C64(0xB86211A962B8D5B8), + SPH_C64(0x0105090805010301), SPH_C64(0x4F6E0D426E4FD14F), + SPH_C64(0x36EE9BADEE365A36), SPH_C64(0xA604FF5904A6F7A6), + SPH_C64(0xD2BD0CDEBDD26BD2), SPH_C64(0xF5060EFB06F502F5), + SPH_C64(0x798096EF80798B79), SPH_C64(0x6FCE305FCE6FB16F), + SPH_C64(0x91EF6DFCEF91AE91), SPH_C64(0x5207F8AA0752F652), + SPH_C64(0x60FD4727FD60A060), SPH_C64(0xBC76358976BCD9BC), + SPH_C64(0x9BCD37ACCD9BB09B), SPH_C64(0x8E8C8A048C8E8F8E), + SPH_C64(0xA315D27115A3F8A3), SPH_C64(0x0C3C6C603C0C140C), + SPH_C64(0x7B8A84FF8A7B8D7B), SPH_C64(0x35E180B5E1355F35), + SPH_C64(0x1D69F5E8691D271D), SPH_C64(0xE047B35347E03DE0), + SPH_C64(0xD7AC21F6ACD764D7), SPH_C64(0xC2ED9C5EEDC25BC2), + SPH_C64(0x2E96436D962E722E), SPH_C64(0x4B7A29627A4BDD4B), + SPH_C64(0xFE215DA321FE1FFE), SPH_C64(0x5716D5821657F957), + SPH_C64(0x1541BDA841153F15), SPH_C64(0x77B6E89FB6779977), + SPH_C64(0x37EB92A5EB375937), SPH_C64(0xE5569E7B56E532E5), + SPH_C64(0x9FD9138CD99FBC9F), SPH_C64(0xF01723D317F00DF0), + SPH_C64(0x4A7F206A7F4ADE4A), SPH_C64(0xDA95449E95DA73DA), + SPH_C64(0x5825A2FA2558E858), SPH_C64(0xC9CACF06CAC946C9), + SPH_C64(0x298D7C558D297B29), SPH_C64(0x0A225A50220A1E0A), + SPH_C64(0xB14F50E14FB1CEB1), SPH_C64(0xA01AC9691AA0FDA0), + SPH_C64(0x6BDA147FDA6BBD6B), SPH_C64(0x85ABD95CAB859285), + SPH_C64(0xBD733C8173BDDABD), SPH_C64(0x5D348FD2345DE75D), + SPH_C64(0x1050908050103010), SPH_C64(0xF40307F303F401F4), + SPH_C64(0xCBC0DD16C0CB40CB), SPH_C64(0x3EC6D3EDC63E423E), + SPH_C64(0x05112D2811050F05), SPH_C64(0x67E6781FE667A967), + SPH_C64(0xE453977353E431E4), SPH_C64(0x27BB0225BB276927), + SPH_C64(0x415873325841C341), SPH_C64(0x8B9DA72C9D8B808B), + SPH_C64(0xA701F65101A7F4A7), SPH_C64(0x7D94B2CF947D877D), + SPH_C64(0x95FB49DCFB95A295), SPH_C64(0xD89F568E9FD875D8), + SPH_C64(0xFB30708B30FB10FB), SPH_C64(0xEE71CD2371EE2FEE), + SPH_C64(0x7C91BBC7917C847C), SPH_C64(0x66E37117E366AA66), + SPH_C64(0xDD8E7BA68EDD7ADD), SPH_C64(0x174BAFB84B173917), + SPH_C64(0x474645024647C947), SPH_C64(0x9EDC1A84DC9EBF9E), + SPH_C64(0xCAC5D41EC5CA43CA), SPH_C64(0x2D995875992D772D), + SPH_C64(0xBF792E9179BFDCBF), SPH_C64(0x071B3F381B070907), + SPH_C64(0xAD23AC0123ADEAAD), SPH_C64(0x5A2FB0EA2F5AEE5A), + SPH_C64(0x83B5EF6CB5839883), SPH_C64(0x33FFB685FF335533), + SPH_C64(0x63F25C3FF263A563), SPH_C64(0x020A12100A020602), + SPH_C64(0xAA38933938AAE3AA), SPH_C64(0x71A8DEAFA8719371), + SPH_C64(0xC8CFC60ECFC845C8), 
SPH_C64(0x197DD1C87D192B19), + SPH_C64(0x49703B727049DB49), SPH_C64(0xD99A5F869AD976D9), + SPH_C64(0xF21D31C31DF20BF2), SPH_C64(0xE348A84B48E338E3), + SPH_C64(0x5B2AB9E22A5BED5B), SPH_C64(0x8892BC3492888588), + SPH_C64(0x9AC83EA4C89AB39A), SPH_C64(0x26BE0B2DBE266A26), + SPH_C64(0x32FABF8DFA325632), SPH_C64(0xB04A59E94AB0CDB0), + SPH_C64(0xE96AF21B6AE926E9), SPH_C64(0x0F337778330F110F), + SPH_C64(0xD5A633E6A6D562D5), SPH_C64(0x80BAF474BA809D80), + SPH_C64(0xBE7C27997CBEDFBE), SPH_C64(0xCDDEEB26DECD4ACD), + SPH_C64(0x34E489BDE4345C34), SPH_C64(0x4875327A7548D848), + SPH_C64(0xFF2454AB24FF1CFF), SPH_C64(0x7A8F8DF78F7A8E7A), + SPH_C64(0x90EA64F4EA90AD90), SPH_C64(0x5F3E9DC23E5FE15F), + SPH_C64(0x20A03D1DA0206020), SPH_C64(0x68D50F67D568B868), + SPH_C64(0x1A72CAD0721A2E1A), SPH_C64(0xAE2CB7192CAEEFAE), + SPH_C64(0xB45E7DC95EB4C1B4), SPH_C64(0x5419CE9A1954FC54), + SPH_C64(0x93E57FECE593A893), SPH_C64(0x22AA2F0DAA226622), + SPH_C64(0x64E96307E964AC64), SPH_C64(0xF1122ADB12F10EF1), + SPH_C64(0x73A2CCBFA2739573), SPH_C64(0x125A82905A123612), + SPH_C64(0x405D7A3A5D40C040), SPH_C64(0x0828484028081808), + SPH_C64(0xC3E89556E8C358C3), SPH_C64(0xEC7BDF337BEC29EC), + SPH_C64(0xDB904D9690DB70DB), SPH_C64(0xA11FC0611FA1FEA1), + SPH_C64(0x8D83911C838D8A8D), SPH_C64(0x3DC9C8F5C93D473D), + SPH_C64(0x97F15BCCF197A497), SPH_C64(0x0000000000000000), + SPH_C64(0xCFD4F936D4CF4CCF), SPH_C64(0x2B876E45872B7D2B), + SPH_C64(0x76B3E197B3769A76), SPH_C64(0x82B0E664B0829B82), + SPH_C64(0xD6A928FEA9D667D6), SPH_C64(0x1B77C3D8771B2D1B), + SPH_C64(0xB55B74C15BB5C2B5), SPH_C64(0xAF29BE1129AFECAF), + SPH_C64(0x6ADF1D77DF6ABE6A), SPH_C64(0x500DEABA0D50F050), + SPH_C64(0x454C57124C45CF45), SPH_C64(0xF31838CB18F308F3), + SPH_C64(0x30F0AD9DF0305030), SPH_C64(0xEF74C42B74EF2CEF), + SPH_C64(0x3FC3DAE5C33F413F), SPH_C64(0x551CC7921C55FF55), + SPH_C64(0xA210DB7910A2FBA2), SPH_C64(0xEA65E90365EA23EA), + SPH_C64(0x65EC6A0FEC65AF65), SPH_C64(0xBA6803B968BAD3BA), + SPH_C64(0x2F934A65932F712F), SPH_C64(0xC0E78E4EE7C05DC0), + SPH_C64(0xDE8160BE81DE7FDE), SPH_C64(0x1C6CFCE06C1C241C), + SPH_C64(0xFD2E46BB2EFD1AFD), SPH_C64(0x4D641F52644DD74D), + SPH_C64(0x92E076E4E092AB92), SPH_C64(0x75BCFA8FBC759F75), + SPH_C64(0x061E36301E060A06), SPH_C64(0x8A98AE24988A838A), + SPH_C64(0xB2404BF940B2CBB2), SPH_C64(0xE659856359E637E6), + SPH_C64(0x0E367E70360E120E), SPH_C64(0x1F63E7F8631F211F), + SPH_C64(0x62F75537F762A662), SPH_C64(0xD4A33AEEA3D461D4), + SPH_C64(0xA832812932A8E5A8), SPH_C64(0x96F452C4F496A796), + SPH_C64(0xF93A629B3AF916F9), SPH_C64(0xC5F6A366F6C552C5), + SPH_C64(0x25B11035B1256F25), SPH_C64(0x5920ABF22059EB59), + SPH_C64(0x84AED054AE849184), SPH_C64(0x72A7C5B7A7729672), + SPH_C64(0x39DDECD5DD394B39), SPH_C64(0x4C61165A614CD44C), + SPH_C64(0x5E3B94CA3B5EE25E), SPH_C64(0x78859FE785788878), + SPH_C64(0x38D8E5DDD8384838), SPH_C64(0x8C869814868C898C), + SPH_C64(0xD1B217C6B2D16ED1), SPH_C64(0xA50BE4410BA5F2A5), + SPH_C64(0xE24DA1434DE23BE2), SPH_C64(0x61F84E2FF861A361), + SPH_C64(0xB34542F145B3C8B3), SPH_C64(0x21A53415A5216321), + SPH_C64(0x9CD60894D69CB99C), SPH_C64(0x1E66EEF0661E221E), + SPH_C64(0x435261225243C543), SPH_C64(0xC7FCB176FCC754C7), + SPH_C64(0xFC2B4FB32BFC19FC), SPH_C64(0x0414242014040C04), + SPH_C64(0x5108E3B20851F351), SPH_C64(0x99C725BCC799B699), + SPH_C64(0x6DC4224FC46DB76D), SPH_C64(0x0D396568390D170D), + SPH_C64(0xFA35798335FA13FA), SPH_C64(0xDF8469B684DF7CDF), + SPH_C64(0x7E9BA9D79B7E827E), SPH_C64(0x24B4193DB4246C24), + SPH_C64(0x3BD7FEC5D73B4D3B), SPH_C64(0xAB3D9A313DABE0AB), + SPH_C64(0xCED1F03ED1CE4FCE), 
SPH_C64(0x1155998855113311), + SPH_C64(0x8F89830C898F8C8F), SPH_C64(0x4E6B044A6B4ED24E), + SPH_C64(0xB75166D151B7C4B7), SPH_C64(0xEB60E00B60EB20EB), + SPH_C64(0x3CCCC1FDCC3C443C), SPH_C64(0x81BFFD7CBF819E81), + SPH_C64(0x94FE40D4FE94A194), SPH_C64(0xF70C1CEB0CF704F7), + SPH_C64(0xB96718A167B9D6B9), SPH_C64(0x135F8B985F133513), + SPH_C64(0x2C9C517D9C2C742C), SPH_C64(0xD3B805D6B8D368D3), + SPH_C64(0xE75C8C6B5CE734E7), SPH_C64(0x6ECB3957CB6EB26E), + SPH_C64(0xC4F3AA6EF3C451C4), SPH_C64(0x030F1B180F030503), + SPH_C64(0x5613DC8A1356FA56), SPH_C64(0x44495E1A4944CC44), + SPH_C64(0x7F9EA0DF9E7F817F), SPH_C64(0xA937882137A9E6A9), + SPH_C64(0x2A82674D822A7E2A), SPH_C64(0xBB6D0AB16DBBD0BB), + SPH_C64(0xC1E28746E2C15EC1), SPH_C64(0x5302F1A20253F553), + SPH_C64(0xDC8B72AE8BDC79DC), SPH_C64(0x0B275358270B1D0B), + SPH_C64(0x9DD3019CD39DBA9D), SPH_C64(0x6CC12B47C16CB46C), + SPH_C64(0x31F5A495F5315331), SPH_C64(0x74B9F387B9749C74), + SPH_C64(0xF60915E309F607F6), SPH_C64(0x46434C0A4346CA46), + SPH_C64(0xAC26A50926ACE9AC), SPH_C64(0x8997B53C97898689), + SPH_C64(0x1444B4A044143C14), SPH_C64(0xE142BA5B42E13EE1), + SPH_C64(0x164EA6B04E163A16), SPH_C64(0x3AD2F7CDD23A4E3A), + SPH_C64(0x69D0066FD069BB69), SPH_C64(0x092D41482D091B09), + SPH_C64(0x70ADD7A7AD709070), SPH_C64(0xB6546FD954B6C7B6), + SPH_C64(0xD0B71ECEB7D06DD0), SPH_C64(0xED7ED63B7EED2AED), + SPH_C64(0xCCDBE22EDBCC49CC), SPH_C64(0x4257682A5742C642), + SPH_C64(0x98C22CB4C298B598), SPH_C64(0xA40EED490EA4F1A4), + SPH_C64(0x2888755D88287828), SPH_C64(0x5C3186DA315CE45C), + SPH_C64(0xF83F6B933FF815F8), SPH_C64(0x86A4C244A4869786) +}; + +#endif + +static const sph_u64 old1_RC[10] = { + SPH_C64(0x4F01B887E8C62318), + SPH_C64(0x52916F79F5D2A636), + SPH_C64(0x357B0CA38E9BBC60), + SPH_C64(0x57FE4B2EC2D7E01D), + SPH_C64(0xDA4AF09FE5377715), + SPH_C64(0x856BA0B10A29C958), + SPH_C64(0x67053ECBF4105DBD), + SPH_C64(0xD8957DA78B4127E4), + SPH_C64(0x9E4717DD667CEEFB), + SPH_C64(0x33835AAD07BF2DCA) +}; + +/* ====================================================================== */ + +#define DECL8(z) sph_u64 z ## 0, z ## 1, z ## 2, z ## 3, \ + z ## 4, z ## 5, z ## 6, z ## 7 + +#if SPH_LITTLE_FAST +#define READ_DATA_W(x) do { \ + n ## x = sph_dec64le_aligned( \ + (const unsigned char *)src + 8 * (x)); \ + } while (0) +#define UPDATE_STATE_W(x) do { \ + state[x] ^= n ## x ^ sph_dec64le_aligned( \ + (const unsigned char *)src + 8 * (x)); \ + } while (0) +#define LVARS DECL8(n); DECL8(h); +#else +#define READ_DATA_W(x) do { \ + sn ## x = n ## x = sph_dec64le_aligned( \ + (const unsigned char *)src + 8 * (x)); \ + } while (0) +#define UPDATE_STATE_W(x) do { \ + state[x] ^= n ## x ^ sn ## x; \ + } while (0) +#define LVARS DECL8(n); DECL8(sn); DECL8(h); +#endif + +#define READ_STATE_W(x) do { h ## x = state[x]; } while (0) + +#define MUL8(FUN) do { \ + FUN(0); \ + FUN(1); \ + FUN(2); \ + FUN(3); \ + FUN(4); \ + FUN(5); \ + FUN(6); \ + FUN(7); \ + } while (0) + +/* + * First operation: XOR the input data with the first round key. 
+ */ +#define ROUND0_W(x) do { \ + n ## x ^= h ## x; \ + } while (0) + +#define READ_DATA MUL8(READ_DATA_W) +#define READ_STATE MUL8(READ_STATE_W) +#define ROUND0 MUL8(ROUND0_W) +#define UPDATE_STATE MUL8(UPDATE_STATE_W) + +#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF) + +#if SPH_SMALL_FOOTPRINT_WHIRLPOOL + +static SPH_INLINE sph_u64 +table_skew(sph_u64 val, int num) +{ + return SPH_ROTL64(val, 8 * num); +} + +#define ROUND_ELT(table, in, i0, i1, i2, i3, i4, i5, i6, i7) \ + (table ## 0[BYTE(in ## i0, 0)] \ + ^ table_skew(table ## 0[BYTE(in ## i1, 1)], 1) \ + ^ table_skew(table ## 0[BYTE(in ## i2, 2)], 2) \ + ^ table_skew(table ## 0[BYTE(in ## i3, 3)], 3) \ + ^ table_skew(table ## 0[BYTE(in ## i4, 4)], 4) \ + ^ table_skew(table ## 0[BYTE(in ## i5, 5)], 5) \ + ^ table_skew(table ## 0[BYTE(in ## i6, 6)], 6) \ + ^ table_skew(table ## 0[BYTE(in ## i7, 7)], 7)) +#else +#define ROUND_ELT(table, in, i0, i1, i2, i3, i4, i5, i6, i7) \ + (table ## 0[BYTE(in ## i0, 0)] \ + ^ table ## 1[BYTE(in ## i1, 1)] \ + ^ table ## 2[BYTE(in ## i2, 2)] \ + ^ table ## 3[BYTE(in ## i3, 3)] \ + ^ table ## 4[BYTE(in ## i4, 4)] \ + ^ table ## 5[BYTE(in ## i5, 5)] \ + ^ table ## 6[BYTE(in ## i6, 6)] \ + ^ table ## 7[BYTE(in ## i7, 7)]) +#endif + +#define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) do { \ + out ## 0 = ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1) ^ c0; \ + out ## 1 = ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2) ^ c1; \ + out ## 2 = ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3) ^ c2; \ + out ## 3 = ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4) ^ c3; \ + out ## 4 = ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5) ^ c4; \ + out ## 5 = ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6) ^ c5; \ + out ## 6 = ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7) ^ c6; \ + out ## 7 = ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0) ^ c7; \ + } while (0) + +#define ROUND_KSCHED(table, in, out, c) \ + ROUND(table, in, out, c, 0, 0, 0, 0, 0, 0, 0) + +#define ROUND_WENC(table, in, key, out) \ + ROUND(table, in, out, key ## 0, key ## 1, key ## 2, \ + key ## 3, key ## 4, key ## 5, key ## 6, key ## 7) + +#define TRANSFER(dst, src) do { \ + dst ## 0 = src ## 0; \ + dst ## 1 = src ## 1; \ + dst ## 2 = src ## 2; \ + dst ## 3 = src ## 3; \ + dst ## 4 = src ## 4; \ + dst ## 5 = src ## 5; \ + dst ## 6 = src ## 6; \ + dst ## 7 = src ## 7; \ + } while (0) + +/* see sph_whirlpool.h */ +void +sph_whirlpool_init(void *cc) +{ + sph_whirlpool_context *sc; + + sc = cc; + /* + * We want to set all eight 64-bit words to 0. A "memset()" + * is not, theoretically, fully standard, but in practice it + * will work everywhere. + */ + memset(sc->state, 0, sizeof sc->state); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +#define ROUND_FUN(name, type) \ +static void \ +name ## _round(const void *src, sph_u64 *state) \ +{ \ + LVARS \ + int r; \ + \ + READ_DATA; \ + READ_STATE; \ + ROUND0; \ + for (r = 0; r < 10; r ++) { \ + DECL8(tmp); \ + \ + ROUND_KSCHED(type ## _T, h, tmp, type ## _RC[r]); \ + TRANSFER(h, tmp); \ + ROUND_WENC(type ## _T, n, h, tmp); \ + TRANSFER(n, tmp); \ + } \ + UPDATE_STATE; \ +} + +ROUND_FUN(whirlpool, plain) +ROUND_FUN(whirlpool0, old0) +ROUND_FUN(whirlpool1, old1) + +/* + * We want big-endian encoding of the message length, over 256 bits. BE64 + * triggers that. However, our block length is 512 bits, not 1024 bits. + * Internally, our encoding/decoding is little-endian, which is not a + * problem here since we also deactivate output in md_helper.c. 
+ */
+#define BE64 1
+#define SVAL sc->state
+#define BLEN 64U
+#define PLW4 1
+
+#define RFUN whirlpool_round
+#define HASH whirlpool
+#include "md_helper.c"
+#undef RFUN
+#undef HASH
+
+#define RFUN whirlpool0_round
+#define HASH whirlpool0
+#include "md_helper.c"
+#undef RFUN
+#undef HASH
+
+#define RFUN whirlpool1_round
+#define HASH whirlpool1
+#include "md_helper.c"
+#undef RFUN
+#undef HASH
+
+#define MAKE_CLOSE(name) \
+void \
+sph_ ## name ## _close(void *cc, void *dst) \
+{ \
+	sph_ ## name ## _context *sc; \
+	int i; \
+ \
+	name ## _close(cc, dst, 0); \
+	sc = cc; \
+	for (i = 0; i < 8; i ++) \
+		sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \
+	sph_ ## name ## _init(cc); \
+}
+
+MAKE_CLOSE(whirlpool)
+MAKE_CLOSE(whirlpool0)
+MAKE_CLOSE(whirlpool1)
+
+#ifdef __cplusplus
+}
+#endif
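Note on the three md_helper.c inclusions above: each RFUN/HASH pair instantiates the generic Merkle-Damgård helper as a streaming update function (sph_whirlpool, sph_whirlpool0, sph_whirlpool1), one per round-table set selected by ROUND_FUN (plain, old0, old1), and MAKE_CLOSE then emits the matching sph_*_close() finalizer, which processes the padded final block and stores the eight state words little-endian. The sketch below shows how the resulting API is driven from CPU code; the whirlpool_hash() wrapper name and the include path are illustrative assumptions, not part of this patch.

	#include <stddef.h>
	#include "sph/sph_whirlpool.h"   /* assumed location of the sph declarations */

	/* One-shot helper: hash `len` bytes of `in` into a 64-byte (512-bit) digest. */
	static void whirlpool_hash(void *out64, const void *in, size_t len)
	{
		sph_whirlpool_context ctx;

		sph_whirlpool_init(&ctx);         /* clear the 8x64-bit state, reset the counter */
		sph_whirlpool(&ctx, in, len);     /* buffer input, compress each 64-byte block */
		sph_whirlpool_close(&ctx, out64); /* finalize and write the digest (LE words) */
	}

The legacy sph_whirlpool0_* and sph_whirlpool1_* variants follow the same calling pattern and differ only in the substitution tables used by their round functions.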
net_diff : stratum_diff; + if (opt_n_threads == 1 && global_hashrate && uid > 10) { + // prevent stats on too high vardiff (erroneous rates) + double ratio = (hashrate / (1.0 * global_hashrate)); + if (ratio < 0.4 || ratio > 1.6) + data.ignored = 1; + } + tlastscans[key] = data; +} + +/** + * Get the computed average speed + * @param thr_id int (-1 for all threads) + */ +double stats_get_speed(int thr_id, double def_speed) +{ + double speed = 0.0; + int records = 0; + + std::map::reverse_iterator i = tlastscans.rbegin(); + while (i != tlastscans.rend() && records < opt_statsavg) { + if (!i->second.ignored) + if (thr_id == -1 || i->second.thr_id == thr_id) { + if (i->second.hashcount > 1000) { + speed += i->second.hashrate; + records++; + // applog(LOG_BLUE, "%d %x %.1f", thr_id, i->second.thr_id, i->second.hashrate); + } + } + ++i; + } + + if (records) + speed /= (double)(records); + else + speed = def_speed; + + if (thr_id == -1) + speed *= (double)(opt_n_threads); + + return speed; +} + +/** + * Get the gpu average speed + * @param gpu_id int (-1 for all threads) + */ +double stats_get_gpu_speed(int gpu_id) +{ + double speed = 0.0; + + for (int thr_id=0; thr_id::reverse_iterator i = tlastscans.rbegin(); + while (i != tlastscans.rend() && records < max_records) { + if (!i->second.ignored) + if (thr_id == -1 || i->second.thr_id == thr_id) { + memcpy(&data[records], &(i->second), sizeof(struct stats_data)); + records++; + } + ++i; + } + return records; +} + +/** + * Remove old entries to reduce memory usage + */ +void stats_purge_old(void) +{ + int deleted = 0; + uint32_t now = (uint32_t) time(NULL); + uint32_t sz = (uint32_t) tlastscans.size(); + std::map::iterator i = tlastscans.begin(); + while (i != tlastscans.end()) { + if (i->second.ignored || (now - i->second.tm_stat) > STATS_PURGE_TIMEOUT) { + deleted++; + tlastscans.erase(i++); + } + else ++i; + } + if (opt_debug && deleted) { + applog(LOG_DEBUG, "stats: %d/%d records purged", deleted, sz); + } +} + +/** + * Reset the cache + */ +void stats_purge_all(void) +{ + tlastscans.clear(); +} + +/** + * API meminfo + */ +void stats_getmeminfo(uint64_t *mem, uint32_t *records) +{ + (*records) = (uint32_t) tlastscans.size(); + (*mem) = (*records) * sizeof(stats_data); +} diff --git a/sysinfos.cpp b/sysinfos.cpp new file mode 100644 index 0000000000..e5d93960bd --- /dev/null +++ b/sysinfos.cpp @@ -0,0 +1,95 @@ +/** + * Unit to read cpu informations + * + * TODO: WMI implementation for windows + * + * tpruvot 2014 + */ + +#include +#include +#include +#include + +#include "miner.h" + +#ifndef WIN32 + +#define HWMON_PATH \ + "/sys/class/hwmon/hwmon1/device/temp1_input" +#define HWMON_ALT \ + "/sys/class/hwmon/hwmon0/temp1_input" + +static double linux_cputemp(int core) +{ + double tc = 0.0; + FILE *fd = fopen(HWMON_PATH, "r"); + uint32_t val = 0; + + if (!fd) + fd = fopen(HWMON_ALT, "r"); + + if (!fd) + return tc; + + if (fscanf(fd, "%d", &val)) + tc = (double)val / 1000.0; + + fclose(fd); + return tc; +} + +#define CPUFREQ_PATH \ + "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq" +static uint32_t linux_cpufreq(int core) +{ + FILE *fd = fopen(CPUFREQ_PATH, "r"); + uint32_t freq = 0; + + if (!fd) + return freq; + + if (!fscanf(fd, "%d", &freq)) + ; + + fclose(fd); + return freq; +} + +#else /* WIN32 */ + +static float win32_cputemp(int core) +{ + // todo + return 0.0; +} + +#endif /* !WIN32 */ + + +/* exports */ + + +float cpu_temp(int core) +{ +#ifdef WIN32 + return win32_cputemp(core); +#else + return (float) linux_cputemp(core); +#endif 
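/*
 * Annotation (not part of the patch): the HWMON_PATH / HWMON_ALT paths
 * used by linux_cputemp() above are board- and driver-specific; the
 * value read from tempN_input is expressed in millidegrees Celsius.
 * A minimal sketch of a more generic probe follows; the hwmonN range
 * and the temp1_input name are assumptions, not part of ccminer.
 */
#if 0	/* illustrative sketch only, not compiled */
#include <stdio.h>

static double linux_cputemp_scan(void)
{
	char path[64];
	unsigned int val = 0;

	for (int i = 0; i < 8; i++) {
		snprintf(path, sizeof(path),
			"/sys/class/hwmon/hwmon%d/temp1_input", i);
		FILE *fd = fopen(path, "r");
		if (!fd)
			continue;
		if (fscanf(fd, "%u", &val) == 1) {
			fclose(fd);
			return (double) val / 1000.0; /* millidegrees -> degrees C */
		}
		fclose(fd);
	}
	return 0.0;
}
#endif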
+} + +uint32_t cpu_clock(int core) +{ +#ifdef WIN32 + return 0; +#else + return linux_cpufreq(core); +#endif +} + +int cpu_fanpercent() +{ + return 0; +} + diff --git a/tribus/cuda_echo512_aes.cuh b/tribus/cuda_echo512_aes.cuh new file mode 100644 index 0000000000..ff205aaeb7 --- /dev/null +++ b/tribus/cuda_echo512_aes.cuh @@ -0,0 +1,318 @@ +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#define __CUDA_ARCH__ 520 +#include +#endif + +#undef ROL8 +#undef ROR8 +#undef ROL16 + +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ +uint32_t ROL8(const uint32_t a) { + return __byte_perm(a, 0, 0x2103); +} +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a) { + return __byte_perm(a, 0, 0x0321); +} +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a) { + return __byte_perm(a, 0, 0x1032); +} +#else +#define ROL8(u) ROTL32(u, 8) +#define ROR8(u) ROTR32(u, 8) +#define ROL16(u) ROTL32(u,16) +#endif + +__device__ uint32_t d_AES0[256] = { + 0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, + 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC, + 0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, + 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B, + 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, + 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, + 0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F, + 0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA, + 0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B, + 0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413, + 0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6, + 0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85, + 0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511, + 0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B, + 0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1, + 0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF, + 0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E, + 0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6, + 0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B, + 0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD, + 0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8, + 0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2, + 0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949, + 0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810, + 0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697, + 0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F, + 0x907070E0, 0x423E3E7C, 0xC4B5B571, 
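/*
 * Annotation (not part of the patch): the four shared-memory tables
 * built by the aes_gpu_init*() helpers further below are byte rotations
 * of this d_AES0 table: sharedMemory[k][x] == ROTL32(d_AES0[x], 8*k).
 * d_AES3 (defined after this table) is the k == 3 rotation kept in
 * global memory so aes_round() can mix __ldg() reads with shared loads.
 */
#if 0	/* illustrative host-side check of that relation (sketch only) */
#include <assert.h>
#include <stdint.h>

static void check_aes_tables(const uint32_t *t0, const uint32_t *t3)
{
	for (int x = 0; x < 256; x++) {
		uint32_t v = t0[x];
		assert(t3[x] == ((v << 24) | (v >> 8))); /* ROTL32(v, 24) */
	}
}
#endif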
0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C, + 0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27, + 0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433, + 0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5, + 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, + 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C +}; + +__device__ uint32_t d_AES3[256] = { + 0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5, + 0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676, + 0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0, + 0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0, + 0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC, + 0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515, + 0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A, + 0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575, + 0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0, + 0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484, + 0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B, + 0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF, + 0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585, + 0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8, + 0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5, + 0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2, + 0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717, + 0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373, + 0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888, + 0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB, + 0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C, + 0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979, + 0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9, + 0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808, + 0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6, + 0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A, + 0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E, + 0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E, + 0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494, + 0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF, + 0x038F8C8C, 
0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868, + 0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616 +}; + +__device__ __forceinline__ +void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + sharedMemory[0][threadIdx.x] = temp; + sharedMemory[1][threadIdx.x] = ROL8(temp); + sharedMemory[2][threadIdx.x] = ROL16(temp); + sharedMemory[3][threadIdx.x] = ROR8(temp); + } +} + +__device__ __forceinline__ +void aes_gpu_init256(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill a uint32 */ + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + sharedMemory[0][threadIdx.x] = temp; + sharedMemory[1][threadIdx.x] = ROL8(temp); + sharedMemory[2][threadIdx.x] = ROL16(temp); + sharedMemory[3][threadIdx.x] = ROR8(temp); +} + +__device__ __forceinline__ +void aes_gpu_init128(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill 2 uint32 */ + uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); + + sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; + sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; + sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); + sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); + sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); + sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y); + sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); + sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); +} + +__device__ __forceinline__ +void aes_gpu_init_lt_256(uint32_t sharedMemory[4][256]) +{ + if (threadIdx.x < 128) { + /* each thread startup will fill 2 uint32 */ + uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); + + sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; + sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; + sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); + sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); + sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); + sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y); + sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); + sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); + } +} + +__device__ __forceinline__ +static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; +#ifdef INTENSIVE_GMF + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); +#else + y2 ^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)]; +#endif + + y0 ^= k0; + + y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void aes_round_LDG(const uint32_t 
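/*
 * Annotation (not part of the patch): the 0x444n selectors passed to
 * __byte_perm() in these rounds simply extract byte n of the first
 * operand, i.e. __byte_perm(x, 0, 0x4440 + n) == (x >> (8 * n)) & 0xFF.
 * Likewise the ROL8/ROR8/ROL16 helpers at the top of this header are
 * 32-bit byte rotations expressed as single byte-permute instructions.
 */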
sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + + y0 ^= k0; + + y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]; + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + +#ifdef INTENSIVE_GMF + y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); +#else + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; +#endif + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + + y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)]; + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]; + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + + y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + + y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)]; + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0) +{ + uint32_t y0, y1, y2, y3; + + aes_round(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3); + + 
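/*
 * Annotation (not part of the patch): AES_2ROUND implements ECHO's
 * BIG.SubWords step on one 128-bit word: a first AES round keyed with
 * the running counter k0 (the call above), followed by a second,
 * keyless AES round (the call below); k0 is then incremented for the
 * next word. The German comment below, "hier werden wir ein carry
 * brauchen (oder auch nicht)", translates to "we will need a carry
 * here (or maybe not)".
 */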
aes_round(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3); + + // hier werden wir ein carry brauchen (oder auch nicht) + k0++; +} + +__device__ __forceinline__ +static void AES_2ROUND_LDG(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0) +{ + uint32_t y0, y1, y2, y3; + + aes_round_LDG(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3); + + aes_round_LDG(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3); + + // hier werden wir ein carry brauchen (oder auch nicht) + k0++; +} + +__device__ __forceinline__ +static void AES_ROUND_NOKEY(const uint32_t sharedMemory[4][256], uint4* x) +{ + uint32_t y0, y1, y2, y3; + aes_round(sharedMemory, x->x, x->y, x->z, x->w, y0, y1, y2, y3); + + x->x = y0; + x->y = y1; + x->z = y2; + x->w = y3; +} + +__device__ __forceinline__ +static void KEY_EXPAND_ELT(const uint32_t sharedMemory[4][256], uint32_t *k) +{ + uint32_t y0, y1, y2, y3; + aes_round(sharedMemory, k[0], k[1], k[2], k[3], y0, y1, y2, y3); + + k[0] = y1; + k[1] = y2; + k[2] = y3; + k[3] = y0; +} diff --git a/tribus/cuda_echo512_final.cu b/tribus/cuda_echo512_final.cu new file mode 100644 index 0000000000..b68a9c6ea5 --- /dev/null +++ b/tribus/cuda_echo512_final.cu @@ -0,0 +1,285 @@ +/** + * Based on Provos Alexis work - 2016 FOR SM 5+ + * + * final touch by tpruvot for tribus - 09 2017 + */ +#include +#include +#include + +#define INTENSIVE_GMF +#include "tribus/cuda_echo512_aes.cuh" + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#define atomicExch(p,y) (*p) = y +#endif + +__device__ +static void echo_round(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0) +{ + // Big Sub Words + #pragma unroll 16 + for (int idx = 0; idx < 16; idx++) + AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0); + + // Shift Rows + #pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t t[4]; + /// 1, 5, 9, 13 + t[0] = W[i + 4]; + t[1] = W[i + 8]; + t[2] = W[i + 24]; + t[3] = W[i + 60]; + + W[i + 4] = W[i + 20]; + W[i + 8] = W[i + 40]; + W[i + 24] = W[i + 56]; + W[i + 60] = W[i + 44]; + + W[i + 20] = W[i + 36]; + W[i + 40] = t[1]; + W[i + 56] = t[2]; + W[i + 44] = W[i + 28]; + + W[i + 28] = W[i + 12]; + W[i + 12] = t[3]; + W[i + 36] = W[i + 52]; + W[i + 52] = t[0]; + } + // Mix Columns + #pragma unroll 4 + for (int i = 0; i < 4; i++) + { + #pragma unroll 4 + for (int idx = 0; idx < 64; idx += 16) + { + uint32_t a[4]; + a[0] = W[idx + i]; + a[1] = W[idx + i + 4]; + a[2] = W[idx + i + 8]; + a[3] = W[idx + i +12]; + + uint32_t ab = a[0] ^ a[1]; + uint32_t bc = a[1] ^ a[2]; + uint32_t cd = a[2] ^ a[3]; + + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[idx + i] = bc ^ a[3] ^ abx; + W[idx + i + 4] = a[0] ^ cd ^ bcx; + W[idx + i + 8] = ab ^ a[3] ^ cdx; + W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx); + } + } +} + +__global__ __launch_bounds__(256, 3) /* will force 80 registers */ +static void tribus_echo512_gpu_final(uint32_t threads, uint64_t *g_hash, uint32_t* resNonce, const uint64_t target) +{ + __shared__ uint32_t sharedMemory[4][256]; + + aes_gpu_init256(sharedMemory); + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //8-12 + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 
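/*
 * Annotation (not part of the patch): the MixColumns code in
 * echo_round() above (and in the unrolled first round below) doubles
 * four packed bytes at once in GF(2^8):
 *     t = x & 0x80808080;  2*x = ((t >> 7) * 27U) ^ ((x ^ t) << 1);
 * Clearing the high bit before the shift stops carries crossing byte
 * lanes, and 27 (0x1B) is the tail of the AES reduction polynomial.
 * A scalar reference for the same operation is sketched below.
 */
#if 0	/* illustrative sketch only, not compiled */
static unsigned char xtime(unsigned char b)
{
	return (unsigned char)((b << 1) ^ ((b & 0x80) ? 0x1B : 0x00));
}
#endif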
0xb3dbe7af, + //21-25 + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751,0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //34-38 + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7,0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + //58-61 + }; + uint32_t k0; + uint32_t h[16]; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t *hash = (uint32_t*)&g_hash[thread<<3]; + + *(uint2x4*)&h[0] = __ldg4((uint2x4*)&hash[0]); + *(uint2x4*)&h[8] = __ldg4((uint2x4*)&hash[8]); + + uint64_t backup = *(uint64_t*)&h[6]; + + k0 = 512 + 8; + + #pragma unroll 4 + for (uint32_t idx = 0; idx < 16; idx += 4) + AES_2ROUND(sharedMemory,h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); + + k0 += 4; + + uint32_t W[64]; + + #pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) + { + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i + 8]; + uint32_t d = P[i + 8]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + uint32_t t = ((a ^ b) & 0x80808080); + uint32_t t2 = ((b ^ c) & 0x80808080); + uint32_t t3 = ((c ^ d) & 0x80808080); + + uint32_t abx = ((t >> 7) * 27U) ^ ((ab^t) << 1); + uint32_t bcx = ((t2 >> 7) * 27U) ^ ((bc^t2) << 1); + uint32_t cdx = ((t3 >> 7) * 27U) ^ ((cd^t3) << 1); + + W[0 + i] = bc ^ d ^ abx; + W[4 + i] = a ^ cd ^ bcx; + W[8 + i] = ab ^ d ^ cdx; + W[12+ i] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[12 + i]; + b = h[i + 4]; + c = P[12 + i + 4]; + d = P[12 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[16 + i] = abx ^ bc ^ d; + W[16 + i + 4] = bcx ^ a ^ cd; + W[16 + i + 8] = cdx ^ ab ^ d; + W[16 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = h[i]; + b = P[24 + i]; + c = P[24 + i + 4]; + d = P[24 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[32 + i] = abx ^ bc ^ d; + W[32 + i + 4] = bcx ^ a ^ cd; + W[32 + i + 8] = cdx ^ ab ^ d; + W[32 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[36 + i ]; + b = P[36 + i + 4]; + c = P[36 + i + 8]; + d = h[i + 12]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[48 + i] = abx ^ bc ^ d; + W[48 + i + 4] = bcx ^ a ^ cd; + W[48 + i + 8] = cdx ^ ab ^ d; + W[48 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + } + + for (int k = 1; k < 9; k++) + echo_round(sharedMemory,W,k0); + + // Big Sub Words + uint32_t y0, y1, y2, y3; +// AES_2ROUND(sharedMemory,W[ 0], W[ 1], W[ 2], W[ 3], k0); + aes_round(sharedMemory, W[ 0], W[ 1], W[ 2], W[ 3], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[ 0], W[ 1], W[ 2], W[ 3]); + + aes_round(sharedMemory, W[ 4], W[ 5], W[ 6], W[ 7], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[ 4], W[ 5], W[ 6], W[ 7]); + aes_round(sharedMemory, W[ 8], W[ 9], W[10], W[11], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, 
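/*
 * Annotation (not part of the patch): the eight echo_round() calls
 * above, together with the unrolled first round and this final partial
 * round, make up ECHO-512's ten rounds. Only the state words that feed
 * the 64-bit "check" value further below are carried through the last
 * ShiftRows/MixColumns, so the kernel never materialises the full
 * 512-bit digest; candidate nonces that pass the check <= target test
 * are re-hashed and fully validated on the CPU in scanhash_tribus().
 */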
y1, y2, y3, W[ 8], W[ 9], W[10], W[11]); + + aes_round(sharedMemory, W[20], W[21], W[22], W[23], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[20], W[21], W[22], W[23]); + aes_round(sharedMemory, W[28], W[29], W[30], W[31], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[28], W[29], W[30], W[31]); + + aes_round(sharedMemory, W[32], W[33], W[34], W[35], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[32], W[33], W[34], W[35]); + aes_round(sharedMemory, W[40], W[41], W[42], W[43], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[40], W[41], W[42], W[43]); + + aes_round(sharedMemory, W[52], W[53], W[54], W[55], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[52], W[53], W[54], W[55]); + aes_round(sharedMemory, W[60], W[61], W[62], W[63], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[60], W[61], W[62], W[63]); + + uint32_t bc = W[22] ^ W[42]; + uint32_t t2 = (bc & 0x80808080); + W[ 6] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + bc = W[23] ^ W[43]; + t2 = (bc & 0x80808080); + W[ 7] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + bc = W[10] ^ W[54]; + t2 = (bc & 0x80808080); + W[38] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + bc = W[11] ^ W[55]; + t2 = (bc & 0x80808080); + W[39] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + uint64_t check = backup ^ *(uint64_t*)&W[2] ^ *(uint64_t*)&W[6] ^ *(uint64_t*)&W[10] ^ *(uint64_t*)&W[30] + ^ *(uint64_t*)&W[34] ^ *(uint64_t*)&W[38] ^ *(uint64_t*)&W[42] ^ *(uint64_t*)&W[62]; + + if(check <= target){ + uint32_t tmp = atomicExch(&resNonce[0], thread); + if (tmp != UINT32_MAX) + resNonce[1] = tmp; + } + } +} + +__host__ +void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + tribus_echo512_gpu_final <<>> (threads, (uint64_t*)d_hash, d_resNonce, target); +} diff --git a/tribus/tribus.cu b/tribus/tribus.cu new file mode 100644 index 0000000000..4516e7d69c --- /dev/null +++ b/tribus/tribus.cu @@ -0,0 +1,184 @@ +/** + * Tribus Algo for Denarius + * + * tpruvot@github 09 2017 - GPLv3 + * + */ +extern "C" { +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +void jh512_setBlock_80(int thr_id, uint32_t *endiandata); +void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); +void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; + +// cpu hash + +extern "C" void tribus_hash(void *state, const void *input) +{ + uint8_t _ALIGN(64) hash[64]; + + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_echo512_context ctx_echo; + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, input, 80); + sph_jh512_close(&ctx_jh, (void*) hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + memcpy(state, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_tribus(int thr_id, struct work 
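/*
 * Annotation (not part of the patch): tribus is the chain
 * jh512 -> keccak512 -> echo512, truncated to 256 bits (see
 * tribus_hash() above). The default launch size below is
 * 1 << intensity nonces per kernel invocation (2^20 on Windows,
 * 2^23 elsewhere, i.e. roughly 1M or 8.4M hashes per scan loop),
 * clamped to the remaining nonce range once the thread is initialised;
 * the 0x00FF benchmark target simply makes shares trivially easy so
 * the hashrate can be measured.
 */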
*work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + int8_t intensity = is_windows() ? 20 : 23; + uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x00FF; + + if (!init[thr_id]) + { + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_jh512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + // char[64] work space for hashes results + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + jh512_setBlock_80(thr_id, endiandata); + if (use_compat_kernels[thr_id]) + cuda_check_cpu_setTarget(ptarget); + else + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + + work->valid_nonces = 0; + + do { + int order = 1; + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + if (use_compat_kernels[thr_id]) { + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + work->nonces[1] = UINT32_MAX; + } else { + tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6])); + cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + const uint32_t startNounce = pdata[19]; + if (!use_compat_kernels[thr_id]) work->nonces[0] += startNounce; + be32enc(&endiandata[19], work->nonces[0]); + tribus_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != UINT32_MAX) { + work->nonces[1] += startNounce; + be32enc(&endiandata[19], work->nonces[1]); + tribus_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + goto out; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while 
(!work_restart[thr_id].restart); + +out: +// *hashes_done = pdata[19] - first_nonce; + return work->valid_nonces; +} + +// ressources cleanup +extern "C" void free_tribus(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/util.c b/util.c deleted file mode 100644 index 451aaed484..0000000000 --- a/util.c +++ /dev/null @@ -1,1320 +0,0 @@ -/* - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. - */ - -#define _GNU_SOURCE -#include "cpuminer-config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(WIN32) -#include -#include -#else -#include -#include -#include -#include -#endif -#include "compat.h" -#include "miner.h" -#include "elist.h" - -struct data_buffer { - void *buf; - size_t len; -}; - -struct upload_buffer { - const void *buf; - size_t len; - size_t pos; -}; - -struct header_info { - char *lp_path; - char *reason; - char *stratum_url; -}; - -struct tq_ent { - void *data; - struct list_head q_node; -}; - -struct thread_q { - struct list_head q; - - bool frozen; - - pthread_mutex_t mutex; - pthread_cond_t cond; -}; - -void applog(int prio, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - -#ifdef HAVE_SYSLOG_H - if (use_syslog) { - va_list ap2; - char *buf; - int len; - - va_copy(ap2, ap); - len = vsnprintf(NULL, 0, fmt, ap2) + 1; - va_end(ap2); - buf = alloca(len); - if (vsnprintf(buf, len, fmt, ap) >= 0) - syslog(prio, "%s", buf); - } -#else - if (0) {} -#endif - else { - char *f; - int len; - time_t now; - struct tm tm, *tm_p; - - time(&now); - - pthread_mutex_lock(&applog_lock); - tm_p = localtime(&now); - memcpy(&tm, tm_p, sizeof(tm)); - pthread_mutex_unlock(&applog_lock); - - len = (int)(40 + strlen(fmt) + 2); - f = (char*)alloca(len); - sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n", - tm.tm_year + 1900, - tm.tm_mon + 1, - tm.tm_mday, - tm.tm_hour, - tm.tm_min, - tm.tm_sec, - fmt); - pthread_mutex_lock(&applog_lock); - vfprintf(stderr, f, ap); /* atomic write to stderr */ - fflush(stderr); - pthread_mutex_unlock(&applog_lock); - } - va_end(ap); -} - -static void databuf_free(struct data_buffer *db) -{ - if (!db) - return; - - free(db->buf); - - memset(db, 0, sizeof(*db)); -} - -static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, - void *user_data) -{ - struct data_buffer *db = (struct data_buffer *)user_data; - size_t len = size * nmemb; - size_t oldlen, newlen; - void *newmem; - static const unsigned char zero = 0; - - oldlen = db->len; - newlen = oldlen + len; - - newmem = realloc(db->buf, newlen + 1); - if (!newmem) - return 0; - - db->buf = newmem; - db->len = newlen; - memcpy((char*)db->buf + oldlen, ptr, len); - memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */ - - return len; -} - -static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, - void *user_data) -{ - struct upload_buffer *ub = (struct upload_buffer *)user_data; - unsigned int len = (unsigned int)(size * nmemb); - - if (len > ub->len - ub->pos) - len = (unsigned int)(ub->len - ub->pos); - - if (len) { 
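/*
 * Annotation (not part of the patch): this removed helper implements
 * libcurl's CURLOPT_READFUNCTION contract for the JSON-RPC POST body:
 * copy at most size*nmemb bytes into ptr and return how many were
 * copied. The seek_data_cb() below exists so libcurl can rewind the
 * same buffer when a request has to be resent (for example after a
 * redirect or an authentication retry).
 */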
- memcpy(ptr, (char*)ub->buf + ub->pos, len); - ub->pos += len; - } - - return len; -} - -#if LIBCURL_VERSION_NUM >= 0x071200 -static int seek_data_cb(void *user_data, curl_off_t offset, int origin) -{ - struct upload_buffer *ub = (struct upload_buffer *)user_data; - - switch (origin) { - case SEEK_SET: - ub->pos = (size_t)offset; - break; - case SEEK_CUR: - ub->pos += (size_t)offset; - break; - case SEEK_END: - ub->pos = ub->len + (size_t)offset; - break; - default: - return 1; /* CURL_SEEKFUNC_FAIL */ - } - - return 0; /* CURL_SEEKFUNC_OK */ -} -#endif - -static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) -{ - struct header_info *hi = (struct header_info *)user_data; - size_t remlen, slen, ptrlen = size * nmemb; - char *rem, *val = NULL, *key = NULL; - void *tmp; - - val = (char*)calloc(1, ptrlen); - key = (char*)calloc(1, ptrlen); - if (!key || !val) - goto out; - - tmp = memchr(ptr, ':', ptrlen); - if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ - goto out; - slen = (size_t)((char*)tmp - (char*)ptr); - if ((slen + 1) == ptrlen) /* skip key w/ no value */ - goto out; - memcpy(key, ptr, slen); /* store & nul term key */ - key[slen] = 0; - - rem = (char*)ptr + slen + 1; /* trim value's leading whitespace */ - remlen = ptrlen - slen - 1; - while ((remlen > 0) && (isspace(*rem))) { - remlen--; - rem++; - } - - memcpy(val, rem, remlen); /* store value, trim trailing ws */ - val[remlen] = 0; - while ((*val) && (isspace(val[strlen(val) - 1]))) { - val[strlen(val) - 1] = 0; - } - if (!*val) /* skip blank value */ - goto out; - - if (!strcasecmp("X-Long-Polling", key)) { - hi->lp_path = val; /* steal memory reference */ - val = NULL; - } - - if (!strcasecmp("X-Reject-Reason", key)) { - hi->reason = val; /* steal memory reference */ - val = NULL; - } - - if (!strcasecmp("X-Stratum", key)) { - hi->stratum_url = val; /* steal memory reference */ - val = NULL; - } - -out: - free(key); - free(val); - return ptrlen; -} - -#if LIBCURL_VERSION_NUM >= 0x070f06 -static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, - curlsocktype purpose) -{ - int keepalive = 1; - int tcp_keepcnt = 3; - int tcp_keepidle = 50; - int tcp_keepintvl = 50; -#ifdef WIN32 - DWORD outputBytes; -#endif - -#ifndef WIN32 - if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, - sizeof(keepalive)))) - return 1; -#ifdef __linux - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, - &tcp_keepcnt, sizeof(tcp_keepcnt)))) - return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, - &tcp_keepidle, sizeof(tcp_keepidle)))) - return 1; - if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, - &tcp_keepintvl, sizeof(tcp_keepintvl)))) - return 1; -#endif /* __linux */ -#ifdef __APPLE_CC__ - if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, - &tcp_keepintvl, sizeof(tcp_keepintvl)))) - return 1; -#endif /* __APPLE_CC__ */ -#else /* WIN32 */ - struct tcp_keepalive vals; - vals.onoff = 1; - vals.keepalivetime = tcp_keepidle * 1000; - vals.keepaliveinterval = tcp_keepintvl * 1000; - if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), - NULL, 0, &outputBytes, NULL, NULL))) - return 1; -#endif /* WIN32 */ - - return 0; -} -#endif - -json_t *json_rpc_call(CURL *curl, const char *url, - const char *userpass, const char *rpc_req, - bool longpoll_scan, bool longpoll, int *curl_err) -{ - json_t *val, *err_val, *res_val; - int rc; - struct data_buffer all_data = {0}; - struct upload_buffer upload_data; - json_error_t err; - struct curl_slist *headers = NULL; - char 
len_hdr[64]; - char curl_err_str[CURL_ERROR_SIZE]; - long timeout = longpoll ? opt_timeout : 30; - struct header_info hi = {0}; - bool lp_scanning = longpoll_scan && !have_longpoll; - - /* it is assumed that 'curl' is freshly [re]initialized at this pt */ - - if (opt_protocol) - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); - curl_easy_setopt(curl, CURLOPT_URL, url); - if (opt_cert) - curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); - curl_easy_setopt(curl, CURLOPT_ENCODING, ""); - curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); - curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb); - curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data); -#if LIBCURL_VERSION_NUM >= 0x071200 - curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb); - curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data); -#endif - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); - if (opt_proxy) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } - if (userpass) { - curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); - curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); - } -#if LIBCURL_VERSION_NUM >= 0x070f06 - if (longpoll) - curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); -#endif - curl_easy_setopt(curl, CURLOPT_POST, 1); - - if (opt_protocol) - applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req); - - upload_data.buf = rpc_req; - upload_data.len = strlen(rpc_req); - upload_data.pos = 0; - sprintf(len_hdr, "Content-Length: %lu", - (unsigned long) upload_data.len); - - headers = curl_slist_append(headers, "Content-Type: application/json"); - headers = curl_slist_append(headers, len_hdr); - headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); - headers = curl_slist_append(headers, "X-Mining-Extensions: midstate"); - headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/ - headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ - - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); - - rc = curl_easy_perform(curl); - if (curl_err != NULL) - *curl_err = rc; - if (rc) { - if (!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) - applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); - goto err_out; - } - - /* If X-Stratum was found, activate Stratum */ - if (want_stratum && hi.stratum_url && - !strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && - !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) { - have_stratum = true; - tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); - hi.stratum_url = NULL; - } - - /* If X-Long-Polling was found, activate long polling */ - if (lp_scanning && hi.lp_path && !have_stratum) { - have_longpoll = true; - tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); - hi.lp_path = NULL; - } - - if (!all_data.buf) { - applog(LOG_ERR, "Empty data received in json_rpc_call."); - goto err_out; - } - - val = JSON_LOADS((const char*)all_data.buf, &err); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto err_out; - } - - if (opt_protocol) { - char *s = json_dumps(val, 
JSON_INDENT(3)); - applog(LOG_DEBUG, "JSON protocol response:\n%s", s); - free(s); - } - - /* JSON-RPC valid response returns a non-null 'result', - * and a null 'error'. */ - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - char *s; - - if (err_val) - s = json_dumps(err_val, JSON_INDENT(3)); - else - s = strdup("(unknown reason)"); - - applog(LOG_ERR, "JSON-RPC call failed: %s", s); - - free(s); - - goto err_out; - } - - if (hi.reason) - json_object_set_new(val, "reject-reason", json_string(hi.reason)); - - databuf_free(&all_data); - curl_slist_free_all(headers); - curl_easy_reset(curl); - return val; - -err_out: - free(hi.lp_path); - free(hi.reason); - free(hi.stratum_url); - databuf_free(&all_data); - curl_slist_free_all(headers); - curl_easy_reset(curl); - return NULL; -} - -char *bin2hex(const unsigned char *p, size_t len) -{ - unsigned int i; - char *s = (char*)malloc((len * 2) + 1); - if (!s) - return NULL; - - for (i = 0; i < len; i++) - sprintf(s + (i * 2), "%02x", (unsigned int) p[i]); - - return s; -} - -bool hex2bin(unsigned char *p, const char *hexstr, size_t len) -{ - char hex_byte[3]; - char *ep; - - hex_byte[2] = '\0'; - - while (*hexstr && len) { - if (!hexstr[1]) { - applog(LOG_ERR, "hex2bin str truncated"); - return false; - } - hex_byte[0] = hexstr[0]; - hex_byte[1] = hexstr[1]; - *p = (unsigned char) strtol(hex_byte, &ep, 16); - if (*ep) { - applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); - return false; - } - p++; - hexstr += 2; - len--; - } - - return (len == 0 && *hexstr == 0) ? true : false; -} - -/* Subtract the `struct timeval' values X and Y, - storing the result in RESULT. - Return 1 if the difference is negative, otherwise 0. */ -int timeval_subtract(struct timeval *result, struct timeval *x, - struct timeval *y) -{ - /* Perform the carry for the later subtraction by updating Y. */ - if (x->tv_usec < y->tv_usec) { - int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; - y->tv_usec -= 1000000 * nsec; - y->tv_sec += nsec; - } - if (x->tv_usec - y->tv_usec > 1000000) { - int nsec = (x->tv_usec - y->tv_usec) / 1000000; - y->tv_usec += 1000000 * nsec; - y->tv_sec -= nsec; - } - - /* Compute the time remaining to wait. - * `tv_usec' is certainly positive. */ - result->tv_sec = x->tv_sec - y->tv_sec; - result->tv_usec = x->tv_usec - y->tv_usec; - - /* Return 1 if result is negative. */ - return x->tv_sec < y->tv_sec; -} - -bool fulltest(const uint32_t *hash, const uint32_t *target) -{ - int i; - bool rc = true; - - for (i = 7; i >= 0; i--) { - if (hash[i] > target[i]) { - rc = false; - break; - } - if (hash[i] < target[i]) { - rc = true; - break; - } - } - - if (opt_debug) { - uint32_t hash_be[8], target_be[8]; - char *hash_str, *target_str; - - for (i = 0; i < 8; i++) { - be32enc(hash_be + i, hash[7 - i]); - be32enc(target_be + i, target[7 - i]); - } - hash_str = bin2hex((unsigned char *)hash_be, 32); - target_str = bin2hex((unsigned char *)target_be, 32); - - applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", - rc ? 
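/*
 * Annotation (not part of the patch): fulltest() compares the 256-bit
 * hash and target as eight 32-bit words from the most significant word
 * (index 7) downwards. diff_to_target() just below derives the target
 * from the share difficulty as roughly (difficulty-1 target) / diff,
 * where the difficulty-1 target is 0x00000000FFFF0000...00.
 */
#if 0	/* illustrative sketch only: expected words for two difficulties */
	uint32_t t[8];
	diff_to_target(t, 1.0); /* t[7] == 0, t[6] == 0xFFFF0000 */
	diff_to_target(t, 2.0); /* t[7] == 0, t[6] == 0x7FFF8000 */
#endif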
"hash <= target" - : "hash > target (false positive)", - hash_str, - target_str); - - free(hash_str); - free(target_str); - } - - return rc; -} - -void diff_to_target(uint32_t *target, double diff) -{ - uint64_t m; - int k; - - for (k = 6; k > 0 && diff > 1.0; k--) - diff /= 4294967296.0; - m = (uint64_t)(4294901760.0 / diff); - if (m == 0 && k == 6) - memset(target, 0xff, 32); - else { - memset(target, 0, 32); - target[k] = (uint32_t)m; - target[k + 1] = (uint32_t)(m >> 32); - } -} - -#ifdef WIN32 -#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK) -#else -#define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK) -#endif - -static bool send_line(curl_socket_t sock, char *s) -{ - ssize_t len, sent = 0; - - len = (ssize_t)strlen(s); - s[len++] = '\n'; - - while (len > 0) { - struct timeval timeout = {0, 0}; - ssize_t n; - fd_set wd; - - FD_ZERO(&wd); - FD_SET(sock, &wd); - if (select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) - return false; - n = send(sock, s + sent, len, 0); - if (n < 0) { - if (!socket_blocks()) - return false; - n = 0; - } - sent += n; - len -= n; - } - - return true; -} - -bool stratum_send_line(struct stratum_ctx *sctx, char *s) -{ - bool ret = false; - - if (opt_protocol) - applog(LOG_DEBUG, "> %s", s); - - pthread_mutex_lock(&sctx->sock_lock); - ret = send_line(sctx->sock, s); - pthread_mutex_unlock(&sctx->sock_lock); - - return ret; -} - -static bool socket_full(curl_socket_t sock, int timeout) -{ - struct timeval tv; - fd_set rd; - - FD_ZERO(&rd); - FD_SET(sock, &rd); - tv.tv_sec = timeout; - tv.tv_usec = 0; - if (select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) - return true; - return false; -} - -bool stratum_socket_full(struct stratum_ctx *sctx, int timeout) -{ - return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout); -} - -#define RBUFSIZE 2048 -#define RECVSIZE (RBUFSIZE - 4) - -static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) -{ - size_t old, snew; - - old = strlen(sctx->sockbuf); - snew = old + strlen(s) + 1; - if (snew >= sctx->sockbuf_size) { - sctx->sockbuf_size = snew + (RBUFSIZE - (snew % RBUFSIZE)); - sctx->sockbuf = (char*)realloc(sctx->sockbuf, sctx->sockbuf_size); - } - strcpy(sctx->sockbuf + old, s); -} - -char *stratum_recv_line(struct stratum_ctx *sctx) -{ - ssize_t len, buflen; - char *tok, *sret = NULL; - - if (!strstr(sctx->sockbuf, "\n")) { - bool ret = true; - time_t rstart; - - time(&rstart); - if (!socket_full(sctx->sock, 60)) { - applog(LOG_ERR, "stratum_recv_line timed out"); - goto out; - } - do { - char s[RBUFSIZE]; - ssize_t n; - - memset(s, 0, RBUFSIZE); - n = recv(sctx->sock, s, RECVSIZE, 0); - if (!n) { - ret = false; - break; - } - if (n < 0) { - if (!socket_blocks() || !socket_full(sctx->sock, 1)) { - ret = false; - break; - } - } else - stratum_buffer_append(sctx, s); - } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); - - if (!ret) { - applog(LOG_ERR, "stratum_recv_line failed"); - goto out; - } - } - - buflen = (ssize_t)strlen(sctx->sockbuf); - tok = strtok(sctx->sockbuf, "\n"); - if (!tok) { - applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); - goto out; - } - sret = strdup(tok); - len = (ssize_t)strlen(sret); - - if (buflen > len + 1) - memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1); - else - sctx->sockbuf[0] = '\0'; - -out: - if (sret && opt_protocol) - applog(LOG_DEBUG, "< %s", sret); - return sret; -} - -#if LIBCURL_VERSION_NUM >= 0x071101 -static curl_socket_t opensocket_grab_cb(void *clientp, 
curlsocktype purpose, - struct curl_sockaddr *addr) -{ - curl_socket_t *sock = (curl_socket_t *)clientp; - *sock = socket(addr->family, addr->socktype, addr->protocol); - return *sock; -} -#endif - -bool stratum_connect(struct stratum_ctx *sctx, const char *url) -{ - CURL *curl; - int rc; - - pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) - curl_easy_cleanup(sctx->curl); - sctx->curl = curl_easy_init(); - if (!sctx->curl) { - applog(LOG_ERR, "CURL initialization failed"); - pthread_mutex_unlock(&sctx->sock_lock); - return false; - } - curl = sctx->curl; - if (!sctx->sockbuf) { - sctx->sockbuf = (char*)calloc(RBUFSIZE, 1); - sctx->sockbuf_size = RBUFSIZE; - } - sctx->sockbuf[0] = '\0'; - pthread_mutex_unlock(&sctx->sock_lock); - - if (url != sctx->url) { - free(sctx->url); - sctx->url = strdup(url); - } - free(sctx->curl_url); - sctx->curl_url = (char*)malloc(strlen(url)); - sprintf(sctx->curl_url, "http%s", strstr(url, "://")); - - if (opt_protocol) - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); - curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); - curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); - curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); - curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); - curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); - if (opt_proxy && opt_proxy_type != CURLPROXY_HTTP) { - curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); - curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); - } else if (getenv("http_proxy")) { - if (getenv("all_proxy")) - curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); - else if (getenv("ALL_PROXY")) - curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); - else - curl_easy_setopt(curl, CURLOPT_PROXY, ""); - } -#if LIBCURL_VERSION_NUM >= 0x070f06 - curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); -#endif -#if LIBCURL_VERSION_NUM >= 0x071101 - curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb); - curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); -#endif - curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); - - rc = curl_easy_perform(curl); - if (rc) { - applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); - curl_easy_cleanup(curl); - sctx->curl = NULL; - return false; - } - -#if LIBCURL_VERSION_NUM < 0x071101 - /* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */ - curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock); -#endif - - return true; -} - -void stratum_disconnect(struct stratum_ctx *sctx) -{ - pthread_mutex_lock(&sctx->sock_lock); - if (sctx->curl) { - curl_easy_cleanup(sctx->curl); - sctx->curl = NULL; - sctx->sockbuf[0] = '\0'; - } - pthread_mutex_unlock(&sctx->sock_lock); -} - -static const char *get_stratum_session_id(json_t *val) -{ - json_t *arr_val; - int i, n; - - arr_val = json_array_get(val, 0); - if (!arr_val || !json_is_array(arr_val)) - return NULL; - n = json_array_size(arr_val); - for (i = 0; i < n; i++) { - const char *notify; - json_t *arr = json_array_get(arr_val, i); - - if (!arr || !json_is_array(arr)) - break; - notify = json_string_value(json_array_get(arr, 0)); - if (!notify) - continue; - if (!strcasecmp(notify, "mining.notify")) - return json_string_value(json_array_get(arr, 1)); - } - return NULL; -} - -bool stratum_subscribe(struct stratum_ctx *sctx) -{ - char *s, *sret = NULL; - const char *sid, *xnonce1; - int xn2_size; - json_t *val = NULL, *res_val, *err_val; - json_error_t err; - bool ret = false, retry 
= false; - -start: - s = (char*)malloc(128 + (sctx->session_id ? strlen(sctx->session_id) : 0)); - if (retry) - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); - else if (sctx->session_id) - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id); - else - sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}"); - - if (!stratum_send_line(sctx, s)) - goto out; - - if (!socket_full(sctx->sock, 30)) { - applog(LOG_ERR, "stratum_subscribe timed out"); - goto out; - } - - sret = stratum_recv_line(sctx); - if (!sret) - goto out; - - val = JSON_LOADS(sret, &err); - free(sret); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_null(res_val) || - (err_val && !json_is_null(err_val))) { - if (opt_debug || retry) { - free(s); - if (err_val) - s = json_dumps(err_val, JSON_INDENT(3)); - else - s = strdup("(unknown reason)"); - applog(LOG_ERR, "JSON-RPC call failed: %s", s); - } - goto out; - } - - sid = get_stratum_session_id(res_val); - if (opt_debug && !sid) - applog(LOG_DEBUG, "Failed to get Stratum session id"); - xnonce1 = json_string_value(json_array_get(res_val, 1)); - if (!xnonce1) { - applog(LOG_ERR, "Failed to get extranonce1"); - goto out; - } - xn2_size = json_integer_value(json_array_get(res_val, 2)); - if (!xn2_size) { - applog(LOG_ERR, "Failed to get extranonce2_size"); - goto out; - } - - pthread_mutex_lock(&sctx->work_lock); - free(sctx->session_id); - free(sctx->xnonce1); - sctx->session_id = sid ? strdup(sid) : NULL; - sctx->xnonce1_size = strlen(xnonce1) / 2; - sctx->xnonce1 = (unsigned char*)malloc(sctx->xnonce1_size); - hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); - sctx->xnonce2_size = xn2_size; - sctx->next_diff = 1.0; - pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug && sid) - applog(LOG_DEBUG, "Stratum session id: %s", sctx->session_id); - - ret = true; - -out: - free(s); - if (val) - json_decref(val); - - if (!ret) { - if (sret && !retry) { - retry = true; - goto start; - } - } - - return ret; -} - -bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) -{ - json_t *val = NULL, *res_val, *err_val; - char *s, *sret; - json_error_t err; - bool ret = false; - - s = (char*)malloc(80 + strlen(user) + strlen(pass)); - sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", - user, pass); - - if (!stratum_send_line(sctx, s)) - goto out; - - while (1) { - sret = stratum_recv_line(sctx); - if (!sret) - goto out; - if (!stratum_handle_method(sctx, sret)) - break; - free(sret); - } - - val = JSON_LOADS(sret, &err); - free(sret); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - res_val = json_object_get(val, "result"); - err_val = json_object_get(val, "error"); - - if (!res_val || json_is_false(res_val) || - (err_val && !json_is_null(err_val))) { - applog(LOG_ERR, "Stratum authentication failed"); - goto out; - } - - ret = true; - -out: - free(s); - if (val) - json_decref(val); - - return ret; -} - -static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) -{ - const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *ntime, *nreward; - size_t coinb1_size, coinb2_size; - bool clean, ret = false; - int merkle_count, i; - json_t *merkle_arr; - 
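/*
 * Annotation (not part of the patch): mining.notify parameters arrive
 * in the order job_id, prevhash, coinb1, coinb2, merkle branch,
 * version, nbits, ntime, clean (plus an optional nreward here). The
 * coinbase assembled below is laid out as
 *     coinb1 || xnonce1 || xnonce2 || coinb2
 * and sctx->job.xnonce2 points into that buffer, so fresh work can be
 * generated by bumping extranonce2 in place without re-parsing the job.
 */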
unsigned char **merkle; - - job_id = json_string_value(json_array_get(params, 0)); - prevhash = json_string_value(json_array_get(params, 1)); - coinb1 = json_string_value(json_array_get(params, 2)); - coinb2 = json_string_value(json_array_get(params, 3)); - merkle_arr = json_array_get(params, 4); - if (!merkle_arr || !json_is_array(merkle_arr)) - goto out; - merkle_count = json_array_size(merkle_arr); - version = json_string_value(json_array_get(params, 5)); - nbits = json_string_value(json_array_get(params, 6)); - ntime = json_string_value(json_array_get(params, 7)); - clean = json_is_true(json_array_get(params, 8)); - nreward = json_string_value(json_array_get(params, 9)); - - if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !ntime || - strlen(prevhash) != 64 || strlen(version) != 8 || - strlen(nbits) != 8 || strlen(ntime) != 8) { - applog(LOG_ERR, "Stratum notify: invalid parameters"); - goto out; - } - merkle = (unsigned char**)malloc(merkle_count * sizeof(char *)); - for (i = 0; i < merkle_count; i++) { - const char *s = json_string_value(json_array_get(merkle_arr, i)); - if (!s || strlen(s) != 64) { - while (i--) - free(merkle[i]); - free(merkle); - applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); - goto out; - } - merkle[i] = (unsigned char*)malloc(32); - hex2bin(merkle[i], s, 32); - } - - pthread_mutex_lock(&sctx->work_lock); - - coinb1_size = strlen(coinb1) / 2; - coinb2_size = strlen(coinb2) / 2; - sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + - sctx->xnonce2_size + coinb2_size; - sctx->job.coinbase = (unsigned char*)realloc(sctx->job.coinbase, sctx->job.coinbase_size); - sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; - hex2bin(sctx->job.coinbase, coinb1, coinb1_size); - memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); - if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) - memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); - hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); - - free(sctx->job.job_id); - sctx->job.job_id = strdup(job_id); - hex2bin(sctx->job.prevhash, prevhash, 32); - - for (i = 0; i < sctx->job.merkle_count; i++) - free(sctx->job.merkle[i]); - free(sctx->job.merkle); - sctx->job.merkle = merkle; - sctx->job.merkle_count = merkle_count; - - hex2bin(sctx->job.version, version, 4); - hex2bin(sctx->job.nbits, nbits, 4); - hex2bin(sctx->job.ntime, ntime, 4); - if(nreward != NULL) - { - if(strlen(nreward) == 4) - hex2bin(sctx->job.nreward, nreward, 2); - } - sctx->job.clean = clean; - - sctx->job.diff = sctx->next_diff; - - pthread_mutex_unlock(&sctx->work_lock); - - ret = true; - -out: - return ret; -} - -static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) -{ - double diff; - - diff = json_number_value(json_array_get(params, 0)); - if (diff == 0) - return false; - - pthread_mutex_lock(&sctx->work_lock); - sctx->next_diff = diff; - pthread_mutex_unlock(&sctx->work_lock); - - if (opt_debug) - applog(LOG_DEBUG, "Stratum difficulty set to %g", diff); - - return true; -} - -static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) -{ - json_t *port_val; - const char *host; - int port; - - host = json_string_value(json_array_get(params, 0)); - port_val = json_array_get(params, 1); - if (json_is_string(port_val)) - port = atoi(json_string_value(port_val)); - else - port = json_integer_value(port_val); - if (!host || !port) - return false; - - free(sctx->url); - sctx->url = (char*)malloc(32 + strlen(host)); - 
sprintf(sctx->url, "stratum+tcp://%s:%d", host, port); - - applog(LOG_NOTICE, "Server requested reconnection to %s", sctx->url); - - stratum_disconnect(sctx); - - return true; -} - -static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id) -{ - char *s; - json_t *val; - bool ret; - - if (!id || json_is_null(id)) - return false; - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_string(USER_AGENT)); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - -static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params) -{ - char *s; - json_t *val; - bool ret; - - val = json_array_get(params, 0); - if (val) - applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); - - if (!id || json_is_null(id)) - return true; - - val = json_object(); - json_object_set(val, "id", id); - json_object_set_new(val, "error", json_null()); - json_object_set_new(val, "result", json_true()); - s = json_dumps(val, 0); - ret = stratum_send_line(sctx, s); - json_decref(val); - free(s); - - return ret; -} - -bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) -{ - json_t *val, *id, *params; - json_error_t err; - const char *method; - bool ret = false; - - val = JSON_LOADS(s, &err); - if (!val) { - applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); - goto out; - } - - method = json_string_value(json_object_get(val, "method")); - if (!method) - goto out; - id = json_object_get(val, "id"); - params = json_object_get(val, "params"); - - if (!strcasecmp(method, "mining.notify")) { - ret = stratum_notify(sctx, params); - goto out; - } - if (!strcasecmp(method, "mining.set_difficulty")) { - ret = stratum_set_difficulty(sctx, params); - goto out; - } - if (!strcasecmp(method, "client.reconnect")) { - ret = stratum_reconnect(sctx, params); - goto out; - } - if (!strcasecmp(method, "client.get_version")) { - ret = stratum_get_version(sctx, id); - goto out; - } - if (!strcasecmp(method, "client.show_message")) { - ret = stratum_show_message(sctx, id, params); - goto out; - } - -out: - if (val) - json_decref(val); - - return ret; -} - -struct thread_q *tq_new(void) -{ - struct thread_q *tq; - - tq = (struct thread_q *)calloc(1, sizeof(*tq)); - if (!tq) - return NULL; - - INIT_LIST_HEAD(&tq->q); - pthread_mutex_init(&tq->mutex, NULL); - pthread_cond_init(&tq->cond, NULL); - - return tq; -} - -void tq_free(struct thread_q *tq) -{ - struct tq_ent *ent, *iter; - - if (!tq) - return; - - list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) { - list_del(&ent->q_node); - free(ent); - } - - pthread_cond_destroy(&tq->cond); - pthread_mutex_destroy(&tq->mutex); - - memset(tq, 0, sizeof(*tq)); /* poison */ - free(tq); -} - -static void tq_freezethaw(struct thread_q *tq, bool frozen) -{ - pthread_mutex_lock(&tq->mutex); - - tq->frozen = frozen; - - pthread_cond_signal(&tq->cond); - pthread_mutex_unlock(&tq->mutex); -} - -void tq_freeze(struct thread_q *tq) -{ - tq_freezethaw(tq, true); -} - -void tq_thaw(struct thread_q *tq) -{ - tq_freezethaw(tq, false); -} - -bool tq_push(struct thread_q *tq, void *data) -{ - struct tq_ent *ent; - bool rc = true; - - ent = (struct tq_ent *)calloc(1, sizeof(*ent)); - if (!ent) - return false; - - ent->data = data; - INIT_LIST_HEAD(&ent->q_node); - - pthread_mutex_lock(&tq->mutex); - - if (!tq->frozen) { - list_add_tail(&ent->q_node, &tq->q); - } 
else { - free(ent); - rc = false; - } - - pthread_cond_signal(&tq->cond); - pthread_mutex_unlock(&tq->mutex); - - return rc; -} - -void *tq_pop(struct thread_q *tq, const struct timespec *abstime) -{ - struct tq_ent *ent; - void *rval = NULL; - int rc; - - pthread_mutex_lock(&tq->mutex); - - if (!list_empty(&tq->q)) - goto pop; - - if (abstime) - rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime); - else - rc = pthread_cond_wait(&tq->cond, &tq->mutex); - if (rc) - goto out; - if (list_empty(&tq->q)) - goto out; - -pop: - ent = list_entry(tq->q.next, struct tq_ent, q_node); - rval = ent->data; - - list_del(&ent->q_node); - free(ent); - -out: - pthread_mutex_unlock(&tq->mutex); - return rval; -} diff --git a/util.cpp b/util.cpp new file mode 100644 index 0000000000..fca1b5c803 --- /dev/null +++ b/util.cpp @@ -0,0 +1,2379 @@ +/* + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 ccminer team + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +//#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef WIN32 +#include "compat/winansi.h" +#include +#include +#else +#include +#include +#include +#include +#endif +#include "miner.h" +#include "elist.h" + +#include "crypto/xmr-rpc.h" + +extern pthread_mutex_t stratum_sock_lock; +extern pthread_mutex_t stratum_work_lock; +extern bool opt_debug_diff; + +bool opt_tracegpu = false; + +struct data_buffer { + void *buf; + size_t len; +}; + +struct upload_buffer { + const void *buf; + size_t len; + size_t pos; +}; + +struct header_info { + char *lp_path; + char *reason; + char *stratum_url; +}; + +struct tq_ent { + void *data; + struct list_head q_node; +}; + +struct thread_q { + struct list_head q; + + bool frozen; + + pthread_mutex_t mutex; + pthread_cond_t cond; +}; + +void applog(int prio, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + +#ifdef HAVE_SYSLOG_H + if (use_syslog) { + va_list ap2; + char *buf; + int len; + + /* custom colors to syslog prio */ + if (prio > LOG_DEBUG) { + switch (prio) { + case LOG_BLUE: prio = LOG_NOTICE; break; + } + } + + va_copy(ap2, ap); + len = vsnprintf(NULL, 0, fmt, ap2) + 1; + va_end(ap2); + buf = (char*) alloca(len); + if (vsnprintf(buf, len, fmt, ap) >= 0) + syslog(prio, "%s", buf); + } +#else + if (0) {} +#endif + else { + const char* color = ""; + const time_t now = time(NULL); + char *f; + int len; + struct tm tm; + + localtime_r(&now, &tm); + + switch (prio) { + case LOG_ERR: color = CL_RED; break; + case LOG_WARNING: color = CL_YLW; break; + case LOG_NOTICE: color = CL_WHT; break; + case LOG_INFO: color = ""; break; + case LOG_DEBUG: color = CL_GRY; break; + + case LOG_BLUE: + prio = LOG_NOTICE; + color = CL_CYN; + break; + } + if (!use_colors) + color = ""; + + len = 40 + (int) strlen(fmt) + 2; + f = (char*) alloca(len); + sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d]%s %s%s\n", + tm.tm_year + 1900, + tm.tm_mon + 1, + tm.tm_mday, + tm.tm_hour, + tm.tm_min, + tm.tm_sec, + color, + fmt, + use_colors ? 
CL_N : "" + ); + if (prio == LOG_RAW) { + // no time prefix, for ccminer -n + sprintf(f, "%s%s\n", fmt, CL_N); + } + pthread_mutex_lock(&applog_lock); + vfprintf(stdout, f, ap); /* atomic write to stdout */ + fflush(stdout); + pthread_mutex_unlock(&applog_lock); + } + va_end(ap); +} + +extern int gpu_threads; +// Use different prefix if multiple cpu threads per gpu +// Also, auto hide LOG_DEBUG if --debug (-D) is not used +void gpulog(int prio, int thr_id, const char *fmt, ...) +{ + char _ALIGN(128) pfmt[128]; + char _ALIGN(128) line[256]; + int len, dev_id = device_map[thr_id % MAX_GPUS]; + va_list ap; + + if (prio == LOG_DEBUG && !opt_debug) + return; + + if (gpu_threads > 1) + len = snprintf(pfmt, 128, "GPU T%d: %s", thr_id, fmt); + else + len = snprintf(pfmt, 128, "GPU #%d: %s", dev_id, fmt); + pfmt[sizeof(pfmt)-1]='\0'; + + va_start(ap, fmt); + + if (len && vsnprintf(line, sizeof(line), pfmt, ap)) { + line[sizeof(line)-1]='\0'; + applog(prio, "%s", line); + } else { + fprintf(stderr, "%s OOM!\n", __func__); + } + + va_end(ap); +} + +/* Get default config.json path (system specific) */ +void get_defconfig_path(char *out, size_t bufsize, char *argv0) +{ + char *cmd = strdup(argv0); + char *dir = dirname(cmd); + const char *sep = strstr(dir, "\\") ? "\\" : "/"; + struct stat info; +#ifdef WIN32 + snprintf(out, bufsize, "%s\\ccminer\\ccminer.conf\0", getenv("APPDATA")); +#else + snprintf(out, bufsize, "%s\\.ccminer\\ccminer.conf", getenv("HOME")); +#endif + if (dir && stat(out, &info) != 0) { + // binary folder if not present in user folder + snprintf(out, bufsize, "%s%sccminer.conf%s", dir, sep, ""); + } + if (stat(out, &info) != 0) { + out[0] = '\0'; + return; + } + out[bufsize - 1] = '\0'; + free(cmd); +#ifdef WIN32 + if (dir) free(dir); +#endif +} + +void format_hashrate_unit(double hashrate, char *output, const char *unit) +{ + char prefix[2] = { 0, 0 }; + + if (hashrate < 10000) { + // nop + } + else if (hashrate < 1e7) { + prefix[0] = 'k'; + hashrate *= 1e-3; + } + else if (hashrate < 1e10) { + prefix[0] = 'M'; + hashrate *= 1e-6; + } + else if (hashrate < 1e13) { + prefix[0] = 'G'; + hashrate *= 1e-9; + } + else { + prefix[0] = 'T'; + hashrate *= 1e-12; + } + + sprintf(output, "%.2f %s%s", hashrate, prefix, unit); +} + +static void databuf_free(struct data_buffer *db) +{ + if (!db) + return; + + free(db->buf); + + memset(db, 0, sizeof(*db)); +} + +static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct data_buffer *db = (struct data_buffer *)user_data; + size_t len = size * nmemb; + size_t oldlen, newlen; + void *newmem; + static const uchar zero = 0; + + oldlen = db->len; + newlen = oldlen + len; + + newmem = realloc(db->buf, newlen + 1); + if (!newmem) + return 0; + + db->buf = newmem; + db->len = newlen; + memcpy((char*)db->buf + oldlen, ptr, len); + memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */ + + return len; +} + +static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct upload_buffer *ub = (struct upload_buffer *)user_data; + unsigned int len = (unsigned int)(size * nmemb); + + if (len > ub->len - ub->pos) + len = (unsigned int)(ub->len - ub->pos); + + if (len) { + memcpy(ptr, (char*)ub->buf + ub->pos, len); + ub->pos += len; + } + + return len; +} + +#if LIBCURL_VERSION_NUM >= 0x071200 +static int seek_data_cb(void *user_data, curl_off_t offset, int origin) +{ + struct upload_buffer *ub = (struct upload_buffer *)user_data; + + switch (origin) { + case SEEK_SET: + ub->pos 
= (size_t)offset; + break; + case SEEK_CUR: + ub->pos += (size_t)offset; + break; + case SEEK_END: + ub->pos = ub->len + (size_t)offset; + break; + default: + return 1; /* CURL_SEEKFUNC_FAIL */ + } + + return 0; /* CURL_SEEKFUNC_OK */ +} +#endif + +static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) +{ + struct header_info *hi = (struct header_info *)user_data; + size_t remlen, slen, ptrlen = size * nmemb; + char *rem, *val = NULL, *key = NULL; + void *tmp; + + val = (char*)calloc(1, ptrlen); + key = (char*)calloc(1, ptrlen); + if (!key || !val) + goto out; + + tmp = memchr(ptr, ':', ptrlen); + if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ + goto out; + slen = (size_t)((char*)tmp - (char*)ptr); + if ((slen + 1) == ptrlen) /* skip key w/ no value */ + goto out; + memcpy(key, ptr, slen); /* store & nul term key */ + key[slen] = 0; + + rem = (char*)ptr + slen + 1; /* trim value's leading whitespace */ + remlen = ptrlen - slen - 1; + while ((remlen > 0) && (isspace(*rem))) { + remlen--; + rem++; + } + + memcpy(val, rem, remlen); /* store value, trim trailing ws */ + val[remlen] = 0; + while ((*val) && (isspace(val[strlen(val) - 1]))) { + val[strlen(val) - 1] = 0; + } + if (!*val) /* skip blank value */ + goto out; + + if (!strcasecmp("X-Long-Polling", key)) { + hi->lp_path = val; /* X-Mining-Extensions: longpoll */ + val = NULL; + } + + if (!strcasecmp("X-Reject-Reason", key)) { + hi->reason = val; /* X-Mining-Extensions: reject-reason */ + //applog(LOG_WARNING, "%s:%s", key, val); + val = NULL; + } + + if (!strcasecmp("X-Stratum", key)) { + hi->stratum_url = val; /* steal memory reference */ + val = NULL; + } + + if (!strcasecmp("X-Nonce-Range", key)) { + /* todo when available: X-Mining-Extensions: noncerange */ + } +out: + free(key); + free(val); + return ptrlen; +} + +#if LIBCURL_VERSION_NUM >= 0x070f06 +static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, + curlsocktype purpose) +{ + int keepalive = 1; + int tcp_keepcnt = 3; + int tcp_keepidle = 50; + int tcp_keepintvl = 50; +#ifdef WIN32 + DWORD outputBytes; +#endif + +#ifndef WIN32 + if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, + sizeof(keepalive)))) + return 1; +#ifdef __linux + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, + &tcp_keepcnt, sizeof(tcp_keepcnt)))) + return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, + &tcp_keepidle, sizeof(tcp_keepidle)))) + return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, + &tcp_keepintvl, sizeof(tcp_keepintvl)))) + return 1; +#endif /* __linux */ +#ifdef __APPLE_CC__ + if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, + &tcp_keepintvl, sizeof(tcp_keepintvl)))) + return 1; +#endif /* __APPLE_CC__ */ +#else /* WIN32 */ + struct tcp_keepalive vals; + vals.onoff = 1; + vals.keepalivetime = tcp_keepidle * 1000; + vals.keepaliveinterval = tcp_keepintvl * 1000; + if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), + NULL, 0, &outputBytes, NULL, NULL))) + return 1; +#endif /* WIN32 */ + + return 0; +} +#endif + +/* For getwork (longpoll or wallet) - not stratum pools! 
+ * DO NOT USE DIRECTLY + */ +static json_t *json_rpc_call(CURL *curl, const char *url, + const char *userpass, const char *rpc_req, + bool longpoll_scan, bool longpoll, bool keepalive, int *curl_err) +{ + json_t *val, *err_val, *res_val; + int rc; + struct data_buffer all_data = { 0 }; + struct upload_buffer upload_data; + json_error_t err; + struct curl_slist *headers = NULL; + char *httpdata; + char len_hdr[64], hashrate_hdr[64]; + char curl_err_str[CURL_ERROR_SIZE] = { 0 }; + long timeout = longpoll ? opt_timeout : opt_timeout/2; + struct header_info hi = { 0 }; + bool lp_scanning = longpoll_scan && !have_longpoll; + + /* it is assumed that 'curl' is freshly [re]initialized at this pt */ + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + if (opt_cert) { + curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); + // ignore CN domain name, allow to move cert files + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0); + } + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb); + curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data); +#if LIBCURL_VERSION_NUM >= 0x071200 + curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb); + curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data); +#endif + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); + if (opt_proxy) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } + if (userpass) { + curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); + curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); + } +#if LIBCURL_VERSION_NUM >= 0x070f06 + if (keepalive) + curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); +#endif + curl_easy_setopt(curl, CURLOPT_POST, 1); + + if (opt_protocol) + applog(LOG_DEBUG, "JSON protocol request:\n%s", rpc_req); + + upload_data.buf = rpc_req; + upload_data.len = strlen(rpc_req); + upload_data.pos = 0; + sprintf(len_hdr, "Content-Length: %lu", (unsigned long) upload_data.len); + sprintf(hashrate_hdr, "X-Mining-Hashrate: %llu", (unsigned long long) global_hashrate); + + headers = curl_slist_append(headers, "Content-Type: application/json"); + headers = curl_slist_append(headers, len_hdr); + headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); + headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll noncerange reject-reason"); + headers = curl_slist_append(headers, hashrate_hdr); + headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/ + headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + rc = curl_easy_perform(curl); + if (curl_err != NULL) + *curl_err = rc; + if (rc) { + if (!(longpoll && rc == CURLE_OPERATION_TIMEDOUT)) { + applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + goto err_out; + } + } + + /* If X-Stratum was found, activate Stratum */ + if (want_stratum && hi.stratum_url && + 
!strncasecmp(hi.stratum_url, "stratum+tcp://", 14) && + !(opt_proxy && opt_proxy_type == CURLPROXY_HTTP)) { + have_stratum = true; + tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); + hi.stratum_url = NULL; + } + + /* If X-Long-Polling was found, activate long polling */ + if (lp_scanning && hi.lp_path && !have_stratum) { + have_longpoll = true; + tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); + hi.lp_path = NULL; + } + + if (!all_data.buf || !all_data.len) { + if (!have_longpoll) // seems normal on longpoll timeout + applog(LOG_ERR, "Empty data received in json_rpc_call."); + goto err_out; + } + + httpdata = (char*) all_data.buf; + + if (*httpdata != '{' && *httpdata != '[') { + long errcode = 0; + CURLcode c = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &errcode); + if (c == CURLE_OK && errcode == 401) { + applog(LOG_ERR, "You are not authorized, check your login and password."); + goto err_out; + } + } + + val = JSON_LOADS(httpdata, &err); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + if (opt_protocol) + applog(LOG_DEBUG, "%s", httpdata); + goto err_out; + } + + if (opt_protocol) { + char *s = json_dumps(val, JSON_INDENT(3)); + applog(LOG_DEBUG, "JSON protocol response:\n%s\n", s); + free(s); + } + + /* JSON-RPC valid response returns a non-null 'result', + * and a null 'error'. */ + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_null(res_val) || + (err_val && !json_is_null(err_val))) { + char *s = NULL; + + if (err_val) { + s = json_dumps(err_val, 0); + json_t *msg = json_object_get(err_val, "message"); + json_t *err_code = json_object_get(err_val, "code"); + if (curl_err && json_integer_value(err_code)) + *curl_err = (int) json_integer_value(err_code); + + if (json_is_string(msg)) { + free(s); + s = strdup(json_string_value(msg)); + if (have_longpoll && s && !strcmp(s, "method not getwork")) { + json_decref(err_val); + free(s); + goto err_out; + } + } + json_decref(err_val); + } + else + s = strdup("(unknown reason)"); + + if (!curl_err || opt_debug) + applog(LOG_ERR, "JSON-RPC call failed: %s", s); + + free(s); + + goto err_out; + } + + if (hi.reason) + json_object_set_new(val, "reject-reason", json_string(hi.reason)); + + databuf_free(&all_data); + curl_slist_free_all(headers); + curl_easy_reset(curl); + return val; + +err_out: + free(hi.lp_path); + free(hi.reason); + free(hi.stratum_url); + databuf_free(&all_data); + curl_slist_free_all(headers); + curl_easy_reset(curl); + return NULL; +} + +/* getwork calls with pool pointer (wallet/longpoll pools) */ +json_t *json_rpc_call_pool(CURL *curl, struct pool_infos *pool, const char *req, + bool longpoll_scan, bool longpoll, int *curl_err) +{ + char userpass[768]; + // todo, malloc and store that in pool array + snprintf(userpass, sizeof(userpass), "%s%c%s", pool->user, + strlen(pool->pass)?':':'\0', pool->pass); + + return json_rpc_call(curl, pool->url, userpass, req, longpoll_scan, false, false, curl_err); +} + +/* called only from longpoll thread, we have the lp_url */ +json_t *json_rpc_longpoll(CURL *curl, char *lp_url, struct pool_infos *pool, const char *req, int *curl_err) +{ + char userpass[768]; + snprintf(userpass, sizeof(userpass), "%s%c%s", pool->user, + strlen(pool->pass)?':':'\0', pool->pass); + + // on pool rotate by time-limit, this keepalive can be a problem + bool keepalive = pool->time_limit == 0 || pool->time_limit > opt_timeout; + + return json_rpc_call(curl, lp_url, userpass, req, false, true, 
keepalive, curl_err); +} + +json_t *json_load_url(char* cfg_url, json_error_t *err) +{ + char err_str[CURL_ERROR_SIZE] = { 0 }; + struct data_buffer all_data = { 0 }; + int rc = 0; json_t *cfg = NULL; + CURL *curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "Remote config init failed!"); + return NULL; + } + curl_easy_setopt(curl, CURLOPT_URL, cfg_url); + curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, err_str); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + if (opt_proxy) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } else if (getenv("http_proxy")) { + if (getenv("all_proxy")) + curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); + else if (getenv("ALL_PROXY")) + curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); + else + curl_easy_setopt(curl, CURLOPT_PROXY, ""); + } + rc = curl_easy_perform(curl); + if (rc) { + applog(LOG_ERR, "Remote config read failed: %s", err_str); + goto err_out; + } + if (!all_data.buf || !all_data.len) { + applog(LOG_ERR, "Empty data received for config"); + goto err_out; + } + + cfg = JSON_LOADS((char*)all_data.buf, err); +err_out: + curl_easy_cleanup(curl); + return cfg; +} + +/** + * Unlike malloc, calloc set the memory to zero + */ +void *aligned_calloc(int size) +{ + const int ALIGN = 64; // cache line +#ifdef _MSC_VER + void* res = _aligned_malloc(size, ALIGN); + memset(res, 0, size); + return res; +#else + void *mem = calloc(1, size+ALIGN+sizeof(uintptr_t)); + void **ptr = (void**)((size_t)(((uintptr_t)(mem))+ALIGN+sizeof(uintptr_t)) & ~(ALIGN-1)); + ptr[-1] = mem; + return ptr; +#endif +} + +void aligned_free(void *ptr) +{ +#ifdef _MSC_VER + _aligned_free(ptr); +#else + free(((void**)ptr)[-1]); +#endif +} + +void cbin2hex(char *out, const char *in, size_t len) +{ + if (out) { + unsigned int i; + for (i = 0; i < len; i++) + sprintf(out + (i * 2), "%02x", (uint8_t)in[i]); + } +} + +char *bin2hex(const uchar *in, size_t len) +{ + char *s = (char*)malloc((len * 2) + 1); + if (!s) + return NULL; + + cbin2hex(s, (const char *) in, len); + + return s; +} + +bool hex2bin(void *output, const char *hexstr, size_t len) +{ + uchar *p = (uchar *) output; + char hex_byte[4]; + char *ep; + + hex_byte[2] = '\0'; + + while (*hexstr && len) { + if (!hexstr[1]) { + applog(LOG_ERR, "hex2bin str truncated"); + return false; + } + hex_byte[0] = hexstr[0]; + hex_byte[1] = hexstr[1]; + *p = (uchar) strtol(hex_byte, &ep, 16); + if (*ep) { + applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); + return false; + } + p++; + hexstr += 2; + len--; + } + + return (len == 0 && *hexstr == 0) ? true : false; +} + +/* Subtract the `struct timeval' values X and Y, + storing the result in RESULT. + Return 1 if the difference is negative, otherwise 0. */ +int timeval_subtract(struct timeval *result, struct timeval *x, + struct timeval *y) +{ + /* Perform the carry for the later subtraction by updating Y. 
*/ + if (x->tv_usec < y->tv_usec) { + int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; + y->tv_usec -= 1000000 * nsec; + y->tv_sec += nsec; + } + if (x->tv_usec - y->tv_usec > 1000000) { + int nsec = (x->tv_usec - y->tv_usec) / 1000000; + y->tv_usec += 1000000 * nsec; + y->tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + * `tv_usec' is certainly positive. */ + result->tv_sec = x->tv_sec - y->tv_sec; + result->tv_usec = x->tv_usec - y->tv_usec; + + /* Return 1 if result is negative. */ + return x->tv_sec < y->tv_sec; +} + +bool fulltest(const uint32_t *hash, const uint32_t *target) +{ + int i; + bool rc = true; + + for (i = 7; i >= 0; i--) { + if (hash[i] > target[i]) { + rc = false; + break; + } + if (hash[i] < target[i]) { + rc = true; + break; + } + if (hash[1] == target[1]) { + applog(LOG_NOTICE, "We found a close match!"); + } + } + + if ((!rc && opt_debug) || opt_debug_diff) { + uint32_t hash_be[8], target_be[8]; + char *hash_str, *target_str; + + for (i = 0; i < 8; i++) { + be32enc(hash_be + i, hash[7 - i]); + be32enc(target_be + i, target[7 - i]); + } + hash_str = bin2hex((uchar *)hash_be, 32); + target_str = bin2hex((uchar *)target_be, 32); + + applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", + rc ? "hash <= target" + : CL_YLW "hash > target (false positive)" CL_N, + hash_str, + target_str); + + free(hash_str); + free(target_str); + } + + return rc; +} + +// Only used by stratum pools +void diff_to_target(uint32_t *target, double diff) +{ + uint64_t m; + int k; + + for (k = 6; k > 0 && diff > 1.0; k--) + diff /= 4294967296.0; + m = (uint64_t)(4294901760.0 / diff); + if (m == 0 && k == 6) + memset(target, 0xff, 32); + else { + memset(target, 0, 32); + target[k] = (uint32_t)m; + target[k + 1] = (uint32_t)(m >> 32); + } +} + +// Only used by stratum pools +void work_set_target(struct work* work, double diff) +{ + diff_to_target(work->target, diff); + work->targetdiff = diff; +} + + +// Only used by longpoll pools +double target_to_diff(uint32_t* target) +{ + uchar* tgt = (uchar*) target; + uint64_t m = + (uint64_t)tgt[29] << 56 | + (uint64_t)tgt[28] << 48 | + (uint64_t)tgt[27] << 40 | + (uint64_t)tgt[26] << 32 | + (uint64_t)tgt[25] << 24 | + (uint64_t)tgt[24] << 16 | + (uint64_t)tgt[23] << 8 | + (uint64_t)tgt[22] << 0; + + if (!m) + return 0.; + else + return (double)0x0000ffff00000000/m; +} + +#ifdef WIN32 +#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK) +#else +#define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK) +#endif + +static bool send_line(curl_socket_t sock, char *s) +{ + ssize_t len, sent = 0; + + len = (ssize_t)strlen(s); + s[len++] = '\n'; + + while (len > 0) { + struct timeval timeout = {0, 0}; + ssize_t n; + fd_set wd; + + FD_ZERO(&wd); + FD_SET(sock, &wd); + if (select((int)sock + 1, NULL, &wd, NULL, &timeout) < 1) + return false; + n = send(sock, s + sent, len, 0); + if (n < 0) { + if (!socket_blocks()) + return false; + n = 0; + } + sent += n; + len -= n; + } + + return true; +} + +bool stratum_send_line(struct stratum_ctx *sctx, char *s) +{ + bool ret = false; + + if (opt_protocol) + applog(LOG_DEBUG, "> %s", s); + + pthread_mutex_lock(&stratum_sock_lock); + ret = send_line(sctx->sock, s); + pthread_mutex_unlock(&stratum_sock_lock); + + return ret; +} + +static bool socket_full(curl_socket_t sock, int timeout) +{ + struct timeval tv; + fd_set rd; + + FD_ZERO(&rd); + FD_SET(sock, &rd); + tv.tv_sec = timeout; + tv.tv_usec = 0; + if (select((int)sock + 1, &rd, NULL, NULL, &tv) > 0) + return true; + return false; +} + 
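/*
 * Editor's note -- usage sketch, not part of the patch above: the helpers
 * diff_to_target() and fulltest() added in util.cpp are meant to be used
 * together when checking a share against the current stratum difficulty.
 * The check_share() wrapper below is hypothetical (not a ccminer API) and
 * assumes the same little-endian uint32_t[8] hash/target word layout that
 * fulltest() expects.
 */
#include <stdbool.h>
#include <stdint.h>

extern void diff_to_target(uint32_t *target, double diff);
extern bool fulltest(const uint32_t *hash, const uint32_t *target);

static bool check_share(const uint32_t hash[8], double stratum_diff)
{
	uint32_t target[8];

	/* difficulty 1.0 yields the classic 0x00000000ffff0000... pool target;
	 * higher difficulties shrink the target proportionally */
	diff_to_target(target, stratum_diff);

	/* fulltest() compares word 7 (most significant) down to word 0 */
	return fulltest(hash, target);
}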
+bool stratum_socket_full(struct stratum_ctx *sctx, int timeout) +{ + if (!sctx->sockbuf) return false; + return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout); +} + +#define RBUFSIZE 2048 +#define RECVSIZE (RBUFSIZE - 4) + +static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) +{ + size_t old, snew; + + old = strlen(sctx->sockbuf); + snew = old + strlen(s) + 1; + if (snew >= sctx->sockbuf_size) { + sctx->sockbuf_size = snew + (RBUFSIZE - (snew % RBUFSIZE)); + sctx->sockbuf = (char*)realloc(sctx->sockbuf, sctx->sockbuf_size); + } + strcpy(sctx->sockbuf + old, s); +} + +char *stratum_recv_line(struct stratum_ctx *sctx) +{ + ssize_t len, buflen; + char *tok, *sret = NULL; + int timeout = opt_timeout; + + if (!sctx->sockbuf) + return NULL; + + if (!strstr(sctx->sockbuf, "\n")) { + bool ret = true; + time_t rstart = time(NULL); + if (!socket_full(sctx->sock, timeout)) { + applog(LOG_ERR, "stratum_recv_line timed out"); + goto out; + } + do { + char s[RBUFSIZE]; + ssize_t n; + + memset(s, 0, RBUFSIZE); + n = recv(sctx->sock, s, RECVSIZE, 0); + if (!n) { + ret = false; + break; + } + if (n < 0) { + if (!socket_blocks() || !socket_full(sctx->sock, 1)) { + ret = false; + break; + } + } else + stratum_buffer_append(sctx, s); + } while (time(NULL) - rstart < timeout && !strstr(sctx->sockbuf, "\n")); + + if (!ret) { + if (opt_debug) applog(LOG_ERR, "stratum_recv_line failed"); + goto out; + } + } + + buflen = (ssize_t)strlen(sctx->sockbuf); + tok = strtok(sctx->sockbuf, "\n"); + if (!tok) { + applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); + goto out; + } + sret = strdup(tok); + len = (ssize_t)strlen(sret); + + if (buflen > len + 1) + memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1); + else + sctx->sockbuf[0] = '\0'; + +out: + if (sret && opt_protocol) + applog(LOG_DEBUG, "< %s", sret); + return sret; +} + +#if LIBCURL_VERSION_NUM >= 0x071101 +static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, + struct curl_sockaddr *addr) +{ + curl_socket_t *sock = (curl_socket_t *)clientp; + *sock = socket(addr->family, addr->socktype, addr->protocol); + return *sock; +} +#endif + +bool stratum_connect(struct stratum_ctx *sctx, const char *url) +{ + CURL *curl; + int rc; + + pthread_mutex_lock(&stratum_sock_lock); + if (sctx->curl) + curl_easy_cleanup(sctx->curl); + sctx->curl = curl_easy_init(); + if (!sctx->curl) { + applog(LOG_ERR, "CURL initialization failed"); + pthread_mutex_unlock(&stratum_sock_lock); + return false; + } + curl = sctx->curl; + if (!sctx->sockbuf) { + sctx->sockbuf = (char*)calloc(RBUFSIZE, 1); + sctx->sockbuf_size = RBUFSIZE; + } + sctx->sockbuf[0] = '\0'; + pthread_mutex_unlock(&stratum_sock_lock); + + if (url != sctx->url) { + free(sctx->url); + sctx->url = strdup(url); + } + free(sctx->curl_url); + sctx->curl_url = (char*)malloc(strlen(url)+1); + sprintf(sctx->curl_url, "http%s", strstr(url, "://")); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); + curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, opt_timeout); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + if (opt_proxy && opt_proxy_type != CURLPROXY_HTTP) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } else if 
(getenv("http_proxy")) { + if (getenv("all_proxy")) + curl_easy_setopt(curl, CURLOPT_PROXY, getenv("all_proxy")); + else if (getenv("ALL_PROXY")) + curl_easy_setopt(curl, CURLOPT_PROXY, getenv("ALL_PROXY")); + else + curl_easy_setopt(curl, CURLOPT_PROXY, ""); + } +#if LIBCURL_VERSION_NUM >= 0x070f06 + curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); +#endif +#if LIBCURL_VERSION_NUM >= 0x071101 + curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb); + curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); +#endif + curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); + + rc = curl_easy_perform(curl); + if (rc) { + applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); + curl_easy_cleanup(curl); + sctx->curl = NULL; + return false; + } + +#if LIBCURL_VERSION_NUM < 0x071101 + /* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */ + curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock); +#endif + + return true; +} + +void stratum_free_job(struct stratum_ctx *sctx) +{ + pthread_mutex_lock(&stratum_work_lock); + if (sctx->job.job_id) { + free(sctx->job.job_id); + } + if (sctx->job.merkle_count) { + for (int i = 0; i < sctx->job.merkle_count; i++) { + free(sctx->job.merkle[i]); + sctx->job.merkle[i] = NULL; + } + free(sctx->job.merkle); + } + free(sctx->job.coinbase); + // note: xnonce2 is not allocated + memset(&(sctx->job.job_id), 0, sizeof(struct stratum_job)); + pthread_mutex_unlock(&stratum_work_lock); +} + +void stratum_disconnect(struct stratum_ctx *sctx) +{ + pthread_mutex_lock(&stratum_sock_lock); + if (sctx->curl) { + pools[sctx->pooln].disconnects++; + curl_easy_cleanup(sctx->curl); + sctx->curl = NULL; + if (sctx->sockbuf) + sctx->sockbuf[0] = '\0'; + // free(sctx->sockbuf); + // sctx->sockbuf = NULL; + } + if (sctx->job.job_id) { + stratum_free_job(sctx); + } + pthread_mutex_unlock(&stratum_sock_lock); +} + +static const char *get_stratum_session_id(json_t *val) +{ + json_t *arr_val; + int i, n; + + arr_val = json_array_get(val, 0); + if (!arr_val || !json_is_array(arr_val)) + return NULL; + n = (int) json_array_size(arr_val); + for (i = 0; i < n; i++) { + const char *notify; + json_t *arr = json_array_get(arr_val, i); + if (!arr || !json_is_array(arr)) + break; + notify = json_string_value(json_array_get(arr, 0)); + if (!notify) + continue; + if (!strcasecmp(notify, "mining.notify")) + return json_string_value(json_array_get(arr, 1)); + } + return NULL; +} + +static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, int pndx) +{ + const char* xnonce1; + int xn2_size; + + xnonce1 = json_string_value(json_array_get(params, pndx)); + if (!xnonce1) { + applog(LOG_ERR, "Failed to get extranonce1"); + goto out; + } + xn2_size = (int) json_integer_value(json_array_get(params, pndx+1)); + if (!xn2_size) { + char algo[64] = { 0 }; + get_currentalgo(algo, sizeof(algo)); + if (strcmp(algo, "equihash") == 0) { + int xn1_size = (int)strlen(xnonce1) / 2; + xn2_size = 32 - xn1_size; + if (xn1_size < 4 || xn1_size > 12) { + // This miner iterates the nonces at data32[30] + applog(LOG_ERR, "Unsupported extranonce size of %d (12 maxi)", xn1_size); + goto out; + } + goto skip_n2; + } else { + applog(LOG_ERR, "Failed to get extranonce2_size"); + goto out; + } + } + if (xn2_size < 2 || xn2_size > 16) { + applog(LOG_ERR, "Failed to get valid n2size in parse_extranonce (%d)", xn2_size); + goto out; + } +skip_n2: + pthread_mutex_lock(&stratum_work_lock); + if (sctx->xnonce1) + free(sctx->xnonce1); 
+ sctx->xnonce1_size = strlen(xnonce1) / 2; + sctx->xnonce1 = (uchar*) calloc(1, sctx->xnonce1_size); + if (unlikely(!sctx->xnonce1)) { + applog(LOG_ERR, "Failed to alloc xnonce1"); + pthread_mutex_unlock(&stratum_work_lock); + goto out; + } + hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); + sctx->xnonce2_size = xn2_size; + pthread_mutex_unlock(&stratum_work_lock); + + if (pndx == 0 && opt_debug) /* pool dynamic change */ + applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d", + xnonce1, xn2_size); + + return true; +out: + return false; +} + +bool stratum_subscribe(struct stratum_ctx *sctx) +{ + char *s, *sret = NULL; + const char *sid; + json_t *val = NULL, *res_val, *err_val; + json_error_t err; + bool ret = false, retry = false; + + if (sctx->rpc2) return true; + +start: + s = (char*)malloc(128 + (sctx->session_id ? strlen(sctx->session_id) : 0)); + if (retry) + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); + else if (sctx->session_id) + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id); + else + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}"); + + if (!stratum_send_line(sctx, s)) + goto out; + + if (!socket_full(sctx->sock, 10)) { + applog(LOG_ERR, "stratum_subscribe timed out"); + goto out; + } + + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + if (json_integer_value(json_object_get(val, "id")) != 1) { + applog(LOG_WARNING, "Stratum subscribe answer id is not correct!"); + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_null(res_val) || + (err_val && !json_is_null(err_val))) { + if (opt_debug || retry) { + free(s); + if (err_val) + s = json_dumps(err_val, JSON_INDENT(3)); + else + s = strdup("(unknown reason)"); + applog(LOG_ERR, "JSON-RPC call failed: %s", s); + } + goto out; + } + + // sid is param 1, extranonce params are 2 and 3 + if (!stratum_parse_extranonce(sctx, res_val, 1)) { + goto out; + } + + ret = true; + + // session id (optional) + sid = get_stratum_session_id(res_val); + if (opt_debug && sid) + applog(LOG_DEBUG, "Stratum session id: %s", sid); + + pthread_mutex_lock(&stratum_work_lock); + if (sctx->session_id) + free(sctx->session_id); + sctx->session_id = sid ? 
strdup(sid) : NULL; + sctx->next_diff = 1.0; + pthread_mutex_unlock(&stratum_work_lock); + +out: + free(s); + if (val) + json_decref(val); + + if (!ret) { + if (sret && !retry) { + retry = true; + goto start; + } + } + + return ret; +} + +extern bool opt_extranonce; + +bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) +{ + json_t *val = NULL, *res_val, *err_val; + char *s, *sret; + json_error_t err; + bool ret = false; + + if (sctx->rpc2) + return rpc2_stratum_authorize(sctx, user, pass); + + s = (char*)malloc(80 + strlen(user) + strlen(pass)); + sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", + user, pass); + + if (!stratum_send_line(sctx, s)) + goto out; + + while (1) { + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + if (!stratum_handle_method(sctx, sret)) + break; + free(sret); + } + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + if (json_integer_value(json_object_get(val, "id")) != 2) { + applog(LOG_WARNING, "Stratum authorize answer id is not correct!"); + } + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_false(res_val) || + (err_val && !json_is_null(err_val))) { + if (err_val && json_is_array(err_val)) { + const char* reason = json_string_value(json_array_get(err_val, 1)); + applog(LOG_ERR, "Stratum authentication failed (%s)", reason); + } + else applog(LOG_ERR, "Stratum authentication failed"); + goto out; + } + + sctx->tm_connected = time(NULL); + ret = true; + + if (!opt_extranonce) + goto out; + + // subscribe to extranonce (optional) + sprintf(s, "{\"id\": 3, \"method\": \"mining.extranonce.subscribe\", \"params\": []}"); + + if (!stratum_send_line(sctx, s)) + goto out; + + // reduced timeout to handle pools ignoring this method without answer (like xpool.ca) + if (!socket_full(sctx->sock, 1)) { + if (opt_debug) + applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); + goto out; + } + + sret = stratum_recv_line(sctx); + if (sret) { + json_t *extra = JSON_LOADS(sret, &err); + if (!extra) { + applog(LOG_WARNING, "JSON decode failed(%d): %s", err.line, err.text); + } else { + if (json_integer_value(json_object_get(extra, "id")) != 3) { + // we receive a standard method if extranonce is ignored + if (!stratum_handle_method(sctx, sret)) + applog(LOG_WARNING, "Stratum extranonce answer id was not correct!"); + } else { + res_val = json_object_get(extra, "result"); + if (opt_debug && (!res_val || json_is_false(res_val))) + applog(LOG_DEBUG, "extranonce subscribe not supported"); + } + json_decref(extra); + } + free(sret); + } + +out: + free(s); + if (val) + json_decref(val); + + return ret; +} + +/** + * Extract bloc height L H... 
here len=3, height=0x1333e8 + * "...0000000000ffffffff2703e83313062f503253482f043d61105408" + */ +static uint32_t getblocheight(struct stratum_ctx *sctx) +{ + uint32_t height = 0; + uint8_t hlen = 0, *p, *m; + + // find 0xffff tag + p = (uint8_t*) sctx->job.coinbase + 32; + m = p + 128; + while (*p != 0xff && p < m) p++; + while (*p == 0xff && p < m) p++; + if (*(p-1) == 0xff && *(p-2) == 0xff) { + p++; hlen = *p; + p++; height = le16dec(p); + p += 2; + switch (hlen) { + case 4: + height += 0x10000UL * le16dec(p); + break; + case 3: + height += 0x10000UL * (*p); + break; + } + } + return height; +} + +static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) +{ + const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime; + const char *extradata = NULL, *nreward = NULL; + size_t coinb1_size, coinb2_size; + bool clean, ret = false; + int merkle_count, i, p=0; + json_t *merkle_arr; + uchar **merkle = NULL; + // uchar(*merkle_tree)[32] = { 0 }; + int ntime; + char algo[64] = { 0 }; + get_currentalgo(algo, sizeof(algo)); + bool has_claim = !strcmp(algo, "lbry"); + bool has_roots = !strcmp(algo, "phi2") && json_array_size(params) == 10; + + if (sctx->is_equihash) { + return equi_stratum_notify(sctx, params); + } + + job_id = json_string_value(json_array_get(params, p++)); + prevhash = json_string_value(json_array_get(params, p++)); + if (has_claim) { + extradata = json_string_value(json_array_get(params, p++)); + if (!extradata || strlen(extradata) != 64) { + applog(LOG_ERR, "Stratum notify: invalid claim parameter"); + goto out; + } + } else if (has_roots) { + extradata = json_string_value(json_array_get(params, p++)); + if (!extradata || strlen(extradata) != 128) { + applog(LOG_ERR, "Stratum notify: invalid UTXO root parameter"); + goto out; + } + } + coinb1 = json_string_value(json_array_get(params, p++)); + coinb2 = json_string_value(json_array_get(params, p++)); + merkle_arr = json_array_get(params, p++); + if (!merkle_arr || !json_is_array(merkle_arr)) + goto out; + merkle_count = (int) json_array_size(merkle_arr); + version = json_string_value(json_array_get(params, p++)); + nbits = json_string_value(json_array_get(params, p++)); + stime = json_string_value(json_array_get(params, p++)); + clean = json_is_true(json_array_get(params, p)); p++; + nreward = json_string_value(json_array_get(params, p++)); + + if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || + strlen(prevhash) != 64 || strlen(version) != 8 || + strlen(nbits) != 8 || strlen(stime) != 8) { + applog(LOG_ERR, "Stratum notify: invalid parameters"); + goto out; + } + + /* store stratum server time diff */ + hex2bin((uchar *)&ntime, stime, 4); + ntime = swab32(ntime) - (uint32_t) time(0); + if (ntime > sctx->srvtime_diff) { + sctx->srvtime_diff = ntime; + if (opt_protocol && ntime > 20) + applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime); + } + + if (merkle_count) + merkle = (uchar**) malloc(merkle_count * sizeof(char *)); + for (i = 0; i < merkle_count; i++) { + const char *s = json_string_value(json_array_get(merkle_arr, i)); + if (!s || strlen(s) != 64) { + while (i--) + free(merkle[i]); + free(merkle); + applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); + goto out; + } + merkle[i] = (uchar*) malloc(32); + hex2bin(merkle[i], s, 32); + } + + pthread_mutex_lock(&stratum_work_lock); + + coinb1_size = strlen(coinb1) / 2; + coinb2_size = strlen(coinb2) / 2; + sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + + sctx->xnonce2_size + 
coinb2_size; + + sctx->job.coinbase = (uchar*) realloc(sctx->job.coinbase, sctx->job.coinbase_size); + sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; + hex2bin(sctx->job.coinbase, coinb1, coinb1_size); + memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); + + if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) + memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); + hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); + + free(sctx->job.job_id); + sctx->job.job_id = strdup(job_id); + hex2bin(sctx->job.prevhash, prevhash, 32); + if (has_claim) hex2bin(sctx->job.extra, extradata, 32); + if (has_roots) hex2bin(sctx->job.extra, extradata, 64); + + sctx->job.height = getblocheight(sctx); + + for (i = 0; i < sctx->job.merkle_count; i++) + free(sctx->job.merkle[i]); + free(sctx->job.merkle); + sctx->job.merkle = merkle; + sctx->job.merkle_count = merkle_count; + + hex2bin(sctx->job.version, version, 4); + hex2bin(sctx->job.nbits, nbits, 4); + hex2bin(sctx->job.ntime, stime, 4); + if(nreward != NULL) + { + if(strlen(nreward) == 4) + hex2bin(sctx->job.nreward, nreward, 2); + } + sctx->job.clean = clean; + + sctx->job.diff = sctx->next_diff; + + pthread_mutex_unlock(&stratum_work_lock); + + ret = true; + +out: + return ret; +} + +extern volatile time_t g_work_time; +static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) +{ + double diff; + + diff = json_number_value(json_array_get(params, 0)); + if (diff <= 0.0) + return false; + + pthread_mutex_lock(&stratum_work_lock); + sctx->next_diff = diff; + pthread_mutex_unlock(&stratum_work_lock); + + return true; +} + +static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) +{ + json_t *port_val; + const char *host; + int port; + + host = json_string_value(json_array_get(params, 0)); + port_val = json_array_get(params, 1); + if (json_is_string(port_val)) + port = atoi(json_string_value(port_val)); + else + port = (int) json_integer_value(port_val); + if (!host || !port) + return false; + + free(sctx->url); + sctx->url = (char*)malloc(32 + strlen(host)); + sprintf(sctx->url, "stratum+tcp://%s:%d", host, port); + + applog(LOG_NOTICE, "Server requested reconnection to %s", sctx->url); + + stratum_disconnect(sctx); + + return true; +} + +static bool stratum_pong(struct stratum_ctx *sctx, json_t *id) +{ + char buf[64]; + bool ret = false; + + if (!id || json_is_null(id)) + return ret; + + sprintf(buf, "{\"id\":%d,\"result\":\"pong\",\"error\":null}", + (int) json_integer_value(id)); + ret = stratum_send_line(sctx, buf); + + return ret; +} + +static bool stratum_get_algo(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char algo[64] = { 0 }; + char *s; + json_t *val; + bool ret = true; + + if (!id || json_is_null(id)) + return false; + + get_currentalgo(algo, sizeof(algo)); + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_string(algo)); + + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +#include "nvml.h" +extern char driver_version[32]; +extern int cuda_arch[MAX_GPUS]; + +void gpu_increment_reject(int thr_id) +{ + struct cgpu_info *gpu = &thr_info[thr_id].gpu; + if (gpu) gpu->rejected++; +} + +static bool json_object_set_error(json_t *result, int code, const char *msg) +{ + json_t *val = json_object(); + json_object_set_new(val, "code", json_integer(code)); + 
json_object_set_new(val, "message", json_string(msg)); + return json_object_set_new(result, "error", val) != -1; +} + +/* allow to report algo/device perf to the pool for algo stats */ +static bool stratum_benchdata(json_t *result, json_t *params, int thr_id) +{ + char algo[64] = { 0 }; + char vid[32], arch[8], driver[32]; + char *card; + char os[8]; + uint32_t watts = 0, plimit = 0; + int dev_id = device_map[thr_id]; + int cuda_ver = cuda_version(); + struct cgpu_info *cgpu = &thr_info[thr_id].gpu; + json_t *val; + + if (!cgpu || !opt_stratum_stats) return false; + +#if defined(WIN32) && (defined(_M_X64) || defined(__x86_64__)) + strcpy(os, "win64"); +#else + strcpy(os, is_windows() ? "win32" : "linux"); +#endif + + cuda_gpu_info(cgpu); +#ifdef USE_WRAPNVML + cgpu->has_monitoring = true; + if (cgpu->monitor.gpu_power) + cgpu->gpu_power = cgpu->monitor.gpu_power; + else + cgpu->gpu_power = gpu_power(cgpu); // mWatts + watts = (cgpu->gpu_power >= 1000) ? cgpu->gpu_power / 1000 : 0; // ignore nvapi % + plimit = device_plimit[dev_id] > 0 ? device_plimit[dev_id] : 0; + gpu_info(cgpu); // vid/pid +#endif + get_currentalgo(algo, sizeof(algo)); + + card = device_name[dev_id]; + cgpu->khashes = stats_get_speed(thr_id, 0.0) / 1000.0; + + sprintf(vid, "%04hx:%04hx", cgpu->gpu_vid, cgpu->gpu_pid); + sprintf(arch, "%d", (int) cgpu->gpu_arch); + if (cuda_arch[dev_id] > 0 && cuda_arch[dev_id] != cgpu->gpu_arch) { + // if binary was not compiled for the highest cuda arch, add it + snprintf(arch, 8, "%d@%d", (int) cgpu->gpu_arch, cuda_arch[dev_id]); + } + snprintf(driver, 32, "CUDA %d.%d %s", cuda_ver/1000, (cuda_ver%1000) / 10, driver_version); + driver[31] = '\0'; + + val = json_object(); + json_object_set_new(val, "algo", json_string(algo)); + json_object_set_new(val, "type", json_string("gpu")); + json_object_set_new(val, "device", json_string(card)); + json_object_set_new(val, "vendorid", json_string(vid)); + json_object_set_new(val, "arch", json_string(arch)); + json_object_set_new(val, "freq", json_integer(cgpu->gpu_clock/1000)); + json_object_set_new(val, "memf", json_integer(cgpu->gpu_memclock/1000)); + json_object_set_new(val, "curr_freq", json_integer(cgpu->monitor.gpu_clock)); + json_object_set_new(val, "curr_memf", json_integer(cgpu->monitor.gpu_memclock)); + json_object_set_new(val, "power", json_integer(watts)); + json_object_set_new(val, "plimit", json_integer(plimit)); + json_object_set_new(val, "khashes", json_real(cgpu->khashes)); + json_object_set_new(val, "intensity", json_real(cgpu->intensity)); + json_object_set_new(val, "throughput", json_integer(cgpu->throughput)); + json_object_set_new(val, "client", json_string(PACKAGE_NAME "/" PACKAGE_VERSION)); + json_object_set_new(val, "os", json_string(os)); + json_object_set_new(val, "driver", json_string(driver)); + + json_object_set_new(result, "result", val); + + return true; +} + +static bool stratum_get_stats(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char *s; + json_t *val; + bool ret; + + if (!id || json_is_null(id)) + return false; + + val = json_object(); + json_object_set(val, "id", id); + + ret = stratum_benchdata(val, params, 0); + + if (!ret) { + json_object_set_error(val, 1, "disabled"); //EPERM + } else { + json_object_set_new(val, "error", json_null()); + } + + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char *s; + json_t *val; + bool ret = true; + + 
if (!id || json_is_null(id)) + return false; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "result", json_string(USER_AGENT)); + if (ret) json_object_set_new(val, "error", json_null()); + + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + + json_decref(val); + free(s); + + return ret; +} + +static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char *s; + json_t *val; + bool ret; + + if (sctx->is_equihash) + return equi_stratum_show_message(sctx, id, params); + + val = json_array_get(params, 0); + if (val) + applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); + + if (!id || json_is_null(id)) + return true; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_true()); + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +static bool stratum_unknown_method(struct stratum_ctx *sctx, json_t *id) +{ + char *s; + json_t *val; + bool ret = false; + + if (!id || json_is_null(id)) + return ret; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "result", json_false()); + json_object_set_error(val, 38, "unknown method"); // ENOSYS + + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) +{ + json_t *val, *id, *params; + json_error_t err; + const char *method; + bool ret = false; + + val = JSON_LOADS(s, &err); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + method = json_string_value(json_object_get(val, "method")); + if (!method) + goto out; + id = json_object_get(val, "id"); + params = json_object_get(val, "params"); + + if (!strcasecmp(method, "mining.notify")) { + ret = stratum_notify(sctx, params); + goto out; + } + if (!strcasecmp(method, "mining.ping")) { // cgminer 4.7.1+ + if (opt_debug) applog(LOG_DEBUG, "Pool ping"); + ret = stratum_pong(sctx, id); + goto out; + } + if (!strcasecmp(method, "mining.set_difficulty")) { + ret = stratum_set_difficulty(sctx, params); + goto out; + } + if (!strcasecmp(method, "mining.set_target")) { + sctx->is_equihash = true; + ret = equi_stratum_set_target(sctx, params); + goto out; + } + if (!strcasecmp(method, "mining.set_extranonce")) { + ret = stratum_parse_extranonce(sctx, params, 0); + goto out; + } + if (!strcasecmp(method, "client.reconnect")) { + ret = stratum_reconnect(sctx, params); + goto out; + } + if (!strcasecmp(method, "client.get_algo")) { // ccminer only yet! + // will prevent wrong algo parameters on a pool, will be used as test on rejects + if (!opt_quiet) applog(LOG_NOTICE, "Pool asked your algo parameter"); + ret = stratum_get_algo(sctx, id, params); + goto out; + } + if (!strcasecmp(method, "client.get_stats")) { // ccminer/yiimp only yet! 
+ // optional to fill device benchmarks + ret = stratum_get_stats(sctx, id, params); + goto out; + } + if (!strcasecmp(method, "client.get_version")) { // common + ret = stratum_get_version(sctx, id, params); + goto out; + } + if (!strcasecmp(method, "client.show_message")) { // common + ret = stratum_show_message(sctx, id, params); + goto out; + } + if (sctx->rpc2 && !strcasecmp(method, "job")) { // xmr/bbr + ret = rpc2_stratum_job(sctx, id, params); + goto out; + } + + if (!ret) { + // don't fail = disconnect stratum on unknown (and optional?) methods + if (opt_debug) applog(LOG_WARNING, "unknown stratum method %s!", method); + ret = stratum_unknown_method(sctx, id); + } + +out: + if (val) + json_decref(val); + + return ret; +} + +struct thread_q *tq_new(void) +{ + struct thread_q *tq; + + tq = (struct thread_q *)calloc(1, sizeof(*tq)); + if (!tq) + return NULL; + + INIT_LIST_HEAD(&tq->q); + pthread_mutex_init(&tq->mutex, NULL); + pthread_cond_init(&tq->cond, NULL); + + return tq; +} + +void tq_free(struct thread_q *tq) +{ + struct tq_ent *ent, *iter; + + if (!tq) + return; + + list_for_each_entry_safe(ent, iter, &tq->q, q_node, struct tq_ent, struct tq_ent) { + list_del(&ent->q_node); + free(ent); + } + + pthread_cond_destroy(&tq->cond); + pthread_mutex_destroy(&tq->mutex); + + memset(tq, 0, sizeof(*tq)); /* poison */ + free(tq); +} + +static void tq_freezethaw(struct thread_q *tq, bool frozen) +{ + pthread_mutex_lock(&tq->mutex); + + tq->frozen = frozen; + + pthread_cond_signal(&tq->cond); + pthread_mutex_unlock(&tq->mutex); +} + +void tq_freeze(struct thread_q *tq) +{ + tq_freezethaw(tq, true); +} + +void tq_thaw(struct thread_q *tq) +{ + tq_freezethaw(tq, false); +} + +bool tq_push(struct thread_q *tq, void *data) +{ + struct tq_ent *ent; + bool rc = true; + + ent = (struct tq_ent *)calloc(1, sizeof(*ent)); + if (!ent) + return false; + + ent->data = data; + INIT_LIST_HEAD(&ent->q_node); + + pthread_mutex_lock(&tq->mutex); + + if (!tq->frozen) { + list_add_tail(&ent->q_node, &tq->q); + } else { + free(ent); + rc = false; + } + + pthread_cond_signal(&tq->cond); + pthread_mutex_unlock(&tq->mutex); + + return rc; +} + +void *tq_pop(struct thread_q *tq, const struct timespec *abstime) +{ + struct tq_ent *ent; + void *rval = NULL; + int rc; + + pthread_mutex_lock(&tq->mutex); + + if (!list_empty(&tq->q)) + goto pop; + + if (abstime) + rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime); + else + rc = pthread_cond_wait(&tq->cond, &tq->mutex); + if (rc) + goto out; + if (list_empty(&tq->q)) + goto out; + +pop: + ent = list_entry(tq->q.next, struct tq_ent, q_node); + rval = ent->data; + + list_del(&ent->q_node); + free(ent); + +out: + pthread_mutex_unlock(&tq->mutex); + return rval; +} + +/** + * @param buf char[9] mini + * @param time_t timer to convert + */ +size_t time2str(char* buf, time_t timer) +{ + struct tm* tm_info; + tm_info = localtime(&timer); + return strftime(buf, 19, "%H:%M:%S", tm_info); +} + +/** + * Alloc and returns time string (to be freed) + * @param time_t timer to convert + */ +char* atime2str(time_t timer) +{ + char* buf = (char*) malloc(16); + memset(buf, 0, 16); + time2str(buf, timer); + return buf; +} + +/* sprintf can be used in applog */ +static char* format_hash(char* buf, uint8_t* h) +{ + uchar *hash = (uchar*) h; + int len = 0; + for (int i=0; i < 32; i += 4) { + len += sprintf(buf+len, "%02x%02x%02x%02x ", + hash[i], hash[i+1], hash[i+2], hash[i+3]); + } + return buf; +} + +/* to debug diff in data */ +void applog_compare_hash(void *hash, void 
*hash_ref) +{ + char s[256] = ""; + int len = 0; + uchar* hash1 = (uchar*)hash; + uchar* hash2 = (uchar*)hash_ref; + for (int i=0; i < 32; i += 4) { + const char *color = memcmp(hash1+i, hash2+i, 4) ? CL_WHT : CL_GRY; + len += sprintf(s+len, "%s%02x%02x%02x%02x " CL_GRY, color, + hash1[i], hash1[i+1], hash1[i+2], hash1[i+3]); + s[len] = '\0'; + } + applog(LOG_DEBUG, "%s", s); +} + +void applog_hash(void *hash) +{ + char s[128] = {'\0'}; + applog(LOG_DEBUG, "%s", format_hash(s, (uint8_t*)hash)); +} + +void applog_hash64(void *hash) +{ + char s[128] = {'\0'}; + char t[128] = {'\0'}; + applog(LOG_DEBUG, "%s %s", format_hash(s, (uint8_t*)hash), format_hash(t, &((uint8_t*)hash)[32])); +} + +void applog_hex(void *data, int len) +{ + char* hex = bin2hex((uchar*)data, len); + applog(LOG_DEBUG, "%s", hex); + free(hex); +} + +#define printpfx(n,h) \ + printf("%s%11s%s: %s\n", CL_GRN, n, CL_N, format_hash(s, h)) + +static uint32_t zrtest[20] = { + swab32(0x01806486), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x00000000), + swab32(0x2ab03251), + swab32(0x87d4f28b), + swab32(0x6e22f086), + swab32(0x4845ddd5), + swab32(0x0ac4e6aa), + swab32(0x22a1709f), + swab32(0xfb4275d9), + swab32(0x25f26636), + swab32(0x300eed54), + swab32(0xffff0f1e), + swab32(0x2a9e2300), +}; + +void do_gpu_tests(void) +{ +#ifdef _DEBUG + unsigned long done; + char s[128] = { '\0' }; + struct work work; + memset(&work, 0, sizeof(work)); + + opt_tracegpu = true; + work_restart = (struct work_restart*) malloc(sizeof(struct work_restart)); + work_restart[0].restart = 1; + work.target[7] = 0xffff; + + //struct timeval tv; + //memset(work.data, 0, sizeof(work.data)); + //scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv); + + memset(work.data, 0, sizeof(work.data)); + work.data[0] = 0; + scanhash_hmq17(0, &work, 1, &done); + + free(work_restart); + work_restart = NULL; + opt_tracegpu = false; +#endif +} + +void print_hash_tests(void) +{ + uchar *scratchbuf = NULL; + char s[128] = {'\0'}; + uchar hash[128]; + uchar buf[192]; + + // work space for scratchpad based algos + scratchbuf = (uchar*)calloc(128, 1024); + memset(buf, 0, sizeof buf); + + // buf[0] = 1; buf[64] = 2; // for endian tests + + printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); + + allium_hash(&hash[0], &buf[0]); + printpfx("allium", hash); + + bastionhash(&hash[0], &buf[0]); + printpfx("bastion", hash); + + blake256hash(&hash[0], &buf[0], 8); + printpfx("blakecoin", hash); + + blake256hash(&hash[0], &buf[0], 14); + printpfx("blake", hash); + + blake2b_hash(&hash[0], &buf[0]); + printpfx("blake2b", hash); + + blake2s_hash(&hash[0], &buf[0]); + printpfx("blake2s", hash); + + bmw_hash(&hash[0], &buf[0]); + printpfx("bmw", hash); + + c11hash(&hash[0], &buf[0]); + printpfx("c11", hash); + + cryptolight_hash(&hash[0], &buf[0]); + printpfx("cryptolight", hash); + + cryptonight_hash(&hash[0], &buf[0]); + printpfx("cryptonight", hash); + + memset(buf, 0, 180); + decred_hash(&hash[0], &buf[0]); + printpfx("decred", hash); + + deephash(&hash[0], &buf[0]); + printpfx("deep", hash); + + fresh_hash(&hash[0], &buf[0]); + printpfx("fresh", hash); + + fugue256_hash(&hash[0], &buf[0], 32); + printpfx("fugue256", hash); + + groestlhash(&hash[0], &buf[0]); + printpfx("groestl", hash); + + heavycoin_hash(&hash[0], &buf[0], 32); + printpfx("heavy", hash); + + hmq17hash(&hash[0], &buf[0]); + printpfx("hmq1725", hash); + + hsr_hash(&hash[0], &buf[0]); + 
printpfx("hsr", hash); + + jha_hash(&hash[0], &buf[0]); + printpfx("jha", hash); + + keccak256_hash(&hash[0], &buf[0]); + printpfx("keccak", hash); + + memset(buf, 0, 128); + lbry_hash(&hash[0], &buf[0]); + printpfx("lbry", hash); + + luffa_hash(&hash[0], &buf[0]); + printpfx("luffa", hash); + + lyra2re_hash(&hash[0], &buf[0]); + printpfx("lyra2", hash); + + lyra2v2_hash(&hash[0], &buf[0]); + printpfx("lyra2v2", hash); + + lyra2v3_hash(&hash[0], &buf[0]); + printpfx("lyra2v3", hash); + + lyra2Z_hash(&hash[0], &buf[0]); + printpfx("lyra2z", hash); + + monero_hash(&hash[0], &buf[0]); + printpfx("monero", hash); + + myriadhash(&hash[0], &buf[0]); + printpfx("myriad", hash); + + neoscrypt(&hash[0], &buf[0], 80000620); + printpfx("neoscrypt", hash); + + nist5hash(&hash[0], &buf[0]); + printpfx("nist5", hash); + + pentablakehash(&hash[0], &buf[0]); + printpfx("pentablake", hash); + + phi2_hash(&hash[0], &buf[0]); + printpfx("phi", hash); + + polytimos_hash(&hash[0], &buf[0]); + printpfx("polytimos", hash); + + quarkhash(&hash[0], &buf[0]); + printpfx("quark", hash); + + qubithash(&hash[0], &buf[0]); + printpfx("qubit", hash); + + scrypthash(&hash[0], &buf[0]); + printpfx("scrypt", hash); + + scryptjane_hash(&hash[0], &buf[0]); + printpfx("scrypt-jane", hash); + + sha256d_hash(&hash[0], &buf[0]); + printpfx("sha256d", hash); + + sha256t_hash(&hash[0], &buf[0]); + printpfx("sha256t", hash); + + sha256q_hash(&hash[0], &buf[0]); + printpfx("sha256q", hash); + + sia_blake2b_hash(&hash[0], &buf[0]); + printpfx("sia", hash); + + sibhash(&hash[0], &buf[0]); + printpfx("sib", hash); + + skeincoinhash(&hash[0], &buf[0]); + printpfx("skein", hash); + + skein2hash(&hash[0], &buf[0]); + printpfx("skein2", hash); + + skunk_hash(&hash[0], &buf[0]); + printpfx("skunk", hash); + + stellite_hash(&hash[0], &buf[0]); + printpfx("stelitte", hash); + + s3hash(&hash[0], &buf[0]); + printpfx("S3", hash); + + timetravel_hash(&hash[0], &buf[0]); + printpfx("timetravel", hash); + + bitcore_hash(&hash[0], &buf[0]); + printpfx("bitcore", hash); + + exosis_hash(&hash[0], &buf[0]); + printpfx("exosis", hash); + + blake256hash(&hash[0], &buf[0], 8); + printpfx("vanilla", hash); + + tribus_hash(&hash[0], &buf[0]); + printpfx("tribus", hash); + + veltorhash(&hash[0], &buf[0]); + printpfx("veltor", hash); + + wcoinhash(&hash[0], &buf[0]); + printpfx("whirlpool", hash); + + //whirlxHash(&hash[0], &buf[0]); + //printpfx("whirlpoolx", hash); + + x11evo_hash(&hash[0], &buf[0]); + printpfx("x11evo", hash); + + x11hash(&hash[0], &buf[0]); + printpfx("x11", hash); + + x12hash(&hash[0], &buf[0]); + printpfx("x12", hash); + + x13hash(&hash[0], &buf[0]); + printpfx("x13", hash); + + x14hash(&hash[0], &buf[0]); + printpfx("x14", hash); + + x15hash(&hash[0], &buf[0]); + printpfx("x15", hash); + + x16r_hash(&hash[0], &buf[0]); + printpfx("x16r", hash); + + x16s_hash(&hash[0], &buf[0]); + printpfx("x16s", hash); + + x17hash(&hash[0], &buf[0]); + printpfx("x17", hash); + + //memcpy(buf, zrtest, 80); + zr5hash(&hash[0], &buf[0]); + //zr5hash_pok(&hash[0], (uint32_t*) &buf[0]); + printpfx("ZR5", hash); + + printf("\n"); + + do_gpu_tests(); + + free(scratchbuf); +} diff --git a/x11/bitcore.cu b/x11/bitcore.cu new file mode 100644 index 0000000000..78739679c3 --- /dev/null +++ b/x11/bitcore.cu @@ -0,0 +1,439 @@ +/** + * Timetravel-10 (bitcore) CUDA implementation + * by tpruvot@github - May 2017 + */ + +#include +#include +#include + +#define HASH_FUNC_BASE_TIMESTAMP 1492973331U +#define HASH_FUNC_COUNT 10 +#define HASH_FUNC_COUNT_PERMUTATIONS 
40320U + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#if HASH_FUNC_COUNT > 10 +#include "sph/sph_echo.h" +#endif +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; +
+enum Algo { + BLAKE = 0, + BMW, + GROESTL, + SKEIN, + JH, + KECCAK, + LUFFA, + CUBEHASH, + SHAVITE, + SIMD, +#if HASH_FUNC_COUNT > 10 + ECHO, +#endif + MAX_ALGOS_COUNT +}; +
+inline void swap8(uint8_t *a, uint8_t *b) +{ + uint8_t t = *a; + *a = *b; + *b = t; +} + +inline void initPerm(uint8_t n[], int count) +{ + for (int i = 0; i < count; i++) + n[i] = i; +} +
+static int nextPerm(uint8_t n[], int count) +{ + int tail, i, j; + + if (count <= 1) + return 0; + + for (i = count - 1; i>0 && n[i - 1] >= n[i]; i--); + tail = i; + + if (tail > 0) { + for (j = count - 1; j>tail && n[j] <= n[tail - 1]; j--); + swap8(&n[tail - 1], &n[j]); + } + + // reverse the remaining suffix to get the next lexicographic permutation + for (i = tail, j = count - 1; i<j; i++, j--) + swap8(&n[i], &n[j]); + + return (count > tail); +} +
+static void getAlgoString(char *str, uint32_t seq) +{ + uint8_t algoList[HASH_FUNC_COUNT]; + char *sptr; + + initPerm(algoList, HASH_FUNC_COUNT); + + for (uint32_t k = 0; k < seq; k++) + nextPerm(algoList, HASH_FUNC_COUNT); + + sptr = str; + for (int j = 0; j < HASH_FUNC_COUNT; j++) { + if (algoList[j] >= 10) + sprintf(sptr, "%c", 'A' + (algoList[j] - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoList[j]); + sptr++; + } + *sptr = '\0'; +} +
+static __thread uint32_t s_ntime = 0; +static uint32_t s_sequence = UINT32_MAX; +static uint8_t s_firstalgo = 0xFF; +static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 }; + +#define INITIAL_DATE HASH_FUNC_BASE_TIMESTAMP +static inline uint32_t getCurrentAlgoSeq(uint32_t ntime) +{ + // unlike x11evo, the permutation changes often (with ntime) + return (uint32_t) (ntime - INITIAL_DATE) % HASH_FUNC_COUNT_PERMUTATIONS; +} +
+// To finish... +static void get_travel_order(uint32_t ntime, char *permstr) +{ + uint32_t seq = getCurrentAlgoSeq(ntime); + if (s_sequence != seq) { + getAlgoString(permstr, seq); + s_sequence = seq; + } +} +
+// CPU Hash +extern "C" void bitcore_hash(void *output, const void *input) +{ + uint32_t _ALIGN(64) hash[64/4] = { 0 }; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_luffa512_context ctx_luffa1; + sph_cubehash512_context ctx_cubehash1; + sph_shavite512_context ctx_shavite1; + sph_simd512_context ctx_simd1; +#if HASH_FUNC_COUNT > 10 + sph_echo512_context ctx_echo1; +#endif + +
if (s_sequence == UINT32_MAX) { + uint32_t *data = (uint32_t*) input; + const uint32_t ntime = (opt_benchmark || !data[17]) ? (uint32_t) time(NULL) : data[17]; + get_travel_order(ntime, hashOrder); + } + + void *in = (void*) input; + int size = 80; + + const int hashes = (int) strlen(hashOrder); + + for (int i = 0; i < hashes; i++) + { + const char elem = hashOrder[i]; + uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + if (i > 0) { + in = (void*) hash; + size = 64; + } + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa1); + sph_luffa512(&ctx_luffa1, in, size); + sph_luffa512_close(&ctx_luffa1, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash1); + sph_cubehash512(&ctx_cubehash1, in, size); + sph_cubehash512_close(&ctx_cubehash1, hash); + break; + case SHAVITE: + sph_shavite512_init(&ctx_shavite1); + sph_shavite512(&ctx_shavite1, in, size); + sph_shavite512_close(&ctx_shavite1, hash); + break; + case SIMD: + sph_simd512_init(&ctx_simd1); + sph_simd512(&ctx_simd1, in, size); + sph_simd512_close(&ctx_simd1, hash); + break; +#if HASH_FUNC_COUNT > 10 + case ECHO: + sph_echo512_init(&ctx_echo1); + sph_echo512(&ctx_echo1, in, size); + sph_echo512_close(&ctx_echo1, hash); + break; +#endif + } + } + + memcpy(output, hash, 32); +} + +//#define _DEBUG +#define _DEBUG_PREFIX "tt-" +#include "cuda_debug.cuh" + +void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 
20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) pdata[17] = swab32(0x59090909); + + if (opt_debug || s_ntime != pdata[17] || s_sequence == UINT32_MAX) { + uint32_t ntime = swab32(work->data[17]); + get_travel_order(ntime, hashOrder); + s_ntime = pdata[17]; + if (opt_debug && !thr_id) { + applog(LOG_DEBUG, "timetravel10 hash order %s (%08x)", hashOrder, ntime); + } + } + + if (opt_benchmark) + ptarget[7] = 0x5; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput) != 0) { + return 0; + } +#if HASH_FUNC_COUNT > 10 + x11_echo512_cpu_init(thr_id, throughput); +#endif + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + cuda_check_cpu_setTarget(ptarget); + + const int hashes = (int) strlen(hashOrder); + const char first = hashOrder[0]; + const uint8_t algo80 = first >= 'A' ? first - 'A' + 10 : first - '0'; + if (algo80 != s_firstalgo) { + s_firstalgo = algo80; + } + + // first algo seems locked to blake in bitcore, fine! + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + + do { + // Hash with CUDA + + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + TRACE("blake80:"); + + for (int i = 1; i < hashes; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("groestl:"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("skein :"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("keccak :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("cube :"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("shavite:"); + break; + case SIMD: + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("simd :"); + break; +#if HASH_FUNC_COUNT > 10 + case ECHO: + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i); + TRACE("echo :"); + break; +#endif + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + be32enc(&endiandata[19], work->nonces[0]); + bitcore_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + pdata[19] = work->nonces[0]; + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + bitcore_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + } + pdata[19] = max(pdata[19], work->nonces[1]) + 1; + } + return work->valid_nonces; + } else if (vhash[7] > Htarg) { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_bitcore(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/c11.cu b/x11/c11.cu new file mode 100644 index 0000000000..8f8f6663b2 --- /dev/null +++ b/x11/c11.cu @@ -0,0 +1,269 @@ +extern "C" +{ +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include 
"sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" + +void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); + +#include +#include + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; + +// Flax/Chaincoin C11 CPU Hash +extern "C" void c11hash(void *output, const void *input) +{ + unsigned char hash[128] = { 0 }; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*) hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512 (&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, (const void*) hash, 64); + sph_luffa512_close (&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512 (&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, (void*) hash); + + sph_simd512_init(&ctx_simd); + sph_simd512 (&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512 (&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + memcpy(output, hash, 32); +} + +#ifdef _DEBUG +#define TRACE(algo) { \ + if (max_nonce == 1 && pdata[19] <= 1) { \ + uint32_t* debugbuf = NULL; \ + cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \ + cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \ + printf("X11 %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ + swab32(debugbuf[2]), swab32(debugbuf[3])); \ + cudaFreeHost(debugbuf); \ + } \ +} +#else +#define TRACE(algo) {} +#endif + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 
20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x5; + + if (!init[thr_id]) + { + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput) != 0) { + return 0; + } + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + if (use_compat_kernels[thr_id]) + cuda_check_cpu_setTarget(ptarget); + else + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + + do { + int order = 0; + + // Hash with CUDA + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake :"); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + TRACE("luffa+c:"); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shavite:"); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + + if (use_compat_kernels[thr_id]) { + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + work->nonces[1] = UINT32_MAX; + } else { + tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6])); + cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + const uint32_t startNounce = pdata[19]; + if (!use_compat_kernels[thr_id]) work->nonces[0] += startNounce; + be32enc(&endiandata[19], work->nonces[0]); + 
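// recompute the candidate hash on CPU and fulltest it against the target before accepting the GPU result +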
c11hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != UINT32_MAX) { + work->nonces[1] += startNounce; + be32enc(&endiandata[19], work->nonces[1]); + c11hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_c11(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/cuda_streebog.cu b/x11/cuda_streebog.cu new file mode 100644 index 0000000000..d6e3685e62 --- /dev/null +++ b/x11/cuda_streebog.cu @@ -0,0 +1,1008 @@ +/* + * Streebog GOST R 34.10-2012 CUDA implementation. + * + * https://tools.ietf.org/html/rfc6986 + * https://en.wikipedia.org/wiki/Streebog + * + * ==========================(LICENSE BEGIN)============================ + * + * @author Tanguy Pruvot - 2015 + * @author Alexis Provos - 2016 + */ + +// Further improved with shared memory utilization, Provos Alexis, June 2016 + +#include + +#include +#include +#include "neoscrypt/cuda_vectors.h" + +typedef unsigned char uchar; + +//#define FULL_UNROLL + +// Tables for function F +__constant__ uint2 T02[256] = { + {0x5B711FD0,0xE6F87E5C}, {0x0924FA16,0x25837780}, {0x852EA4A8,0xC849E07E}, {0x8F06C16A,0x5B4686A1}, + {0xD77B416E,0x0B32E9A2}, {0x67815C66,0xABDA37A4}, {0x1A686676,0xF61796A8}, {0x6391954B,0xF5DC0B70}, + {0xB7E64BF1,0x4862F38D}, {0x68BD85C5,0xFF5C629A}, {0xFCD75795,0xCB827DA6}, {0x69B9F089,0x66D36DAF}, + {0x483D83B0,0x356C9F74}, {0x238C99A1,0x7CBCECB1}, {0x31C4708D,0x36A702AC}, {0x2FBCDFD6,0x9EB6A8D0}, + {0xE5B3AE37,0x8B19FA51}, {0x8A127D0B,0x9CCFB540}, {0x08208F5A,0xBC0C78B5}, {0x2288ECED,0xE533E384}, + {0x77C15FD2,0xCEC2C7D3}, {0x505D0F5E,0xEC7817B6}, {0x8336871D,0xB94CC2C0}, {0xCB0B04AD,0x8C205DB4}, + {0x28A0892F,0x763C855B}, {0xF6FF3257,0x588D1B79}, {0x4311933E,0x3FECF69E}, {0x803A18C9,0x0FC0D39F}, + {0xF5F3AD83,0xEE010A26}, {0x411979A6,0x10EFE8F4}, {0x7DE93A10,0x5DCDA10C}, {0x1248E92C,0x4A1BEE1D}, + {0x21847339,0x53BFF2DB}, {0xA6A23D09,0xB4F50CCF}, {0xD84798CD,0x5FB4BC9C}, {0x071C56F9,0xE88A2D8B}, + {0x5A756A9C,0x7F777169}, {0xA0BA1EBC,0xC5F02E71}, {0x4215E672,0xA663F9AB}, {0xDE5FBB78,0x2EB19E22}, + {0x2594BA14,0x0DB9CE0F}, {0x97664D84,0x82520E63}, {0x0208EA98,0x2F031E6A}, {0xA1BE6BF0,0x5C7F2144}, + {0xD16362DB,0x7A37CB1C}, {0x4B311C64,0x83E08E2B}, {0xAB960E32,0xCF70479B}, {0xB9DEE71E,0x856BA986}, + {0x7AF56CE9,0xB5478C87}, {0x5F61D6FD,0xB8FE4288}, {0x966238C8,0x1BDD0156}, {0x3EF8A92E,0x62215792}, + {0x114476F8,0xFC97FF42}, {0x56452CEB,0x9D7D3508}, {0xE0A71256,0x4C90C9B0}, 
{0xFBCB016C,0x2308502D}, + {0xA7A64845,0x2D7A03FA}, {0xBFC6C4AB,0xF46E8B38}, {0xD477DEBA,0xBDBEF8FD}, {0xC8079B79,0x3AAC4CEB}, + {0xE8879D0C,0xF09CB105}, {0xAC8A58CB,0x27FA6A10}, {0x401D0CEA,0x8960E7C1}, {0x4A356928,0x1A6F811E}, + {0x73D196FF,0x90C4FB07}, {0x609D0A9F,0x43501A2F}, {0xC63F3796,0xF7A516E0}, {0xB8DA9252,0x1CE4A6B3}, + {0x38E08A9B,0x1324752C}, {0x3BEC154F,0xA5A86473}, {0x5549B33F,0x2BF12457}, {0x440DC5C7,0xD766DB15}, + {0x9E42B792,0xA7D179E3}, {0x61997FD3,0xDADF151A}, {0xC0271423,0x86A0345E}, {0x6DA939A4,0x38D5517B}, + {0x104003B4,0x6518F077}, {0xA5AEA2DD,0x02791D90}, {0x9C4A5D0A,0x88D26789}, {0x0A2865C2,0x930F66DF}, + {0x4509B08B,0x4EE9D420}, {0x6685292A,0x32553891}, {0xC533A842,0x412907BF}, {0x544DC673,0xB27E2B62}, + {0x6295E007,0x6C530445}, {0x5351908A,0x5AF406E9}, {0xC123616F,0x1F2F3B6B}, {0x5255E5C6,0xC37B09DC}, + {0xB1FE6844,0x3967D133}, {0xF0E711E2,0x298839C7}, {0x1964F9A2,0x409B87F7}, {0xDB4B0719,0xE938ADC3}, + {0xF9C3EBF4,0x0C0B4E47}, {0xD36B8843,0x5534D576}, {0xEB8B02D8,0x4610A05A}, {0x8232F251,0x20C3CDF5}, + {0xBEC2B1E7,0x6DE1840D}, {0xB0FA1D08,0xA0E8DE06}, {0x0D34333B,0x7B854B54}, {0xBCCA5B7F,0x42E29A67}, + {0xC437DD0E,0xD8A6088A}, {0xD943ED81,0xC63BB3A9}, {0x5E65A3B1,0x21714DBD}, {0xB5EEA169,0x6761EDE7}, + {0xD573ABF6,0x2431F7C8}, {0xE1A3671A,0xD51FC685}, {0x0410C92D,0x5E063CD4}, {0x2CB04002,0x283AB98F}, + {0xB2F2F790,0x8FEBC06C}, {0x6FA1D33C,0x17D64F11}, {0xA99EE4AA,0xE07359F1}, {0x74CDC006,0x784ED68C}, + {0xC73B42DA,0x6E2A19D5}, {0x1C7045C3,0x8712B416}, {0xED93216D,0x371582E4}, {0x4939F6FC,0xACE39041}, + {0x86223B7C,0x7EC5F121}, {0x2BAC16FB,0xC0B09404}, {0x9A527EBF,0xF9D74537}, {0xA3B68168,0x737C3F2E}, + {0xBAD278CA,0x33E7B8D9}, {0xC22FFEBB,0xA9A32A34}, {0xFEDFBD0D,0xE48163CC}, {0x6EA5A670,0x8E594024}, + {0x842AD1E4,0x51C6EF4B}, {0x279C508C,0x22BAD065}, {0x18608CEE,0xD91488C2}, {0x1F7CDA17,0x319EA549}, + {0x134C9C60,0xD394E128}, {0x72D5E3B3,0x094BF432}, {0xA4AAD791,0x9BF612A5}, {0xD26FFD0F,0xCCBBDA43}, + {0x946AD250,0x34DE1F3C}, {0x995EE16B,0x4F5B5468}, {0xEA8F7794,0xDF9FAF6F}, {0x70DD092B,0x2648EA58}, + {0x71D97C67,0xBFC7E56D}, {0x4F21D549,0xDDE6B2FF}, {0x3AE86003,0x3C276B46}, {0xAF86C71F,0x91767B4F}, + {0x35D4B9A0,0x68A13E78}, {0x030C9FD4,0xB68C115F}, {0x16582001,0x141DD2C9}, {0xDD5324AC,0x983D8F7D}, + {0xCC175254,0x64AA703F}, {0x8E02B426,0xC2C98994}, {0x9F46C2DE,0x3E5E76D6}, {0x587D8004,0x50746F03}, + {0x9272F1E5,0x45DB3D82}, {0x9B560BF3,0x60584A02}, {0x3FFCDC62,0xFBAE58A7}, {0x6CAD4CE8,0xA15A5E4E}, + {0xCE1FB8CC,0x4BA96E55}, {0xAE82B253,0x08F9747A}, {0xF7FB471B,0xC102144C}, {0xF3EB8E36,0x9F042898}, + {0xF2EFFB7A,0x068B27AD}, {0x8C0A5EBE,0xEDCA97FE}, {0xF4F7D8CF,0x778E0513}, {0xC32B8BF7,0x302C2501}, + {0x175C554D,0x8D92DDFC}, {0x46052F5F,0xF865C57F}, {0xA2B2F424,0xEAF3301B}, {0xBBD60D86,0xAA68B7EC}, + {0x0104754C,0x998F0F35}, {0x00000000,0x00000000}, {0x34D0CCEC,0xF12E314D}, {0x061823B5,0x710522BE}, + {0x30C005C1,0xAF280D99}, {0x5D693C65,0x97FD5CE2}, {0x33CC9A15,0x19A41CC6}, {0xF8C79EB8,0x95844172}, + {0x937684A9,0xDC5432B7}, {0x2490CF58,0x9436C13A}, {0x32C8EF59,0x802B13F3}, {0x7CED4F5C,0xC442AE39}, + {0xE3AB8D82,0xFA1CD8EF}, {0x4D293FD1,0xF2E5AC95}, {0x907A1B7D,0x6AD823E8}, {0x3CF043B6,0x4D2249F8}, + {0x79F9F33D,0x03CB9DD8}, {0x36D82674,0xDE2D2F27}, {0x891EE2DF,0x2A43A41F}, {0x1B6C133A,0x6F98999D}, + {0x3DF436FA,0xD4AD46CD}, {0x269825C0,0xBB35DF50}, {0x813E6D85,0x964FDCAA}, {0x7EE5A5C4,0xEB41B053}, + {0x8B160847,0x0540BA75}, {0xE7BB44AF,0xA41AE43B}, {0xD0671797,0xE3B8C429}, {0xEE9FBEB9,0x819993BB}, + {0xEC975421,0xAE9A8DD1}, {0x917E6E31,0xF3572CDD}, 
{0xE2AFF8CE,0x6393D7DA}, {0x37DC5338,0x47A22012}, + {0xC903EE35,0xA32343DE}, {0xA89A91E6,0x79FC56C4}, {0xDC5751E0,0x01B28048}, {0xE4B7DB7B,0x1296F564}, + {0x51597A12,0x75F71883}, {0xBDCE2E33,0xDB6D9552}, {0x1D74308F,0x1E9DBB23}, {0xFDD322D9,0x520D7293}, + {0x0C304677,0xE20A4461}, {0xB4EAD425,0xFEEEE2D2}, {0x20800675,0xCA30FDEE}, {0x47015A13,0x61EACA4A}, + {0x87264E30,0xE74AFE14}, {0x7BF119A5,0x2CC883B2}, {0xB3F682DC,0x1664CF59}, {0x1E78AF5B,0xA811AA7C}, + {0x648DC3B2,0x1D5626FB}, {0xDF5BCE34,0xB73E9117}, {0x6AB56F5D,0xD05F7CF0}, {0xCD132718,0xFD257F0A}, + {0x76C52A9E,0x574DC8E6}, {0x2EB8AA9A,0x0739A7E5}, {0x0F3CD9A3,0x5486553E}, {0xAA927B7E,0x56FF48AE}, + {0xAD8E2D87,0xBE756525}, {0xFFDBC841,0x7D0E6CF9}, {0x1450CA99,0x3B1ECCA3}, {0xE983E840,0x6913BE30}, + {0x956EA71C,0xAD511009}, {0x2DB4354E,0xB1B5B6BA}, {0x4E25A005,0x4469BDCA}, {0xCA0F71E1,0x15AF5281}, + {0x8D0E2BF2,0x744598CB}, {0x2AA863B7,0x593F9B31}, {0x29A4FC63,0xEFB38A6E}, {0x4C2D4A9D,0x6B6AA3A0}, + {0xE6BF31E3,0x3D95EB0E}, {0x1554BFD5,0xA291C396}, {0xEF9BCBF5,0x18169C8E}, {0x9D4E2846,0x115D68BC}, + {0xFACF7420,0xBA875F18}, {0xB6E23EBD,0xD1EDFCB8}, {0xF1E364AE,0xB00736F2}, {0x6589B6FE,0x84D929CE}, + {0xDA4F7255,0x70B7A2F6}, {0x5C6D4929,0x0E7253D7}, {0x574159A7,0x04F23A3D}, {0x0B2C108E,0x0A8069EA}, + {0x6BB11A11,0x49D073C5}, {0x39E4FFD7,0x8AAB7A19}, {0x0E38ACEF,0xCD095A0B}, {0x5979F548,0xC9FB6036}, + {0xD67F3422,0x92BDE697}, {0x0514BC61,0xC78933E1}, {0x75C9B54A,0xE1C1D9B9}, {0xCF1BCD80,0xD2266160}, + {0x78FD8671,0x9A4492ED}, {0x881A9793,0xB3CCAB2A}, {0x7FE1D088,0x72CEBF66}, {0x985A9427,0xD6D45B5D}, +}; + +__constant__ uint2 T12[256] = { + {0x8C3F55DE,0xC811A805}, {0x96B50619,0x65F5B431}, {0xD6706E43,0xF74F96B1}, {0xCB43D336,0x859D1E8B}, + {0xCCFA3D84,0x5AAB8A85}, {0xC295FCFD,0xF9C7BF99}, {0xDE4B630F,0xA21FD5A1}, {0x3B8B456D,0xCDB3EF76}, + {0x7CF7C385,0x803F59F8}, {0x5F31913C,0xB27C73BE}, {0x33B04821,0x98E3AC66}, {0x26B8F818,0xBF61674C}, + {0xC4C130C8,0x0FFBC995}, {0x10761A98,0xAAA08620}, {0x210116AA,0x6057F342}, {0x0654CC35,0xF63C760C}, + {0x667D9042,0x2DDB45CC}, {0x4BD40382,0xBCF45A96}, {0xEF3C6F3D,0x68E8A0C3}, {0x69FF73BC,0xA7BD92D2}, + {0x01ED2287,0x290AE202}, {0xE885818F,0xB7DE34CD}, {0xDD61059B,0xD901EEA7}, {0x19A03553,0xD6FA2732}, + {0x74CCCEC9,0xD56F1AE8}, {0x2E83F554,0xEA31245C}, {0xA07BE499,0x7034555D}, {0x56E7BEF7,0xCE26D2AC}, + {0xA5054E38,0xFD161857}, {0x527436D1,0x6A0E7DA4}, {0x1CDE9FF2,0x5BD86A38}, {0x31770C32,0xCAF77562}, + {0xE279C8D0,0xB09AAED9}, {0xC60674DB,0x5DEF1091}, {0x515E5045,0x111046A2}, {0x729802FC,0x23536CE4}, + {0xF5B63CFA,0xC50CBCF7}, {0xCD171F03,0x73A16887}, {0xD9F28DBD,0x7D2941AF}, {0x5A4F3B9D,0x3F5E3EB4}, + {0x1B677140,0x84EEFE36}, {0xE7076271,0x3DB8E3D3}, {0xF20FD248,0x1A3A28F9}, {0xB49E7627,0x7EBC7C75}, + {0xC7EB565C,0x74E5F293}, {0x4F478BA4,0x18DCF59E}, {0xA9ADCB52,0x0C6EF44F}, {0x98DAC760,0xC699812D}, + {0x6E469D0E,0x788B06DC}, {0x7521EC4E,0xFC65F8EA}, {0x9E8E0B55,0x30A5F721}, {0xBCA57B6B,0x2BEC3F65}, + {0xBAF1B75E,0xDDD04969}, {0xE394EA57,0x99904CDB}, {0xE6EA40F6,0x14B201D1}, {0x41284ADD,0xBBB0C082}, + {0xBF8F1DFF,0x50F20463}, {0x93CBACB8,0xE8D7F93B}, {0x477C86E8,0x4D8CB68E}, {0x92268E3F,0xC1DD1B39}, + {0x09D62FCB,0x7C5AA112}, {0xDB35C9AE,0x2F3D98AB}, {0x2BFD5FF5,0x67136956}, {0x36CEE280,0x15C1E16C}, + {0xF8F39B17,0x1D7EB2ED}, {0xB00DFE01,0xDA94D37D}, {0x760B8ADA,0x877BC3EC}, {0xE153AE44,0xCB8495DF}, + {0xB7B410B3,0x05A24773}, {0x3C32ABDF,0x12857B78}, {0x6812513B,0x8EB770D0}, {0xD2E3E665,0x536739B9}, + {0x71B26468,0x584D57E2}, {0xC9849725,0xD789C78F}, {0x7D1AE102,0xA935BBFA}, 
{0xDFA64188,0x8B1537A3}, + {0xC378DE7A,0xD0CD5D9B}, {0x4D80CFB7,0x4AC82C9A}, {0x83BDB620,0x42777F1B}, {0x1D33BD75,0x72D2883A}, + {0xAB6A8F41,0x5E7A2D4B}, {0xBB1C95D9,0xF4DAAB6B}, {0xFD8D31B6,0x905CFFE7}, {0x119B381F,0x83AA6422}, + {0x42022C49,0xC0AEFB84}, {0x63033AE3,0xA0F908C6}, {0x04938826,0xA428AF08}, {0x1A8A53C7,0xADE41C34}, + {0x77E6A85D,0xAE7121EE}, {0x25929E8C,0xC47F5C4A}, {0x55CDD863,0xB538E9AA}, {0xDAD8EB29,0x06377AA9}, + {0xB3279895,0xA18AE87B}, {0x35E48414,0x6EDFDA6A}, {0x825094A7,0x6B7D9D19}, {0xA4E86CBF,0xD41CFA55}, + {0xEA42C59C,0xE5CAEDC9}, {0x0E6FC179,0xA36C351C}, {0x6FABBF89,0x5181E4DE}, {0x184D17D4,0xFFF0C530}, + {0x84045892,0x9D41EB15}, {0x28D73961,0x1C0D5250}, {0x0CA8856A,0xF178EC18}, {0x8EF811CD,0x9A057101}, + {0x3EF5EFCC,0x4091A27C}, {0x9F6329D2,0x19AF1523}, {0xF91EB990,0x347450EF}, {0x8DD27759,0xE11B4A07}, + {0xFC601331,0xB9561DE5}, {0x2DA993C0,0x912F1F5A}, {0x5BA2191A,0x1654DCB6}, {0x8A6B99EB,0x3E2DDE09}, + {0x0F82E3FE,0x8A66D71E}, {0xD55A08D7,0x8C51ADB7}, {0x8941FF7F,0x4533E50F}, {0xBD4859EC,0x02E6DD67}, + {0x5DF6D52F,0xE068AABA}, {0xFF4A75A5,0xC24826E3}, {0x88ACDDF8,0x6C39070D}, {0x4691A46F,0x6486548C}, + {0x135C7C0C,0xD1BEBD26}, {0x8F15334A,0xB30F9303}, {0xC1BF9A69,0x82D9849F}, {0x5420FAE4,0x9C320BA8}, + {0xAFF90767,0xFA528243}, {0xE968A308,0x9ED4D6CF}, {0x2C44B147,0xB825FD58}, {0x5EDCB3BB,0x9B7691BC}, + {0x48FE6516,0xC7EA6190}, {0x817AF233,0x1063A61F}, {0x3409A693,0x47D53868}, {0x4C6DED30,0x63C2CE98}, + {0x6C81D91D,0x2A9FDFD8}, {0x032A6694,0x7B1E3B06}, {0xFBD9FD83,0x666089EB}, {0x7375207B,0x0A598EE6}, + {0x0AFC495F,0x07449A14}, {0xB6593234,0x2CA8A571}, {0x45BBC2FB,0x1F986F8A}, {0x50B372C2,0x381AA4A0}, + {0xD81FAF3A,0x5423A3AD}, {0x8B86BB6C,0x17273C0B}, {0xC869B5A2,0xFE83258D}, {0xD1C980F1,0x287902BF}, + {0x6B3837AF,0xF5A94BD6}, {0xB2CABA12,0x88800A79}, {0x083B0D4C,0x55504310}, {0x07B9EEB2,0xDF36940E}, + {0x6790B2C5,0x04D1A7CE}, {0xF125B4DC,0x612413FF}, {0xC52C124F,0x26F12B97}, {0xA62F28AC,0x86082351}, + {0x9937E5E7,0xEF93632F}, {0x293A1BE6,0x3507B052}, {0x570A9C70,0xE72C30AE}, {0xAE1425E0,0xD3586041}, + {0xD79D4CC4,0xDE4574B3}, {0x40C5685A,0x92BA2280}, {0xDC8C271C,0xF00B0CA5}, {0xF69C5A6E,0xBE1287F1}, + {0xB1E0DC86,0xF39E317F}, {0x20EC342D,0x495D1140}, {0x3F18CD4B,0x699B407E}, {0x6AD51528,0xDCA3A9D4}, + {0x79896924,0x0D1D14F2}, {0x00000000,0x00000000}, {0xA196C61E,0x593EB75F}, {0x0B116BD8,0x2E4E7816}, + {0x58887F8E,0x6D4AE7B0}, {0x872E3E06,0xE65FD013}, {0xD30EC4E2,0x7A6DDBBB}, {0xCAAEF1B1,0xAC97FC89}, + {0x1E19DBE1,0x09CCB33C}, {0x62EE1864,0x89F3EAC4}, {0xAA87ADC6,0x7770CF49}, {0x6557F6D6,0x56C57ECA}, + {0x6D6CFB9A,0x03953DDA}, {0x4456E07C,0x36928D88}, {0x959F608D,0x1EEB8F37}, {0x4EAAA923,0x31D6179C}, + {0xE5C02662,0x6FAC3AD7}, {0x53991456,0x43049FA6}, {0xC052B8EE,0xABD3669D}, {0xA7C20A2B,0xAF02C153}, + {0x3723C007,0x3CCB036E}, {0x90E1CA2C,0x93C9C23D}, {0x2F6ED7D3,0xC33BC65E}, {0x9758249E,0x4CFF5633}, + {0x325D6AA6,0xB1E94E64}, {0x9472420A,0x37E16D35}, {0xBE623F78,0x79F8E661}, {0x02C74413,0x5214D904}, + {0xF0C8965B,0x482EF1FD}, {0xEC1609A9,0x13F69BC5}, {0x14E592BE,0x0E882928}, {0x2A107D72,0x4E198B54}, + {0xEBAFE71B,0xCCC00FCB}, {0x222B703E,0x1B49C844}, {0xA840E9D5,0x2564164D}, {0x1FF4F966,0x20C6513E}, + {0x910CE8AB,0xBAC3203F}, {0x61C47EF0,0xF2EDD1C2}, {0xACD361F3,0x814CB945}, {0x4A392105,0x95FEB894}, + {0x1622D6AD,0x5C9CF02C}, {0xF77178E9,0x971865F3}, {0x9BF0A1F4,0xBD87BA2B}, {0x59655D09,0x444005B2}, + {0x247FBC0B,0xED75BE48}, {0x17CFF42A,0x7596122E}, {0x85E97A15,0xB44B0917}, {0x2755DA9F,0x966B854E}, + {0x49134791,0xEEE08392}, {0x23C652B9,0x32432A46}, 
{0xAD3E4374,0xA8465B47}, {0x12B15E8B,0xF8B45F24}, + {0x78644BA3,0x2417F6F0}, {0x7FDDA511,0xFB2162FE}, {0x9DA46DC1,0x4BBBCC27}, {0xD024A276,0x0173E0BD}, + {0xA2BCA08A,0x22208C59}, {0xB836F34D,0x8FC4906D}, {0x3A6667EA,0xE4B90D74}, {0x705F46EF,0x7147B5E0}, + {0x1508B039,0x2782CB2A}, {0xF45B1E7D,0xEC065EF5}, {0xCFD05B10,0x21B5B183}, {0x60295C77,0xDBE733C0}, + {0x394C017E,0x9FA73672}, {0x86C31C81,0xCF553211}, {0x0D45A7ED,0xD8720E1A}, {0x3DDF8958,0x3B8F997A}, + {0xEDFB2B2E,0x3AFC79C7}, {0x43EF0ECE,0xE9A41986}, {0x7B4E2D37,0x5F09CDF6}, {0xFA34DF04,0x4F6A6BE9}, + {0x38A123F9,0xB6ADD470}, {0x057EAAA1,0x8D224D0A}, {0x5C1BF7A8,0xC96248B8}, {0x309A2EB5,0xE3FD9760}, + {0xA351820D,0x0B2A6E5B}, {0xFEA75722,0xEB42C4E1}, {0x9A1D8373,0x948D5829}, {0x64BAD451,0x7FCF9CC8}, + {0xD4B72A50,0xA55B4FB5}, {0xCE3D7997,0x08BF5381}, {0xE42D04E5,0x46A6D8D5}, {0x7E308796,0xD22B80FC}, + {0xB57354A0,0x57B69E77}, {0x8097D0B4,0x3969441D}, {0xF3E2F0CF,0x3330CAFB}, {0xE0BE8CC3,0xE28E77DD}, + {0x9C494F46,0x62B12E25}, {0xB9DBD1CA,0xA6CE726F}, {0xEED14DBA,0x41E242C1}, {0x7AA30FB0,0x76032FF4}, +}; + +__constant__ uint2 T22[256] = { + {0x3ACDE4CC,0x45B268A9}, {0x84549D08,0xAF7F0BE8}, {0xC1468263,0x048354B3}, {0xC80EFED2,0x925435C2}, + {0x7FDFFBA7,0xEE4E37F2}, {0x0C60F14D,0x167A3392}, {0xEA03E584,0xFB123B52}, {0xFDBB9007,0x4A0CAB53}, + {0x0F788A19,0x9DEAF638}, {0x8F0CB32A,0xCB48EC55}, {0xD6FEF7E0,0xB59DC4B2}, {0xF4F3ECB6,0xDCDBCA22}, + {0x549A9C40,0x11DF5813}, {0x568ACED3,0xE33FDEDF}, {0x4322E9C3,0xA0C1C812}, {0x58FA6D0D,0x07A56B81}, + {0xB1E1F3DD,0x77279579}, {0x422AC004,0xD9B18B74}, {0xFFABC294,0xB8EC2D9F}, {0x2D75914F,0xF4ACF8A8}, + {0xEF2B6878,0x7BBF69B1}, {0x487AC7E1,0xC4F62FAF}, {0xC67E5D0C,0x76CE809C}, {0x92E4C14C,0x6711D88F}, + {0x243DEDFE,0x627B99D9}, {0xDFB68B51,0x234AA5C3}, {0x262DBF6D,0x909B1F15}, {0x4B62BCB5,0x4F66EA05}, + {0x52AA6AE8,0x1AE2CF5A}, {0xD0CE0148,0xBEA053FB}, {0xE66314C9,0xED6808C0}, {0x15A82710,0x43FE16CD}, + {0xA06970F6,0xCD049231}, {0x97CC4CB0,0xE7BC8A6C}, {0xFCB3B9C0,0x337CE835}, {0x7CC780F3,0x65DEF258}, + {0x4132BB50,0x52214EDE}, {0x90F493DF,0x95F15E43}, {0x5DD2E0F1,0x87083962}, {0xFB8B66AF,0x41313C1A}, + {0x51B211BC,0x91720AF0}, {0xD4EEA573,0x477D427E}, {0xF6E3BE25,0x2E3B4CEE}, {0xEB0BCC43,0x82627834}, + {0x78E724C8,0x9C03E3DD}, {0xD9867DF9,0x2877328A}, {0xE243B0F2,0x14B51945}, {0xF7EB97E2,0x574B0F88}, + {0x9AA4943A,0x88B6FA98}, {0xCB168586,0x19C4F068}, {0xAF11FAEF,0x50EE6409}, {0xC04EABA4,0x7DF317D5}, + {0x98B4C6A9,0x7A567C54}, {0x4F42188E,0xB6BBFB80}, {0x3BC5CD0B,0x3CC22BCF}, {0xAA397713,0xD04336EA}, + {0xEC33132C,0xF02FAC1B}, {0xF0D3488D,0x2506DBA7}, {0xF2C31A1E,0xD7E65D6B}, {0x1FF820F5,0x5EB9B216}, + {0xC46E0F9F,0x842E0650}, {0x9E843001,0x716BEB1D}, {0xAB315ED4,0xA933758C}, {0xA2792265,0x3FE414FD}, + {0x1EF00932,0x27C9F170}, {0x70A771BE,0x73A4C1CA}, {0xE76B3D0E,0x94184BA6}, {0x8C14C87E,0x40D829FF}, + {0xC77674CB,0x0FBEC3FA}, {0x4A6A9572,0x3616A963}, {0xC25EF937,0x8F139119}, {0x5AEA3F9E,0xF545ED4D}, + {0x50BA387B,0xE8024996}, {0x0B582E22,0x6437E7BD}, {0xE053E261,0xE6559F89}, {0x05288DFC,0x80AD52E3}, + {0xE34B9935,0x6DC55A23}, {0x1AD0AD09,0xDE14E0F5}, {0xA659865E,0xC6390578}, {0x09487CB1,0x96D76171}, + {0x21156002,0xE2D6CB3A}, {0x779FAED1,0x01E915E5}, {0x6A77DCB7,0xADB0213F}, {0xB9A1A6AB,0x9880B76E}, + {0x8644CF9B,0x5D9F8D24}, {0xC5662658,0xFD5E4536}, {0x9BACBDFD,0xF1C6B9FE}, {0xBE9979C4,0xEACD6341}, + {0x08405576,0xEFA72217}, {0xD88E543E,0x510771EC}, {0x671F043D,0xC2BA51CB}, {0x71AF5879,0x0AD482AC}, + {0x5CDAC936,0xFE787A04}, {0x8E049AED,0xB238AF33}, {0x4972EE26,0xBD866CC9}, 
{0xBD810290,0x615DA6EB}, + {0x8B2C1711,0x3295FDD0}, {0x73BF0AEA,0xF8340460}, {0x758FFC42,0xF3099329}, {0x7DCFA934,0x1CAEB13E}, + {0x1188832B,0xBA230748}, {0x874CE65C,0x24EFCE42}, {0xB0E9DA1A,0x0E57D61F}, {0xF99B343C,0xB3D1BAD6}, + {0x893C4582,0xC0757B1C}, {0x403A9297,0x2B510DB8}, {0xF1DB614A,0x5C7698C1}, {0xD5E68CB4,0x3E0D0118}, + {0x855CB4CF,0xD60F488E}, {0xF3CB33D9,0xAE961E0D}, {0x14A00ED7,0x3A8E55AB}, {0x623789C1,0x42170328}, + {0x9C946292,0x838B6DD1}, {0xED3B3AEB,0x895FEF7D}, {0x4E4A3149,0xCFCBB8E6}, {0x2F65C3DC,0x064C7E64}, + {0x4C5A63DA,0x3D2B3E2A}, {0xA9210C47,0x5BD3F340}, {0xA1615931,0xB474D157}, {0x1DE87266,0xAC5934DA}, + {0x7AF7765B,0x6EE36511}, {0x16B05C44,0xC86ED367}, {0x201D49C5,0x9BA6885C}, {0x88346C45,0xB905387A}, + {0xBAB9DDFF,0x131072C4}, {0xA751AF99,0xBF49461E}, {0x1CE05BA1,0xD52977BC}, {0x6027DB52,0xB0F785E4}, + {0x6E57788C,0x546D30BA}, {0x650F56AE,0x305AD707}, {0x612FF295,0xC987C682}, {0xF5FBC571,0xA5AB8944}, + {0x59F244CA,0x7ED528E7}, {0x2C7DB888,0x8DDCBBCE}, {0x328DB1BA,0xAA154ABE}, {0x93ECE88B,0x1E619BE9}, + {0xE813B717,0x09F2BD9E}, {0x285D1CB3,0x7401AA4B}, {0x3195CAEE,0x21858F14}, {0x1398D1B8,0x48C38184}, + {0xB2F98889,0xFCB750D3}, {0x8D1CE1B9,0x39A86A99}, {0xE473465A,0x1F888E0C}, {0x76978716,0x78995683}, + {0xEE2341BF,0x02CF2AD7}, {0xB3F1A14E,0x85C713B5}, {0x2B4567E7,0xFF916FE1}, {0xB7D10575,0x7C1A0230}, + {0x5ECA9BA5,0x0C98FCC8}, {0xDA9E06AD,0xA3E7F720}, {0xBBB1F438,0x6A6031A2}, {0x7ED7D260,0x973E7494}, + {0x18C0FF9A,0x2CF46639}, {0x68678E24,0x5F50A7F3}, {0xA449D4CD,0x34D983B4}, {0x5592B587,0x68AF1B75}, + {0x2E6DEA1B,0x7F3C3D02}, {0x45121F6B,0xABFC5F5B}, {0x29553574,0x0D71E92D}, {0x6D4F03D8,0xDFFDF510}, + {0x9F8C19C6,0x081BA87B}, {0xAC0981BB,0xDB7EA1A3}, {0x66172DFA,0xBBCA12AD}, {0x010829C7,0x79704366}, + {0x7BFF5F9C,0x17932677}, {0x00000000,0x00000000}, {0xC906D715,0xEB2476A4}, {0x0738DF6F,0x724DD42F}, + {0x38DDB65F,0xB752EE65}, {0x3DF53BA3,0x37FFBC86}, {0xB5C157E6,0x8EFA84FC}, {0x272596AA,0xE9EB5C73}, + {0x2535C439,0x1B0BDABF}, {0x2A4D4E20,0x86E12C87}, {0xCE3E087A,0x9969A28B}, {0x9D9C4B55,0xFAFB2EB7}, + {0xB6D92CB2,0x056A4156}, {0xDEBEA296,0x5A3AE6A5}, {0xA8292580,0x22A3B026}, {0x36AD1581,0x53C85B3B}, + {0x17B87583,0xB11E9001}, {0x3FE56930,0xC51F3A4A}, {0xCF3621BD,0xE019E1ED}, {0x91FCBA18,0xEC811D25}, + {0x4D524A1D,0x445B7D4C}, {0xDCAEF005,0xA8DA6069}, {0x309DE329,0x58F5CC72}, {0x6B7FF570,0xD4C06259}, + {0x39D59F98,0xCE22AD03}, {0x47024DF8,0x591CD997}, {0x03187B54,0x8B90C5AA}, {0xC356D0F0,0xF663D27F}, + {0x35B56ED5,0xD8589E91}, {0xD3D67A1C,0x35309651}, {0xCD26732E,0x12F96721}, {0x441A36AC,0xD28C1C3D}, + {0x64077F69,0x492A9461}, {0x6F5F514B,0x2D1D73DC}, {0x0D68D88A,0x6F0A70F4}, {0xCA1EAC41,0x60B4B30E}, + {0x3385987D,0xD36509D8}, {0x0630F6A8,0x0B3D9749}, {0x96C46577,0x9ECCC90A}, {0xAD01A87C,0xA20EE2C5}, + {0x0E70A3DE,0xE49AB55E}, {0x82646BA0,0xA4429CA1}, {0xDB962F6A,0xDA97B446}, {0xD7F6DE27,0xCCED87D4}, + {0x37A53C46,0x2AB8185D}, {0xE15BCBA6,0x9F25DCEF}, {0xFEA3EB53,0xC19C6EF9}, {0x1BD884CE,0xA764A393}, + {0x817C10F4,0x2FD2590B}, {0x80743933,0x56A21A6D}, {0x79EF0D0F,0xE573A0BB}, {0x95DC1E23,0x155C0CA0}, + {0x94D437E4,0x6C2C4FC6}, {0x23053291,0x10364DF6}, {0x836C4267,0xDD32DFC7}, {0x99BCEF6E,0x03263F32}, + {0xE57B6F9D,0x66F8CD6A}, {0x5BE21659,0x8C35AE2B}, {0x1290F87F,0x31B3C2E2}, {0xBF915003,0x93BD2027}, + {0x220D1B56,0x69460E90}, {0xAE19D328,0x299E276F}, {0x53A2432F,0x63928C3C}, {0xE91B9ED0,0x7082FEF8}, + {0x3EED40F7,0xBC6F792C}, {0xD2DE53DB,0x4C40D537}, {0x5FC2B262,0x75E8BFAE}, {0xA541FD0A,0x4DA9C0D2}, + {0x3CFD1264,0x4E8FFFE0}, {0x696FA7E3,0x2620E495}, 
{0xB8A98F6C,0xE1F0F408}, {0xDDA6D9C2,0xD1AA230F}, + {0xD1C6288F,0xC7D0109D}, {0x7487D585,0x8A79D04F}, {0xA3710BA2,0x4694579B}, {0xFA834F68,0x38417F7C}, + {0x0A5007E5,0x1D47A4DB}, {0x460A643F,0x206C9AF1}, {0x34BD4712,0xA128DDF7}, {0x72B7232D,0x81444706}, + {0x02105293,0xF2E086CC}, {0xBC892B57,0x182DE58D}, {0xF8931DFB,0xCAA1F9B0}, {0xCC2E5AE9,0x6B892447}, + {0x0420A43B,0xF9DD1185}, {0x8A243ED6,0x4BE5BEB6}, {0x19C8D65D,0x5584255F}, {0x633FA006,0x3B67404E}, + {0x6C472A1F,0xA68DB676}, {0xB4C97E21,0xF78AC79A}, {0x1080AAEC,0xC353442E}, {0x5782E714,0x9A4F9DB9}, +}; + +__constant__ uint2 T32[256] = { + {0x2C9B3220,0x05BA7BC8}, {0xF8B65E4F,0x31A54665}, {0x7547F4D4,0xB1B651F7}, {0x7BA46682,0x8BFA0D85}, + {0xA16A98BB,0x85A96C5A}, {0x08EB79C9,0x990FAEF9}, {0x47F4A62D,0xA15E37A2}, {0x5D27741E,0x76857DCD}, + {0x0A1820BC,0xF8C50B80}, {0x01F7A2B4,0xBE65DCB2}, {0x6F9426E7,0x666D1B98}, {0x53C4E648,0x4CC921BF}, + {0x93D9CA42,0x95410A0F}, {0x647BA4EF,0x20CDCCAA}, {0x890A1871,0x429A4060}, {0x9B32B38B,0x0C4EA4F6}, + {0xDE354CD3,0xCCDA362D}, {0x7C5B2FA9,0x96DC23BC}, {0xAA851AB3,0xC309BB68}, {0x3648E013,0xD26131A7}, + {0x41FC4DB2,0x021DC529}, {0x704BE48A,0xCD5ADAB7}, {0x84ED71E6,0xA77965D9}, {0x1734BBA4,0x32386FD6}, + {0x38AB7245,0xE82D6DD5}, {0x6177B4B1,0x5C2147EA}, {0xCF091CE8,0x5DA1AB70}, {0x72B8BDFF,0xAC907FCE}, + {0x972278A8,0x57C85DFD}, {0x6B6F940D,0xA4E44C6A}, {0x4F1FDFE4,0x3851995B}, {0xED71BC9E,0x62578CCA}, + {0xC01D2C0A,0xD9882BB0}, {0x113C503B,0x917B9D5D}, {0xA87643C6,0xA2C31E11}, {0xA399C1CE,0xE463C923}, + {0x7EA876DC,0xF71686C5}, {0xE096D509,0x87B4A973}, {0x9D3A5814,0xAF0D567D}, {0x59DCC6F4,0xB40C2A3F}, + {0x95D121DD,0x3602F884}, {0x9836484A,0xD3E1DD3D}, {0xA46688E5,0xF945E71A}, {0xB2A591F5,0x7518547E}, + {0x50C01D89,0x93665874}, {0x658C065B,0x9EA81018}, {0xBC4603A3,0x4F54080C}, {0x5137BF3D,0x2D0384C6}, + {0xEC861E2A,0xDC325078}, {0x79573FF7,0xEA30A8FC}, {0xCA050CB6,0x214D2030}, {0x8016C30C,0x65F0322B}, + {0x1B247087,0x69BE96DD}, {0x81E161B8,0xDB95EE99}, {0xD9CA05F8,0xD1FC1814}, {0xCC0DE729,0x820ED2BB}, + {0x430F14C7,0x63D76050}, {0xA09D3A0F,0x3BCCB0E8}, {0x573F54A2,0x8E40764D}, {0xE16177BD,0x39D175C1}, + {0x734F1F4B,0x12F5A37C}, {0x1FDFC26D,0xAB37C12F}, {0x395CD0F1,0x5648B167}, {0x37BF42A7,0x6C04ED15}, + {0x14304065,0xED97161D}, {0xAB72B807,0x7D6C67DA}, {0xBA4EE83C,0xEC17FA87}, {0x0304FBC1,0xDFAF79CB}, + {0x71BC463E,0x733F0605}, {0x87E98A27,0x78D61C12}, {0x77B4ADA1,0xD07CF48E}, {0x6C90DD26,0xB9C26253}, + {0x60801605,0xE2449B58}, {0xF941FCFB,0x8FC09AD7}, {0x4BE46D0E,0xFAD8CEA9}, {0x0608EB9F,0xA343F28B}, + {0x4917347B,0x9B126BD0}, {0xE7699C22,0x9A92874A}, {0xC4E69EE0,0x1B017C42}, {0x0EE39256,0x3A4C5C72}, + {0x3EA399DA,0x4B6E9F5E}, {0x5AD83D35,0x6BA353F4}, {0x4C1B2425,0xE7FEE090}, {0x2587E95D,0x22D00983}, + {0x0F1430E2,0x842980C0}, {0x861E2893,0xC6B3C0A0}, {0x19D729F2,0x087433A4}, {0xD42D6C6F,0x341F3DAD}, + {0xFBB2A58E,0xEE0A3FAE}, {0x90DD3183,0x4AEE73C4}, {0xB1A16A34,0xAAB72DB5}, {0x5E238FDF,0xA92A0406}, + {0x686B6FCC,0x7B4B35A1}, {0xF4A6956C,0x6A23BF6E}, {0x851AD352,0x191CB96B}, {0xD6DE351A,0x55D598D4}, + {0xF2AE7EF3,0xC9604DE5}, {0xA981E172,0x1CA6C2A3}, {0xAD7A5398,0xDE2F9551}, {0x56C8F616,0x3025AAFF}, + {0x1E2860D9,0x15521D9D}, {0xFA45073A,0x506FE31C}, {0x2B647B0B,0x189C55F1}, {0xAE7EA859,0x0180EC9A}, + {0x050C105E,0x7CEC8B40}, {0x8BF94104,0x2350E519}, {0x55CC0DD7,0xEF8AD334}, {0x6D677F92,0x07A7BEE1}, + {0x0DE76997,0xE5E325B9}, {0xA26E637A,0x5A061591}, {0x18208B46,0xB611EF16}, {0xB7A981AB,0x09F4DF3E}, + {0xE87DACC0,0x1EBB078A}, {0xB65E231F,0xB791038C}, {0x74B05660,0x0FD38D45}, 
{0xC1EA8EBE,0x67EDF702}, + {0x831238CD,0xBA5F4BE0}, {0xCEFEBE5C,0xE3C477C2}, {0x354C1BD2,0x0DCE486C}, {0x16C31910,0x8C5DB364}, + {0xA7627324,0x26EA9ED1}, {0xEF82E5EB,0x039D29B3}, {0xCBF2AE02,0x9F28FC82}, {0xF05D2786,0xA8AAE89C}, + {0x2774B028,0x431AACFA}, {0x31B7A938,0xCF471F9E}, {0xE3922EC8,0x581BD0B8}, {0x400BEF06,0xBC78199B}, + {0xBF42F862,0x90FB71C7}, {0x46030499,0x1F3BEB10}, {0xB55AD8DE,0x683E7A47}, {0xA695D190,0x988F4263}, + {0x6E638453,0xD808C72A}, {0xC319D7CB,0x0627527B}, {0xD72997AE,0xEBB04466}, {0xE2658C7C,0xE67E0C0A}, + {0xB056C880,0x14D2F107}, {0x30400B8C,0x7122C32C}, {0xD5DACEDB,0x8A7AE11F}, {0xE98A0E74,0xA0DEDB38}, + {0xDCC615A6,0xAD109354}, {0xF655CC19,0x0BE91A17}, {0xB8BDB149,0x8DDD5FFE}, {0xAF890AED,0xBFE53028}, + {0xB4AD7A6A,0xD65BA6F5}, {0x2997227E,0x7956F088}, {0x32B352F9,0x10E86655}, {0xDACEFE39,0x0E5361DF}, + {0x9FC90161,0xCEC7F304}, {0x677F5F2E,0xFF62B561}, {0xD22587F0,0x975CCF26}, {0x543BAF63,0x51EF0F86}, + {0x10CBF28F,0x2F1E41EF}, {0xBBB94A88,0x52722635}, {0x3344F04D,0xAE8DBAE7}, {0x6688FD9A,0x410769D3}, + {0x34BBB966,0xB3AB94DE}, {0x8DF1AA9B,0x80131792}, {0xC5113C54,0xA564A0F0}, {0xBDB1A117,0xF131D4BE}, + {0xEA8EF5B5,0x7F71A2F3}, {0xC8F655C3,0x40878549}, {0x44F05DEC,0x7EF14E69}, {0xF55137D8,0xD44663DC}, + {0x523344FC,0xF2ACFD0D}, {0x00000000,0x00000000}, {0x8EF5515A,0x5FBC6E59}, {0xF1AA8532,0x16CF342E}, + {0xDB395C8D,0xB036BD6D}, {0xDD31B712,0x13754FE6}, {0x2D6C9094,0xBBDFA77A}, {0x3A582B30,0x89E7C8AC}, + {0xCDFA459D,0x3C6B0E09}, {0xC7E26521,0xC4AE0589}, {0x7F5FD468,0x49735A77}, {0x1D2C9B18,0xCAFD6456}, + {0x2F9FC9E1,0xDA150203}, {0x94268369,0x88672436}, {0x3BAF8984,0x3782141E}, {0x24704BE9,0x9CB5D531}, + {0x1AD3D233,0xD7DB4A6F}, {0x2A93D9BF,0xA6F98943}, {0x8A0EE3B0,0x9D3539AB}, {0x15C7E2D1,0x53F2CAAF}, + {0x76430F15,0x6E19283C}, {0x6384EDC4,0x3DEBE293}, {0x208BF903,0x5E3C82C3}, {0xB94A13FD,0x33B8834C}, + {0x2E686B55,0x6470DEB1}, {0x7A53C436,0x359FD137}, {0x02F35975,0x61CAA579}, {0x82E59A79,0x043A9752}, + {0x2683129C,0xFD7F7048}, {0x699CCD78,0xC52EE913}, {0x7DAC8D1D,0x28B9FF0E}, {0x78A09D43,0x5455744E}, + {0xB3523341,0xCB7D88CC}, {0x4A13CFBA,0x44BD121B}, {0xFDBA4E11,0x4D49CD25}, {0x8C06082F,0x3E76CB20}, + {0x2278A076,0x3FF627BA}, {0x04FBB2EA,0xC28957F2}, {0xE46D67E3,0x453DFE81}, {0x3DA7621B,0x94C1E695}, + {0xFF491764,0x2C83685C}, {0xFC4DECA5,0xF32C1197}, {0x922E68F6,0x2B24D6BD}, {0x9AC5113F,0xB22B7844}, + {0xD1217C31,0x48F3B6ED}, {0xBEB55AD6,0x2E9EAD75}, {0x5FD42D6B,0x174FD8B4}, {0x1238ABFA,0x4ED4E496}, + {0xFEBEB5D0,0x92E6B4EE}, {0x0BEF8208,0x46A0D732}, {0xA5912A51,0x47203BA8}, {0xE69E3E96,0x24F75BF8}, + {0x13CF094E,0xF0B13824}, {0xC901F777,0xFEE259FB}, {0x091CDB7D,0x276A724B}, {0xEE75475F,0xBDF8F501}, + {0x4DEC8691,0x599B3C22}, {0x99C1EAFE,0x6D84018F}, {0x1CDB39AC,0x7498B8E4}, {0x217C5BB7,0xE0595E71}, + {0x3C50C0AF,0x2AA43A27}, {0x3F543B6E,0xF50B43EC}, {0x62734F70,0x838E3E21}, {0x4507FF58,0xC09492DB}, + {0xDFC2EE67,0x72BFEA9F}, {0x9CCDFAA0,0x11688ACF}, {0x6A9836B9,0x1A8190D8}, {0xC615C795,0x7ACBD93B}, + {0x286080CA,0xC7332C3A}, {0x4EE87D50,0x863445E9}, {0xD0D6DE85,0xF6966A5F}, {0x96D5DA1C,0xE9AD814F}, + {0x9E3EA3D5,0x70A22FB6}, {0x582B6440,0x0A69F68D}, {0xC2EE757F,0xB8428EC9}, {0xAC8DF12C,0x604A49E3}, + {0x0C10CB23,0x5B86F90B}, {0x8F02F3EE,0xE1D9B2EB}, {0xD3D22544,0x29391394}, {0x5CD0D6AA,0xC8E0A17F}, + {0xF7A26EAD,0xB58CC6A5}, {0x238F02C2,0x8193FB08}, {0x5B2F9F81,0xD5C68F46}, {0x88FDBAC5,0xFCFF9CD2}, + {0xF359DC47,0x77059157}, {0x07FF492B,0x1D262E39}, {0xE59AC557,0xFB582233}, {0x42F8B673,0xDDB2BCE2}, + {0x48E096CF,0x2577B762}, {0xD83DA74C,0x6F99C4A6}, 
{0xEB795701,0xC1147E41}, {0x912A9337,0xF48BAF76}, +}; + +__constant__ uint2 T42[256] = { + {0x9B2C0A19,0x3EF29D24}, {0xB6F8622F,0xE9E16322}, {0x47757F7A,0x55369940}, {0xA47B0B33,0x9F4D56D5}, + {0x6AA1174C,0x82256746}, {0xEB082FB2,0xB8F5057D}, {0xF4475F53,0xCC48C10B}, {0x275DEC3A,0x373088D4}, + {0x180AED10,0x968F4325}, {0xF7016151,0x173D232C}, {0x946FCC13,0xAE4ED09F}, {0xC4539873,0xFD4B4741}, + {0xD9933765,0x1B5B3F0D}, {0x7B644052,0x2FFCB096}, {0x0A89840C,0xE02376D2}, {0x329B18D7,0xA3AE3A70}, + {0x35DE8526,0x419CBD23}, {0x5B7C3199,0xFAFEBF11}, {0x85AA9B0D,0x0397074F}, {0x4836B970,0xC58AD4FB}, + {0xFC4104A8,0xBEC60BE3}, {0x4B708772,0x1EFF36DC}, {0xED8453B6,0x131FDC33}, {0x341764D3,0x0844E33E}, + {0xAB38CD39,0x0FF11B6E}, {0x7761B85A,0x64351F0A}, {0x09CFBA0E,0x3B5694F5}, {0xB87245D0,0x30857084}, + {0x2297AE3C,0x47AFB3BD}, {0x6F6B554A,0xF2BA5C2F}, {0x1F4F70E1,0x74BDC476}, {0x71EDC45E,0xCFDFC644}, + {0x1DC0AF16,0xE610784C}, {0x3C113F28,0x7ACA29D6}, {0x76A859AF,0x2DED4117}, {0x99A3D5EE,0xAC5F211E}, + {0xA87EF33B,0xD484F949}, {0x96E013E4,0x3CE36CA5}, {0x3A9D432C,0xD120F098}, {0xDC597563,0x6BC40464}, + {0xD1956C9E,0x69D5F5E5}, {0x3698BB24,0x9AE95F04}, {0x66A4EF44,0xC9ECC8DA}, {0xA5B2EAC6,0xD69508C8}, + {0xC0503B80,0xC40C2235}, {0x8C652103,0x38C193BA}, {0x46BC9E8F,0x1CEEC75D}, {0x37515AD1,0xD3310119}, + {0x86ECA50F,0xD8E2E568}, {0x5779C991,0xB137108D}, {0x05CA4206,0x709F3B69}, {0x1680CAEF,0x4FEB5083}, + {0x241BD238,0xEC456AF3}, {0xE181ABBE,0x58D673AF}, {0xCAD9BF8C,0x242F54E7}, {0x0DCC19FD,0x0211F181}, + {0x0F43C60A,0x90BC4DBB}, {0x9DA0761D,0x9518446A}, {0x3F57012A,0xA1BFCBF1}, {0x61E172B5,0x2BDE4F89}, + {0x4F732481,0x27B853A8}, {0xDF1F4B61,0xB0B1E643}, {0x5C39AC68,0x18CC3842}, {0xBF37D821,0xD2B7F7D7}, + {0x3014C720,0x3103864A}, {0x72ABFA5C,0x14AA2463}, {0x4EBAC574,0x6E600DB5}, {0x0403A3F3,0x39476574}, + {0xBC71E623,0x09C215F0}, {0xE987F045,0x2A58B947}, {0xB477BDD8,0x7B4CDF18}, {0x906C6FE0,0x9709B5EB}, + {0x8060D90B,0x73083C26}, {0x41F9037E,0xFEDC400E}, {0xE44BE9B8,0x284948C6}, {0x08065BFB,0x728ECAE8}, + {0x17492B1A,0x06330E9E}, {0x69E7294E,0x59508561}, {0xE6C4364F,0xBAE4F4FC}, {0xE30E7449,0xCA7BCF95}, + {0xA33E96C2,0x7D7FD186}, {0xD85AD690,0x52836110}, {0x1B4CD312,0x4DFAA102}, {0x872544FA,0x913ABB75}, + {0x140F1518,0xDD46ECB9}, {0x1E869114,0x3D659A6B}, {0xD719109A,0xC23F2CAB}, {0x2DD46836,0xD713FE06}, + {0xB2FBC1DC,0xD0A60656}, {0xDD909496,0x221C5A79}, {0xA1B14935,0xEFD26DBC}, {0x235E4FC9,0x0E77EDA0}, + {0x6B68F6B9,0xCBFD395B}, {0xA6F4D4C4,0x0DE0EAEF}, {0x1A8532E7,0x0422FF1F}, {0xDED6AA94,0xF969B85E}, + {0xAEF28F3F,0x7F6E2007}, {0x81A938FE,0x3AD0623B}, {0x7AADA1A7,0x6624EE8B}, {0xC856607B,0xB682E8DD}, + {0x281E2A30,0xA78CC56F}, {0x45FAA08D,0xC79B257A}, {0x642B30B3,0x5B4174E0}, {0x7EAE0254,0x5F638BFF}, + {0x0C05F808,0x4BC9AF9C}, {0xF98B46AE,0xCE59308A}, {0xCC55C388,0x8FC58DA9}, {0x676D0EB1,0x803496C7}, + {0xE70DD7BA,0xF33CAAE1}, {0x6EA2B4BF,0xBB620232}, {0x201871CB,0xD5020F87}, {0xA9B712CE,0x9D5CA754}, + {0x7DE83C56,0x841669D8}, {0x5EB6739F,0x8A618478}, {0xB0741E2B,0x420BBA6C}, {0xEAC1CE47,0xF12D5B60}, + {0x1283691C,0x76AC35F7}, {0xFECEDB5F,0x2C6BB7D9}, {0x4C351A83,0xFCCDB18F}, {0xC3160582,0x1F79C012}, + {0x62A74CB7,0xF0ABADAE}, {0x82EF06FC,0xE1A5801C}, {0xF2CB2357,0x67A21845}, {0x5DF04D9D,0x5114665F}, + {0x74278658,0xBF40FD2D}, {0xB73183DA,0xA0393D3F}, {0x92E3B017,0x05A409D1}, {0x0B4065F9,0xA9FB28CF}, + {0x42BF3D7C,0x25A9A229}, {0x03463E02,0xDB75E227}, {0x5AB5D06C,0xB326E10C}, {0x95A62DE6,0xE7968E82}, + {0x636EAD42,0xB973F3B3}, {0x19C30CE5,0xDF571D38}, {0x29D7CBC5,0xEE549B72}, 
{0x65E2D146,0x12992AFD}, + {0x56B02864,0xF8EF4E90}, {0x4030E28B,0xB7041E13}, {0xDAD50967,0xC02EDD2A}, {0x8AE95D07,0x932B4AF4}, + {0xC6DC4784,0x6FE6FB7B}, {0x55F61666,0x239AACB7}, {0xBDB807D6,0x401A4BED}, {0x89AF6305,0x485EA8D3}, + {0xADB4B13D,0xA41BC220}, {0x9729F211,0x753B32B8}, {0xB3322029,0x997E584B}, {0xCEDA1C7F,0x1D683193}, + {0xC99F818E,0xFF5AB6C0}, {0x7F67E3A1,0x16BBD5E2}, {0x25D233CD,0xA59D34EE}, {0x3B54A2D9,0x98F8AE85}, + {0xCB105E79,0x6DF70AFA}, {0xB9BBA425,0x795D2E99}, {0x44334178,0x8E437B67}, {0x886682F0,0x0186F6CE}, + {0xBB347BD2,0xEBF092A3}, {0xF18D1D55,0xBCD7FA62}, {0x11C5571E,0xADD9D7D0}, {0xB1BDFFDE,0x0BD3E471}, + {0x8EEAFEF4,0xAA6C2F80}, {0xF6C880A4,0x5EE57D31}, {0xF044FCA0,0xF50FA47F}, {0x51F5B595,0x1ADDC9C3}, + {0x3352F922,0xEA76646D}, {0x00000000,0x00000000}, {0xF58EBEA6,0x85909F16}, {0xAAF12CCC,0x46294573}, + {0x39DB7D2E,0x0A5512BF}, {0x31DD26D5,0x78DBD857}, {0x6C2D6B48,0x29CFBE08}, {0x583A0F9B,0x218B5D36}, + {0xFACD78AC,0x152CD2AD}, {0xE2C795BC,0x83A39188}, {0x5F7F926A,0xC3B9DA65}, {0x2C1D89C3,0x9ECBA01B}, + {0x9F2FA9EA,0x07B5F850}, {0x26940DCF,0x7EE8D6C9}, {0xAF3B6ECA,0x36B67E1A}, {0x702425AB,0x86079859}, + {0xD31AB369,0xFB7849DF}, {0x932A51E2,0x4C7C57CC}, {0x0E8A27FF,0xD96413A6}, {0xC715A671,0x263EA566}, + {0x4376DC89,0x6C71FC34}, {0x84637AF8,0x4A4F5952}, {0x8B20BCF2,0xDAF314E9}, {0x4AB96687,0x572768C1}, + {0x682EC8BB,0x1088DB7C}, {0x537A6A62,0x887075F9}, {0xF302C2A2,0x2E7A4658}, {0xE582084D,0x619116DB}, + {0x8326E709,0xA87DDE01}, {0x9C6997E8,0xDCC01A77}, {0xAC7D50C8,0xEDC39C3D}, {0xA078A8C0,0xA60A33A1}, + {0x52B38B97,0xC1A82BE4}, {0x134A88E9,0x3F746BEA}, {0xBAFD9A27,0xA228CCBE}, {0x068C7C04,0xABEAD94E}, + {0x78227E50,0xF48952B1}, {0xFB049959,0x5CF48CB0}, {0x6DE48ABD,0x6017E015}, {0xA73D3531,0x4438B4F2}, + {0x49FF5885,0x8C528AE6}, {0x4DFCFB76,0xB515EF92}, {0x2E925634,0x0C661C21}, {0xC59A7986,0xB493195C}, + {0x21D1903E,0x9CDA519A}, {0xB5BE5C2D,0x32948105}, {0xD45F2E98,0x194ACE8C}, {0x38129CDB,0x438D4CA2}, + {0xBEFE39D4,0x9B6FA9CA}, {0xEF0B8C41,0x81B26009}, {0x91A58E15,0xDED1EBF6}, {0x9EE6481F,0x4E6DA64D}, + {0xCF13FD8A,0x54B06F8E}, {0x01C9E1F5,0x49D85E1D}, {0x1C094EE3,0xAFC82651}, {0x75EE67AD,0xF698A330}, + {0xEC4DB243,0x5AC7822E}, {0xC199DA75,0x8DD47C28}, {0xDB1CE892,0x89F68337}, {0x7C21DDA3,0xCDCE37C5}, + {0x503C5460,0x530597DE}, {0x543FF793,0x6A42F2AA}, {0x73621BA9,0x5D727A7E}, {0x07459DF1,0xE2328753}, + {0xC2DFE477,0x56A19E0F}, {0xCD9C227D,0xC61DD3B4}, {0x986A341B,0xE5877F03}, {0x15C6F4ED,0x949EB2A4}, + {0x60289340,0x62061194}, {0xE84E11B0,0x6380E75A}, {0xD6D0F16F,0x8BE772B6}, {0xD596CF6D,0x50929091}, + {0x3E9EE0DF,0xE86795EC}, {0x2B581432,0x7CF92748}, {0xEEC26DB4,0xC86A3E14}, {0x8DACC0F6,0x7119CDA7}, + {0x100CB6EB,0xE40189CD}, {0x028FDFF7,0x92ADBC3A}, {0xD2D3529C,0xB2A017C2}, {0xD05C8D6B,0x200DABF8}, + {0xA2F77737,0x34A78F9B}, {0x8F231F01,0xE3B4719D}, {0x2F5BB7C1,0x45BE423C}, {0xFD88E55D,0xF71E55FE}, + {0x59F3EE6E,0x6853032B}, {0xFF073AAA,0x65B3E9C4}, {0x9AE5EBEC,0x772AC339}, {0xF842A75B,0x87816E97}, + {0xE0484A4B,0x110E2DB2}, {0x3DD8DEDD,0x331277CB}, {0x79EB9FA5,0xBD510CAC}, {0x2A91F5C7,0x35217955}, +}; + +__constant__ uint2 T52[256] = { + {0x46E06A6D,0x8AB0A968}, {0x4BF0B33A,0x43C7E80B}, {0x6B161EE5,0x08C9B354}, {0xEBA990BE,0x39F1C235}, + {0x6606C7B2,0xC1BEF237}, {0x614569AA,0x2C209233}, {0x6FC3289A,0xEB01523B}, {0x935ACEDD,0x946953AB}, + {0x3E13340E,0x272838F6}, {0xA12BA052,0x8B0455EC}, {0x978FF8A2,0x77A1B2C4}, {0x13E54086,0xA55122CA}, + {0x62D3F1CD,0x22761358}, {0x08B76CFE,0xDB8DDFDE}, {0x9E4A178A,0x5D1E12C8}, {0x03969867,0x0E56816B}, + 
{0x3303ED59,0xEE5F7995}, {0xAB78D71D,0xAFED748B}, {0xF93E53EE,0x6D929F2D}, {0xBA798C2A,0xF5D8A8F8}, + {0x8E39CF6B,0xF619B169}, {0x749104E2,0x95DDAF2F}, {0xE0886427,0xEC2A9C80}, {0x825B95EA,0xCE5C8FD8}, + {0x3AC60271,0xC4E0D999}, {0x173076F9,0x4699C3A5}, {0x50A29F42,0x3D1B151F}, {0x2BC75946,0x9ED505EA}, + {0xDC7F4B98,0x34665ACF}, {0x292342F7,0x61B1FB53}, {0x0E864130,0xC721C008}, {0x96FD7B74,0x8693CD16}, + {0x7136B14B,0x87273192}, {0x63A1721B,0xD3446C8A}, {0xA6680E4A,0x669A35E8}, {0x39509A16,0xCAB658F2}, + {0xF42E8AB9,0xA4E5DE4E}, {0xE83F08D9,0x37A7435E}, {0xE26C7F96,0x134E6239}, {0x2DF67488,0x82791A3C}, + {0x8329163C,0x3F6EF00A}, {0xFDEB6591,0x8E5A7E42}, {0x7981DDB5,0x5CAAEE4C}, {0x5AF1E80D,0x19F23478}, + {0xED98BD70,0x255DDDE3}, {0xA99CCCAC,0x50898A32}, {0xDA4E6656,0x28CA4519}, {0x4CB31D22,0xAE59880F}, + {0x37D6DB26,0x0D9798FA}, {0xB4FFCD1A,0x32F968F0}, {0x4F258545,0xA00F0964}, {0x5E24DE72,0xFA3AD517}, + {0x5DB24615,0xF46C547C}, {0xFF0F7E20,0x713E80FB}, {0x73D2AAFA,0x7843CF2B}, {0xAEDF62B4,0xBD17EA36}, + {0xD16F92CF,0xFD111BAC}, {0xC72D67E0,0x4ABAA7DB}, {0xAD49FAD3,0xB3416B5D}, {0x4914A88B,0xBCA316B2}, + {0x8AECF914,0x15D15006}, {0xE31EFC40,0xE27C1DEB}, {0x9BEDA223,0x4FE48C75}, {0x1B522C78,0x7EDCFD14}, + {0x7C26681C,0x4E5070F1}, {0x5815F3BC,0xE696CAC1}, {0x3BB481A7,0x35D2A64B}, {0xFE7DFDF6,0x800CFF29}, + {0xD5BAA4B0,0x1ED9FAC3}, {0x1EF599D1,0x6C2663A9}, {0x34404341,0x03C11991}, {0x69F20554,0xF7AD4DED}, + {0xB61BD6AB,0xCD9D9649}, {0xEADB1368,0xC8C3BDE7}, {0xB02AFB65,0xD131899F}, {0xE1FAE7F1,0x1D18E352}, + {0xEF7CA6C1,0xDA39235A}, {0xA8EE4F7A,0xA1BBF5E0}, {0xCF9A0B1E,0x91377805}, {0x80BF8E5B,0x31387161}, + {0xDB3CE580,0xD9F83ACB}, {0xD38B897E,0x0275E515}, {0xF0FBBCC6,0x472D3F21}, {0x868EA395,0x2D946EB7}, + {0x21942E09,0xBA3C248D}, {0xBFDE3983,0xE7223645}, {0x02E41BB1,0xFF64FEB9}, {0x0D10D957,0xC9774163}, + {0xB58D4ECC,0xC3CB1722}, {0x9CAE0C3B,0xA27AEC71}, {0xA48C15FB,0x99FECB51}, {0x6D27332B,0x1465AC82}, + {0xD75EBF01,0xE1BD047A}, {0x941960C5,0x79F733AF}, {0x41A3C475,0x672EC96C}, {0x524684F3,0xC27FEBA6}, + {0x75E38734,0x64EFD0FD}, {0x0743AE18,0xED9E6004}, {0xB9EF144D,0xFB8E2993}, {0x0C625A81,0x38453EB1}, + {0x42355C12,0x69784807}, {0x14A6EE9E,0x48CF42CE}, {0x06312DCE,0x1CAC1FD6}, {0x4792E9BB,0x7B82D6BA}, + {0x1F871A07,0x9D141C7B}, {0xC11C4A2E,0x5616B80D}, {0xF21FA777,0xB849C198}, {0xC8D9A506,0x7CA91801}, + {0x7EC273AD,0xB1348E48}, {0x987B3A44,0x41B20D1E}, {0xA3CFBBE3,0x7460AB55}, {0x4576F20A,0x84E62803}, + {0x897A6173,0x1B87D16D}, {0xE45D5258,0x0FE27DEF}, {0xCA3DBEB7,0x83CDE6B8}, {0xD01D1119,0x0C23647E}, + {0xA0592384,0x7A362A3E}, {0xF1893F10,0xB61F40F3}, {0x440471DC,0x75D457D1}, {0x237035B8,0x4558DA34}, + {0x87FC2043,0xDCA61165}, {0xC9AB26D0,0x8D9B67D3}, {0xEE0E2517,0x2B0B5C88}, {0x2AB5DA90,0x6FE77A38}, + {0xD9D8FE31,0x269CC472}, {0xFAA8CB89,0x63C41E46}, {0x1642F52F,0xB7ABBC77}, {0x2F126F39,0x7D1DE485}, + {0x24339BA0,0xA8C6BA30}, {0xCEE888C8,0x600507D7}, {0x1A20AFAE,0x8FEE82C6}, {0x26D78011,0x57A24489}, + {0x36A458F0,0xFCA5E728}, {0x8F4B4CBD,0x072BCEBB}, {0xF36D24A1,0x497BBE4A}, {0xB769557D,0x3CAFE99B}, + {0x05A7B5A9,0x12FA9EBD}, {0x5B836BDB,0xE8C04BAA}, {0xAC3B7905,0x4273148F}, {0x2851C121,0x90838481}, + {0x6C55B0FD,0xE557D350}, {0xCB4F3D61,0x72FF996A}, {0x64E2DC03,0x3EDA0C8E}, {0xE6B949E9,0xF0868356}, + {0xBB0B0FFC,0x04EAD72A}, {0x5967706A,0x17A4B513}, {0x04D5367F,0xE3C8E16F}, {0x8DAF570C,0xF84F3002}, + {0xBD3A2232,0x1846C8FC}, {0xF6CA9108,0x5B8120F7}, {0xECEA3EA6,0xD46FA231}, {0x53340725,0x334D9474}, + {0xC28AD249,0x58403966}, {0x9A9F21F5,0xBED6F3A7}, {0xA5FE962D,0x68CCB483}, 
{0x57E1315A,0xD085751B}, + {0xE52FD18E,0xFED0023D}, {0x20E6ADDF,0x4B0E5B5F}, {0x6EB1AB4C,0x1A332DE9}, {0x7B65C604,0xA3CE10F5}, + {0xD62C3CD7,0x108F7BA8}, {0x1073D8E1,0xAB07A3A1}, {0x91BED56C,0x6B0DAD12}, {0x3532C097,0xF2F36643}, + {0xB2CEE0D4,0x2E557726}, {0x00000000,0x00000000}, {0xDE9B5029,0xCB02A476}, {0x8B9E7AC2,0xE4E32FD4}, + {0x2C84F75E,0x734B65EE}, {0xCD7E10AF,0x6E5386BC}, {0xE7CBCA3F,0x01B4FC84}, {0x65905FD5,0xCFE8735C}, + {0x0FF4C2E6,0x3613BFDA}, {0x31E7F6E8,0x113B872C}, {0x55052AEB,0x2FE18BA2}, {0xBC48A1E4,0xE974B72E}, + {0xB89D979B,0x0ABC5641}, {0x2202B66E,0xB46AA5E6}, {0xC4BBFF87,0x44EC26B0}, {0x27A503C7,0xA6903B5B}, + {0xFC99E647,0x7F680190}, {0xA71A8D9C,0x97A84A3A}, {0x6037EA7C,0xDD12EDE1}, {0xDD0DC84E,0xC554251D}, + {0x956BE313,0x88C54C7D}, {0x48662B5D,0x4D916960}, {0x9909B992,0xB08072CC}, {0xC5C97C51,0xB5DE5962}, + {0x19B637C9,0x81B803AD}, {0x4A8230EC,0xB2F597D9}, {0x5F565DA4,0x0B08AAC5}, {0x017283D6,0xF1327FD2}, + {0x78F35E63,0xAD98919E}, {0x76751F53,0x6AB95196}, {0x0A53774F,0x24E92167}, {0x15D46D48,0xB9FD3D1C}, + {0xFBDA485F,0x92F66194}, {0x11015B37,0x5A35DC73}, {0x5477A93D,0xDED3F470}, {0x81CD0D8D,0xC00A0EB3}, + {0xC65FE436,0xBB88D809}, {0xBEACBA55,0x16104997}, {0x5693B28C,0x21B70AC9}, {0x25411876,0x59F4C5E2}, + {0x0B21F499,0xD5DB5EB5}, {0xF55C096F,0x55D7A19C}, {0xC3F8519F,0xA97246B4}, {0xA2BD3835,0x8552D487}, + {0x1297C350,0x54635D18}, {0x85183BF2,0x23C2EFDC}, {0xCC0C9379,0x9F61F96E}, {0x9DDC8FED,0x534893A3}, + {0xAA0A54CB,0x5EDF0B59}, {0x9F38945C,0xAC2C6D1A}, {0xD8AA7DE7,0xD7AEBBA0}, {0x09C5EF28,0x2ABFA00C}, + {0x3CF72FBF,0xD84CC64F}, {0xB15878B3,0x2003F64D}, {0xC06EC9F8,0xA724C7DF}, {0x68808682,0x069F323F}, + {0x51D01C94,0xCC296ACD}, {0x5CC0C5C3,0x055E2BAE}, {0x1D6301B6,0x6270E2C2}, {0x382219C0,0x3B842720}, + {0x846AB824,0xD2F0900E}, {0x7A1745D2,0x52FC6F27}, {0xE94D8B0F,0xC6953C8C}, {0x3095753E,0xE009F8FE}, + {0x92284D0B,0x655B2C79}, {0x4347DFC4,0x984A37D5}, {0x8808E2A5,0xEAB5AEBF}, {0x90CC56BA,0x9A3FD2C0}, + {0xF84CD038,0x9CA0E0FF}, {0xAFADE162,0x4C2595E4}, {0xB3BC6302,0xDF6708F4}, {0x7D54EBCA,0xBF620F23}, + {0x1C118260,0x93429D10}, {0x8CDDD4DA,0x097D4FD0}, {0x2E60ECEF,0x8C2F9B57}, {0x18C4B41F,0x708A7C7F}, + {0xDFE9D3FF,0x3A30DBA4}, {0x7FB0F07B,0x4006F19A}, {0x4DC19EF4,0x5F6BF7DD}, {0x32716E8F,0x1F6D0647}, + {0x6A649D33,0xF9FBCC86}, {0x67744464,0x308C8DE5}, {0x72A0292C,0x8971B0F9}, {0x3F61B7D8,0xD61A4724}, + {0xD4C82766,0xEFEB8511}, {0x40D147A3,0x961CB6BE}, {0xF7B812DE,0xAAB35F25}, {0x7044329D,0x76154E40}, + {0x4E570693,0x513D76B6}, {0xD2F90AA8,0xF3479AC7}, {0x77079C85,0x9B8B2E44}, {0x3D85AC69,0x297EB99D}, +}; + +__constant__ uint2 T62[256] = { + {0xFC7D40C3,0x7E37E62D}, {0xEE939E5B,0x776F25A4}, {0xDD8FB5AD,0xE045C850}, {0x11FF1952,0x86ED5BA7}, + {0xCF616B35,0xE91D0BD9}, {0x6E408FFB,0x37E0AB25}, {0x31025A7A,0x9607F6C0}, {0x16D23C9D,0x0B02F5E1}, + {0xFB50650C,0xF3D8486B}, {0xC40875F5,0x621CFF27}, {0xFA5FD34A,0x7D40CB71}, {0xDAA29062,0x6DAA6616}, + {0x23EC84E2,0x9F5F3549}, {0xC507C3B3,0xEC847C3D}, {0x043CE205,0x025A3668}, {0x4DAC0B19,0xA8BF9E6C}, + {0xE9BEBB94,0xFA808BE2}, {0x77C74FA3,0xB5B99C52}, {0xF0397BCC,0x78D9BC95}, {0xDBAD2624,0xE332E50C}, + {0x9332797E,0xC74FCE12}, {0x2EA709AB,0x1729ECEB}, {0x9954D1F8,0xC2D6B9F6}, {0xBAB8551A,0x5D898CBF}, + {0x17DD8ADB,0x859A76FB}, {0x362F7FB5,0x1BE85886}, {0xF136CD8A,0xF6413F8F}, {0xBBB7E35C,0xD3110FA5}, + {0x14CC4D11,0x0A2FEED5}, {0xCD7F1AB9,0xE83010ED}, {0x5F42D581,0xA1E75DE5}, {0xC13B21B6,0xEEDE4A55}, + {0xF94E1480,0xF2F5535F}, {0x1888761E,0x0CC1B46D}, {0x6529913B,0xBCE15FDB}, {0x5A7181C2,0x2D25E897}, + 
{0xE2D7A554,0x71817F1C}, {0x5C53124B,0x2E52C5CB}, {0xEF9C281D,0xF9F7A6BE}, {0x21F2F56E,0x9E722E7D}, + {0x81DCA7E6,0xCE170D9B}, {0x1CB4941B,0x0E9B8205}, {0x3C49D733,0x1E712F62}, {0x42F9F7DC,0x21E45CFA}, + {0x8BBA0F60,0xCB8E7A7F}, {0x010FB646,0x8E98831A}, {0x8E895B23,0x474CCF0D}, {0x4FB27A95,0xA9928558}, + {0x05335443,0x8CC2B572}, {0x84EFF3A5,0x42D5B8E9}, {0x021E718C,0x012D1B34}, {0xAE74180B,0x57A6626A}, + {0xE3D81312,0xFF19FC06}, {0x6A7C6DFE,0x35BA9D4D}, {0x8F86ED65,0xC9D44C17}, {0xA02E5288,0x506523E6}, + {0x06229389,0x03772D5C}, {0x0B691EC0,0x8B01F4FE}, {0xED825991,0xF8DABD8A}, {0x985B67BE,0x4C4E3AEC}, + {0x7FBF96A9,0xB10DF082}, {0xD4F8DAE1,0x6A69279A}, {0xD3D5FF2E,0xE78689DC}, {0x1FA553D1,0x812E1A2B}, + {0xEBA0CA18,0xFBAD90D6}, {0x34310E39,0x1AC543B2}, {0x2CB97827,0x1604F7DF}, {0x51189F02,0xA6241C69}, + {0xEAAF7C5E,0x753513CC}, {0xC84C4EFA,0x64F2A59F}, {0x489F5F5A,0x247D2B1E}, {0xAB474C48,0xDB64D718}, + {0xF2270A40,0x79F4A7A1}, {0x2A9BEBAE,0x1573DA83}, {0x68621C72,0x34978679}, {0xA2302304,0x514838D2}, + {0xFD72F685,0xF0AF6537}, {0x3A6B44BA,0x1D06023E}, {0xCE6EDD73,0x678588C3}, {0xCC70ACFF,0x66A893F7}, + {0xB5EDA9DF,0xD4D24E29}, {0x70EA6A6C,0x38563214}, {0x0E5A4A83,0x07C3418C}, {0x5635BACD,0x2BCBB22F}, + {0x0878D90A,0x04B46CD0}, {0x0C443B0F,0x06EE5AB8}, {0x76C8F9E5,0x3B211F48}, {0x12EEDE98,0x0958C389}, + {0xBF8B0159,0xD14B39CD}, {0x72F41BE0,0x397B2920}, {0x13E168DE,0x87C04093}, {0x47CAA39F,0xAD26E988}, + {0x9C6785BB,0x4E140C84}, {0xB7F3D853,0xD5FF551D}, {0x5D5CA40D,0xA0CA46D1}, {0x87FE346F,0xCD6020C7}, + {0x15C3FB57,0x84B76DCF}, {0xA121E4CE,0xDEFDA0FC}, {0x96012D3D,0x4B8D7B60}, {0x298A2C64,0x9AC642AD}, + {0x10F0AF14,0x0875D8BD}, {0x7B8374AC,0xB357C6EA}, {0x9A451632,0x4D6321D8}, {0xC719B23F,0xEDA96709}, + {0xF328BC06,0xF76C24BB}, {0x912C08F2,0xC662D526}, {0x7892B366,0x3CE25EC4}, {0x6F4F39BD,0xB978283F}, + {0x9D6833FD,0xC08C8F9E}, {0x9E79F437,0x4F3917B0}, {0xB2C08C10,0x593DE06F}, {0xB1D14BDA,0xD6887841}, + {0x32139DB0,0x19B26EEE}, {0x75D93E2F,0xB4948766}, {0x1987C058,0x82593777}, {0x3D466175,0x90E9AC78}, + {0xFF6C8709,0xF1827E03}, {0x353EB87F,0x945DC0A8}, {0x8AB5B926,0x4516F965}, {0x7EB020EF,0x3F957398}, + {0x6D514831,0xB855330B}, {0x542BCB41,0x2AE6A91B}, {0xC6160479,0x6331E413}, {0x80D311A0,0x408F8E81}, + {0xC325503A,0xEFF35161}, {0xBD9570D5,0xD06622F9}, {0x0D4B8D49,0x8876D9A2}, {0x573A0C8B,0xA5533135}, + {0xDF91C421,0xE168D364}, {0xF50A2F8F,0xF41B09E7}, {0x24C1A12D,0x12B09B0F}, {0xA9593DC4,0xDA49CC2C}, + {0x3E57A6BF,0x1F5C3456}, {0xA8568B82,0x54D14F36}, {0x43F6419A,0xAF7CDFE0}, {0xC943F8BC,0xEA6A2685}, + {0xD7E91D2B,0xE5DCBFB4}, {0x799D0520,0xB27ADDDE}, {0xD6E6AB6D,0x6B443CAE}, {0xF61BE845,0x7BAE91C9}, + {0x7CAE5163,0x3EB868AC}, {0x22E332A4,0x11C7B653}, {0xB9A992D0,0xD23C1491}, {0x0311C7CA,0x8FB5982E}, + {0xE0C9D4D8,0x70AC6428}, {0x0F55FCC5,0x895BC296}, {0xEC8DEFD7,0x76423E90}, {0xDE9E7267,0x6FF0507E}, + {0x7A8CC2EA,0x3DCF45F0}, {0x941F5CB1,0x4AA06054}, {0xB0DEFD9C,0x5810FB5B}, {0xBC9AC693,0x5EFEA1E3}, + {0xDC8003EB,0x6EDD4B4A}, {0xE8B10DD2,0x741808F8}, {0x28859A22,0x145EC1B7}, {0x50172944,0x28BC9F73}, + {0x4EBDCCD3,0x270A0642}, {0x331C2BF6,0x972AEDF4}, {0x0A66A886,0x059977E4}, {0x4A812ED6,0x2550302A}, + {0xA7037747,0xDD8A8DA0}, {0x970E9B7B,0xC515F87A}, {0x601AC578,0x3023EAA9}, {0x73FBADA6,0xB7E3AA3A}, + {0x1EAAE597,0x0FB69931}, {0x00000000,0x00000000}, {0x6204B4F4,0x310EF19D}, {0x44DB6455,0x229371A6}, + {0x1A960792,0x0DECAF59}, {0xB8A62496,0x5CA4978B}, {0x38753536,0x1C2B190A}, {0x82CD602C,0x41A295B5}, + {0x6426277D,0x3279DCC1}, {0x9F764271,0xC1A194AA}, {0x26DFD0A1,0x139D803B}, 
{0x41E83016,0xAE51C4D4}, + {0xAD65DFC1,0xD813FA44}, {0x45D4D213,0xAC0BF2BC}, {0x46C515D9,0x23BE6A92}, {0x923DCF38,0x49D74D08}, + {0x27D066E7,0x9D050321}, {0x5E4D63C7,0x2F7FDEFF}, {0x55247D07,0xA47E2A01}, {0x2FA8BFED,0x99B16FF1}, + {0x8C972AAF,0x4661D439}, {0xA33F9542,0xDFD0BBC8}, {0xA51D06CB,0xDCA79694}, {0x7DA1E725,0xB020EBB6}, + {0x696DAA34,0xBA0F0563}, {0xD5F76CA7,0xE4F1A480}, {0x9510EAF7,0xC438E34E}, {0x3B64F2FC,0x939E8124}, + {0x072D25CF,0x8DEFAE46}, {0x586FF04E,0x2C08F3A3}, {0xB3CF3A56,0xD7A56375}, {0x40E78650,0x20C947CE}, + {0x86F18229,0x43F8A3DD}, {0xAC6A6987,0x568B795E}, {0x1DBB225D,0x8003011F}, {0xF7145E03,0xF53612D3}, + {0x300DEC3C,0x189F75DA}, {0x3720C9F3,0x9570DB9C}, {0x6B73DBB8,0xBB221E57}, {0xE4F536DD,0x72F65240}, + {0x88ABC8AA,0x443BE251}, {0xD9B357A8,0xE21FFE38}, {0xE7E4F117,0xFD43CA6E}, {0x89A47EEC,0xCAA3614B}, + {0xE1C6629E,0xFE34E732}, {0x1B99B1D4,0x83742C43}, {0x83C2D66A,0xCF3A16AF}, {0x4990E91C,0xAAE5A804}, + {0x4CA3BD5F,0x26271D76}, {0x3F5810F9,0x91C4B74C}, {0xF841A2C6,0x7C6DD045}, {0xFE63314F,0x7F1AFD19}, + {0x8D989CE9,0xC8F95723}, {0x5306EE8E,0xA709075D}, {0xAA48FA0E,0x55FC5402}, {0x9023BEB4,0x48FA563C}, + {0xCA523F76,0x65DFBEAB}, {0xD8BCE1EE,0x6C877D22}, {0x85E045E3,0xCC4D3BF3}, {0x6115733E,0xBEBB69B3}, + {0x20FD4328,0x10EAAD67}, {0x71E5DC2A,0xB6CEB10E}, {0x6737E0B7,0xBDCC44EF}, {0xA412B08D,0x523F158E}, + {0x2DB6CE61,0x989C74C5}, {0x2B945DE8,0x9BEB5999}, {0x09776F4C,0x8A2CEFCA}, {0x5B7E3784,0xA3BD6B8D}, + {0xCB5D8930,0xEB473DB1}, {0x9B4AA074,0xC3FBA2C2}, {0x25CE176B,0x9C281815}, {0xD0C438E4,0x683311F2}, + {0xBE84B71F,0x5FD3BAD7}, {0xE5FA809B,0xFC6ED15A}, {0x6C5EFE77,0x36CDB011}, {0x520958C8,0x29918447}, + {0x59604608,0xA29070B9}, {0xA60CC101,0x53120EBA}, {0x74D68869,0x3A0C047C}, {0xD2DA4968,0x691E0AC6}, + {0xE6EB4751,0x73DB4974}, {0xF40599C9,0x7A838AFD}, {0xB4E21F99,0x5A4ACD33}, {0xC03497F0,0x6046C94F}, + {0xD1CB8EA2,0xE6AB92E8}, {0x663856F1,0x3354C7F5}, {0xAF7BAE4D,0xD93EE170}, {0xC22AE67C,0x616BD27B}, + {0x397A8370,0x92B39A10}, {0x4B8E9890,0xABC8B330}, {0x630B02B2,0xBF967287}, {0xB6FC6E15,0x5B67D607}, +}; + +__constant__ uint2 T72[256] = { + {0xCE553FE6,0xD031C397}, {0xB006B525,0x16BA5B01}, {0x296E70C8,0xA89BADE6}, {0x77D3435B,0x6A1F525D}, + {0x573DFA0B,0x6E103570}, {0x17FC95AB,0x660EFB2A}, {0x97634BF6,0x76327A9E}, {0x62458BF5,0x4BAD9D64}, + {0xDBC3F748,0xF1830CAE}, {0x669131FF,0xC5C8F542}, {0xDC48B0CB,0x95044A1C}, {0x3CF8B866,0x892962DF}, + {0xE930C135,0xB0B9E208}, {0x611A767C,0xA14FB3F0}, {0x1C160136,0x8D2605F2}, {0xFECC549E,0xD6B71922}, + {0xA5907D8B,0x37089438}, {0x5803D49C,0x0B5DA38E}, {0xEA6F3CBC,0x5A5BCC9C}, {0x3B73FFE5,0xEDAE246D}, + {0xDE22EDCE,0xD2B87E0F}, {0xCA8185EC,0x5E54ABB1}, {0xE80561B9,0x1DE7F88F}, {0x0135A08C,0xAD5E1A87}, + {0x65CECC76,0x2F2ADBD6}, {0x82F58358,0x5780B5A7}, {0xEDE47B3F,0x3EDC8A2E}, {0x06BEE70F,0xC9D95C35}, + {0x6C4E05EE,0x83BE111D}, {0x59367410,0xA603B909}, {0x809FDE5D,0x103C81B4}, {0x7D0C774A,0x2C69B602}, + {0xD5C87953,0x399080D7}, {0x487406B4,0x09D41E16}, {0x26505E5F,0xCDD63B18}, {0x9B0298E8,0xF99DC2F4}, + {0x943CB67F,0x9CD0540A}, {0x891F17C5,0xBCA84B7F}, {0xB78DF2A6,0x723D1DB3}, {0xE73B4F2E,0x78AA6E71}, + {0xA071670D,0x1433E699}, {0x54620782,0x84F21BE4}, {0xB4D20F2F,0x98DF3327}, {0xD3769E5C,0xF049DCE2}, + {0x9656EB7A,0xDB6C6019}, {0x078B4783,0x648746B2}, {0x8DCBADCF,0x32CD2359}, {0xF0C7DA85,0x1EA4955B}, + {0x1B9D46B5,0xE9A14340}, {0xBBEC21B8,0xFD92A5D9}, {0x0E0B8E1B,0xC8138C79}, {0x6D7BA562,0x2EE00B9A}, + {0x93B7F1FC,0xF85712B8}, {0x0BEA949D,0xEB28FED8}, {0x8A40EA4C,0x564A65EB}, {0x474A2823,0x6C9988E8}, + 
{0x121D8F2D,0x4535898B}, {0x31ACCBF4,0xABD8C032}, {0xB9867CBD,0xBA2E91CA}, {0xEF8E263A,0x7960BE3D}, + {0x602FD6F0,0x0C11A977}, {0x16C93527,0xCB50E1AD}, {0x035FFD89,0xEAE22E94}, {0x5DE2CE1A,0x2866D12F}, + {0xAB9BF390,0xFF1B1841}, {0x8CFE0D43,0x9F9339DE}, {0xC48A0BF7,0x964727C8}, {0xAAAE531C,0x524502C6}, + {0xAC10B413,0x9B9C5EF3}, {0x42AB32A5,0x4FA2FA49}, {0xE551122B,0x3F165A62}, {0x76E6E3D7,0xC74148DA}, + {0xE464B2A7,0x924840E5}, {0xD69784DA,0xD372AE43}, {0x05E11A86,0x233B72A1}, {0x4941A638,0xA48A0491}, + {0xC9DE7865,0xB4B68525}, {0xA6CF8002,0xDDEABAAC}, {0x50B6BD88,0x0A9773C2}, {0x5EBD3393,0xC284FFBB}, + {0x2C8F6A4E,0x8BA0DF47}, {0x4D951C32,0x2AEF6CB7}, {0x2A318D41,0x42798372}, {0xBF389BB2,0x73F7CDFF}, + {0x382C026C,0x074C0AF9}, {0x243A035A,0x8A6A0F0B}, {0x5F88931F,0x6FDAE53C}, {0x7E538AC3,0xC68B9896}, + {0x1AA8E639,0x44FF59C7}, {0x439E9229,0xE2FCE0CE}, {0x79D8CD40,0xA20CDE24}, {0xC8EBD8E9,0x19E89FA2}, + {0xF398270C,0xF446BBCF}, {0x2284E455,0x43B3533E}, {0x8E945046,0xD82F0DCD}, {0xB26CE820,0x51066F12}, + {0x6BC5426D,0xE73957AF}, {0x40C16FA0,0x081ECE5A}, {0xC5BFAB7B,0x3B193D4F}, {0xDF174D42,0x7FE66488}, + {0x705804D8,0x0E9814EF}, {0x7C39D7C6,0x8137AC85}, {0xE185A821,0xB1733244}, {0x6F11F867,0x695C3F89}, + {0xE3EFF524,0xF6CF0657}, {0xD02963D5,0x1AABF276}, {0x75B91E5E,0x2DA3664E}, {0x1077D228,0x0289BD98}, + {0xF413608F,0x90C1FD7D}, {0xFD93A917,0x3C5537B6}, {0x3919A2E0,0xAA12107E}, {0x30996B78,0x0686DAB5}, + {0x9EE3826E,0xDAA6B055}, {0x56085A87,0xC34E2FF7}, {0x4FFF4137,0x6D5358A4}, {0xB35948AC,0xFC587595}, + {0xC7D5F67E,0x7CA5095C}, {0x8B754AC0,0xFB147F6C}, {0x91DDACF9,0xBFEB26AB}, {0x67A49173,0x6896EFC5}, + {0x1E7C5C33,0xCA9A31E1}, {0xB13315A9,0xBBE44186}, {0x689ABFE4,0x0DDB793B}, {0xA7FA208E,0x70B4A02B}, + {0x7307F951,0xE47A3A7B}, {0x14A36822,0x8CECD5BE}, {0x23B144D9,0xEEED49B9}, {0xB8B3DC31,0x17708B4D}, + {0x2765FED3,0x6088219F}, {0xF1F27A09,0xB3FA8FDC}, {0xFCA6099B,0x910B2D31}, {0x78ED6DCC,0x0F52C4A3}, + {0xBAD98134,0x50CCBF5E}, {0x7F662A4F,0x6BD58211}, {0xD4FDD9DF,0x94CE9A50}, {0x45207526,0x2B25BCFB}, + {0x1F49FCBF,0x67C42B66}, {0x723259DD,0x492420FC}, {0x18C2BB3C,0x03436DD4}, {0xF872B391,0x1F6E4517}, + {0x69AF1F68,0xA08563BC}, {0xEEBB86B6,0xD43EA4BA}, {0x08B56914,0x01CAD04C}, {0x0980C998,0xAC94CACB}, + {0x9A373864,0x54C3D873}, {0x2DBACAC2,0x26FEC5C0}, {0xBE0D3B3E,0xDEA9D778}, {0x20EEB950,0x040F672D}, + {0x7BB29045,0xE5B0EA37}, {0xCBB42560,0xF30AB136}, {0x37122CFB,0x62019C07}, {0x13282FA1,0xE86B930C}, + {0x2EE5374B,0xCC1CEB54}, {0xA21B3A08,0x538FD28A}, {0xD89C0AC1,0x1B61223A}, {0xAD25149F,0x36C24474}, + {0xF74C9D06,0x7A23D3E9}, {0x9968C5ED,0xBE21F6E7}, {0x36278C77,0xCF5F8680}, {0xEB5A9C30,0xF705D61B}, + {0x52DCE08D,0x4D2B47D1}, {0xC234ECF8,0x5F9E7BFD}, {0x3DCD18EA,0x24777858}, {0x4415D5AA,0x867BA67C}, + {0x5A698999,0x4CE1979D}, {0x00000000,0x00000000}, {0x33C696F1,0xEC64F421}, {0xC16B1171,0xB57C5569}, + {0x467F88AF,0xC1C7926F}, {0x0F3E2E97,0x654D96FE}, {0xA8C40E19,0x15F936D5}, {0xA9F1AE95,0xB8A72C52}, + {0x21DB19DC,0xA9517DAA}, {0xFA18EE94,0x58D27104}, {0xF2AD8780,0x5918A148}, {0xDAF657C4,0x5CDD1629}, + {0x64FB6CFA,0x8274C151}, {0xC6E056F2,0xD1FB13DB}, {0xCF609F6A,0x7D6FD910}, {0xD9A9AA4D,0xB63F38BD}, + {0xF526C003,0x3D9FE7FA}, {0x871499DE,0x74BBC706}, {0xB6B8522A,0xDF630734}, {0xCD0AC26F,0x3AD3ED03}, + {0x83C023D4,0xFADEAF20}, {0x4ECAE1BB,0xC00D4223}, {0x5CD76E96,0x8538CBA8}, {0x6E2458EB,0xC402250E}, + {0x026A5D05,0x47BC3413}, {0x114272A4,0xAFD7A71F}, {0xCC3F62E3,0x978DF784}, {0xA144C781,0xB96DFC1E}, + {0x1596C8AE,0x21B2CF39}, {0x950916F3,0x318E4E8D}, {0x3E92E563,0xCE9556CC}, 
{0xDD7D1047,0x385A509B}, + {0xB5E7AFA3,0x358129A0}, {0x63702B79,0xE6F387E3}, {0x53E94001,0xE0755D56}, {0xFFF9F412,0x7BE903A5}, + {0x90E80C75,0x12B53C2C}, {0x857EC4DB,0x3307F315}, {0x0C61D31E,0x8FAFB86A}, {0x86213952,0xD9E5DD81}, + {0x9FD622E2,0x77F8AAD2}, {0x357871FE,0x25BDA814}, {0x8FA1F0CA,0x7571174A}, {0x985D6561,0x137FEC60}, + {0x9DBC7FE7,0x30449EC1}, {0x41F4CF2C,0xA540D4DD}, {0xAE7AE916,0xDC206AE0}, {0xE2DA55A8,0x5B911CD0}, + {0xF947131D,0xB2305F90}, {0xBD52C6B7,0x344BF9EC}, {0xD2433ED0,0x5D17C665}, {0xC05EB1FD,0x18224FEE}, + {0x844B6457,0x9E59E992}, {0xA4A5DD07,0x9A568EBF}, {0x716DA454,0xA3C60E68}, {0xD7A22456,0x7E2CB4C4}, + {0x4CA0BCBE,0x87B17630}, {0x32F3367D,0x413AEEA6}, {0xBC67663B,0x9915E36B}, {0x3A465F69,0x40F03EEA}, + {0xE0B008AD,0x1C2D28C3}, {0x4A1E5BB1,0x4E682A05}, {0x285BD044,0x05C5B761}, {0x5B5C2915,0xE1BF8D1A}, + {0xC3014C74,0xF2C0617A}, {0xD11CC359,0xB7F5E8F1}, {0x3FA745EF,0x63CB4C4B}, {0x9C89DF6B,0x9D1A8446}, + {0x4B2BFB3D,0xE3363082}, {0xE60EEFA2,0xD5F474F6}, {0xFB2D4E18,0xF58C6B83}, {0x0ADF3411,0x4676E45F}, + {0x1D23A1BA,0x20781F75}, {0x81AA7ED1,0xBD629B33}, {0x19F71BB0,0xAE1D7753}, {0xA32E9A84,0xFED1C80D}, + {0x92825170,0x5509083F}, {0x5557A70E,0x29AC0163}, {0x51831D04,0xA7C96945}, {0x04D4BA0A,0x8E656826}, + {0x882AB749,0x11F651F8}, {0xF6793D8A,0xD77DC96E}, {0x2B042DCD,0xEF2799F5}, {0x7A8730C9,0x48EEF0B0}, + {0x0D547392,0x22F1A2ED}, {0x2FD097C7,0x6142F1D3}, {0x6AF0E2E1,0x4A674D28}, {0x748CBED2,0x80FD7CC9}, + {0xAF4F499A,0x717E7067}, {0xECD1DBB3,0x938290A9}, {0x344DD172,0x88E3B293}, {0x250FA3D6,0x2734158C}, +}; +// KeySchedule +__constant__ static uint64_t _ALIGN(32) CC[12][8] = { + {0xe9daca1eda5b08b1, 0x1f7c65c0812fcbeb, 0x16d0452e43766a2f, 0xfcc485758db84e71, 0x0169679291e07c4b, 0x15d360a4082a42a2, 0x234d74cc36747605, 0x0745a6f2596580dd}, + {0x1a2f9da98ab5a36f, 0xd7b5700f469de34f, 0x982b230a72eafef3, 0x3101b5160f5ed561, 0x5899d6126b17b59a, 0xcaa70adbc261b55c, 0x56cdcbd71ba2dd55, 0xb79bb121700479e6}, + {0xc72fce2bacdc74f5, 0x35843d6a28fc390a, 0x8b1f9c525f5ef106, 0x7b7b29b11475eaf2, 0xb19e3590e40fe2d3, 0x09db6260373ac9c1, 0x31db7a8643f4b6c2, 0xb20aba0af5961e99}, + {0xd26615e8b3df1fef, 0xdde4715da0e148f9, 0x7d3c5c337e858e48, 0x3f355e68ad1c729d, 0x75d603ed822cd7a9, 0xbe0352933313b7d8, 0xf137e893a1ea5334, 0x2ed1e384bcbe0c22}, + {0x994747adac6bea4b, 0x6323a96c0c413f9a, 0x4a1086161f1c157f, 0xbdff0f80d7359e35, 0xa3f53a254717cdbf, 0x161a2723b700ffdf, 0xf563eaa97ea2567a, 0x57fe6c7cfd581760}, + {0xd9d33a1daeae4fae, 0xc039307a3bc3a46f, 0x6ca44251f9c4662d, 0xc68ef09ab49a7f18, 0xb4b79a1cb7a6facf, 0xb6c6bec2661ff20a, 0x354f903672c571bf, 0x6e7d64467a4068fa}, + {0xecc5aaee160ec7f4, 0x540924bffe86ac51, 0xc987bfe6c7c69e39, 0xc9937a19333e47d3, 0x372c822dc5ab9209, 0x04054a2883694706, 0xf34a3ca24c451735, 0x93d4143a4d568688}, + {0xa7c9934d425b1f9b, 0x41416e0c02aae703, 0x1ede369c71f8b74e, 0x9ac4db4d3b44b489, 0x90069b92cb2b89f4, 0x2fc4a5d12b8dd169, 0xd9a8515935c2ac36, 0x1ee702bfd40d7fa4}, + {0x9b223116545a8f37, 0xde5f16ecd89a4c94, 0x244289251b3a7d3a, 0x84090de0b755d93c, 0xb1ceb2db0b440a80, 0x549c07a69a8a2b7b, 0x602a1fcb92dc380e, 0xdb5a238351446172}, + {0x526f0580a6debeab, 0xf3f3e4b248e52a38, 0xdb788aff1ce74189, 0x0361331b8ae1ff1f, 0x4b3369af0267e79f, 0xf452763b306c1e7a, 0xc3b63b15d1fa9836, 0xed9c4598fbc7b474}, + {0xfb89c8efd09ecd7b, 0x94fe5a63cdc60230, 0x6107abebbb6bfad8, 0x7966841421800120, 0xcab948eaef711d8a, 0x986e477d1dcdbaef, 0x5dd86fc04a59a2de, 0x1b2df381cda4ca6b}, + {0xba3116f167e78e37, 0x7ab14904b08013d2, 0x771ddfbc323ca4cd, 0x9b9f2130d41220f8, 0x86cc91189def805d, 
0x5228e188aaa41de7, 0x991bb2d9d517f4fa, 0x20d71bf14a92bc48} +}; + +__constant__ static uint64_t _ALIGN(32) precomputed_values[12][8] = { + 0x8FD72F640708B0D0, 0x0DE874C7EBC3F213, 0xE92EEF3AD202E9E0, 0xC1E9DA0708013DA7, 0x9727DAB2F014BE88, 0x103051A02BCD6935, 0x33EC7E1DBD28F736, 0x1ECF460CF78AD1F4, + 0x0B2D9F89C775449D, 0x6B6EEFC6DAB7E8B0, 0xF1A0D31667F6EC44, 0x2A71132D5E108166, 0x0E9357C2EC87931A, 0xC99F5C1B4A01612D, 0x7E60B16E637D4EE2, 0xA9FCB827F9BA6D81, + 0x231FECA5AB3D285C, 0x70C6E1483C838C3B, 0x9C21C3C40CE4E2DA, 0x2FA796BD5688E573, 0x04C0E3FF55809FDF, 0x5FF978BFB8E3CDC8, 0xC54A19D6A3D07033, 0x0FCA83FDDE872478, + 0xBDF9312726339F10, 0x51A5BA1793BC9C56, 0xC4428DA14F96D2D4, 0xEC925222374EAB1F, 0x79477893747DD92F, 0xC495E19A46886304, 0x9C23F893BA7CFA36, 0x0C47268881FC5FEB, + 0xCF117966029B2CB3, 0x07179ABE77088A8F, 0x671EF4CC2650E257, 0x7474B8B170DAB5C6, 0x4224FEBECF35113E, 0x993D156C675C5537, 0x2DEE3A5782C39B45, 0xE7C586F2990DD385, + 0x8608FD95B1C1138A, 0x8BB0847D9E9849AC, 0x5E76623F4F0EB0C7, 0x34C2BDBAFC5060CE, 0xE9E814475907826C, 0x22C9ED94D6AAC7C9, 0xE6B75E28171EB0D6, 0xF1329E5534E60215, + 0x86BB4814B1C3CE52, 0xE8F226C9FBDDD017, 0xCEDED67991CB3087, 0x76C33E32FDBFACA5, 0xDBB13BE1A9F7474C, 0x3D0273470342C356, 0x8E7246C51CF07F61, 0xAC8C125DDEF8DF71, + 0x6D73E747795B8CF3, 0x4E4AA65EA0072050, 0xA14A1582CB43C2B9, 0x748EF2B7BB63B938, 0x126789534410D7D4, 0xD4D48FF40301D791, 0xC67DFBE315C41FC0, 0x35E7A1A1AF88601C, + 0x9BD33EA0FAB34007, 0xF51B7CDBE3D67D25, 0xD3ABDA0CE4186E6B, 0x8E61DDADCBCE1706, 0x58994565B41BE6A5, 0x7A87ABC1240CD31D, 0xFAFE6C28487968D0, 0x15B368609FF9EEA7, + 0xAE33263CCF115818, 0x93B2DBE9CADFCFC8, 0x0A91952BF91B0147, 0x458E67CA5F1ED73A, 0x94C2E5F288F074E3, 0x377895E85C69E996, 0xF11A4456AAB37B10, 0x163131934816821A, + 0xD07E4A2366BF469D, 0x5EF1A3D220213B6C, 0x3C5BB78971D8ED0F, 0x0DE05E6B9006F2D2, 0xC58CFB00B8EAA1C9, 0xEFCDB54D1F250B76, 0xFD135634FA527042, 0x4CEE791290516407, + 0xD800B9264010790F, 0x974C4823E2B668D7, 0xA605A4B385C5E361, 0x3F6C92DA5A56D8D2, 0x82B9D67C12EF8277, 0x0AB6B4582561BF90, 0x46954FD98FC2CBA3, 0x70BE45CB21B6760D +}; + + +#define T0(x) shared[0][x] +#define T1(x) shared[1][x] +#define T2(x) shared[2][x] +#define T3(x) shared[3][x] +#define T4(x) shared[4][x] +#define T5(x) shared[5][x] +#define T6(x) shared[6][x] +#define T7(x) shared[7][x] + +__device__ __forceinline__ +void GOST_FS(const uint2* state,uint2* return_state, const uint2 shared[8][256]){ + + #pragma unroll 4 + for(uint32_t i=0;i<4;i++){ + return_state[i] = T0(__byte_perm(state[7].x,0,0x44440 + (i&3))) ^ T1(__byte_perm(state[6].x,0,0x44440 + (i&3))) + ^ T2(__byte_perm(state[5].x,0,0x44440 + (i&3))) ^ T3(__byte_perm(state[4].x,0,0x44440 + (i&3))) + ^ T4(__byte_perm(state[3].x,0,0x44440 + (i&3))) ^ T5(__byte_perm(state[2].x,0,0x44440 + (i&3))) + ^ T6(__byte_perm(state[1].x,0,0x44440 + (i&3))) ^ T7(__byte_perm(state[0].x,0,0x44440 + (i&3))); + } + + #pragma unroll 4 + for(uint32_t i=0;i<4;i++){ + return_state[i+4] = T0(__byte_perm(state[7].y,0,0x44440 + (i&3))) ^ T1(__byte_perm(state[6].y,0,0x44440 + (i&3))) + ^ T2(__byte_perm(state[5].y,0,0x44440 + (i&3))) ^ T3(__byte_perm(state[4].y,0,0x44440 + (i&3))) + ^ T4(__byte_perm(state[3].y,0,0x44440 + (i&3))) ^ T5(__byte_perm(state[2].y,0,0x44440 + (i&3))) + ^ T6(__byte_perm(state[1].y,0,0x44440 + (i&3))) ^ T7(__byte_perm(state[0].y,0,0x44440 + (i&3))); + } +} + +__device__ __forceinline__ +static void GOST_E12(uint2* K, uint2* state,const uint2 shared[8][256]){ + + uint2 t[ 8]; + + #pragma unroll 2 + for(uint32_t i=0; i<12; i++){ + GOST_FS(state, t, 
shared); + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + state[ j] = t[ j]; + K[ j] ^= *(uint2*)&CC[i][j]; + } + + GOST_FS(K, t, shared); + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + K[ j] = t[ j]; + state[ j] = state[ j] ^ t[ j]; + } + } +} + +#define TPB 128 +__global__ +__launch_bounds__(TPB, 4) +void streebog_gpu_hash_64(uint64_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint2 buf[8], t[8], temp[8],K0[8], hash[8]; + + __shared__ uint2 shared[8][256]; + shared[0][threadIdx.x] = T02[threadIdx.x]; + shared[1][threadIdx.x] = T12[threadIdx.x]; + shared[2][threadIdx.x] = T22[threadIdx.x]; + shared[3][threadIdx.x] = T32[threadIdx.x]; + shared[4][threadIdx.x] = T42[threadIdx.x]; + shared[5][threadIdx.x] = T52[threadIdx.x]; + shared[6][threadIdx.x] = T62[threadIdx.x]; + shared[7][threadIdx.x] = T72[threadIdx.x]; + + const uint32_t t2 = (threadIdx.x & 0x7f) + 0x80; + shared[0][t2] = T02[t2]; shared[1][t2] = T12[t2]; + shared[2][t2] = T22[t2]; shared[3][t2] = T32[t2]; + shared[4][t2] = T42[t2]; shared[5][t2] = T52[t2]; + shared[6][t2] = T62[t2]; shared[7][t2] = T72[t2]; + + __syncthreads(); +// if (thread < threads) +// { + uint64_t* inout = &g_hash[thread<<3]; + + *(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]); + *(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]); + + K0[0] = K0[1] = K0[2] = K0[3] = K0[4] = K0[5] = K0[6] = K0[7] = vectorize(0x74a5d4ce2efc83b3); + + #pragma unroll 8 + for(uint32_t i=0;i<8;i++){ + buf[ i] = hash[ i] ^ K0[ i]; + } + + #pragma unroll 11 + for(uint32_t i=0; i<11; i++){ + GOST_FS(buf, temp, shared); + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + buf[ j] = temp[ j] ^ *(uint2*)&precomputed_values[i][j]; + } + } + GOST_FS(buf, temp, shared); + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + buf[ j] = hash[ j] ^ temp[ j] ^ *(uint2*)&precomputed_values[11][j]; + K0[ j] = buf[ j]; + } + + K0[7].y ^= 0x00020000; + + GOST_FS(K0, t, shared); + + #pragma unroll 8 + for(uint32_t i=0;i<8;i++) + K0[ i] = t[ i]; + + t[7].y ^= 0x01000000; + + #pragma unroll 1 + for(uint32_t i=0; i<11; i++){ + GOST_FS(t, temp, shared); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + t[ j] = temp[ j]; + K0[ j] = K0[ j] ^ *(uint2*)&CC[i][j]; + } + + GOST_FS(K0, temp, shared); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + K0[ j] = temp[ j]; + t[ j] ^= temp[ j]; + } + } + GOST_FS(t, temp, shared); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + t[ j] = temp[ j]; + K0[ j] = K0[ j] ^ *(uint2*)&CC[11][j]; + } + + GOST_FS(K0, temp, shared); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + t[ j] ^= temp[ j]; + } + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++) + buf[ j] ^= t[ j]; + + buf[7].y ^= 0x01000000; + + GOST_FS(buf,K0, shared); + + buf[7].y ^= 0x00020000; + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++) + t[ j] = K0[ j]; + + t[7].y ^= 0x00020000; + + GOST_E12(K0, t, shared); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++) + buf[ j] ^= t[ j]; + + GOST_FS(buf,K0, shared); // K = F(h) + + hash[7]+= vectorize(0x0100000000000000); + + #pragma unroll 8 + for(uint32_t j=0;j<8;j++) + t[ j] = K0[ j] ^ hash[ j]; + + GOST_E12(K0, t, shared); + + *(ulonglong2to8*)&t[ 0] = *(ulonglong2to8*)&t[ 0] ^ *(ulonglong2to8*)&hash[ 0] ^ *(ulonglong2to8*)&buf[ 0]; + *(uint2x4*)&inout[ 0] = *(uint2x4*)&t[ 0]; + *(uint2x4*)&inout[ 4] = *(uint2x4*)&t[ 4]; +} + +__host__ +void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) +{ + dim3 grid((threads + TPB-1) / TPB); + dim3 block(TPB); + if(((float)((threads + TPB-1) / TPB) - (int)((threads + 
TPB-1) / TPB))!=0.0){
+		applog(LOG_WARNING,"Invalid intensity for streebog");
+	}
+
+	streebog_gpu_hash_64<<<grid, block>>>((uint64_t*)d_hash);
+}
+
+#undef T0
+#undef T1
+#undef T2
+#undef T3
+#undef T4
+#undef T5
+#undef T6
+#undef T7
+
+#define T0(x) shared[0][x]
+#define T1(x) shared[1][x]
+#define T2(x) shared[2][x]
+#define T3(x) shared[3][x]
+#define T4(x) shared[4][x]
+#define T5(x) shared[5][x]
+#define T6(x) shared[6][x]
+#define T7(x) shared[7][x]
+
+// Streebog final for Veltor and skunk on SM 3.x
+__constant__ uint64_t target64[4];
+
+__host__
+void streebog_sm3_set_target(uint32_t* ptarget)
+{
+	cudaMemcpyToSymbol(target64,ptarget,4*sizeof(uint64_t),0,cudaMemcpyHostToDevice);
+}
+
+// Final kernel: same Streebog compression as above, but it ends with a target test on
+// one 64-bit output word and keeps up to two candidate thread ids in resNonce.
+__global__
+__launch_bounds__(TPB, 4)
+void streebog_gpu_hash_64_final(uint64_t *g_hash, uint32_t* resNonce)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint2 buf[8], t[8], temp[8],K0[8], hash[8];
+
+	__shared__ uint2 shared[8][256];
+	shared[0][threadIdx.x] = T02[threadIdx.x];
+	shared[1][threadIdx.x] = T12[threadIdx.x];
+	shared[2][threadIdx.x] = T22[threadIdx.x];
+	shared[3][threadIdx.x] = T32[threadIdx.x];
+	shared[4][threadIdx.x] = T42[threadIdx.x];
+	shared[5][threadIdx.x] = T52[threadIdx.x];
+	shared[6][threadIdx.x] = T62[threadIdx.x];
+	shared[7][threadIdx.x] = T72[threadIdx.x];
+
+	const uint32_t t2 = (threadIdx.x & 0x7f) + 0x80;
+	shared[0][t2] = T02[t2]; shared[1][t2] = T12[t2];
+	shared[2][t2] = T22[t2]; shared[3][t2] = T32[t2];
+	shared[4][t2] = T42[t2]; shared[5][t2] = T52[t2];
+	shared[6][t2] = T62[t2]; shared[7][t2] = T72[t2];
+
+	__syncthreads();
+//	if (thread < threads)
+//	{
+	uint64_t* inout = &g_hash[thread<<3];
+
+	*(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
+	*(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);
+
+	K0[0] = K0[1] = K0[2] = K0[3] = K0[4] = K0[5] = K0[6] = K0[7] = vectorize(0x74a5d4ce2efc83b3);
+
+	#pragma unroll 8
+	for(uint32_t i=0;i<8;i++){
+		buf[ i] = hash[ i] ^ K0[ i];
+	}
+
+	#pragma unroll 11
+	for(uint32_t i=0; i<11; i++){
+		GOST_FS(buf, temp, shared);
+		#pragma unroll 8
+		for(uint32_t j=0;j<8;j++){
+			buf[ j] = temp[ j] ^ *(uint2*)&precomputed_values[i][j];
+		}
+	}
+	GOST_FS(buf, temp, shared);
+	#pragma unroll 8
+	for(uint32_t j=0;j<8;j++){
+		buf[ j] = hash[ j] ^ temp[ j] ^ *(uint2*)&precomputed_values[11][j];
+		K0[ j] = buf[ j];
+	}
+
+	K0[7].y ^= 0x00020000;
+
+	GOST_FS(K0, t, shared);
+
+	#pragma unroll 8
+	for(uint32_t i=0;i<8;i++)
+		K0[ i] = t[ i];
+
+	t[7].y ^= 0x01000000;
+
+	#pragma unroll 1
+	for(uint32_t i=0; i<11; i++){
+		GOST_FS(t, temp, shared);
+
+		#pragma unroll 8
+		for(uint32_t j=0;j<8;j++){
+			t[ j] = temp[ j];
+			K0[ j] = K0[ j] ^ *(uint2*)&CC[i][j];
+		}
+
+		GOST_FS(K0, temp, shared);
+
+		#pragma unroll 8
+		for(uint32_t j=0;j<8;j++){
+			K0[ j] = temp[ j];
+			t[ j] ^= temp[ j];
+		}
+	}
+	GOST_FS(t, temp, shared);
+
+	#pragma unroll 8
+	for(uint32_t j=0;j<8;j++){
+		t[ j] = temp[ j];
+		K0[ j] = K0[ j] ^ *(uint2*)&CC[11][j];
+	}
+
+	GOST_FS(K0, temp, shared);
+
+	#pragma unroll 8
+	for(uint32_t j=0;j<8;j++){
+		t[ j] ^= temp[ j];
+	}
+
+	#pragma unroll 8
+	for(uint32_t j=0;j<8;j++)
+		buf[ j] ^= t[ j];
+
+	buf[7].y ^= 0x01000000;
+
+	GOST_FS(buf,K0, shared);
+
+	buf[7].y ^= 0x00020000;
+
+	#pragma unroll 8
+	for(uint32_t j=0;j<8;j++)
+		t[ j] = K0[ j];
+
+	t[7].y ^= 0x00020000;
+
+	GOST_E12(K0, t, shared);
+
+	#pragma unroll 8
+	for(uint32_t j=0;j<8;j++)
+		buf[ j] ^= t[ j];
+
+	GOST_FS(buf,K0, shared); // K = F(h)
+
+	hash[7]+= vectorize(0x0100000000000000);
+
+	#pragma unroll 8
+	for(uint32_t j=0;j<8;j++)
+		t[ j] = K0[ j] ^ hash[ j];
+
+//	#pragma unroll
+	for(uint32_t i=0; i<10; i++){
+		GOST_FS(t, temp, shared);
+
+		#pragma unroll 8
+		for(uint32_t j=0;j<8;j++){
+			t[ j] = temp[ j];
+			K0[ j] = K0[ j] ^ *(uint2*)&CC[ i][ j];
+		}
+
+		GOST_FS(K0, temp, shared);
+
+		#pragma unroll 8
+		for(uint32_t j=0;j<8;j++){
+			K0[ j] = temp[ j];
+			t[ j]^= temp[ j];
+		}
+	}
+
+	GOST_FS(t, temp, shared);
+
+	#pragma unroll 8
+	for(uint32_t j=0;j<8;j++){
+		t[ j] = temp[ j];
+		K0[ j] = K0[ j] ^ *(uint2*)&CC[10][ j];
+	}
+
+	GOST_FS(K0, temp, shared);
+
+	#pragma unroll 8
+	for(int i=7;i>=0;i--){
+		t[i].x = t[i].x ^ temp[i].x;
+		temp[i].x = temp[i].x ^ ((uint32_t*)&CC[11])[i<<1];
+	}
+
+	uint2 last[2];
+
+	last[ 0] = T0(__byte_perm(t[7].x,0,0x44443)) ^ T1(__byte_perm(t[6].x,0,0x44443))
+		^ T2(__byte_perm(t[5].x,0,0x44443)) ^ T3(__byte_perm(t[4].x,0,0x44443))
+		^ T4(__byte_perm(t[3].x,0,0x44443)) ^ T5(__byte_perm(t[2].x,0,0x44443))
+		^ T6(__byte_perm(t[1].x,0,0x44443)) ^ T7(__byte_perm(t[0].x,0,0x44443));
+
+	last[ 1] = T0(__byte_perm(temp[7].x,0,0x44443)) ^ T1(__byte_perm(temp[6].x,0,0x44443))
+		^ T2(__byte_perm(temp[5].x,0,0x44443)) ^ T3(__byte_perm(temp[4].x,0,0x44443))
+		^ T4(__byte_perm(temp[3].x,0,0x44443)) ^ T5(__byte_perm(temp[2].x,0,0x44443))
+		^ T6(__byte_perm(temp[1].x,0,0x44443)) ^ T7(__byte_perm(temp[0].x,0,0x44443));
+
+	if(devectorize(buf[3] ^ hash[3] ^ last[ 0] ^ last[ 1]) <= target64[3]){
+		uint32_t tmp = atomicExch(&resNonce[0], thread);
+		if (tmp != UINT32_MAX)
+			resNonce[1] = tmp;
+	}
+}
+
+__host__
+void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce)
+{
+	dim3 grid((threads + TPB-1) / TPB);
+	dim3 block(TPB);
+	if(((float)((threads + TPB-1) / TPB) - (int)((threads + TPB-1) / TPB))!=0.0){
+		applog(LOG_WARNING,"Invalid intensity for streebog final");
+	}
+
+	streebog_gpu_hash_64_final<<<grid, block>>>((uint64_t*)d_hash, d_resNonce);
+}
diff --git a/x11/cuda_streebog_maxwell.cu b/x11/cuda_streebog_maxwell.cu
new file mode 100644
index 0000000000..4ff580b15e
--- /dev/null
+++ b/x11/cuda_streebog_maxwell.cu
@@ -0,0 +1,320 @@
+/*
+ * Streebog GOST R 34.10-2012 CUDA implementation.
+ * + * https://tools.ietf.org/html/rfc6986 + * https://en.wikipedia.org/wiki/Streebog + * + * ==========================(LICENSE BEGIN)============================ + * + * @author Tanguy Pruvot - 2015 + * @author Alexis Provos - 2016 + */ + +// Further improved with shared memory partial utilization +// Tested under CUDA7.5 toolkit for cp 5.0/5.2 + +//#include +#include +#include +#include + +#include "streebog_arrays.cuh" + +//#define FULL_UNROLL +__device__ __forceinline__ +static void GOST_FS(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state) +{ + return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].x,0,0x44440)] + ^ shared[2][__byte_perm(state[5].x,0,0x44440)] + ^ shared[3][__byte_perm(state[4].x,0,0x44440)] + ^ shared[4][__byte_perm(state[3].x,0,0x44440)] + ^ shared[5][__byte_perm(state[2].x,0,0x44440)] + ^ shared[6][__byte_perm(state[1].x,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]); + + return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44441)] + ^ shared[3][__byte_perm(state[4].x,0,0x44441)] + ^ shared[4][__byte_perm(state[3].x,0,0x44441)] + ^ shared[5][__byte_perm(state[2].x,0,0x44441)] + ^ shared[6][__byte_perm(state[1].x,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]); + + return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44442)] + ^ shared[3][__byte_perm(state[4].x,0,0x44442)] + ^ shared[4][__byte_perm(state[3].x,0,0x44442)] + ^ shared[5][__byte_perm(state[2].x,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44442)]; + + return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)]) + ^ shared[1][__byte_perm(state[6].x,0,0x44443)] + ^ shared[2][__byte_perm(state[5].x,0,0x44443)] + ^ shared[3][__byte_perm(state[4].x,0,0x44443)] + ^ __ldg(&T42[__byte_perm(state[3].x,0,0x44443)]) + ^ shared[5][__byte_perm(state[2].x,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44443)]; + + return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44440)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44440)] + ^ shared[4][__byte_perm(state[3].y,0,0x44440)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]) + ^ shared[5][__byte_perm(state[2].y,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]); + + return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].y,0,0x44441)] + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44441)] + ^ shared[4][__byte_perm(state[3].y,0,0x44441)] + ^ shared[5][__byte_perm(state[2].y,0,0x44441)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]); + + return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44442)] + ^ shared[2][__byte_perm(state[5].y,0,0x44442)] + ^ shared[3][__byte_perm(state[4].y,0,0x44442)] + ^ shared[4][__byte_perm(state[3].y,0,0x44442)] + ^ shared[5][__byte_perm(state[2].y,0,0x44442)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]); + + return_state[7] = 
__ldg(&T02[__byte_perm(state[7].y,0,0x44443)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44443)]) + ^ shared[2][__byte_perm(state[5].y,0,0x44443)] + ^ shared[3][__byte_perm(state[4].y,0,0x44443)] + ^ shared[4][__byte_perm(state[3].y,0,0x44443)] + ^ shared[5][__byte_perm(state[2].y,0,0x44443)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]); +} + +__device__ __forceinline__ +static void GOST_FS_LDG(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state) +{ + return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44440)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44440)] + ^ shared[3][__byte_perm(state[4].x,0,0x44440)] + ^ shared[4][__byte_perm(state[3].x,0,0x44440)] + ^ shared[5][__byte_perm(state[2].x,0,0x44440)] + ^ shared[6][__byte_perm(state[1].x,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]); + + return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44441)] + ^ shared[3][__byte_perm(state[4].x,0,0x44441)] + ^ shared[4][__byte_perm(state[3].x,0,0x44441)] + ^ shared[5][__byte_perm(state[2].x,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44441)]; + + return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44442)] + ^ shared[3][__byte_perm(state[4].x,0,0x44442)] + ^ shared[4][__byte_perm(state[3].x,0,0x44442)] + ^ shared[5][__byte_perm(state[2].x,0,0x44442)] + ^ shared[6][__byte_perm(state[1].x,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]); + + return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44443)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44443)] + ^ shared[3][__byte_perm(state[4].x,0,0x44443)] + ^ shared[4][__byte_perm(state[3].x,0,0x44443)] + ^ shared[5][__byte_perm(state[2].x,0,0x44443)] + ^ shared[6][__byte_perm(state[1].x,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]); + + return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44440)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44440)] + ^ shared[4][__byte_perm(state[3].y,0,0x44440)] + ^ shared[5][__byte_perm(state[2].y,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]); + + return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].y,0,0x44441)] + ^ shared[3][__byte_perm(state[4].y,0,0x44441)] + ^ shared[4][__byte_perm(state[3].y,0,0x44441)] + ^ shared[5][__byte_perm(state[2].y,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]); + + return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44442)]) + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44442)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44442)] + ^ shared[4][__byte_perm(state[3].y,0,0x44442)] + ^ shared[5][__byte_perm(state[2].y,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]); + + return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)]) + ^ 
shared[1][__byte_perm(state[6].y,0,0x44443)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44443)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44443)] + ^ shared[4][__byte_perm(state[3].y,0,0x44443)] + ^ shared[5][__byte_perm(state[2].y,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]); +} + +__device__ __forceinline__ +static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uint2 *const __restrict__ state) +{ + uint2 t[8]; + for(int i=0; i<12; i++){ + GOST_FS(shared,state, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + K[ j] ^= *(uint2*)&CC[i][j]; + + #pragma unroll 8 + for(int j=0;j<8;j++) + state[ j] = t[ j]; + + GOST_FS_LDG(shared,K, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + state[ j]^= t[ j]; + + #pragma unroll 8 + for(int j=0;j<8;j++) + K[ j] = t[ j]; + } +} + +#define TPB 256 +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(TPB, 3) +#else +__launch_bounds__(TPB, 3) +#endif +void streebog_gpu_hash_64_sm5(uint64_t *g_hash, uint32_t* const d_filter, const uint32_t filter_val) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint2 buf[8], t[8], temp[8], K0[8], hash[8]; + + __shared__ uint2 shared[8][256]; + shared[0][threadIdx.x] = __ldg(&T02[threadIdx.x]); + shared[1][threadIdx.x] = __ldg(&T12[threadIdx.x]); + shared[2][threadIdx.x] = __ldg(&T22[threadIdx.x]); + shared[3][threadIdx.x] = __ldg(&T32[threadIdx.x]); + shared[4][threadIdx.x] = __ldg(&T42[threadIdx.x]); + shared[5][threadIdx.x] = __ldg(&T52[threadIdx.x]); + shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]); + shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]); + + //__threadfence_block(); + __syncthreads(); + + if (d_filter && d_filter[thread] != filter_val) return; + + uint64_t* inout = &g_hash[thread<<3]; + + *(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]); + *(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]); + + K0[0] = vectorize(0x74a5d4ce2efc83b3); + + #pragma unroll 8 + for(int i=0;i<8;i++){ + buf[ i] = K0[ 0] ^ hash[ i]; + } + + for(int i=0; i<12; i++){ + GOST_FS(shared, buf, temp); + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + buf[ j] = temp[ j] ^ *(uint2*)&precomputed_values[i][j]; + } + } + #pragma unroll 8 + for(int j=0;j<8;j++){ + buf[ j]^= hash[ j]; + } + #pragma unroll 8 + for(int j=0;j<8;j++){ + K0[ j] = buf[ j]; + } + + K0[7].y ^= 0x00020000; + + GOST_FS(shared, K0, t); + + #pragma unroll 8 + for(int i=0;i<8;i++) + K0[ i] = t[ i]; + + t[7].y ^= 0x01000000; + + GOST_E12(shared, K0, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + buf[ j] ^= t[ j]; + + buf[7].y ^= 0x01000000; + + GOST_FS(shared, buf,K0); + + buf[7].y ^= 0x00020000; + + #pragma unroll 8 + for(int j=0;j<8;j++) + t[ j] = K0[ j]; + + t[7].y ^= 0x00020000; + + GOST_E12(shared, K0, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + buf[ j] ^= t[ j]; + + GOST_FS(shared, buf,K0); // K = F(h) + + hash[7]+= vectorize(0x0100000000000000); + + #pragma unroll 8 + for(int j=0;j<8;j++) + t[ j] = K0[ j] ^ hash[ j]; + + GOST_E12(shared, K0, t); + + *(uint2x4*)&inout[0] = *(uint2x4*)&t[0] ^ *(uint2x4*)&hash[0] ^ *(uint2x4*)&buf[0]; + *(uint2x4*)&inout[4] = *(uint2x4*)&t[4] ^ *(uint2x4*)&hash[4] ^ *(uint2x4*)&buf[4]; +} + +__host__ +void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *g_hash) +{ + dim3 grid((threads + TPB-1) / TPB); + dim3 block(TPB); + streebog_gpu_hash_64_sm5 <<>> ((uint64_t*)g_hash, NULL, 0); +} + +__host__ +void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t 
*d_filter) +{ + dim3 grid((threads + TPB-1) / TPB); + dim3 block(TPB); + streebog_gpu_hash_64_sm5 <<>> ((uint64_t*)g_hash, d_filter, 1); +} diff --git a/x11/cuda_x11.h b/x11/cuda_x11.h new file mode 100644 index 0000000000..8552157941 --- /dev/null +++ b/x11/cuda_x11.h @@ -0,0 +1,21 @@ +#include "quark/cuda_quark.h" + +extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order); + +extern void x11_luffa512_cpu_init(int thr_id, uint32_t threads); +extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads); +extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); + +extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu deleted file mode 100644 index 97cd1dd829..0000000000 --- a/x11/cuda_x11_aes.cu +++ /dev/null @@ -1,402 +0,0 @@ - -/* AES Helper for inline-usage from SPH */ -#define AESx(x) SPH_C32(x) - -static const uint32_t h_AES0[256] = { - AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), - AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), - AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), - AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), - AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), - AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), - AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), - AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), - AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), - AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), - AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), - AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), - AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), - AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), - AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), - AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), - AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), - AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), - AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), - AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), - AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), - AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), - AESx(0xBE6A6AD4), 
AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), - AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), - AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), - AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), - AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), - AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), - AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), - AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), - AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), - AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), - AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), - AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), - AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), - AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), - AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), - AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), - AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), - AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), - AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), - AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), - AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), - AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), - AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), - AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), - AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), - AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), - AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), - AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), - AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), - AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), - AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), - AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), - AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), - AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), - AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), - AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), - AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), - AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), - AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), - AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), - AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), - AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) -}; - -static const uint32_t h_AES1[256] = { - AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), - AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), - AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), - AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), - AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), - AESx(0xFAFAEF15), AESx(0x5959B2EB), 
AESx(0x47478EC9), AESx(0xF0F0FB0B), - AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), - AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), - AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), - AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), - AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), - AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), - AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), - AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), - AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), - AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), - AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), - AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), - AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), - AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), - AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), - AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), - AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), - AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), - AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), - AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), - AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), - AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), - AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), - AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), - AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), - AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), - AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), - AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), - AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), - AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), - AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), - AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), - AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), - AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), - AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), - AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), - AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), - AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), - AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), - AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), - AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), - AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), - AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), - AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), - AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), - AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), - AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), - AESx(0x484890D8), AESx(0x03030605), 
AESx(0xF6F6F701), AESx(0x0E0E1C12), - AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), - AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), - AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), - AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), - AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), - AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), - AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), - AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), - AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), - AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) -}; - -static const uint32_t h_AES2[256] = { - AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), - AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), - AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), - AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), - AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), - AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), - AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), - AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), - AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), - AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), - AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), - AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), - AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), - AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), - AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), - AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), - AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), - AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), - AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), - AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), - AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), - AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), - AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), - AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), - AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), - AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), - AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), - AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), - AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), - AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), - AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), - AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), - AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), - AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), - AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), - AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), - AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), 
AESx(0xDCA37FDC), - AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), - AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), - AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), - AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), - AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), - AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), - AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), - AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), - AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), - AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), - AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), - AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), - AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), - AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), - AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), - AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), - AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), - AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), - AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), - AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), - AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), - AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), - AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), - AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), - AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), - AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), - AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) -}; - -static const uint32_t h_AES3[256] = { - AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), - AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), - AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), - AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), - AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), - AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), - AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), - AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), - AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), - AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), - AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), - AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), - AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), - AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), - AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), - AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), - AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), - AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), - AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), - AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), - 
AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), - AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), - AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), - AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), - AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), - AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), - AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), - AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), - AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), - AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), - AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), - AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), - AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), - AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), - AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), - AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), - AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), - AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), - AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), - AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), - AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), - AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), - AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), - AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), - AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), - AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), - AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), - AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), - AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), - AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), - AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), - AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), - AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), - AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), - AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), - AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), - AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), - AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), - AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), - AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), - AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), - AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), - AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), - AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) -}; - -static __constant__ uint32_t d_AES0[256]; -static __constant__ uint32_t d_AES1[256]; -static __constant__ uint32_t d_AES2[256]; -static __constant__ uint32_t d_AES3[256]; - -static void aes_cpu_init() -{ - cudaMemcpyToSymbol( d_AES0, - h_AES0, - sizeof(h_AES0), - 0, cudaMemcpyHostToDevice); - 
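/*
 * Each of the four cudaMemcpyToSymbol() calls in this init copies one
 * 256-entry (1 KB) AES T-table into device __constant__ memory; at kernel
 * start, aes_gpu_init() then stages the same 4 KB into shared memory once
 * per block. The replacement header added later in this diff performs the
 * same copies but wraps them in the project's CUDA_CALL_OR_RET error-check
 * macro. A minimal standalone sketch of that upload pattern follows; the
 * table and helper names and the error handling are illustrative only, not
 * part of the patch:
 */
#include <stdint.h>
#include <stdio.h>
#include <cuda_runtime.h>

__constant__ uint32_t d_T0[256];
static const uint32_t h_T0[256] = { 0xA56363C6u /* ... 255 more precomputed entries ... */ };

static void upload_t_table(void)
{
	/* arguments: device symbol, host source, byte count, byte offset, direction */
	cudaError_t err = cudaMemcpyToSymbol(d_T0, h_T0, sizeof(h_T0), 0, cudaMemcpyHostToDevice);
	if (err != cudaSuccess)
		fprintf(stderr, "T-table upload failed: %s\n", cudaGetErrorString(err));
}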
- cudaMemcpyToSymbol( d_AES1, - h_AES1, - sizeof(h_AES1), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( d_AES2, - h_AES2, - sizeof(h_AES2), - 0, cudaMemcpyHostToDevice); - - cudaMemcpyToSymbol( d_AES3, - h_AES3, - sizeof(h_AES3), - 0, cudaMemcpyHostToDevice); -} - -static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory) -{ - if(threadIdx.x < 256) - { - sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; - sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x]; - sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x]; - sharedMemory[threadIdx.x+768] = d_AES3[threadIdx.x]; - } -} - -static __device__ __forceinline__ void aes_round( - const uint32_t *sharedMemory, - uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, - uint32_t k0, - uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) -{ - uint32_t idx0, idx1, idx2, idx3; - - idx0 = __byte_perm(x0, 0, 0x4440); - idx1 = __byte_perm(x1, 0, 0x4441) + 256; - idx2 = __byte_perm(x2, 0, 0x4442) + 512; - idx3 = __byte_perm(x3, 0, 0x4443) + 768; - y0 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3] ^ - k0; - - idx0 = __byte_perm(x1, 0, 0x4440); - idx1 = __byte_perm(x2, 0, 0x4441) + 256; - idx2 = __byte_perm(x3, 0, 0x4442) + 512; - idx3 = __byte_perm(x0, 0, 0x4443) + 768; - - y1 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k3 - - idx0 = __byte_perm(x2, 0, 0x4440); - idx1 = __byte_perm(x3, 0, 0x4441) + 256; - idx2 = __byte_perm(x0, 0, 0x4442) + 512; - idx3 = __byte_perm(x1, 0, 0x4443) + 768; - y2 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k2 - - idx0 = __byte_perm(x3, 0, 0x4440); - idx1 = __byte_perm(x0, 0, 0x4441) + 256; - idx2 = __byte_perm(x1, 0, 0x4442) + 512; - idx3 = __byte_perm(x2, 0, 0x4443) + 768; - y3 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k3 -} - -static __device__ __forceinline__ void aes_round( - const uint32_t *sharedMemory, - uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, - uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) -{ - uint32_t idx0, idx1, idx2, idx3; - - idx0 = __byte_perm(x0, 0, 0x4440); - idx1 = __byte_perm(x1, 0, 0x4441) + 256; - idx2 = __byte_perm(x2, 0, 0x4442) + 512; - idx3 = __byte_perm(x3, 0, 0x4443) + 768; - y0 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; - - idx0 = __byte_perm(x1, 0, 0x4440); - idx1 = __byte_perm(x2, 0, 0x4441) + 256; - idx2 = __byte_perm(x3, 0, 0x4442) + 512; - idx3 = __byte_perm(x0, 0, 0x4443) + 768; - - y1 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k3 - - idx0 = __byte_perm(x2, 0, 0x4440); - idx1 = __byte_perm(x3, 0, 0x4441) + 256; - idx2 = __byte_perm(x0, 0, 0x4442) + 512; - idx3 = __byte_perm(x1, 0, 0x4443) + 768; - y2 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k2 - - idx0 = __byte_perm(x3, 0, 0x4440); - idx1 = __byte_perm(x0, 0, 0x4441) + 256; - idx2 = __byte_perm(x1, 0, 0x4442) + 512; - idx3 = __byte_perm(x2, 0, 0x4443) + 768; - y3 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k3 -} diff --git a/x11/cuda_x11_aes.cuh b/x11/cuda_x11_aes.cuh new file mode 100644 index 0000000000..362dd9c15a --- /dev/null +++ b/x11/cuda_x11_aes.cuh @@ -0,0 +1,400 @@ + +/* AES Helper for inline-usage from SPH */ +#define AESx(x) (x ##UL) /* SPH_C32(x) */ + +//#define DEVICE_DIRECT_CONSTANTS + +#ifdef 
DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES0[256] = { +#else +static const uint32_t h_AES0[256] = { +#endif + AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), + AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), + AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), + AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), + AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), + AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), + AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), + AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), + AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), + AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), + AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), + AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), + AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), + AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), + AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), + AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), + AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), + AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), + AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), + AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), + AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), + AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), + AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), + AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), + AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), + AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), + AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), + AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), + AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), + AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), + AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), + AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), + AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), + AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), + AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), + AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), + AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), + AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), + AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), + AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), + AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), 
AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES1[256] = { +#else +static const uint32_t h_AES1[256] = { +#endif + AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), + AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), 
AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES2[256] = { +#else +static const uint32_t h_AES2[256] = { +#endif + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), 
AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), 
AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ __align__(64) uint32_t d_AES3[256] = { +#else +static const uint32_t h_AES3[256] = { +#endif + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), + AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), + AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), 
AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + +#ifndef DEVICE_DIRECT_CONSTANTS +static __constant__ __align__(64) uint32_t d_AES0[256]; +static __constant__ __align__(64) uint32_t d_AES1[256]; +static __constant__ __align__(64) uint32_t d_AES2[256]; +static __constant__ __align__(64) uint32_t d_AES3[256]; + +static void aes_cpu_init(int thr_id) +{ + CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES0, + h_AES0, + sizeof(h_AES0), + 0, cudaMemcpyHostToDevice)); + + CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES1, + h_AES1, + sizeof(h_AES1), + 0, cudaMemcpyHostToDevice)); + + CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES2, + h_AES2, + sizeof(h_AES2), + 0, cudaMemcpyHostToDevice)); + + CUDA_CALL_OR_RET(cudaMemcpyToSymbol( d_AES3, + h_AES3, + sizeof(h_AES3), + 0, cudaMemcpyHostToDevice)); +} +#else +static void aes_cpu_init(int thr_id) {} +#endif + +__device__ __forceinline__ +void aes_gpu_init(uint32_t *sharedMemory) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x]; + sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x]; + sharedMemory[threadIdx.x+768] = d_AES3[threadIdx.x]; + } +} + +/* tried with 3 xor.b32 asm, not faster */ +#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d)); + +__device__ +static void aes_round( + const uint32_t *sharedMemory, + uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0, + uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + 
sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 + + y0 ^= k0; + + y3 = xor4_32( + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 +} + +__device__ +static void aes_round( + const uint32_t *sharedMemory, + uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, + uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 + + y3 = xor4_32( + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 +} diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu index 602e7c4197..b5aa534177 100644 --- a/x11/cuda_x11_cubehash512.cu +++ b/x11/cuda_x11_cubehash512.cu @@ -1,46 +1,34 @@ -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; - -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return __byte_perm(x, 0, 0x0123); -} - -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; +#include +#include #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ -typedef unsigned int uint32_t; /* must be exactly 32 bits */ +#if __CUDA_ARCH__ < 350 +#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) +#else +#define LROT(x, bits) __funnelshift_l(x, x, bits) +#endif + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) -#define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25)) -#define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21)) #define SWAP(a,b) { uint32_t u = a; a = b; b = u; } -__constant__ uint32_t c_IV_512[32]; -static const uint32_t h_IV_512[32] = { - 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, - 0x4167D83E, 0x3FEE2313, 0xC701CF8C, - 0xCC39968E, 0x50AC5695, 0x4D42C787, - 0xA647A8B3, 0x97CF0BEF, 0x825B4537, - 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, - 0xA23911AE, 0xFCD398D9, 0x148FE485, - 0x1B017BEF, 0xB6444532, 0x6A536159, - 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, - 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, - 0xBC796576, 0x1921C8F7, 0xE7989AF1, - 0x7795D246, 0xD43E3B44 +__device__ __constant__ +static const uint32_t c_IV_512[32] = { + 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, + 0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695, + 0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537, + 
0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE, + 0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532, + 0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, + 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576, + 0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44 }; -static __device__ void rrounds(uint32_t x[2][2][2][2][2]) +__device__ __forceinline__ +static void rrounds(uint32_t x[2][2][2][2][2]) { int r; int j; @@ -156,158 +144,215 @@ static __device__ void rrounds(uint32_t x[2][2][2][2][2]) } } - -static __device__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2]) +__device__ __forceinline__ +static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2]) { - int k; - int l; - int m; - uint32_t *in = block; - -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][0][k][l][m] ^= *in++; + // read 32 bytes input from global mem with uint2 chunks + AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]); + AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]); + AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]); + AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]); } -static __device__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2]) +__device__ __forceinline__ +static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2]) { - int j; - int k; - int l; - int m; - uint32_t *out = hash; + // used to write final hash to global mem + AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]); + AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]); + AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]); + AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]); + AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]); + AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]); + AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]); + AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]); +} -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - *out++ = x[0][j][k][l][m]; +#define Init(x) \ + AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \ + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \ + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \ + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \ + AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \ + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \ + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \ + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \ + AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \ + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \ + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \ + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \ + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \ + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \ + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \ + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]); + +__device__ __forceinline__ +static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data) +{ + /* "xor the block into the first b bytes of the state" */ + block_tox(data, x); + /* "and then transform the state invertibly through r identical rounds" */ + rrounds(x); } -void __device__ Init(uint32_t x[2][2][2][2][2]) +__device__ __forceinline__ +static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) { - int i,j,k,l,m; -#if 0 - /* "the first three state words x_00000, x_00001, x_00010" */ - /* "are set to the integers h/8, b, r respectively." */ - /* "the remaining state words are set to 0." 
*/ -#pragma unroll 2 - for (i = 0;i < 2;++i) -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[i][j][k][l][m] = 0; - x[0][0][0][0][0] = 512/8; - x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES; - x[0][0][0][1][0] = CUBEHASH_ROUNDS; + /* "the integer 1 is xored into the last state word x_11111" */ + x[1][1][1][1][1] ^= 1; - /* "the state is then transformed invertibly through 10r identical rounds */ - for (i = 0;i < 10;++i) rrounds(x); -#else - uint32_t *iv = c_IV_512; + /* "the state is then transformed invertibly through 10r identical rounds" */ + #pragma unroll 10 + for (int i = 0; i < 10; i++) rrounds(x); -#pragma unroll 2 - for (i = 0;i < 2;++i) -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[i][j][k][l][m] = *iv++; -#endif + /* "output the first h/8 bytes of the state" */ + hash_fromx(hashval, x); } -void __device__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) -{ - /* "xor the block into the first b bytes of the state" */ - /* "and then transform the state invertibly through r identical rounds" */ - block_tox((uint32_t*)data, x); - rrounds(x); -} -void __device__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) +/***************************************************/ + +__global__ +void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { - int i; + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - /* "the integer 1 is xored into the last state word x_11111" */ - x[1][1][1][1][1] ^= 1; + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - /* "the state is then transformed invertibly through 10r identical rounds" */ -#pragma unroll 10 - for (i = 0;i < 10;++i) rrounds(x); + uint32_t x[2][2][2][2][2]; + Init(x); - /* "output the first h/8 bytes of the state" */ - hash_fromx((uint32_t*)hashval, x); -} + Update32(x, &Hash[0]); + Update32(x, &Hash[8]); + // Padding Block + uint32_t last[8]; + last[0] = 0x80; + #pragma unroll 7 + for (int i=1; i < 8; i++) last[i] = 0; + Update32(x, last); -/***************************************************/ -// Die Hash-Funktion -__global__ void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) + Final(x, Hash); + } +} + +__host__ +void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + size_t shared_size = 0; - uint32_t x[2][2][2][2][2]; - Init(x); + x11_cubehash512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); +} - // erste Hälfte des Hashes (32 bytes) - Update32(x, (const BitSequence*)Hash); +__host__ +void x11_cubehash512_cpu_init(int thr_id, uint32_t threads) { } - // zweite Hälfte des Hashes (32 bytes) - Update32(x, (const BitSequence*)(Hash+8)); - // Padding Block - uint32_t last[8]; - last[0] = 0x80; -#pragma unroll 7 - for (int i=1; i < 8; i++) last[i] = 0; - Update32(x, (const BitSequence*)last); +/***************************************************/ - Final(x, (BitSequence*)Hash); - } -} +/** + * Timetravel and x16 CUBEHASH-80 CUDA implementation + * by tpruvot@github - Jan 2017 / May 2018 + */ +__constant__ static uint32_t c_midstate128[32]; +__constant__ static uint32_t c_PaddedMessage80[20]; -// Setup-Funktionen -__host__ void x11_cubehash512_cpu_init(int thr_id, int threads) +#undef SPH_C32 +#undef SPH_C64 +#undef SPH_T32 +#undef SPH_T64 +#include "sph/sph_cubehash.h" + +__host__ +void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata) { - cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); + sph_cubehash512_context ctx_cubehash; + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64); +#ifndef NO_MIDSTATE + cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_PaddedMessage80, &endiandata[16], 16, 0, cudaMemcpyHostToDevice); +#else + cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +#endif } -__host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__global__ +void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash) { - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNounce + thread; + uint32_t message[8]; + uint32_t x[2][2][2][2][2]; +#ifdef NO_MIDSTATE + Init(x); + + // first 32 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[4]); + Update32(x, message); + + // second 32 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[8]); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[12]); + Update32(x, message); + + // last 16 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[16]); +#else + AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]); + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]); + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]); + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]); + AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_midstate128[ 8]); + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]); + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]); + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]); + + AS_UINT2(x[1][0][0][0]) = 
AS_UINT2(&c_midstate128[16]); + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]); + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]); + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]); + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]); + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]); + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]); + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]); + + // last 16 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]); +#endif + // nonce + Padding + message[3] = cuda_swab32(nonce); + message[4] = 0x80; + message[5] = 0; + message[6] = 0; + message[7] = 0; + Update32(x, message); + + uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]); + Final(x, output); + } +} - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; +__host__ +void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - x11_cubehash512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + cubehash512_gpu_hash_80 <<>> (threads, startNounce, (uint64_t*) d_hash); } diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index 443936eabb..fa5c4f7885 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -1,48 +1,19 @@ -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" - #include #include -// Folgende Definitionen später durch header ersetzen -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -// das Hi Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t HIWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2hiint(__longlong_as_double(x)); -#else - return (uint32_t)(x >> 32); -#endif -} - -// das Lo Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t LOWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2loint(__longlong_as_double(x)); -#else - return (uint32_t)(x & 0xFFFFFFFFULL); -#endif -} +#include "cuda_helper.h" -#define SPH_C64(x) ((uint64_t)(x ## ULL)) -#define SPH_C32(x) ((uint32_t)(x ## U)) +extern __device__ __device_builtin__ void __threadfence_block(void); -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -#include "cuda_x11_aes.cu" +#include "cuda_x11_aes.cuh" __device__ __forceinline__ void AES_2ROUND( const uint32_t* __restrict__ sharedMemory, uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, - uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3) + uint32_t &k0) { uint32_t y0, y1, y2, y3; - + aes_round(sharedMemory, x0, x1, x2, x3, k0, @@ -56,181 +27,293 @@ __device__ __forceinline__ void AES_2ROUND( k0++; } -__device__ __forceinline__ void cuda_echo_round( - const uint32_t *sharedMemory, - uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3, - uint32_t *W, int round) +__device__ __forceinline__ +void cuda_echo_round( + const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash) { - // W hat 16*4 als Abmaße - - // Big Sub Words -#pragma unroll 16 - for(int i=0;i<16;i++) + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //8-12 + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x65978b09, 
0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //21-25 + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, + 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //34-38 + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, + 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + //58-61 + }; + uint32_t k0; + uint32_t h[16]; + + #pragma unroll 16 + for (int i = 0; i < 16; i++) { - int idx = i<<2; // *4 - AES_2ROUND(sharedMemory, - W[idx+0], W[idx+1], W[idx+2], W[idx+3], - k0, k1, k2, k3); + h[i] = hash[i]; } - // Shift Rows -#pragma unroll 4 - for(int i=0;i<4;i++) + k0 = 512 + 8; + + #pragma unroll 4 + for (int idx = 0; idx < 16; idx += 4) { - uint32_t t; - - /// 1, 5, 9, 13 - t = W[4 + i]; - W[4 + i] = W[20 + i]; - W[20 + i] = W[36 + i]; - W[36 + i] = W[52 + i]; - W[52 + i] = t; - - // 2, 6, 10, 14 - t = W[8 + i]; - W[8 + i] = W[40 + i]; - W[40 + i] = t; - t = W[24 + i]; - W[24 + i] = W[56 + i]; - W[56 + i] = t; - - // 15, 11, 7, 3 - t = W[60 + i]; - W[60 + i] = W[44 + i]; - W[44 + i] = W[28 + i]; - W[28 + i] = W[12 + i]; - W[12 + i] = t; + AES_2ROUND(sharedMemory, + h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); } + k0 += 4; + + uint32_t W[64]; - // Mix Columns -#pragma unroll 4 - for(int i=0;i<4;i++) // Schleife über je 2*uint32_t + #pragma unroll 4 + for (int i = 0; i < 4; i++) { -#pragma unroll 4 - for(int j=0;j<4;j++) // Schleife über die elemnte - { - int idx = j<<2; // j*4 + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i + 8]; + uint32_t d = P[i + 8]; - uint32_t a = W[ ((idx + 0)<<2) + i]; - uint32_t b = W[ ((idx + 1)<<2) + i]; - uint32_t c = W[ ((idx + 2)<<2) + i]; - uint32_t d = W[ ((idx + 3)<<2) + i]; + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; - uint32_t ab = a ^ b; - uint32_t bc = b ^ c; - uint32_t cd = c ^ d; - uint32_t t; - t = ((ab & 0x80808080) >> 7); - uint32_t abx = t<<4 ^ t<<3 ^ t<<1 ^ t; - t = ((bc & 0x80808080) >> 7); - uint32_t bcx = t<<4 ^ t<<3 ^ t<<1 ^ t; - t = ((cd & 0x80808080) >> 7); - uint32_t cdx = t<<4 ^ t<<3 ^ t<<1 ^ t; - - abx ^= ((ab & 0x7F7F7F7F) << 1); - bcx ^= ((bc & 0x7F7F7F7F) << 1); - cdx ^= ((cd & 0x7F7F7F7F) << 1); - - W[ ((idx + 0)<<2) + i] = abx ^ bc ^ d; - W[ ((idx + 1)<<2) + i] = bcx ^ a ^ cd; - W[ ((idx + 2)<<2) + i] = cdx ^ ab ^ d; - W[ ((idx + 3)<<2) + i] = abx ^ bcx ^ cdx ^ ab ^ c; - } - } -} + uint32_t t = (ab & 0x80808080); + uint32_t t2 = (bc & 0x80808080); + uint32_t t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + + W[0 + i] = abx ^ bc ^ d; + W[0 + i + 4] = bcx ^ a ^ cd; + W[0 + i + 8] = cdx ^ ab ^ d; + W[0 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[12 + i]; + b = h[i + 4]; + c = P[12 + i + 4]; + d = P[12 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27 ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + + W[16 + i] = abx ^ bc ^ d; + W[16 + i + 4] = bcx ^ a ^ cd; + W[16 + i + 8] = cdx ^ ab ^ d; + W[16 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = h[i]; + b = P[24 + i + 0]; + c = P[24 + i + 4]; + d = P[24 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; -__global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, 
uint64_t *g_hash, uint32_t *g_nonceVector) -{ - __shared__ uint32_t sharedMemory[1024]; - aes_gpu_init(sharedMemory); + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + abx = (t >> 7) * 27 ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; + W[32 + i] = abx ^ bc ^ d; + W[32 + i + 4] = bcx ^ a ^ cd; + W[32 + i + 8] = cdx ^ ab ^ d; + W[32 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; - uint32_t W[64]; - uint32_t k0 = 512, k1 = 0, k2 = 0, k3 = 0; // K0 = bitlen - /* Initialisierung */ -#pragma unroll 8 - for(int i=0;i<32;i+=4) + a = P[36 + i ]; + b = P[36 + i + 4]; + c = P[36 + i + 8]; + d = h[i + 12]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27 ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + + W[48 + i] = abx ^ bc ^ d; + W[48 + i + 4] = bcx ^ a ^ cd; + W[48 + i + 8] = cdx ^ ab ^ d; + W[48 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + + } + + for (int k = 1; k < 10; k++) + { + + // Big Sub Words + #pragma unroll 4 + for (int idx = 0; idx < 64; idx += 16) { - W[i + 0] = 512; - W[i + 1] = 0; - W[i + 2] = 0; - W[i + 3] = 0; + AES_2ROUND(sharedMemory, + W[idx + 0], W[idx + 1], W[idx + 2], W[idx + 3], + k0); + AES_2ROUND(sharedMemory, + W[idx + 4], W[idx + 5], W[idx + 6], W[idx + 7], + k0); + AES_2ROUND(sharedMemory, + W[idx + 8], W[idx + 9], W[idx + 10], W[idx + 11], + k0); + AES_2ROUND(sharedMemory, + W[idx + 12], W[idx + 13], W[idx + 14], W[idx + 15], + k0); } - // kopiere 32-byte großen hash -#pragma unroll 16 - for(int i=0;i<16;i++) - W[i+32] = Hash[i]; - W[48] = 0x80; // fest -#pragma unroll 10 - for(int i=49;i<59;i++) - W[i] = 0; - W[59] = 0x02000000; // fest - W[60] = k0; // bitlen - W[61] = k1; - W[62] = k2; - W[63] = k3; - - for(int i=0;i<10;i++) + // Shift Rows + #pragma unroll 4 + for (int i = 0; i < 4; i++) { - cuda_echo_round(sharedMemory, k0, k1, k2, k3, W, i); + uint32_t t; + + /// 1, 5, 9, 13 + t = W[4 + i]; + W[4 + i] = W[20 + i]; + W[20 + i] = W[36 + i]; + W[36 + i] = W[52 + i]; + W[52 + i] = t; + + // 2, 6, 10, 14 + t = W[8 + i]; + W[8 + i] = W[40 + i]; + W[40 + i] = t; + t = W[24 + i]; + W[24 + i] = W[56 + i]; + W[56 + i] = t; + + // 15, 11, 7, 3 + t = W[60 + i]; + W[60 + i] = W[44 + i]; + W[44 + i] = W[28 + i]; + W[28 + i] = W[12 + i]; + W[12 + i] = t; } -#pragma unroll 8 - for(int i=0;i<32;i+=4) + // Mix Columns + #pragma unroll 4 + for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t { - W[i ] ^= W[32 + i ] ^ 512; - W[i+1] ^= W[32 + i + 1]; - W[i+2] ^= W[32 + i + 2]; - W[i+3] ^= W[32 + i + 3]; + #pragma unroll 4 + for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte + { + uint32_t a = W[idx + i]; + uint32_t b = W[idx + i + 4]; + uint32_t c = W[idx + i + 8]; + uint32_t d = W[idx + i + 12]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + + W[idx + i] = abx ^ bc ^ d; + W[idx + i + 4] = bcx ^ a ^ cd; 
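/*
 * Note on the arithmetic just above: with t = ab & 0x80808080, the pattern
 * (t >> 7) * 27 ^ ((ab ^ t) << 1) doubles all four AES bytes packed in one
 * 32-bit word at once (the GF(2^8) "xtime" step of MixColumns). Clearing
 * the high bits before the shift keeps carries from crossing byte lanes,
 * and (t >> 7) * 27 XORs the reduction constant 0x1B into every byte whose
 * high bit was set. A standalone sketch of the same trick, assuming the
 * surrounding file's uint32_t type; the helper name is illustrative and not
 * part of this patch:
 */
__device__ __forceinline__ uint32_t xtime4(const uint32_t x)
{
	const uint32_t hi = x & 0x80808080u;        /* high bit of each of the 4 bytes */
	return ((x ^ hi) << 1) ^ ((hi >> 7) * 27u); /* per-byte 2*x, reduced by 0x1B */
}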
+ W[idx + i + 8] = cdx ^ ab ^ d; + W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + } } + } -#pragma unroll 16 - for(int i=0;i<16;i++) - W[i] ^= Hash[i]; - - // tsiv: I feel iffy about removing this, but it seems to break the full hash - // fortunately for X11 the flipped bit lands outside the first 32 bytes used as the final X11 hash - // try chaining more algos after echo (X13) and boom - //W[8] ^= 0x10; - - W[27] ^= 0x02000000; - W[28] ^= k0; + #pragma unroll + for (int i = 0; i < 16; i += 4) + { + W[i] ^= W[32 + i] ^ 512; + W[i + 1] ^= W[32 + i + 1]; + W[i + 2] ^= W[32 + i + 2]; + W[i + 3] ^= W[32 + i + 3]; + } -#pragma unroll 16 - for(int i=0;i<16;i++) - Hash[i] = W[i]; - } + #pragma unroll + for (int i = 0; i < 16; i++) + hash[i] ^= W[i]; } -// Setup-Funktionen -__host__ void x11_echo512_cpu_init(int thr_id, int threads) +__device__ __forceinline__ +void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) { - aes_cpu_init(); + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 128) { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; + sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; + + sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; + } } -__host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__global__ __launch_bounds__(128, 7) /* will force 72 registers */ +void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { - const int threadsperblock = 256; + __shared__ uint32_t sharedMemory[1024]; + + echo_gpu_init(sharedMemory); + __threadfence_block(); + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + cuda_echo_round(sharedMemory, Hash); + } +} + +__host__ +void x11_echo512_cpu_init(int thr_id, uint32_t threads) +{ + aes_cpu_init(thr_id); +} + +__host__ +void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + const uint32_t threadsperblock = 128; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - x11_echo512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + x11_echo512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu index c1ca6c0da3..b2272ecba8 100644 --- a/x11/cuda_x11_luffa512.cu +++ b/x11/cuda_x11_luffa512.cu @@ -18,26 +18,15 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +#include "cuda_helper.h" typedef unsigned char BitSequence; -typedef unsigned char uint8_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - typedef struct { uint32_t buffer[8]; /* Buffer to be hashed */ uint32_t chainv[40]; /* Chaining values */ } hashState; - -static __device__ __forceinline__ uint32_t BYTES_SWAP32(uint32_t x) -{ - return __byte_perm(x, x, 0x0123); -} - #define MULT2(a,j)\ tmp = a[7+(8*j)];\ a[7+(8*j)] = a[6+(8*j)];\ @@ -49,11 +38,17 @@ static __device__ __forceinline__ uint32_t BYTES_SWAP32(uint32_t x) a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ a[0+(8*j)] = tmp; +#if __CUDA_ARCH__ < 350 +#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) +#else +#define LROT(x, bits) __funnelshift_l(x, x, bits) +#endif + #define TWEAK(a0,a1,a2,a3,j)\ - a0 = (a0<<(j))|(a0>>(32-j));\ - a1 = (a1<<(j))|(a1>>(32-j));\ - a2 = (a2<<(j))|(a2>>(32-j));\ - a3 = (a3<<(j))|(a3>>(32-j)); + a0 = LROT(a0,j);\ + a1 = LROT(a1,j);\ + a2 = LROT(a2,j);\ + a3 = LROT(a3,j); #define STEP(c0,c1)\ SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ @@ -85,20 +80,20 @@ static __device__ __forceinline__ uint32_t BYTES_SWAP32(uint32_t x) #define MIXWORD(a0,a4)\ a4 ^= a0;\ - a0 = (a0<<2) | (a0>>(30));\ + a0 = LROT(a0,2);\ a0 ^= a4;\ - a4 = (a4<<14) | (a4>>(18));\ + a4 = LROT(a4,14);\ a4 ^= a0;\ - a0 = (a0<<10) | (a0>>(22));\ + a0 = LROT(a0,10);\ a0 ^= a4;\ - a4 = (a4<<1) | (a4>>(31)); + a4 = LROT(a4,1); #define ADD_CONSTANT(a0,b0,c0,c1)\ a0 ^= c0;\ b0 ^= c1; /* initial values of chaining variables */ -__constant__ uint32_t c_IV[40]; +__device__ __constant__ uint32_t c_IV[40]; const uint32_t h_IV[40] = { 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, 0x6e292011,0x90152df4,0xee058139,0xdef610bb, @@ -111,8 +106,8 @@ const uint32_t h_IV[40] = { 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; -__constant__ uint32_t c_CNS[80]; -uint32_t h_CNS[80] = { +__device__ __constant__ uint32_t c_CNS[80]; +const uint32_t h_CNS[80] = { 
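// [editor's aside, not part of the patch] LROT above compiles to
// __funnelshift_l(x, x, bits) on sm_35+ and to a shift/or pair on older parts.
// Both paths agree for the shift amounts actually used here (1..31); note the
// plain C fallback would be undefined for bits == 0 because of the x >> 32.
// A host-side reference, handy when diffing kernel output against sph_luffa;
// rotl32_ref() is a hypothetical helper, not a symbol from this patch:
#include <stdint.h>

static inline uint32_t rotl32_ref(uint32_t x, unsigned bits)
{
	bits &= 31u;
	return bits ? (x << bits) | (x >> (32u - bits)) : x;
}
/* e.g. rotl32_ref(0x80000001u, 1) == 0x00000003u, matching LROT(x,1) on either path */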
0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, @@ -136,7 +131,8 @@ uint32_t h_CNS[80] = { /***************************************************/ -__device__ __forceinline__ void rnd512(hashState *state) +__device__ __forceinline__ +void rnd512(hashState *state) { int i,j; uint32_t t[40]; @@ -282,20 +278,22 @@ __device__ __forceinline__ void rnd512(hashState *state) } -__device__ __forceinline__ void Update512(hashState *state, const BitSequence *data) +__device__ __forceinline__ +void Update512(hashState *state, const BitSequence *data) { #pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]); + for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]); rnd512(state); #pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]); + for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]); rnd512(state); } /***************************************************/ -__device__ __forceinline__ void finalization512(hashState *state, uint32_t *b) +__device__ __forceinline__ +void finalization512(hashState *state, uint32_t *b) { int i,j; @@ -316,7 +314,7 @@ __device__ __forceinline__ void finalization512(hashState *state, uint32_t *b) for(j=0;j<5;j++) { b[i] ^= state->chainv[i+8*j]; } - b[i] = BYTES_SWAP32((b[i])); + b[i] = cuda_swab32((b[i])); } #pragma unroll 8 @@ -330,16 +328,16 @@ __device__ __forceinline__ void finalization512(hashState *state, uint32_t *b) for(j=0;j<5;j++) { b[8+i] ^= state->chainv[i+8*j]; } - b[8+i] = BYTES_SWAP32((b[8+i])); + b[8 + i] = cuda_swab32((b[8 + i])); } } /***************************************************/ // Die Hash-Funktion -__global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ void x11_luffa512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint32_t nounce = (g_nonceVector != NULL) ? 
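// [editor's aside, not part of the patch] cuda_swab32() (cuda_helper.h) replaces
// the local BYTES_SWAP32(); both just reverse the byte order of a 32-bit word.
// A portable host-side equivalent for unit checks; swab32_ref() is a
// hypothetical helper, not a symbol from this patch:
#include <stdint.h>

static inline uint32_t swab32_ref(uint32_t x)
{
	return (x >> 24) | ((x >> 8) & 0x0000ff00u)
	     | ((x << 8) & 0x00ff0000u) | (x << 24);
}
/* e.g. swab32_ref(0x01020304u) == 0x04030201u */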
g_nonceVector[thread] : (startNounce + thread); @@ -358,22 +356,23 @@ __global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint } -// Setup-Funktionen -__host__ void x11_luffa512_cpu_init(int thr_id, int threads) +// Setup Function +__host__ +void x11_luffa512_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyToSymbol( c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice ); - cudaMemcpyToSymbol( c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice ); + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice)); + CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice)); } -__host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - const int threadsperblock = 256; + const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs + // Größe des dynamischen Shared Memory Bereichs size_t shared_size = 0; x11_luffa512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); diff --git a/x11/cuda_x11_luffa512_Cubehash.cu b/x11/cuda_x11_luffa512_Cubehash.cu new file mode 100644 index 0000000000..cab0062443 --- /dev/null +++ b/x11/cuda_x11_luffa512_Cubehash.cu @@ -0,0 +1,833 @@ +/* + * Merged LUFFA512 64 + CUBE512 64 - from sp + */ + +#include "cuda_helper.h" + +#define MULT0(a) {\ + tmp = a[7]; \ + a[7] = a[6]; \ + a[6] = a[5]; \ + a[5] = a[4]; \ + a[4] = a[3] ^ tmp; \ + a[3] = a[2] ^ tmp; \ + a[2] = a[1]; \ + a[1] = a[0] ^ tmp; \ + a[0] = tmp; \ +} + +#define MULT2(a,j) { \ + tmp = a[(j<<3)+7]; \ + a[(j*8)+7] = a[(j*8)+6]; \ + a[(j*8)+6] = a[(j*8)+5]; \ + a[(j*8)+5] = a[(j*8)+4]; \ + a[(j*8)+4] = a[(j*8)+3] ^ tmp; \ + a[(j*8)+3] = a[(j*8)+2] ^ tmp; \ + a[(j*8)+2] = a[(j*8)+1]; \ + a[(j*8)+1] = a[(j*8)+0] ^ tmp; \ + a[j*8] = tmp; \ +} + +#define TWEAK(a0,a1,a2,a3,j) { \ + a0 = ROTL32(a0,j); \ + a1 = ROTL32(a1,j); \ + a2 = ROTL32(a2,j); \ + a3 = ROTL32(a3,j); \ +} + +#define STEP(c0,c1) { \ + SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp); \ + SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp); \ + MIXWORD(chainv[0],chainv[4]); \ + MIXWORD(chainv[1],chainv[5]); \ + MIXWORD(chainv[2],chainv[6]); \ + MIXWORD(chainv[3],chainv[7]); \ + ADD_CONSTANT(chainv[0],chainv[4],c0,c1); \ +} + +#define SUBCRUMB(a0,a1,a2,a3,a4) { \ + a4 = a0; \ + a0 |= a1; \ + a2 ^= a3; \ + a1 = ~a1;\ + a0 ^= a3; \ + a3 &= a4; \ + a1 ^= a3; \ + a3 ^= a2; \ + a2 &= a0; \ + a0 = ~a0;\ + a2 ^= a1; \ + a1 |= a3; \ + a4 ^= a1; \ + a3 ^= a2; \ + a2 &= a1; \ + a1 ^= a0; \ + a0 = a4; \ +} + +#define MIXWORD(a0,a4) { \ + a4 ^= a0; \ + a0 = ROTL32(a0,2); \ + a0 ^= a4; \ + a4 = ROTL32(a4,14); \ + a4 ^= a0; \ + a0 = ROTL32(a0,10); \ + a0 ^= a4; \ + a4 = ROTL32(a4,1); \ +} + +#define ADD_CONSTANT(a0,b0,c0,c1) { \ + a0 ^= c0; \ + b0 ^= c1; \ +} + +__device__ __constant__ uint32_t c_CNS[80] = { + 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, + 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, + 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, + 0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d, + 0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4, + 0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28, + 0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b, + 
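// [editor's aside, not part of the patch] MULT0/MULT2 above implement Luffa's
// "multiply by 2" on one 256-bit lane (8 words): the words rotate up by one
// position and the old top word feeds back into words 1, 3 and 4. MULT0 is just
// MULT2 with j == 0, spelled without the index arithmetic. A host-side
// reference for unit tests; luffa_mult2_ref() is a hypothetical helper, not a
// symbol from this patch:
#include <stdint.h>
#include <string.h>

static void luffa_mult2_ref(uint32_t a[8])
{
	const uint32_t t = a[7];
	const uint32_t r[8] = {
		t,            /* a[0] */
		a[0] ^ t,     /* a[1] */
		a[1],         /* a[2] */
		a[2] ^ t,     /* a[3] */
		a[3] ^ t,     /* a[4] */
		a[4],         /* a[5] */
		a[5],         /* a[6] */
		a[6]          /* a[7] */
	};
	memcpy(a, r, sizeof(r));
}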
0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704, + 0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72, + 0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7, + 0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719, + 0xd9847356,0x36eda57f,0xa2c78434,0x703aace7, + 0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91, + 0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be, + 0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5, + 0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355, + 0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab, + 0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0, + 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, + 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31 +}; + +// Precalculated chaining values +__device__ __constant__ uint32_t c_IV[40] = { + 0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8, + 0x478f6633, 0x255a46ff, 0x581c37f7, 0x601c2e8e, + 0x266c5f9d, 0xc34715d8, 0x8900670e, 0x51a540be, + 0xe4ce69fb, 0x5089f4d4, 0x3cc0a506, 0x609bcb02, + 0xa4e3cd82, 0xd24fd6ca, 0xc0f196dc, 0xcf41eafe, + 0x0ff2e673, 0x303804f2, 0xa7b3cd48, 0x677addd4, + 0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6, + 0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428, + 0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3, + 0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529 +}; + +/***************************************************/ +__device__ __forceinline__ +static void rnd512(uint32_t *statebuffer, uint32_t *statechainv) +{ + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + int i,j; + + #pragma unroll + for(i=0;i<8;i++) { + t[i] = 0; + #pragma unroll 5 + for(j=0;j<5;j++) + t[i] ^= statechainv[i+8*j]; + } + + MULT0(t); + + #pragma unroll + for(j=0;j<5;j++) { + #pragma unroll + for(i=0;i<8;i++) + statechainv[i+8*j] ^= t[i]; + } + + #pragma unroll + for(j=0;j<5;j++) { + #pragma unroll + for(i=0;i<8;i++) + t[i+8*j] = statechainv[i+8*j]; + } + + MULT0(statechainv); + #pragma unroll 4 + for(j=1;j<5;j++) { + MULT2(statechainv, j); + } + + #pragma unroll + for(j=0;j<5;j++) { + #pragma unroll + for(i=0;i<8;i++) + statechainv[8*j+i] ^= t[8*((j+1)%5)+i]; + } + + #pragma unroll + for(j=0;j<5;j++) { + #pragma unroll + for(i=0;i<8;i++) + t[i+8*j] = statechainv[i+8*j]; + } + + MULT0(statechainv); + #pragma unroll 4 + for(j=1;j<5;j++) { + MULT2(statechainv, j); + } + + #pragma unroll + for(j=0;j<5;j++) { + #pragma unroll + for(i=0;i<8;i++) + statechainv[8*j+i] ^= t[8*((j+4)%5)+i]; + } + + #pragma unroll + for(j=0;j<5;j++) { + #pragma unroll 8 + for(i=0;i<8;i++) + statechainv[i+8*j] ^= statebuffer[i]; + MULT0(statebuffer); + } + + #pragma unroll + for(i=0;i<8;i++) { + chainv[i] = statechainv[i]; + } + + #pragma unroll 1 + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); + } + + #pragma unroll + for(i=0;i<8;i++) { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i+8]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); + + #pragma unroll 1 + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); + } + + #pragma unroll + for(i=0;i<8;i++) { + statechainv[i+8] = chainv[i]; + chainv[i] = statechainv[i+16]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); + + #pragma unroll 1 + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); + } + + #pragma unroll + for(i=0;i<8;i++) { + statechainv[i+16] = chainv[i]; + chainv[i] = statechainv[i+24]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); + + #pragma unroll 1 + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); + } + + #pragma unroll + for(i=0;i<8;i++) { + statechainv[i+24] = chainv[i]; + chainv[i] = statechainv[i+32]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); + + #pragma 
unroll 1 + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); + } + + #pragma unroll + for(i=0;i<8;i++) { + statechainv[i+32] = chainv[i]; + } +} + +__device__ __forceinline__ +static void rnd512_first(uint32_t state[40], uint32_t buffer[8]) +{ + uint32_t chainv[8]; + uint32_t tmp; + int i, j; + + for (j = 0; j<5; j++) { + state[8 * j] ^= buffer[0]; + + #pragma unroll 7 + for (i = 1; i<8; i++) + state[i + 8 * j] ^= buffer[i]; + MULT0(buffer); + } + + #pragma unroll + for (i = 0; i<8; i++) + chainv[i] = state[i]; + + #pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i] = chainv[i]; + chainv[i] = state[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + + #pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 8] = chainv[i]; + chainv[i] = state[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + + #pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 16] = chainv[i]; + chainv[i] = state[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + + #pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 24] = chainv[i]; + chainv[i] = state[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + + #pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) + state[i + 32] = chainv[i]; +} + +/***************************************************/ +__device__ __forceinline__ +static void rnd512_nullhash(uint32_t *state) +{ + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + int i, j; + + #pragma unroll + for (i = 0; i<8; i++) { + t[i] = state[i + 8 * 0]; + #pragma unroll 4 + for (j = 1; j<5; j++) + t[i] ^= state[i + 8 * j]; + } + + MULT0(t); + + #pragma unroll + for (j = 0; j<5; j++) { + #pragma unroll + for (i = 0; i<8; i++) + state[i + 8 * j] ^= t[i]; + } + + #pragma unroll + for (j = 0; j<5; j++) { + #pragma unroll + for (i = 0; i<8; i++) + t[i + 8 * j] = state[i + 8 * j]; + } + + MULT0(state); + #pragma unroll 4 + for(j=1; j<5; j++) { + MULT2(state, j); + } + + #pragma unroll + for (j = 0; j<5; j++) { + #pragma unroll + for (i = 0; i<8; i++) + state[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + + #pragma unroll + for (j = 0; j<5; j++) { + #pragma unroll 8 + for (i = 0; i<8; i++) + t[i + 8 * j] = state[i + 8 * j]; + } + + MULT0(state); + #pragma unroll 4 + for(j=1; j<5; j++) { + MULT2(state, j); + } + + #pragma unroll + for (j = 0; j<5; j++) { + #pragma unroll + for (i = 0; i<8; i++) + state[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + + #pragma unroll + for (i = 0; i<8; i++) + chainv[i] = state[i]; + + #pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i] = chainv[i]; + chainv[i] = state[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + + #pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 8] = chainv[i]; + chainv[i] = state[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + + #pragma 
unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 16] = chainv[i]; + chainv[i] = state[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + + #pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 24] = chainv[i]; + chainv[i] = state[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + + #pragma unroll 1 + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 32] = chainv[i]; + } +} + +__device__ __forceinline__ +static void Update512(uint32_t *statebuffer, uint32_t *statechainv, const uint32_t *data) +{ + #pragma unroll + for (int i = 0; i < 8; i++) statebuffer[i] = cuda_swab32(data[i]); + rnd512_first(statechainv, statebuffer); + + #pragma unroll + for (int i = 0; i < 8; i++) statebuffer[i] = cuda_swab32(data[i + 8]); + rnd512(statebuffer, statechainv); +} + +/***************************************************/ +__device__ __forceinline__ +static void finalization512(uint32_t *statebuffer, uint32_t *statechainv, uint32_t *b) +{ + int i,j; + + statebuffer[0] = 0x80000000; + #pragma unroll 7 + for(int i=1;i<8;i++) statebuffer[i] = 0; + rnd512(statebuffer, statechainv); + + /*---- blank round with m=0 ----*/ + rnd512_nullhash(statechainv); + + #pragma unroll + for(i=0;i<8;i++) { + b[i] = statechainv[i]; + #pragma unroll 4 + for(j=1;j<5;j++) { + b[i] ^= statechainv[i+8*j]; + } + b[i] = cuda_swab32((b[i])); + } + + rnd512_nullhash(statechainv); + + #pragma unroll + for(i=0;i<8;i++) { + b[8 + i] = statechainv[i]; + #pragma unroll 4 + for(j=1;j<5;j++) { + b[8+i] ^= statechainv[i+8*j]; + } + b[8 + i] = cuda_swab32((b[8 + i])); + } +} + +#define ROUND_EVEN { \ + xg = (x0 + xg); \ + x0 = ROTL32(x0, 7); \ + xh = (x1 + xh); \ + x1 = ROTL32(x1, 7); \ + xi = (x2 + xi); \ + x2 = ROTL32(x2, 7); \ + xj = (x3 + xj); \ + x3 = ROTL32(x3, 7); \ + xk = (x4 + xk); \ + x4 = ROTL32(x4, 7); \ + xl = (x5 + xl); \ + x5 = ROTL32(x5, 7); \ + xm = (x6 + xm); \ + x6 = ROTL32(x6, 7); \ + xn = (x7 + xn); \ + x7 = ROTL32(x7, 7); \ + xo = (x8 + xo); \ + x8 = ROTL32(x8, 7); \ + xp = (x9 + xp); \ + x9 = ROTL32(x9, 7); \ + xq = (xa + xq); \ + xa = ROTL32(xa, 7); \ + xr = (xb + xr); \ + xb = ROTL32(xb, 7); \ + xs = (xc + xs); \ + xc = ROTL32(xc, 7); \ + xt = (xd + xt); \ + xd = ROTL32(xd, 7); \ + xu = (xe + xu); \ + xe = ROTL32(xe, 7); \ + xv = (xf + xv); \ + xf = ROTL32(xf, 7); \ + x8 ^= xg; \ + x9 ^= xh; \ + xa ^= xi; \ + xb ^= xj; \ + xc ^= xk; \ + xd ^= xl; \ + xe ^= xm; \ + xf ^= xn; \ + x0 ^= xo; \ + x1 ^= xp; \ + x2 ^= xq; \ + x3 ^= xr; \ + x4 ^= xs; \ + x5 ^= xt; \ + x6 ^= xu; \ + x7 ^= xv; \ + xi = (x8 + xi); \ + x8 = ROTL32(x8, 11); \ + xj = (x9 + xj); \ + x9 = ROTL32(x9, 11); \ + xg = (xa + xg); \ + xa = ROTL32(xa, 11); \ + xh = (xb + xh); \ + xb = ROTL32(xb, 11); \ + xm = (xc + xm); \ + xc = ROTL32(xc, 11); \ + xn = (xd + xn); \ + xd = ROTL32(xd, 11); \ + xk = (xe + xk); \ + xe = ROTL32(xe, 11); \ + xl = (xf + xl); \ + xf = ROTL32(xf, 11); \ + xq = (x0 + xq); \ + x0 = ROTL32(x0, 11); \ + xr = (x1 + xr); \ + x1 = ROTL32(x1, 11); \ + xo = (x2 + xo); \ + x2 = ROTL32(x2, 11); \ + xp = (x3 + xp); \ + x3 = ROTL32(x3, 11); \ + xu = (x4 + xu); \ + x4 = ROTL32(x4, 11); \ + xv = (x5 + xv); \ + x5 = ROTL32(x5, 11); \ + xs = (x6 + xs); \ + x6 = ROTL32(x6, 11); \ + xt = (x7 + xt); \ + x7 = ROTL32(x7, 11); 
\ + xc ^= xi; \ + xd ^= xj; \ + xe ^= xg; \ + xf ^= xh; \ + x8 ^= xm; \ + x9 ^= xn; \ + xa ^= xk; \ + xb ^= xl; \ + x4 ^= xq; \ + x5 ^= xr; \ + x6 ^= xo; \ + x7 ^= xp; \ + x0 ^= xu; \ + x1 ^= xv; \ + x2 ^= xs; \ + x3 ^= xt; \ +} + +#define ROUND_ODD { \ + xj = (xc + xj); \ + xc = ROTL32(xc, 7); \ + xi = (xd + xi); \ + xd = ROTL32(xd, 7); \ + xh = (xe + xh); \ + xe = ROTL32(xe, 7); \ + xg = (xf + xg); \ + xf = ROTL32(xf, 7); \ + xn = (x8 + xn); \ + x8 = ROTL32(x8, 7); \ + xm = (x9 + xm); \ + x9 = ROTL32(x9, 7); \ + xl = (xa + xl); \ + xa = ROTL32(xa, 7); \ + xk = (xb + xk); \ + xb = ROTL32(xb, 7); \ + xr = (x4 + xr); \ + x4 = ROTL32(x4, 7); \ + xq = (x5 + xq); \ + x5 = ROTL32(x5, 7); \ + xp = (x6 + xp); \ + x6 = ROTL32(x6, 7); \ + xo = (x7 + xo); \ + x7 = ROTL32(x7, 7); \ + xv = (x0 + xv); \ + x0 = ROTL32(x0, 7); \ + xu = (x1 + xu); \ + x1 = ROTL32(x1, 7); \ + xt = (x2 + xt); \ + x2 = ROTL32(x2, 7); \ + xs = (x3 + xs); \ + x3 = ROTL32(x3, 7); \ + x4 ^= xj; \ + x5 ^= xi; \ + x6 ^= xh; \ + x7 ^= xg; \ + x0 ^= xn; \ + x1 ^= xm; \ + x2 ^= xl; \ + x3 ^= xk; \ + xc ^= xr; \ + xd ^= xq; \ + xe ^= xp; \ + xf ^= xo; \ + x8 ^= xv; \ + x9 ^= xu; \ + xa ^= xt; \ + xb ^= xs; \ + xh = (x4 + xh); \ + x4 = ROTL32(x4, 11); \ + xg = (x5 + xg); \ + x5 = ROTL32(x5, 11); \ + xj = (x6 + xj); \ + x6 = ROTL32(x6, 11); \ + xi = (x7 + xi); \ + x7 = ROTL32(x7, 11); \ + xl = (x0 + xl); \ + x0 = ROTL32(x0, 11); \ + xk = (x1 + xk); \ + x1 = ROTL32(x1, 11); \ + xn = (x2 + xn); \ + x2 = ROTL32(x2, 11); \ + xm = (x3 + xm); \ + x3 = ROTL32(x3, 11); \ + xp = (xc + xp); \ + xc = ROTL32(xc, 11); \ + xo = (xd + xo); \ + xd = ROTL32(xd, 11); \ + xr = (xe + xr); \ + xe = ROTL32(xe, 11); \ + xq = (xf + xq); \ + xf = ROTL32(xf, 11); \ + xt = (x8 + xt); \ + x8 = ROTL32(x8, 11); \ + xs = (x9 + xs); \ + x9 = ROTL32(x9, 11); \ + xv = (xa + xv); \ + xa = ROTL32(xa, 11); \ + xu = (xb + xu); \ + xb = ROTL32(xb, 11); \ + x0 ^= xh; \ + x1 ^= xg; \ + x2 ^= xj; \ + x3 ^= xi; \ + x4 ^= xl; \ + x5 ^= xk; \ + x6 ^= xn; \ + x7 ^= xm; \ + x8 ^= xp; \ + x9 ^= xo; \ + xa ^= xr; \ + xb ^= xq; \ + xc ^= xt; \ + xd ^= xs; \ + xe ^= xv; \ + xf ^= xu; \ +} + +#define SIXTEEN_ROUNDS \ + for (int j = 0; j < 8; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + } + +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(256, 4) +#endif +void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t statechainv[40] = { + 0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8, + 0x478f6633, 0x255a46ff, 0x581c37f7, 0x601c2e8e, + 0x266c5f9d, 0xc34715d8, 0x8900670e, 0x51a540be, + 0xe4ce69fb, 0x5089f4d4, 0x3cc0a506, 0x609bcb02, + 0xa4e3cd82, 0xd24fd6ca, 0xc0f196dc, 0xcf41eafe, + 0x0ff2e673, 0x303804f2, 0xa7b3cd48, 0x677addd4, + 0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6, + 0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428, + 0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3, + 0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529 + }; + + uint32_t statebuffer[8]; + uint32_t *const Hash = &g_hash[thread * 16U]; + + Update512(statebuffer, statechainv, Hash); + finalization512(statebuffer, statechainv, Hash); + + //Cubehash + + uint32_t x0 = 0x2AEA2A61, x1 = 0x50F494D4, x2 = 0x2D538B8B, x3 = 0x4167D83E; + uint32_t x4 = 0x3FEE2313, x5 = 0xC701CF8C, x6 = 0xCC39968E, x7 = 0x50AC5695; + uint32_t x8 = 0x4D42C787, x9 = 0xA647A8B3, xa = 0x97CF0BEF, xb = 0x825B4537; + uint32_t xc = 0xEEF864D2, xd = 0xF22090C4, xe = 0xD0E5CD33, xf = 0xA23911AE; + uint32_t xg = 0xFCD398D9, xh = 
0x148FE485, xi = 0x1B017BEF, xj = 0xB6444532; + uint32_t xk = 0x6A536159, xl = 0x2FF5781C, xm = 0x91FA7934, xn = 0x0DBADEA9; + uint32_t xo = 0xD65C8A2B, xp = 0xA5A70E75, xq = 0xB1C62456, xr = 0xBC796576; + uint32_t xs = 0x1921C8F7, xt = 0xE7989AF1, xu = 0x7795D246, xv = 0xD43E3B44; + + x0 ^= Hash[0]; + x1 ^= Hash[1]; + x2 ^= Hash[2]; + x3 ^= Hash[3]; + x4 ^= Hash[4]; + x5 ^= Hash[5]; + x6 ^= Hash[6]; + x7 ^= Hash[7]; + + SIXTEEN_ROUNDS; + + x0 ^= Hash[8]; + x1 ^= Hash[9]; + x2 ^= Hash[10]; + x3 ^= Hash[11]; + x4 ^= Hash[12]; + x5 ^= Hash[13]; + x6 ^= Hash[14]; + x7 ^= Hash[15]; + + SIXTEEN_ROUNDS; + x0 ^= 0x80; + + SIXTEEN_ROUNDS; + xv ^= 1; + + for (int i = 3; i < 13; i++) { + SIXTEEN_ROUNDS; + } + + Hash[0] = x0; + Hash[1] = x1; + Hash[2] = x2; + Hash[3] = x3; + Hash[4] = x4; + Hash[5] = x5; + Hash[6] = x6; + Hash[7] = x7; + Hash[8] = x8; + Hash[9] = x9; + Hash[10] = xa; + Hash[11] = xb; + Hash[12] = xc; + Hash[13] = xd; + Hash[14] = xe; + Hash[15] = xf; + } +} + +__host__ +void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x11_luffaCubehash512_gpu_hash_64 <<>> (threads, d_hash); + MyStreamSynchronize(NULL, order, thr_id); +} + +// Setup +__host__ +void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads) {} diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index b3fd9258ed..cfebf0d8ee 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -1,29 +1,17 @@ -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +#include // memcpy() -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; +#include "cuda_helper.h" -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +extern __device__ __device_builtin__ void __threadfence_block(void); -#define SPH_C64(x) ((uint64_t)(x ## ULL)) -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#define TPB 128 -static __constant__ uint32_t d_ShaviteInitVector[16]; -static const uint32_t h_ShaviteInitVector[] = { - SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), - SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), - SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), - SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) -}; +__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) -#include "cuda_x11_aes.cu" +#include "cuda_x11_aes.cuh" -static __device__ __forceinline__ void AES_ROUND_NOKEY( +__device__ __forceinline__ +static void AES_ROUND_NOKEY( const uint32_t* __restrict__ sharedMemory, uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3) { @@ -38,7 +26,8 @@ static __device__ __forceinline__ void AES_ROUND_NOKEY( x3 = y3; } -static __device__ __forceinline__ void KEY_EXPAND_ELT( +__device__ __forceinline__ +static void KEY_EXPAND_ELT( const uint32_t* __restrict__ sharedMemory, uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3) { @@ -53,8 +42,8 @@ static __device__ __forceinline__ void KEY_EXPAND_ELT( k3 = y0; } -static __device__ void -c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) +__device__ __forceinline__ +static void c512(const uint32_t* 
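// [editor's aside, not part of the patch] The merged kernel keeps the Luffa
// output in registers and feeds it straight into CubeHash, saving one write and
// one read of d_hash between the two algorithms. In a scan loop it is meant to
// replace the two separate per-algo calls, roughly as follows (a sketch only:
// the scanhash loop, throughput, d_hash and the separate cubehash wrapper are
// assumptions, not shown in this hunk):
//
//   /* before: two kernels, state bounced through global memory in between
//   x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
//   x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
//   */
//
//   /* after: one fused kernel, the intermediate hash never leaves the GPU core */
//   x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);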
sharedMemory, uint32_t *state, uint32_t *msg, const uint32_t count) { uint32_t p0, p1, p2, p3, p4, p5, p6, p7; uint32_t p8, p9, pA, pB, pC, pD, pE, pF; @@ -63,7 +52,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) uint32_t rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; uint32_t rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; uint32_t rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; - const uint32_t counter = 512; + const uint32_t counter = count; p0 = state[0x0]; p1 = state[0x1]; @@ -81,82 +70,114 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) pD = state[0xD]; pE = state[0xE]; pF = state[0xF]; + /* round 0 */ rk00 = msg[0]; - x0 = p4 ^ rk00; + x0 = p4 ^ msg[0]; rk01 = msg[1]; - x1 = p5 ^ rk01; + x1 = p5 ^ msg[1]; rk02 = msg[2]; - x2 = p6 ^ rk02; + x2 = p6 ^ msg[2]; rk03 = msg[3]; - x3 = p7 ^ rk03; + x3 = p7 ^ msg[3]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk04 = msg[4]; - x0 ^= rk04; + x0 ^= msg[4]; rk05 = msg[5]; - x1 ^= rk05; + x1 ^= msg[5]; rk06 = msg[6]; - x2 ^= rk06; + x2 ^= msg[6]; rk07 = msg[7]; - x3 ^= rk07; + x3 ^= msg[7]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk08 = msg[8]; - x0 ^= rk08; + x0 ^= msg[8]; rk09 = msg[9]; - x1 ^= rk09; + x1 ^= msg[9]; rk0A = msg[10]; - x2 ^= rk0A; + x2 ^= msg[10]; rk0B = msg[11]; - x3 ^= rk0B; + x3 ^= msg[11]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); rk0C = msg[12]; - x0 ^= rk0C; + x0 ^= msg[12]; rk0D = msg[13]; - x1 ^= rk0D; + x1 ^= msg[13]; rk0E = msg[14]; - x2 ^= rk0E; + x2 ^= msg[14]; rk0F = msg[15]; - x3 ^= rk0F; + x3 ^= msg[15]; AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); p0 ^= x0; p1 ^= x1; p2 ^= x2; p3 ^= x3; - rk10 = msg[16]; - x0 = pC ^ rk10; - rk11 = msg[17]; - x1 = pD ^ rk11; - rk12 = msg[18]; - x2 = pE ^ rk12; - rk13 = msg[19]; - x3 = pF ^ rk13; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk14 = msg[20]; - x0 ^= rk14; - rk15 = msg[21]; - x1 ^= rk15; - rk16 = msg[22]; - x2 ^= rk16; - rk17 = msg[23]; - x3 ^= rk17; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk18 = msg[24]; - x0 ^= rk18; - rk19 = msg[25]; - x1 ^= rk19; - rk1A = msg[26]; - x2 ^= rk1A; - rk1B = msg[27]; - x3 ^= rk1B; - AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); - rk1C = msg[28]; - x0 ^= rk1C; - rk1D = msg[29]; - x1 ^= rk1D; - rk1E = msg[30]; - x2 ^= rk1E; - rk1F = msg[31]; - x3 ^= rk1F; + if (count == 512) + { + rk10 = 0x80U; + x0 = pC ^ 0x80U; + rk11 = 0; + x1 = pD; + rk12 = 0; + x2 = pE; + rk13 = 0; + x3 = pF; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 = 0; + rk15 = 0; + rk16 = 0; + rk17 = 0; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 = 0; + rk19 = 0; + rk1A = 0; + rk1B = 0x02000000U; + x3 ^= 0x02000000U; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C = 0; + rk1D = 0; + rk1E = 0; + rk1F = 0x02000000; + x3 ^= 0x02000000; + } + else + { + rk10 = msg[16]; + x0 = pC ^ msg[16]; + rk11 = msg[17]; + x1 = pD ^ msg[17]; + rk12 = msg[18]; + x2 = pE ^ msg[18]; + rk13 = msg[19]; + x3 = pF ^ msg[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 = msg[20]; + x0 ^= msg[20]; + rk15 = msg[21]; + x1 ^= msg[21]; + rk16 = msg[22]; + x2 ^= msg[22]; + rk17 = msg[23]; + x3 ^= msg[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 = msg[24]; + x0 ^= msg[24]; + rk19 = msg[25]; + x1 ^= msg[25]; + rk1A = msg[26]; + x2 ^= msg[26]; + rk1B = msg[27]; + x3 ^= msg[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C = msg[28]; + x0 ^= msg[28]; + rk1D = msg[29]; + x1 ^= msg[29]; + rk1E = msg[30]; + x2 ^= msg[30]; + rk1F = msg[31]; + x3 ^= msg[31]; + } 
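// [editor's aside, not part of the patch] For the 64-byte input used in the
// x11 chain the upper half of the message block is pure padding, which is why
// the count == 512 branch above hard-codes rk10..rk1F instead of reading
// msg[16..31]. A host-side builder producing the same 32-word block, useful for
// cross-checking against sph_shavite512; build_shavite64_block() is a
// hypothetical helper, not a symbol from this patch:
#include <stdint.h>
#include <string.h>

static void build_shavite64_block(uint32_t msg[32], const uint32_t hash[16])
{
	memcpy(msg, hash, 64);           /* the 64-byte input fills msg[0..15] */
	msg[16] = 0x80;                  /* padding marker */
	memset(&msg[17], 0, 10 * sizeof(uint32_t));
	msg[27] = 0x02000000;            /* same constant the branch feeds into rk1B */
	msg[28] = msg[29] = msg[30] = 0;
	msg[31] = 0x02000000;            /* same constant the branch feeds into rk1F */
}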
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); p8 ^= x0; p9 ^= x1; @@ -254,7 +275,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) p5 ^= x1; p6 ^= x2; p7 ^= x3; - + rk00 ^= rk19; x0 = pC ^ rk00; rk01 ^= rk1A; @@ -335,6 +356,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) p1 ^= x1; p2 ^= x2; p3 ^= x3; + /* round 3, 7, 11 */ KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); rk00 ^= rk1C; @@ -424,6 +446,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) pD ^= x1; pE ^= x2; pF ^= x3; + /* round 4, 8, 12 */ rk00 ^= rk19; x0 = p4 ^ rk00; @@ -521,7 +544,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) rk04 ^= rk00; rk05 ^= rk01; rk06 ^= rk02; - rk07 ^= rk03; + rk07 ^= rk03; rk07 ^= SPH_T32(~counter); x0 ^= rk04; x1 ^= rk05; @@ -596,7 +619,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) p5 ^= x1; p6 ^= x2; p7 ^= x3; - + rk00 ^= rk19; x0 = pC ^ rk00; rk01 ^= rk1A; @@ -677,6 +700,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) p1 ^= x1; p2 ^= x2; p3 ^= x3; + /* round 3, 7, 11 */ KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); rk00 ^= rk1C; @@ -766,6 +790,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) pD ^= x1; pE ^= x2; pF ^= x3; + /* round 4, 8, 12 */ rk00 ^= rk19; x0 = p4 ^ rk00; @@ -939,7 +964,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) p5 ^= x1; p6 ^= x2; p7 ^= x3; - + rk00 ^= rk19; x0 = pC ^ rk00; rk01 ^= rk1A; @@ -1020,6 +1045,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) p1 ^= x1; p2 ^= x2; p3 ^= x3; + /* round 3, 7, 11 */ KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); rk00 ^= rk1C; @@ -1298,40 +1324,59 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg) state[0xF] ^= p7; } +__device__ __forceinline__ +void shavite_gpu_init(uint32_t *sharedMemory) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 128) { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; + sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; -// Die Hash-Funktion -__global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) + sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; + } +} + +// GPU Hash +__global__ __launch_bounds__(TPB, 7) /* 64 registers with 128,8 - 72 regs with 128,7 */ +void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { __shared__ uint32_t sharedMemory[1024]; - aes_gpu_init(sharedMemory); + shavite_gpu_init(sharedMemory); + __threadfence_block(); - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; // kopiere init-state - uint32_t state[16]; - -#pragma unroll 16 - for(int i=0;i<16;i++) - state[i] = d_ShaviteInitVector[i]; + uint32_t state[16] = { + SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), + SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), + SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), + SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) + }; // nachricht laden uint32_t msg[32]; - // fülle die Nachricht mit 64-byte (vorheriger Hash) -#pragma unroll 16 + // fülle die Nachricht mit 64-byte (vorheriger Hash) + #pragma unroll 16 for(int i=0;i<16;i++) - msg[i] = Hash[i]; + msg[i] = Hash[i]; // Nachrichtenende msg[16] = 0x80; -#pragma unroll 10 + #pragma unroll 10 for(int i=17;i<27;i++) msg[i] = 0; @@ -1341,38 +1386,95 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui msg[30] = 0; msg[31] = 0x02000000; - c512(sharedMemory, state, msg); + c512(sharedMemory, state, msg, 512); -#pragma unroll 16 + #pragma unroll 16 for(int i=0;i<16;i++) Hash[i] = state[i]; - } + } } +__global__ __launch_bounds__(TPB, 7) +void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) +{ + __shared__ uint32_t sharedMemory[1024]; + + shavite_gpu_init(sharedMemory); + __threadfence_block(); -// Setup-Funktionen -__host__ void x11_shavite512_cpu_init(int thr_id, int threads) + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nounce = startNounce + thread; + + // initial state + uint32_t state[16] = { + SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), + SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), + SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), + SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) + }; + + uint32_t msg[32]; + + #pragma unroll 32 + for(int i=0;i<32;i++) { + msg[i] = c_PaddedMessage80[i]; + } + msg[19] = cuda_swab32(nounce); + msg[20] = 0x80; + msg[27] = 0x2800000; + msg[31] = 0x2000000; + + c512(sharedMemory, state, msg, 640); + + uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; + + #pragma unroll 16 + for(int i=0;i<16;i++) + outHash[i] = state[i]; + + } //thread < threads +} + +__host__ +void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - aes_cpu_init(); + const uint32_t threadsperblock = TPB; - cudaMemcpyToSymbol( d_ShaviteInitVector, - h_ShaviteInitVector, - sizeof(h_ShaviteInitVector), - 0, cudaMemcpyHostToDevice); + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + // note: 128 threads minimum are required to init the shared memory array + x11_shavite512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + //MyStreamSynchronize(NULL, order, thr_id); } -__host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, 
uint32_t *d_outputHash, int order) { - const int threadsperblock = 256; + const uint32_t threadsperblock = TPB; - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + x11_shavite512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); +} - x11_shavite512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); +__host__ +void x11_shavite512_cpu_init(int thr_id, uint32_t threads) +{ + aes_cpu_init(thr_id); } +__host__ +void x11_shavite512_setBlock_80(void *pdata) +{ + // Message with Padding + // The nonce is at Byte 76. + unsigned char PaddedMessage[128]; + memcpy(PaddedMessage, pdata, 80); + memset(PaddedMessage+80, 0, 48); + + cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 32*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); +} diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 801910f1e7..5495edadf4 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -1,88 +1,102 @@ -// Parallelisierung: -// -// FFT_8 wird 2 mal 8-fach parallel ausgeführt (in FFT_64) -// und 1 mal 16-fach parallel (in FFT_128_full) -// -// STEP8_IF und STEP8_MAJ beinhalten je zwei 8-fach parallele Operationen - -#define TPB 256 +/*************************************************************************************************** + * SIMD512 SM3+ CUDA IMPLEMENTATION (require cuda_x11_simd512_func.cuh) + */ -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +#include "miner.h" +#include "cuda_helper.h" -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#define TPB 128 -int *d_state[8]; -uint4 *d_temp4[8]; +uint32_t *d_state[MAX_GPUS]; +uint4 *d_temp4[MAX_GPUS]; // texture bound to d_temp4[thr_id], for read access in Compaction kernel texture texRef1D_128; -#define C32(x) ((uint32_t)(x ## U)) -#define T32(x) ((x) & C32(0xFFFFFFFF)) +#define DEVICE_DIRECT_CONSTANTS -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) - #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ uint8_t c_perm[8][8] = { #else - // Kepler (Compute 3.5) - #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) +__constant__ uint8_t c_perm[8][8]; +const uint8_t h_perm[8][8] = { #endif + { 2, 3, 6, 7, 0, 1, 4, 5 }, + { 6, 7, 2, 3, 4, 5, 0, 1 }, + { 7, 6, 5, 4, 3, 2, 1, 0 }, + { 1, 0, 3, 2, 5, 4, 7, 6 }, + { 0, 1, 4, 5, 6, 7, 2, 3 }, + { 6, 7, 2, 3, 0, 1, 4, 5 }, + { 6, 7, 0, 1, 4, 5, 2, 3 }, + { 4, 5, 2, 3, 6, 7, 0, 1 } +}; -__constant__ uint32_t c_IV_512[32]; +/* used in cuda_x11_simd512_func.cuh (SIMD_Compress2) */ +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ uint32_t c_IV_512[32] = { +#else +__constant__ uint32_t c_IV_512[32]; const uint32_t h_IV_512[32] = { - 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, - 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, - 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, - 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 +#endif + 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, + 0xac506643, 0xa90635a5, 
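// [editor's aside, not part of the patch] Expected call order for the new
// 80-byte path, pieced together from the host functions above (the scanhash
// loop, endiandata, throughput and d_hash are assumptions, not shown here):
//
//   x11_shavite512_cpu_init(thr_id, throughput);      /* once per GPU thread: AES tables */
//   x11_shavite512_setBlock_80((void*)endiandata);    /* once per work item: header -> c_PaddedMessage80 */
//   do {
//       x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
//       /* ... rest of the chain, result check, pdata[19] += throughput ... */
//   } while (!work_done);
//
// Only the nonce (msg[19]) varies per GPU thread inside the kernel, so the
// header upload does not need to be repeated per call.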
0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, + 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, + 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 +}; + +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ short c_FFT128_8_16_Twiddle[128] = { +#else +__constant__ short c_FFT128_8_16_Twiddle[128]; +static const short h_FFT128_8_16_Twiddle[128] = { +#endif + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, + 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, + 1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2,-123, 17,-111, + 1,-118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123,-122, + 1, 116, 92,-122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, + 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, + 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 }; - __constant__ int c_FFT128_8_16_Twiddle[128]; - static const int h_FFT128_8_16_Twiddle[128] = { -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, -1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, -1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2, -123, 17, -111, -1, -118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123, -122, -1, 116, 92, -122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, -1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, -1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79}; - - -__constant__ int c_FFT256_2_128_Twiddle[128]; -static const int h_FFT256_2_128_Twiddle[128] = { - 1, 41, -118, 45, 46, 87, -31, 14, - 60, -110, 116, -127, -67, 80, -61, 69, - 2, 82, 21, 90, 92, -83, -62, 28, - 120, 37, -25, 3, 123, -97, -122, -119, - 4, -93, 42, -77, -73, 91, -124, 56, - -17, 74, -50, 6, -11, 63, 13, 19, - 8, 71, 84, 103, 111, -75, 9, 112, - -34, -109, -100, 12, -22, 126, 26, 38, - 16, -115, -89, -51, -35, 107, 18, -33, - -68, 39, 57, 24, -44, -5, 52, 76, - 32, 27, 79, -102, -70, -43, 36, -66, - 121, 78, 114, 48, -88, -10, 104, -105, - 64, 54, -99, 53, 117, -86, 72, 125, - -15, -101, -29, 96, 81, -20, -49, 47, - 128, 108, 59, 106, -23, 85, -113, -7, - -30, 55, -58, -65, -95, -40, -98, 94}; - -#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \ - ((x)%7) == 1 ? 6 : \ - ((x)%7) == 2 ? 2 : \ - ((x)%7) == 3 ? 3 : \ - ((x)%7) == 4 ? 5 : \ - ((x)%7) == 5 ? 
7 : \ - 4 ) +#ifdef DEVICE_DIRECT_CONSTANTS +__constant__ short c_FFT256_2_128_Twiddle[128] = { +#else +__constant__ short c_FFT256_2_128_Twiddle[128]; +static const short h_FFT256_2_128_Twiddle[128] = { +#endif + 1, 41,-118, 45, 46, 87, -31, 14, + 60,-110, 116,-127, -67, 80, -61, 69, + 2, 82, 21, 90, 92, -83, -62, 28, + 120, 37, -25, 3, 123, -97,-122,-119, + 4, -93, 42, -77, -73, 91,-124, 56, + -17, 74, -50, 6, -11, 63, 13, 19, + 8, 71, 84, 103, 111, -75, 9, 112, + -34,-109,-100, 12, -22, 126, 26, 38, + 16,-115, -89, -51, -35, 107, 18, -33, + -68, 39, 57, 24, -44, -5, 52, 76, + 32, 27, 79,-102, -70, -43, 36, -66, + 121, 78, 114, 48, -88, -10, 104,-105, + 64, 54, -99, 53, 117, -86, 72, 125, + -15,-101, -29, 96, 81, -20, -49, 47, + 128, 108, 59, 106, -23, 85,-113, -7, + -30, 55, -58, -65, -95, -40, -98, 94 +}; /************* the round function ****************/ +#define IF(x, y, z) (((y ^ z) & x) ^ z) +#define MAJ(x, y, z) ((z &y) | ((z|y) & x)) -#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#include "cuda_x11_simd512_sm2.cuh" +#include "cuda_x11_simd512_func.cuh" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif -#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) -#include "x11/simd_functions.cu" +#if __CUDA_ARCH__ >= 300 /********************* Message expansion ************************/ @@ -90,76 +104,84 @@ static const int h_FFT256_2_128_Twiddle[128] = { * Reduce modulo 257; result is in [-127; 383] * REDUCE(x) := (x&255) - (x>>8) */ -#define REDUCE(x) (((x)&255) - ((x)>>8)) +#define REDUCE(x) \ + (((x)&255) - ((x)>>8)) /* * Reduce from [-127; 383] to [-128; 128] * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 */ #define EXTRA_REDUCE_S(x) \ -((x)<=128 ? (x) : (x)-257) - + ((x)<=128 ? (x) : (x)-257) + /* * Reduce modulo 257; result is in [-128; 128] */ #define REDUCE_FULL_S(x) \ -EXTRA_REDUCE_S(REDUCE(x)) - -__device__ __forceinline__ void FFT_8(int *y, int stripe) { + EXTRA_REDUCE_S(REDUCE(x)) - /* - * FFT_8 using w=4 as 8th root of unity - * Unrolled decimation in frequency (DIF) radix-2 NTT. - * Output data is in revbin_permuted order. - */ +// Parallelization: +// +// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64) +// and 1 time 16-fach parallel (in FFT_128_full) +// +// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations +/** + * FFT_8 using w=4 as 8th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. 
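// [editor's aside, not part of the patch] REDUCE works because 256 == -1 (mod 257),
// so x = 256*(x>>8) + (x&255) == (x&255) - (x>>8) (mod 257). A small host check
// of both macros over the documented input range; mod257(), reduce_ref() and
// reduce_selftest() are hypothetical helpers, not symbols from this patch:
#include <assert.h>

static inline int mod257(int x)            /* true remainder in [0,256] */
{
	int r = x % 257;
	return r < 0 ? r + 257 : r;
}

static inline int reduce_ref(int x)        /* REDUCE(x) */
{
	return (x & 255) - (x >> 8);
}

static void reduce_selftest(void)
{
	for (int x = 0; x <= 32767; x++) {
		int r = reduce_ref(x);                       /* partial reduction, in [-127, 255] here */
		assert(mod257(r) == mod257(x));
		int s = (r <= 128) ? r : r - 257;            /* EXTRA_REDUCE_S */
		assert(s >= -128 && s <= 128 && mod257(s) == mod257(x));
	}
}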
+ */ +__device__ __forceinline__ +void FFT_8(int *y, int stripe) +{ #define X(i) y[stripe*i] #define DO_REDUCE(i) \ -X(i) = REDUCE(X(i)) + X(i) = REDUCE(X(i)) #define DO_REDUCE_FULL_S(i) \ do { \ -X(i) = REDUCE(X(i)); \ -X(i) = EXTRA_REDUCE_S(X(i)); \ + X(i) = REDUCE(X(i)); \ + X(i) = EXTRA_REDUCE_S(X(i)); \ } while(0) #define BUTTERFLY(i,j,n) \ do { \ -int u= X(i); \ -int v= X(j); \ -X(i) = u+v; \ -X(j) = (u-v) << (2*n); \ + int u= X(i); \ + int v= X(j); \ + X(i) = u+v; \ + X(j) = (u-v) << (2*n); \ } while(0) - BUTTERFLY(0, 4, 0); - BUTTERFLY(1, 5, 1); - BUTTERFLY(2, 6, 2); - BUTTERFLY(3, 7, 3); - - DO_REDUCE(6); - DO_REDUCE(7); - - BUTTERFLY(0, 2, 0); - BUTTERFLY(4, 6, 0); - BUTTERFLY(1, 3, 2); - BUTTERFLY(5, 7, 2); - - DO_REDUCE(7); - - BUTTERFLY(0, 1, 0); - BUTTERFLY(2, 3, 0); - BUTTERFLY(4, 5, 0); - BUTTERFLY(6, 7, 0); - - DO_REDUCE_FULL_S(0); - DO_REDUCE_FULL_S(1); - DO_REDUCE_FULL_S(2); - DO_REDUCE_FULL_S(3); - DO_REDUCE_FULL_S(4); - DO_REDUCE_FULL_S(5); - DO_REDUCE_FULL_S(6); - DO_REDUCE_FULL_S(7); + BUTTERFLY(0, 4, 0); + BUTTERFLY(1, 5, 1); + BUTTERFLY(2, 6, 2); + BUTTERFLY(3, 7, 3); + + DO_REDUCE(6); + DO_REDUCE(7); + + BUTTERFLY(0, 2, 0); + BUTTERFLY(4, 6, 0); + BUTTERFLY(1, 3, 2); + BUTTERFLY(5, 7, 2); + + DO_REDUCE(7); + + BUTTERFLY(0, 1, 0); + BUTTERFLY(2, 3, 0); + BUTTERFLY(4, 5, 0); + BUTTERFLY(6, 7, 0); + + DO_REDUCE_FULL_S(0); + DO_REDUCE_FULL_S(1); + DO_REDUCE_FULL_S(2); + DO_REDUCE_FULL_S(3); + DO_REDUCE_FULL_S(4); + DO_REDUCE_FULL_S(5); + DO_REDUCE_FULL_S(6); + DO_REDUCE_FULL_S(7); #undef X #undef DO_REDUCE @@ -167,261 +189,263 @@ X(j) = (u-v) << (2*n); \ #undef BUTTERFLY } -__device__ __forceinline__ void FFT_16(int *y) { - - /* - * FFT_16 using w=2 as 16th root of unity - * Unrolled decimation in frequency (DIF) radix-2 NTT. - * Output data is in revbin_permuted order. - */ +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ < 300 + #define __shfl(var, srcLane, width) (uint32_t)(var) + // #error __shfl() not supported by SM 2.x +#endif +#endif +/** + * FFT_16 using w=2 as 16th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. + */ +__device__ __forceinline__ +void FFT_16(int *y) +{ #define DO_REDUCE_FULL_S(i) \ -do { \ -y[i] = REDUCE(y[i]); \ -y[i] = EXTRA_REDUCE_S(y[i]); \ -} while(0) - - int u,v; - - // BUTTERFLY(0, 8, 0); - // BUTTERFLY(1, 9, 1); - // BUTTERFLY(2, 10, 2); - // BUTTERFLY(3, 11, 3); - // BUTTERFLY(4, 12, 4); - // BUTTERFLY(5, 13, 5); - // BUTTERFLY(6, 14, 6); - // BUTTERFLY(7, 15, 7); - { - u= y[0]; // 0..7 - v= y[1]; // 8..15 - y[0] = u+v; - y[1] = (u-v) << (threadIdx.x&7); - } - - // DO_REDUCE(11); - // DO_REDUCE(12); - // DO_REDUCE(13); - // DO_REDUCE(14); - // DO_REDUCE(15); - if ((threadIdx.x&7) >=3) y[1] = REDUCE(y[1]); // 11...15 - - // BUTTERFLY( 0, 4, 0); - // BUTTERFLY( 1, 5, 2); - // BUTTERFLY( 2, 6, 4); - // BUTTERFLY( 3, 7, 6); - { - u= __shfl((int)y[0], (threadIdx.x&3),8); // 0,1,2,3 0,1,2,3 - v= __shfl((int)y[0],4+(threadIdx.x&3),8); // 4,5,6,7 4,5,6,7 - y[0] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); - } - - // BUTTERFLY( 8, 12, 0); - // BUTTERFLY( 9, 13, 2); - // BUTTERFLY(10, 14, 4); - // BUTTERFLY(11, 15, 6); - { - u= __shfl((int)y[1], (threadIdx.x&3),8); // 8,9,10,11 8,9,10,11 - v= __shfl((int)y[1],4+(threadIdx.x&3),8); // 12,13,14,15 12,13,14,15 - y[1] = ((threadIdx.x&7) < 4) ? 
(u+v) : ((u-v) << (2*(threadIdx.x&3))); - } - - // DO_REDUCE(5); - // DO_REDUCE(7); - // DO_REDUCE(13); - // DO_REDUCE(15); - if ((threadIdx.x&1) && (threadIdx.x&7) >= 4) { - y[0] = REDUCE(y[0]); // 5, 7 - y[1] = REDUCE(y[1]); // 13, 15 - } - - // BUTTERFLY( 0, 2, 0); - // BUTTERFLY( 1, 3, 4); - // BUTTERFLY( 4, 6, 0); - // BUTTERFLY( 5, 7, 4); - { - u= __shfl((int)y[0], (threadIdx.x&5),8); // 0,1,0,1 4,5,4,5 - v= __shfl((int)y[0],2+(threadIdx.x&5),8); // 2,3,2,3 6,7,6,7 - y[0] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); - } - - // BUTTERFLY( 8, 10, 0); - // BUTTERFLY( 9, 11, 4); - // BUTTERFLY(12, 14, 0); - // BUTTERFLY(13, 15, 4); - { - u= __shfl((int)y[1], (threadIdx.x&5),8); // 8,9,8,9 12,13,12,13 - v= __shfl((int)y[1],2+(threadIdx.x&5),8); // 10,11,10,11 14,15,14,15 - y[1] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); - } - - // BUTTERFLY( 0, 1, 0); - // BUTTERFLY( 2, 3, 0); - // BUTTERFLY( 4, 5, 0); - // BUTTERFLY( 6, 7, 0); - { - u= __shfl((int)y[0], (threadIdx.x&6),8); // 0,0,2,2 4,4,6,6 - v= __shfl((int)y[0],1+(threadIdx.x&6),8); // 1,1,3,3 5,5,7,7 - y[0] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); - } - - // BUTTERFLY( 8, 9, 0); - // BUTTERFLY(10, 11, 0); - // BUTTERFLY(12, 13, 0); - // BUTTERFLY(14, 15, 0); - { - u= __shfl((int)y[1], (threadIdx.x&6),8); // 8,8,10,10 12,12,14,14 - v= __shfl((int)y[1],1+(threadIdx.x&6),8); // 9,9,11,11 13,13,15,15 - y[1] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); - } - - DO_REDUCE_FULL_S( 0); // 0...7 - DO_REDUCE_FULL_S( 1); // 8...15 + do { \ + y[i] = REDUCE(y[i]); \ + y[i] = EXTRA_REDUCE_S(y[i]); \ + } while(0) + + int u,v; + + // BUTTERFLY(0, 8, 0); + // BUTTERFLY(1, 9, 1); + // BUTTERFLY(2, 10, 2); + // BUTTERFLY(3, 11, 3); + // BUTTERFLY(4, 12, 4); + // BUTTERFLY(5, 13, 5); + // BUTTERFLY(6, 14, 6); + // BUTTERFLY(7, 15, 7); + { + u = y[0]; // 0..7 + v = y[1]; // 8..15 + y[0] = u+v; + y[1] = (u-v) << (threadIdx.x&7); + } + + // DO_REDUCE(11); + // DO_REDUCE(12); + // DO_REDUCE(13); + // DO_REDUCE(14); + // DO_REDUCE(15); + if ((threadIdx.x&7) >=3) y[1] = REDUCE(y[1]); // 11...15 + + // BUTTERFLY( 0, 4, 0); + // BUTTERFLY( 1, 5, 2); + // BUTTERFLY( 2, 6, 4); + // BUTTERFLY( 3, 7, 6); + { + u = __shfl((int)y[0], (threadIdx.x&3),8); // 0,1,2,3 0,1,2,3 + v = __shfl((int)y[0],4+(threadIdx.x&3),8); // 4,5,6,7 4,5,6,7 + y[0] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); + } + + // BUTTERFLY( 8, 12, 0); + // BUTTERFLY( 9, 13, 2); + // BUTTERFLY(10, 14, 4); + // BUTTERFLY(11, 15, 6); + { + u = __shfl((int)y[1], (threadIdx.x&3),8); // 8,9,10,11 8,9,10,11 + v = __shfl((int)y[1],4+(threadIdx.x&3),8); // 12,13,14,15 12,13,14,15 + y[1] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); + } + + // DO_REDUCE(5); + // DO_REDUCE(7); + // DO_REDUCE(13); + // DO_REDUCE(15); + if ((threadIdx.x&1) && (threadIdx.x&7) >= 4) { + y[0] = REDUCE(y[0]); // 5, 7 + y[1] = REDUCE(y[1]); // 13, 15 + } + + // BUTTERFLY( 0, 2, 0); + // BUTTERFLY( 1, 3, 4); + // BUTTERFLY( 4, 6, 0); + // BUTTERFLY( 5, 7, 4); + { + u = __shfl((int)y[0], (threadIdx.x&5),8); // 0,1,0,1 4,5,4,5 + v = __shfl((int)y[0],2+(threadIdx.x&5),8); // 2,3,2,3 6,7,6,7 + y[0] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); + } + + // BUTTERFLY( 8, 10, 0); + // BUTTERFLY( 9, 11, 4); + // BUTTERFLY(12, 14, 0); + // BUTTERFLY(13, 15, 4); + { + u = __shfl((int)y[1], (threadIdx.x&5),8); // 8,9,8,9 12,13,12,13 + v = __shfl((int)y[1],2+(threadIdx.x&5),8); // 10,11,10,11 14,15,14,15 + y[1] = ((threadIdx.x&3) < 2) ? 
(u+v) : ((u-v) << (4*(threadIdx.x&1))); + } + + // BUTTERFLY( 0, 1, 0); + // BUTTERFLY( 2, 3, 0); + // BUTTERFLY( 4, 5, 0); + // BUTTERFLY( 6, 7, 0); + { + u = __shfl((int)y[0], (threadIdx.x&6),8); // 0,0,2,2 4,4,6,6 + v = __shfl((int)y[0],1+(threadIdx.x&6),8); // 1,1,3,3 5,5,7,7 + y[0] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); + } + + // BUTTERFLY( 8, 9, 0); + // BUTTERFLY(10, 11, 0); + // BUTTERFLY(12, 13, 0); + // BUTTERFLY(14, 15, 0); + { + u = __shfl((int)y[1], (threadIdx.x&6),8); // 8,8,10,10 12,12,14,14 + v = __shfl((int)y[1],1+(threadIdx.x&6),8); // 9,9,11,11 13,13,15,15 + y[1] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); + } + + DO_REDUCE_FULL_S( 0); // 0...7 + DO_REDUCE_FULL_S( 1); // 8...15 #undef DO_REDUCE_FULL_S } -__device__ __forceinline__ void FFT_128_full(int y[128]) { - int i; - - FFT_8(y+0,2); // eight parallel FFT8's - FFT_8(y+1,2); // eight parallel FFT8's +__device__ __forceinline__ +void FFT_128_full(int y[128]) +{ + int i; + + FFT_8(y+0,2); // eight parallel FFT8's + FFT_8(y+1,2); // eight parallel FFT8's #pragma unroll 16 - for (i=0; i<16; i++) - /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]); + for (i=0; i<16; i++) + /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]); #pragma unroll 8 - for (i=0; i<8; i++) - FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads + for (i=0; i<8; i++) + FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads } - -__device__ __forceinline__ void FFT_256_halfzero(int y[256]) { - - int i; - - /* - * FFT_256 using w=41 as 256th root of unity. - * Decimation in frequency (DIF) NTT. - * Output data is in revbin_permuted order. - * In place. - */ - const int tmp = y[15]; +__device__ __forceinline__ +void FFT_256_halfzero(int y[256]) +{ + /* + * FFT_256 using w=41 as 256th root of unity. + * Decimation in frequency (DIF) NTT. + * Output data is in revbin_permuted order. + * In place. 
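// [editor's aside, not part of the patch] FFT_16 above holds 16 coefficients in
// 8 lanes (two per thread) and pairs lanes with __shfl(var, srcLane, width=8)
// instead of shared memory; srcLane is taken inside the calling thread's own
// 8-lane group. A minimal standalone illustration of that width-8 addressing,
// doing the same "swap with the lane 4 away" exchange the first cross-lane
// butterfly uses; shfl_demo() is hypothetical, and CUDA 9+ toolkits spell the
// intrinsic __shfl_sync(0xffffffff, var, srcLane, width):
__global__ void shfl_demo(int *out)
{
	int v = (int) threadIdx.x;                 /* each lane contributes its own id */
	int partner = __shfl(v, (threadIdx.x & 3) + ((threadIdx.x & 4) ^ 4), 8);
	out[threadIdx.x] = partner;                /* lane 0 reads lane 4, lane 4 reads lane 0, ... */
}
/* launch as shfl_demo<<<1, 8>>>(d_out); expected output: 4 5 6 7 0 1 2 3 */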
+ */ + const int tmp = y[15]; #pragma unroll 8 - for (i=0; i<8; i++) - y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]); + for (int i=0; i<8; i++) + y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]); #pragma unroll 8 - for (i=8; i<16; i++) - y[16+i] = 0; - - /* handle X^255 with an additional butterfly */ - if ((threadIdx.x&7) == 7) - { - y[15] = REDUCE(tmp + 1); - y[31] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); - } - - FFT_128_full(y); - FFT_128_full(y+16); + for (int i=8; i<16; i++) + y[16+i] = 0; + + /* handle X^255 with an additional butterfly */ + if ((threadIdx.x&7) == 7) + { + y[15] = REDUCE(tmp + 1); + y[31] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); + } + + FFT_128_full(y); + FFT_128_full(y+16); } - /***************************************************/ -__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) +__device__ __forceinline__ +void Expansion(const uint32_t *data, uint4 *g_temp4) { - int i; - - /* Message Expansion using Number Theoretical Transform similar to FFT */ - int expanded[32]; + /* Message Expansion using Number Theoretical Transform similar to FFT */ + int expanded[32]; #pragma unroll 4 - for (i=0; i < 4; i++) { - expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; - expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; - } + for (int i=0; i < 4; i++) { + expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; + expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; + } #pragma unroll 8 - for (i=8; i < 16; i++) - expanded[i] = 0; + for (int i=8; i < 16; i++) + expanded[i] = 0; - FFT_256_halfzero(expanded); + FFT_256_halfzero(expanded); - // store w matrices in global memory + // store w matrices in global memory #define mul_185(x) ( (x)*185 ) #define mul_233(x) ( (x)*233 ) - uint4 vec0; - int P, Q, P1, Q1, P2, Q2; - bool even = (threadIdx.x & 1) == 0; + uint4 vec0; + int P, Q, P1, Q1, P2, Q2; + bool even = (threadIdx.x & 1) == 0; // 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 // 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 // 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 // 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 - // 2 6 0 4 - const int perm0[8] = { 2,3,6,7,0,1,4,5 }; // TODO: das landet im lmem. doof. - - P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); - P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); - P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); - P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? 
P1 : P2; - Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); - g_temp4[threadIdx.x&7] = vec0; + // 2 6 0 4 + + P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); + g_temp4[threadIdx.x&7] = vec0; // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - // 6 2 4 0 - const int perm1[8] = { 6,7,2,3,4,5,0,1 }; // TODO: das landet im lmem. doof. - - P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); - P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); - P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); - P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); - g_temp4[8+(threadIdx.x&7)] = vec0; + // 6 2 4 0 + + P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? 
Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; + Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); + g_temp4[8+(threadIdx.x&7)] = vec0; // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 // 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 // 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 - // 7 5 3 1 - const int perm2[8] = { 7,6,5,4,3,2,1,0 }; // TODO: das landet im lmem. doof. - - bool hi = (threadIdx.x&7)>=4; - - P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); - P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); - P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); - P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); - g_temp4[16+(threadIdx.x&7)] = vec0; + // 7 5 3 1 + + bool hi = (threadIdx.x&7)>=4; + + P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? 
Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); + g_temp4[16+(threadIdx.x&7)] = vec0; // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 @@ -429,23 +453,22 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) // 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 // 1 3 5 7 - const int perm3[8] = { 1,0,3,2,5,4,7,6 }; // TODO: das landet im lmem. doof. - - bool lo = (threadIdx.x&7)<4; - - P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); - P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); - P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); - P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); - g_temp4[24+(threadIdx.x&7)] = vec0; + + bool lo = (threadIdx.x&7)<4; + + P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; + Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? 
P1 : P2; + Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); + g_temp4[24+(threadIdx.x&7)] = vec0; // 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 // 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 @@ -457,235 +480,255 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) //{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 }, //{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 }, - const int perm4[8] = { 0,1,4,5,6,7,2,3 }; // TODO: das landet im lmem. doof. - - bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5 - - P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); - P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); - P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); - P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); - - g_temp4[32+(threadIdx.x&7)] = vec0; + bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5 + + P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); + + g_temp4[32+(threadIdx.x&7)] = vec0; // 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 // 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 // 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 // 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 - const int perm5[8] = { 6,7,2,3,0,1,4,5 }; // TODO: das landet im lmem. doof. 
- - P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); - P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); - P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); - P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); - - g_temp4[40+(threadIdx.x&7)] = vec0; + P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8); + Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); + + g_temp4[40+(threadIdx.x&7)] = vec0; // 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 // 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 // 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 // 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 - const int perm6[8] = { 6,7,0,1,4,5,2,3 }; // TODO: das landet im lmem. doof. - // sel markiert threads 2,3,4,5 - - int t; - t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); - t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? 
Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); - t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); - t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); - - g_temp4[48+(threadIdx.x&7)] = vec0; + // sel markiert threads 2,3,4,5 + + int t; + t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); + + g_temp4[48+(threadIdx.x&7)] = vec0; // 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 // 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 // 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 // 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 - const int perm7[8] = { 4,5,2,3,6,7,0,1 }; // TODO: das landet im lmem. doof. - // sel markiert threads 2,3,4,5 - - t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); - t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? 
Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); - t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); - t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); - - g_temp4[56+(threadIdx.x&7)] = vec0; + // sel markiert threads 2,3,4,5 + + t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); + t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); + P = even? P1 : P2; Q = even? Q1 : Q2; + vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); + + g_temp4[56+(threadIdx.x&7)] = vec0; #undef mul_185 #undef mul_233 } /***************************************************/ -// Die Hash-Funktion -__global__ void __launch_bounds__(TPB,4) -x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x)/8; - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread);
-
- int hashPosition = nounce - startNounce;
- uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition];
-
- // Hash einlesen und auf 8 Threads und 2 Register verteilen
- uint32_t Hash[2];
-#pragma unroll 2
- for (int i=0; i<2; i++)
- Hash[i] = inpHash[8*i+(threadIdx.x&7)];
-
- // Puffer für expandierte Nachricht
- uint4 *temp4 = &g_temp4[64 * hashPosition];
-
- Expansion(Hash, temp4);
- }
+__global__ __launch_bounds__(TPB, 4)
+void x11_simd512_gpu_expand_64(uint32_t threads, uint32_t *g_hash, uint4 *g_temp4)
+{
+ int threadBloc = (blockDim.x * blockIdx.x + threadIdx.x) / 8;
+ if (threadBloc < threads)
+ {
+ int hashPosition = threadBloc * 16;
+ uint32_t *inpHash = &g_hash[hashPosition];
+
+ // Read hash per 8 threads
+ uint32_t Hash[2];
+ int ndx = threadIdx.x & 7;
+ Hash[0] = inpHash[ndx];
+ Hash[1] = inpHash[ndx + 8];
+
+ // Buffer for the expanded message
+ uint4 *temp4 = &g_temp4[hashPosition * 4];
+
+ Expansion(Hash, temp4);
+ }
}
-__global__ void __launch_bounds__(TPB,4)
-x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
+__global__ __launch_bounds__(TPB, 1)
+void x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state)
{
- int thread = (blockDim.x * blockIdx.x + threadIdx.x);
- if (thread < threads)
- {
- uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
-
- int hashPosition = nounce - startNounce;
- uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
-
- Compression1(Hash, hashPosition, g_fft4, g_state);
- }
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint32_t *Hash = &g_hash[thread * 16];
+ Compression1(Hash, thread, g_fft4, g_state);
+ }
}
-__global__ void __launch_bounds__(TPB,4)
-x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
+__global__ __launch_bounds__(TPB, 1)
+void x11_simd512_gpu_compress2_64(uint32_t threads, uint4 *g_fft4, uint32_t *g_state)
{
- int thread = (blockDim.x * blockIdx.x + threadIdx.x);
- if (thread < threads)
- {
- uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
-
- int hashPosition = nounce - startNounce;
-
- Compression2(hashPosition, g_fft4, g_state);
- }
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ Compression2(thread, g_fft4, g_state);
+ }
}
-__global__ void __launch_bounds__(TPB,4)
-x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state)
+__global__ __launch_bounds__(TPB, 2)
+void x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state)
{
- int thread = (blockDim.x * blockIdx.x + threadIdx.x);
- if (thread < threads)
- {
- uint32_t nounce = (g_nonceVector != NULL) ?
g_nonceVector[thread] : (startNounce + thread);
-
- int hashPosition = nounce - startNounce;
- uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint32_t *Hash = &g_hash[thread * 16];
+ Compression1(Hash, thread, g_fft4, g_state);
+ Compression2(thread, g_fft4, g_state);
+ }
+}
- Final(Hash, hashPosition, g_fft4, g_state);
- }
+__global__ __launch_bounds__(TPB, 2)
+void x11_simd512_gpu_final_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint32_t *Hash = &g_hash[thread * 16];
+ Final(Hash, thread, g_fft4, g_state);
+ }
}
-// Setup-Funktionen
-__host__ void x11_simd512_cpu_init(int thr_id, int threads)
+#else
+__global__ void x11_simd512_gpu_expand_64(uint32_t threads, uint32_t *g_hash, uint4 *g_temp4) {}
+__global__ void x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {}
+__global__ void x11_simd512_gpu_compress2_64(uint32_t threads, uint4 *g_fft4, uint32_t *g_state) {}
+__global__ void x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {}
+__global__ void x11_simd512_gpu_final_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {}
+#endif /* SM3+ */
+
+__host__
+int x11_simd512_cpu_init(int thr_id, uint32_t threads)
{
- cudaMalloc( &d_state[thr_id], 32*sizeof(int)*threads );
- cudaMalloc( &d_temp4[thr_id], 64*sizeof(uint4)*threads );
-
-#if 1
- // Textur für 128 Bit Zugriffe
- cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc();
- texRef1D_128.normalized = 0;
- texRef1D_128.filterMode = cudaFilterModePoint;
- texRef1D_128.addressMode[0] = cudaAddressModeClamp;
- cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads);
+ int dev_id = device_map[thr_id];
+ cuda_get_arch(thr_id);
+ if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
+ x11_simd512_cpu_init_sm2(thr_id);
+ return 0;
+ }
+
+ CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */
+ CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err);
+
+#ifndef DEVICE_DIRECT_CONSTANTS
+ cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
+
+ cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
#endif
- cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
- cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
- cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
+ // Texture for 128-bit accesses
+ cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc<uint4>();
+ texRef1D_128.normalized = 0;
+ texRef1D_128.filterMode = cudaFilterModePoint;
+ texRef1D_128.addressMode[0] = cudaAddressModeClamp;
+ CUDA_CALL_OR_RET_X(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads), (int) err);
- // CH
- cudaMemcpyToSymbol( d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
- cudaMemcpyToSymbol( d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
- cudaMemcpyToSymbol( d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
- cudaMemcpyToSymbol( d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
-
-// cudaFuncSetCacheConfig(x11_simd512_gpu_compress1_64, cudaFuncCachePreferL1);
-// cudaFuncSetCacheConfig(x11_simd512_gpu_compress2_64, cudaFuncCachePreferL1);
+ return 0;
}
-__host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+__host__
+void x11_simd512_cpu_free(int thr_id)
{
- const int threadsperblock = TPB;
+ int dev_id = device_map[thr_id];
+ if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) {
+ cudaFree(d_temp4[thr_id]);
+ cudaFree(d_state[thr_id]);
+ }
+}
- // Größe des dynamischen Shared Memory Bereichs
- size_t shared_size = 0;
+__host__
+void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+{
+ const uint32_t threadsperblock = TPB;
+ int dev_id = device_map[thr_id];
- // berechne wie viele Thread Blocks wir brauchen
- dim3 block(threadsperblock);
+ dim3 block(threadsperblock);
+ dim3 grid((threads + threadsperblock-1) / threadsperblock);
+ dim3 gridX8(grid.x * 8);
- dim3 grid8(((threads + threadsperblock-1)/threadsperblock)*8);
- x11_simd512_gpu_expand_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]);
+ if (d_nonceVector != NULL || device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
+ x11_simd512_cpu_hash_64_sm2(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
+ return;
+ }
- dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ x11_simd512_gpu_expand_64 <<<gridX8, block>>> (threads, d_hash, d_temp4[thr_id]);
- // künstlich die Occupancy limitieren, um das totale Erschöpfen des Texture Cache zu vermeiden
- x11_simd512_gpu_compress1_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
- x11_simd512_gpu_compress2_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
+ if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) {
+ x11_simd512_gpu_compress_64_maxwell <<< grid, block >>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]);
+ } else {
+ x11_simd512_gpu_compress1_64 <<< grid, block >>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]);
+ x11_simd512_gpu_compress2_64 <<< grid, block >>> (threads, d_temp4[thr_id], d_state[thr_id]);
+ }
- x11_simd512_gpu_final_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
+ x11_simd512_gpu_final_64 <<<grid, block>>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]);
- MyStreamSynchronize(NULL, order, thr_id);
+ //MyStreamSynchronize(NULL, order, thr_id);
}
diff --git a/x11/cuda_x11_simd512_func.cuh b/x11/cuda_x11_simd512_func.cuh
new file mode 100644
index 0000000000..f61eaa4f59
--- /dev/null
+++ b/x11/cuda_x11_simd512_func.cuh
@@ -0,0 +1,1396 @@
+#define SIMD_FUNCTIONS_CUH
+
+__device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C,
uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for(int j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for(int j=0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = 
ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + 
MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const 
uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + 
w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + 
temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t 
* A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + 
R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[6]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = 
ROTL32(temp, s) + R[7]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[4]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[5]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[2]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[3]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[0]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[1]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[2]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[3]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[0]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[1]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[6]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[7]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[4]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[5]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[3]; + temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[2]; + temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[1]; + temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[0]; + temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[7]; + temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[6]; + temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[5]; + temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[4]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[5]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[4]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[7]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[6]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[1]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[0]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[3]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[2]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, 
const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[7]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[6]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[5]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[4]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[3]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[2]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[1]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[0]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[4]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[5]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[6]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[7]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[0]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[1]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[2]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[3]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} +__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +{ + uint32_t temp; + uint32_t R[8]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + temp = D[0] + w[0] + IF(A[0], B[0], C[0]); + D[0] = ROTL32(temp, s) + R[1]; + temp = D[1] + w[1] + IF(A[1], B[1], C[1]); + D[1] = ROTL32(temp, s) + R[0]; + temp = D[2] + w[2] + IF(A[2], B[2], C[2]); + D[2] = ROTL32(temp, s) + R[3]; + temp = D[3] + w[3] + IF(A[3], B[3], C[3]); + D[3] = ROTL32(temp, s) + R[2]; + temp = D[4] + w[4] + IF(A[4], B[4], C[4]); + D[4] = ROTL32(temp, s) + R[5]; + temp = D[5] + w[5] + IF(A[5], B[5], C[5]); + D[5] = ROTL32(temp, s) + R[4]; + temp = D[6] + w[6] + IF(A[6], B[6], C[6]); + D[6] = ROTL32(temp, s) + R[7]; + temp = D[7] + w[7] + IF(A[7], B[7], C[7]); + D[7] = ROTL32(temp, s) + R[6]; +#pragma unroll 8 + for (int j = 0; j<8; j++) { + A[j] = R[j]; + } +} + +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw0[8][8] = { +#else +static __constant__ uint32_t d_cw0[8][8]; +static const uint32_t h_cw0[8][8] = { +#endif + 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, + 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, + 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, + 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, + 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, + 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 
0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, + 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, + 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 +}; + +__device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_3(d_cw0[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_4(d_cw0[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_5(d_cw0[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); +} + +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw1[8][8] = { +#else +static __constant__ uint32_t d_cw1[8][8]; +static const uint32_t h_cw1[8][8] = { +#endif + 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, + 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, + 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, + 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, + 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, + 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, + 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, + 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 +}; + +__device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_11(d_cw1[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_12(d_cw1[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_13(d_cw1[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); +} + +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw2[8][8] = { +#else +static __constant__ uint32_t d_cw2[8][8]; +static const uint32_t h_cw2[8][8] = { +#endif + 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, + 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, + 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, + 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, + 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, + 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, + 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, + 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE +}; + +__device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); + 
STEP8_IF_19(d_cw2[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_20(d_cw2[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_21(d_cw2[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); +} + +#ifdef DEVICE_DIRECT_CONSTANTS +static __constant__ uint32_t d_cw3[8][8] = { +#else +static __constant__ uint32_t d_cw3[8][8]; +static const uint32_t h_cw3[8][8] = { +#endif + 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, + 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, + 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, + 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, + 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, + 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, + 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, + 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D +}; + +__device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) +{ + STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF_27(d_cw3[3], u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ_28(d_cw3[4], r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ_29(d_cw3[5], s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ_30(d_cw3[6], t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A); +} + +#if __CUDA_ARCH__ < 350 +#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) +#else +//#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) +#define expanded_vector(x) __ldg(&g_fft4[x]) +#endif + +__device__ __forceinline__ void Round8_0(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 0 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_0(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_1(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_2(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_3(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_4(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_5(w, s, t, &A[24], A, &A[8], 
&A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_6(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_7(w, u, r, &A[8], &A[16], &A[24], A); + + +} +__device__ __forceinline__ void Round8_1(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 16 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_8(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_9(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_10(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_11(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_12(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_13(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_14(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_15(w, u, r, &A[8], &A[16], &A[24], A); + + +} +__device__ __forceinline__ void Round8_2(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 32 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_16(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_17(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_18(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_19(w, u, r, &A[8], 
&A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_20(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_21(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_22(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_23(w, u, r, &A[8], &A[16], &A[24], A); + + +} +__device__ __forceinline__ void Round8_3(uint32_t *A, const int thr_offset, + int r, int s, int t, int u, uint4 *g_fft4) { + uint32_t w[8]; + uint4 hv1, hv2; + + int tmp = 48 + thr_offset; + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_24(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_25(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_26(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_IF_27(w, u, r, &A[8], &A[16], &A[24], A); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_28(w, r, s, A, &A[8], &A[16], &A[24]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_29(w, s, t, &A[24], A, &A[8], &A[16]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_30(w, t, u, &A[16], &A[24], A, &A[8]); + hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; + hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; + STEP8_MAJ_31(w, u, r, &A[8], &A[16], &A[24], A); + + +} + +__device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, const uint32_t *M, uint4 *g_fft4) { + int i; + const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) +#pragma unroll 8 + for(i=0; i<8; i++) { + A[i] ^= M[i]; + (&A[8])[i] ^= M[8+i]; + } + Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4); + Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4); +} + +__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) { + uint32_t A[32]; + int i; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = c_IV_512[i]; + 
uint32_t buffer[16]; +#pragma unroll 16 + for (i=0; i < 16; i++) buffer[i] = hashval[i]; + SIMD_Compress1(A, texture_id, buffer, g_fft4); + uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; +} + +__device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, uint4 *g_fft4) { + uint32_t IV[4][8]; + int i; + const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) +#pragma unroll 8 + for(i=0; i<8; i++) { + IV[0][i] = c_IV_512[i]; + IV[1][i] = c_IV_512[8+i]; + IV[2][i] = c_IV_512[16+i]; + IV[3][i] = c_IV_512[24+i]; + } + Round8_2(A, thr_offset, 29, 9, 15, 5, g_fft4); + Round8_3(A, thr_offset, 4, 13, 10, 25, g_fft4); + STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); + STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); + STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); + STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); +} + +__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, uint32_t *g_state) { + uint32_t A[32]; + int i; + uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; + SIMD_Compress2(A, texture_id, g_fft4); +#pragma unroll 32 + for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; +} + +__device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t *M) { + uint32_t IV[4][8]; + int i; +#pragma unroll 8 + for(i=0; i<8; i++) { + IV[0][i] = A[i]; + IV[1][i] = (&A[8])[i]; + IV[2][i] = (&A[16])[i]; + IV[3][i] = (&A[24])[i]; + } +#pragma unroll 8 + for(i=0; i<8; i++) { + A[i] ^= M[i]; + (&A[8])[i] ^= M[8+i]; + } + Round8_0_final(A, 3, 23, 17, 27); + Round8_1_final(A, 28, 19, 22, 7); + Round8_2_final(A, 29, 9, 15, 5); + Round8_3_final(A, 4, 13, 10, 25); + STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); + STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); + STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); + STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); +} + +__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) { + uint32_t A[32]; + int i; + uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; +#pragma unroll 32 + for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; + uint32_t buffer[16]; + buffer[0] = 512; +#pragma unroll 15 + for (i=1; i < 16; i++) buffer[i] = 0; + SIMD_Compress_Final(A, buffer); +#pragma unroll 16 + for (i=0; i < 16; i++) + hashval[i] = A[i]; +} diff --git a/x11/cuda_x11_simd512_sm2.cuh b/x11/cuda_x11_simd512_sm2.cuh new file mode 100644 index 0000000000..1c5b314364 --- /dev/null +++ b/x11/cuda_x11_simd512_sm2.cuh @@ -0,0 +1,575 @@ +/*************************************************************************************************** + * SM 2.x SIMD512 CUDA Implementation without shuffle + * + * cbuchner 2014 / tpruvot 2015 + */ + +#include "cuda_helper.h" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 210 +#endif + +#if __CUDA_ARCH__ < 300 + +#define T32(x) (x) + +#if 0 /* already declared in SM 3+ implementation */ +__constant__ uint32_t c_IV_512[32]; +const uint32_t h_IV_512[32] = { + 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, + 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, + 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 
0xcd8d65bb, 0xfdb7a257, + 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 +}; + +__constant__ int c_FFT128_8_16_Twiddle[128]; +static const int h_FFT128_8_16_Twiddle[128] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, + 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, + 1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2,-123, 17,-111, + 1,-118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123,-122, + 1, 116, 92,-122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, + 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, + 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 +}; + +__constant__ int c_FFT256_2_128_Twiddle[128]; +static const int h_FFT256_2_128_Twiddle[128] = { + 1, 41,-118, 45, 46, 87, -31, 14, + 60,-110, 116,-127, -67, 80, -61, 69, + 2, 82, 21, 90, 92, -83, -62, 28, + 120, 37, -25, 3, 123, -97,-122,-119, + 4, -93, 42, -77, -73, 91,-124, 56, + -17, 74, -50, 6, -11, 63, 13, 19, + 8, 71, 84, 103, 111, -75, 9, 112, + -34,-109,-100, 12, -22, 126, 26, 38, + 16,-115, -89, -51, -35, 107, 18, -33, + -68, 39, 57, 24, -44, -5, 52, 76, + 32, 27, 79,-102, -70, -43, 36, -66, + 121, 78, 114, 48, -88, -10, 104,-105, + 64, 54, -99, 53, 117, -86, 72, 125, + -15,-101, -29, 96, 81, -20, -49, 47, + 128, 108, 59, 106, -23, 85,-113, -7, + -30, 55, -58, -65, -95, -40, -98, 94 +}; +#endif + +__constant__ int c_FFT[256] = { + // this is the FFT result in revbin permuted order + 4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92, + -119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63, + -63, 107, -107, -76, 76, -119, 119, -83, 83, 126, -126, 94, -94, -23, 23, -76, + 76, -47, 47, 92, -92, -117, 117, 73, -73, -53, 53, 88, -88, -80, 80, -47, 47, + 5, -5, 67, -67, 34, -34, 4, -4, 87, -87, -28, 28, -70, 70, -110, 110, -18, 18, 93, + -93, 51, -51, 36, -36, 118, -118, -106, 106, 45, -45, -108, 108, -44, 44, 117, + -117, -121, 121, -37, 37, 65, -65, 37, -37, 40, -40, -42, 42, 91, -91, -128, 128, + -21, 21, 94, -94, -98, 98, -47, 47, 28, -28, 115, -115, 16, -16, -20, 20, 122, + -122, 115, -115, 46, -46, 84, -84, -127, 127, 57, -57, 127, -127, -80, 80, 24, + -24, 15, -15, 29, -29, -78, 78, -126, 126, 16, -16, 52, -52, 55, -55, 110, -110, + -51, 51, -120, 120, -124, 124, -24, 24, -76, 76, 26, -26, -21, 21, -64, 64, -99, + 99, 85, -85, -15, 15, -120, 120, -116, 116, 85, -85, 12, -12, -24, 24, 4, -4, + 79, -79, 76, -76, 23, -23, 4, -4, -108, 108, -20, 20, 73, -73, -42, 42, -7, 7, + -29, 29, -123, 123, 49, -49, -96, 96, -68, 68, -112, 112, 116, -116, -24, 24, 93, + -93, -125, 125, -86, 86, 117, -117, -91, 91, 42, -42, 87, -87, -117, 117, 102, -102 +}; + +__constant__ int c_P8[32][8] = { + { 2, 66, 34, 98, 18, 82, 50, 114 }, + { 6, 70, 38, 102, 22, 86, 54, 118 }, + { 0, 64, 32, 96, 16, 80, 48, 112 }, + { 4, 68, 36, 100, 20, 84, 52, 116 }, + { 14, 78, 46, 110, 30, 94, 62, 126 }, + { 10, 74, 42, 106, 26, 90, 58, 122 }, + { 12, 76, 44, 108, 28, 92, 60, 124 }, + { 8, 72, 40, 104, 24, 88, 56, 120 }, + { 15, 79, 47, 111, 31, 95, 63, 127 }, + { 13, 77, 45, 109, 29, 93, 61, 125 }, + { 3, 67, 35, 99, 19, 83, 51, 115 }, + { 1, 65, 33, 97, 17, 81, 49, 113 }, + { 9, 73, 41, 105, 25, 89, 57, 121 }, + { 11, 75, 43, 107, 27, 91, 59, 123 }, + { 5, 69, 37, 101, 21, 85, 53, 117 }, + { 7, 71, 39, 103, 23, 87, 55, 119 }, + { 8, 72, 40, 104, 24, 88, 56, 120 }, + { 4, 68, 36, 100, 
20, 84, 52, 116 }, + { 14, 78, 46, 110, 30, 94, 62, 126 }, + { 2, 66, 34, 98, 18, 82, 50, 114 }, + { 6, 70, 38, 102, 22, 86, 54, 118 }, + { 10, 74, 42, 106, 26, 90, 58, 122 }, + { 0, 64, 32, 96, 16, 80, 48, 112 }, + { 12, 76, 44, 108, 28, 92, 60, 124 }, + { 134, 198, 166, 230, 150, 214, 182, 246 }, + { 128, 192, 160, 224, 144, 208, 176, 240 }, + { 136, 200, 168, 232, 152, 216, 184, 248 }, + { 142, 206, 174, 238, 158, 222, 190, 254 }, + { 140, 204, 172, 236, 156, 220, 188, 252 }, + { 138, 202, 170, 234, 154, 218, 186, 250 }, + { 130, 194, 162, 226, 146, 210, 178, 242 }, + { 132, 196, 164, 228, 148, 212, 180, 244 }, +}; + +__constant__ int c_Q8[32][8] = { + { 130, 194, 162, 226, 146, 210, 178, 242 }, + { 134, 198, 166, 230, 150, 214, 182, 246 }, + { 128, 192, 160, 224, 144, 208, 176, 240 }, + { 132, 196, 164, 228, 148, 212, 180, 244 }, + { 142, 206, 174, 238, 158, 222, 190, 254 }, + { 138, 202, 170, 234, 154, 218, 186, 250 }, + { 140, 204, 172, 236, 156, 220, 188, 252 }, + { 136, 200, 168, 232, 152, 216, 184, 248 }, + { 143, 207, 175, 239, 159, 223, 191, 255 }, + { 141, 205, 173, 237, 157, 221, 189, 253 }, + { 131, 195, 163, 227, 147, 211, 179, 243 }, + { 129, 193, 161, 225, 145, 209, 177, 241 }, + { 137, 201, 169, 233, 153, 217, 185, 249 }, + { 139, 203, 171, 235, 155, 219, 187, 251 }, + { 133, 197, 165, 229, 149, 213, 181, 245 }, + { 135, 199, 167, 231, 151, 215, 183, 247 }, + { 9, 73, 41, 105, 25, 89, 57, 121 }, + { 5, 69, 37, 101, 21, 85, 53, 117 }, + { 15, 79, 47, 111, 31, 95, 63, 127 }, + { 3, 67, 35, 99, 19, 83, 51, 115 }, + { 7, 71, 39, 103, 23, 87, 55, 119 }, + { 11, 75, 43, 107, 27, 91, 59, 123 }, + { 1, 65, 33, 97, 17, 81, 49, 113 }, + { 13, 77, 45, 109, 29, 93, 61, 125 }, + { 135, 199, 167, 231, 151, 215, 183, 247 }, + { 129, 193, 161, 225, 145, 209, 177, 241 }, + { 137, 201, 169, 233, 153, 217, 185, 249 }, + { 143, 207, 175, 239, 159, 223, 191, 255 }, + { 141, 205, 173, 237, 157, 221, 189, 253 }, + { 139, 203, 171, 235, 155, 219, 187, 251 }, + { 131, 195, 163, 227, 147, 211, 179, 243 }, + { 133, 197, 165, 229, 149, 213, 181, 245 }, +}; + +#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \ + ((x)%7) == 1 ? 6 : \ + ((x)%7) == 2 ? 2 : \ + ((x)%7) == 3 ? 3 : \ + ((x)%7) == 4 ? 5 : \ + ((x)%7) == 5 ? 7 : 4 ) + +/************* the round function ****************/ + +//#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +//#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) + +__device__ __forceinline__ +void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) +{ + uint32_t R[8]; + #pragma unroll 8 + for(int j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + #pragma unroll 8 + for(int j=0; j<8; j++) { + D[j] = D[j] + w[j] + IF(A[j], B[j], C[j]); + D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]); + A[j] = R[j]; + } +} + +__device__ __forceinline__ +void STEP8_MAJ(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) +{ + uint32_t R[8]; + #pragma unroll 8 + for(int j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + #pragma unroll 8 + for(int j=0; j<8; j++) { + D[j] = D[j] + w[j] + MAJ(A[j], B[j], C[j]); + D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]); + A[j] = R[j]; + } +} + +__device__ __forceinline__ +void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u) +{ + uint32_t w[8][8]; + int code = i<2? 
185: 233; + + /* + * The FFT output y is in revbin permuted order, + * but this is included in the tables P and Q + */ + + #pragma unroll 8 + for(int a=0; a<8; a++) { + #pragma unroll 8 + for(int b=0; b<8; b++) { + w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410); + } + } + + STEP8_IF(w[0], 8*i+0, r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF(w[1], 8*i+1, s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF(w[2], 8*i+2, t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF(w[3], 8*i+3, u, r, &A[8], &A[16], &A[24], A); + + STEP8_MAJ(w[4], 8*i+4, r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ(w[5], 8*i+5, s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ(w[6], 8*i+6, t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ(w[7], 8*i+7, u, r, &A[8], &A[16], &A[24], A); +} + + +/********************* Message expansion ************************/ + +/* + * Reduce modulo 257; result is in [-127; 383] + * REDUCE(x) := (x&255) - (x>>8) + */ +#define REDUCE(x) (((x)&255) - ((x)>>8)) + +/* + * Reduce from [-127; 383] to [-128; 128] + * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 + */ +#define EXTRA_REDUCE_S(x) \ + ((x)<=128 ? (x) : (x)-257) + +/* + * Reduce modulo 257; result is in [-128; 128] + */ +#define REDUCE_FULL_S(x) \ + EXTRA_REDUCE_S(REDUCE(x)) + +__device__ __forceinline__ +void FFT_8(int *y, int stripe) +{ + /* + * FFT_8 using w=4 as 8th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. + */ + #define X(i) y[stripe*i] + + #define DO_REDUCE(i) \ + X(i) = REDUCE(X(i)) + + #define DO_REDUCE_FULL_S(i) { \ + X(i) = REDUCE(X(i)); \ + X(i) = EXTRA_REDUCE_S(X(i)); \ + } + + #define BUTTERFLY(i,j,n) { \ + int u= X(i); \ + int v= X(j); \ + X(i) = u+v; \ + X(j) = (u-v) << (2*n); \ + } + + BUTTERFLY(0, 4, 0); + BUTTERFLY(1, 5, 1); + BUTTERFLY(2, 6, 2); + BUTTERFLY(3, 7, 3); + + DO_REDUCE(6); + DO_REDUCE(7); + + BUTTERFLY(0, 2, 0); + BUTTERFLY(4, 6, 0); + BUTTERFLY(1, 3, 2); + BUTTERFLY(5, 7, 2); + + DO_REDUCE(7); + + BUTTERFLY(0, 1, 0); + BUTTERFLY(2, 3, 0); + BUTTERFLY(4, 5, 0); + BUTTERFLY(6, 7, 0); + + DO_REDUCE_FULL_S(0); + DO_REDUCE_FULL_S(1); + DO_REDUCE_FULL_S(2); + DO_REDUCE_FULL_S(3); + DO_REDUCE_FULL_S(4); + DO_REDUCE_FULL_S(5); + DO_REDUCE_FULL_S(6); + DO_REDUCE_FULL_S(7); + + #undef X + #undef DO_REDUCE + #undef DO_REDUCE_FULL_S + #undef BUTTERFLY +} + +__device__ __forceinline__ +void FFT_16(int *y, int stripe) +{ + /* + * FFT_16 using w=2 as 16th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. 
+ */ + + #define X(i) y[stripe*i] + + #define DO_REDUCE(i) \ + X(i) = REDUCE(X(i)) + + #define DO_REDUCE_FULL_S(i) { \ + X(i) = REDUCE(X(i)); \ + X(i) = EXTRA_REDUCE_S(X(i)); \ + } + + #define BUTTERFLY(i,j,n) { \ + int u= X(i); \ + int v= X(j); \ + X(i) = u+v; \ + X(j) = (u-v) << n; \ + } + + BUTTERFLY(0, 8, 0); + BUTTERFLY(1, 9, 1); + BUTTERFLY(2, 10, 2); + BUTTERFLY(3, 11, 3); + BUTTERFLY(4, 12, 4); + BUTTERFLY(5, 13, 5); + BUTTERFLY(6, 14, 6); + BUTTERFLY(7, 15, 7); + + DO_REDUCE(11); + DO_REDUCE(12); + DO_REDUCE(13); + DO_REDUCE(14); + DO_REDUCE(15); + + BUTTERFLY( 0, 4, 0); + BUTTERFLY( 1, 5, 2); + BUTTERFLY( 2, 6, 4); + BUTTERFLY( 3, 7, 6); + + BUTTERFLY( 8, 12, 0); + BUTTERFLY( 9, 13, 2); + BUTTERFLY(10, 14, 4); + BUTTERFLY(11, 15, 6); + + DO_REDUCE(5); + DO_REDUCE(7); + DO_REDUCE(13); + DO_REDUCE(15); + + BUTTERFLY( 0, 2, 0); + BUTTERFLY( 1, 3, 4); + BUTTERFLY( 4, 6, 0); + BUTTERFLY( 5, 7, 4); + + BUTTERFLY( 8, 10, 0); + BUTTERFLY(12, 14, 0); + BUTTERFLY( 9, 11, 4); + BUTTERFLY(13, 15, 4); + + BUTTERFLY( 0, 1, 0); + BUTTERFLY( 2, 3, 0); + BUTTERFLY( 4, 5, 0); + BUTTERFLY( 6, 7, 0); + + BUTTERFLY( 8, 9, 0); + BUTTERFLY(10, 11, 0); + BUTTERFLY(12, 13, 0); + BUTTERFLY(14, 15, 0); + + DO_REDUCE_FULL_S( 0); + DO_REDUCE_FULL_S( 1); + DO_REDUCE_FULL_S( 2); + DO_REDUCE_FULL_S( 3); + DO_REDUCE_FULL_S( 4); + DO_REDUCE_FULL_S( 5); + DO_REDUCE_FULL_S( 6); + DO_REDUCE_FULL_S( 7); + DO_REDUCE_FULL_S( 8); + DO_REDUCE_FULL_S( 9); + DO_REDUCE_FULL_S(10); + DO_REDUCE_FULL_S(11); + DO_REDUCE_FULL_S(12); + DO_REDUCE_FULL_S(13); + DO_REDUCE_FULL_S(14); + DO_REDUCE_FULL_S(15); + + #undef X + #undef DO_REDUCE + #undef DO_REDUCE_FULL_S + #undef BUTTERFLY +} + +__device__ __forceinline__ +void FFT_128_full(int *y) +{ + #pragma unroll 16 + for (int i=0; i<16; i++) { + FFT_8(y+i,16); + } + + #pragma unroll 128 + for (int i=0; i<128; i++) + /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i]); + + #pragma unroll 8 + for (int i=0; i<8; i++) { + FFT_16(y+16*i,1); + } +} + +__device__ __forceinline__ +void FFT_256_halfzero(int y[256]) +{ + /* + * FFT_256 using w=41 as 256th root of unity. + * Decimation in frequency (DIF) NTT. + * Output data is in revbin_permuted order. + * In place. 
+ */ + const int tmp = y[127]; + + #pragma unroll 127 + for (int i=0; i<127; i++) + y[128+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[i]); + + /* handle X^255 with an additionnal butterfly */ + y[127] = REDUCE(tmp + 1); + y[255] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); + + FFT_128_full(y); + FFT_128_full(y+128); +} + +__device__ __forceinline__ +void SIMD_Compress(uint32_t A[32], const int *expanded, const uint32_t *M) +{ + uint32_t IV[4][8]; + + /* Save the chaining value for the feed-forward */ + + #pragma unroll 8 + for(int i=0; i<8; i++) { + IV[0][i] = A[i]; + IV[1][i] = (&A[8])[i]; + IV[2][i] = (&A[16])[i]; + IV[3][i] = (&A[24])[i]; + } + + /* XOR the message to the chaining value */ + /* we can XOR word-by-word */ + #pragma unroll 8 + for(int i=0; i<8; i++) { + A[i] ^= M[i]; + (&A[8])[i] ^= M[8+i]; + } + + /* Run the feistel ladders with the expanded message */ + Round8(A, expanded, 0, 3, 23, 17, 27); + Round8(A, expanded, 1, 28, 19, 22, 7); + Round8(A, expanded, 2, 29, 9, 15, 5); + Round8(A, expanded, 3, 4, 13, 10, 25); + + STEP8_IF(IV[0], 32, 4, 13, A, &A[8], &A[16], &A[24]); + STEP8_IF(IV[1], 33, 13, 10, &A[24], A, &A[8], &A[16]); + STEP8_IF(IV[2], 34, 10, 25, &A[16], &A[24], A, &A[8]); + STEP8_IF(IV[3], 35, 25, 4, &A[8], &A[16], &A[24], A); +} + + +/***************************************************/ + +__device__ __forceinline__ +void SIMDHash(const uint32_t *data, uint32_t *hashval) +{ + uint32_t A[32]; + uint32_t buffer[16]; + + #pragma unroll 32 + for (int i=0; i < 32; i++) A[i] = c_IV_512[i]; + + #pragma unroll 16 + for (int i=0; i < 16; i++) buffer[i] = data[i]; + + /* Message Expansion using Number Theoretical Transform similar to FFT */ + int expanded[256]; + { + #pragma unroll 16 + for(int i=0; i<64; i+=4) { + expanded[i+0] = __byte_perm(buffer[i/4],0,0x4440); + expanded[i+1] = __byte_perm(buffer[i/4],0,0x4441); + expanded[i+2] = __byte_perm(buffer[i/4],0,0x4442); + expanded[i+3] = __byte_perm(buffer[i/4],0,0x4443); + } + + #pragma unroll 16 + for(int i=64; i<128; i+=4) { + expanded[i+0] = 0; + expanded[i+1] = 0; + expanded[i+2] = 0; + expanded[i+3] = 0; + } + + FFT_256_halfzero(expanded); + } + + /* Compression Function */ + SIMD_Compress(A, expanded, buffer); + + /* Padding Round with known input (hence the FFT can be precomputed) */ + buffer[0] = 512; + + #pragma unroll 15 + for (int i=1; i < 16; i++) buffer[i] = 0; + + SIMD_Compress(A, c_FFT, buffer); + + #pragma unroll 16 + for (int i=0; i < 16; i++) + hashval[i] = A[i]; +} + +/***************************************************/ +__global__ +void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + const int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*) &g_hash[8 * hashPosition]; + + SIMDHash(Hash, Hash); + } +} + +#else +__global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} +#endif /* __CUDA_ARCH__ < 300 */ + +__host__ +static void x11_simd512_cpu_init_sm2(int thr_id) +{ +#ifndef DEVICE_DIRECT_CONSTANTS + cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); +#endif +} + +__host__ +static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + const int threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 0; + + x11_simd512_gpu_hash_64_sm2<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + MyStreamSynchronize(NULL, order, thr_id); +} diff --git a/x11/exosis.cu b/x11/exosis.cu new file mode 100644 index 0000000000..e4dcfe5aba --- /dev/null +++ b/x11/exosis.cu @@ -0,0 +1,497 @@ +/** + * Timetravel (exosis) CUDA implementation + * by tpruvot@github, exosis + */ + +#include <stdio.h> +#include <memory.h> +#include <unistd.h> + +#define HASH_FUNC_BASE_TIMESTAMP 1538556426U +#define HASH_FUNC_COUNT 8 +#define HASH_FUNC_COUNT_PERMUTATIONS 40320U + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; + +enum Algo { + BLAKE = 0, + BMW, + GROESTL, + SKEIN, + JH, + KECCAK, + LUFFA, + CUBEHASH, + MAX_ALGOS_COUNT +}; + +static const char* algo_strings[] = { + "blake", + "bmw512", + "groestl", + "skein", + "jh512", + "keccak", + "luffa", + "cube", + NULL +}; + +inline void swap8(uint8_t *a, uint8_t *b) +{ + uint8_t t = *a; + *a = *b; + *b = t; +} + +inline void initPerm(uint8_t n[], int count) +{ + for (int i = 0; i < count; i++) + n[i] = i; +} + +static int nextPerm(uint8_t n[], int count) +{ + int tail, i, j; + + if (count <= 1) + return 0; + + for (i = count - 1; i>0 && n[i - 1] >= n[i]; i--); + tail = i; + + if (tail > 0) { + for (j = count - 1; j>tail && n[j] <= n[tail - 1]; j--); + swap8(&n[tail - 1], &n[j]); + } + + for (i = tail, j = count - 1; i<j; i++, j--) + swap8(&n[i], &n[j]); + + return 1; +} + +static void getAlgoString(char *str, uint32_t count) +{ + uint8_t algoList[HASH_FUNC_COUNT]; + char *sptr; + + initPerm(algoList, HASH_FUNC_COUNT); + + for (uint32_t k = 0; k < count; k++) + nextPerm(algoList, HASH_FUNC_COUNT); + + sptr = str; + for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) { + if (algoList[j] >= 10) + sprintf(sptr, "%c", 'A' + (algoList[j] - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoList[j]); + sptr++; + } + *sptr = '\0'; +} + +static __thread uint32_t s_ntime = 0; +static uint32_t s_sequence = UINT32_MAX; +static uint8_t s_firstalgo = 0xFF; +static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 }; + +#define INITIAL_DATE HASH_FUNC_BASE_TIMESTAMP +static inline uint32_t getCurrentAlgoSeq(uint32_t ntime) +{ + // unlike x11evo, the permutation changes often (with ntime) + return (uint32_t) (ntime - INITIAL_DATE) % HASH_FUNC_COUNT_PERMUTATIONS; +} + +// To finish...
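+// Note: HASH_FUNC_COUNT_PERMUTATIONS = 8! = 40320, one value per ordering of the 8 algos;
+// get_travel_order() below rebuilds hashOrder (digits '0'..'7' produced by getAlgoString)
+// only when the ntime-derived sequence actually changes.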
+static void get_travel_order(uint32_t ntime, char *permstr) +{ + uint32_t seq = getCurrentAlgoSeq(ntime); + if (s_sequence != seq) { + getAlgoString(permstr, seq); + s_sequence = seq; + } +} + +// CPU Hash +extern "C" void exosis_hash(void *output, const void *input) +{ + uint32_t _ALIGN(64) hash[64/4] = { 0 }; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_luffa512_context ctx_luffa1; + sph_cubehash512_context ctx_cubehash1; + + if (s_sequence == UINT32_MAX) { + uint32_t *data = (uint32_t*) input; + const uint32_t ntime = (opt_benchmark || !data[17]) ? (uint32_t) time(NULL) : data[17]; + get_travel_order(ntime, hashOrder); + } + + void *in = (void*) input; + int size = 80; + + const int hashes = (int) strlen(hashOrder); + + for (int i = 0; i < hashes; i++) + { + const char elem = hashOrder[i]; + uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa1); + sph_luffa512(&ctx_luffa1, in, size); + sph_luffa512_close(&ctx_luffa1, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash1); + sph_cubehash512(&ctx_cubehash1, in, size); + sph_cubehash512_close(&ctx_cubehash1, hash); + break; + } + + in = (void*) hash; + size = 64; + } + + memcpy(output, hash, 32); +} + +static uint32_t get_next_time(uint32_t ntime, char* curOrder) +{ + char nextOrder[HASH_FUNC_COUNT + 1] = { 0 }; + uint32_t secs = 15; + do { + uint32_t nseq = getCurrentAlgoSeq(ntime+secs); + getAlgoString(nextOrder, nseq); + secs += 15; + } while (curOrder[0] == nextOrder[0]); + return secs; +} + +//#define _DEBUG +#define _DEBUG_PREFIX "tt-" +#include "cuda_debug.cuh" + +void quark_bmw512_cpu_setBlock_80(void *pdata); +void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +void groestl512_setBlock_80(int thr_id, uint32_t *endiandata); +void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void skein512_cpu_setBlock_80(void *pdata); +void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); + +void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); +void qubit_luffa512_cpu_setBlock_80(void *pdata); +void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +void jh512_setBlock_80(int thr_id, uint32_t *endiandata); +void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void keccak512_setBlock_80(int thr_id, uint32_t 
*endiandata); +void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + // if (opt_benchmark) pdata[17] = swab32(0x5886a4be); // TO DEBUG GROESTL 80 + + if (opt_debug || s_ntime != pdata[17] || s_sequence == UINT32_MAX) { + uint32_t ntime = swab32(work->data[17]); + get_travel_order(ntime, hashOrder); + s_ntime = pdata[17]; + if (opt_debug && !thr_id) { + applog(LOG_DEBUG, "exosis hash order %s (%08x)", hashOrder, ntime); + } + } + + if (opt_benchmark) + ptarget[7] = 0x5; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes) + x11_luffa512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + cuda_check_cpu_setTarget(ptarget); + + const int hashes = (int) strlen(hashOrder); + const char first = hashOrder[0]; + const uint8_t algo80 = first >= 'A' ? 
first - 'A' + 10 : first - '0'; + if (algo80 != s_firstalgo) { + s_firstalgo = algo80; + applog(LOG_INFO, "Exosis first algo is now %s", algo_strings[algo80 % HASH_FUNC_COUNT]); + } + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + default: { + uint32_t next = get_next_time(swab32(s_ntime), hashOrder); + if (!thr_id) + applog(LOG_WARNING, "kernel %c unimplemented, next in %u mn", first, next/60); + sleep(next > 30 ? 60 : 10); + return -1; + } + } + + do { + int order = 0; + + // Hash with CUDA + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake80:"); + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw80 :"); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("grstl80:"); + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + TRACE("skein80:"); + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("jh51280:"); + break; + case KECCAK: + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("kecck80:"); + break; + case LUFFA: + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("luffa80:"); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("cube 80:"); + break; + } + + for (int i = 1; i < hashes; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("cube :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + be32enc(&endiandata[19], work->nonces[0]); + exosis_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + pdata[19] = work->nonces[0]; + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + exosis_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + } + pdata[19] = max(pdata[19], work->nonces[1]) + 1; + } + return work->valid_nonces; + } else if (vhash[7] > Htarg) { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_exosis(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/fresh.cu b/x11/fresh.cu new file mode 100644 index 0000000000..f67a54e7c7 --- /dev/null +++ b/x11/fresh.cu @@ -0,0 +1,185 @@ +/** + * Fresh algorithm + */ +extern "C" { +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +} +#include "miner.h" +#include "cuda_helper.h" + +// to test gpu hash on a null buffer +#define NULLTEST 0 + +static uint32_t *d_hash[MAX_GPUS]; + +extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); +extern void x11_shavite512_setBlock_80(void *pdata); +extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int 
order); + +extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); +extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); + +extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + + +// CPU Hash +extern "C" void fresh_hash(void *state, const void *input) +{ + // shavite-simd-shavite-simd-echo + + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + + unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + #define hashA hash + #define hashB hash+64 + + memset(hash, 0, sizeof hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, input, 80); + sph_shavite512_close(&ctx_shavite, hashA); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, hashA, 64); + sph_simd512_close(&ctx_simd, hashB); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, hashB, 64); + sph_shavite512_close(&ctx_shavite, hashA); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, hashA, 64); + sph_simd512_close(&ctx_simd, hashB); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, hashB, 64); + sph_echo512_close(&ctx_echo, hashA); + + memcpy(state, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t endiandata[20]; + + uint32_t throughput = cuda_default_throughput(thr_id, 1 << 19); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput + 4), -1); + + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + x11_shavite512_setBlock_80((void*)endiandata); + cuda_check_cpu_setTarget(ptarget); + do { + int order = 0; + + // GPU Hash + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + +#if NULLTEST + uint32_t buf[8]; memset(buf, 0, sizeof buf); + CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost)); + CUDA_SAFE_CALL(cudaThreadSynchronize()); + print_hash((unsigned char*)buf); printf("\n"); +#endif + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = 
cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + fresh_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + fresh_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} + +// cleanup +extern "C" void free_fresh(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaSetDevice(device_map[thr_id]); + + cudaFree(d_hash[thr_id]); + x11_simd512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/s3.cu b/x11/s3.cu new file mode 100644 index 0000000000..85eaaa6a1c --- /dev/null +++ b/x11/s3.cu @@ -0,0 +1,181 @@ +/** + * S3 Hash (Also called Triple S - Used by 1Coin) + */ + +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" + +extern void x11_shavite512_setBlock_80(void *pdata); +extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +#include + +static uint32_t *d_hash[MAX_GPUS]; + +/* CPU HASH */ +extern "C" void s3hash(void *output, const void *input) +{ + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_skein512_context ctx_skein; + + unsigned char hash[64]; + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, input, 80); + sph_shavite512_close(&ctx_shavite, (void*) hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + memcpy(output, hash, 32); +} + +#ifdef _DEBUG +#define TRACE(algo) { \ + if (max_nonce == 1 && pdata[19] <= 1) { \ + uint32_t* debugbuf = NULL; \ + cudaMallocHost(&debugbuf, 32); \ + cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \ + printf("S3 %s %08x %08x %08x %08x...%08x\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ + swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \ + cudaFreeHost(debugbuf); \ + } \ +} +#else +#define TRACE(algo) {} +#endif + +static bool init[MAX_GPUS] = { 0 }; + +/* Main S3 entry point */ +extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = 20; // 256*256*8*2; +#ifdef WIN32 + // reduce by one the intensity on 
windows + intensity--; +#endif + uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0xF; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); + + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + x11_shavite512_setBlock_80((void*)endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("shavite:"); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + s3hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + s3hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_s3(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + x11_simd512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/sib.cu b/x11/sib.cu new file mode 100644 index 0000000000..c437523d03 --- /dev/null +++ b/x11/sib.cu @@ -0,0 +1,251 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +#include "sph/sph_streebog.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" + 
+extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash); + +#include +#include + +static uint32_t *d_hash[MAX_GPUS]; + +// Sibcoin CPU Hash +extern "C" void sibhash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[128] = { 0 }; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_gost512_context ctx_gost; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*) hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512 (&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_gost512_init(&ctx_gost); + sph_gost512(&ctx_gost, (const void*) hash, 64); + sph_gost512_close(&ctx_gost, (void*) hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, (const void*) hash, 64); + sph_luffa512_close (&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512 (&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, (void*) hash); + + sph_simd512_init(&ctx_simd); + sph_simd512 (&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512 (&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + memcpy(output, hash, 32); +} + +//#define _DEBUG +#define _DEBUG_PREFIX "sib" +#include "cuda_debug.cuh" + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 
19 : 18; // 2^18 = 262144 cuda threads + if (device_sm[dev_id] >= 600) intensity = 20; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0xf; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput) != 0) { + return -1; + } + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // Hash with CUDA + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake :"); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + if (use_compat_kernels[thr_id]) + streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + else + streebog_hash_64_maxwell(thr_id, throughput, d_hash[thr_id]); + TRACE("gost :"); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + TRACE("luffa+c:"); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shavite:"); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("echo => "); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + sibhash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] =cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + *hashes_done = pdata[19] - first_nonce + throughput; + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + sibhash(vhash, endiandata); + 
bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_sib(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/simd_functions.cu b/x11/simd_functions.cu deleted file mode 100644 index fe5697dd21..0000000000 --- a/x11/simd_functions.cu +++ /dev/null @@ -1,1413 +0,0 @@ -__device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[6]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[7]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[4]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[5]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[2]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[3]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[0]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[1]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[2]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[3]; - temp = D[2] + w[2] 
+ IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[0]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[1]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[6]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[7]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[4]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[5]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[3]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[2]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[1]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[0]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[7]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[6]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[5]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[4]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[5]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[4]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[7]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[6]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[1]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[0]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[3]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[2]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[7]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[6]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[5]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[4]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[3]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[2]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[1]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[0]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, 
uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[4]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[5]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[6]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[7]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[0]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[1]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[2]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[3]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[6]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[7]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[4]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[5]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[2]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[3]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[0]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[1]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[2]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[3]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[0]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[1]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[6]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[7]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = 
ROTL32(temp, s) + R[4]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[5]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[3]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[2]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[1]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[0]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[7]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[6]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[5]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[4]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[5]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[4]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[7]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[6]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[1]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[0]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[3]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[2]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[7]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[6]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[5]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[4]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[3]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[2]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[1]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[0]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[4]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[5]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = 
ROTL32(temp, s) + R[6]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[7]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[0]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[1]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[2]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[3]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[6]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[7]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[4]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[5]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[2]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[3]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[0]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[1]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[2]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[3]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[0]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[1]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[6]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[7]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[4]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[5]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int 
j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[3]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[2]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[1]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[0]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[7]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[6]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[5]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[4]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[5]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[4]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[7]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[6]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[1]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[0]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[3]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[2]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[7]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[6]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[5]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[4]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[3]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[2]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[1]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[0]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[4]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[5]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[6]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[7]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[0]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[1]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[2]; - temp 
= D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[3]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[6]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[7]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[4]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[5]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[2]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[3]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[0]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[1]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[2]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[3]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[0]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[1]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[6]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[7]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[4]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[5]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[3]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[2]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + 
R[1]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[0]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[7]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[6]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[5]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[4]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[5]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[4]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[7]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[6]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[1]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[0]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[3]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[2]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[7]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[6]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[5]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[4]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[3]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[2]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[1]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[0]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[4]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[5]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[6]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[7]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[0]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[1]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[2]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[3]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; 
-#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[6]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[7]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[4]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[5]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[2]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[3]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[0]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[1]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[2]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[3]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[0]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[1]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[6]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[7]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[4]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[5]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[3]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[2]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[1]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[0]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[7]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[6]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[5]; - temp = D[7] + 
w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[4]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[5]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[4]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[7]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[6]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[1]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[0]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[3]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[2]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[7]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[6]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[5]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[4]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[3]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[2]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[1]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[0]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[4]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[5]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[6]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[7]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[0]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[1]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[2]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[3]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - int j; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + IF(A[3], 
B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for(j=0; j<8; j++) { - A[j] = R[j]; - } -} -static __constant__ uint32_t d_cw0[8][8]; -static const uint32_t h_cw0[8][8] = { - 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, - 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, - 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, - 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, - 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, - 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, - 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, - 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 -}; -__device__ __forceinline__ void Round8_0_final(uint32_t *A, - int r, int s, int t, int u) { - - - STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF_3(d_cw0[3], u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ_4(d_cw0[4], r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ_5(d_cw0[5], s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); -} -static __constant__ uint32_t d_cw1[8][8]; -static const uint32_t h_cw1[8][8] = { - 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, - 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, - 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, - 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, - 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, - 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, - 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, - 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 -}; -__device__ __forceinline__ void Round8_1_final(uint32_t *A, - int r, int s, int t, int u) { - - - STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF_11(d_cw1[3], u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ_12(d_cw1[4], r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ_13(d_cw1[5], s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); -} -static __constant__ uint32_t d_cw2[8][8]; -static const uint32_t h_cw2[8][8] = { - 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, - 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, - 
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, - 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, - 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, - 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, - 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, - 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE -}; -__device__ __forceinline__ void Round8_2_final(uint32_t *A, - int r, int s, int t, int u) { - - - STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF_19(d_cw2[3], u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ_20(d_cw2[4], r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ_21(d_cw2[5], s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); -} -static __constant__ uint32_t d_cw3[8][8]; -static const uint32_t h_cw3[8][8] = { - 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, - 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, - 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, - 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, - 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, - 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, - 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, - 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D -}; -__device__ __forceinline__ void Round8_3_final(uint32_t *A, - int r, int s, int t, int u) { - - - STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF_27(d_cw3[3], u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ_28(d_cw3[4], r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ_29(d_cw3[5], s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ_30(d_cw3[6], t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A); -} - -#if __CUDA_ARCH__ < 350 -#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) -#else -//#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) -#define expanded_vector(x) __ldg(&g_fft4[x]) -#endif - -__device__ __forceinline__ void Round8_0(uint32_t *A, const int thr_offset, - int r, int s, int t, int u, uint4 *g_fft4) { - uint32_t w[8]; - uint4 hv1, hv2; - - int tmp = 0 + thr_offset; - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_0(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_1(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = 
hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_2(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_3(w, u, r, &A[8], &A[16], &A[24], A); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_4(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_5(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_6(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_7(w, u, r, &A[8], &A[16], &A[24], A); - - -} -__device__ __forceinline__ void Round8_1(uint32_t *A, const int thr_offset, - int r, int s, int t, int u, uint4 *g_fft4) { - uint32_t w[8]; - uint4 hv1, hv2; - - int tmp = 16 + thr_offset; - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_8(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_9(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_10(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_11(w, u, r, &A[8], &A[16], &A[24], A); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_12(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_13(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_14(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_15(w, u, r, &A[8], &A[16], &A[24], A); - - -} -__device__ __forceinline__ void Round8_2(uint32_t *A, const int thr_offset, - int r, int s, int t, int u, uint4 *g_fft4) { - uint32_t w[8]; - uint4 hv1, hv2; - - int tmp = 32 + thr_offset; - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); 
w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_16(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_17(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_18(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_19(w, u, r, &A[8], &A[16], &A[24], A); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_20(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_21(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_22(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_23(w, u, r, &A[8], &A[16], &A[24], A); - - -} -__device__ __forceinline__ void Round8_3(uint32_t *A, const int thr_offset, - int r, int s, int t, int u, uint4 *g_fft4) { - uint32_t w[8]; - uint4 hv1, hv2; - - int tmp = 48 + thr_offset; - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_24(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_25(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_26(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_27(w, u, r, &A[8], &A[16], &A[24], A); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_28(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_29(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_30(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = 
hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_31(w, u, r, &A[8], &A[16], &A[24], A); - - -} - -__device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, const uint32_t *M, uint4 *g_fft4) { - int i; - const int thr_offset = thr_id << 6; // thr_id * 128 (two elements each) -#pragma unroll 8 - for(i=0; i<8; i++) { - A[i] ^= M[i]; - (&A[8])[i] ^= M[8+i]; - } - Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4); - Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4); -} - -__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) { - uint32_t A[32]; - int i; -#pragma unroll 32 - for (i=0; i < 32; i++) A[i] = c_IV_512[i]; - uint32_t buffer[16]; -#pragma unroll 16 - for (i=0; i < 16; i++) buffer[i] = hashval[i]; - SIMD_Compress1(A, texture_id, buffer, g_fft4); - uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; -#pragma unroll 32 - for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; -} - -__device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, uint4 *g_fft4) { - uint32_t IV[4][8]; - int i; - const int thr_offset = thr_id << 6; // thr_id * 128 (two elements each) -#pragma unroll 8 - for(i=0; i<8; i++) { - IV[0][i] = c_IV_512[i]; - IV[1][i] = c_IV_512[8+i]; - IV[2][i] = c_IV_512[16+i]; - IV[3][i] = c_IV_512[24+i]; - } - Round8_2(A, thr_offset, 29, 9, 15, 5, g_fft4); - Round8_3(A, thr_offset, 4, 13, 10, 25, g_fft4); - STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); - STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); - STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); - STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); -} - -__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, int *g_state) { - uint32_t A[32]; - int i; - uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; -#pragma unroll 32 - for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; - SIMD_Compress2(A, texture_id, g_fft4); -#pragma unroll 32 - for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; -} - -__device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t *M) { - uint32_t IV[4][8]; - int i; -#pragma unroll 8 - for(i=0; i<8; i++) { - IV[0][i] = A[i]; - IV[1][i] = (&A[8])[i]; - IV[2][i] = (&A[16])[i]; - IV[3][i] = (&A[24])[i]; - } -#pragma unroll 8 - for(i=0; i<8; i++) { - A[i] ^= M[i]; - (&A[8])[i] ^= M[8+i]; - } - Round8_0_final(A, 3, 23, 17, 27); - Round8_1_final(A, 28, 19, 22, 7); - Round8_2_final(A, 29, 9, 15, 5); - Round8_3_final(A, 4, 13, 10, 25); - STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); - STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); - STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); - STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); -} - -__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, int *g_state) { - uint32_t A[32]; - int i; - uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; -#pragma unroll 32 - for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; - uint32_t buffer[16]; - buffer[0] = 512; -#pragma unroll 15 - for (i=1; i < 16; i++) buffer[i] = 0; - SIMD_Compress_Final(A, buffer); -#pragma unroll 16 - for (i=0; i < 16; i++) - hashval[i] = A[i]; -} diff --git a/x11/streebog_arrays.cuh b/x11/streebog_arrays.cuh new file mode 100644 index 0000000000..78362937b5 --- /dev/null +++ b/x11/streebog_arrays.cuh @@ -0,0 +1,567 @@ +// Tables for
function F +__device__ uint2 T02[256] = { + { 0x5B711FD0, 0xE6F87E5C }, { 0x0924FA16, 0x25837780 }, { 0x852EA4A8, 0xC849E07E }, { 0x8F06C16A, 0x5B4686A1 }, + { 0xD77B416E, 0x0B32E9A2 }, { 0x67815C66, 0xABDA37A4 }, { 0x1A686676, 0xF61796A8 }, { 0x6391954B, 0xF5DC0B70 }, + { 0xB7E64BF1, 0x4862F38D }, { 0x68BD85C5, 0xFF5C629A }, { 0xFCD75795, 0xCB827DA6 }, { 0x69B9F089, 0x66D36DAF }, + { 0x483D83B0, 0x356C9F74 }, { 0x238C99A1, 0x7CBCECB1 }, { 0x31C4708D, 0x36A702AC }, { 0x2FBCDFD6, 0x9EB6A8D0 }, + { 0xE5B3AE37, 0x8B19FA51 }, { 0x8A127D0B, 0x9CCFB540 }, { 0x08208F5A, 0xBC0C78B5 }, { 0x2288ECED, 0xE533E384 }, + { 0x77C15FD2, 0xCEC2C7D3 }, { 0x505D0F5E, 0xEC7817B6 }, { 0x8336871D, 0xB94CC2C0 }, { 0xCB0B04AD, 0x8C205DB4 }, + { 0x28A0892F, 0x763C855B }, { 0xF6FF3257, 0x588D1B79 }, { 0x4311933E, 0x3FECF69E }, { 0x803A18C9, 0x0FC0D39F }, + { 0xF5F3AD83, 0xEE010A26 }, { 0x411979A6, 0x10EFE8F4 }, { 0x7DE93A10, 0x5DCDA10C }, { 0x1248E92C, 0x4A1BEE1D }, + { 0x21847339, 0x53BFF2DB }, { 0xA6A23D09, 0xB4F50CCF }, { 0xD84798CD, 0x5FB4BC9C }, { 0x071C56F9, 0xE88A2D8B }, + { 0x5A756A9C, 0x7F777169 }, { 0xA0BA1EBC, 0xC5F02E71 }, { 0x4215E672, 0xA663F9AB }, { 0xDE5FBB78, 0x2EB19E22 }, + { 0x2594BA14, 0x0DB9CE0F }, { 0x97664D84, 0x82520E63 }, { 0x0208EA98, 0x2F031E6A }, { 0xA1BE6BF0, 0x5C7F2144 }, + { 0xD16362DB, 0x7A37CB1C }, { 0x4B311C64, 0x83E08E2B }, { 0xAB960E32, 0xCF70479B }, { 0xB9DEE71E, 0x856BA986 }, + { 0x7AF56CE9, 0xB5478C87 }, { 0x5F61D6FD, 0xB8FE4288 }, { 0x966238C8, 0x1BDD0156 }, { 0x3EF8A92E, 0x62215792 }, + { 0x114476F8, 0xFC97FF42 }, { 0x56452CEB, 0x9D7D3508 }, { 0xE0A71256, 0x4C90C9B0 }, { 0xFBCB016C, 0x2308502D }, + { 0xA7A64845, 0x2D7A03FA }, { 0xBFC6C4AB, 0xF46E8B38 }, { 0xD477DEBA, 0xBDBEF8FD }, { 0xC8079B79, 0x3AAC4CEB }, + { 0xE8879D0C, 0xF09CB105 }, { 0xAC8A58CB, 0x27FA6A10 }, { 0x401D0CEA, 0x8960E7C1 }, { 0x4A356928, 0x1A6F811E }, + { 0x73D196FF, 0x90C4FB07 }, { 0x609D0A9F, 0x43501A2F }, { 0xC63F3796, 0xF7A516E0 }, { 0xB8DA9252, 0x1CE4A6B3 }, + { 0x38E08A9B, 0x1324752C }, { 0x3BEC154F, 0xA5A86473 }, { 0x5549B33F, 0x2BF12457 }, { 0x440DC5C7, 0xD766DB15 }, + { 0x9E42B792, 0xA7D179E3 }, { 0x61997FD3, 0xDADF151A }, { 0xC0271423, 0x86A0345E }, { 0x6DA939A4, 0x38D5517B }, + { 0x104003B4, 0x6518F077 }, { 0xA5AEA2DD, 0x02791D90 }, { 0x9C4A5D0A, 0x88D26789 }, { 0x0A2865C2, 0x930F66DF }, + { 0x4509B08B, 0x4EE9D420 }, { 0x6685292A, 0x32553891 }, { 0xC533A842, 0x412907BF }, { 0x544DC673, 0xB27E2B62 }, + { 0x6295E007, 0x6C530445 }, { 0x5351908A, 0x5AF406E9 }, { 0xC123616F, 0x1F2F3B6B }, { 0x5255E5C6, 0xC37B09DC }, + { 0xB1FE6844, 0x3967D133 }, { 0xF0E711E2, 0x298839C7 }, { 0x1964F9A2, 0x409B87F7 }, { 0xDB4B0719, 0xE938ADC3 }, + { 0xF9C3EBF4, 0x0C0B4E47 }, { 0xD36B8843, 0x5534D576 }, { 0xEB8B02D8, 0x4610A05A }, { 0x8232F251, 0x20C3CDF5 }, + { 0xBEC2B1E7, 0x6DE1840D }, { 0xB0FA1D08, 0xA0E8DE06 }, { 0x0D34333B, 0x7B854B54 }, { 0xBCCA5B7F, 0x42E29A67 }, + { 0xC437DD0E, 0xD8A6088A }, { 0xD943ED81, 0xC63BB3A9 }, { 0x5E65A3B1, 0x21714DBD }, { 0xB5EEA169, 0x6761EDE7 }, + { 0xD573ABF6, 0x2431F7C8 }, { 0xE1A3671A, 0xD51FC685 }, { 0x0410C92D, 0x5E063CD4 }, { 0x2CB04002, 0x283AB98F }, + { 0xB2F2F790, 0x8FEBC06C }, { 0x6FA1D33C, 0x17D64F11 }, { 0xA99EE4AA, 0xE07359F1 }, { 0x74CDC006, 0x784ED68C }, + { 0xC73B42DA, 0x6E2A19D5 }, { 0x1C7045C3, 0x8712B416 }, { 0xED93216D, 0x371582E4 }, { 0x4939F6FC, 0xACE39041 }, + { 0x86223B7C, 0x7EC5F121 }, { 0x2BAC16FB, 0xC0B09404 }, { 0x9A527EBF, 0xF9D74537 }, { 0xA3B68168, 0x737C3F2E }, + { 0xBAD278CA, 0x33E7B8D9 }, { 0xC22FFEBB, 0xA9A32A34 }, { 0xFEDFBD0D, 0xE48163CC }, { 
0x6EA5A670, 0x8E594024 }, + { 0x842AD1E4, 0x51C6EF4B }, { 0x279C508C, 0x22BAD065 }, { 0x18608CEE, 0xD91488C2 }, { 0x1F7CDA17, 0x319EA549 }, + { 0x134C9C60, 0xD394E128 }, { 0x72D5E3B3, 0x094BF432 }, { 0xA4AAD791, 0x9BF612A5 }, { 0xD26FFD0F, 0xCCBBDA43 }, + { 0x946AD250, 0x34DE1F3C }, { 0x995EE16B, 0x4F5B5468 }, { 0xEA8F7794, 0xDF9FAF6F }, { 0x70DD092B, 0x2648EA58 }, + { 0x71D97C67, 0xBFC7E56D }, { 0x4F21D549, 0xDDE6B2FF }, { 0x3AE86003, 0x3C276B46 }, { 0xAF86C71F, 0x91767B4F }, + { 0x35D4B9A0, 0x68A13E78 }, { 0x030C9FD4, 0xB68C115F }, { 0x16582001, 0x141DD2C9 }, { 0xDD5324AC, 0x983D8F7D }, + { 0xCC175254, 0x64AA703F }, { 0x8E02B426, 0xC2C98994 }, { 0x9F46C2DE, 0x3E5E76D6 }, { 0x587D8004, 0x50746F03 }, + { 0x9272F1E5, 0x45DB3D82 }, { 0x9B560BF3, 0x60584A02 }, { 0x3FFCDC62, 0xFBAE58A7 }, { 0x6CAD4CE8, 0xA15A5E4E }, + { 0xCE1FB8CC, 0x4BA96E55 }, { 0xAE82B253, 0x08F9747A }, { 0xF7FB471B, 0xC102144C }, { 0xF3EB8E36, 0x9F042898 }, + { 0xF2EFFB7A, 0x068B27AD }, { 0x8C0A5EBE, 0xEDCA97FE }, { 0xF4F7D8CF, 0x778E0513 }, { 0xC32B8BF7, 0x302C2501 }, + { 0x175C554D, 0x8D92DDFC }, { 0x46052F5F, 0xF865C57F }, { 0xA2B2F424, 0xEAF3301B }, { 0xBBD60D86, 0xAA68B7EC }, + { 0x0104754C, 0x998F0F35 }, { 0x00000000, 0x00000000 }, { 0x34D0CCEC, 0xF12E314D }, { 0x061823B5, 0x710522BE }, + { 0x30C005C1, 0xAF280D99 }, { 0x5D693C65, 0x97FD5CE2 }, { 0x33CC9A15, 0x19A41CC6 }, { 0xF8C79EB8, 0x95844172 }, + { 0x937684A9, 0xDC5432B7 }, { 0x2490CF58, 0x9436C13A }, { 0x32C8EF59, 0x802B13F3 }, { 0x7CED4F5C, 0xC442AE39 }, + { 0xE3AB8D82, 0xFA1CD8EF }, { 0x4D293FD1, 0xF2E5AC95 }, { 0x907A1B7D, 0x6AD823E8 }, { 0x3CF043B6, 0x4D2249F8 }, + { 0x79F9F33D, 0x03CB9DD8 }, { 0x36D82674, 0xDE2D2F27 }, { 0x891EE2DF, 0x2A43A41F }, { 0x1B6C133A, 0x6F98999D }, + { 0x3DF436FA, 0xD4AD46CD }, { 0x269825C0, 0xBB35DF50 }, { 0x813E6D85, 0x964FDCAA }, { 0x7EE5A5C4, 0xEB41B053 }, + { 0x8B160847, 0x0540BA75 }, { 0xE7BB44AF, 0xA41AE43B }, { 0xD0671797, 0xE3B8C429 }, { 0xEE9FBEB9, 0x819993BB }, + { 0xEC975421, 0xAE9A8DD1 }, { 0x917E6E31, 0xF3572CDD }, { 0xE2AFF8CE, 0x6393D7DA }, { 0x37DC5338, 0x47A22012 }, + { 0xC903EE35, 0xA32343DE }, { 0xA89A91E6, 0x79FC56C4 }, { 0xDC5751E0, 0x01B28048 }, { 0xE4B7DB7B, 0x1296F564 }, + { 0x51597A12, 0x75F71883 }, { 0xBDCE2E33, 0xDB6D9552 }, { 0x1D74308F, 0x1E9DBB23 }, { 0xFDD322D9, 0x520D7293 }, + { 0x0C304677, 0xE20A4461 }, { 0xB4EAD425, 0xFEEEE2D2 }, { 0x20800675, 0xCA30FDEE }, { 0x47015A13, 0x61EACA4A }, + { 0x87264E30, 0xE74AFE14 }, { 0x7BF119A5, 0x2CC883B2 }, { 0xB3F682DC, 0x1664CF59 }, { 0x1E78AF5B, 0xA811AA7C }, + { 0x648DC3B2, 0x1D5626FB }, { 0xDF5BCE34, 0xB73E9117 }, { 0x6AB56F5D, 0xD05F7CF0 }, { 0xCD132718, 0xFD257F0A }, + { 0x76C52A9E, 0x574DC8E6 }, { 0x2EB8AA9A, 0x0739A7E5 }, { 0x0F3CD9A3, 0x5486553E }, { 0xAA927B7E, 0x56FF48AE }, + { 0xAD8E2D87, 0xBE756525 }, { 0xFFDBC841, 0x7D0E6CF9 }, { 0x1450CA99, 0x3B1ECCA3 }, { 0xE983E840, 0x6913BE30 }, + { 0x956EA71C, 0xAD511009 }, { 0x2DB4354E, 0xB1B5B6BA }, { 0x4E25A005, 0x4469BDCA }, { 0xCA0F71E1, 0x15AF5281 }, + { 0x8D0E2BF2, 0x744598CB }, { 0x2AA863B7, 0x593F9B31 }, { 0x29A4FC63, 0xEFB38A6E }, { 0x4C2D4A9D, 0x6B6AA3A0 }, + { 0xE6BF31E3, 0x3D95EB0E }, { 0x1554BFD5, 0xA291C396 }, { 0xEF9BCBF5, 0x18169C8E }, { 0x9D4E2846, 0x115D68BC }, + { 0xFACF7420, 0xBA875F18 }, { 0xB6E23EBD, 0xD1EDFCB8 }, { 0xF1E364AE, 0xB00736F2 }, { 0x6589B6FE, 0x84D929CE }, + { 0xDA4F7255, 0x70B7A2F6 }, { 0x5C6D4929, 0x0E7253D7 }, { 0x574159A7, 0x04F23A3D }, { 0x0B2C108E, 0x0A8069EA }, + { 0x6BB11A11, 0x49D073C5 }, { 0x39E4FFD7, 0x8AAB7A19 }, { 0x0E38ACEF, 0xCD095A0B }, { 0x5979F548, 
0xC9FB6036 }, + { 0xD67F3422, 0x92BDE697 }, { 0x0514BC61, 0xC78933E1 }, { 0x75C9B54A, 0xE1C1D9B9 }, { 0xCF1BCD80, 0xD2266160 }, + { 0x78FD8671, 0x9A4492ED }, { 0x881A9793, 0xB3CCAB2A }, { 0x7FE1D088, 0x72CEBF66 }, { 0x985A9427, 0xD6D45B5D }, +}; + +__device__ uint2 T12[256] = { + { 0x8C3F55DE, 0xC811A805 }, { 0x96B50619, 0x65F5B431 }, { 0xD6706E43, 0xF74F96B1 }, { 0xCB43D336, 0x859D1E8B }, + { 0xCCFA3D84, 0x5AAB8A85 }, { 0xC295FCFD, 0xF9C7BF99 }, { 0xDE4B630F, 0xA21FD5A1 }, { 0x3B8B456D, 0xCDB3EF76 }, + { 0x7CF7C385, 0x803F59F8 }, { 0x5F31913C, 0xB27C73BE }, { 0x33B04821, 0x98E3AC66 }, { 0x26B8F818, 0xBF61674C }, + { 0xC4C130C8, 0x0FFBC995 }, { 0x10761A98, 0xAAA08620 }, { 0x210116AA, 0x6057F342 }, { 0x0654CC35, 0xF63C760C }, + { 0x667D9042, 0x2DDB45CC }, { 0x4BD40382, 0xBCF45A96 }, { 0xEF3C6F3D, 0x68E8A0C3 }, { 0x69FF73BC, 0xA7BD92D2 }, + { 0x01ED2287, 0x290AE202 }, { 0xE885818F, 0xB7DE34CD }, { 0xDD61059B, 0xD901EEA7 }, { 0x19A03553, 0xD6FA2732 }, + { 0x74CCCEC9, 0xD56F1AE8 }, { 0x2E83F554, 0xEA31245C }, { 0xA07BE499, 0x7034555D }, { 0x56E7BEF7, 0xCE26D2AC }, + { 0xA5054E38, 0xFD161857 }, { 0x527436D1, 0x6A0E7DA4 }, { 0x1CDE9FF2, 0x5BD86A38 }, { 0x31770C32, 0xCAF77562 }, + { 0xE279C8D0, 0xB09AAED9 }, { 0xC60674DB, 0x5DEF1091 }, { 0x515E5045, 0x111046A2 }, { 0x729802FC, 0x23536CE4 }, + { 0xF5B63CFA, 0xC50CBCF7 }, { 0xCD171F03, 0x73A16887 }, { 0xD9F28DBD, 0x7D2941AF }, { 0x5A4F3B9D, 0x3F5E3EB4 }, + { 0x1B677140, 0x84EEFE36 }, { 0xE7076271, 0x3DB8E3D3 }, { 0xF20FD248, 0x1A3A28F9 }, { 0xB49E7627, 0x7EBC7C75 }, + { 0xC7EB565C, 0x74E5F293 }, { 0x4F478BA4, 0x18DCF59E }, { 0xA9ADCB52, 0x0C6EF44F }, { 0x98DAC760, 0xC699812D }, + { 0x6E469D0E, 0x788B06DC }, { 0x7521EC4E, 0xFC65F8EA }, { 0x9E8E0B55, 0x30A5F721 }, { 0xBCA57B6B, 0x2BEC3F65 }, + { 0xBAF1B75E, 0xDDD04969 }, { 0xE394EA57, 0x99904CDB }, { 0xE6EA40F6, 0x14B201D1 }, { 0x41284ADD, 0xBBB0C082 }, + { 0xBF8F1DFF, 0x50F20463 }, { 0x93CBACB8, 0xE8D7F93B }, { 0x477C86E8, 0x4D8CB68E }, { 0x92268E3F, 0xC1DD1B39 }, + { 0x09D62FCB, 0x7C5AA112 }, { 0xDB35C9AE, 0x2F3D98AB }, { 0x2BFD5FF5, 0x67136956 }, { 0x36CEE280, 0x15C1E16C }, + { 0xF8F39B17, 0x1D7EB2ED }, { 0xB00DFE01, 0xDA94D37D }, { 0x760B8ADA, 0x877BC3EC }, { 0xE153AE44, 0xCB8495DF }, + { 0xB7B410B3, 0x05A24773 }, { 0x3C32ABDF, 0x12857B78 }, { 0x6812513B, 0x8EB770D0 }, { 0xD2E3E665, 0x536739B9 }, + { 0x71B26468, 0x584D57E2 }, { 0xC9849725, 0xD789C78F }, { 0x7D1AE102, 0xA935BBFA }, { 0xDFA64188, 0x8B1537A3 }, + { 0xC378DE7A, 0xD0CD5D9B }, { 0x4D80CFB7, 0x4AC82C9A }, { 0x83BDB620, 0x42777F1B }, { 0x1D33BD75, 0x72D2883A }, + { 0xAB6A8F41, 0x5E7A2D4B }, { 0xBB1C95D9, 0xF4DAAB6B }, { 0xFD8D31B6, 0x905CFFE7 }, { 0x119B381F, 0x83AA6422 }, + { 0x42022C49, 0xC0AEFB84 }, { 0x63033AE3, 0xA0F908C6 }, { 0x04938826, 0xA428AF08 }, { 0x1A8A53C7, 0xADE41C34 }, + { 0x77E6A85D, 0xAE7121EE }, { 0x25929E8C, 0xC47F5C4A }, { 0x55CDD863, 0xB538E9AA }, { 0xDAD8EB29, 0x06377AA9 }, + { 0xB3279895, 0xA18AE87B }, { 0x35E48414, 0x6EDFDA6A }, { 0x825094A7, 0x6B7D9D19 }, { 0xA4E86CBF, 0xD41CFA55 }, + { 0xEA42C59C, 0xE5CAEDC9 }, { 0x0E6FC179, 0xA36C351C }, { 0x6FABBF89, 0x5181E4DE }, { 0x184D17D4, 0xFFF0C530 }, + { 0x84045892, 0x9D41EB15 }, { 0x28D73961, 0x1C0D5250 }, { 0x0CA8856A, 0xF178EC18 }, { 0x8EF811CD, 0x9A057101 }, + { 0x3EF5EFCC, 0x4091A27C }, { 0x9F6329D2, 0x19AF1523 }, { 0xF91EB990, 0x347450EF }, { 0x8DD27759, 0xE11B4A07 }, + { 0xFC601331, 0xB9561DE5 }, { 0x2DA993C0, 0x912F1F5A }, { 0x5BA2191A, 0x1654DCB6 }, { 0x8A6B99EB, 0x3E2DDE09 }, + { 0x0F82E3FE, 0x8A66D71E }, { 0xD55A08D7, 0x8C51ADB7 }, { 0x8941FF7F, 0x4533E50F 
}, { 0xBD4859EC, 0x02E6DD67 }, + { 0x5DF6D52F, 0xE068AABA }, { 0xFF4A75A5, 0xC24826E3 }, { 0x88ACDDF8, 0x6C39070D }, { 0x4691A46F, 0x6486548C }, + { 0x135C7C0C, 0xD1BEBD26 }, { 0x8F15334A, 0xB30F9303 }, { 0xC1BF9A69, 0x82D9849F }, { 0x5420FAE4, 0x9C320BA8 }, + { 0xAFF90767, 0xFA528243 }, { 0xE968A308, 0x9ED4D6CF }, { 0x2C44B147, 0xB825FD58 }, { 0x5EDCB3BB, 0x9B7691BC }, + { 0x48FE6516, 0xC7EA6190 }, { 0x817AF233, 0x1063A61F }, { 0x3409A693, 0x47D53868 }, { 0x4C6DED30, 0x63C2CE98 }, + { 0x6C81D91D, 0x2A9FDFD8 }, { 0x032A6694, 0x7B1E3B06 }, { 0xFBD9FD83, 0x666089EB }, { 0x7375207B, 0x0A598EE6 }, + { 0x0AFC495F, 0x07449A14 }, { 0xB6593234, 0x2CA8A571 }, { 0x45BBC2FB, 0x1F986F8A }, { 0x50B372C2, 0x381AA4A0 }, + { 0xD81FAF3A, 0x5423A3AD }, { 0x8B86BB6C, 0x17273C0B }, { 0xC869B5A2, 0xFE83258D }, { 0xD1C980F1, 0x287902BF }, + { 0x6B3837AF, 0xF5A94BD6 }, { 0xB2CABA12, 0x88800A79 }, { 0x083B0D4C, 0x55504310 }, { 0x07B9EEB2, 0xDF36940E }, + { 0x6790B2C5, 0x04D1A7CE }, { 0xF125B4DC, 0x612413FF }, { 0xC52C124F, 0x26F12B97 }, { 0xA62F28AC, 0x86082351 }, + { 0x9937E5E7, 0xEF93632F }, { 0x293A1BE6, 0x3507B052 }, { 0x570A9C70, 0xE72C30AE }, { 0xAE1425E0, 0xD3586041 }, + { 0xD79D4CC4, 0xDE4574B3 }, { 0x40C5685A, 0x92BA2280 }, { 0xDC8C271C, 0xF00B0CA5 }, { 0xF69C5A6E, 0xBE1287F1 }, + { 0xB1E0DC86, 0xF39E317F }, { 0x20EC342D, 0x495D1140 }, { 0x3F18CD4B, 0x699B407E }, { 0x6AD51528, 0xDCA3A9D4 }, + { 0x79896924, 0x0D1D14F2 }, { 0x00000000, 0x00000000 }, { 0xA196C61E, 0x593EB75F }, { 0x0B116BD8, 0x2E4E7816 }, + { 0x58887F8E, 0x6D4AE7B0 }, { 0x872E3E06, 0xE65FD013 }, { 0xD30EC4E2, 0x7A6DDBBB }, { 0xCAAEF1B1, 0xAC97FC89 }, + { 0x1E19DBE1, 0x09CCB33C }, { 0x62EE1864, 0x89F3EAC4 }, { 0xAA87ADC6, 0x7770CF49 }, { 0x6557F6D6, 0x56C57ECA }, + { 0x6D6CFB9A, 0x03953DDA }, { 0x4456E07C, 0x36928D88 }, { 0x959F608D, 0x1EEB8F37 }, { 0x4EAAA923, 0x31D6179C }, + { 0xE5C02662, 0x6FAC3AD7 }, { 0x53991456, 0x43049FA6 }, { 0xC052B8EE, 0xABD3669D }, { 0xA7C20A2B, 0xAF02C153 }, + { 0x3723C007, 0x3CCB036E }, { 0x90E1CA2C, 0x93C9C23D }, { 0x2F6ED7D3, 0xC33BC65E }, { 0x9758249E, 0x4CFF5633 }, + { 0x325D6AA6, 0xB1E94E64 }, { 0x9472420A, 0x37E16D35 }, { 0xBE623F78, 0x79F8E661 }, { 0x02C74413, 0x5214D904 }, + { 0xF0C8965B, 0x482EF1FD }, { 0xEC1609A9, 0x13F69BC5 }, { 0x14E592BE, 0x0E882928 }, { 0x2A107D72, 0x4E198B54 }, + { 0xEBAFE71B, 0xCCC00FCB }, { 0x222B703E, 0x1B49C844 }, { 0xA840E9D5, 0x2564164D }, { 0x1FF4F966, 0x20C6513E }, + { 0x910CE8AB, 0xBAC3203F }, { 0x61C47EF0, 0xF2EDD1C2 }, { 0xACD361F3, 0x814CB945 }, { 0x4A392105, 0x95FEB894 }, + { 0x1622D6AD, 0x5C9CF02C }, { 0xF77178E9, 0x971865F3 }, { 0x9BF0A1F4, 0xBD87BA2B }, { 0x59655D09, 0x444005B2 }, + { 0x247FBC0B, 0xED75BE48 }, { 0x17CFF42A, 0x7596122E }, { 0x85E97A15, 0xB44B0917 }, { 0x2755DA9F, 0x966B854E }, + { 0x49134791, 0xEEE08392 }, { 0x23C652B9, 0x32432A46 }, { 0xAD3E4374, 0xA8465B47 }, { 0x12B15E8B, 0xF8B45F24 }, + { 0x78644BA3, 0x2417F6F0 }, { 0x7FDDA511, 0xFB2162FE }, { 0x9DA46DC1, 0x4BBBCC27 }, { 0xD024A276, 0x0173E0BD }, + { 0xA2BCA08A, 0x22208C59 }, { 0xB836F34D, 0x8FC4906D }, { 0x3A6667EA, 0xE4B90D74 }, { 0x705F46EF, 0x7147B5E0 }, + { 0x1508B039, 0x2782CB2A }, { 0xF45B1E7D, 0xEC065EF5 }, { 0xCFD05B10, 0x21B5B183 }, { 0x60295C77, 0xDBE733C0 }, + { 0x394C017E, 0x9FA73672 }, { 0x86C31C81, 0xCF553211 }, { 0x0D45A7ED, 0xD8720E1A }, { 0x3DDF8958, 0x3B8F997A }, + { 0xEDFB2B2E, 0x3AFC79C7 }, { 0x43EF0ECE, 0xE9A41986 }, { 0x7B4E2D37, 0x5F09CDF6 }, { 0xFA34DF04, 0x4F6A6BE9 }, + { 0x38A123F9, 0xB6ADD470 }, { 0x057EAAA1, 0x8D224D0A }, { 0x5C1BF7A8, 0xC96248B8 }, { 0x309A2EB5, 
0xE3FD9760 }, + { 0xA351820D, 0x0B2A6E5B }, { 0xFEA75722, 0xEB42C4E1 }, { 0x9A1D8373, 0x948D5829 }, { 0x64BAD451, 0x7FCF9CC8 }, + { 0xD4B72A50, 0xA55B4FB5 }, { 0xCE3D7997, 0x08BF5381 }, { 0xE42D04E5, 0x46A6D8D5 }, { 0x7E308796, 0xD22B80FC }, + { 0xB57354A0, 0x57B69E77 }, { 0x8097D0B4, 0x3969441D }, { 0xF3E2F0CF, 0x3330CAFB }, { 0xE0BE8CC3, 0xE28E77DD }, + { 0x9C494F46, 0x62B12E25 }, { 0xB9DBD1CA, 0xA6CE726F }, { 0xEED14DBA, 0x41E242C1 }, { 0x7AA30FB0, 0x76032FF4 }, +}; + +__device__ uint2 T22[256] = { + { 0x3ACDE4CC, 0x45B268A9 }, { 0x84549D08, 0xAF7F0BE8 }, { 0xC1468263, 0x048354B3 }, { 0xC80EFED2, 0x925435C2 }, + { 0x7FDFFBA7, 0xEE4E37F2 }, { 0x0C60F14D, 0x167A3392 }, { 0xEA03E584, 0xFB123B52 }, { 0xFDBB9007, 0x4A0CAB53 }, + { 0x0F788A19, 0x9DEAF638 }, { 0x8F0CB32A, 0xCB48EC55 }, { 0xD6FEF7E0, 0xB59DC4B2 }, { 0xF4F3ECB6, 0xDCDBCA22 }, + { 0x549A9C40, 0x11DF5813 }, { 0x568ACED3, 0xE33FDEDF }, { 0x4322E9C3, 0xA0C1C812 }, { 0x58FA6D0D, 0x07A56B81 }, + { 0xB1E1F3DD, 0x77279579 }, { 0x422AC004, 0xD9B18B74 }, { 0xFFABC294, 0xB8EC2D9F }, { 0x2D75914F, 0xF4ACF8A8 }, + { 0xEF2B6878, 0x7BBF69B1 }, { 0x487AC7E1, 0xC4F62FAF }, { 0xC67E5D0C, 0x76CE809C }, { 0x92E4C14C, 0x6711D88F }, + { 0x243DEDFE, 0x627B99D9 }, { 0xDFB68B51, 0x234AA5C3 }, { 0x262DBF6D, 0x909B1F15 }, { 0x4B62BCB5, 0x4F66EA05 }, + { 0x52AA6AE8, 0x1AE2CF5A }, { 0xD0CE0148, 0xBEA053FB }, { 0xE66314C9, 0xED6808C0 }, { 0x15A82710, 0x43FE16CD }, + { 0xA06970F6, 0xCD049231 }, { 0x97CC4CB0, 0xE7BC8A6C }, { 0xFCB3B9C0, 0x337CE835 }, { 0x7CC780F3, 0x65DEF258 }, + { 0x4132BB50, 0x52214EDE }, { 0x90F493DF, 0x95F15E43 }, { 0x5DD2E0F1, 0x87083962 }, { 0xFB8B66AF, 0x41313C1A }, + { 0x51B211BC, 0x91720AF0 }, { 0xD4EEA573, 0x477D427E }, { 0xF6E3BE25, 0x2E3B4CEE }, { 0xEB0BCC43, 0x82627834 }, + { 0x78E724C8, 0x9C03E3DD }, { 0xD9867DF9, 0x2877328A }, { 0xE243B0F2, 0x14B51945 }, { 0xF7EB97E2, 0x574B0F88 }, + { 0x9AA4943A, 0x88B6FA98 }, { 0xCB168586, 0x19C4F068 }, { 0xAF11FAEF, 0x50EE6409 }, { 0xC04EABA4, 0x7DF317D5 }, + { 0x98B4C6A9, 0x7A567C54 }, { 0x4F42188E, 0xB6BBFB80 }, { 0x3BC5CD0B, 0x3CC22BCF }, { 0xAA397713, 0xD04336EA }, + { 0xEC33132C, 0xF02FAC1B }, { 0xF0D3488D, 0x2506DBA7 }, { 0xF2C31A1E, 0xD7E65D6B }, { 0x1FF820F5, 0x5EB9B216 }, + { 0xC46E0F9F, 0x842E0650 }, { 0x9E843001, 0x716BEB1D }, { 0xAB315ED4, 0xA933758C }, { 0xA2792265, 0x3FE414FD }, + { 0x1EF00932, 0x27C9F170 }, { 0x70A771BE, 0x73A4C1CA }, { 0xE76B3D0E, 0x94184BA6 }, { 0x8C14C87E, 0x40D829FF }, + { 0xC77674CB, 0x0FBEC3FA }, { 0x4A6A9572, 0x3616A963 }, { 0xC25EF937, 0x8F139119 }, { 0x5AEA3F9E, 0xF545ED4D }, + { 0x50BA387B, 0xE8024996 }, { 0x0B582E22, 0x6437E7BD }, { 0xE053E261, 0xE6559F89 }, { 0x05288DFC, 0x80AD52E3 }, + { 0xE34B9935, 0x6DC55A23 }, { 0x1AD0AD09, 0xDE14E0F5 }, { 0xA659865E, 0xC6390578 }, { 0x09487CB1, 0x96D76171 }, + { 0x21156002, 0xE2D6CB3A }, { 0x779FAED1, 0x01E915E5 }, { 0x6A77DCB7, 0xADB0213F }, { 0xB9A1A6AB, 0x9880B76E }, + { 0x8644CF9B, 0x5D9F8D24 }, { 0xC5662658, 0xFD5E4536 }, { 0x9BACBDFD, 0xF1C6B9FE }, { 0xBE9979C4, 0xEACD6341 }, + { 0x08405576, 0xEFA72217 }, { 0xD88E543E, 0x510771EC }, { 0x671F043D, 0xC2BA51CB }, { 0x71AF5879, 0x0AD482AC }, + { 0x5CDAC936, 0xFE787A04 }, { 0x8E049AED, 0xB238AF33 }, { 0x4972EE26, 0xBD866CC9 }, { 0xBD810290, 0x615DA6EB }, + { 0x8B2C1711, 0x3295FDD0 }, { 0x73BF0AEA, 0xF8340460 }, { 0x758FFC42, 0xF3099329 }, { 0x7DCFA934, 0x1CAEB13E }, + { 0x1188832B, 0xBA230748 }, { 0x874CE65C, 0x24EFCE42 }, { 0xB0E9DA1A, 0x0E57D61F }, { 0xF99B343C, 0xB3D1BAD6 }, + { 0x893C4582, 0xC0757B1C }, { 0x403A9297, 0x2B510DB8 }, { 0xF1DB614A, 0x5C7698C1 
}, { 0xD5E68CB4, 0x3E0D0118 }, + { 0x855CB4CF, 0xD60F488E }, { 0xF3CB33D9, 0xAE961E0D }, { 0x14A00ED7, 0x3A8E55AB }, { 0x623789C1, 0x42170328 }, + { 0x9C946292, 0x838B6DD1 }, { 0xED3B3AEB, 0x895FEF7D }, { 0x4E4A3149, 0xCFCBB8E6 }, { 0x2F65C3DC, 0x064C7E64 }, + { 0x4C5A63DA, 0x3D2B3E2A }, { 0xA9210C47, 0x5BD3F340 }, { 0xA1615931, 0xB474D157 }, { 0x1DE87266, 0xAC5934DA }, + { 0x7AF7765B, 0x6EE36511 }, { 0x16B05C44, 0xC86ED367 }, { 0x201D49C5, 0x9BA6885C }, { 0x88346C45, 0xB905387A }, + { 0xBAB9DDFF, 0x131072C4 }, { 0xA751AF99, 0xBF49461E }, { 0x1CE05BA1, 0xD52977BC }, { 0x6027DB52, 0xB0F785E4 }, + { 0x6E57788C, 0x546D30BA }, { 0x650F56AE, 0x305AD707 }, { 0x612FF295, 0xC987C682 }, { 0xF5FBC571, 0xA5AB8944 }, + { 0x59F244CA, 0x7ED528E7 }, { 0x2C7DB888, 0x8DDCBBCE }, { 0x328DB1BA, 0xAA154ABE }, { 0x93ECE88B, 0x1E619BE9 }, + { 0xE813B717, 0x09F2BD9E }, { 0x285D1CB3, 0x7401AA4B }, { 0x3195CAEE, 0x21858F14 }, { 0x1398D1B8, 0x48C38184 }, + { 0xB2F98889, 0xFCB750D3 }, { 0x8D1CE1B9, 0x39A86A99 }, { 0xE473465A, 0x1F888E0C }, { 0x76978716, 0x78995683 }, + { 0xEE2341BF, 0x02CF2AD7 }, { 0xB3F1A14E, 0x85C713B5 }, { 0x2B4567E7, 0xFF916FE1 }, { 0xB7D10575, 0x7C1A0230 }, + { 0x5ECA9BA5, 0x0C98FCC8 }, { 0xDA9E06AD, 0xA3E7F720 }, { 0xBBB1F438, 0x6A6031A2 }, { 0x7ED7D260, 0x973E7494 }, + { 0x18C0FF9A, 0x2CF46639 }, { 0x68678E24, 0x5F50A7F3 }, { 0xA449D4CD, 0x34D983B4 }, { 0x5592B587, 0x68AF1B75 }, + { 0x2E6DEA1B, 0x7F3C3D02 }, { 0x45121F6B, 0xABFC5F5B }, { 0x29553574, 0x0D71E92D }, { 0x6D4F03D8, 0xDFFDF510 }, + { 0x9F8C19C6, 0x081BA87B }, { 0xAC0981BB, 0xDB7EA1A3 }, { 0x66172DFA, 0xBBCA12AD }, { 0x010829C7, 0x79704366 }, + { 0x7BFF5F9C, 0x17932677 }, { 0x00000000, 0x00000000 }, { 0xC906D715, 0xEB2476A4 }, { 0x0738DF6F, 0x724DD42F }, + { 0x38DDB65F, 0xB752EE65 }, { 0x3DF53BA3, 0x37FFBC86 }, { 0xB5C157E6, 0x8EFA84FC }, { 0x272596AA, 0xE9EB5C73 }, + { 0x2535C439, 0x1B0BDABF }, { 0x2A4D4E20, 0x86E12C87 }, { 0xCE3E087A, 0x9969A28B }, { 0x9D9C4B55, 0xFAFB2EB7 }, + { 0xB6D92CB2, 0x056A4156 }, { 0xDEBEA296, 0x5A3AE6A5 }, { 0xA8292580, 0x22A3B026 }, { 0x36AD1581, 0x53C85B3B }, + { 0x17B87583, 0xB11E9001 }, { 0x3FE56930, 0xC51F3A4A }, { 0xCF3621BD, 0xE019E1ED }, { 0x91FCBA18, 0xEC811D25 }, + { 0x4D524A1D, 0x445B7D4C }, { 0xDCAEF005, 0xA8DA6069 }, { 0x309DE329, 0x58F5CC72 }, { 0x6B7FF570, 0xD4C06259 }, + { 0x39D59F98, 0xCE22AD03 }, { 0x47024DF8, 0x591CD997 }, { 0x03187B54, 0x8B90C5AA }, { 0xC356D0F0, 0xF663D27F }, + { 0x35B56ED5, 0xD8589E91 }, { 0xD3D67A1C, 0x35309651 }, { 0xCD26732E, 0x12F96721 }, { 0x441A36AC, 0xD28C1C3D }, + { 0x64077F69, 0x492A9461 }, { 0x6F5F514B, 0x2D1D73DC }, { 0x0D68D88A, 0x6F0A70F4 }, { 0xCA1EAC41, 0x60B4B30E }, + { 0x3385987D, 0xD36509D8 }, { 0x0630F6A8, 0x0B3D9749 }, { 0x96C46577, 0x9ECCC90A }, { 0xAD01A87C, 0xA20EE2C5 }, + { 0x0E70A3DE, 0xE49AB55E }, { 0x82646BA0, 0xA4429CA1 }, { 0xDB962F6A, 0xDA97B446 }, { 0xD7F6DE27, 0xCCED87D4 }, + { 0x37A53C46, 0x2AB8185D }, { 0xE15BCBA6, 0x9F25DCEF }, { 0xFEA3EB53, 0xC19C6EF9 }, { 0x1BD884CE, 0xA764A393 }, + { 0x817C10F4, 0x2FD2590B }, { 0x80743933, 0x56A21A6D }, { 0x79EF0D0F, 0xE573A0BB }, { 0x95DC1E23, 0x155C0CA0 }, + { 0x94D437E4, 0x6C2C4FC6 }, { 0x23053291, 0x10364DF6 }, { 0x836C4267, 0xDD32DFC7 }, { 0x99BCEF6E, 0x03263F32 }, + { 0xE57B6F9D, 0x66F8CD6A }, { 0x5BE21659, 0x8C35AE2B }, { 0x1290F87F, 0x31B3C2E2 }, { 0xBF915003, 0x93BD2027 }, + { 0x220D1B56, 0x69460E90 }, { 0xAE19D328, 0x299E276F }, { 0x53A2432F, 0x63928C3C }, { 0xE91B9ED0, 0x7082FEF8 }, + { 0x3EED40F7, 0xBC6F792C }, { 0xD2DE53DB, 0x4C40D537 }, { 0x5FC2B262, 0x75E8BFAE }, { 0xA541FD0A, 
0x4DA9C0D2 }, + { 0x3CFD1264, 0x4E8FFFE0 }, { 0x696FA7E3, 0x2620E495 }, { 0xB8A98F6C, 0xE1F0F408 }, { 0xDDA6D9C2, 0xD1AA230F }, + { 0xD1C6288F, 0xC7D0109D }, { 0x7487D585, 0x8A79D04F }, { 0xA3710BA2, 0x4694579B }, { 0xFA834F68, 0x38417F7C }, + { 0x0A5007E5, 0x1D47A4DB }, { 0x460A643F, 0x206C9AF1 }, { 0x34BD4712, 0xA128DDF7 }, { 0x72B7232D, 0x81444706 }, + { 0x02105293, 0xF2E086CC }, { 0xBC892B57, 0x182DE58D }, { 0xF8931DFB, 0xCAA1F9B0 }, { 0xCC2E5AE9, 0x6B892447 }, + { 0x0420A43B, 0xF9DD1185 }, { 0x8A243ED6, 0x4BE5BEB6 }, { 0x19C8D65D, 0x5584255F }, { 0x633FA006, 0x3B67404E }, + { 0x6C472A1F, 0xA68DB676 }, { 0xB4C97E21, 0xF78AC79A }, { 0x1080AAEC, 0xC353442E }, { 0x5782E714, 0x9A4F9DB9 }, +}; + +__device__ uint2 T32[256] = { + { 0x2C9B3220, 0x05BA7BC8 }, { 0xF8B65E4F, 0x31A54665 }, { 0x7547F4D4, 0xB1B651F7 }, { 0x7BA46682, 0x8BFA0D85 }, + { 0xA16A98BB, 0x85A96C5A }, { 0x08EB79C9, 0x990FAEF9 }, { 0x47F4A62D, 0xA15E37A2 }, { 0x5D27741E, 0x76857DCD }, + { 0x0A1820BC, 0xF8C50B80 }, { 0x01F7A2B4, 0xBE65DCB2 }, { 0x6F9426E7, 0x666D1B98 }, { 0x53C4E648, 0x4CC921BF }, + { 0x93D9CA42, 0x95410A0F }, { 0x647BA4EF, 0x20CDCCAA }, { 0x890A1871, 0x429A4060 }, { 0x9B32B38B, 0x0C4EA4F6 }, + { 0xDE354CD3, 0xCCDA362D }, { 0x7C5B2FA9, 0x96DC23BC }, { 0xAA851AB3, 0xC309BB68 }, { 0x3648E013, 0xD26131A7 }, + { 0x41FC4DB2, 0x021DC529 }, { 0x704BE48A, 0xCD5ADAB7 }, { 0x84ED71E6, 0xA77965D9 }, { 0x1734BBA4, 0x32386FD6 }, + { 0x38AB7245, 0xE82D6DD5 }, { 0x6177B4B1, 0x5C2147EA }, { 0xCF091CE8, 0x5DA1AB70 }, { 0x72B8BDFF, 0xAC907FCE }, + { 0x972278A8, 0x57C85DFD }, { 0x6B6F940D, 0xA4E44C6A }, { 0x4F1FDFE4, 0x3851995B }, { 0xED71BC9E, 0x62578CCA }, + { 0xC01D2C0A, 0xD9882BB0 }, { 0x113C503B, 0x917B9D5D }, { 0xA87643C6, 0xA2C31E11 }, { 0xA399C1CE, 0xE463C923 }, + { 0x7EA876DC, 0xF71686C5 }, { 0xE096D509, 0x87B4A973 }, { 0x9D3A5814, 0xAF0D567D }, { 0x59DCC6F4, 0xB40C2A3F }, + { 0x95D121DD, 0x3602F884 }, { 0x9836484A, 0xD3E1DD3D }, { 0xA46688E5, 0xF945E71A }, { 0xB2A591F5, 0x7518547E }, + { 0x50C01D89, 0x93665874 }, { 0x658C065B, 0x9EA81018 }, { 0xBC4603A3, 0x4F54080C }, { 0x5137BF3D, 0x2D0384C6 }, + { 0xEC861E2A, 0xDC325078 }, { 0x79573FF7, 0xEA30A8FC }, { 0xCA050CB6, 0x214D2030 }, { 0x8016C30C, 0x65F0322B }, + { 0x1B247087, 0x69BE96DD }, { 0x81E161B8, 0xDB95EE99 }, { 0xD9CA05F8, 0xD1FC1814 }, { 0xCC0DE729, 0x820ED2BB }, + { 0x430F14C7, 0x63D76050 }, { 0xA09D3A0F, 0x3BCCB0E8 }, { 0x573F54A2, 0x8E40764D }, { 0xE16177BD, 0x39D175C1 }, + { 0x734F1F4B, 0x12F5A37C }, { 0x1FDFC26D, 0xAB37C12F }, { 0x395CD0F1, 0x5648B167 }, { 0x37BF42A7, 0x6C04ED15 }, + { 0x14304065, 0xED97161D }, { 0xAB72B807, 0x7D6C67DA }, { 0xBA4EE83C, 0xEC17FA87 }, { 0x0304FBC1, 0xDFAF79CB }, + { 0x71BC463E, 0x733F0605 }, { 0x87E98A27, 0x78D61C12 }, { 0x77B4ADA1, 0xD07CF48E }, { 0x6C90DD26, 0xB9C26253 }, + { 0x60801605, 0xE2449B58 }, { 0xF941FCFB, 0x8FC09AD7 }, { 0x4BE46D0E, 0xFAD8CEA9 }, { 0x0608EB9F, 0xA343F28B }, + { 0x4917347B, 0x9B126BD0 }, { 0xE7699C22, 0x9A92874A }, { 0xC4E69EE0, 0x1B017C42 }, { 0x0EE39256, 0x3A4C5C72 }, + { 0x3EA399DA, 0x4B6E9F5E }, { 0x5AD83D35, 0x6BA353F4 }, { 0x4C1B2425, 0xE7FEE090 }, { 0x2587E95D, 0x22D00983 }, + { 0x0F1430E2, 0x842980C0 }, { 0x861E2893, 0xC6B3C0A0 }, { 0x19D729F2, 0x087433A4 }, { 0xD42D6C6F, 0x341F3DAD }, + { 0xFBB2A58E, 0xEE0A3FAE }, { 0x90DD3183, 0x4AEE73C4 }, { 0xB1A16A34, 0xAAB72DB5 }, { 0x5E238FDF, 0xA92A0406 }, + { 0x686B6FCC, 0x7B4B35A1 }, { 0xF4A6956C, 0x6A23BF6E }, { 0x851AD352, 0x191CB96B }, { 0xD6DE351A, 0x55D598D4 }, + { 0xF2AE7EF3, 0xC9604DE5 }, { 0xA981E172, 0x1CA6C2A3 }, { 0xAD7A5398, 0xDE2F9551 
}, { 0x56C8F616, 0x3025AAFF }, + { 0x1E2860D9, 0x15521D9D }, { 0xFA45073A, 0x506FE31C }, { 0x2B647B0B, 0x189C55F1 }, { 0xAE7EA859, 0x0180EC9A }, + { 0x050C105E, 0x7CEC8B40 }, { 0x8BF94104, 0x2350E519 }, { 0x55CC0DD7, 0xEF8AD334 }, { 0x6D677F92, 0x07A7BEE1 }, + { 0x0DE76997, 0xE5E325B9 }, { 0xA26E637A, 0x5A061591 }, { 0x18208B46, 0xB611EF16 }, { 0xB7A981AB, 0x09F4DF3E }, + { 0xE87DACC0, 0x1EBB078A }, { 0xB65E231F, 0xB791038C }, { 0x74B05660, 0x0FD38D45 }, { 0xC1EA8EBE, 0x67EDF702 }, + { 0x831238CD, 0xBA5F4BE0 }, { 0xCEFEBE5C, 0xE3C477C2 }, { 0x354C1BD2, 0x0DCE486C }, { 0x16C31910, 0x8C5DB364 }, + { 0xA7627324, 0x26EA9ED1 }, { 0xEF82E5EB, 0x039D29B3 }, { 0xCBF2AE02, 0x9F28FC82 }, { 0xF05D2786, 0xA8AAE89C }, + { 0x2774B028, 0x431AACFA }, { 0x31B7A938, 0xCF471F9E }, { 0xE3922EC8, 0x581BD0B8 }, { 0x400BEF06, 0xBC78199B }, + { 0xBF42F862, 0x90FB71C7 }, { 0x46030499, 0x1F3BEB10 }, { 0xB55AD8DE, 0x683E7A47 }, { 0xA695D190, 0x988F4263 }, + { 0x6E638453, 0xD808C72A }, { 0xC319D7CB, 0x0627527B }, { 0xD72997AE, 0xEBB04466 }, { 0xE2658C7C, 0xE67E0C0A }, + { 0xB056C880, 0x14D2F107 }, { 0x30400B8C, 0x7122C32C }, { 0xD5DACEDB, 0x8A7AE11F }, { 0xE98A0E74, 0xA0DEDB38 }, + { 0xDCC615A6, 0xAD109354 }, { 0xF655CC19, 0x0BE91A17 }, { 0xB8BDB149, 0x8DDD5FFE }, { 0xAF890AED, 0xBFE53028 }, + { 0xB4AD7A6A, 0xD65BA6F5 }, { 0x2997227E, 0x7956F088 }, { 0x32B352F9, 0x10E86655 }, { 0xDACEFE39, 0x0E5361DF }, + { 0x9FC90161, 0xCEC7F304 }, { 0x677F5F2E, 0xFF62B561 }, { 0xD22587F0, 0x975CCF26 }, { 0x543BAF63, 0x51EF0F86 }, + { 0x10CBF28F, 0x2F1E41EF }, { 0xBBB94A88, 0x52722635 }, { 0x3344F04D, 0xAE8DBAE7 }, { 0x6688FD9A, 0x410769D3 }, + { 0x34BBB966, 0xB3AB94DE }, { 0x8DF1AA9B, 0x80131792 }, { 0xC5113C54, 0xA564A0F0 }, { 0xBDB1A117, 0xF131D4BE }, + { 0xEA8EF5B5, 0x7F71A2F3 }, { 0xC8F655C3, 0x40878549 }, { 0x44F05DEC, 0x7EF14E69 }, { 0xF55137D8, 0xD44663DC }, + { 0x523344FC, 0xF2ACFD0D }, { 0x00000000, 0x00000000 }, { 0x8EF5515A, 0x5FBC6E59 }, { 0xF1AA8532, 0x16CF342E }, + { 0xDB395C8D, 0xB036BD6D }, { 0xDD31B712, 0x13754FE6 }, { 0x2D6C9094, 0xBBDFA77A }, { 0x3A582B30, 0x89E7C8AC }, + { 0xCDFA459D, 0x3C6B0E09 }, { 0xC7E26521, 0xC4AE0589 }, { 0x7F5FD468, 0x49735A77 }, { 0x1D2C9B18, 0xCAFD6456 }, + { 0x2F9FC9E1, 0xDA150203 }, { 0x94268369, 0x88672436 }, { 0x3BAF8984, 0x3782141E }, { 0x24704BE9, 0x9CB5D531 }, + { 0x1AD3D233, 0xD7DB4A6F }, { 0x2A93D9BF, 0xA6F98943 }, { 0x8A0EE3B0, 0x9D3539AB }, { 0x15C7E2D1, 0x53F2CAAF }, + { 0x76430F15, 0x6E19283C }, { 0x6384EDC4, 0x3DEBE293 }, { 0x208BF903, 0x5E3C82C3 }, { 0xB94A13FD, 0x33B8834C }, + { 0x2E686B55, 0x6470DEB1 }, { 0x7A53C436, 0x359FD137 }, { 0x02F35975, 0x61CAA579 }, { 0x82E59A79, 0x043A9752 }, + { 0x2683129C, 0xFD7F7048 }, { 0x699CCD78, 0xC52EE913 }, { 0x7DAC8D1D, 0x28B9FF0E }, { 0x78A09D43, 0x5455744E }, + { 0xB3523341, 0xCB7D88CC }, { 0x4A13CFBA, 0x44BD121B }, { 0xFDBA4E11, 0x4D49CD25 }, { 0x8C06082F, 0x3E76CB20 }, + { 0x2278A076, 0x3FF627BA }, { 0x04FBB2EA, 0xC28957F2 }, { 0xE46D67E3, 0x453DFE81 }, { 0x3DA7621B, 0x94C1E695 }, + { 0xFF491764, 0x2C83685C }, { 0xFC4DECA5, 0xF32C1197 }, { 0x922E68F6, 0x2B24D6BD }, { 0x9AC5113F, 0xB22B7844 }, + { 0xD1217C31, 0x48F3B6ED }, { 0xBEB55AD6, 0x2E9EAD75 }, { 0x5FD42D6B, 0x174FD8B4 }, { 0x1238ABFA, 0x4ED4E496 }, + { 0xFEBEB5D0, 0x92E6B4EE }, { 0x0BEF8208, 0x46A0D732 }, { 0xA5912A51, 0x47203BA8 }, { 0xE69E3E96, 0x24F75BF8 }, + { 0x13CF094E, 0xF0B13824 }, { 0xC901F777, 0xFEE259FB }, { 0x091CDB7D, 0x276A724B }, { 0xEE75475F, 0xBDF8F501 }, + { 0x4DEC8691, 0x599B3C22 }, { 0x99C1EAFE, 0x6D84018F }, { 0x1CDB39AC, 0x7498B8E4 }, { 0x217C5BB7, 
0xE0595E71 }, + { 0x3C50C0AF, 0x2AA43A27 }, { 0x3F543B6E, 0xF50B43EC }, { 0x62734F70, 0x838E3E21 }, { 0x4507FF58, 0xC09492DB }, + { 0xDFC2EE67, 0x72BFEA9F }, { 0x9CCDFAA0, 0x11688ACF }, { 0x6A9836B9, 0x1A8190D8 }, { 0xC615C795, 0x7ACBD93B }, + { 0x286080CA, 0xC7332C3A }, { 0x4EE87D50, 0x863445E9 }, { 0xD0D6DE85, 0xF6966A5F }, { 0x96D5DA1C, 0xE9AD814F }, + { 0x9E3EA3D5, 0x70A22FB6 }, { 0x582B6440, 0x0A69F68D }, { 0xC2EE757F, 0xB8428EC9 }, { 0xAC8DF12C, 0x604A49E3 }, + { 0x0C10CB23, 0x5B86F90B }, { 0x8F02F3EE, 0xE1D9B2EB }, { 0xD3D22544, 0x29391394 }, { 0x5CD0D6AA, 0xC8E0A17F }, + { 0xF7A26EAD, 0xB58CC6A5 }, { 0x238F02C2, 0x8193FB08 }, { 0x5B2F9F81, 0xD5C68F46 }, { 0x88FDBAC5, 0xFCFF9CD2 }, + { 0xF359DC47, 0x77059157 }, { 0x07FF492B, 0x1D262E39 }, { 0xE59AC557, 0xFB582233 }, { 0x42F8B673, 0xDDB2BCE2 }, + { 0x48E096CF, 0x2577B762 }, { 0xD83DA74C, 0x6F99C4A6 }, { 0xEB795701, 0xC1147E41 }, { 0x912A9337, 0xF48BAF76 }, +}; + +__device__ uint2 T42[256] = { + { 0x9B2C0A19, 0x3EF29D24 }, { 0xB6F8622F, 0xE9E16322 }, { 0x47757F7A, 0x55369940 }, { 0xA47B0B33, 0x9F4D56D5 }, + { 0x6AA1174C, 0x82256746 }, { 0xEB082FB2, 0xB8F5057D }, { 0xF4475F53, 0xCC48C10B }, { 0x275DEC3A, 0x373088D4 }, + { 0x180AED10, 0x968F4325 }, { 0xF7016151, 0x173D232C }, { 0x946FCC13, 0xAE4ED09F }, { 0xC4539873, 0xFD4B4741 }, + { 0xD9933765, 0x1B5B3F0D }, { 0x7B644052, 0x2FFCB096 }, { 0x0A89840C, 0xE02376D2 }, { 0x329B18D7, 0xA3AE3A70 }, + { 0x35DE8526, 0x419CBD23 }, { 0x5B7C3199, 0xFAFEBF11 }, { 0x85AA9B0D, 0x0397074F }, { 0x4836B970, 0xC58AD4FB }, + { 0xFC4104A8, 0xBEC60BE3 }, { 0x4B708772, 0x1EFF36DC }, { 0xED8453B6, 0x131FDC33 }, { 0x341764D3, 0x0844E33E }, + { 0xAB38CD39, 0x0FF11B6E }, { 0x7761B85A, 0x64351F0A }, { 0x09CFBA0E, 0x3B5694F5 }, { 0xB87245D0, 0x30857084 }, + { 0x2297AE3C, 0x47AFB3BD }, { 0x6F6B554A, 0xF2BA5C2F }, { 0x1F4F70E1, 0x74BDC476 }, { 0x71EDC45E, 0xCFDFC644 }, + { 0x1DC0AF16, 0xE610784C }, { 0x3C113F28, 0x7ACA29D6 }, { 0x76A859AF, 0x2DED4117 }, { 0x99A3D5EE, 0xAC5F211E }, + { 0xA87EF33B, 0xD484F949 }, { 0x96E013E4, 0x3CE36CA5 }, { 0x3A9D432C, 0xD120F098 }, { 0xDC597563, 0x6BC40464 }, + { 0xD1956C9E, 0x69D5F5E5 }, { 0x3698BB24, 0x9AE95F04 }, { 0x66A4EF44, 0xC9ECC8DA }, { 0xA5B2EAC6, 0xD69508C8 }, + { 0xC0503B80, 0xC40C2235 }, { 0x8C652103, 0x38C193BA }, { 0x46BC9E8F, 0x1CEEC75D }, { 0x37515AD1, 0xD3310119 }, + { 0x86ECA50F, 0xD8E2E568 }, { 0x5779C991, 0xB137108D }, { 0x05CA4206, 0x709F3B69 }, { 0x1680CAEF, 0x4FEB5083 }, + { 0x241BD238, 0xEC456AF3 }, { 0xE181ABBE, 0x58D673AF }, { 0xCAD9BF8C, 0x242F54E7 }, { 0x0DCC19FD, 0x0211F181 }, + { 0x0F43C60A, 0x90BC4DBB }, { 0x9DA0761D, 0x9518446A }, { 0x3F57012A, 0xA1BFCBF1 }, { 0x61E172B5, 0x2BDE4F89 }, + { 0x4F732481, 0x27B853A8 }, { 0xDF1F4B61, 0xB0B1E643 }, { 0x5C39AC68, 0x18CC3842 }, { 0xBF37D821, 0xD2B7F7D7 }, + { 0x3014C720, 0x3103864A }, { 0x72ABFA5C, 0x14AA2463 }, { 0x4EBAC574, 0x6E600DB5 }, { 0x0403A3F3, 0x39476574 }, + { 0xBC71E623, 0x09C215F0 }, { 0xE987F045, 0x2A58B947 }, { 0xB477BDD8, 0x7B4CDF18 }, { 0x906C6FE0, 0x9709B5EB }, + { 0x8060D90B, 0x73083C26 }, { 0x41F9037E, 0xFEDC400E }, { 0xE44BE9B8, 0x284948C6 }, { 0x08065BFB, 0x728ECAE8 }, + { 0x17492B1A, 0x06330E9E }, { 0x69E7294E, 0x59508561 }, { 0xE6C4364F, 0xBAE4F4FC }, { 0xE30E7449, 0xCA7BCF95 }, + { 0xA33E96C2, 0x7D7FD186 }, { 0xD85AD690, 0x52836110 }, { 0x1B4CD312, 0x4DFAA102 }, { 0x872544FA, 0x913ABB75 }, + { 0x140F1518, 0xDD46ECB9 }, { 0x1E869114, 0x3D659A6B }, { 0xD719109A, 0xC23F2CAB }, { 0x2DD46836, 0xD713FE06 }, + { 0xB2FBC1DC, 0xD0A60656 }, { 0xDD909496, 0x221C5A79 }, { 0xA1B14935, 0xEFD26DBC 
}, { 0x235E4FC9, 0x0E77EDA0 }, + { 0x6B68F6B9, 0xCBFD395B }, { 0xA6F4D4C4, 0x0DE0EAEF }, { 0x1A8532E7, 0x0422FF1F }, { 0xDED6AA94, 0xF969B85E }, + { 0xAEF28F3F, 0x7F6E2007 }, { 0x81A938FE, 0x3AD0623B }, { 0x7AADA1A7, 0x6624EE8B }, { 0xC856607B, 0xB682E8DD }, + { 0x281E2A30, 0xA78CC56F }, { 0x45FAA08D, 0xC79B257A }, { 0x642B30B3, 0x5B4174E0 }, { 0x7EAE0254, 0x5F638BFF }, + { 0x0C05F808, 0x4BC9AF9C }, { 0xF98B46AE, 0xCE59308A }, { 0xCC55C388, 0x8FC58DA9 }, { 0x676D0EB1, 0x803496C7 }, + { 0xE70DD7BA, 0xF33CAAE1 }, { 0x6EA2B4BF, 0xBB620232 }, { 0x201871CB, 0xD5020F87 }, { 0xA9B712CE, 0x9D5CA754 }, + { 0x7DE83C56, 0x841669D8 }, { 0x5EB6739F, 0x8A618478 }, { 0xB0741E2B, 0x420BBA6C }, { 0xEAC1CE47, 0xF12D5B60 }, + { 0x1283691C, 0x76AC35F7 }, { 0xFECEDB5F, 0x2C6BB7D9 }, { 0x4C351A83, 0xFCCDB18F }, { 0xC3160582, 0x1F79C012 }, + { 0x62A74CB7, 0xF0ABADAE }, { 0x82EF06FC, 0xE1A5801C }, { 0xF2CB2357, 0x67A21845 }, { 0x5DF04D9D, 0x5114665F }, + { 0x74278658, 0xBF40FD2D }, { 0xB73183DA, 0xA0393D3F }, { 0x92E3B017, 0x05A409D1 }, { 0x0B4065F9, 0xA9FB28CF }, + { 0x42BF3D7C, 0x25A9A229 }, { 0x03463E02, 0xDB75E227 }, { 0x5AB5D06C, 0xB326E10C }, { 0x95A62DE6, 0xE7968E82 }, + { 0x636EAD42, 0xB973F3B3 }, { 0x19C30CE5, 0xDF571D38 }, { 0x29D7CBC5, 0xEE549B72 }, { 0x65E2D146, 0x12992AFD }, + { 0x56B02864, 0xF8EF4E90 }, { 0x4030E28B, 0xB7041E13 }, { 0xDAD50967, 0xC02EDD2A }, { 0x8AE95D07, 0x932B4AF4 }, + { 0xC6DC4784, 0x6FE6FB7B }, { 0x55F61666, 0x239AACB7 }, { 0xBDB807D6, 0x401A4BED }, { 0x89AF6305, 0x485EA8D3 }, + { 0xADB4B13D, 0xA41BC220 }, { 0x9729F211, 0x753B32B8 }, { 0xB3322029, 0x997E584B }, { 0xCEDA1C7F, 0x1D683193 }, + { 0xC99F818E, 0xFF5AB6C0 }, { 0x7F67E3A1, 0x16BBD5E2 }, { 0x25D233CD, 0xA59D34EE }, { 0x3B54A2D9, 0x98F8AE85 }, + { 0xCB105E79, 0x6DF70AFA }, { 0xB9BBA425, 0x795D2E99 }, { 0x44334178, 0x8E437B67 }, { 0x886682F0, 0x0186F6CE }, + { 0xBB347BD2, 0xEBF092A3 }, { 0xF18D1D55, 0xBCD7FA62 }, { 0x11C5571E, 0xADD9D7D0 }, { 0xB1BDFFDE, 0x0BD3E471 }, + { 0x8EEAFEF4, 0xAA6C2F80 }, { 0xF6C880A4, 0x5EE57D31 }, { 0xF044FCA0, 0xF50FA47F }, { 0x51F5B595, 0x1ADDC9C3 }, + { 0x3352F922, 0xEA76646D }, { 0x00000000, 0x00000000 }, { 0xF58EBEA6, 0x85909F16 }, { 0xAAF12CCC, 0x46294573 }, + { 0x39DB7D2E, 0x0A5512BF }, { 0x31DD26D5, 0x78DBD857 }, { 0x6C2D6B48, 0x29CFBE08 }, { 0x583A0F9B, 0x218B5D36 }, + { 0xFACD78AC, 0x152CD2AD }, { 0xE2C795BC, 0x83A39188 }, { 0x5F7F926A, 0xC3B9DA65 }, { 0x2C1D89C3, 0x9ECBA01B }, + { 0x9F2FA9EA, 0x07B5F850 }, { 0x26940DCF, 0x7EE8D6C9 }, { 0xAF3B6ECA, 0x36B67E1A }, { 0x702425AB, 0x86079859 }, + { 0xD31AB369, 0xFB7849DF }, { 0x932A51E2, 0x4C7C57CC }, { 0x0E8A27FF, 0xD96413A6 }, { 0xC715A671, 0x263EA566 }, + { 0x4376DC89, 0x6C71FC34 }, { 0x84637AF8, 0x4A4F5952 }, { 0x8B20BCF2, 0xDAF314E9 }, { 0x4AB96687, 0x572768C1 }, + { 0x682EC8BB, 0x1088DB7C }, { 0x537A6A62, 0x887075F9 }, { 0xF302C2A2, 0x2E7A4658 }, { 0xE582084D, 0x619116DB }, + { 0x8326E709, 0xA87DDE01 }, { 0x9C6997E8, 0xDCC01A77 }, { 0xAC7D50C8, 0xEDC39C3D }, { 0xA078A8C0, 0xA60A33A1 }, + { 0x52B38B97, 0xC1A82BE4 }, { 0x134A88E9, 0x3F746BEA }, { 0xBAFD9A27, 0xA228CCBE }, { 0x068C7C04, 0xABEAD94E }, + { 0x78227E50, 0xF48952B1 }, { 0xFB049959, 0x5CF48CB0 }, { 0x6DE48ABD, 0x6017E015 }, { 0xA73D3531, 0x4438B4F2 }, + { 0x49FF5885, 0x8C528AE6 }, { 0x4DFCFB76, 0xB515EF92 }, { 0x2E925634, 0x0C661C21 }, { 0xC59A7986, 0xB493195C }, + { 0x21D1903E, 0x9CDA519A }, { 0xB5BE5C2D, 0x32948105 }, { 0xD45F2E98, 0x194ACE8C }, { 0x38129CDB, 0x438D4CA2 }, + { 0xBEFE39D4, 0x9B6FA9CA }, { 0xEF0B8C41, 0x81B26009 }, { 0x91A58E15, 0xDED1EBF6 }, { 0x9EE6481F, 
0x4E6DA64D }, + { 0xCF13FD8A, 0x54B06F8E }, { 0x01C9E1F5, 0x49D85E1D }, { 0x1C094EE3, 0xAFC82651 }, { 0x75EE67AD, 0xF698A330 }, + { 0xEC4DB243, 0x5AC7822E }, { 0xC199DA75, 0x8DD47C28 }, { 0xDB1CE892, 0x89F68337 }, { 0x7C21DDA3, 0xCDCE37C5 }, + { 0x503C5460, 0x530597DE }, { 0x543FF793, 0x6A42F2AA }, { 0x73621BA9, 0x5D727A7E }, { 0x07459DF1, 0xE2328753 }, + { 0xC2DFE477, 0x56A19E0F }, { 0xCD9C227D, 0xC61DD3B4 }, { 0x986A341B, 0xE5877F03 }, { 0x15C6F4ED, 0x949EB2A4 }, + { 0x60289340, 0x62061194 }, { 0xE84E11B0, 0x6380E75A }, { 0xD6D0F16F, 0x8BE772B6 }, { 0xD596CF6D, 0x50929091 }, + { 0x3E9EE0DF, 0xE86795EC }, { 0x2B581432, 0x7CF92748 }, { 0xEEC26DB4, 0xC86A3E14 }, { 0x8DACC0F6, 0x7119CDA7 }, + { 0x100CB6EB, 0xE40189CD }, { 0x028FDFF7, 0x92ADBC3A }, { 0xD2D3529C, 0xB2A017C2 }, { 0xD05C8D6B, 0x200DABF8 }, + { 0xA2F77737, 0x34A78F9B }, { 0x8F231F01, 0xE3B4719D }, { 0x2F5BB7C1, 0x45BE423C }, { 0xFD88E55D, 0xF71E55FE }, + { 0x59F3EE6E, 0x6853032B }, { 0xFF073AAA, 0x65B3E9C4 }, { 0x9AE5EBEC, 0x772AC339 }, { 0xF842A75B, 0x87816E97 }, + { 0xE0484A4B, 0x110E2DB2 }, { 0x3DD8DEDD, 0x331277CB }, { 0x79EB9FA5, 0xBD510CAC }, { 0x2A91F5C7, 0x35217955 }, +}; + +__device__ uint2 T52[256] = { + { 0x46E06A6D, 0x8AB0A968 }, { 0x4BF0B33A, 0x43C7E80B }, { 0x6B161EE5, 0x08C9B354 }, { 0xEBA990BE, 0x39F1C235 }, + { 0x6606C7B2, 0xC1BEF237 }, { 0x614569AA, 0x2C209233 }, { 0x6FC3289A, 0xEB01523B }, { 0x935ACEDD, 0x946953AB }, + { 0x3E13340E, 0x272838F6 }, { 0xA12BA052, 0x8B0455EC }, { 0x978FF8A2, 0x77A1B2C4 }, { 0x13E54086, 0xA55122CA }, + { 0x62D3F1CD, 0x22761358 }, { 0x08B76CFE, 0xDB8DDFDE }, { 0x9E4A178A, 0x5D1E12C8 }, { 0x03969867, 0x0E56816B }, + { 0x3303ED59, 0xEE5F7995 }, { 0xAB78D71D, 0xAFED748B }, { 0xF93E53EE, 0x6D929F2D }, { 0xBA798C2A, 0xF5D8A8F8 }, + { 0x8E39CF6B, 0xF619B169 }, { 0x749104E2, 0x95DDAF2F }, { 0xE0886427, 0xEC2A9C80 }, { 0x825B95EA, 0xCE5C8FD8 }, + { 0x3AC60271, 0xC4E0D999 }, { 0x173076F9, 0x4699C3A5 }, { 0x50A29F42, 0x3D1B151F }, { 0x2BC75946, 0x9ED505EA }, + { 0xDC7F4B98, 0x34665ACF }, { 0x292342F7, 0x61B1FB53 }, { 0x0E864130, 0xC721C008 }, { 0x96FD7B74, 0x8693CD16 }, + { 0x7136B14B, 0x87273192 }, { 0x63A1721B, 0xD3446C8A }, { 0xA6680E4A, 0x669A35E8 }, { 0x39509A16, 0xCAB658F2 }, + { 0xF42E8AB9, 0xA4E5DE4E }, { 0xE83F08D9, 0x37A7435E }, { 0xE26C7F96, 0x134E6239 }, { 0x2DF67488, 0x82791A3C }, + { 0x8329163C, 0x3F6EF00A }, { 0xFDEB6591, 0x8E5A7E42 }, { 0x7981DDB5, 0x5CAAEE4C }, { 0x5AF1E80D, 0x19F23478 }, + { 0xED98BD70, 0x255DDDE3 }, { 0xA99CCCAC, 0x50898A32 }, { 0xDA4E6656, 0x28CA4519 }, { 0x4CB31D22, 0xAE59880F }, + { 0x37D6DB26, 0x0D9798FA }, { 0xB4FFCD1A, 0x32F968F0 }, { 0x4F258545, 0xA00F0964 }, { 0x5E24DE72, 0xFA3AD517 }, + { 0x5DB24615, 0xF46C547C }, { 0xFF0F7E20, 0x713E80FB }, { 0x73D2AAFA, 0x7843CF2B }, { 0xAEDF62B4, 0xBD17EA36 }, + { 0xD16F92CF, 0xFD111BAC }, { 0xC72D67E0, 0x4ABAA7DB }, { 0xAD49FAD3, 0xB3416B5D }, { 0x4914A88B, 0xBCA316B2 }, + { 0x8AECF914, 0x15D15006 }, { 0xE31EFC40, 0xE27C1DEB }, { 0x9BEDA223, 0x4FE48C75 }, { 0x1B522C78, 0x7EDCFD14 }, + { 0x7C26681C, 0x4E5070F1 }, { 0x5815F3BC, 0xE696CAC1 }, { 0x3BB481A7, 0x35D2A64B }, { 0xFE7DFDF6, 0x800CFF29 }, + { 0xD5BAA4B0, 0x1ED9FAC3 }, { 0x1EF599D1, 0x6C2663A9 }, { 0x34404341, 0x03C11991 }, { 0x69F20554, 0xF7AD4DED }, + { 0xB61BD6AB, 0xCD9D9649 }, { 0xEADB1368, 0xC8C3BDE7 }, { 0xB02AFB65, 0xD131899F }, { 0xE1FAE7F1, 0x1D18E352 }, + { 0xEF7CA6C1, 0xDA39235A }, { 0xA8EE4F7A, 0xA1BBF5E0 }, { 0xCF9A0B1E, 0x91377805 }, { 0x80BF8E5B, 0x31387161 }, + { 0xDB3CE580, 0xD9F83ACB }, { 0xD38B897E, 0x0275E515 }, { 0xF0FBBCC6, 0x472D3F21 
}, { 0x868EA395, 0x2D946EB7 }, + { 0x21942E09, 0xBA3C248D }, { 0xBFDE3983, 0xE7223645 }, { 0x02E41BB1, 0xFF64FEB9 }, { 0x0D10D957, 0xC9774163 }, + { 0xB58D4ECC, 0xC3CB1722 }, { 0x9CAE0C3B, 0xA27AEC71 }, { 0xA48C15FB, 0x99FECB51 }, { 0x6D27332B, 0x1465AC82 }, + { 0xD75EBF01, 0xE1BD047A }, { 0x941960C5, 0x79F733AF }, { 0x41A3C475, 0x672EC96C }, { 0x524684F3, 0xC27FEBA6 }, + { 0x75E38734, 0x64EFD0FD }, { 0x0743AE18, 0xED9E6004 }, { 0xB9EF144D, 0xFB8E2993 }, { 0x0C625A81, 0x38453EB1 }, + { 0x42355C12, 0x69784807 }, { 0x14A6EE9E, 0x48CF42CE }, { 0x06312DCE, 0x1CAC1FD6 }, { 0x4792E9BB, 0x7B82D6BA }, + { 0x1F871A07, 0x9D141C7B }, { 0xC11C4A2E, 0x5616B80D }, { 0xF21FA777, 0xB849C198 }, { 0xC8D9A506, 0x7CA91801 }, + { 0x7EC273AD, 0xB1348E48 }, { 0x987B3A44, 0x41B20D1E }, { 0xA3CFBBE3, 0x7460AB55 }, { 0x4576F20A, 0x84E62803 }, + { 0x897A6173, 0x1B87D16D }, { 0xE45D5258, 0x0FE27DEF }, { 0xCA3DBEB7, 0x83CDE6B8 }, { 0xD01D1119, 0x0C23647E }, + { 0xA0592384, 0x7A362A3E }, { 0xF1893F10, 0xB61F40F3 }, { 0x440471DC, 0x75D457D1 }, { 0x237035B8, 0x4558DA34 }, + { 0x87FC2043, 0xDCA61165 }, { 0xC9AB26D0, 0x8D9B67D3 }, { 0xEE0E2517, 0x2B0B5C88 }, { 0x2AB5DA90, 0x6FE77A38 }, + { 0xD9D8FE31, 0x269CC472 }, { 0xFAA8CB89, 0x63C41E46 }, { 0x1642F52F, 0xB7ABBC77 }, { 0x2F126F39, 0x7D1DE485 }, + { 0x24339BA0, 0xA8C6BA30 }, { 0xCEE888C8, 0x600507D7 }, { 0x1A20AFAE, 0x8FEE82C6 }, { 0x26D78011, 0x57A24489 }, + { 0x36A458F0, 0xFCA5E728 }, { 0x8F4B4CBD, 0x072BCEBB }, { 0xF36D24A1, 0x497BBE4A }, { 0xB769557D, 0x3CAFE99B }, + { 0x05A7B5A9, 0x12FA9EBD }, { 0x5B836BDB, 0xE8C04BAA }, { 0xAC3B7905, 0x4273148F }, { 0x2851C121, 0x90838481 }, + { 0x6C55B0FD, 0xE557D350 }, { 0xCB4F3D61, 0x72FF996A }, { 0x64E2DC03, 0x3EDA0C8E }, { 0xE6B949E9, 0xF0868356 }, + { 0xBB0B0FFC, 0x04EAD72A }, { 0x5967706A, 0x17A4B513 }, { 0x04D5367F, 0xE3C8E16F }, { 0x8DAF570C, 0xF84F3002 }, + { 0xBD3A2232, 0x1846C8FC }, { 0xF6CA9108, 0x5B8120F7 }, { 0xECEA3EA6, 0xD46FA231 }, { 0x53340725, 0x334D9474 }, + { 0xC28AD249, 0x58403966 }, { 0x9A9F21F5, 0xBED6F3A7 }, { 0xA5FE962D, 0x68CCB483 }, { 0x57E1315A, 0xD085751B }, + { 0xE52FD18E, 0xFED0023D }, { 0x20E6ADDF, 0x4B0E5B5F }, { 0x6EB1AB4C, 0x1A332DE9 }, { 0x7B65C604, 0xA3CE10F5 }, + { 0xD62C3CD7, 0x108F7BA8 }, { 0x1073D8E1, 0xAB07A3A1 }, { 0x91BED56C, 0x6B0DAD12 }, { 0x3532C097, 0xF2F36643 }, + { 0xB2CEE0D4, 0x2E557726 }, { 0x00000000, 0x00000000 }, { 0xDE9B5029, 0xCB02A476 }, { 0x8B9E7AC2, 0xE4E32FD4 }, + { 0x2C84F75E, 0x734B65EE }, { 0xCD7E10AF, 0x6E5386BC }, { 0xE7CBCA3F, 0x01B4FC84 }, { 0x65905FD5, 0xCFE8735C }, + { 0x0FF4C2E6, 0x3613BFDA }, { 0x31E7F6E8, 0x113B872C }, { 0x55052AEB, 0x2FE18BA2 }, { 0xBC48A1E4, 0xE974B72E }, + { 0xB89D979B, 0x0ABC5641 }, { 0x2202B66E, 0xB46AA5E6 }, { 0xC4BBFF87, 0x44EC26B0 }, { 0x27A503C7, 0xA6903B5B }, + { 0xFC99E647, 0x7F680190 }, { 0xA71A8D9C, 0x97A84A3A }, { 0x6037EA7C, 0xDD12EDE1 }, { 0xDD0DC84E, 0xC554251D }, + { 0x956BE313, 0x88C54C7D }, { 0x48662B5D, 0x4D916960 }, { 0x9909B992, 0xB08072CC }, { 0xC5C97C51, 0xB5DE5962 }, + { 0x19B637C9, 0x81B803AD }, { 0x4A8230EC, 0xB2F597D9 }, { 0x5F565DA4, 0x0B08AAC5 }, { 0x017283D6, 0xF1327FD2 }, + { 0x78F35E63, 0xAD98919E }, { 0x76751F53, 0x6AB95196 }, { 0x0A53774F, 0x24E92167 }, { 0x15D46D48, 0xB9FD3D1C }, + { 0xFBDA485F, 0x92F66194 }, { 0x11015B37, 0x5A35DC73 }, { 0x5477A93D, 0xDED3F470 }, { 0x81CD0D8D, 0xC00A0EB3 }, + { 0xC65FE436, 0xBB88D809 }, { 0xBEACBA55, 0x16104997 }, { 0x5693B28C, 0x21B70AC9 }, { 0x25411876, 0x59F4C5E2 }, + { 0x0B21F499, 0xD5DB5EB5 }, { 0xF55C096F, 0x55D7A19C }, { 0xC3F8519F, 0xA97246B4 }, { 0xA2BD3835, 
0x8552D487 }, + { 0x1297C350, 0x54635D18 }, { 0x85183BF2, 0x23C2EFDC }, { 0xCC0C9379, 0x9F61F96E }, { 0x9DDC8FED, 0x534893A3 }, + { 0xAA0A54CB, 0x5EDF0B59 }, { 0x9F38945C, 0xAC2C6D1A }, { 0xD8AA7DE7, 0xD7AEBBA0 }, { 0x09C5EF28, 0x2ABFA00C }, + { 0x3CF72FBF, 0xD84CC64F }, { 0xB15878B3, 0x2003F64D }, { 0xC06EC9F8, 0xA724C7DF }, { 0x68808682, 0x069F323F }, + { 0x51D01C94, 0xCC296ACD }, { 0x5CC0C5C3, 0x055E2BAE }, { 0x1D6301B6, 0x6270E2C2 }, { 0x382219C0, 0x3B842720 }, + { 0x846AB824, 0xD2F0900E }, { 0x7A1745D2, 0x52FC6F27 }, { 0xE94D8B0F, 0xC6953C8C }, { 0x3095753E, 0xE009F8FE }, + { 0x92284D0B, 0x655B2C79 }, { 0x4347DFC4, 0x984A37D5 }, { 0x8808E2A5, 0xEAB5AEBF }, { 0x90CC56BA, 0x9A3FD2C0 }, + { 0xF84CD038, 0x9CA0E0FF }, { 0xAFADE162, 0x4C2595E4 }, { 0xB3BC6302, 0xDF6708F4 }, { 0x7D54EBCA, 0xBF620F23 }, + { 0x1C118260, 0x93429D10 }, { 0x8CDDD4DA, 0x097D4FD0 }, { 0x2E60ECEF, 0x8C2F9B57 }, { 0x18C4B41F, 0x708A7C7F }, + { 0xDFE9D3FF, 0x3A30DBA4 }, { 0x7FB0F07B, 0x4006F19A }, { 0x4DC19EF4, 0x5F6BF7DD }, { 0x32716E8F, 0x1F6D0647 }, + { 0x6A649D33, 0xF9FBCC86 }, { 0x67744464, 0x308C8DE5 }, { 0x72A0292C, 0x8971B0F9 }, { 0x3F61B7D8, 0xD61A4724 }, + { 0xD4C82766, 0xEFEB8511 }, { 0x40D147A3, 0x961CB6BE }, { 0xF7B812DE, 0xAAB35F25 }, { 0x7044329D, 0x76154E40 }, + { 0x4E570693, 0x513D76B6 }, { 0xD2F90AA8, 0xF3479AC7 }, { 0x77079C85, 0x9B8B2E44 }, { 0x3D85AC69, 0x297EB99D }, +}; + +__device__ uint2 T62[256] = { + { 0xFC7D40C3, 0x7E37E62D }, { 0xEE939E5B, 0x776F25A4 }, { 0xDD8FB5AD, 0xE045C850 }, { 0x11FF1952, 0x86ED5BA7 }, + { 0xCF616B35, 0xE91D0BD9 }, { 0x6E408FFB, 0x37E0AB25 }, { 0x31025A7A, 0x9607F6C0 }, { 0x16D23C9D, 0x0B02F5E1 }, + { 0xFB50650C, 0xF3D8486B }, { 0xC40875F5, 0x621CFF27 }, { 0xFA5FD34A, 0x7D40CB71 }, { 0xDAA29062, 0x6DAA6616 }, + { 0x23EC84E2, 0x9F5F3549 }, { 0xC507C3B3, 0xEC847C3D }, { 0x043CE205, 0x025A3668 }, { 0x4DAC0B19, 0xA8BF9E6C }, + { 0xE9BEBB94, 0xFA808BE2 }, { 0x77C74FA3, 0xB5B99C52 }, { 0xF0397BCC, 0x78D9BC95 }, { 0xDBAD2624, 0xE332E50C }, + { 0x9332797E, 0xC74FCE12 }, { 0x2EA709AB, 0x1729ECEB }, { 0x9954D1F8, 0xC2D6B9F6 }, { 0xBAB8551A, 0x5D898CBF }, + { 0x17DD8ADB, 0x859A76FB }, { 0x362F7FB5, 0x1BE85886 }, { 0xF136CD8A, 0xF6413F8F }, { 0xBBB7E35C, 0xD3110FA5 }, + { 0x14CC4D11, 0x0A2FEED5 }, { 0xCD7F1AB9, 0xE83010ED }, { 0x5F42D581, 0xA1E75DE5 }, { 0xC13B21B6, 0xEEDE4A55 }, + { 0xF94E1480, 0xF2F5535F }, { 0x1888761E, 0x0CC1B46D }, { 0x6529913B, 0xBCE15FDB }, { 0x5A7181C2, 0x2D25E897 }, + { 0xE2D7A554, 0x71817F1C }, { 0x5C53124B, 0x2E52C5CB }, { 0xEF9C281D, 0xF9F7A6BE }, { 0x21F2F56E, 0x9E722E7D }, + { 0x81DCA7E6, 0xCE170D9B }, { 0x1CB4941B, 0x0E9B8205 }, { 0x3C49D733, 0x1E712F62 }, { 0x42F9F7DC, 0x21E45CFA }, + { 0x8BBA0F60, 0xCB8E7A7F }, { 0x010FB646, 0x8E98831A }, { 0x8E895B23, 0x474CCF0D }, { 0x4FB27A95, 0xA9928558 }, + { 0x05335443, 0x8CC2B572 }, { 0x84EFF3A5, 0x42D5B8E9 }, { 0x021E718C, 0x012D1B34 }, { 0xAE74180B, 0x57A6626A }, + { 0xE3D81312, 0xFF19FC06 }, { 0x6A7C6DFE, 0x35BA9D4D }, { 0x8F86ED65, 0xC9D44C17 }, { 0xA02E5288, 0x506523E6 }, + { 0x06229389, 0x03772D5C }, { 0x0B691EC0, 0x8B01F4FE }, { 0xED825991, 0xF8DABD8A }, { 0x985B67BE, 0x4C4E3AEC }, + { 0x7FBF96A9, 0xB10DF082 }, { 0xD4F8DAE1, 0x6A69279A }, { 0xD3D5FF2E, 0xE78689DC }, { 0x1FA553D1, 0x812E1A2B }, + { 0xEBA0CA18, 0xFBAD90D6 }, { 0x34310E39, 0x1AC543B2 }, { 0x2CB97827, 0x1604F7DF }, { 0x51189F02, 0xA6241C69 }, + { 0xEAAF7C5E, 0x753513CC }, { 0xC84C4EFA, 0x64F2A59F }, { 0x489F5F5A, 0x247D2B1E }, { 0xAB474C48, 0xDB64D718 }, + { 0xF2270A40, 0x79F4A7A1 }, { 0x2A9BEBAE, 0x1573DA83 }, { 0x68621C72, 0x34978679 
}, { 0xA2302304, 0x514838D2 }, + { 0xFD72F685, 0xF0AF6537 }, { 0x3A6B44BA, 0x1D06023E }, { 0xCE6EDD73, 0x678588C3 }, { 0xCC70ACFF, 0x66A893F7 }, + { 0xB5EDA9DF, 0xD4D24E29 }, { 0x70EA6A6C, 0x38563214 }, { 0x0E5A4A83, 0x07C3418C }, { 0x5635BACD, 0x2BCBB22F }, + { 0x0878D90A, 0x04B46CD0 }, { 0x0C443B0F, 0x06EE5AB8 }, { 0x76C8F9E5, 0x3B211F48 }, { 0x12EEDE98, 0x0958C389 }, + { 0xBF8B0159, 0xD14B39CD }, { 0x72F41BE0, 0x397B2920 }, { 0x13E168DE, 0x87C04093 }, { 0x47CAA39F, 0xAD26E988 }, + { 0x9C6785BB, 0x4E140C84 }, { 0xB7F3D853, 0xD5FF551D }, { 0x5D5CA40D, 0xA0CA46D1 }, { 0x87FE346F, 0xCD6020C7 }, + { 0x15C3FB57, 0x84B76DCF }, { 0xA121E4CE, 0xDEFDA0FC }, { 0x96012D3D, 0x4B8D7B60 }, { 0x298A2C64, 0x9AC642AD }, + { 0x10F0AF14, 0x0875D8BD }, { 0x7B8374AC, 0xB357C6EA }, { 0x9A451632, 0x4D6321D8 }, { 0xC719B23F, 0xEDA96709 }, + { 0xF328BC06, 0xF76C24BB }, { 0x912C08F2, 0xC662D526 }, { 0x7892B366, 0x3CE25EC4 }, { 0x6F4F39BD, 0xB978283F }, + { 0x9D6833FD, 0xC08C8F9E }, { 0x9E79F437, 0x4F3917B0 }, { 0xB2C08C10, 0x593DE06F }, { 0xB1D14BDA, 0xD6887841 }, + { 0x32139DB0, 0x19B26EEE }, { 0x75D93E2F, 0xB4948766 }, { 0x1987C058, 0x82593777 }, { 0x3D466175, 0x90E9AC78 }, + { 0xFF6C8709, 0xF1827E03 }, { 0x353EB87F, 0x945DC0A8 }, { 0x8AB5B926, 0x4516F965 }, { 0x7EB020EF, 0x3F957398 }, + { 0x6D514831, 0xB855330B }, { 0x542BCB41, 0x2AE6A91B }, { 0xC6160479, 0x6331E413 }, { 0x80D311A0, 0x408F8E81 }, + { 0xC325503A, 0xEFF35161 }, { 0xBD9570D5, 0xD06622F9 }, { 0x0D4B8D49, 0x8876D9A2 }, { 0x573A0C8B, 0xA5533135 }, + { 0xDF91C421, 0xE168D364 }, { 0xF50A2F8F, 0xF41B09E7 }, { 0x24C1A12D, 0x12B09B0F }, { 0xA9593DC4, 0xDA49CC2C }, + { 0x3E57A6BF, 0x1F5C3456 }, { 0xA8568B82, 0x54D14F36 }, { 0x43F6419A, 0xAF7CDFE0 }, { 0xC943F8BC, 0xEA6A2685 }, + { 0xD7E91D2B, 0xE5DCBFB4 }, { 0x799D0520, 0xB27ADDDE }, { 0xD6E6AB6D, 0x6B443CAE }, { 0xF61BE845, 0x7BAE91C9 }, + { 0x7CAE5163, 0x3EB868AC }, { 0x22E332A4, 0x11C7B653 }, { 0xB9A992D0, 0xD23C1491 }, { 0x0311C7CA, 0x8FB5982E }, + { 0xE0C9D4D8, 0x70AC6428 }, { 0x0F55FCC5, 0x895BC296 }, { 0xEC8DEFD7, 0x76423E90 }, { 0xDE9E7267, 0x6FF0507E }, + { 0x7A8CC2EA, 0x3DCF45F0 }, { 0x941F5CB1, 0x4AA06054 }, { 0xB0DEFD9C, 0x5810FB5B }, { 0xBC9AC693, 0x5EFEA1E3 }, + { 0xDC8003EB, 0x6EDD4B4A }, { 0xE8B10DD2, 0x741808F8 }, { 0x28859A22, 0x145EC1B7 }, { 0x50172944, 0x28BC9F73 }, + { 0x4EBDCCD3, 0x270A0642 }, { 0x331C2BF6, 0x972AEDF4 }, { 0x0A66A886, 0x059977E4 }, { 0x4A812ED6, 0x2550302A }, + { 0xA7037747, 0xDD8A8DA0 }, { 0x970E9B7B, 0xC515F87A }, { 0x601AC578, 0x3023EAA9 }, { 0x73FBADA6, 0xB7E3AA3A }, + { 0x1EAAE597, 0x0FB69931 }, { 0x00000000, 0x00000000 }, { 0x6204B4F4, 0x310EF19D }, { 0x44DB6455, 0x229371A6 }, + { 0x1A960792, 0x0DECAF59 }, { 0xB8A62496, 0x5CA4978B }, { 0x38753536, 0x1C2B190A }, { 0x82CD602C, 0x41A295B5 }, + { 0x6426277D, 0x3279DCC1 }, { 0x9F764271, 0xC1A194AA }, { 0x26DFD0A1, 0x139D803B }, { 0x41E83016, 0xAE51C4D4 }, + { 0xAD65DFC1, 0xD813FA44 }, { 0x45D4D213, 0xAC0BF2BC }, { 0x46C515D9, 0x23BE6A92 }, { 0x923DCF38, 0x49D74D08 }, + { 0x27D066E7, 0x9D050321 }, { 0x5E4D63C7, 0x2F7FDEFF }, { 0x55247D07, 0xA47E2A01 }, { 0x2FA8BFED, 0x99B16FF1 }, + { 0x8C972AAF, 0x4661D439 }, { 0xA33F9542, 0xDFD0BBC8 }, { 0xA51D06CB, 0xDCA79694 }, { 0x7DA1E725, 0xB020EBB6 }, + { 0x696DAA34, 0xBA0F0563 }, { 0xD5F76CA7, 0xE4F1A480 }, { 0x9510EAF7, 0xC438E34E }, { 0x3B64F2FC, 0x939E8124 }, + { 0x072D25CF, 0x8DEFAE46 }, { 0x586FF04E, 0x2C08F3A3 }, { 0xB3CF3A56, 0xD7A56375 }, { 0x40E78650, 0x20C947CE }, + { 0x86F18229, 0x43F8A3DD }, { 0xAC6A6987, 0x568B795E }, { 0x1DBB225D, 0x8003011F }, { 0xF7145E03, 
0xF53612D3 }, + { 0x300DEC3C, 0x189F75DA }, { 0x3720C9F3, 0x9570DB9C }, { 0x6B73DBB8, 0xBB221E57 }, { 0xE4F536DD, 0x72F65240 }, + { 0x88ABC8AA, 0x443BE251 }, { 0xD9B357A8, 0xE21FFE38 }, { 0xE7E4F117, 0xFD43CA6E }, { 0x89A47EEC, 0xCAA3614B }, + { 0xE1C6629E, 0xFE34E732 }, { 0x1B99B1D4, 0x83742C43 }, { 0x83C2D66A, 0xCF3A16AF }, { 0x4990E91C, 0xAAE5A804 }, + { 0x4CA3BD5F, 0x26271D76 }, { 0x3F5810F9, 0x91C4B74C }, { 0xF841A2C6, 0x7C6DD045 }, { 0xFE63314F, 0x7F1AFD19 }, + { 0x8D989CE9, 0xC8F95723 }, { 0x5306EE8E, 0xA709075D }, { 0xAA48FA0E, 0x55FC5402 }, { 0x9023BEB4, 0x48FA563C }, + { 0xCA523F76, 0x65DFBEAB }, { 0xD8BCE1EE, 0x6C877D22 }, { 0x85E045E3, 0xCC4D3BF3 }, { 0x6115733E, 0xBEBB69B3 }, + { 0x20FD4328, 0x10EAAD67 }, { 0x71E5DC2A, 0xB6CEB10E }, { 0x6737E0B7, 0xBDCC44EF }, { 0xA412B08D, 0x523F158E }, + { 0x2DB6CE61, 0x989C74C5 }, { 0x2B945DE8, 0x9BEB5999 }, { 0x09776F4C, 0x8A2CEFCA }, { 0x5B7E3784, 0xA3BD6B8D }, + { 0xCB5D8930, 0xEB473DB1 }, { 0x9B4AA074, 0xC3FBA2C2 }, { 0x25CE176B, 0x9C281815 }, { 0xD0C438E4, 0x683311F2 }, + { 0xBE84B71F, 0x5FD3BAD7 }, { 0xE5FA809B, 0xFC6ED15A }, { 0x6C5EFE77, 0x36CDB011 }, { 0x520958C8, 0x29918447 }, + { 0x59604608, 0xA29070B9 }, { 0xA60CC101, 0x53120EBA }, { 0x74D68869, 0x3A0C047C }, { 0xD2DA4968, 0x691E0AC6 }, + { 0xE6EB4751, 0x73DB4974 }, { 0xF40599C9, 0x7A838AFD }, { 0xB4E21F99, 0x5A4ACD33 }, { 0xC03497F0, 0x6046C94F }, + { 0xD1CB8EA2, 0xE6AB92E8 }, { 0x663856F1, 0x3354C7F5 }, { 0xAF7BAE4D, 0xD93EE170 }, { 0xC22AE67C, 0x616BD27B }, + { 0x397A8370, 0x92B39A10 }, { 0x4B8E9890, 0xABC8B330 }, { 0x630B02B2, 0xBF967287 }, { 0xB6FC6E15, 0x5B67D607 }, +}; + +__device__ uint2 T72[256] = { + { 0xCE553FE6, 0xD031C397 }, { 0xB006B525, 0x16BA5B01 }, { 0x296E70C8, 0xA89BADE6 }, { 0x77D3435B, 0x6A1F525D }, + { 0x573DFA0B, 0x6E103570 }, { 0x17FC95AB, 0x660EFB2A }, { 0x97634BF6, 0x76327A9E }, { 0x62458BF5, 0x4BAD9D64 }, + { 0xDBC3F748, 0xF1830CAE }, { 0x669131FF, 0xC5C8F542 }, { 0xDC48B0CB, 0x95044A1C }, { 0x3CF8B866, 0x892962DF }, + { 0xE930C135, 0xB0B9E208 }, { 0x611A767C, 0xA14FB3F0 }, { 0x1C160136, 0x8D2605F2 }, { 0xFECC549E, 0xD6B71922 }, + { 0xA5907D8B, 0x37089438 }, { 0x5803D49C, 0x0B5DA38E }, { 0xEA6F3CBC, 0x5A5BCC9C }, { 0x3B73FFE5, 0xEDAE246D }, + { 0xDE22EDCE, 0xD2B87E0F }, { 0xCA8185EC, 0x5E54ABB1 }, { 0xE80561B9, 0x1DE7F88F }, { 0x0135A08C, 0xAD5E1A87 }, + { 0x65CECC76, 0x2F2ADBD6 }, { 0x82F58358, 0x5780B5A7 }, { 0xEDE47B3F, 0x3EDC8A2E }, { 0x06BEE70F, 0xC9D95C35 }, + { 0x6C4E05EE, 0x83BE111D }, { 0x59367410, 0xA603B909 }, { 0x809FDE5D, 0x103C81B4 }, { 0x7D0C774A, 0x2C69B602 }, + { 0xD5C87953, 0x399080D7 }, { 0x487406B4, 0x09D41E16 }, { 0x26505E5F, 0xCDD63B18 }, { 0x9B0298E8, 0xF99DC2F4 }, + { 0x943CB67F, 0x9CD0540A }, { 0x891F17C5, 0xBCA84B7F }, { 0xB78DF2A6, 0x723D1DB3 }, { 0xE73B4F2E, 0x78AA6E71 }, + { 0xA071670D, 0x1433E699 }, { 0x54620782, 0x84F21BE4 }, { 0xB4D20F2F, 0x98DF3327 }, { 0xD3769E5C, 0xF049DCE2 }, + { 0x9656EB7A, 0xDB6C6019 }, { 0x078B4783, 0x648746B2 }, { 0x8DCBADCF, 0x32CD2359 }, { 0xF0C7DA85, 0x1EA4955B }, + { 0x1B9D46B5, 0xE9A14340 }, { 0xBBEC21B8, 0xFD92A5D9 }, { 0x0E0B8E1B, 0xC8138C79 }, { 0x6D7BA562, 0x2EE00B9A }, + { 0x93B7F1FC, 0xF85712B8 }, { 0x0BEA949D, 0xEB28FED8 }, { 0x8A40EA4C, 0x564A65EB }, { 0x474A2823, 0x6C9988E8 }, + { 0x121D8F2D, 0x4535898B }, { 0x31ACCBF4, 0xABD8C032 }, { 0xB9867CBD, 0xBA2E91CA }, { 0xEF8E263A, 0x7960BE3D }, + { 0x602FD6F0, 0x0C11A977 }, { 0x16C93527, 0xCB50E1AD }, { 0x035FFD89, 0xEAE22E94 }, { 0x5DE2CE1A, 0x2866D12F }, + { 0xAB9BF390, 0xFF1B1841 }, { 0x8CFE0D43, 0x9F9339DE }, { 0xC48A0BF7, 0x964727C8 
}, { 0xAAAE531C, 0x524502C6 }, + { 0xAC10B413, 0x9B9C5EF3 }, { 0x42AB32A5, 0x4FA2FA49 }, { 0xE551122B, 0x3F165A62 }, { 0x76E6E3D7, 0xC74148DA }, + { 0xE464B2A7, 0x924840E5 }, { 0xD69784DA, 0xD372AE43 }, { 0x05E11A86, 0x233B72A1 }, { 0x4941A638, 0xA48A0491 }, + { 0xC9DE7865, 0xB4B68525 }, { 0xA6CF8002, 0xDDEABAAC }, { 0x50B6BD88, 0x0A9773C2 }, { 0x5EBD3393, 0xC284FFBB }, + { 0x2C8F6A4E, 0x8BA0DF47 }, { 0x4D951C32, 0x2AEF6CB7 }, { 0x2A318D41, 0x42798372 }, { 0xBF389BB2, 0x73F7CDFF }, + { 0x382C026C, 0x074C0AF9 }, { 0x243A035A, 0x8A6A0F0B }, { 0x5F88931F, 0x6FDAE53C }, { 0x7E538AC3, 0xC68B9896 }, + { 0x1AA8E639, 0x44FF59C7 }, { 0x439E9229, 0xE2FCE0CE }, { 0x79D8CD40, 0xA20CDE24 }, { 0xC8EBD8E9, 0x19E89FA2 }, + { 0xF398270C, 0xF446BBCF }, { 0x2284E455, 0x43B3533E }, { 0x8E945046, 0xD82F0DCD }, { 0xB26CE820, 0x51066F12 }, + { 0x6BC5426D, 0xE73957AF }, { 0x40C16FA0, 0x081ECE5A }, { 0xC5BFAB7B, 0x3B193D4F }, { 0xDF174D42, 0x7FE66488 }, + { 0x705804D8, 0x0E9814EF }, { 0x7C39D7C6, 0x8137AC85 }, { 0xE185A821, 0xB1733244 }, { 0x6F11F867, 0x695C3F89 }, + { 0xE3EFF524, 0xF6CF0657 }, { 0xD02963D5, 0x1AABF276 }, { 0x75B91E5E, 0x2DA3664E }, { 0x1077D228, 0x0289BD98 }, + { 0xF413608F, 0x90C1FD7D }, { 0xFD93A917, 0x3C5537B6 }, { 0x3919A2E0, 0xAA12107E }, { 0x30996B78, 0x0686DAB5 }, + { 0x9EE3826E, 0xDAA6B055 }, { 0x56085A87, 0xC34E2FF7 }, { 0x4FFF4137, 0x6D5358A4 }, { 0xB35948AC, 0xFC587595 }, + { 0xC7D5F67E, 0x7CA5095C }, { 0x8B754AC0, 0xFB147F6C }, { 0x91DDACF9, 0xBFEB26AB }, { 0x67A49173, 0x6896EFC5 }, + { 0x1E7C5C33, 0xCA9A31E1 }, { 0xB13315A9, 0xBBE44186 }, { 0x689ABFE4, 0x0DDB793B }, { 0xA7FA208E, 0x70B4A02B }, + { 0x7307F951, 0xE47A3A7B }, { 0x14A36822, 0x8CECD5BE }, { 0x23B144D9, 0xEEED49B9 }, { 0xB8B3DC31, 0x17708B4D }, + { 0x2765FED3, 0x6088219F }, { 0xF1F27A09, 0xB3FA8FDC }, { 0xFCA6099B, 0x910B2D31 }, { 0x78ED6DCC, 0x0F52C4A3 }, + { 0xBAD98134, 0x50CCBF5E }, { 0x7F662A4F, 0x6BD58211 }, { 0xD4FDD9DF, 0x94CE9A50 }, { 0x45207526, 0x2B25BCFB }, + { 0x1F49FCBF, 0x67C42B66 }, { 0x723259DD, 0x492420FC }, { 0x18C2BB3C, 0x03436DD4 }, { 0xF872B391, 0x1F6E4517 }, + { 0x69AF1F68, 0xA08563BC }, { 0xEEBB86B6, 0xD43EA4BA }, { 0x08B56914, 0x01CAD04C }, { 0x0980C998, 0xAC94CACB }, + { 0x9A373864, 0x54C3D873 }, { 0x2DBACAC2, 0x26FEC5C0 }, { 0xBE0D3B3E, 0xDEA9D778 }, { 0x20EEB950, 0x040F672D }, + { 0x7BB29045, 0xE5B0EA37 }, { 0xCBB42560, 0xF30AB136 }, { 0x37122CFB, 0x62019C07 }, { 0x13282FA1, 0xE86B930C }, + { 0x2EE5374B, 0xCC1CEB54 }, { 0xA21B3A08, 0x538FD28A }, { 0xD89C0AC1, 0x1B61223A }, { 0xAD25149F, 0x36C24474 }, + { 0xF74C9D06, 0x7A23D3E9 }, { 0x9968C5ED, 0xBE21F6E7 }, { 0x36278C77, 0xCF5F8680 }, { 0xEB5A9C30, 0xF705D61B }, + { 0x52DCE08D, 0x4D2B47D1 }, { 0xC234ECF8, 0x5F9E7BFD }, { 0x3DCD18EA, 0x24777858 }, { 0x4415D5AA, 0x867BA67C }, + { 0x5A698999, 0x4CE1979D }, { 0x00000000, 0x00000000 }, { 0x33C696F1, 0xEC64F421 }, { 0xC16B1171, 0xB57C5569 }, + { 0x467F88AF, 0xC1C7926F }, { 0x0F3E2E97, 0x654D96FE }, { 0xA8C40E19, 0x15F936D5 }, { 0xA9F1AE95, 0xB8A72C52 }, + { 0x21DB19DC, 0xA9517DAA }, { 0xFA18EE94, 0x58D27104 }, { 0xF2AD8780, 0x5918A148 }, { 0xDAF657C4, 0x5CDD1629 }, + { 0x64FB6CFA, 0x8274C151 }, { 0xC6E056F2, 0xD1FB13DB }, { 0xCF609F6A, 0x7D6FD910 }, { 0xD9A9AA4D, 0xB63F38BD }, + { 0xF526C003, 0x3D9FE7FA }, { 0x871499DE, 0x74BBC706 }, { 0xB6B8522A, 0xDF630734 }, { 0xCD0AC26F, 0x3AD3ED03 }, + { 0x83C023D4, 0xFADEAF20 }, { 0x4ECAE1BB, 0xC00D4223 }, { 0x5CD76E96, 0x8538CBA8 }, { 0x6E2458EB, 0xC402250E }, + { 0x026A5D05, 0x47BC3413 }, { 0x114272A4, 0xAFD7A71F }, { 0xCC3F62E3, 0x978DF784 }, { 0xA144C781, 
0xB96DFC1E }, + { 0x1596C8AE, 0x21B2CF39 }, { 0x950916F3, 0x318E4E8D }, { 0x3E92E563, 0xCE9556CC }, { 0xDD7D1047, 0x385A509B }, + { 0xB5E7AFA3, 0x358129A0 }, { 0x63702B79, 0xE6F387E3 }, { 0x53E94001, 0xE0755D56 }, { 0xFFF9F412, 0x7BE903A5 }, + { 0x90E80C75, 0x12B53C2C }, { 0x857EC4DB, 0x3307F315 }, { 0x0C61D31E, 0x8FAFB86A }, { 0x86213952, 0xD9E5DD81 }, + { 0x9FD622E2, 0x77F8AAD2 }, { 0x357871FE, 0x25BDA814 }, { 0x8FA1F0CA, 0x7571174A }, { 0x985D6561, 0x137FEC60 }, + { 0x9DBC7FE7, 0x30449EC1 }, { 0x41F4CF2C, 0xA540D4DD }, { 0xAE7AE916, 0xDC206AE0 }, { 0xE2DA55A8, 0x5B911CD0 }, + { 0xF947131D, 0xB2305F90 }, { 0xBD52C6B7, 0x344BF9EC }, { 0xD2433ED0, 0x5D17C665 }, { 0xC05EB1FD, 0x18224FEE }, + { 0x844B6457, 0x9E59E992 }, { 0xA4A5DD07, 0x9A568EBF }, { 0x716DA454, 0xA3C60E68 }, { 0xD7A22456, 0x7E2CB4C4 }, + { 0x4CA0BCBE, 0x87B17630 }, { 0x32F3367D, 0x413AEEA6 }, { 0xBC67663B, 0x9915E36B }, { 0x3A465F69, 0x40F03EEA }, + { 0xE0B008AD, 0x1C2D28C3 }, { 0x4A1E5BB1, 0x4E682A05 }, { 0x285BD044, 0x05C5B761 }, { 0x5B5C2915, 0xE1BF8D1A }, + { 0xC3014C74, 0xF2C0617A }, { 0xD11CC359, 0xB7F5E8F1 }, { 0x3FA745EF, 0x63CB4C4B }, { 0x9C89DF6B, 0x9D1A8446 }, + { 0x4B2BFB3D, 0xE3363082 }, { 0xE60EEFA2, 0xD5F474F6 }, { 0xFB2D4E18, 0xF58C6B83 }, { 0x0ADF3411, 0x4676E45F }, + { 0x1D23A1BA, 0x20781F75 }, { 0x81AA7ED1, 0xBD629B33 }, { 0x19F71BB0, 0xAE1D7753 }, { 0xA32E9A84, 0xFED1C80D }, + { 0x92825170, 0x5509083F }, { 0x5557A70E, 0x29AC0163 }, { 0x51831D04, 0xA7C96945 }, { 0x04D4BA0A, 0x8E656826 }, + { 0x882AB749, 0x11F651F8 }, { 0xF6793D8A, 0xD77DC96E }, { 0x2B042DCD, 0xEF2799F5 }, { 0x7A8730C9, 0x48EEF0B0 }, + { 0x0D547392, 0x22F1A2ED }, { 0x2FD097C7, 0x6142F1D3 }, { 0x6AF0E2E1, 0x4A674D28 }, { 0x748CBED2, 0x80FD7CC9 }, + { 0xAF4F499A, 0x717E7067 }, { 0xECD1DBB3, 0x938290A9 }, { 0x344DD172, 0x88E3B293 }, { 0x250FA3D6, 0x2734158C }, +}; + +// KeySchedule +__constant__ const uint64_t CC[12][8] = { +{ 0xe9daca1eda5b08b1, 0x1f7c65c0812fcbeb, 0x16d0452e43766a2f, 0xfcc485758db84e71, 0x0169679291e07c4b, 0x15d360a4082a42a2, 0x234d74cc36747605, 0x0745a6f2596580dd }, +{ 0x1a2f9da98ab5a36f, 0xd7b5700f469de34f, 0x982b230a72eafef3, 0x3101b5160f5ed561, 0x5899d6126b17b59a, 0xcaa70adbc261b55c, 0x56cdcbd71ba2dd55, 0xb79bb121700479e6 }, +{ 0xc72fce2bacdc74f5, 0x35843d6a28fc390a, 0x8b1f9c525f5ef106, 0x7b7b29b11475eaf2, 0xb19e3590e40fe2d3, 0x09db6260373ac9c1, 0x31db7a8643f4b6c2, 0xb20aba0af5961e99 }, +{ 0xd26615e8b3df1fef, 0xdde4715da0e148f9, 0x7d3c5c337e858e48, 0x3f355e68ad1c729d, 0x75d603ed822cd7a9, 0xbe0352933313b7d8, 0xf137e893a1ea5334, 0x2ed1e384bcbe0c22 }, +{ 0x994747adac6bea4b, 0x6323a96c0c413f9a, 0x4a1086161f1c157f, 0xbdff0f80d7359e35, 0xa3f53a254717cdbf, 0x161a2723b700ffdf, 0xf563eaa97ea2567a, 0x57fe6c7cfd581760 }, +{ 0xd9d33a1daeae4fae, 0xc039307a3bc3a46f, 0x6ca44251f9c4662d, 0xc68ef09ab49a7f18, 0xb4b79a1cb7a6facf, 0xb6c6bec2661ff20a, 0x354f903672c571bf, 0x6e7d64467a4068fa }, +{ 0xecc5aaee160ec7f4, 0x540924bffe86ac51, 0xc987bfe6c7c69e39, 0xc9937a19333e47d3, 0x372c822dc5ab9209, 0x04054a2883694706, 0xf34a3ca24c451735, 0x93d4143a4d568688 }, +{ 0xa7c9934d425b1f9b, 0x41416e0c02aae703, 0x1ede369c71f8b74e, 0x9ac4db4d3b44b489, 0x90069b92cb2b89f4, 0x2fc4a5d12b8dd169, 0xd9a8515935c2ac36, 0x1ee702bfd40d7fa4 }, +{ 0x9b223116545a8f37, 0xde5f16ecd89a4c94, 0x244289251b3a7d3a, 0x84090de0b755d93c, 0xb1ceb2db0b440a80, 0x549c07a69a8a2b7b, 0x602a1fcb92dc380e, 0xdb5a238351446172 }, +{ 0x526f0580a6debeab, 0xf3f3e4b248e52a38, 0xdb788aff1ce74189, 0x0361331b8ae1ff1f, 0x4b3369af0267e79f, 0xf452763b306c1e7a, 0xc3b63b15d1fa9836, 0xed9c4598fbc7b474 }, 
+{ 0xfb89c8efd09ecd7b, 0x94fe5a63cdc60230, 0x6107abebbb6bfad8, 0x7966841421800120, 0xcab948eaef711d8a, 0x986e477d1dcdbaef, 0x5dd86fc04a59a2de, 0x1b2df381cda4ca6b }, +{ 0xba3116f167e78e37, 0x7ab14904b08013d2, 0x771ddfbc323ca4cd, 0x9b9f2130d41220f8, 0x86cc91189def805d, 0x5228e188aaa41de7, 0x991bb2d9d517f4fa, 0x20d71bf14a92bc48 } +}; + +__constant__ const uint64_t precomputed_values[12][8] = { + 0x8FD72F640708B0D0, 0x0DE874C7EBC3F213, 0xE92EEF3AD202E9E0, 0xC1E9DA0708013DA7, 0x9727DAB2F014BE88, 0x103051A02BCD6935, 0x33EC7E1DBD28F736, 0x1ECF460CF78AD1F4, + 0x0B2D9F89C775449D, 0x6B6EEFC6DAB7E8B0, 0xF1A0D31667F6EC44, 0x2A71132D5E108166, 0x0E9357C2EC87931A, 0xC99F5C1B4A01612D, 0x7E60B16E637D4EE2, 0xA9FCB827F9BA6D81, + 0x231FECA5AB3D285C, 0x70C6E1483C838C3B, 0x9C21C3C40CE4E2DA, 0x2FA796BD5688E573, 0x04C0E3FF55809FDF, 0x5FF978BFB8E3CDC8, 0xC54A19D6A3D07033, 0x0FCA83FDDE872478, + 0xBDF9312726339F10, 0x51A5BA1793BC9C56, 0xC4428DA14F96D2D4, 0xEC925222374EAB1F, 0x79477893747DD92F, 0xC495E19A46886304, 0x9C23F893BA7CFA36, 0x0C47268881FC5FEB, + 0xCF117966029B2CB3, 0x07179ABE77088A8F, 0x671EF4CC2650E257, 0x7474B8B170DAB5C6, 0x4224FEBECF35113E, 0x993D156C675C5537, 0x2DEE3A5782C39B45, 0xE7C586F2990DD385, + 0x8608FD95B1C1138A, 0x8BB0847D9E9849AC, 0x5E76623F4F0EB0C7, 0x34C2BDBAFC5060CE, 0xE9E814475907826C, 0x22C9ED94D6AAC7C9, 0xE6B75E28171EB0D6, 0xF1329E5534E60215, + 0x86BB4814B1C3CE52, 0xE8F226C9FBDDD017, 0xCEDED67991CB3087, 0x76C33E32FDBFACA5, 0xDBB13BE1A9F7474C, 0x3D0273470342C356, 0x8E7246C51CF07F61, 0xAC8C125DDEF8DF71, + 0x6D73E747795B8CF3, 0x4E4AA65EA0072050, 0xA14A1582CB43C2B9, 0x748EF2B7BB63B938, 0x126789534410D7D4, 0xD4D48FF40301D791, 0xC67DFBE315C41FC0, 0x35E7A1A1AF88601C, + 0x9BD33EA0FAB34007, 0xF51B7CDBE3D67D25, 0xD3ABDA0CE4186E6B, 0x8E61DDADCBCE1706, 0x58994565B41BE6A5, 0x7A87ABC1240CD31D, 0xFAFE6C28487968D0, 0x15B368609FF9EEA7, + 0xAE33263CCF115818, 0x93B2DBE9CADFCFC8, 0x0A91952BF91B0147, 0x458E67CA5F1ED73A, 0x94C2E5F288F074E3, 0x377895E85C69E996, 0xF11A4456AAB37B10, 0x163131934816821A, + 0xD07E4A2366BF469D, 0x5EF1A3D220213B6C, 0x3C5BB78971D8ED0F, 0x0DE05E6B9006F2D2, 0xC58CFB00B8EAA1C9, 0xEFCDB54D1F250B76, 0xFD135634FA527042, 0x4CEE791290516407, + 0xD800B9264010790F, 0x974C4823E2B668D7, 0xA605A4B385C5E361, 0x3F6C92DA5A56D8D2, 0x82B9D67C12EF8277, 0x0AB6B4582561BF90, 0x46954FD98FC2CBA3, 0x70BE45CB21B6760D +}; diff --git a/x11/timetravel.cu b/x11/timetravel.cu new file mode 100644 index 0000000000..8d157f2962 --- /dev/null +++ b/x11/timetravel.cu @@ -0,0 +1,497 @@ +/** + * Timetravel CUDA implementation + * by tpruvot@github - March 2017 + */ + +#include +#include +#include + +#define HASH_FUNC_BASE_TIMESTAMP 1389040865U // Machinecoin Genesis Timestamp +#define HASH_FUNC_COUNT 8 +#define HASH_FUNC_COUNT_PERMUTATIONS 40320U + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; + +enum Algo { + BLAKE = 0, + BMW, + GROESTL, + SKEIN, + JH, + KECCAK, + LUFFA, + CUBEHASH, + MAX_ALGOS_COUNT +}; + +static const char* algo_strings[] = { + "blake", + "bmw512", + "groestl", + "skein", + "jh512", + "keccak", + "luffa", + "cube", + NULL +}; + +inline void swap8(uint8_t *a, uint8_t *b) +{ + uint8_t t = *a; + *a = *b; + *b = t; +} + +inline void initPerm(uint8_t n[], int count) +{ + for (int i = 0; i < count; i++) + n[i] 
= i; +} + +static int nextPerm(uint8_t n[], int count) +{ + int tail, i, j; + + if (count <= 1) + return 0; + + for (i = count - 1; i>0 && n[i - 1] >= n[i]; i--); + tail = i; + + if (tail > 0) { + for (j = count - 1; j>tail && n[j] <= n[tail - 1]; j--); + swap8(&n[tail - 1], &n[j]); + } + + for (i = tail, j = count - 1; i= 10) + sprintf(sptr, "%c", 'A' + (algoList[j] - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoList[j]); + sptr++; + } + *sptr = '\0'; +} + +static __thread uint32_t s_ntime = 0; +static uint32_t s_sequence = UINT32_MAX; +static uint8_t s_firstalgo = 0xFF; +static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 }; + +#define INITIAL_DATE HASH_FUNC_BASE_TIMESTAMP +static inline uint32_t getCurrentAlgoSeq(uint32_t ntime) +{ + // unlike x11evo, the permutation changes often (with ntime) + return (uint32_t) (ntime - INITIAL_DATE) % HASH_FUNC_COUNT_PERMUTATIONS; +} + +// To finish... +static void get_travel_order(uint32_t ntime, char *permstr) +{ + uint32_t seq = getCurrentAlgoSeq(ntime); + if (s_sequence != seq) { + getAlgoString(permstr, seq); + s_sequence = seq; + } +} + +// CPU Hash +extern "C" void timetravel_hash(void *output, const void *input) +{ + uint32_t _ALIGN(64) hash[64/4] = { 0 }; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_luffa512_context ctx_luffa1; + sph_cubehash512_context ctx_cubehash1; + + if (s_sequence == UINT32_MAX) { + uint32_t *data = (uint32_t*) input; + const uint32_t ntime = (opt_benchmark || !data[17]) ? (uint32_t) time(NULL) : data[17]; + get_travel_order(ntime, hashOrder); + } + + void *in = (void*) input; + int size = 80; + + const int hashes = (int) strlen(hashOrder); + + for (int i = 0; i < hashes; i++) + { + const char elem = hashOrder[i]; + uint8_t algo = elem >= 'A' ? 
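+	/* hashOrder holds one character per round: digits '0'..'9' map to
+	   algo ids 0..9 and letters from 'A' upward map to ids >= 10, the
+	   same encoding getAlgoString() produces. With HASH_FUNC_COUNT == 8
+	   only the digit range ever occurs for timetravel. */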
elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa1); + sph_luffa512(&ctx_luffa1, in, size); + sph_luffa512_close(&ctx_luffa1, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash1); + sph_cubehash512(&ctx_cubehash1, in, size); + sph_cubehash512_close(&ctx_cubehash1, hash); + break; + } + + in = (void*) hash; + size = 64; + } + + memcpy(output, hash, 32); +} + +static uint32_t get_next_time(uint32_t ntime, char* curOrder) +{ + char nextOrder[HASH_FUNC_COUNT + 1] = { 0 }; + uint32_t secs = 15; + do { + uint32_t nseq = getCurrentAlgoSeq(ntime+secs); + getAlgoString(nextOrder, nseq); + secs += 15; + } while (curOrder[0] == nextOrder[0]); + return secs; +} + +//#define _DEBUG +#define _DEBUG_PREFIX "tt-" +#include "cuda_debug.cuh" + +void quark_bmw512_cpu_setBlock_80(void *pdata); +void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +void groestl512_setBlock_80(int thr_id, uint32_t *endiandata); +void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void skein512_cpu_setBlock_80(void *pdata); +void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); + +void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); +void qubit_luffa512_cpu_setBlock_80(void *pdata); +void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +void jh512_setBlock_80(int thr_id, uint32_t *endiandata); +void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void keccak512_setBlock_80(int thr_id, uint32_t *endiandata); +void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 
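+	/* default scan width is 1 << intensity nonces per GPU launch:
+	   2^20 on SM 5.x+ outside Windows, otherwise 2^19 (= 256*256*8);
+	   cuda_default_throughput() applies a user-set intensity when given. */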
20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + // if (opt_benchmark) pdata[17] = swab32(0x5886a4be); // TO DEBUG GROESTL 80 + + if (opt_debug || s_ntime != pdata[17] || s_sequence == UINT32_MAX) { + uint32_t ntime = swab32(work->data[17]); + get_travel_order(ntime, hashOrder); + s_ntime = pdata[17]; + if (opt_debug && !thr_id) { + applog(LOG_DEBUG, "timetravel hash order %s (%08x)", hashOrder, ntime); + } + } + + if (opt_benchmark) + ptarget[7] = 0x5; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes) + x11_luffa512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + cuda_check_cpu_setTarget(ptarget); + + const int hashes = (int) strlen(hashOrder); + const char first = hashOrder[0]; + const uint8_t algo80 = first >= 'A' ? first - 'A' + 10 : first - '0'; + if (algo80 != s_firstalgo) { + s_firstalgo = algo80; + applog(LOG_INFO, "Timetravel first algo is now %s", algo_strings[algo80 % HASH_FUNC_COUNT]); + } + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + default: { + uint32_t next = get_next_time(swab32(s_ntime), hashOrder); + if (!thr_id) + applog(LOG_WARNING, "kernel %c unimplemented, next in %u mn", first, next/60); + sleep(next > 30 ? 
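+			/* defensive path: every entry of enum Algo has an 80-byte
+			   setBlock/kernel wired above, so this branch should not
+			   trigger; if it ever does, wait until get_next_time()
+			   (which probes ntime forward in 15 s steps) reports a
+			   different leading algorithm. */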
60 : 10); + return -1; + } + } + + do { + int order = 0; + + // Hash with CUDA + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake80:"); + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw80 :"); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("grstl80:"); + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + TRACE("skein80:"); + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("jh51280:"); + break; + case KECCAK: + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("kecck80:"); + break; + case LUFFA: + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("luffa80:"); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("cube 80:"); + break; + } + + for (int i = 1; i < hashes; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("cube :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + be32enc(&endiandata[19], work->nonces[0]); + timetravel_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + pdata[19] = work->nonces[0]; + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + timetravel_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + } + pdata[19] = max(pdata[19], work->nonces[1]) + 1; + } + return work->valid_nonces; + } else if (vhash[7] > Htarg) { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + 
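+		/* nothing found in this window: slide the nonce range forward by
+		   `throughput` and keep scanning until max_nonce is reached or the
+		   stratum thread flags a work restart. */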
pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_timetravel(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/veltor.cu b/x11/veltor.cu new file mode 100644 index 0000000000..7bc1e18dab --- /dev/null +++ b/x11/veltor.cu @@ -0,0 +1,197 @@ +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_shavite.h" +#include "sph/sph_shabal.h" +#include "sph/sph_streebog.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" + +extern void skein512_cpu_setBlock_80(void *pdata); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +// for SM3.x +extern void streebog_sm3_set_target(uint32_t* ptarget); +extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); + +// for latest cards only +extern void skunk_cpu_init(int thr_id, uint32_t threads); +extern void skunk_streebog_set_target(uint32_t* ptarget); +extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); + +#include +#include + +#define NBN 2 +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; + +// veltor CPU Hash +extern "C" void veltorhash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[128] = { 0 }; + + sph_skein512_context ctx_skein; + sph_gost512_context ctx_gost; + sph_shabal512_context ctx_shabal; + sph_shavite512_context ctx_shavite; + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, (void*) hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, (void*) hash); + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, (const void*) hash, 64); + sph_shabal512_close(&ctx_shabal, (void*) hash); + + sph_gost512_init(&ctx_gost); + sph_gost512(&ctx_gost, (const void*) hash, 64); + sph_gost512_close(&ctx_gost, (void*) hash); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + int dev_id = device_map[thr_id]; + + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] > 500) ? 
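+	/* 2^20 nonces per launch on devices above SM 5.0, else 2^18, and
+	   bumped to 2^21 below when the device name reports a GTX 10xx board;
+	   cuda_default_throughput() may still apply a user-set intensity. */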
20 : 18; + if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0xf; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + skunk_cpu_init(thr_id, throughput); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + + x11_shavite512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1); + + init[thr_id] = true; + } + + uint32_t _ALIGN(64) h_resNonce[NBN]; + uint32_t _ALIGN(64) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + skein512_cpu_setBlock_80(endiandata); + + cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); + if(use_compat_kernels[thr_id]) + streebog_sm3_set_target(ptarget); + else + skunk_streebog_set_target(ptarget); + + do { + int order = 0; + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if(use_compat_kernels[thr_id]) + streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + else + skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + + cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (h_resNonce[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + const uint32_t startNounce = pdata[19]; + + be32enc(&endiandata[19], startNounce + h_resNonce[0]); + veltorhash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + work->nonces[0] = startNounce + h_resNonce[0]; + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (h_resNonce[1] != UINT32_MAX) + { + uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[1]; + be32enc(&endiandata[19], secNonce); + veltorhash(vhash, endiandata); + work->nonces[1] = secNonce; + if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) { + work_set_target_ratio(work, vhash); + xchg(work->nonces[1], work->nonces[0]); + } else { + bn_set_target_ratio(work, vhash, work->valid_nonces); + } + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[0]); + cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); + pdata[19] = startNounce + h_resNonce[0] + 1; + continue; + } + } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void 
free_veltor(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/x11.cu b/x11/x11.cu index 5b22e4f20b..a7f1b601fa 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -1,261 +1,233 @@ - -extern "C" -{ +extern "C" { #include "sph/sph_blake.h" #include "sph/sph_bmw.h" #include "sph/sph_groestl.h" #include "sph/sph_skein.h" #include "sph/sph_jh.h" #include "sph/sph_keccak.h" - #include "sph/sph_luffa.h" #include "sph/sph_cubehash.h" #include "sph/sph_shavite.h" #include "sph/sph_simd.h" #include "sph/sph_echo.h" - -#include "miner.h" } -#include - -// aus cpu-miner.c -extern int device_map[8]; - -// Speicher für Input/Output der verketteten Hashfunktionen -static uint32_t *d_hash[8]; - -extern void quark_blake512_cpu_init(int thr_id, int threads); -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_bmw512_cpu_init(int thr_id, int threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_groestl512_cpu_init(int thr_id, int threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_skein512_cpu_init(int thr_id, int threads); -extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_keccak512_cpu_init(int thr_id, int threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_jh512_cpu_init(int thr_id, int threads); -extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x11_luffa512_cpu_init(int thr_id, int threads); -extern void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x11_cubehash512_cpu_init(int thr_id, int threads); -extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x11_shavite512_cpu_init(int thr_id, int threads); -extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x11_simd512_cpu_init(int thr_id, int threads); -extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x11_echo512_cpu_init(int thr_id, int threads); -extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" -extern void quark_check_cpu_init(int thr_id, int threads); 
-extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); +#include +#include -extern void quark_compactTest_cpu_init(int thr_id, int threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, - int order); +static uint32_t *d_hash[MAX_GPUS]; -// X11 Hashfunktion -inline void x11hash(void *state, const void *input) +// X11 CPU Hash +extern "C" void x11hash(void *output, const void *input) { - // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11 - - sph_blake512_context ctx_blake; - sph_bmw512_context ctx_bmw; - sph_groestl512_context ctx_groestl; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - sph_luffa512_context ctx_luffa; - sph_cubehash512_context ctx_cubehash; - sph_shavite512_context ctx_shavite; - sph_simd512_context ctx_simd; - sph_echo512_context ctx_echo; - - unsigned char hash[64]; - - sph_blake512_init(&ctx_blake); - // ZBLAKE; - sph_blake512 (&ctx_blake, input, 80); - sph_blake512_close(&ctx_blake, (void*) hash); - - sph_bmw512_init(&ctx_bmw); - // ZBMW; - sph_bmw512 (&ctx_bmw, (const void*) hash, 64); - sph_bmw512_close(&ctx_bmw, (void*) hash); - - sph_groestl512_init(&ctx_groestl); - // ZGROESTL; - sph_groestl512 (&ctx_groestl, (const void*) hash, 64); - sph_groestl512_close(&ctx_groestl, (void*) hash); - - sph_skein512_init(&ctx_skein); - // ZSKEIN; - sph_skein512 (&ctx_skein, (const void*) hash, 64); - sph_skein512_close(&ctx_skein, (void*) hash); - - sph_jh512_init(&ctx_jh); - // ZJH; - sph_jh512 (&ctx_jh, (const void*) hash, 64); - sph_jh512_close(&ctx_jh, (void*) hash); - - sph_keccak512_init(&ctx_keccak); - // ZKECCAK; - sph_keccak512 (&ctx_keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx_keccak, (void*) hash); - - sph_luffa512_init(&ctx_luffa); - // ZLUFFA; - sph_luffa512 (&ctx_luffa, (const void*) hash, 64); - sph_luffa512_close (&ctx_luffa, (void*) hash); - -#if 1 - sph_cubehash512_init(&ctx_cubehash); - // ZCUBEHASH; - sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); - sph_cubehash512_close(&ctx_cubehash, (void*) hash); -#endif - -#if 1 - sph_shavite512_init(&ctx_shavite); - // ZSHAVITE; - sph_shavite512 (&ctx_shavite, (const void*) hash, 64); - sph_shavite512_close(&ctx_shavite, (void*) hash); -#endif - - sph_simd512_init(&ctx_simd); - // ZSIMD - sph_simd512 (&ctx_simd, (const void*) hash, 64); - sph_simd512_close(&ctx_simd, (void*) hash); - -#if 1 - sph_echo512_init(&ctx_echo); - // ZECHO - sph_echo512 (&ctx_echo, (const void*) hash, 64); - sph_echo512_close(&ctx_echo, (void*) hash); -#endif - - memcpy(state, hash, 32); + unsigned char _ALIGN(128) hash[128] = { 0 }; + + // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11 + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*) hash); + + sph_bmw512_init(&ctx_bmw); + 
sph_bmw512 (&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, (const void*) hash, 64); + sph_luffa512_close (&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512 (&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, (void*) hash); + + sph_simd512_init(&ctx_simd); + sph_simd512 (&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512 (&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + memcpy(output, hash, 32); } +//#define _DEBUG +#define _DEBUG_PREFIX "x11" +#include "cuda_debug.cuh" -extern bool opt_benchmark; +static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 
20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x5; - const uint32_t Htarg = ptarget[7]; - - const int throughput = 256*256*8; - - static bool init[8] = {0,0,0,0,0,0,0,0}; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); - // Konstanten kopieren, Speicher belegen - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); - quark_bmw512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput); - x11_luffa512_cpu_init(thr_id, throughput); - x11_cubehash512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput) != 0) { + return 0; + } + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; } uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - quark_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); do { int order = 0; - // erstes Blake512 Hash mit CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für BMW512 + // Hash with CUDA + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake :"); quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Groestl512 + TRACE("bmw :"); quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Skein512 + TRACE("groestl:"); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für JH512 + TRACE("skein :"); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Keccak512 + TRACE("jh512 :"); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Luffa512 - x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Cubehash512 - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Shavite512 + TRACE("keccak :"); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + TRACE("luffa+c:"); x11_shavite512_cpu_hash_64(thr_id, throughput, 
pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für SIMD512 + TRACE("shavite:"); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für ECHO512 + TRACE("simd :"); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("echo => "); - // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) - { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - x11hash(vhash64, endiandata); + *hashes_done = pdata[19] - first_nonce + throughput; - if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { - - pdata[19] = foundNonce; - *hashes_done = foundNonce - first_nonce + 1; - return 1; + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x11hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x11hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } + +// cleanup +extern "C" void free_x11(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/x11evo.cu b/x11/x11evo.cu new file mode 100644 index 0000000000..53799f9ba6 --- /dev/null +++ b/x11/x11evo.cu @@ -0,0 +1,414 @@ +/** + * X11EVO algo implementation + * Cuda implementation by tpruvot@github - May 2016 + */ +#include +#include + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; + +enum Algo { + BLAKE = 0, + BMW, + GROESTL, + SKEIN, + JH, + KECCAK, + LUFFA, + CUBEHASH, + SHAVITE, + SIMD, + ECHO, + HASH_FUNC_COUNT +}; + +static void swap8(uint8_t *a, uint8_t *b) +{ + uint8_t t = *a; + *a = *b; + *b = 
t; +} + +static void initPerm(uint8_t n[], int count) +{ + for (int i = 0; i < count; i++) + n[i] = i; +} + +static int nextPerm(uint8_t n[], int count) +{ + int tail, i, j; + + if (count <= 1) + return 0; + + for (i = count - 1; i>0 && n[i - 1] >= n[i]; i--); + tail = i; + + if (tail > 0) { + for (j = count - 1; j>tail && n[j] <= n[tail - 1]; j--); + swap8(&n[tail - 1], &n[j]); + } + + for (i = tail, j = count - 1; i= 10) + sprintf(sptr, "%c", 'A' + (algoList[j] - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoList[j]); + sptr++; + } + *sptr = '\0'; + //applog(LOG_DEBUG, "nextPerm %s", str); +} + +static __thread uint32_t s_ntime = 0; +static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 }; +static int s_sequence = -1; + +#define INITIAL_DATE 0x57254700 +static inline int getCurrentAlgoSeq(uint32_t current_time) +{ + // change once per day + return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24); +} + +static void evo_twisted_code(uint32_t ntime, char *permstr) +{ + int seq = getCurrentAlgoSeq(ntime); + if (s_sequence != seq) { + getAlgoString(permstr, seq); + s_sequence = seq; + } +} + +// X11evo CPU Hash +extern "C" void x11evo_hash(void *output, const void *input) +{ + uint32_t _ALIGN(64) hash[64/4] = { 0 }; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_luffa512_context ctx_luffa1; + sph_cubehash512_context ctx_cubehash1; + sph_shavite512_context ctx_shavite1; + sph_simd512_context ctx_simd1; + sph_echo512_context ctx_echo1; + + if (s_sequence == -1) { + uint32_t *data = (uint32_t*) input; + const uint32_t ntime = data[17]; + evo_twisted_code(ntime, hashOrder); + } + + void *in = (void*) input; + int size = 80; + + const int hashes = (int) strlen(hashOrder); + + for (int i = 0; i < hashes; i++) + { + const char elem = hashOrder[i]; + uint8_t algo = elem >= 'A' ? 
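+		/* the order string is rebuilt once per day (see INITIAL_DATE above);
+		   each character selects one of the eleven algorithms, digits for
+		   ids 0..9 and letters from 'A' for 10 and above. */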
elem - 'A' + 10 : elem - '0'; + + if (i > 0) { + in = (void*) hash; + size = 64; + } + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa1); + sph_luffa512(&ctx_luffa1, in, size); + sph_luffa512_close(&ctx_luffa1, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash1); + sph_cubehash512(&ctx_cubehash1, in, size); + sph_cubehash512_close(&ctx_cubehash1, hash); + break; + case SHAVITE: + sph_shavite512_init(&ctx_shavite1); + sph_shavite512(&ctx_shavite1, in, size); + sph_shavite512_close(&ctx_shavite1, hash); + break; + case SIMD: + sph_simd512_init(&ctx_simd1); + sph_simd512(&ctx_simd1, in, size); + sph_simd512_close(&ctx_simd1, hash); + break; + case ECHO: + sph_echo512_init(&ctx_echo1); + sph_echo512(&ctx_echo1, in, size); + sph_echo512_close(&ctx_echo1, hash); + break; + } + } + + memcpy(output, hash, 32); +} + +//#define _DEBUG +#define _DEBUG_PREFIX "evo" +#include "cuda_debug.cuh" + +static bool init[MAX_GPUS] = { 0 }; + +extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); + +extern "C" int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 
20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_debug || s_ntime != pdata[17] || s_sequence == -1) { + uint32_t ntime = swab32(work->data[17]); + evo_twisted_code(ntime, hashOrder); + s_ntime = pdata[17]; + if (opt_debug) { + int secs = (int) (ntime - INITIAL_DATE) % (60 * 60 * 24); + secs = (60 * 60 * 24) - secs; + applog(LOG_DEBUG, "evo hash order %s, next in %d mn", hashOrder, secs/60); + } + } + + if (opt_benchmark) + ptarget[7] = 0x5; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput) != 0) { + return 0; + } + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + cuda_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + + const int hashes = (int) strlen(hashOrder); + + do { + int order = 1; + + // Hash with CUDA + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + TRACE("blake80:"); + + for (int i = 1; i < hashes; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("cube :"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + break; + case ECHO: + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("echo :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x11evo_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + pdata[19] = work->nonces[0] + 1; // cursor + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x11evo_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + gpulog(LOG_DEBUG, thr_id, "second nonce %08x! 
cursor %08x", work->nonces[1], pdata[19]); + work->valid_nonces++; + } + return work->valid_nonces; + } else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} + +// cleanup +extern "C" void free_x11evo(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x12/x12.cu b/x12/x12.cu new file mode 100644 index 0000000000..c0fd623be2 --- /dev/null +++ b/x12/x12.cu @@ -0,0 +1,248 @@ +/* + * X12 algorithm + */ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_hamsi.h" +} +#include "miner.h" + +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; + +extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +// X12 CPU Hash +extern "C" void x12hash(void *output, const void *input) +{ + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_groestl512_context ctx_groestl; + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_hamsi512_context ctx_hamsi; + + uint32_t hash[32]; + memset(hash, 0, sizeof hash); + + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*) hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*) hash, 64); + 
sph_skein512_close(&ctx_skein, (void*) hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, (const void*) hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*) hash); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x000f; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput) != 0) { + return 0; + } + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_hamsi512_cpu_hash_64(thr_id, 
throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + CUDA_LOG_ERROR(); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x12hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x12hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + CUDA_LOG_ERROR(); + + return 0; +} + +// cleanup +extern "C" void free_x12(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + CUDA_LOG_ERROR(); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x13/cuda_hsr_sm3.cu b/x13/cuda_hsr_sm3.cu new file mode 100644 index 0000000000..5ce018626e --- /dev/null +++ b/x13/cuda_hsr_sm3.cu @@ -0,0 +1,139 @@ +#include +#include +#include + +#include +#include + +#define F(x, y, z) (((x) ^ (y) ^ (z))) +#define FF(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) +#define GG(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) + +#define P0(x) x ^ ROTL32(x, 9) ^ ROTL32(x, 17) +#define P1(x) x ^ ROTL32(x, 15) ^ ROTL32(x, 23) + +static __forceinline__ __device__ +void sm3_compress2(uint32_t digest[8], const uint32_t pblock[16]) +{ + uint32_t tt1, tt2, i, t, ss1, ss2, x, y; + uint32_t w[68]; + uint32_t a = digest[0]; + uint32_t b = digest[1]; + uint32_t c = digest[2]; + uint32_t d = digest[3]; + uint32_t e = digest[4]; + uint32_t f = digest[5]; + uint32_t g = digest[6]; + uint32_t h = digest[7]; + + #pragma unroll + for (i = 0; i<16; i++) { + w[i] = cuda_swab32(pblock[i]); + } + + for (i = 16; i<68; i++) { + x = ROTL32(w[i - 3], 15); + y = ROTL32(w[i - 13], 7); + + x ^= w[i - 16]; + x ^= w[i - 9]; + y ^= w[i - 6]; + + w[i] = P1(x) ^ y; + } + + for (i = 0; i<64; i++) { + + t = (i < 16) ? 
0x79cc4519 : 0x7a879d8a; + + ss2 = ROTL32(a, 12); + ss1 = ROTL32(ss2 + e + ROTL32(t, i), 7); + ss2 ^= ss1; + + tt1 = d + ss2 + (w[i] ^ w[i + 4]); + tt2 = h + ss1 + w[i]; + + if (i < 16) { + tt1 += F(a, b, c); + tt2 += F(e, f, g); + } + else { + tt1 += FF(a, b, c); + tt2 += GG(e, f, g); + } + d = c; + c = ROTL32(b, 9); + b = a; + a = tt1; + h = g; + g = ROTL32(f, 19); + f = e; + e = P0(tt2); + } + + digest[0] ^= a; + digest[1] ^= b; + digest[2] ^= c; + digest[3] ^= d; + digest[4] ^= e; + digest[5] ^= f; + digest[6] ^= g; + digest[7] ^= h; +} + +/***************************************************/ +// GPU Hash Function +__global__ +void sm3_gpu_hash_64(const uint32_t threads, uint32_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + const size_t hashPosition = thread; + + uint32_t digest[8]; + digest[0] = 0x7380166F; + digest[1] = 0x4914B2B9; + digest[2] = 0x172442D7; + digest[3] = 0xDA8A0600; + digest[4] = 0xA96F30BC; + digest[5] = 0x163138AA; + digest[6] = 0xE38DEE4D; + digest[7] = 0xB0FB0E4E; + + uint32_t *pHash = &g_hash[hashPosition << 4]; + sm3_compress2(digest, pHash); + + uint32_t block[16]; + block[0] = 0x80; + + #pragma unroll + for (int i = 1; i < 14; i++) + block[i] = 0; + + // count + block[14] = cuda_swab32(1 >> 23); + block[15] = cuda_swab32((1 << 9) + (0 << 3)); + + sm3_compress2(digest, block); + + for (int i = 0; i < 8; i++) + pHash[i] = cuda_swab32(digest[i]); + + for (int i = 8; i < 16; i++) + pHash[i] = 0; + } +} + +__host__ +void sm3_cuda_hash_64(int thr_id, uint32_t threads, uint32_t *g_hash, int order) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + sm3_gpu_hash_64 <<<grid, block>>>(threads, g_hash); + //MyStreamSynchronize(NULL, order, thr_id); +} diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu index ef19a51a2e..b69ddb6350 100644 --- a/x13/cuda_x13_fugue512.cu +++ b/x13/cuda_x13_fugue512.cu @@ -1,18 +1,15 @@ -/* - * Quick and dirty addition of Fugue-512 for X13 - * - * Built on cbuchner1's implementation, actual hashing code - * heavily based on phm's sgminer - * - */ + +#include + +#define TPB 256 /* - * X13 kernel implementation. + * fugue512 x13 kernel implementation. * * ==========================(LICENSE BEGIN)============================ * - * Copyright (c) 2014 phm - * + * Copyright (c) 2014-2017 phm, tpruvot + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -20,10 +17,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. @@ -33,679 +30,376 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* * ===========================(LICENSE END)============================= - * - * @author phm */ -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -#include - -#define SPH_C64(x) ((uint64_t)(x ## ULL)) -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#define SWAB32(x) ( __byte_perm(x, x, 0x0123) ) - -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) - #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#else - // Kepler (Compute 3.5) - #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, m) (x|y) +#define tex1Dfetch(t, n) (n) +#define __CUDACC__ +#include #endif -#define mixtab0(x) (*((uint32_t*)mixtabs + ( (x)))) -#define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x)))) -#define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x)))) -#define mixtab3(x) (*((uint32_t*)mixtabs + (768+(x)))) - -texture mixTab0Tex; -texture mixTab1Tex; -texture mixTab2Tex; -texture mixTab3Tex; - -static const uint32_t mixtab0_cpu[] = { - SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7), - SPH_C32(0x7b7b7af7), SPH_C32(0xf2f2e8e5), SPH_C32(0x6b6b0ab7), - SPH_C32(0x6f6f16a7), SPH_C32(0xc5c56d39), SPH_C32(0x303090c0), - SPH_C32(0x01010704), SPH_C32(0x67672e87), SPH_C32(0x2b2bd1ac), - SPH_C32(0xfefeccd5), SPH_C32(0xd7d71371), SPH_C32(0xabab7c9a), - SPH_C32(0x767659c3), SPH_C32(0xcaca4005), SPH_C32(0x8282a33e), - SPH_C32(0xc9c94909), SPH_C32(0x7d7d68ef), SPH_C32(0xfafad0c5), - SPH_C32(0x5959947f), SPH_C32(0x4747ce07), SPH_C32(0xf0f0e6ed), - SPH_C32(0xadad6e82), SPH_C32(0xd4d41a7d), SPH_C32(0xa2a243be), - SPH_C32(0xafaf608a), SPH_C32(0x9c9cf946), SPH_C32(0xa4a451a6), - SPH_C32(0x727245d3), SPH_C32(0xc0c0762d), SPH_C32(0xb7b728ea), - SPH_C32(0xfdfdc5d9), SPH_C32(0x9393d47a), SPH_C32(0x2626f298), - SPH_C32(0x363682d8), SPH_C32(0x3f3fbdfc), SPH_C32(0xf7f7f3f1), - SPH_C32(0xcccc521d), SPH_C32(0x34348cd0), SPH_C32(0xa5a556a2), - SPH_C32(0xe5e58db9), SPH_C32(0xf1f1e1e9), SPH_C32(0x71714cdf), - SPH_C32(0xd8d83e4d), SPH_C32(0x313197c4), SPH_C32(0x15156b54), - SPH_C32(0x04041c10), SPH_C32(0xc7c76331), SPH_C32(0x2323e98c), - SPH_C32(0xc3c37f21), SPH_C32(0x18184860), SPH_C32(0x9696cf6e), - SPH_C32(0x05051b14), SPH_C32(0x9a9aeb5e), SPH_C32(0x0707151c), - SPH_C32(0x12127e48), SPH_C32(0x8080ad36), SPH_C32(0xe2e298a5), - SPH_C32(0xebeba781), SPH_C32(0x2727f59c), SPH_C32(0xb2b233fe), - SPH_C32(0x757550cf), SPH_C32(0x09093f24), SPH_C32(0x8383a43a), - SPH_C32(0x2c2cc4b0), SPH_C32(0x1a1a4668), SPH_C32(0x1b1b416c), - SPH_C32(0x6e6e11a3), SPH_C32(0x5a5a9d73), SPH_C32(0xa0a04db6), - SPH_C32(0x5252a553), SPH_C32(0x3b3ba1ec), SPH_C32(0xd6d61475), - SPH_C32(0xb3b334fa), SPH_C32(0x2929dfa4), SPH_C32(0xe3e39fa1), - SPH_C32(0x2f2fcdbc), SPH_C32(0x8484b126), SPH_C32(0x5353a257), - SPH_C32(0xd1d10169), SPH_C32(0x00000000), SPH_C32(0xededb599), - SPH_C32(0x2020e080), SPH_C32(0xfcfcc2dd), SPH_C32(0xb1b13af2), - SPH_C32(0x5b5b9a77), SPH_C32(0x6a6a0db3), SPH_C32(0xcbcb4701), - SPH_C32(0xbebe17ce), SPH_C32(0x3939afe4), SPH_C32(0x4a4aed33), - SPH_C32(0x4c4cff2b), SPH_C32(0x5858937b), SPH_C32(0xcfcf5b11), - SPH_C32(0xd0d0066d), SPH_C32(0xefefbb91), SPH_C32(0xaaaa7b9e), - SPH_C32(0xfbfbd7c1), SPH_C32(0x4343d217), SPH_C32(0x4d4df82f), - SPH_C32(0x333399cc), SPH_C32(0x8585b622), SPH_C32(0x4545c00f), - SPH_C32(0xf9f9d9c9), SPH_C32(0x02020e08), SPH_C32(0x7f7f66e7), - SPH_C32(0x5050ab5b), SPH_C32(0x3c3cb4f0), SPH_C32(0x9f9ff04a), - SPH_C32(0xa8a87596), SPH_C32(0x5151ac5f), 
SPH_C32(0xa3a344ba), - SPH_C32(0x4040db1b), SPH_C32(0x8f8f800a), SPH_C32(0x9292d37e), - SPH_C32(0x9d9dfe42), SPH_C32(0x3838a8e0), SPH_C32(0xf5f5fdf9), - SPH_C32(0xbcbc19c6), SPH_C32(0xb6b62fee), SPH_C32(0xdada3045), - SPH_C32(0x2121e784), SPH_C32(0x10107040), SPH_C32(0xffffcbd1), - SPH_C32(0xf3f3efe1), SPH_C32(0xd2d20865), SPH_C32(0xcdcd5519), - SPH_C32(0x0c0c2430), SPH_C32(0x1313794c), SPH_C32(0xececb29d), - SPH_C32(0x5f5f8667), SPH_C32(0x9797c86a), SPH_C32(0x4444c70b), - SPH_C32(0x1717655c), SPH_C32(0xc4c46a3d), SPH_C32(0xa7a758aa), - SPH_C32(0x7e7e61e3), SPH_C32(0x3d3db3f4), SPH_C32(0x6464278b), - SPH_C32(0x5d5d886f), SPH_C32(0x19194f64), SPH_C32(0x737342d7), - SPH_C32(0x60603b9b), SPH_C32(0x8181aa32), SPH_C32(0x4f4ff627), - SPH_C32(0xdcdc225d), SPH_C32(0x2222ee88), SPH_C32(0x2a2ad6a8), - SPH_C32(0x9090dd76), SPH_C32(0x88889516), SPH_C32(0x4646c903), - SPH_C32(0xeeeebc95), SPH_C32(0xb8b805d6), SPH_C32(0x14146c50), - SPH_C32(0xdede2c55), SPH_C32(0x5e5e8163), SPH_C32(0x0b0b312c), - SPH_C32(0xdbdb3741), SPH_C32(0xe0e096ad), SPH_C32(0x32329ec8), - SPH_C32(0x3a3aa6e8), SPH_C32(0x0a0a3628), SPH_C32(0x4949e43f), - SPH_C32(0x06061218), SPH_C32(0x2424fc90), SPH_C32(0x5c5c8f6b), - SPH_C32(0xc2c27825), SPH_C32(0xd3d30f61), SPH_C32(0xacac6986), - SPH_C32(0x62623593), SPH_C32(0x9191da72), SPH_C32(0x9595c662), - SPH_C32(0xe4e48abd), SPH_C32(0x797974ff), SPH_C32(0xe7e783b1), - SPH_C32(0xc8c84e0d), SPH_C32(0x373785dc), SPH_C32(0x6d6d18af), - SPH_C32(0x8d8d8e02), SPH_C32(0xd5d51d79), SPH_C32(0x4e4ef123), - SPH_C32(0xa9a97292), SPH_C32(0x6c6c1fab), SPH_C32(0x5656b943), - SPH_C32(0xf4f4fafd), SPH_C32(0xeaeaa085), SPH_C32(0x6565208f), - SPH_C32(0x7a7a7df3), SPH_C32(0xaeae678e), SPH_C32(0x08083820), - SPH_C32(0xbaba0bde), SPH_C32(0x787873fb), SPH_C32(0x2525fb94), - SPH_C32(0x2e2ecab8), SPH_C32(0x1c1c5470), SPH_C32(0xa6a65fae), - SPH_C32(0xb4b421e6), SPH_C32(0xc6c66435), SPH_C32(0xe8e8ae8d), - SPH_C32(0xdddd2559), SPH_C32(0x747457cb), SPH_C32(0x1f1f5d7c), - SPH_C32(0x4b4bea37), SPH_C32(0xbdbd1ec2), SPH_C32(0x8b8b9c1a), - SPH_C32(0x8a8a9b1e), SPH_C32(0x70704bdb), SPH_C32(0x3e3ebaf8), - SPH_C32(0xb5b526e2), SPH_C32(0x66662983), SPH_C32(0x4848e33b), - SPH_C32(0x0303090c), SPH_C32(0xf6f6f4f5), SPH_C32(0x0e0e2a38), - SPH_C32(0x61613c9f), SPH_C32(0x35358bd4), SPH_C32(0x5757be47), - SPH_C32(0xb9b902d2), SPH_C32(0x8686bf2e), SPH_C32(0xc1c17129), - SPH_C32(0x1d1d5374), SPH_C32(0x9e9ef74e), SPH_C32(0xe1e191a9), - SPH_C32(0xf8f8decd), SPH_C32(0x9898e556), SPH_C32(0x11117744), - SPH_C32(0x696904bf), SPH_C32(0xd9d93949), SPH_C32(0x8e8e870e), - SPH_C32(0x9494c166), SPH_C32(0x9b9bec5a), SPH_C32(0x1e1e5a78), - SPH_C32(0x8787b82a), SPH_C32(0xe9e9a989), SPH_C32(0xcece5c15), - SPH_C32(0x5555b04f), SPH_C32(0x2828d8a0), SPH_C32(0xdfdf2b51), - SPH_C32(0x8c8c8906), SPH_C32(0xa1a14ab2), SPH_C32(0x89899212), - SPH_C32(0x0d0d2334), SPH_C32(0xbfbf10ca), SPH_C32(0xe6e684b5), - SPH_C32(0x4242d513), SPH_C32(0x686803bb), SPH_C32(0x4141dc1f), - SPH_C32(0x9999e252), SPH_C32(0x2d2dc3b4), SPH_C32(0x0f0f2d3c), - SPH_C32(0xb0b03df6), SPH_C32(0x5454b74b), SPH_C32(0xbbbb0cda), - SPH_C32(0x16166258) -}; - - static const uint32_t mixtab1_cpu[] = { - SPH_C32(0x97636332), SPH_C32(0xeb7c7c6f), SPH_C32(0xc777775e), - SPH_C32(0xf77b7b7a), SPH_C32(0xe5f2f2e8), SPH_C32(0xb76b6b0a), - SPH_C32(0xa76f6f16), SPH_C32(0x39c5c56d), SPH_C32(0xc0303090), - SPH_C32(0x04010107), SPH_C32(0x8767672e), SPH_C32(0xac2b2bd1), - SPH_C32(0xd5fefecc), SPH_C32(0x71d7d713), SPH_C32(0x9aabab7c), - SPH_C32(0xc3767659), SPH_C32(0x05caca40), SPH_C32(0x3e8282a3), - 
SPH_C32(0x09c9c949), SPH_C32(0xef7d7d68), SPH_C32(0xc5fafad0), - SPH_C32(0x7f595994), SPH_C32(0x074747ce), SPH_C32(0xedf0f0e6), - SPH_C32(0x82adad6e), SPH_C32(0x7dd4d41a), SPH_C32(0xbea2a243), - SPH_C32(0x8aafaf60), SPH_C32(0x469c9cf9), SPH_C32(0xa6a4a451), - SPH_C32(0xd3727245), SPH_C32(0x2dc0c076), SPH_C32(0xeab7b728), - SPH_C32(0xd9fdfdc5), SPH_C32(0x7a9393d4), SPH_C32(0x982626f2), - SPH_C32(0xd8363682), SPH_C32(0xfc3f3fbd), SPH_C32(0xf1f7f7f3), - SPH_C32(0x1dcccc52), SPH_C32(0xd034348c), SPH_C32(0xa2a5a556), - SPH_C32(0xb9e5e58d), SPH_C32(0xe9f1f1e1), SPH_C32(0xdf71714c), - SPH_C32(0x4dd8d83e), SPH_C32(0xc4313197), SPH_C32(0x5415156b), - SPH_C32(0x1004041c), SPH_C32(0x31c7c763), SPH_C32(0x8c2323e9), - SPH_C32(0x21c3c37f), SPH_C32(0x60181848), SPH_C32(0x6e9696cf), - SPH_C32(0x1405051b), SPH_C32(0x5e9a9aeb), SPH_C32(0x1c070715), - SPH_C32(0x4812127e), SPH_C32(0x368080ad), SPH_C32(0xa5e2e298), - SPH_C32(0x81ebeba7), SPH_C32(0x9c2727f5), SPH_C32(0xfeb2b233), - SPH_C32(0xcf757550), SPH_C32(0x2409093f), SPH_C32(0x3a8383a4), - SPH_C32(0xb02c2cc4), SPH_C32(0x681a1a46), SPH_C32(0x6c1b1b41), - SPH_C32(0xa36e6e11), SPH_C32(0x735a5a9d), SPH_C32(0xb6a0a04d), - SPH_C32(0x535252a5), SPH_C32(0xec3b3ba1), SPH_C32(0x75d6d614), - SPH_C32(0xfab3b334), SPH_C32(0xa42929df), SPH_C32(0xa1e3e39f), - SPH_C32(0xbc2f2fcd), SPH_C32(0x268484b1), SPH_C32(0x575353a2), - SPH_C32(0x69d1d101), SPH_C32(0x00000000), SPH_C32(0x99ededb5), - SPH_C32(0x802020e0), SPH_C32(0xddfcfcc2), SPH_C32(0xf2b1b13a), - SPH_C32(0x775b5b9a), SPH_C32(0xb36a6a0d), SPH_C32(0x01cbcb47), - SPH_C32(0xcebebe17), SPH_C32(0xe43939af), SPH_C32(0x334a4aed), - SPH_C32(0x2b4c4cff), SPH_C32(0x7b585893), SPH_C32(0x11cfcf5b), - SPH_C32(0x6dd0d006), SPH_C32(0x91efefbb), SPH_C32(0x9eaaaa7b), - SPH_C32(0xc1fbfbd7), SPH_C32(0x174343d2), SPH_C32(0x2f4d4df8), - SPH_C32(0xcc333399), SPH_C32(0x228585b6), SPH_C32(0x0f4545c0), - SPH_C32(0xc9f9f9d9), SPH_C32(0x0802020e), SPH_C32(0xe77f7f66), - SPH_C32(0x5b5050ab), SPH_C32(0xf03c3cb4), SPH_C32(0x4a9f9ff0), - SPH_C32(0x96a8a875), SPH_C32(0x5f5151ac), SPH_C32(0xbaa3a344), - SPH_C32(0x1b4040db), SPH_C32(0x0a8f8f80), SPH_C32(0x7e9292d3), - SPH_C32(0x429d9dfe), SPH_C32(0xe03838a8), SPH_C32(0xf9f5f5fd), - SPH_C32(0xc6bcbc19), SPH_C32(0xeeb6b62f), SPH_C32(0x45dada30), - SPH_C32(0x842121e7), SPH_C32(0x40101070), SPH_C32(0xd1ffffcb), - SPH_C32(0xe1f3f3ef), SPH_C32(0x65d2d208), SPH_C32(0x19cdcd55), - SPH_C32(0x300c0c24), SPH_C32(0x4c131379), SPH_C32(0x9dececb2), - SPH_C32(0x675f5f86), SPH_C32(0x6a9797c8), SPH_C32(0x0b4444c7), - SPH_C32(0x5c171765), SPH_C32(0x3dc4c46a), SPH_C32(0xaaa7a758), - SPH_C32(0xe37e7e61), SPH_C32(0xf43d3db3), SPH_C32(0x8b646427), - SPH_C32(0x6f5d5d88), SPH_C32(0x6419194f), SPH_C32(0xd7737342), - SPH_C32(0x9b60603b), SPH_C32(0x328181aa), SPH_C32(0x274f4ff6), - SPH_C32(0x5ddcdc22), SPH_C32(0x882222ee), SPH_C32(0xa82a2ad6), - SPH_C32(0x769090dd), SPH_C32(0x16888895), SPH_C32(0x034646c9), - SPH_C32(0x95eeeebc), SPH_C32(0xd6b8b805), SPH_C32(0x5014146c), - SPH_C32(0x55dede2c), SPH_C32(0x635e5e81), SPH_C32(0x2c0b0b31), - SPH_C32(0x41dbdb37), SPH_C32(0xade0e096), SPH_C32(0xc832329e), - SPH_C32(0xe83a3aa6), SPH_C32(0x280a0a36), SPH_C32(0x3f4949e4), - SPH_C32(0x18060612), SPH_C32(0x902424fc), SPH_C32(0x6b5c5c8f), - SPH_C32(0x25c2c278), SPH_C32(0x61d3d30f), SPH_C32(0x86acac69), - SPH_C32(0x93626235), SPH_C32(0x729191da), SPH_C32(0x629595c6), - SPH_C32(0xbde4e48a), SPH_C32(0xff797974), SPH_C32(0xb1e7e783), - SPH_C32(0x0dc8c84e), SPH_C32(0xdc373785), SPH_C32(0xaf6d6d18), - SPH_C32(0x028d8d8e), SPH_C32(0x79d5d51d), 
SPH_C32(0x234e4ef1), - SPH_C32(0x92a9a972), SPH_C32(0xab6c6c1f), SPH_C32(0x435656b9), - SPH_C32(0xfdf4f4fa), SPH_C32(0x85eaeaa0), SPH_C32(0x8f656520), - SPH_C32(0xf37a7a7d), SPH_C32(0x8eaeae67), SPH_C32(0x20080838), - SPH_C32(0xdebaba0b), SPH_C32(0xfb787873), SPH_C32(0x942525fb), - SPH_C32(0xb82e2eca), SPH_C32(0x701c1c54), SPH_C32(0xaea6a65f), - SPH_C32(0xe6b4b421), SPH_C32(0x35c6c664), SPH_C32(0x8de8e8ae), - SPH_C32(0x59dddd25), SPH_C32(0xcb747457), SPH_C32(0x7c1f1f5d), - SPH_C32(0x374b4bea), SPH_C32(0xc2bdbd1e), SPH_C32(0x1a8b8b9c), - SPH_C32(0x1e8a8a9b), SPH_C32(0xdb70704b), SPH_C32(0xf83e3eba), - SPH_C32(0xe2b5b526), SPH_C32(0x83666629), SPH_C32(0x3b4848e3), - SPH_C32(0x0c030309), SPH_C32(0xf5f6f6f4), SPH_C32(0x380e0e2a), - SPH_C32(0x9f61613c), SPH_C32(0xd435358b), SPH_C32(0x475757be), - SPH_C32(0xd2b9b902), SPH_C32(0x2e8686bf), SPH_C32(0x29c1c171), - SPH_C32(0x741d1d53), SPH_C32(0x4e9e9ef7), SPH_C32(0xa9e1e191), - SPH_C32(0xcdf8f8de), SPH_C32(0x569898e5), SPH_C32(0x44111177), - SPH_C32(0xbf696904), SPH_C32(0x49d9d939), SPH_C32(0x0e8e8e87), - SPH_C32(0x669494c1), SPH_C32(0x5a9b9bec), SPH_C32(0x781e1e5a), - SPH_C32(0x2a8787b8), SPH_C32(0x89e9e9a9), SPH_C32(0x15cece5c), - SPH_C32(0x4f5555b0), SPH_C32(0xa02828d8), SPH_C32(0x51dfdf2b), - SPH_C32(0x068c8c89), SPH_C32(0xb2a1a14a), SPH_C32(0x12898992), - SPH_C32(0x340d0d23), SPH_C32(0xcabfbf10), SPH_C32(0xb5e6e684), - SPH_C32(0x134242d5), SPH_C32(0xbb686803), SPH_C32(0x1f4141dc), - SPH_C32(0x529999e2), SPH_C32(0xb42d2dc3), SPH_C32(0x3c0f0f2d), - SPH_C32(0xf6b0b03d), SPH_C32(0x4b5454b7), SPH_C32(0xdabbbb0c), - SPH_C32(0x58161662) -}; - - static const uint32_t mixtab2_cpu[] = { - SPH_C32(0x32976363), SPH_C32(0x6feb7c7c), SPH_C32(0x5ec77777), - SPH_C32(0x7af77b7b), SPH_C32(0xe8e5f2f2), SPH_C32(0x0ab76b6b), - SPH_C32(0x16a76f6f), SPH_C32(0x6d39c5c5), SPH_C32(0x90c03030), - SPH_C32(0x07040101), SPH_C32(0x2e876767), SPH_C32(0xd1ac2b2b), - SPH_C32(0xccd5fefe), SPH_C32(0x1371d7d7), SPH_C32(0x7c9aabab), - SPH_C32(0x59c37676), SPH_C32(0x4005caca), SPH_C32(0xa33e8282), - SPH_C32(0x4909c9c9), SPH_C32(0x68ef7d7d), SPH_C32(0xd0c5fafa), - SPH_C32(0x947f5959), SPH_C32(0xce074747), SPH_C32(0xe6edf0f0), - SPH_C32(0x6e82adad), SPH_C32(0x1a7dd4d4), SPH_C32(0x43bea2a2), - SPH_C32(0x608aafaf), SPH_C32(0xf9469c9c), SPH_C32(0x51a6a4a4), - SPH_C32(0x45d37272), SPH_C32(0x762dc0c0), SPH_C32(0x28eab7b7), - SPH_C32(0xc5d9fdfd), SPH_C32(0xd47a9393), SPH_C32(0xf2982626), - SPH_C32(0x82d83636), SPH_C32(0xbdfc3f3f), SPH_C32(0xf3f1f7f7), - SPH_C32(0x521dcccc), SPH_C32(0x8cd03434), SPH_C32(0x56a2a5a5), - SPH_C32(0x8db9e5e5), SPH_C32(0xe1e9f1f1), SPH_C32(0x4cdf7171), - SPH_C32(0x3e4dd8d8), SPH_C32(0x97c43131), SPH_C32(0x6b541515), - SPH_C32(0x1c100404), SPH_C32(0x6331c7c7), SPH_C32(0xe98c2323), - SPH_C32(0x7f21c3c3), SPH_C32(0x48601818), SPH_C32(0xcf6e9696), - SPH_C32(0x1b140505), SPH_C32(0xeb5e9a9a), SPH_C32(0x151c0707), - SPH_C32(0x7e481212), SPH_C32(0xad368080), SPH_C32(0x98a5e2e2), - SPH_C32(0xa781ebeb), SPH_C32(0xf59c2727), SPH_C32(0x33feb2b2), - SPH_C32(0x50cf7575), SPH_C32(0x3f240909), SPH_C32(0xa43a8383), - SPH_C32(0xc4b02c2c), SPH_C32(0x46681a1a), SPH_C32(0x416c1b1b), - SPH_C32(0x11a36e6e), SPH_C32(0x9d735a5a), SPH_C32(0x4db6a0a0), - SPH_C32(0xa5535252), SPH_C32(0xa1ec3b3b), SPH_C32(0x1475d6d6), - SPH_C32(0x34fab3b3), SPH_C32(0xdfa42929), SPH_C32(0x9fa1e3e3), - SPH_C32(0xcdbc2f2f), SPH_C32(0xb1268484), SPH_C32(0xa2575353), - SPH_C32(0x0169d1d1), SPH_C32(0x00000000), SPH_C32(0xb599eded), - SPH_C32(0xe0802020), SPH_C32(0xc2ddfcfc), SPH_C32(0x3af2b1b1), - 
SPH_C32(0x9a775b5b), SPH_C32(0x0db36a6a), SPH_C32(0x4701cbcb), - SPH_C32(0x17cebebe), SPH_C32(0xafe43939), SPH_C32(0xed334a4a), - SPH_C32(0xff2b4c4c), SPH_C32(0x937b5858), SPH_C32(0x5b11cfcf), - SPH_C32(0x066dd0d0), SPH_C32(0xbb91efef), SPH_C32(0x7b9eaaaa), - SPH_C32(0xd7c1fbfb), SPH_C32(0xd2174343), SPH_C32(0xf82f4d4d), - SPH_C32(0x99cc3333), SPH_C32(0xb6228585), SPH_C32(0xc00f4545), - SPH_C32(0xd9c9f9f9), SPH_C32(0x0e080202), SPH_C32(0x66e77f7f), - SPH_C32(0xab5b5050), SPH_C32(0xb4f03c3c), SPH_C32(0xf04a9f9f), - SPH_C32(0x7596a8a8), SPH_C32(0xac5f5151), SPH_C32(0x44baa3a3), - SPH_C32(0xdb1b4040), SPH_C32(0x800a8f8f), SPH_C32(0xd37e9292), - SPH_C32(0xfe429d9d), SPH_C32(0xa8e03838), SPH_C32(0xfdf9f5f5), - SPH_C32(0x19c6bcbc), SPH_C32(0x2feeb6b6), SPH_C32(0x3045dada), - SPH_C32(0xe7842121), SPH_C32(0x70401010), SPH_C32(0xcbd1ffff), - SPH_C32(0xefe1f3f3), SPH_C32(0x0865d2d2), SPH_C32(0x5519cdcd), - SPH_C32(0x24300c0c), SPH_C32(0x794c1313), SPH_C32(0xb29decec), - SPH_C32(0x86675f5f), SPH_C32(0xc86a9797), SPH_C32(0xc70b4444), - SPH_C32(0x655c1717), SPH_C32(0x6a3dc4c4), SPH_C32(0x58aaa7a7), - SPH_C32(0x61e37e7e), SPH_C32(0xb3f43d3d), SPH_C32(0x278b6464), - SPH_C32(0x886f5d5d), SPH_C32(0x4f641919), SPH_C32(0x42d77373), - SPH_C32(0x3b9b6060), SPH_C32(0xaa328181), SPH_C32(0xf6274f4f), - SPH_C32(0x225ddcdc), SPH_C32(0xee882222), SPH_C32(0xd6a82a2a), - SPH_C32(0xdd769090), SPH_C32(0x95168888), SPH_C32(0xc9034646), - SPH_C32(0xbc95eeee), SPH_C32(0x05d6b8b8), SPH_C32(0x6c501414), - SPH_C32(0x2c55dede), SPH_C32(0x81635e5e), SPH_C32(0x312c0b0b), - SPH_C32(0x3741dbdb), SPH_C32(0x96ade0e0), SPH_C32(0x9ec83232), - SPH_C32(0xa6e83a3a), SPH_C32(0x36280a0a), SPH_C32(0xe43f4949), - SPH_C32(0x12180606), SPH_C32(0xfc902424), SPH_C32(0x8f6b5c5c), - SPH_C32(0x7825c2c2), SPH_C32(0x0f61d3d3), SPH_C32(0x6986acac), - SPH_C32(0x35936262), SPH_C32(0xda729191), SPH_C32(0xc6629595), - SPH_C32(0x8abde4e4), SPH_C32(0x74ff7979), SPH_C32(0x83b1e7e7), - SPH_C32(0x4e0dc8c8), SPH_C32(0x85dc3737), SPH_C32(0x18af6d6d), - SPH_C32(0x8e028d8d), SPH_C32(0x1d79d5d5), SPH_C32(0xf1234e4e), - SPH_C32(0x7292a9a9), SPH_C32(0x1fab6c6c), SPH_C32(0xb9435656), - SPH_C32(0xfafdf4f4), SPH_C32(0xa085eaea), SPH_C32(0x208f6565), - SPH_C32(0x7df37a7a), SPH_C32(0x678eaeae), SPH_C32(0x38200808), - SPH_C32(0x0bdebaba), SPH_C32(0x73fb7878), SPH_C32(0xfb942525), - SPH_C32(0xcab82e2e), SPH_C32(0x54701c1c), SPH_C32(0x5faea6a6), - SPH_C32(0x21e6b4b4), SPH_C32(0x6435c6c6), SPH_C32(0xae8de8e8), - SPH_C32(0x2559dddd), SPH_C32(0x57cb7474), SPH_C32(0x5d7c1f1f), - SPH_C32(0xea374b4b), SPH_C32(0x1ec2bdbd), SPH_C32(0x9c1a8b8b), - SPH_C32(0x9b1e8a8a), SPH_C32(0x4bdb7070), SPH_C32(0xbaf83e3e), - SPH_C32(0x26e2b5b5), SPH_C32(0x29836666), SPH_C32(0xe33b4848), - SPH_C32(0x090c0303), SPH_C32(0xf4f5f6f6), SPH_C32(0x2a380e0e), - SPH_C32(0x3c9f6161), SPH_C32(0x8bd43535), SPH_C32(0xbe475757), - SPH_C32(0x02d2b9b9), SPH_C32(0xbf2e8686), SPH_C32(0x7129c1c1), - SPH_C32(0x53741d1d), SPH_C32(0xf74e9e9e), SPH_C32(0x91a9e1e1), - SPH_C32(0xdecdf8f8), SPH_C32(0xe5569898), SPH_C32(0x77441111), - SPH_C32(0x04bf6969), SPH_C32(0x3949d9d9), SPH_C32(0x870e8e8e), - SPH_C32(0xc1669494), SPH_C32(0xec5a9b9b), SPH_C32(0x5a781e1e), - SPH_C32(0xb82a8787), SPH_C32(0xa989e9e9), SPH_C32(0x5c15cece), - SPH_C32(0xb04f5555), SPH_C32(0xd8a02828), SPH_C32(0x2b51dfdf), - SPH_C32(0x89068c8c), SPH_C32(0x4ab2a1a1), SPH_C32(0x92128989), - SPH_C32(0x23340d0d), SPH_C32(0x10cabfbf), SPH_C32(0x84b5e6e6), - SPH_C32(0xd5134242), SPH_C32(0x03bb6868), SPH_C32(0xdc1f4141), - SPH_C32(0xe2529999), SPH_C32(0xc3b42d2d), 
SPH_C32(0x2d3c0f0f), - SPH_C32(0x3df6b0b0), SPH_C32(0xb74b5454), SPH_C32(0x0cdabbbb), - SPH_C32(0x62581616) -}; - - static const uint32_t mixtab3_cpu[] = { - SPH_C32(0x63329763), SPH_C32(0x7c6feb7c), SPH_C32(0x775ec777), - SPH_C32(0x7b7af77b), SPH_C32(0xf2e8e5f2), SPH_C32(0x6b0ab76b), - SPH_C32(0x6f16a76f), SPH_C32(0xc56d39c5), SPH_C32(0x3090c030), - SPH_C32(0x01070401), SPH_C32(0x672e8767), SPH_C32(0x2bd1ac2b), - SPH_C32(0xfeccd5fe), SPH_C32(0xd71371d7), SPH_C32(0xab7c9aab), - SPH_C32(0x7659c376), SPH_C32(0xca4005ca), SPH_C32(0x82a33e82), - SPH_C32(0xc94909c9), SPH_C32(0x7d68ef7d), SPH_C32(0xfad0c5fa), - SPH_C32(0x59947f59), SPH_C32(0x47ce0747), SPH_C32(0xf0e6edf0), - SPH_C32(0xad6e82ad), SPH_C32(0xd41a7dd4), SPH_C32(0xa243bea2), - SPH_C32(0xaf608aaf), SPH_C32(0x9cf9469c), SPH_C32(0xa451a6a4), - SPH_C32(0x7245d372), SPH_C32(0xc0762dc0), SPH_C32(0xb728eab7), - SPH_C32(0xfdc5d9fd), SPH_C32(0x93d47a93), SPH_C32(0x26f29826), - SPH_C32(0x3682d836), SPH_C32(0x3fbdfc3f), SPH_C32(0xf7f3f1f7), - SPH_C32(0xcc521dcc), SPH_C32(0x348cd034), SPH_C32(0xa556a2a5), - SPH_C32(0xe58db9e5), SPH_C32(0xf1e1e9f1), SPH_C32(0x714cdf71), - SPH_C32(0xd83e4dd8), SPH_C32(0x3197c431), SPH_C32(0x156b5415), - SPH_C32(0x041c1004), SPH_C32(0xc76331c7), SPH_C32(0x23e98c23), - SPH_C32(0xc37f21c3), SPH_C32(0x18486018), SPH_C32(0x96cf6e96), - SPH_C32(0x051b1405), SPH_C32(0x9aeb5e9a), SPH_C32(0x07151c07), - SPH_C32(0x127e4812), SPH_C32(0x80ad3680), SPH_C32(0xe298a5e2), - SPH_C32(0xeba781eb), SPH_C32(0x27f59c27), SPH_C32(0xb233feb2), - SPH_C32(0x7550cf75), SPH_C32(0x093f2409), SPH_C32(0x83a43a83), - SPH_C32(0x2cc4b02c), SPH_C32(0x1a46681a), SPH_C32(0x1b416c1b), - SPH_C32(0x6e11a36e), SPH_C32(0x5a9d735a), SPH_C32(0xa04db6a0), - SPH_C32(0x52a55352), SPH_C32(0x3ba1ec3b), SPH_C32(0xd61475d6), - SPH_C32(0xb334fab3), SPH_C32(0x29dfa429), SPH_C32(0xe39fa1e3), - SPH_C32(0x2fcdbc2f), SPH_C32(0x84b12684), SPH_C32(0x53a25753), - SPH_C32(0xd10169d1), SPH_C32(0x00000000), SPH_C32(0xedb599ed), - SPH_C32(0x20e08020), SPH_C32(0xfcc2ddfc), SPH_C32(0xb13af2b1), - SPH_C32(0x5b9a775b), SPH_C32(0x6a0db36a), SPH_C32(0xcb4701cb), - SPH_C32(0xbe17cebe), SPH_C32(0x39afe439), SPH_C32(0x4aed334a), - SPH_C32(0x4cff2b4c), SPH_C32(0x58937b58), SPH_C32(0xcf5b11cf), - SPH_C32(0xd0066dd0), SPH_C32(0xefbb91ef), SPH_C32(0xaa7b9eaa), - SPH_C32(0xfbd7c1fb), SPH_C32(0x43d21743), SPH_C32(0x4df82f4d), - SPH_C32(0x3399cc33), SPH_C32(0x85b62285), SPH_C32(0x45c00f45), - SPH_C32(0xf9d9c9f9), SPH_C32(0x020e0802), SPH_C32(0x7f66e77f), - SPH_C32(0x50ab5b50), SPH_C32(0x3cb4f03c), SPH_C32(0x9ff04a9f), - SPH_C32(0xa87596a8), SPH_C32(0x51ac5f51), SPH_C32(0xa344baa3), - SPH_C32(0x40db1b40), SPH_C32(0x8f800a8f), SPH_C32(0x92d37e92), - SPH_C32(0x9dfe429d), SPH_C32(0x38a8e038), SPH_C32(0xf5fdf9f5), - SPH_C32(0xbc19c6bc), SPH_C32(0xb62feeb6), SPH_C32(0xda3045da), - SPH_C32(0x21e78421), SPH_C32(0x10704010), SPH_C32(0xffcbd1ff), - SPH_C32(0xf3efe1f3), SPH_C32(0xd20865d2), SPH_C32(0xcd5519cd), - SPH_C32(0x0c24300c), SPH_C32(0x13794c13), SPH_C32(0xecb29dec), - SPH_C32(0x5f86675f), SPH_C32(0x97c86a97), SPH_C32(0x44c70b44), - SPH_C32(0x17655c17), SPH_C32(0xc46a3dc4), SPH_C32(0xa758aaa7), - SPH_C32(0x7e61e37e), SPH_C32(0x3db3f43d), SPH_C32(0x64278b64), - SPH_C32(0x5d886f5d), SPH_C32(0x194f6419), SPH_C32(0x7342d773), - SPH_C32(0x603b9b60), SPH_C32(0x81aa3281), SPH_C32(0x4ff6274f), - SPH_C32(0xdc225ddc), SPH_C32(0x22ee8822), SPH_C32(0x2ad6a82a), - SPH_C32(0x90dd7690), SPH_C32(0x88951688), SPH_C32(0x46c90346), - SPH_C32(0xeebc95ee), SPH_C32(0xb805d6b8), SPH_C32(0x146c5014), - 
SPH_C32(0xde2c55de), SPH_C32(0x5e81635e), SPH_C32(0x0b312c0b), - SPH_C32(0xdb3741db), SPH_C32(0xe096ade0), SPH_C32(0x329ec832), - SPH_C32(0x3aa6e83a), SPH_C32(0x0a36280a), SPH_C32(0x49e43f49), - SPH_C32(0x06121806), SPH_C32(0x24fc9024), SPH_C32(0x5c8f6b5c), - SPH_C32(0xc27825c2), SPH_C32(0xd30f61d3), SPH_C32(0xac6986ac), - SPH_C32(0x62359362), SPH_C32(0x91da7291), SPH_C32(0x95c66295), - SPH_C32(0xe48abde4), SPH_C32(0x7974ff79), SPH_C32(0xe783b1e7), - SPH_C32(0xc84e0dc8), SPH_C32(0x3785dc37), SPH_C32(0x6d18af6d), - SPH_C32(0x8d8e028d), SPH_C32(0xd51d79d5), SPH_C32(0x4ef1234e), - SPH_C32(0xa97292a9), SPH_C32(0x6c1fab6c), SPH_C32(0x56b94356), - SPH_C32(0xf4fafdf4), SPH_C32(0xeaa085ea), SPH_C32(0x65208f65), - SPH_C32(0x7a7df37a), SPH_C32(0xae678eae), SPH_C32(0x08382008), - SPH_C32(0xba0bdeba), SPH_C32(0x7873fb78), SPH_C32(0x25fb9425), - SPH_C32(0x2ecab82e), SPH_C32(0x1c54701c), SPH_C32(0xa65faea6), - SPH_C32(0xb421e6b4), SPH_C32(0xc66435c6), SPH_C32(0xe8ae8de8), - SPH_C32(0xdd2559dd), SPH_C32(0x7457cb74), SPH_C32(0x1f5d7c1f), - SPH_C32(0x4bea374b), SPH_C32(0xbd1ec2bd), SPH_C32(0x8b9c1a8b), - SPH_C32(0x8a9b1e8a), SPH_C32(0x704bdb70), SPH_C32(0x3ebaf83e), - SPH_C32(0xb526e2b5), SPH_C32(0x66298366), SPH_C32(0x48e33b48), - SPH_C32(0x03090c03), SPH_C32(0xf6f4f5f6), SPH_C32(0x0e2a380e), - SPH_C32(0x613c9f61), SPH_C32(0x358bd435), SPH_C32(0x57be4757), - SPH_C32(0xb902d2b9), SPH_C32(0x86bf2e86), SPH_C32(0xc17129c1), - SPH_C32(0x1d53741d), SPH_C32(0x9ef74e9e), SPH_C32(0xe191a9e1), - SPH_C32(0xf8decdf8), SPH_C32(0x98e55698), SPH_C32(0x11774411), - SPH_C32(0x6904bf69), SPH_C32(0xd93949d9), SPH_C32(0x8e870e8e), - SPH_C32(0x94c16694), SPH_C32(0x9bec5a9b), SPH_C32(0x1e5a781e), - SPH_C32(0x87b82a87), SPH_C32(0xe9a989e9), SPH_C32(0xce5c15ce), - SPH_C32(0x55b04f55), SPH_C32(0x28d8a028), SPH_C32(0xdf2b51df), - SPH_C32(0x8c89068c), SPH_C32(0xa14ab2a1), SPH_C32(0x89921289), - SPH_C32(0x0d23340d), SPH_C32(0xbf10cabf), SPH_C32(0xe684b5e6), - SPH_C32(0x42d51342), SPH_C32(0x6803bb68), SPH_C32(0x41dc1f41), - SPH_C32(0x99e25299), SPH_C32(0x2dc3b42d), SPH_C32(0x0f2d3c0f), - SPH_C32(0xb03df6b0), SPH_C32(0x54b74b54), SPH_C32(0xbb0cdabb), - SPH_C32(0x16625816) +// store allocated textures device addresses +static unsigned int* d_textures[MAX_GPUS][1]; + +#define mixtab0(x) mixtabs[(x)] +#define mixtab1(x) mixtabs[(x)+256] +#define mixtab2(x) mixtabs[(x)+512] +#define mixtab3(x) mixtabs[(x)+768] + +static texture mixTab0Tex; + +static const uint32_t mixtab0[] = { + 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39, + 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3, + 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed, + 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d, + 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, + 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54, + 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, + 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, + 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, + 0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, + 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, + 
0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, + 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, + 0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, + 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9, + 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, + 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c, + 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, + 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516, + 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, + 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b, + 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, + 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, + 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, + 0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, + 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, + 0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, + 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, + 0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, + 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51, + 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb, + 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258 }; #define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ - x22 ^= x00; \ - x00 = (q); \ - x08 ^= x00; \ - x01 ^= x24; \ - x04 ^= x27; \ - x07 ^= x30; \ - } + x22 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ +} #define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ - x00 ^= x04; \ - x01 ^= x05; \ - x02 ^= x06; \ - x18 ^= x04; \ - x19 ^= x05; \ - x20 ^= x06; \ - } + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ +} #define SMIX(x0, x1, x2, x3) { \ - uint32_t c0 = 0; \ - uint32_t c1 = 0; \ - uint32_t c2 = 0; \ - uint32_t c3 = 0; \ - uint32_t r0 = 0; \ - uint32_t r1 = 0; \ - uint32_t r2 = 0; \ - uint32_t r3 = 0; \ - uint32_t tmp; \ - tmp = mixtab0(x0 >> 24); \ - c0 ^= tmp; \ - tmp = mixtab1((x0 >> 16) & 0xFF); \ - c0 ^= tmp; \ - r1 ^= tmp; \ - tmp = mixtab2((x0 >> 8) & 0xFF); \ - c0 ^= tmp; \ - r2 ^= tmp; \ - tmp = mixtab3(x0 & 0xFF); \ - c0 ^= tmp; \ - r3 ^= tmp; \ - tmp = mixtab0(x1 >> 24); \ - c1 ^= tmp; \ - r0 ^= tmp; \ - tmp = mixtab1((x1 >> 16) & 0xFF); \ - c1 ^= tmp; \ - tmp = mixtab2((x1 >> 8) & 0xFF); \ - c1 ^= tmp; \ - r2 ^= tmp; \ - tmp = mixtab3(x1 & 0xFF); \ - c1 ^= tmp; \ - r3 ^= tmp; \ - tmp = mixtab0(x2 >> 24); \ - c2 ^= tmp; \ - r0 ^= tmp; \ - tmp = mixtab1((x2 >> 16) & 0xFF); \ - c2 ^= tmp; \ - r1 ^= tmp; \ - tmp = mixtab2((x2 >> 8) & 0xFF); \ - c2 ^= tmp; \ - tmp = mixtab3(x2 & 0xFF); \ - c2 ^= tmp; \ - r3 ^= tmp; \ - tmp = mixtab0(x3 >> 24); \ - c3 
^= tmp; \ - r0 ^= tmp; \ - tmp = mixtab1((x3 >> 16) & 0xFF); \ - c3 ^= tmp; \ - r1 ^= tmp; \ - tmp = mixtab2((x3 >> 8) & 0xFF); \ - c3 ^= tmp; \ - r2 ^= tmp; \ - tmp = mixtab3(x3 & 0xFF); \ - c3 ^= tmp; \ - x0 = ((c0 ^ r0) & SPH_C32(0xFF000000)) \ - | ((c1 ^ r1) & SPH_C32(0x00FF0000)) \ - | ((c2 ^ r2) & SPH_C32(0x0000FF00)) \ - | ((c3 ^ r3) & SPH_C32(0x000000FF)); \ - x1 = ((c1 ^ (r0 << 8)) & SPH_C32(0xFF000000)) \ - | ((c2 ^ (r1 << 8)) & SPH_C32(0x00FF0000)) \ - | ((c3 ^ (r2 << 8)) & SPH_C32(0x0000FF00)) \ - | ((c0 ^ (r3 >> 24)) & SPH_C32(0x000000FF)); \ - x2 = ((c2 ^ (r0 << 16)) & SPH_C32(0xFF000000)) \ - | ((c3 ^ (r1 << 16)) & SPH_C32(0x00FF0000)) \ - | ((c0 ^ (r2 >> 16)) & SPH_C32(0x0000FF00)) \ - | ((c1 ^ (r3 >> 16)) & SPH_C32(0x000000FF)); \ - x3 = ((c3 ^ (r0 << 24)) & SPH_C32(0xFF000000)) \ - | ((c0 ^ (r1 >> 8)) & SPH_C32(0x00FF0000)) \ - | ((c1 ^ (r2 >> 8)) & SPH_C32(0x0000FF00)) \ - | ((c2 ^ (r3 >> 8)) & SPH_C32(0x000000FF)); \ - } + uint32_t tmp; \ + uint32_t r0 = 0; \ + uint32_t r1 = 0; \ + uint32_t r2 = 0; \ + uint32_t r3 = 0; \ + uint32_t c0 = mixtab0(x0 >> 24); \ + tmp = mixtab1((x0 >> 16) & 0xFF); \ + c0 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x0 >> 8) & 0xFF); \ + c0 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x0 & 0xFF); \ + c0 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x1 >> 24); \ + uint32_t c1 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x1 >> 16) & 0xFF); \ + c1 ^= tmp; \ + tmp = mixtab2((x1 >> 8) & 0xFF); \ + c1 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x1 & 0xFF); \ + c1 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x2 >> 24); \ + uint32_t c2 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x2 >> 16) & 0xFF); \ + c2 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x2 >> 8) & 0xFF); \ + c2 ^= tmp; \ + tmp = mixtab3(x2 & 0xFF); \ + c2 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x3 >> 24); \ + uint32_t c3 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x3 >> 16) & 0xFF); \ + c3 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x3 >> 8) & 0xFF); \ + c3 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x3 & 0xFF); \ + c3 ^= tmp; \ + x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \ + | ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \ + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) \ + | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \ + x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \ + | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \ + x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) \ + | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); \ +} -#define ROR3 { \ +#define SUB_ROR3 { \ B33 = S33, B34 = S34, B35 = S35; \ - S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \ + S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \ S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \ S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \ S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \ - } +} -#define ROR8 { \ +#define SUB_ROR8 { \ B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ - S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \ + S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \ S26 = S18; S25 = S17; 
S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \ S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \ S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \ - } +} -#define ROR9 { \ +#define SUB_ROR9 { \ B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ - S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \ + S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \ S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \ S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \ S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \ - } +} -#define FUGUE512_3(x, y, z) { \ - TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ - CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ - SMIX(S33, S34, S35, S00); \ - CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ - SMIX(S30, S31, S32, S33); \ - CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \ - SMIX(S27, S28, S29, S30); \ - CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ - SMIX(S24, S25, S26, S27); \ - \ - TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \ - CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \ - SMIX(S21, S22, S23, S24); \ - CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \ - SMIX(S18, S19, S20, S21); \ - CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \ - SMIX(S15, S16, S17, S18); \ - CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \ - SMIX(S12, S13, S14, S15); \ - \ - TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \ - CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \ - SMIX(S09, S10, S11, S12); \ - CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \ - SMIX(S06, S07, S08, S09); \ - CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \ - SMIX(S03, S04, S05, S06); \ - CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \ - SMIX(S00, S01, S02, S03); \ - } +#define FUGUE512_3(x, y, z) { \ + TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ + \ + TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \ + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \ + SMIX(S21, S22, S23, S24); \ + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \ + SMIX(S18, S19, S20, S21); \ + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \ + SMIX(S15, S16, S17, S18); \ + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \ + SMIX(S12, S13, S14, S15); \ + \ + TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \ + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \ + SMIX(S09, S10, S11, S12); \ + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \ + SMIX(S06, S07, S08, S09); \ + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \ + SMIX(S03, S04, S05, S06); \ + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \ + SMIX(S00, S01, S02, S03); \ +} + +#undef ROL8 +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ +uint32_t 
ROL8(const uint32_t a) { + return __byte_perm(a, 0, 0x2103); +} +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a) { + return __byte_perm(a, 0, 0x0321); +} +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a) { + return __byte_perm(a, 0, 0x1032); +} +#else +#define ROL8(u) ROTL32(u, 8) +#define ROR8(u) ROTR32(u, 8) +#define ROL16(u) ROTL32(u,16) +#endif +#define AS_UINT4(addr) *((uint4*)(addr)) + /***************************************************/ -// Die Hash-Funktion -__global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ +__launch_bounds__(TPB) +void x13_fugue512_gpu_hash_64(uint32_t threads, uint64_t *g_hash) { - extern __shared__ char mixtabs[]; - - *((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(mixTab0Tex, threadIdx.x); - *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(mixTab1Tex, threadIdx.x); - *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x); - *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x); + __shared__ uint32_t mixtabs[1024]; + + // load shared mem (with 256 threads) + const uint32_t thr = threadIdx.x & 0xFF; + const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr); + mixtabs[thr] = tmp; + mixtabs[thr+256] = ROR8(tmp); + mixtabs[thr+512] = ROL16(tmp); + mixtabs[thr+768] = ROL8(tmp); +#if TPB <= 256 + if (blockDim.x < 256) { + const uint32_t thr = (threadIdx.x + 0x80) & 0xFF; + const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr); + mixtabs[thr] = tmp; + mixtabs[thr + 256] = ROR8(tmp); + mixtabs[thr + 512] = ROL16(tmp); + mixtabs[thr + 768] = ROL8(tmp); + } +#endif __syncthreads(); - int i; - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; - - #pragma unroll 16 - for( i = 0; i < 16; i++ ) - Hash[i] = SWAB32(Hash[i]); - - uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; - uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; - uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; - uint32_t S30, S31, S32, S33, S34, S35; - - uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; - uint64_t bc = (uint64_t) 64 << 3; - uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL); - uint32_t bchi = (uint32_t)(bc >> 32); - - S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; - S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); - S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); - S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); - S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); - - FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2])); - FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5])); - FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8])); - FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB])); - FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE])); - FUGUE512_3((Hash[0xF]), bchi, bclo); - - #pragma unroll 32 - for (i = 0; i < 32; i ++) { - ROR3; - CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); - SMIX(S00, S01, S02, S03); - } - #pragma unroll 13 - for (i = 0; i < 13; i ++) { - S04 ^= S00; - S09 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S28 ^= S00; - ROR8; - SMIX(S00, S01, S02, S03); - } - S04 ^= S00; - S09 ^= S00; - S18 ^= S00; - S27 ^= S00; - - Hash[0] = SWAB32(S01); - Hash[1] = SWAB32(S02); - Hash[2] = SWAB32(S03); - Hash[3] = SWAB32(S04); - Hash[4] = SWAB32(S09); - Hash[5] = SWAB32(S10); - Hash[6] = SWAB32(S11); - Hash[7] = SWAB32(S12); - Hash[8] = SWAB32(S18); - Hash[9] = SWAB32(S19); - Hash[10] = SWAB32(S20); - Hash[11] = SWAB32(S21); - Hash[12] = SWAB32(S27); - Hash[13] = SWAB32(S28); - Hash[14] = SWAB32(S29); - Hash[15] = SWAB32(S30); - } + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const size_t hashPosition = thread; + uint64_t*pHash = &g_hash[hashPosition<<3]; + uint32_t Hash[16]; + + #pragma unroll 4 + for(int i = 0; i < 4; i++) + AS_UINT4(&Hash[i*4]) = AS_UINT4(&pHash[i*2]); + + #pragma unroll 16 + for(int i = 0; i < 16; i++) + Hash[i] = cuda_swab32(Hash[i]); + + uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; + uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; + uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; + uint32_t S30, S31, S32, S33, S34, S35; + + uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; + //const uint64_t bc = (64ULL << 3); // 512 + //const uint32_t bclo = (uint32_t)(bc); + //const uint32_t bchi = (uint32_t)(bc >> 32); + + S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0; + S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; + S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027; + 
S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1; + S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f; + S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567; + + FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2])); + FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5])); + FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8])); + FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB])); + FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE])); + FUGUE512_3((Hash[0xF]), 0u /*bchi*/, 512u /*bclo*/); + + #pragma unroll 32 + for (int i = 0; i < 32; i ++) { + SUB_ROR3; + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + } + #pragma unroll 13 + for (int i = 0; i < 13; i++) { + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S18 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S28 ^= S00; + SUB_ROR8; + SMIX(S00, S01, S02, S03); + } + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + + Hash[0] = cuda_swab32(S01); + Hash[1] = cuda_swab32(S02); + Hash[2] = cuda_swab32(S03); + Hash[3] = cuda_swab32(S04); + Hash[4] = cuda_swab32(S09); + Hash[5] = cuda_swab32(S10); + Hash[6] = cuda_swab32(S11); + Hash[7] = cuda_swab32(S12); + Hash[8] = cuda_swab32(S18); + Hash[9] = cuda_swab32(S19); + Hash[10] = cuda_swab32(S20); + Hash[11] = cuda_swab32(S21); + Hash[12] = cuda_swab32(S27); + Hash[13] = cuda_swab32(S28); + Hash[14] = cuda_swab32(S29); + Hash[15] = cuda_swab32(S30); + + #pragma unroll 4 + for(int i = 0; i < 4; i++) + AS_UINT4(&pHash[i*2]) = AS_UINT4(&Hash[i*4]); + } } -#define texDef(texname, texmem, texsource, texsize) \ +#define texDef(id, texname, texmem, texsource, texsize) { \ unsigned int *texmem; \ cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} -__host__ void x13_fugue512_cpu_init(int thr_id, int threads) +__host__ +void x13_fugue512_cpu_init(int thr_id, uint32_t threads) { - texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); - texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); - texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); - texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); + texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256); } -__host__ void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void x13_fugue512_cpu_free(int thr_id) { - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + cudaFree(d_textures[thr_id][0]); +} - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 4 * 256 * sizeof(uint32_t); +__host__ +//void fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) +void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t 
*d_nonceVector, uint32_t *d_hash, int order)
+{
+	const uint32_t threadsperblock = TPB;
-// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
-	x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-	MyStreamSynchronize(NULL, order, thr_id);
+	x13_fugue512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
 }
diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu
index a9039a9d74..3c21f7f1e5 100644
--- a/x13/cuda_x13_hamsi512.cu
+++ b/x13/cuda_x13_hamsi512.cu
@@ -1,94 +1,32 @@
 /*
- * Quick and dirty addition of Hamsi-512 for X13
- *
- * Built on cbuchner1's implementation, actual hashing code
- * heavily based on phm's sgminer
- *
+ * Quick Hamsi-512 for X13 by tsiv - 2014
+ * + Hamsi-512 80 by tpruvot - 2018
 */
-/*
- * X13 kernel implementation.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2014 phm
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * - * ===========================(LICENSE END)============================= - * - * @author phm - */ - -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; +#include +#include +#include -#define SPH_C64(x) ((uint64_t)(x ## ULL)) -#define SPH_C32(x) ((uint32_t)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#include "cuda_helper.h" -#define SWAB32(x) ( __byte_perm(x, x, 0x0123) ) - -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) - #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#else - // Kepler (Compute 3.5) - #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) -#endif +typedef unsigned char BitSequence; static __constant__ uint32_t d_alpha_n[32]; static __constant__ uint32_t d_alpha_f[32]; static __constant__ uint32_t d_T512[64][16]; static const uint32_t alpha_n[] = { - SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), - SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), - SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), - SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), - SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), - SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0) + 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00, + 0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa, + 0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, + 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0 }; static const uint32_t alpha_f[] = { - SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), - SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), - SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), - SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), - SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) + 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9, + 0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0, + 0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, + 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c }; #define hamsi_s00 m0 @@ -125,73 +63,73 @@ static const uint32_t alpha_f[] = { #define hamsi_s1F mF #define SBOX(a, b, c, d) { \ - uint32_t t; \ - t = (a); \ - (a) &= (c); \ - (a) ^= (d); \ - (c) ^= (b); \ - (c) ^= (a); \ - (d) |= t; \ - (d) ^= (b); \ - t ^= (c); \ - (b) = (d); \ - (d) |= t; \ - (d) ^= (a); \ - (a) &= (b); \ - t ^= 
(a); \ - (b) ^= (d); \ - (b) ^= t; \ - (a) = (c); \ - (c) = (b); \ - (b) = (d); \ - (d) = SPH_T32(~t); \ - } + uint32_t t; \ + t = (a); \ + (a) &= (c); \ + (a) ^= (d); \ + (c) ^= (b); \ + (c) ^= (a); \ + (d) |= t; \ + (d) ^= (b); \ + t ^= (c); \ + (b) = (d); \ + (d) |= t; \ + (d) ^= (a); \ + (a) &= (b); \ + t ^= (a); \ + (b) ^= (d); \ + (b) ^= t; \ + (a) = (c); \ + (c) = (b); \ + (b) = (d); \ + (d) = SPH_T32(~t); \ + } #define HAMSI_L(a, b, c, d) { \ - (a) = ROTL32(a, 13); \ - (c) = ROTL32(c, 3); \ - (b) ^= (a) ^ (c); \ - (d) ^= (c) ^ SPH_T32((a) << 3); \ - (b) = ROTL32(b, 1); \ - (d) = ROTL32(d, 7); \ - (a) ^= (b) ^ (d); \ - (c) ^= (d) ^ SPH_T32((b) << 7); \ - (a) = ROTL32(a, 5); \ - (c) = ROTL32(c, 22); \ - } + (a) = ROTL32(a, 13); \ + (c) = ROTL32(c, 3); \ + (b) ^= (a) ^ (c); \ + (d) ^= (c) ^ ((a) << 3); \ + (b) = ROTL32(b, 1); \ + (d) = ROTL32(d, 7); \ + (a) ^= (b) ^ (d); \ + (c) ^= (d) ^ ((b) << 7); \ + (a) = ROTL32(a, 5); \ + (c) = ROTL32(c, 22); \ + } #define ROUND_BIG(rc, alpha) { \ hamsi_s00 ^= alpha[0x00]; \ - hamsi_s01 ^= alpha[0x01] ^ (uint32_t)(rc); \ - hamsi_s02 ^= alpha[0x02]; \ - hamsi_s03 ^= alpha[0x03]; \ - hamsi_s04 ^= alpha[0x04]; \ - hamsi_s05 ^= alpha[0x05]; \ - hamsi_s06 ^= alpha[0x06]; \ - hamsi_s07 ^= alpha[0x07]; \ hamsi_s08 ^= alpha[0x08]; \ - hamsi_s09 ^= alpha[0x09]; \ - hamsi_s0A ^= alpha[0x0A]; \ - hamsi_s0B ^= alpha[0x0B]; \ - hamsi_s0C ^= alpha[0x0C]; \ - hamsi_s0D ^= alpha[0x0D]; \ - hamsi_s0E ^= alpha[0x0E]; \ - hamsi_s0F ^= alpha[0x0F]; \ hamsi_s10 ^= alpha[0x10]; \ - hamsi_s11 ^= alpha[0x11]; \ - hamsi_s12 ^= alpha[0x12]; \ - hamsi_s13 ^= alpha[0x13]; \ - hamsi_s14 ^= alpha[0x14]; \ - hamsi_s15 ^= alpha[0x15]; \ - hamsi_s16 ^= alpha[0x16]; \ - hamsi_s17 ^= alpha[0x17]; \ hamsi_s18 ^= alpha[0x18]; \ + hamsi_s01 ^= alpha[0x01] ^ (uint32_t)(rc); \ + hamsi_s09 ^= alpha[0x09]; \ + hamsi_s11 ^= alpha[0x11]; \ hamsi_s19 ^= alpha[0x19]; \ + hamsi_s02 ^= alpha[0x02]; \ + hamsi_s0A ^= alpha[0x0A]; \ + hamsi_s12 ^= alpha[0x12]; \ hamsi_s1A ^= alpha[0x1A]; \ + hamsi_s03 ^= alpha[0x03]; \ + hamsi_s0B ^= alpha[0x0B]; \ + hamsi_s13 ^= alpha[0x13]; \ hamsi_s1B ^= alpha[0x1B]; \ + hamsi_s04 ^= alpha[0x04]; \ + hamsi_s0C ^= alpha[0x0C]; \ + hamsi_s14 ^= alpha[0x14]; \ hamsi_s1C ^= alpha[0x1C]; \ + hamsi_s05 ^= alpha[0x05]; \ + hamsi_s0D ^= alpha[0x0D]; \ + hamsi_s15 ^= alpha[0x15]; \ hamsi_s1D ^= alpha[0x1D]; \ + hamsi_s06 ^= alpha[0x06]; \ + hamsi_s0E ^= alpha[0x0E]; \ + hamsi_s16 ^= alpha[0x16]; \ hamsi_s1E ^= alpha[0x1E]; \ + hamsi_s07 ^= alpha[0x07]; \ + hamsi_s0F ^= alpha[0x0F]; \ + hamsi_s17 ^= alpha[0x17]; \ hamsi_s1F ^= alpha[0x1F]; \ SBOX(hamsi_s00, hamsi_s08, hamsi_s10, hamsi_s18); \ SBOX(hamsi_s01, hamsi_s09, hamsi_s11, hamsi_s19); \ @@ -217,30 +155,16 @@ static const uint32_t alpha_f[] = { #define P_BIG { \ - ROUND_BIG(0, d_alpha_n); \ - ROUND_BIG(1, d_alpha_n); \ - ROUND_BIG(2, d_alpha_n); \ - ROUND_BIG(3, d_alpha_n); \ - ROUND_BIG(4, d_alpha_n); \ - ROUND_BIG(5, d_alpha_n); \ + for( int r = 0; r < 6; r++ ) \ + ROUND_BIG(r, d_alpha_n); \ } -#define PF_BIG { \ - ROUND_BIG(0, d_alpha_f); \ - ROUND_BIG(1, d_alpha_f); \ - ROUND_BIG(2, d_alpha_f); \ - ROUND_BIG(3, d_alpha_f); \ - ROUND_BIG(4, d_alpha_f); \ - ROUND_BIG(5, d_alpha_f); \ - ROUND_BIG(6, d_alpha_f); \ - ROUND_BIG(7, d_alpha_f); \ - ROUND_BIG(8, d_alpha_f); \ - ROUND_BIG(9, d_alpha_f); \ - ROUND_BIG(10, d_alpha_f); \ - ROUND_BIG(11, d_alpha_f); \ +#define PF_BIG { \ + for( int r = 0; r < 12; r++ ) \ + ROUND_BIG(r, d_alpha_f); \ } -#define T_BIG { \ +#define T_BIG { \ /* order is important */ \ cF = 
(h[0xF] ^= hamsi_s17); \ cE = (h[0xE] ^= hamsi_s16); \ @@ -262,503 +186,366 @@ static const uint32_t alpha_f[] = { static const uint32_t T512[64][16] = { - { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), - SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), - SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), - SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), - SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), - SPH_C32(0x9e69af68) }, - { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), - SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), - SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), - SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), - SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), - SPH_C32(0x0c26f262) }, - { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), - SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), - SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), - SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), - SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), - SPH_C32(0xdc24e61f) }, - { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), - SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), - SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), - SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), - SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), - SPH_C32(0x3daac2da) }, - { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), - SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), - SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), - SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), - SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), - SPH_C32(0x78cace29) }, - { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), - SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), - SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), - SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), - SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247), - SPH_C32(0x2dd1f9ab) }, - { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), - SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), - SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), - SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), - SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), - SPH_C32(0xbf2c0be2) }, - { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), - SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), - SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), - SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), - SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), - SPH_C32(0x32219526) }, - { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), - SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), - SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), - SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), - SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), - SPH_C32(0xac8e6c88) }, - { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), - SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), - SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), - SPH_C32(0x45744000), 
SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), - SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), - SPH_C32(0x7b1bd6b9) }, - { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), - SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), - SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), - SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), - SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), - SPH_C32(0xf746c320) }, - { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), - SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), - SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), - SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), - SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), - SPH_C32(0x69505b3a) }, - { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), - SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), - SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), - SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), - SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), - SPH_C32(0x8a341574) }, - { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), - SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), - SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), - SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), - SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), - SPH_C32(0x450360bf) }, - { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), - SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), - SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), - SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), - SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), - SPH_C32(0xf3d45758) }, - { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), - SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), - SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), - SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), - SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), - SPH_C32(0x925c44e9) }, - { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), - SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), - SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), - SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), - SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), - SPH_C32(0xa123ff9f) }, - { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), - SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), - SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), - SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), - SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), - SPH_C32(0x1568ff0f) }, - { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), - SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), - SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), - SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), - SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), - SPH_C32(0xc5c1eb3e) }, - { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), - SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), - SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), - SPH_C32(0xb3c41100), SPH_C32(0xcef00000), 
SPH_C32(0xcef90000), - SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), - SPH_C32(0x1af21fe1) }, - { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), - SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), - SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), - SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), - SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), - SPH_C32(0x857f3c2b) }, - { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), - SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), - SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), - SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), - SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), - SPH_C32(0x2ba05a55) }, - { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), - SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), - SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), - SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), - SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), - SPH_C32(0xfeabf254) }, - { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), - SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), - SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), - SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), - SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), - SPH_C32(0xfe1cdc7f) }, - { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), - SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), - SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), - SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), - SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), - SPH_C32(0xb0a51834) }, - { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), - SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), - SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), - SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), - SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), - SPH_C32(0xa6b8c28d) }, - { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), - SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), - SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), - SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), - SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), - SPH_C32(0x3a4e99d7) }, - { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), - SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), - SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), - SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), - SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), - SPH_C32(0xe1844257) }, - { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), - SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), - SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), - SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), - SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), - SPH_C32(0x2c3b504e) }, - { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), - SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), - SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), - SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), - 
SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), - SPH_C32(0x524a0d59) }, - { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), - SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), - SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), - SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), - SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), - SPH_C32(0x378dd173) }, - { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), - SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), - SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), - SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), - SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), - SPH_C32(0x8b6c72bd) }, - { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), - SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), - SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), - SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), - SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), - SPH_C32(0x8e67b7fa) }, - { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), - SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), - SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), - SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), - SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), - SPH_C32(0x443d3004) }, - { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), - SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), - SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), - SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), - SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), - SPH_C32(0xf4f6ea7b) }, - { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), - SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), - SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), - SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), - SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), - SPH_C32(0x979961d0) }, - { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), - SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), - SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), - SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), - SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), - SPH_C32(0x98aa496e) }, - { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), - SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), - SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), - SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), - SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), - SPH_C32(0x094e3198) }, - { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), - SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), - SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), - SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), - SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), - SPH_C32(0xe86cba2e) }, - { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), - SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), - SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), - SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), - SPH_C32(0x8f321eea), 
SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), - SPH_C32(0x4b7eec55) }, - { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001), - SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), - SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), - SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), - SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), - SPH_C32(0x1e7536a6) }, - { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), - SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), - SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), - SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), - SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), - SPH_C32(0x24314f17) }, - { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), - SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), - SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), - SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), - SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), - SPH_C32(0x9075b1ce) }, - { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), - SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), - SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), - SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), - SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), - SPH_C32(0x9b6ef888) }, - { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), - SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), - SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), - SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), - SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), - SPH_C32(0xd8b61463) }, - { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), - SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), - SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), - SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), - SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), - SPH_C32(0x3ea660f7) }, - { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), - SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), - SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), - SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), - SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), - SPH_C32(0x7f975691) }, - { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), - SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), - SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), - SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), - SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), - SPH_C32(0x2c94459e) }, - { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), - SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), - SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), - SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), - SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), - SPH_C32(0x56a7b19f) }, - { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), - SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), - SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), - SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), - SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), 
SPH_C32(0xe375f8a8), - SPH_C32(0x81fdf908) }, - { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), - SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), - SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), - SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), - SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), - SPH_C32(0x5bd61539) }, - { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), - SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), - SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), - SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), - SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), - SPH_C32(0x15b961e7) }, - { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), - SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), - SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), - SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), - SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), - SPH_C32(0x2a2c18f0) }, - { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), - SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), - SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), - SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), - SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), - SPH_C32(0x551e3d6e) }, - { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), - SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), - SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), - SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), - SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), - SPH_C32(0x33c5244f) }, - { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), - SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), - SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), - SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), - SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), - SPH_C32(0x8a58e6a4) }, - { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), - SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), - SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), - SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), - SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), - SPH_C32(0xda878000) }, - { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), - SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), - SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), - SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), - SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), - SPH_C32(0x3c5dfffe) }, - { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), - SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), - SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), - SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), - SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), - SPH_C32(0x7b1675d7) }, - { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), - SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), - SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), - SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), - SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), - 
SPH_C32(0x2879ebac) }, - { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), - SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e), - SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), - SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), - SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), - SPH_C32(0xbe0a679e) }, - { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), - SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), - SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), - SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), - SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), - SPH_C32(0x30aebcf7) }, - { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), - SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), - SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), - SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), - SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), - SPH_C32(0xc7ff60f0) }, - { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), - SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), - SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), - SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), - SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), - SPH_C32(0xe7e00a94) } +{ 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, // 0 + 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 }, +{ 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, + 0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 }, +{ 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5, + 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f }, +{ 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, + 0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da }, +{ 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782, + 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 }, +{ 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, + 0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab }, +{ 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4, + 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 }, +{ 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, // 7 + 0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 }, +{ 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31, + 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 }, +{ 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, + 0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 }, +{ 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a, + 0xd46a0000, 0x8dc8c000, 
0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 }, +{ 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, + 0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a }, +{ 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb, + 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 }, +{ 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, + 0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf }, +{ 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, // 14 + 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 }, +{ 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, + 0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 }, +{ 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090, + 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f }, +{ 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, + 0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f }, +{ 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df, + 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e }, +{ 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, + 0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 }, +{ 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e, + 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b }, +{ 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, // 21 + 0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 }, +{ 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b, + 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 }, +{ 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, + 0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f }, +{ 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9, + 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 }, +{ 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, + 0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d }, +{ 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80, + 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 }, +{ 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, + 0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 }, +{ 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 
0xbfba18c3, 0x7e715d17, // 28 + 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e }, +{ 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, + 0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 }, +{ 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce, + 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 }, +{ 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, + 0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd }, +{ 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe, + 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa }, +{ 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, + 0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 }, +{ 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab, + 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b }, +{ 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, // 35 + 0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 }, +{ 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6, + 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e }, +{ 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, + 0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 }, +{ 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b, + 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e }, +{ 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, + 0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 }, +{ 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1, + 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 }, +{ 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, + 0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 }, +{ 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, // 42 + 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce }, +{ 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, + 0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 }, +{ 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494, + 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 }, +{ 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, + 0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 }, +{ 0x52500000, 0x29540000, 
0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f, + 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 }, +{ 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, + 0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e }, +{ 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897, + 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f }, +{ 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, // 49 + 0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 }, +{ 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de, + 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 }, +{ 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, + 0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 }, +{ 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e, + 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 }, +{ 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0, + 0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e }, +{ 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb, + 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f }, +{ 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, + 0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4 }, +{ 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, // 56 + 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 }, +{ 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, + 0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe }, +{ 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b, + 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 }, +{ 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, + 0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac }, +{ 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69, + 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e }, +{ 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, + 0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 }, +{ 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64, + 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 }, +{ 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, + 0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 
0xe7e00a94 } }; -#define INPUT_BIG { \ - const uint32_t *tp = &d_T512[0][0]; \ - unsigned u, v; \ - m0 = 0; \ - m1 = 0; \ - m2 = 0; \ - m3 = 0; \ - m4 = 0; \ - m5 = 0; \ - m6 = 0; \ - m7 = 0; \ - m8 = 0; \ - m9 = 0; \ - mA = 0; \ - mB = 0; \ - mC = 0; \ - mD = 0; \ - mE = 0; \ - mF = 0; \ - for (u = 0; u < 8; u ++) { \ - unsigned db = buf(u); \ - for (v = 0; v < 8; v ++, db >>= 1) { \ - uint32_t dm = SPH_T32(-(uint32_t)(db & 1)); \ - m0 ^= dm & *tp ++; \ - m1 ^= dm & *tp ++; \ - m2 ^= dm & *tp ++; \ - m3 ^= dm & *tp ++; \ - m4 ^= dm & *tp ++; \ - m5 ^= dm & *tp ++; \ - m6 ^= dm & *tp ++; \ - m7 ^= dm & *tp ++; \ - m8 ^= dm & *tp ++; \ - m9 ^= dm & *tp ++; \ - mA ^= dm & *tp ++; \ - mB ^= dm & *tp ++; \ - mC ^= dm & *tp ++; \ - mD ^= dm & *tp ++; \ - mE ^= dm & *tp ++; \ - mF ^= dm & *tp ++; \ - } \ - } \ +__global__ +void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; + unsigned char *h1 = (unsigned char *)Hash; + + uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265; + uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c; + uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48; + uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d; + uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF; + uint32_t *tp, db, dm; + + for(int i = 0; i < 64; i += 8) { + + m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; + m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; + tp = &d_T512[0][0]; + + #pragma unroll 2 + for (int u = 0; u < 8; u ++) { + db = h1[i+u]; + #pragma unroll 2 + for (int v = 0; v < 8; v ++, db >>= 1) { + dm = -(uint32_t)(db & 1); + m0 ^= dm & *(tp+ 0); m1 ^= dm & *(tp+ 1); + m2 ^= dm & *(tp+ 2); m3 ^= dm & *(tp+ 3); + m4 ^= dm & *(tp+ 4); m5 ^= dm & *(tp+ 5); + m6 ^= dm & *(tp+ 6); m7 ^= dm & *(tp+ 7); + m8 ^= dm & *(tp+ 8); m9 ^= dm & *(tp+ 9); + mA ^= dm & *(tp+10); mB ^= dm & *(tp+11); + mC ^= dm & *(tp+12); mD ^= dm & *(tp+13); + mE ^= dm & *(tp+14); mF ^= dm & *(tp+15); + tp += 16; + } + } + + for( int r = 0; r < 6; r += 2 ) { + ROUND_BIG(r, d_alpha_n); + ROUND_BIG(r+1, d_alpha_n); + } + T_BIG; + } + + // precomputed for 64 bytes blocks ? 
+ tp = &d_T512[0][0] + 112; + m0 = tp[ 0]; m1 = tp[ 1]; + m2 = tp[ 2]; m3 = tp[ 3]; + m4 = tp[ 4]; m5 = tp[ 5]; + m6 = tp[ 6]; m7 = tp[ 7]; + m8 = tp[ 8]; m9 = tp[ 9]; + mA = tp[10]; mB = tp[11]; + mC = tp[12]; mD = tp[13]; + mE = tp[14]; mF = tp[15]; + + for( int r = 0; r < 6; r += 2 ) { + ROUND_BIG(r, d_alpha_n); + ROUND_BIG(r+1, d_alpha_n); + } + T_BIG; + + tp = &d_T512[0][0] + 784; + m0 = tp[ 0]; m1 = tp[ 1]; + m2 = tp[ 2]; m3 = tp[ 3]; + m4 = tp[ 4]; m5 = tp[ 5]; + m6 = tp[ 6]; m7 = tp[ 7]; + m8 = tp[ 8]; m9 = tp[ 9]; + mA = tp[10]; mB = tp[11]; + mC = tp[12]; mD = tp[13]; + mE = tp[14]; mF = tp[15]; + + for( int r = 0; r < 12; r += 2 ) { + ROUND_BIG(r, d_alpha_f); + ROUND_BIG(r+1, d_alpha_f); + } + T_BIG; + + #pragma unroll 16 + for (int i = 0; i < 16; i++) + Hash[i] = cuda_swab32(h[i]); } +} +__host__ +void x13_hamsi512_cpu_init(int thr_id, uint32_t threads) +{ + cudaMemcpyToSymbol(d_alpha_n, alpha_n, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(d_alpha_f, alpha_f, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_T512, T512, sizeof(uint32_t)*64*16, 0, cudaMemcpyHostToDevice)); +} -/***************************************************/ -// Die Hash-Funktion -__global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__host__ +void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; - unsigned char *h1 = (unsigned char *)Hash; - - uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265); - uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c); - uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48); - uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d); - uint32_t m0, m1, m2, m3, m4, m5, m6, m7; - uint32_t m8, m9, mA, mB, mC, mD, mE, mF; - uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; - -#define buf(u) (h1[i+u]) - #pragma unroll 8 - for(int i = 0; i < 64; i += 8) { - INPUT_BIG; - P_BIG; - T_BIG; - } -#undef buf -#define buf(u) (u == 0 ? 0x80 : 0) - INPUT_BIG; - P_BIG; - T_BIG; -#undef buf -#define buf(u) (u == 6 ? 
2 : 0) - INPUT_BIG; - PF_BIG; - T_BIG; + const uint32_t threadsperblock = 128; - #pragma unroll 16 - for (int i = 0; i < 16; i++) - Hash[i] = SWAB32(h[i]); - } + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x13_hamsi512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + //MyStreamSynchronize(NULL, order, thr_id); } +__constant__ static uint64_t c_PaddedMessage80[10]; -__host__ void x13_hamsi512_cpu_init(int thr_id, int threads) +__host__ +void x16_hamsi512_setBlock_80(void *pdata) { - cudaMemcpyToSymbol( d_alpha_n, alpha_n, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol( d_alpha_f, alpha_f, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol( d_T512, T512, sizeof(uint32_t)*64*16, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); } -__host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__global__ +void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) { - const int threadsperblock = 256; + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + unsigned char h1[80]; + #pragma unroll + for (int i = 0; i < 10; i++) + ((uint2*)h1)[i] = ((uint2*)c_PaddedMessage80)[i]; + //((uint64_t*)h1)[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(startNonce + thread)); + ((uint32_t*)h1)[19] = cuda_swab32(startNonce + thread); + + uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265; + uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c; + uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48; + uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d; + uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF; + uint32_t *tp, db, dm; + + for(int i = 0; i < 80; i += 8) + { + m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; + m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; + tp = &d_T512[0][0]; + + #pragma unroll + for (int u = 0; u < 8; u++) { + db = h1[i + u]; + #pragma unroll 2 + for (int v = 0; v < 8; v++, db >>= 1) { + dm = -(uint32_t)(db & 1); + m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; + m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; + m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; + m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; + m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; + mA ^= dm & tp[10]; mB ^= dm & tp[11]; + mC ^= dm & tp[12]; mD ^= dm & tp[13]; + mE ^= dm & tp[14]; mF ^= dm & tp[15]; + tp += 16; + } + } + + #pragma unroll + for (int r = 0; r < 6; r++) { + ROUND_BIG(r, d_alpha_n); + } + T_BIG; + } + + #define INPUT_BIG { \ + m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; \ + m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; \ + tp = &d_T512[0][0]; \ + for (int u = 0; u < 8; u++) { \ + db = endtag[u]; \ + for (int v = 0; v < 8; v++, db >>= 1) { \ + dm = -(uint32_t)(db & 1); \ + m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; \ + m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; \ + m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; \ + m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; \ + m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; \ + mA ^= dm & tp[10]; mB ^= dm & tp[11]; \ + mC ^= dm & tp[12]; mD ^= dm & tp[13]; \ + mE ^= dm & tp[14]; mF ^= dm & tp[15]; \ + tp += 16; \ + } \ + 
} \ + } + + // close + uint8_t endtag[8] = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + INPUT_BIG; + + #pragma unroll + for (int r = 0; r < 6; r++) { + ROUND_BIG(r, d_alpha_n); + } + T_BIG; + + endtag[0] = endtag[1] = 0x00; + endtag[6] = 0x02; + endtag[7] = 0x80; + INPUT_BIG; + + // PF_BIG + #pragma unroll + for(int r = 0; r < 12; r++) { + ROUND_BIG(r, d_alpha_f); + } + T_BIG; + + uint64_t hashPosition = thread; + uint32_t *Hash = (uint32_t*)&g_hash[hashPosition << 3]; + #pragma unroll 16 + for(int i = 0; i < 16; i++) + Hash[i] = cuda_swab32(h[i]); - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + #undef INPUT_BIG + } +} - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; +__host__ +void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 128; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - x13_hamsi512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + x16_hamsi512_gpu_hash_80 <<>> (threads, startNounce, (uint64_t*)d_hash); } - diff --git a/x13/hsr.cu b/x13/hsr.cu new file mode 100644 index 0000000000..e86444628d --- /dev/null +++ b/x13/hsr.cu @@ -0,0 +1,265 @@ +/* + * X13 algorithm + */ +extern "C" +{ +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" + +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +} +#include "sm3.h" + +#include "miner.h" + +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; + +extern void sm3_cuda_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order); + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +// HSR CPU Hash +extern "C" void hsr_hash(void *output, const void *input) +{ + // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13 + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sm3_ctx_t ctx_sm3; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + + uint32_t hash[32]; + memset(hash, 0, sizeof hash); + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*) hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, (const void*) hash, 
64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, (const void*) hash, 64); + sph_luffa512_close (&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, (void*) hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + sm3_init(&ctx_sm3); + sm3_update(&ctx_sm3, (const unsigned char*) hash, 64); + memset(hash, 0, sizeof hash); + sm3_close(&ctx_sm3, (void*) hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, (const void*) hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*) hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, (const void*) hash, 64); + sph_fugue512_close(&ctx_fugue, (void*) hash); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_hsr(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = 19; // (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 
20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x000f; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput) != 0) { + return 0; + } + x11_echo512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + sm3_cuda_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + CUDA_LOG_ERROR(); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + hsr_hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + hsr_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + 
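+				// nonces[0] was re-hashed and checked on CPU above; any second nonce was re-hashed for its ratio, and pdata[19] already points past the highest accepted nonce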
return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + CUDA_LOG_ERROR(); + + return 0; +} + +// cleanup +extern "C" void free_hsr(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + CUDA_LOG_ERROR(); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x13/sm3.c b/x13/sm3.c new file mode 100644 index 0000000000..295ba15086 --- /dev/null +++ b/x13/sm3.c @@ -0,0 +1,220 @@ +/* ==================================================================== + * Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the GmSSL Project. + * (http://gmssl.org/)" + * + * 4. The name "GmSSL Project" must not be used to endorse or promote + * products derived from this software without prior written + * permission. For written permission, please contact + * guanzhi1980@gmail.com. + * + * 5. Products derived from this software may not be called "GmSSL" + * nor may "GmSSL" appear in their names without prior written + * permission of the GmSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the GmSSL Project + * (http://gmssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ==================================================================== + */ + +#include + +#include "sm3.h" + +void sm3_init(sm3_ctx_t *ctx) +{ + ctx->digest[0] = 0x7380166F; + ctx->digest[1] = 0x4914B2B9; + ctx->digest[2] = 0x172442D7; + ctx->digest[3] = 0xDA8A0600; + ctx->digest[4] = 0xA96F30BC; + ctx->digest[5] = 0x163138AA; + ctx->digest[6] = 0xE38DEE4D; + ctx->digest[7] = 0xB0FB0E4E; + + ctx->nblocks = 0; + ctx->num = 0; +} + +void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len) +{ + if (ctx->num) { + unsigned int left = SM3_BLOCK_SIZE - ctx->num; + if (data_len < left) { + memcpy(ctx->block + ctx->num, data, data_len); + ctx->num += data_len; + return; + } else { + memcpy(ctx->block + ctx->num, data, left); + sm3_compress(ctx->digest, ctx->block); + ctx->nblocks++; + data += left; + data_len -= left; + } + } + while (data_len >= SM3_BLOCK_SIZE) { + sm3_compress(ctx->digest, data); + ctx->nblocks++; + data += SM3_BLOCK_SIZE; + data_len -= SM3_BLOCK_SIZE; + } + ctx->num = data_len; + if (data_len) { + memcpy(ctx->block, data, data_len); + } +} + +void sm3_close(void *cc, void *dst) +{ + sm3_final(cc, dst); + memset(cc, 0, sizeof(sm3_ctx_t)); +} + +void sm3_final(sm3_ctx_t *ctx, unsigned char *digest) +{ + int i; + uint32_t *pdigest = (uint32_t *)digest; + uint32_t *count = (uint32_t *)(ctx->block + SM3_BLOCK_SIZE - 8); + + ctx->block[ctx->num] = 0x80; + + if (ctx->num + 9 <= SM3_BLOCK_SIZE) { + memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 9); + } else { + memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 1); + sm3_compress(ctx->digest, ctx->block); + memset(ctx->block, 0, SM3_BLOCK_SIZE - 8); + } + + count[0] = cpu_to_be32((ctx->nblocks) >> 23); + count[1] = cpu_to_be32((ctx->nblocks << 9) + (ctx->num << 3)); + + sm3_compress(ctx->digest, ctx->block); + for (i = 0; i < sizeof(ctx->digest)/sizeof(ctx->digest[0]); i++) { + pdigest[i] = cpu_to_be32(ctx->digest[i]); + } +} + +#define ROTATELEFT(X,n) (((X)<<(n)) | ((X)>>(32-(n)))) + +#define P0(x) ((x) ^ ROTATELEFT((x),9) ^ ROTATELEFT((x),17)) +#define P1(x) ((x) ^ ROTATELEFT((x),15) ^ ROTATELEFT((x),23)) + +#define FF0(x,y,z) ( (x) ^ (y) ^ (z)) +#define FF1(x,y,z) (((x) & (y)) | ( (x) & (z)) | ( (y) & (z))) + +#define GG0(x,y,z) ( (x) ^ (y) ^ (z)) +#define GG1(x,y,z) (((x) & (y)) | ( (~(x)) & (z)) ) + + +void sm3_compress(uint32_t digest[8], const unsigned char block[64]) +{ + int j; + uint32_t W[68], W1[64]; + const uint32_t *pblock = (const uint32_t *)block; + + uint32_t A = digest[0]; + uint32_t B = digest[1]; + uint32_t C = digest[2]; + uint32_t D = digest[3]; + uint32_t E = digest[4]; + uint32_t F = digest[5]; + uint32_t G = digest[6]; + uint32_t H = digest[7]; + uint32_t SS1,SS2,TT1,TT2,T[64]; + + for (j = 0; j < 16; j++) { + W[j] = cpu_to_be32(pblock[j]); + } + for (j = 16; j < 68; j++) { + W[j] = P1( W[j-16] ^ W[j-9] ^ ROTATELEFT(W[j-3],15)) ^ ROTATELEFT(W[j - 13],7 ) ^ W[j-6];; + } + for( j = 0; j < 64; j++) { + W1[j] = W[j] ^ W[j+4]; + } + + for(j =0; j < 16; j++) { + + T[j] = 0x79CC4519; + SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7); + SS2 = SS1 ^ ROTATELEFT(A,12); + TT1 = FF0(A,B,C) + D + SS2 + W1[j]; + TT2 = GG0(E,F,G) + H + SS1 + W[j]; + D = C; + C = ROTATELEFT(B,9); + B = A; + A = TT1; + H = G; + G = ROTATELEFT(F,19); + F = E; + E = P0(TT2); + } + + for(j =16; j < 64; j++) { + + T[j] = 0x7A879D8A; + SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7); + SS2 = SS1 ^ ROTATELEFT(A,12); + TT1 = FF1(A,B,C) + D + SS2 + W1[j]; + TT2 = 
GG1(E,F,G) + H + SS1 + W[j]; + D = C; + C = ROTATELEFT(B,9); + B = A; + A = TT1; + H = G; + G = ROTATELEFT(F,19); + F = E; + E = P0(TT2); + } + + digest[0] ^= A; + digest[1] ^= B; + digest[2] ^= C; + digest[3] ^= D; + digest[4] ^= E; + digest[5] ^= F; + digest[6] ^= G; + digest[7] ^= H; +} + +void sm3(const unsigned char *msg, size_t msglen, + unsigned char dgst[SM3_DIGEST_LENGTH]) +{ + sm3_ctx_t ctx; + + sm3_init(&ctx); + sm3_update(&ctx, msg, msglen); + sm3_final(&ctx, dgst); + + memset(&ctx, 0, sizeof(sm3_ctx_t)); +} diff --git a/x13/sm3.h b/x13/sm3.h new file mode 100644 index 0000000000..05c6595d98 --- /dev/null +++ b/x13/sm3.h @@ -0,0 +1,109 @@ +/* ==================================================================== + * Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved. + * Copyright (c) 2017 - YiiMP (cleaned hmac dead stuff) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the GmSSL Project. + * (http://gmssl.org/)" + * + * 4. The name "GmSSL Project" must not be used to endorse or promote + * products derived from this software without prior written + * permission. For written permission, please contact + * guanzhi1980@gmail.com. + * + * 5. Products derived from this software may not be called "GmSSL" + * nor may "GmSSL" appear in their names without prior written + * permission of the GmSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the GmSSL Project + * (http://gmssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ==================================================================== + */ + +#ifndef _SM3_H +#define _SM3_H + +#define SM3_DIGEST_LENGTH 32 +#define SM3_BLOCK_SIZE 64 +#define SM3_CBLOCK (SM3_BLOCK_SIZE) +#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH) + + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef struct { + uint32_t digest[8]; + int nblocks; + unsigned char block[64]; + int num; +} sm3_ctx_t; + +void sm3_init(sm3_ctx_t *ctx); +void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len); +void sm3_close(void *cc, void *dst); + +void sm3_final(sm3_ctx_t *ctx, unsigned char digest[SM3_DIGEST_LENGTH]); +void sm3_compress(uint32_t digest[8], const unsigned char block[SM3_BLOCK_SIZE]); +void sm3(const unsigned char *data, size_t datalen, + unsigned char digest[SM3_DIGEST_LENGTH]); + +#ifdef CPU_BIGENDIAN + +#define cpu_to_be16(v) (v) +#define cpu_to_be32(v) (v) +#define be16_to_cpu(v) (v) +#define be32_to_cpu(v) (v) + +#else + +#define cpu_to_le16(v) (v) +#define cpu_to_le32(v) (v) +#define le16_to_cpu(v) (v) +#define le32_to_cpu(v) (v) + +#define cpu_to_be16(v) (((v)<< 8) | ((v)>>8)) +#define cpu_to_be32(v) (((v)>>24) | (((v)>>8)&0xff00) | (((v)<<8)&0xff0000) | ((v)<<24)) +#define be16_to_cpu(v) cpu_to_be16(v) +#define be32_to_cpu(v) cpu_to_be32(v) + +#endif + +#ifdef __cplusplus +} +#endif +#endif diff --git a/x13/x13.cu b/x13/x13.cu index c37c79d1e6..0f5d88c394 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -1,8 +1,6 @@ /* - * X13 algorithm built on cbuchner1's original X11 - * + * X13 algorithm */ - extern "C" { #include "sph/sph_blake.h" @@ -20,265 +18,237 @@ extern "C" #include "sph/sph_hamsi.h" #include "sph/sph_fugue.h" - -#include "miner.h" } +#include "miner.h" -// aus cpu-miner.c -extern int device_map[8]; - -// Speicher für Input/Output der verketteten Hashfunktionen -static uint32_t *d_hash[8]; - -extern void quark_blake512_cpu_init(int thr_id, int threads); -extern void quark_blake512_cpu_setBlock_80(void *pdata); -extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_bmw512_cpu_init(int thr_id, int threads); -extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_groestl512_cpu_init(int thr_id, int threads); -extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -//extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_skein512_cpu_init(int thr_id, int threads); -extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_keccak512_cpu_init(int thr_id, int threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void quark_jh512_cpu_init(int thr_id, int threads); -extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x11_luffa512_cpu_init(int thr_id, int threads); -extern void 
x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x11_cubehash512_cpu_init(int thr_id, int threads); -extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x11_shavite512_cpu_init(int thr_id, int threads); -extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x11_simd512_cpu_init(int thr_id, int threads); -extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x11_echo512_cpu_init(int thr_id, int threads); -extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - -extern void x13_hamsi512_cpu_init(int thr_id, int threads); -extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +#include "cuda_helper.h" +#include "x11/cuda_x11.h" -extern void x13_fugue512_cpu_init(int thr_id, int threads); -extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +static uint32_t *d_hash[MAX_GPUS]; -extern void quark_check_cpu_init(int thr_id, int threads); -extern void quark_check_cpu_setTarget(const void *ptarget); -extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_compactTest_cpu_init(int thr_id, int threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, - int order); +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); -// X13 Hashfunktion -inline void x13hash(void *state, const void *input) +// X13 CPU Hash +extern "C" void x13hash(void *output, const void *input) { - // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13 - - sph_blake512_context ctx_blake; - sph_bmw512_context ctx_bmw; - sph_groestl512_context ctx_groestl; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - sph_luffa512_context ctx_luffa; - sph_cubehash512_context ctx_cubehash; - sph_shavite512_context ctx_shavite; - sph_simd512_context ctx_simd; - sph_echo512_context ctx_echo; - sph_hamsi512_context ctx_hamsi; - sph_fugue512_context ctx_fugue; - - uint32_t hash[16]; - - sph_blake512_init(&ctx_blake); - // ZBLAKE; - sph_blake512 (&ctx_blake, input, 80); - sph_blake512_close(&ctx_blake, (void*) hash); - - sph_bmw512_init(&ctx_bmw); - // ZBMW; - sph_bmw512 (&ctx_bmw, (const void*) hash, 64); - sph_bmw512_close(&ctx_bmw, (void*) hash); - - sph_groestl512_init(&ctx_groestl); - // ZGROESTL; - sph_groestl512 (&ctx_groestl, (const void*) hash, 64); - 
sph_groestl512_close(&ctx_groestl, (void*) hash); - - sph_skein512_init(&ctx_skein); - // ZSKEIN; - sph_skein512 (&ctx_skein, (const void*) hash, 64); - sph_skein512_close(&ctx_skein, (void*) hash); - - sph_jh512_init(&ctx_jh); - // ZJH; - sph_jh512 (&ctx_jh, (const void*) hash, 64); - sph_jh512_close(&ctx_jh, (void*) hash); - - sph_keccak512_init(&ctx_keccak); - // ZKECCAK; - sph_keccak512 (&ctx_keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx_keccak, (void*) hash); - - sph_luffa512_init(&ctx_luffa); - // ZLUFFA; - sph_luffa512 (&ctx_luffa, (const void*) hash, 64); - sph_luffa512_close (&ctx_luffa, (void*) hash); - - sph_cubehash512_init(&ctx_cubehash); - // ZCUBEHASH; - sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); - sph_cubehash512_close(&ctx_cubehash, (void*) hash); - - sph_shavite512_init(&ctx_shavite); - // ZSHAVITE; - sph_shavite512 (&ctx_shavite, (const void*) hash, 64); - sph_shavite512_close(&ctx_shavite, (void*) hash); - - sph_simd512_init(&ctx_simd); - // ZSIMD - sph_simd512 (&ctx_simd, (const void*) hash, 64); - sph_simd512_close(&ctx_simd, (void*) hash); - - sph_echo512_init(&ctx_echo); - // ZECHO - sph_echo512 (&ctx_echo, (const void*) hash, 64); - sph_echo512_close(&ctx_echo, (void*) hash); - - sph_hamsi512_init(&ctx_hamsi); - sph_hamsi512 (&ctx_hamsi, (const void*) hash, 64); - sph_hamsi512_close(&ctx_hamsi, (void*) hash); - - sph_fugue512_init(&ctx_fugue); - sph_fugue512 (&ctx_fugue, (const void*) hash, 64); - sph_fugue512_close(&ctx_fugue, (void*) hash); - - memcpy(state, hash, 32); + // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13 + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + + uint32_t hash[32]; + memset(hash, 0, sizeof hash); + + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*) hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512 (&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, (const void*) hash, 64); + sph_luffa512_close (&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512 (&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, (void*) hash); + + sph_simd512_init(&ctx_simd); + sph_simd512 (&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512 
(&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512 (&ctx_hamsi, (const void*) hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*) hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512 (&ctx_fugue, (const void*) hash, 64); + sph_fugue512_close(&ctx_fugue, (void*) hash); + + memcpy(output, hash, 32); } +static bool init[MAX_GPUS] = { 0 }; -extern bool opt_benchmark; - -extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + int intensity = 19; // (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - const uint32_t Htarg = ptarget[7]; - - const int throughput = 256*256*8; + ((uint32_t*)ptarget)[7] = 0x000f; - static bool init[8] = {0,0,0,0,0,0,0,0}; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); - // Konstanten kopieren, Speicher belegen - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); quark_blake512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput); - x11_luffa512_cpu_init(thr_id, throughput); - x11_cubehash512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); - x11_simd512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput) != 0) { + return 0; + } x11_echo512_cpu_init(thr_id, throughput); x13_hamsi512_cpu_init(thr_id, throughput); x13_fugue512_cpu_init(thr_id, throughput); - quark_check_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; } - //unsigned char echobefore[64], echoafter[64]; - - uint32_t endiandata[20]; + uint32_t endiandata[20]; for (int k=0; k < 20; k++) - be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80((void*)endiandata); - quark_check_cpu_setTarget(ptarget); + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); do { int order = 0; - // erstes Blake512 Hash mit CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - // das ist der unbedingte Branch für BMW512 + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Groestl512 quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // 
das ist der unbedingte Branch für Skein512 quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für JH512 quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Keccak512 quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Luffa512 - x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Cubehash512 - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für Shavite512 + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für SIMD512 x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - - // das ist der unbedingte Branch für ECHO512 x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; - x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + CUDA_LOG_ERROR(); - // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) { - uint32_t vhash64[8]; - be32enc(&endiandata[19], foundNonce); - x13hash(vhash64, endiandata); - - if( (vhash64[7]<=Htarg) && fulltest(vhash64, ptarget) ) { - - pdata[19] = foundNonce; - *hashes_done = foundNonce - first_nonce + 1; - return 1; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x13hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x13hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + CUDA_LOG_ERROR(); - *hashes_done = pdata[19] - first_nonce + 1; return 0; } + +// cleanup +extern "C" void free_x13(int thr_id) +{ + if (!init[thr_id]) + return; + + 
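+	// wait for any in-flight kernels before freeing the per-thread hash buffer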
cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + CUDA_LOG_ERROR(); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x15/cuda_whirlpool_tables.cuh b/x15/cuda_whirlpool_tables.cuh new file mode 100644 index 0000000000..0690eb5933 --- /dev/null +++ b/x15/cuda_whirlpool_tables.cuh @@ -0,0 +1,190 @@ +/* CUDA whirlpool huge constant tables */ + +static const uint64_t old1_T0[256] = { + 0x78D8C07818281818, 0xAF2605AF23652323, 0xF9B87EF9C657C6C6, 0x6FFB136FE825E8E8, + 0xA1CB4CA187948787, 0x6211A962B8D5B8B8, 0x0509080501030101, 0x6E0D426E4FD14F4F, + 0xEE9BADEE365A3636, 0x04FF5904A6F7A6A6, 0xBD0CDEBDD26BD2D2, 0x060EFB06F502F5F5, + 0x8096EF80798B7979, 0xCE305FCE6FB16F6F, 0xEF6DFCEF91AE9191, 0x07F8AA0752F65252, + 0xFD4727FD60A06060, 0x76358976BCD9BCBC, 0xCD37ACCD9BB09B9B, 0x8C8A048C8E8F8E8E, + 0x15D27115A3F8A3A3, 0x3C6C603C0C140C0C, 0x8A84FF8A7B8D7B7B, 0xE180B5E1355F3535, + 0x69F5E8691D271D1D, 0x47B35347E03DE0E0, 0xAC21F6ACD764D7D7, 0xED9C5EEDC25BC2C2, + 0x96436D962E722E2E, 0x7A29627A4BDD4B4B, 0x215DA321FE1FFEFE, 0x16D5821657F95757, + 0x41BDA841153F1515, 0xB6E89FB677997777, 0xEB92A5EB37593737, 0x569E7B56E532E5E5, + 0xD9138CD99FBC9F9F, 0x1723D317F00DF0F0, 0x7F206A7F4ADE4A4A, 0x95449E95DA73DADA, + 0x25A2FA2558E85858, 0xCACF06CAC946C9C9, 0x8D7C558D297B2929, 0x225A50220A1E0A0A, + 0x4F50E14FB1CEB1B1, 0x1AC9691AA0FDA0A0, 0xDA147FDA6BBD6B6B, 0xABD95CAB85928585, + 0x733C8173BDDABDBD, 0x348FD2345DE75D5D, 0x5090805010301010, 0x0307F303F401F4F4, + 0xC0DD16C0CB40CBCB, 0xC6D3EDC63E423E3E, 0x112D2811050F0505, 0xE6781FE667A96767, + 0x53977353E431E4E4, 0xBB0225BB27692727, 0x5873325841C34141, 0x9DA72C9D8B808B8B, + 0x01F65101A7F4A7A7, 0x94B2CF947D877D7D, 0xFB49DCFB95A29595, 0x9F568E9FD875D8D8, + 0x30708B30FB10FBFB, 0x71CD2371EE2FEEEE, 0x91BBC7917C847C7C, 0xE37117E366AA6666, + 0x8E7BA68EDD7ADDDD, 0x4BAFB84B17391717, 0x4645024647C94747, 0xDC1A84DC9EBF9E9E, + 0xC5D41EC5CA43CACA, 0x995875992D772D2D, 0x792E9179BFDCBFBF, 0x1B3F381B07090707, + 0x23AC0123ADEAADAD, 0x2FB0EA2F5AEE5A5A, 0xB5EF6CB583988383, 0xFFB685FF33553333, + 0xF25C3FF263A56363, 0x0A12100A02060202, 0x38933938AAE3AAAA, 0xA8DEAFA871937171, + 0xCFC60ECFC845C8C8, 0x7DD1C87D192B1919, 0x703B727049DB4949, 0x9A5F869AD976D9D9, + 0x1D31C31DF20BF2F2, 0x48A84B48E338E3E3, 0x2AB9E22A5BED5B5B, 0x92BC349288858888, + 0xC83EA4C89AB39A9A, 0xBE0B2DBE266A2626, 0xFABF8DFA32563232, 0x4A59E94AB0CDB0B0, + 0x6AF21B6AE926E9E9, 0x337778330F110F0F, 0xA633E6A6D562D5D5, 0xBAF474BA809D8080, + 0x7C27997CBEDFBEBE, 0xDEEB26DECD4ACDCD, 0xE489BDE4345C3434, 0x75327A7548D84848, + 0x2454AB24FF1CFFFF, 0x8F8DF78F7A8E7A7A, 0xEA64F4EA90AD9090, 0x3E9DC23E5FE15F5F, + 0xA03D1DA020602020, 0xD50F67D568B86868, 0x72CAD0721A2E1A1A, 0x2CB7192CAEEFAEAE, + 0x5E7DC95EB4C1B4B4, 0x19CE9A1954FC5454, 0xE57FECE593A89393, 0xAA2F0DAA22662222, + 0xE96307E964AC6464, 0x122ADB12F10EF1F1, 0xA2CCBFA273957373, 0x5A82905A12361212, + 0x5D7A3A5D40C04040, 0x2848402808180808, 0xE89556E8C358C3C3, 0x7BDF337BEC29ECEC, + 0x904D9690DB70DBDB, 0x1FC0611FA1FEA1A1, 0x83911C838D8A8D8D, 0xC9C8F5C93D473D3D, + 0xF15BCCF197A49797, 0x0000000000000000, 0xD4F936D4CF4CCFCF, 0x876E45872B7D2B2B, + 0xB3E197B3769A7676, 0xB0E664B0829B8282, 0xA928FEA9D667D6D6, 0x77C3D8771B2D1B1B, + 0x5B74C15BB5C2B5B5, 0x29BE1129AFECAFAF, 0xDF1D77DF6ABE6A6A, 0x0DEABA0D50F05050, + 0x4C57124C45CF4545, 0x1838CB18F308F3F3, 0xF0AD9DF030503030, 0x74C42B74EF2CEFEF, + 0xC3DAE5C33F413F3F, 
0x1CC7921C55FF5555, 0x10DB7910A2FBA2A2, 0x65E90365EA23EAEA, + 0xEC6A0FEC65AF6565, 0x6803B968BAD3BABA, 0x934A65932F712F2F, 0xE78E4EE7C05DC0C0, + 0x8160BE81DE7FDEDE, 0x6CFCE06C1C241C1C, 0x2E46BB2EFD1AFDFD, 0x641F52644DD74D4D, + 0xE076E4E092AB9292, 0xBCFA8FBC759F7575, 0x1E36301E060A0606, 0x98AE24988A838A8A, + 0x404BF940B2CBB2B2, 0x59856359E637E6E6, 0x367E70360E120E0E, 0x63E7F8631F211F1F, + 0xF75537F762A66262, 0xA33AEEA3D461D4D4, 0x32812932A8E5A8A8, 0xF452C4F496A79696, + 0x3A629B3AF916F9F9, 0xF6A366F6C552C5C5, 0xB11035B1256F2525, 0x20ABF22059EB5959, + 0xAED054AE84918484, 0xA7C5B7A772967272, 0xDDECD5DD394B3939, 0x61165A614CD44C4C, + 0x3B94CA3B5EE25E5E, 0x859FE78578887878, 0xD8E5DDD838483838, 0x869814868C898C8C, + 0xB217C6B2D16ED1D1, 0x0BE4410BA5F2A5A5, 0x4DA1434DE23BE2E2, 0xF84E2FF861A36161, + 0x4542F145B3C8B3B3, 0xA53415A521632121, 0xD60894D69CB99C9C, 0x66EEF0661E221E1E, + 0x5261225243C54343, 0xFCB176FCC754C7C7, 0x2B4FB32BFC19FCFC, 0x14242014040C0404, + 0x08E3B20851F35151, 0xC725BCC799B69999, 0xC4224FC46DB76D6D, 0x396568390D170D0D, + 0x35798335FA13FAFA, 0x8469B684DF7CDFDF, 0x9BA9D79B7E827E7E, 0xB4193DB4246C2424, + 0xD7FEC5D73B4D3B3B, 0x3D9A313DABE0ABAB, 0xD1F03ED1CE4FCECE, 0x5599885511331111, + 0x89830C898F8C8F8F, 0x6B044A6B4ED24E4E, 0x5166D151B7C4B7B7, 0x60E00B60EB20EBEB, + 0xCCC1FDCC3C443C3C, 0xBFFD7CBF819E8181, 0xFE40D4FE94A19494, 0x0C1CEB0CF704F7F7, + 0x6718A167B9D6B9B9, 0x5F8B985F13351313, 0x9C517D9C2C742C2C, 0xB805D6B8D368D3D3, + 0x5C8C6B5CE734E7E7, 0xCB3957CB6EB26E6E, 0xF3AA6EF3C451C4C4, 0x0F1B180F03050303, + 0x13DC8A1356FA5656, 0x495E1A4944CC4444, 0x9EA0DF9E7F817F7F, 0x37882137A9E6A9A9, + 0x82674D822A7E2A2A, 0x6D0AB16DBBD0BBBB, 0xE28746E2C15EC1C1, 0x02F1A20253F55353, + 0x8B72AE8BDC79DCDC, 0x275358270B1D0B0B, 0xD3019CD39DBA9D9D, 0xC12B47C16CB46C6C, + 0xF5A495F531533131, 0xB9F387B9749C7474, 0x0915E309F607F6F6, 0x434C0A4346CA4646, + 0x26A50926ACE9ACAC, 0x97B53C9789868989, 0x44B4A044143C1414, 0x42BA5B42E13EE1E1, + 0x4EA6B04E163A1616, 0xD2F7CDD23A4E3A3A, 0xD0066FD069BB6969, 0x2D41482D091B0909, + 0xADD7A7AD70907070, 0x546FD954B6C7B6B6, 0xB71ECEB7D06DD0D0, 0x7ED63B7EED2AEDED, + 0xDBE22EDBCC49CCCC, 0x57682A5742C64242, 0xC22CB4C298B59898, 0x0EED490EA4F1A4A4, + 0x88755D8828782828, 0x3186DA315CE45C5C, 0x3F6B933FF815F8F8, 0xA4C244A486978686 +}; + +static const uint64_t old1_RC[10] = { + 0x4F01B887E8C62318, 0x52916F79F5D2A636, 0x357B0CA38E9BBC60, 0x57FE4B2EC2D7E01D, 0xDA4AF09FE5377715, + 0x856BA0B10A29C958, 0x67053ECBF4105DBD, 0xD8957DA78B4127E4, 0x9E4717DD667CEEFB, 0x33835AAD07BF2DCA +}; + +static const uint64_t old1_precomputed_round_key_64[72] = { + 0xECE7FCC7F808AB3B, 0x44E9CB45024545CB, 0xB289A43CA4A489FE, 0xC5E1F3E1E1C5A9A0, + 0xFCD4FCFCAC5C48AC, 0x418F8F0E90F70E8F, 0x7979078514077946, 0xF8B87868B8F8D8F8, + 0xE4B6012A17B9C1EF, 0xAFF6AD2866D6C68D, 0xB0C745991504968F, 0x74E93F120FE2E675, + 0xC1D216198D2A495B, 0x35B6A53DF6716FD7, 0xD6983228CCDC354F, 0xC3B8BFCB242AB159, + 0x4AC0340260B548D4, 0x3B3582BB4F9BB769, 0xD8683C4AF17C46F8, 0x9EE05220D8214E61, + 0xFED2431F5098E0E5, 0x387AF50F0776E24F, 0x7FFD300A74DE5AE1, 0xB753981921AEB24B, + 0x296F5919978BA0BD, 0x52CC9DD7031E59AC, 0xE5E646C9A5165AE3, 0x4F848BDF70532817, + 0xA01ADD556B693C51, 0xC7268B83DDB75E97, 0xCB908F9316E2C076, 0x8FC39EE0D946E9C5, + 0x5B256B0FE7937D86, 0xF77C51751A22932C, 0x2582A83CD34108C4, 0xE218CDE28A9C790D, + 0xC93A572E592E3594, 0xEA70CF71BC4E55AC, 0xC135C8214155BC85, 0x50989643A6B8456E, + 0x60E970DFD74C71E6, 0xC0B5A520AB7C88D8, 0x3A66AA761D5B1400, 0x0502AB3087300DE6, + 0xC06203EDED483DEA, 0xC2620EDF55C1CB74, 0xF36A22CF9AA452F1, 
0xB3502DC83BA2660F, + 0xCF249B6DB82273C3, 0xC74DADAB026388F2, 0xB8B77B3275AFCDE8, 0xC9947382C6D6A3C0, + 0x84938F2258A6BC21, 0x1E51E15A3B99CDF7, 0xC812F9AC41F5CC05, 0x21BFEC61E9B9393E, + 0xF576066160243540, 0x3A62D1CB6404180D, 0x8807A55C2AC7AFE2, 0x804237B54859503E, + 0x1619B3612106744B, 0xC1ECB5643D81C76F, 0xBA7CBB8C13214C6C, 0xD241AEAD7622701E, + 0xDD900A1B66BF748C, 0xCACCF665EC2391FE, 0xF9BED90100B89447, 0x4CF5D284E56B7A0F, + 0x003EB289B6993F96, 0xE9DB01146199245D, 0x97701270F3F41CCB, 0x9C8CA117E01E4B49 +}; + +//---------------------------------------------------------------------------------------------------------- + +static const uint64_t plain_T0[256] = { + 0xD83078C018601818, 0x2646AF05238C2323, 0xB891F97EC63FC6C6, 0xFBCD6F13E887E8E8, + 0xCB13A14C87268787, 0x116D62A9B8DAB8B8, 0x0902050801040101, 0x0D9E6E424F214F4F, + 0x9B6CEEAD36D83636, 0xFF510459A6A2A6A6, 0x0CB9BDDED26FD2D2, 0x0EF706FBF5F3F5F5, + 0x96F280EF79F97979, 0x30DECE5F6FA16F6F, 0x6D3FEFFC917E9191, 0xF8A407AA52555252, + 0x47C0FD27609D6060, 0x35657689BCCABCBC, 0x372BCDAC9B569B9B, 0x8A018C048E028E8E, + 0xD25B1571A3B6A3A3, 0x6C183C600C300C0C, 0x84F68AFF7BF17B7B, 0x806AE1B535D43535, + 0xF53A69E81D741D1D, 0xB3DD4753E0A7E0E0, 0x21B3ACF6D77BD7D7, 0x9C99ED5EC22FC2C2, + 0x435C966D2EB82E2E, 0x29967A624B314B4B, 0x5DE121A3FEDFFEFE, 0xD5AE168257415757, + 0xBD2A41A815541515, 0xE8EEB69F77C17777, 0x926EEBA537DC3737, 0x9ED7567BE5B3E5E5, + 0x1323D98C9F469F9F, 0x23FD17D3F0E7F0F0, 0x20947F6A4A354A4A, 0x44A9959EDA4FDADA, + 0xA2B025FA587D5858, 0xCF8FCA06C903C9C9, 0x7C528D5529A42929, 0x5A1422500A280A0A, + 0x507F4FE1B1FEB1B1, 0xC95D1A69A0BAA0A0, 0x14D6DA7F6BB16B6B, 0xD917AB5C852E8585, + 0x3C677381BDCEBDBD, 0x8FBA34D25D695D5D, 0x9020508010401010, 0x07F503F3F4F7F4F4, + 0xDD8BC016CB0BCBCB, 0xD37CC6ED3EF83E3E, 0x2D0A112805140505, 0x78CEE61F67816767, + 0x97D55373E4B7E4E4, 0x024EBB25279C2727, 0x7382583241194141, 0xA70B9D2C8B168B8B, + 0xF6530151A7A6A7A7, 0xB2FA94CF7DE97D7D, 0x4937FBDC956E9595, 0x56AD9F8ED847D8D8, + 0x70EB308BFBCBFBFB, 0xCDC17123EE9FEEEE, 0xBBF891C77CED7C7C, 0x71CCE31766856666, + 0x7BA78EA6DD53DDDD, 0xAF2E4BB8175C1717, 0x458E460247014747, 0x1A21DC849E429E9E, + 0xD489C51ECA0FCACA, 0x585A99752DB42D2D, 0x2E637991BFC6BFBF, 0x3F0E1B38071C0707, + 0xAC472301AD8EADAD, 0xB0B42FEA5A755A5A, 0xEF1BB56C83368383, 0xB666FF8533CC3333, + 0x5CC6F23F63916363, 0x12040A1002080202, 0x93493839AA92AAAA, 0xDEE2A8AF71D97171, + 0xC68DCF0EC807C8C8, 0xD1327DC819641919, 0x3B92707249394949, 0x5FAF9A86D943D9D9, + 0x31F91DC3F2EFF2F2, 0xA8DB484BE3ABE3E3, 0xB9B62AE25B715B5B, 0xBC0D9234881A8888, + 0x3E29C8A49A529A9A, 0x0B4CBE2D26982626, 0xBF64FA8D32C83232, 0x597D4AE9B0FAB0B0, + 0xF2CF6A1BE983E9E9, 0x771E33780F3C0F0F, 0x33B7A6E6D573D5D5, 0xF41DBA74803A8080, + 0x27617C99BEC2BEBE, 0xEB87DE26CD13CDCD, 0x8968E4BD34D03434, 0x3290757A483D4848, + 0x54E324ABFFDBFFFF, 0x8DF48FF77AF57A7A, 0x643DEAF4907A9090, 0x9DBE3EC25F615F5F, + 0x3D40A01D20802020, 0x0FD0D56768BD6868, 0xCA3472D01A681A1A, 0xB7412C19AE82AEAE, + 0x7D755EC9B4EAB4B4, 0xCEA8199A544D5454, 0x7F3BE5EC93769393, 0x2F44AA0D22882222, + 0x63C8E907648D6464, 0x2AFF12DBF1E3F1F1, 0xCCE6A2BF73D17373, 0x82245A9012481212, + 0x7A805D3A401D4040, 0x4810284008200808, 0x959BE856C32BC3C3, 0xDFC57B33EC97ECEC, + 0x4DAB9096DB4BDBDB, 0xC05F1F61A1BEA1A1, 0x9107831C8D0E8D8D, 0xC87AC9F53DF43D3D, + 0x5B33F1CC97669797, 0x0000000000000000, 0xF983D436CF1BCFCF, 0x6E5687452BAC2B2B, + 0xE1ECB39776C57676, 0xE619B06482328282, 0x28B1A9FED67FD6D6, 0xC33677D81B6C1B1B, + 0x74775BC1B5EEB5B5, 0xBE432911AF86AFAF, 0x1DD4DF776AB56A6A, 0xEAA00DBA505D5050, + 
0x578A4C1245094545, 0x38FB18CBF3EBF3F3, 0xAD60F09D30C03030, 0xC4C3742BEF9BEFEF, + 0xDA7EC3E53FFC3F3F, 0xC7AA1C9255495555, 0xDB591079A2B2A2A2, 0xE9C96503EA8FEAEA, + 0x6ACAEC0F65896565, 0x036968B9BAD2BABA, 0x4A5E93652FBC2F2F, 0x8E9DE74EC027C0C0, + 0x60A181BEDE5FDEDE, 0xFC386CE01C701C1C, 0x46E72EBBFDD3FDFD, 0x1F9A64524D294D4D, + 0x7639E0E492729292, 0xFAEABC8F75C97575, 0x360C1E3006180606, 0xAE0998248A128A8A, + 0x4B7940F9B2F2B2B2, 0x85D15963E6BFE6E6, 0x7E1C36700E380E0E, 0xE73E63F81F7C1F1F, + 0x55C4F73762956262, 0x3AB5A3EED477D4D4, 0x814D3229A89AA8A8, 0x5231F4C496629696, + 0x62EF3A9BF9C3F9F9, 0xA397F666C533C5C5, 0x104AB13525942525, 0xABB220F259795959, + 0xD015AE54842A8484, 0xC5E4A7B772D57272, 0xEC72DDD539E43939, 0x1698615A4C2D4C4C, + 0x94BC3BCA5E655E5E, 0x9FF085E778FD7878, 0xE570D8DD38E03838, 0x980586148C0A8C8C, + 0x17BFB2C6D163D1D1, 0xE4570B41A5AEA5A5, 0xA1D94D43E2AFE2E2, 0x4EC2F82F61996161, + 0x427B45F1B3F6B3B3, 0x3442A51521842121, 0x0825D6949C4A9C9C, 0xEE3C66F01E781E1E, + 0x6186522243114343, 0xB193FC76C73BC7C7, 0x4FE52BB3FCD7FCFC, 0x2408142004100404, + 0xE3A208B251595151, 0x252FC7BC995E9999, 0x22DAC44F6DA96D6D, 0x651A39680D340D0D, + 0x79E93583FACFFAFA, 0x69A384B6DF5BDFDF, 0xA9FC9BD77EE57E7E, 0x1948B43D24902424, + 0xFE76D7C53BEC3B3B, 0x9A4B3D31AB96ABAB, 0xF081D13ECE1FCECE, 0x9922558811441111, + 0x8303890C8F068F8F, 0x049C6B4A4E254E4E, 0x667351D1B7E6B7B7, 0xE0CB600BEB8BEBEB, + 0xC178CCFD3CF03C3C, 0xFD1FBF7C813E8181, 0x4035FED4946A9494, 0x1CF30CEBF7FBF7F7, + 0x186F67A1B9DEB9B9, 0x8B265F98134C1313, 0x51589C7D2CB02C2C, 0x05BBB8D6D36BD3D3, + 0x8CD35C6BE7BBE7E7, 0x39DCCB576EA56E6E, 0xAA95F36EC437C4C4, 0x1B060F18030C0303, + 0xDCAC138A56455656, 0x5E88491A440D4444, 0xA0FE9EDF7FE17F7F, 0x884F3721A99EA9A9, + 0x6754824D2AA82A2A, 0x0A6B6DB1BBD6BBBB, 0x879FE246C123C1C1, 0xF1A602A253515353, + 0x72A58BAEDC57DCDC, 0x531627580B2C0B0B, 0x0127D39C9D4E9D9D, 0x2BD8C1476CAD6C6C, + 0xA462F59531C43131, 0xF3E8B98774CD7474, 0x15F109E3F6FFF6F6, 0x4C8C430A46054646, + 0xA5452609AC8AACAC, 0xB50F973C891E8989, 0xB42844A014501414, 0xBADF425BE1A3E1E1, + 0xA62C4EB016581616, 0xF774D2CD3AE83A3A, 0x06D2D06F69B96969, 0x41122D4809240909, + 0xD7E0ADA770DD7070, 0x6F7154D9B6E2B6B6, 0x1EBDB7CED067D0D0, 0xD6C77E3BED93EDED, + 0xE285DB2ECC17CCCC, 0x6884572A42154242, 0x2C2DC2B4985A9898, 0xED550E49A4AAA4A4, + 0x7550885D28A02828, 0x86B831DA5C6D5C5C, 0x6BED3F93F8C7F8F8, 0xC211A44486228686 +}; + +static const uint64_t plain_RC[10] = { + 0x4F01B887E8C62318, 0x52916F79F5D2A636, 0x357B0CA38E9BBC60, 0x57FE4B2EC2D7E01D, 0xDA4AF09FE5377715, + 0x856BA0B10A29C958, 0x67053ECBF4105DBD, 0xD8957DA78B4127E4, 0x9E4717DD667CEEFB, 0x33835AAD07BF2DCA +}; + +static const uint64_t plain_precomputed_round_key_64[72] = { + 0x24AED1EAF889AB3B, 0xAFCBE94566454544, 0x89B2A4C5A4A4FE70, 0xA0E1CCE1E1A9FAC5, + 0xFCB8FCFC5CC0AC48, 0x698F8F90260EF78F, 0x797985D707147996, 0xF878C8B868F8A8F8, + 0x58704630DBBF19D3, 0xDB37CFAFD1235B29, 0x98AC958BC28A2C01, 0xA706B2C0B19E6381, + 0xDB09B2B07A605E44, 0x71BC8CBCCF2C5B73, 0xD3DDEDEF240967DC, 0x197D3BD7F03B8D7B, + 0x866511DEC1AABE38, 0x7F33874AD0F37C68, 0x57F0AD98DBFA37F3, 0xBC8D35EE5842E2C5, + 0x7E246E99E8F00911, 0x0134B010EDD6C501, 0xD3EC287BF152C9FB, 0x4027F1C70CDC5632, + 0x14CF9B9420A525AF, 0x4D53C4E3A92636C1, 0xE1F94077867D0FE6, 0x29066AE2BBE65D91, + 0x8D5EFE4CCC545A96, 0xA63A3262CB31E9BE, 0x476A849618597BB1, 0x31AF592736C9F0D4, + 0xB00B3725C0B5F9E2, 0xA5948416A2CB2B39, 0x148C34FACEF88A60, 0x19928C416437A57A, + 0x893F83FAA146F3B3, 0x7CCF0278483F4997, 0x238F001EBAE8ADDC, 0x3D32B0ED494F7792, + 0x2FFF4D7782634175, 0x00460355D038FAFF, 
0x61F3983E49027DBF, 0x0BCEE59AC260A8F4, + 0x279D5DEE445ADFC8, 0xA4007504555AF423, 0x8CE2F902121016B0, 0x1D33336829CD30AC, + 0x89AD846882F16B03, 0x637146D862C64099, 0x10C2194B173E434C, 0xC586FF4CD3CF9CE2, + 0x5326DF42A011FF21, 0x134BE46CCB008E1B, 0xCEB747A3F73B12A6, 0xCA33283B0E9018D9, + 0xF92C9A0A7A671CD0, 0xB2B6634A532F942A, 0xB4A8ACFE46224288, 0x5935583DC75C4A47, + 0xA16F5CA55D92A674, 0x395C73C48CE61777, 0xC61AEC530B3B2A08, 0x62E74D81EB58F62A, + 0x3ABCEE01B6489548, 0x818EED6BC66B0DA5, 0x755A2688CF3DCEE0, 0xE99CF6C0DB4A8CC2, + 0x1385717FD59CB754, 0x7B0B7D978A4B4143, 0x7A15F6DBBB351963, 0x27820137F64E7A6A +}; + diff --git a/x15/cuda_whirlpoolx.cu b/x15/cuda_whirlpoolx.cu new file mode 100644 index 0000000000..d5f94eb8a3 --- /dev/null +++ b/x15/cuda_whirlpoolx.cu @@ -0,0 +1,594 @@ +/* + * Built on cbuchner1's implementation, actual hashing code + * based on sphlib 3.0 + */ +#include +#include + +#define threadsPerBlock 1024 + +#include "cuda_helper.h" + +__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +__constant__ uint64_t c_xtra[8]; +__constant__ uint64_t c_tmp[72]; +__constant__ uint64_t pTarget[4]; + +static uint32_t *h_wxnounce[MAX_GPUS] = { 0 }; +static uint32_t *d_WXNonce[MAX_GPUS] = { 0 }; + +/** + * Whirlpool CUDA kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014 djm34 & tpruvot & SP & Provos Alexis + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * @author djm34 + * @author tpruvot + * @author SP + * @author Provos Alexis + */ + +__constant__ __align__(64) uint64_t mixTob0Tox[256]; + +const uint64_t plain_T0[256]= { + 0xD83078C018601818,0x2646AF05238C2323,0xB891F97EC63FC6C6,0xFBCD6F13E887E8E8,0xCB13A14C87268787,0x116D62A9B8DAB8B8,0x0902050801040101,0x0D9E6E424F214F4F,0x9B6CEEAD36D83636, + 0xFF510459A6A2A6A6,0x0CB9BDDED26FD2D2,0x0EF706FBF5F3F5F5,0x96F280EF79F97979,0x30DECE5F6FA16F6F,0x6D3FEFFC917E9191,0xF8A407AA52555252,0x47C0FD27609D6060,0x35657689BCCABCBC, + 0x372BCDAC9B569B9B,0x8A018C048E028E8E,0xD25B1571A3B6A3A3,0x6C183C600C300C0C,0x84F68AFF7BF17B7B,0x806AE1B535D43535,0xF53A69E81D741D1D,0xB3DD4753E0A7E0E0,0x21B3ACF6D77BD7D7, + 0x9C99ED5EC22FC2C2,0x435C966D2EB82E2E,0x29967A624B314B4B,0x5DE121A3FEDFFEFE,0xD5AE168257415757,0xBD2A41A815541515,0xE8EEB69F77C17777,0x926EEBA537DC3737,0x9ED7567BE5B3E5E5, + 0x1323D98C9F469F9F,0x23FD17D3F0E7F0F0,0x20947F6A4A354A4A,0x44A9959EDA4FDADA,0xA2B025FA587D5858,0xCF8FCA06C903C9C9,0x7C528D5529A42929,0x5A1422500A280A0A,0x507F4FE1B1FEB1B1, + 0xC95D1A69A0BAA0A0,0x14D6DA7F6BB16B6B,0xD917AB5C852E8585,0x3C677381BDCEBDBD,0x8FBA34D25D695D5D,0x9020508010401010,0x07F503F3F4F7F4F4,0xDD8BC016CB0BCBCB,0xD37CC6ED3EF83E3E, + 0x2D0A112805140505,0x78CEE61F67816767,0x97D55373E4B7E4E4,0x024EBB25279C2727,0x7382583241194141,0xA70B9D2C8B168B8B,0xF6530151A7A6A7A7,0xB2FA94CF7DE97D7D,0x4937FBDC956E9595, + 0x56AD9F8ED847D8D8,0x70EB308BFBCBFBFB,0xCDC17123EE9FEEEE,0xBBF891C77CED7C7C,0x71CCE31766856666,0x7BA78EA6DD53DDDD,0xAF2E4BB8175C1717,0x458E460247014747,0x1A21DC849E429E9E, + 0xD489C51ECA0FCACA,0x585A99752DB42D2D,0x2E637991BFC6BFBF,0x3F0E1B38071C0707,0xAC472301AD8EADAD,0xB0B42FEA5A755A5A,0xEF1BB56C83368383,0xB666FF8533CC3333,0x5CC6F23F63916363, + 0x12040A1002080202,0x93493839AA92AAAA,0xDEE2A8AF71D97171,0xC68DCF0EC807C8C8,0xD1327DC819641919,0x3B92707249394949,0x5FAF9A86D943D9D9,0x31F91DC3F2EFF2F2,0xA8DB484BE3ABE3E3, + 0xB9B62AE25B715B5B,0xBC0D9234881A8888,0x3E29C8A49A529A9A,0x0B4CBE2D26982626,0xBF64FA8D32C83232,0x597D4AE9B0FAB0B0,0xF2CF6A1BE983E9E9,0x771E33780F3C0F0F,0x33B7A6E6D573D5D5, + 0xF41DBA74803A8080,0x27617C99BEC2BEBE,0xEB87DE26CD13CDCD,0x8968E4BD34D03434,0x3290757A483D4848,0x54E324ABFFDBFFFF,0x8DF48FF77AF57A7A,0x643DEAF4907A9090,0x9DBE3EC25F615F5F, + 0x3D40A01D20802020,0x0FD0D56768BD6868,0xCA3472D01A681A1A,0xB7412C19AE82AEAE,0x7D755EC9B4EAB4B4,0xCEA8199A544D5454,0x7F3BE5EC93769393,0x2F44AA0D22882222,0x63C8E907648D6464, + 0x2AFF12DBF1E3F1F1,0xCCE6A2BF73D17373,0x82245A9012481212,0x7A805D3A401D4040,0x4810284008200808,0x959BE856C32BC3C3,0xDFC57B33EC97ECEC,0x4DAB9096DB4BDBDB,0xC05F1F61A1BEA1A1, + 0x9107831C8D0E8D8D,0xC87AC9F53DF43D3D,0x5B33F1CC97669797,0x0000000000000000,0xF983D436CF1BCFCF,0x6E5687452BAC2B2B,0xE1ECB39776C57676,0xE619B06482328282,0x28B1A9FED67FD6D6, + 0xC33677D81B6C1B1B,0x74775BC1B5EEB5B5,0xBE432911AF86AFAF,0x1DD4DF776AB56A6A,0xEAA00DBA505D5050,0x578A4C1245094545,0x38FB18CBF3EBF3F3,0xAD60F09D30C03030,0xC4C3742BEF9BEFEF, + 0xDA7EC3E53FFC3F3F,0xC7AA1C9255495555,0xDB591079A2B2A2A2,0xE9C96503EA8FEAEA,0x6ACAEC0F65896565,0x036968B9BAD2BABA,0x4A5E93652FBC2F2F,0x8E9DE74EC027C0C0,0x60A181BEDE5FDEDE, + 0xFC386CE01C701C1C,0x46E72EBBFDD3FDFD,0x1F9A64524D294D4D,0x7639E0E492729292,0xFAEABC8F75C97575,0x360C1E3006180606,0xAE0998248A128A8A,0x4B7940F9B2F2B2B2,0x85D15963E6BFE6E6, + 
0x7E1C36700E380E0E,0xE73E63F81F7C1F1F,0x55C4F73762956262,0x3AB5A3EED477D4D4,0x814D3229A89AA8A8,0x5231F4C496629696,0x62EF3A9BF9C3F9F9,0xA397F666C533C5C5,0x104AB13525942525, + 0xABB220F259795959,0xD015AE54842A8484,0xC5E4A7B772D57272,0xEC72DDD539E43939,0x1698615A4C2D4C4C,0x94BC3BCA5E655E5E,0x9FF085E778FD7878,0xE570D8DD38E03838,0x980586148C0A8C8C, + 0x17BFB2C6D163D1D1,0xE4570B41A5AEA5A5,0xA1D94D43E2AFE2E2,0x4EC2F82F61996161,0x427B45F1B3F6B3B3,0x3442A51521842121,0x0825D6949C4A9C9C,0xEE3C66F01E781E1E,0x6186522243114343, + 0xB193FC76C73BC7C7,0x4FE52BB3FCD7FCFC,0x2408142004100404,0xE3A208B251595151,0x252FC7BC995E9999,0x22DAC44F6DA96D6D,0x651A39680D340D0D,0x79E93583FACFFAFA,0x69A384B6DF5BDFDF, + 0xA9FC9BD77EE57E7E,0x1948B43D24902424,0xFE76D7C53BEC3B3B,0x9A4B3D31AB96ABAB,0xF081D13ECE1FCECE,0x9922558811441111,0x8303890C8F068F8F,0x049C6B4A4E254E4E,0x667351D1B7E6B7B7, + 0xE0CB600BEB8BEBEB,0xC178CCFD3CF03C3C,0xFD1FBF7C813E8181,0x4035FED4946A9494,0x1CF30CEBF7FBF7F7,0x186F67A1B9DEB9B9,0x8B265F98134C1313,0x51589C7D2CB02C2C,0x05BBB8D6D36BD3D3, + 0x8CD35C6BE7BBE7E7,0x39DCCB576EA56E6E,0xAA95F36EC437C4C4,0x1B060F18030C0303,0xDCAC138A56455656,0x5E88491A440D4444,0xA0FE9EDF7FE17F7F,0x884F3721A99EA9A9,0x6754824D2AA82A2A, + 0x0A6B6DB1BBD6BBBB,0x879FE246C123C1C1,0xF1A602A253515353,0x72A58BAEDC57DCDC,0x531627580B2C0B0B,0x0127D39C9D4E9D9D,0x2BD8C1476CAD6C6C,0xA462F59531C43131,0xF3E8B98774CD7474, + 0x15F109E3F6FFF6F6,0x4C8C430A46054646,0xA5452609AC8AACAC,0xB50F973C891E8989,0xB42844A014501414,0xBADF425BE1A3E1E1,0xA62C4EB016581616,0xF774D2CD3AE83A3A,0x06D2D06F69B96969, + 0x41122D4809240909,0xD7E0ADA770DD7070,0x6F7154D9B6E2B6B6,0x1EBDB7CED067D0D0,0xD6C77E3BED93EDED,0xE285DB2ECC17CCCC,0x6884572A42154242,0x2C2DC2B4985A9898,0xED550E49A4AAA4A4, + 0x7550885D28A02828,0x86B831DA5C6D5C5C,0x6BED3F93F8C7F8F8,0xC211A44486228686 +}; + +/** + * Round constants. 
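+ * (the host side is expected to upload plain_RC below into this constant symbol before any kernel launch)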
+ */ +__constant__ uint64_t InitVector_RC[10]; + +const uint64_t plain_RC[10] = { + 0x4F01B887E8C62318,0x52916F79F5D2A636,0x357B0CA38E9BBC60,0x57FE4B2EC2D7E01D,0xDA4AF09FE5377715, + 0x856BA0B10A29C958,0x67053ECBF4105DBD,0xD8957DA78B4127E4,0x9E4717DD667CEEFB,0x33835AAD07BF2DCA +}; + +/* ====================================================================== */ + +__device__ __forceinline__ +static uint64_t ROUND_ELT(const uint64_t* sharedMemory, const uint64_t* __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7) +{ + uint32_t* in32 = (uint32_t*)in; + return xor8( sharedMemory[__byte_perm(in32[(i0 << 1)], 0, 0x4440)], + sharedMemory[__byte_perm(in32[(i1 << 1)], 0, 0x4441) + 256], + sharedMemory[__byte_perm(in32[(i2 << 1)], 0, 0x4442) + 512], + sharedMemory[__byte_perm(in32[(i3 << 1)], 0, 0x4443) + 768], + sharedMemory[__byte_perm(in32[(i4 << 1) + 1], 0, 0x4440) + 1024], + sharedMemory[__byte_perm(in32[(i5 << 1) + 1], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(in32[(i6 << 1) + 1], 0, 0x4442) + 1536], + sharedMemory[__byte_perm(in32[(i7 << 1) + 1], 0, 0x4443) + 1792]); +} + +#define TRANSFER(dst, src) { \ + dst[0] = src ## 0; \ + dst[1] = src ## 1; \ + dst[2] = src ## 2; \ + dst[3] = src ## 3; \ + dst[4] = src ## 4; \ + dst[5] = src ## 5; \ + dst[6] = src ## 6; \ + dst[7] = src ## 7; \ +} + +#define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) { \ + out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1), c0); \ + out ## 1 = xor1(ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2), c1); \ + out ## 2 = xor1(ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3), c2); \ + out ## 3 = xor1(ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4), c3); \ + out ## 4 = xor1(ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5), c4); \ + out ## 5 = xor1(ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6), c5); \ + out ## 6 = xor1(ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7), c6); \ + out ## 7 = xor1(ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0), c7); \ +} + +#define ROUND1(table, in, out, c) { \ + out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1), c); \ + out ## 1 = ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2); \ + out ## 2 = ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3); \ + out ## 3 = ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4); \ + out ## 4 = ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5); \ + out ## 5 = ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6); \ + out ## 6 = ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7); \ + out ## 7 = ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0); \ +} + +#define ROUND_KSCHED(table, in, out, c) \ + ROUND1(table, in, out, c) \ + TRANSFER(in, out) + +#define ROUND_WENC(table, in, key, out) \ + ROUND(table, in, out, key[0], key[1], key[2],key[3], key[4], key[5], key[6], key[7]) \ + TRANSFER(in, out) + +static uint64_t* d_xtra[MAX_GPUS] = { 0 }; +static uint64_t* d_tmp[MAX_GPUS] = { 0 }; + +__device__ __forceinline__ +static void whirlpoolx_getShared(uint64_t* sharedMemory) +{ + if (threadIdx.x < 256) { + sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; + sharedMemory[threadIdx.x+256] = ROTL64(sharedMemory[threadIdx.x], 8); + sharedMemory[threadIdx.x+512] = ROTL64(sharedMemory[threadIdx.x],16); + sharedMemory[threadIdx.x+768] = ROTL64(sharedMemory[threadIdx.x],24); + sharedMemory[threadIdx.x+1024] = ROTL64(sharedMemory[threadIdx.x],32); + sharedMemory[threadIdx.x+1280] = ROTR64(sharedMemory[threadIdx.x],24); + sharedMemory[threadIdx.x+1536] = ROTR64(sharedMemory[threadIdx.x],16); + sharedMemory[threadIdx.x+1792] = 
ROTR64(sharedMemory[threadIdx.x], 8); + } + __syncthreads(); +} + + +__global__ +void whirlpoolx_gpu_precompute(uint32_t threads, uint64_t* d_xtra, uint64_t* d_tmp) +{ + __shared__ uint64_t sharedMemory[2048]; + + whirlpoolx_getShared(sharedMemory); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t n[8]; + uint64_t h[8] = { 0 }; + + #pragma unroll 8 + for (int i=0; i<8; i++) { + n[i] = c_PaddedMessage80[i]; // read data + } + //#pragma unroll 10 + for (unsigned int r=0; r < 10; r++) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); + ROUND_WENC(sharedMemory, n, h, tmp); + } + #pragma unroll 8 + for (int i=0; i < 8; i++) { + h[i] = xor1(n[i],c_PaddedMessage80[i]); + } + + if(threadIdx.x==0)d_xtra[threadIdx.x]=h[1]; + uint64_t atLastCalc=xor1(h[3],h[5]); + + ////////////////////////////////// + n[0] = c_PaddedMessage80[8]; //read data + n[1] = c_PaddedMessage80[9]; //whirlpool + n[2] = 0x0000000000000080; //whirlpool + n[3] = 0; + n[4] = 0; + n[5] = 0; + n[6] = 0; + n[7] = 0x8002000000000000; + + n[0] = xor1(n[0],h[0]); + n[2] = xor1(n[2],h[2]); n[3] = h[3]; + n[4] = h[4]; n[5] = h[5]; + n[6] = h[6]; n[7] = xor1(n[7],h[7]); + uint64_t tmp[8]; + tmp[0] = xor1(ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1),InitVector_RC[0]); + tmp[1] = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[2] = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[3] = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[4] = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[5] = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[6] = ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp[7] = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + + uint64_t tmp2[8]; + uint32_t* n32 = (uint32_t*)n; + tmp2[0]=xor8( sharedMemory[__byte_perm(n32[ 0], 0, 0x4440)] ,sharedMemory[__byte_perm(n32[14], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[12], 0, 0x4442) + 512] ,sharedMemory[__byte_perm(n32[10], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[ 9], 0, 0x4440) + 1024] ,sharedMemory[__byte_perm(n32[ 7], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[ 5], 0, 0x4442) + 1536] ,tmp[0]); + + tmp2[1]=xor8( tmp[1] ,sharedMemory[__byte_perm(n32[ 0], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[14], 0, 0x4442) + 512] ,sharedMemory[__byte_perm(n32[12], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[11], 0, 0x4440) + 1024] ,sharedMemory[__byte_perm(n32[ 9], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[ 7], 0, 0x4442) + 1536] ,sharedMemory[__byte_perm(n32[ 5], 0, 0x4443) + 1792]); + + tmp2[2]=xor8( sharedMemory[__byte_perm(n32[ 4], 0, 0x4440)] ,tmp[2] , + sharedMemory[__byte_perm(n32[ 0], 0, 0x4442) + 512] ,sharedMemory[__byte_perm(n32[14], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[13], 0, 0x4440) + 1024] ,sharedMemory[__byte_perm(n32[11], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[ 9], 0, 0x4442) + 1536] ,sharedMemory[__byte_perm(n32[ 7], 0, 0x4443) + 1792]); + + tmp2[3]=xor8( sharedMemory[__byte_perm(n32[ 6], 0, 0x4440)] ,sharedMemory[__byte_perm(n32[ 4], 0, 0x4441) + 256], + tmp[3] ,sharedMemory[__byte_perm(n32[ 0], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[15], 0, 0x4440) + 1024] ,sharedMemory[__byte_perm(n32[13], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[11], 0, 0x4442) + 1536] ,sharedMemory[__byte_perm(n32[ 9], 0, 0x4443) + 1792]); + + tmp2[4]=xor8( sharedMemory[__byte_perm(n32[ 8], 0, 0x4440)] 
,sharedMemory[__byte_perm(n32[ 6], 0, 0x4441) + 256] , + sharedMemory[__byte_perm(n32[ 4], 0, 0x4442) + 512] ,tmp[4] , + sharedMemory[__byte_perm(n32[ 1], 0, 0x4440) + 1024] ,sharedMemory[__byte_perm(n32[15], 0, 0x4441) + 1280] , + sharedMemory[__byte_perm(n32[13], 0, 0x4442) + 1536] ,sharedMemory[__byte_perm(n32[11], 0, 0x4443) + 1792]); + + tmp2[5]=xor8( sharedMemory[__byte_perm(n32[10], 0, 0x4440)] ,sharedMemory[__byte_perm(n32[ 8], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[ 6], 0, 0x4442) + 512] ,sharedMemory[__byte_perm(n32[ 4], 0, 0x4443) + 768], + tmp[5] ,sharedMemory[__byte_perm(n32[ 1], 0, 0x4441) + 1280], + sharedMemory[__byte_perm(n32[15], 0, 0x4442) + 1536] ,sharedMemory[__byte_perm(n32[13], 0, 0x4443) + 1792]); + + tmp2[6]=xor8( sharedMemory[__byte_perm(n32[12], 0, 0x4440)] ,sharedMemory[__byte_perm(n32[10], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[ 8], 0, 0x4442) + 512] ,sharedMemory[__byte_perm(n32[ 6], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[ 5], 0, 0x4440) + 1024] ,tmp[6], + sharedMemory[__byte_perm(n32[ 1], 0, 0x4442) + 1536] ,sharedMemory[__byte_perm(n32[15], 0, 0x4443) + 1792]); + + tmp2[7]=xor8( sharedMemory[__byte_perm(n32[14], 0, 0x4440)] ,sharedMemory[__byte_perm(n32[12], 0, 0x4441) + 256], + sharedMemory[__byte_perm(n32[10], 0, 0x4442) + 512] ,sharedMemory[__byte_perm(n32[ 8], 0, 0x4443) + 768], + sharedMemory[__byte_perm(n32[ 7], 0, 0x4440) + 1024] ,sharedMemory[__byte_perm(n32[ 5], 0, 0x4441) + 1280], + tmp[7] ,sharedMemory[__byte_perm(n32[ 1], 0, 0x4443) + 1792]); + + n[1] ^= h[1]; + tmp2[1]^=sharedMemory[__byte_perm(n32[2], 0, 0x4440)]; + tmp2[2]^=sharedMemory[__byte_perm(n32[2], 0, 0x4441) + 256]; + tmp2[3]^=sharedMemory[__byte_perm(n32[2], 0, 0x4442) + 512]; + tmp2[4]^=sharedMemory[__byte_perm(n32[2], 0, 0x4443) + 768]; + + d_tmp[threadIdx.x]=tmp2[threadIdx.x]; + + uint64_t tmp3[8]; + tmp3[0] = xor1(ROUND_ELT(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[1]); + tmp3[1] = ROUND_ELT(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2); + tmp3[2] = ROUND_ELT(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3); + tmp3[3] = ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4); + tmp3[4] = ROUND_ELT(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5); + tmp3[5] = ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6); + tmp3[6] = ROUND_ELT(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7); + tmp3[7] = ROUND_ELT(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0); + + n32 = (uint32_t*)tmp2; + uint64_t tmp4[8]; + tmp4[0]=( sharedMemory[__byte_perm(n32[ 9], 0, 0x4440) + 1024] ^sharedMemory[__byte_perm(n32[ 7], 0, 0x4441) + 1280]^ + sharedMemory[__byte_perm(n32[ 5], 0, 0x4442) + 1536] ^sharedMemory[__byte_perm(n32[ 3], 0, 0x4443) + 1792]) ^tmp3[0]; + + tmp4[1]=(sharedMemory[__byte_perm(n32[ 2], 0, 0x4440)] ^sharedMemory[__byte_perm(n32[ 9], 0, 0x4441) + 1280]^ + sharedMemory[__byte_perm(n32[ 7], 0, 0x4442) + 1536] ^sharedMemory[__byte_perm(n32[ 5], 0, 0x4443) + 1792]) ^tmp3[1]; + + tmp4[2]=(sharedMemory[__byte_perm(n32[ 4], 0, 0x4440)] ^sharedMemory[__byte_perm(n32[ 2], 0, 0x4441) + 256]^ + sharedMemory[__byte_perm(n32[ 9], 0, 0x4442) + 1536] ^sharedMemory[__byte_perm(n32[ 7], 0, 0x4443) + 1792]) ^tmp3[2]; + + tmp4[3]=(sharedMemory[__byte_perm(n32[ 6], 0, 0x4440)] ^sharedMemory[__byte_perm(n32[ 4], 0, 0x4441) + 256]^ + sharedMemory[__byte_perm(n32[ 2], 0, 0x4442) + 512] ^sharedMemory[__byte_perm(n32[ 9], 0, 0x4443) + 1792]) ^tmp3[3]; + + tmp4[4]=(sharedMemory[__byte_perm(n32[ 8], 0, 0x4440)] ^sharedMemory[__byte_perm(n32[ 6], 0, 0x4441) + 256]^ + sharedMemory[__byte_perm(n32[ 
4], 0, 0x4442) + 512] ^sharedMemory[__byte_perm(n32[ 2], 0, 0x4443) + 768]) ^tmp3[4]; + + tmp4[5]=(sharedMemory[__byte_perm(n32[ 8], 0, 0x4441) + 256] ^sharedMemory[__byte_perm(n32[ 6], 0, 0x4442) + 512]^ + sharedMemory[__byte_perm(n32[ 4], 0, 0x4443) + 768] ^sharedMemory[__byte_perm(n32[ 3], 0, 0x4440) + 1024]) ^tmp3[5]; + + tmp4[6]=(sharedMemory[__byte_perm(n32[ 8], 0, 0x4442) + 512] ^sharedMemory[__byte_perm(n32[ 6], 0, 0x4443) + 768]^ + sharedMemory[__byte_perm(n32[ 5], 0, 0x4440) + 1024] ^sharedMemory[__byte_perm(n32[ 3], 0, 0x4441) + 1280]) ^tmp3[6]; + + tmp4[7]=(sharedMemory[__byte_perm(n32[ 8], 0, 0x4443) + 768] ^sharedMemory[__byte_perm(n32[ 7], 0, 0x4440) + 1024]^ + sharedMemory[__byte_perm(n32[ 5], 0, 0x4441) + 1280] ^sharedMemory[__byte_perm(n32[ 3], 0, 0x4442) + 1536]) ^tmp3[7]; + + d_tmp[threadIdx.x+16]=tmp4[threadIdx.x]; + + uint64_t tmp5[8]; + tmp5[0] = xor1(ROUND_ELT(sharedMemory, tmp3, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[2]); + tmp5[1] = ROUND_ELT(sharedMemory, tmp3, 1, 0, 7, 6, 5, 4, 3, 2); + tmp5[2] = ROUND_ELT(sharedMemory, tmp3, 2, 1, 0, 7, 6, 5, 4, 3); + tmp5[3] = ROUND_ELT(sharedMemory, tmp3, 3, 2, 1, 0, 7, 6, 5, 4); + tmp5[4] = ROUND_ELT(sharedMemory, tmp3, 4, 3, 2, 1, 0, 7, 6, 5); + tmp5[5] = ROUND_ELT(sharedMemory, tmp3, 5, 4, 3, 2, 1, 0, 7, 6); + tmp5[6] = ROUND_ELT(sharedMemory, tmp3, 6, 5, 4, 3, 2, 1, 0, 7); + tmp5[7] = ROUND_ELT(sharedMemory, tmp3, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x+8]=tmp5[threadIdx.x]; + + uint64_t tmp6[8]; + tmp6[0] = xor1(ROUND_ELT(sharedMemory, tmp5, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[3]); + tmp6[1] = ROUND_ELT(sharedMemory, tmp5, 1, 0, 7, 6, 5, 4, 3, 2); + tmp6[2] = ROUND_ELT(sharedMemory, tmp5, 2, 1, 0, 7, 6, 5, 4, 3); + tmp6[3] = ROUND_ELT(sharedMemory, tmp5, 3, 2, 1, 0, 7, 6, 5, 4); + tmp6[4] = ROUND_ELT(sharedMemory, tmp5, 4, 3, 2, 1, 0, 7, 6, 5); + tmp6[5] = ROUND_ELT(sharedMemory, tmp5, 5, 4, 3, 2, 1, 0, 7, 6); + tmp6[6] = ROUND_ELT(sharedMemory, tmp5, 6, 5, 4, 3, 2, 1, 0, 7); + tmp6[7] = ROUND_ELT(sharedMemory, tmp5, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x+24]=tmp6[threadIdx.x]; + + uint64_t tmp7[8]; + tmp7[0] = xor1(ROUND_ELT(sharedMemory, tmp6, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[4]); + tmp7[1] = ROUND_ELT(sharedMemory, tmp6, 1, 0, 7, 6, 5, 4, 3, 2); + tmp7[2] = ROUND_ELT(sharedMemory, tmp6, 2, 1, 0, 7, 6, 5, 4, 3); + tmp7[3] = ROUND_ELT(sharedMemory, tmp6, 3, 2, 1, 0, 7, 6, 5, 4); + tmp7[4] = ROUND_ELT(sharedMemory, tmp6, 4, 3, 2, 1, 0, 7, 6, 5); + tmp7[5] = ROUND_ELT(sharedMemory, tmp6, 5, 4, 3, 2, 1, 0, 7, 6); + tmp7[6] = ROUND_ELT(sharedMemory, tmp6, 6, 5, 4, 3, 2, 1, 0, 7); + tmp7[7] = ROUND_ELT(sharedMemory, tmp6, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x+32]=tmp7[threadIdx.x]; + + uint64_t tmp8[8]; + tmp8[0] = xor1(ROUND_ELT(sharedMemory, tmp7, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[5]); + tmp8[1] = ROUND_ELT(sharedMemory, tmp7, 1, 0, 7, 6, 5, 4, 3, 2); + tmp8[2] = ROUND_ELT(sharedMemory, tmp7, 2, 1, 0, 7, 6, 5, 4, 3); + tmp8[3] = ROUND_ELT(sharedMemory, tmp7, 3, 2, 1, 0, 7, 6, 5, 4); + tmp8[4] = ROUND_ELT(sharedMemory, tmp7, 4, 3, 2, 1, 0, 7, 6, 5); + tmp8[5] = ROUND_ELT(sharedMemory, tmp7, 5, 4, 3, 2, 1, 0, 7, 6); + tmp8[6] = ROUND_ELT(sharedMemory, tmp7, 6, 5, 4, 3, 2, 1, 0, 7); + tmp8[7] = ROUND_ELT(sharedMemory, tmp7, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x+40]=tmp8[threadIdx.x]; + + uint64_t tmp9[8]; + tmp9[0] = xor1(ROUND_ELT(sharedMemory, tmp8, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[6]); + tmp9[1] = ROUND_ELT(sharedMemory, tmp8, 1, 0, 7, 6, 5, 4, 3, 2); + tmp9[2] = 
ROUND_ELT(sharedMemory, tmp8, 2, 1, 0, 7, 6, 5, 4, 3); + tmp9[3] = ROUND_ELT(sharedMemory, tmp8, 3, 2, 1, 0, 7, 6, 5, 4); + tmp9[4] = ROUND_ELT(sharedMemory, tmp8, 4, 3, 2, 1, 0, 7, 6, 5); + tmp9[5] = ROUND_ELT(sharedMemory, tmp8, 5, 4, 3, 2, 1, 0, 7, 6); + tmp9[6] = ROUND_ELT(sharedMemory, tmp8, 6, 5, 4, 3, 2, 1, 0, 7); + tmp9[7] = ROUND_ELT(sharedMemory, tmp8, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x+48]=tmp9[threadIdx.x]; + + uint64_t tmp10[8]; + tmp10[0] = xor1(ROUND_ELT(sharedMemory, tmp9, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[7]); + tmp10[1] = ROUND_ELT(sharedMemory, tmp9, 1, 0, 7, 6, 5, 4, 3, 2); + tmp10[2] = ROUND_ELT(sharedMemory, tmp9, 2, 1, 0, 7, 6, 5, 4, 3); + tmp10[3] = ROUND_ELT(sharedMemory, tmp9, 3, 2, 1, 0, 7, 6, 5, 4); + tmp10[4] = ROUND_ELT(sharedMemory, tmp9, 4, 3, 2, 1, 0, 7, 6, 5); + tmp10[5] = ROUND_ELT(sharedMemory, tmp9, 5, 4, 3, 2, 1, 0, 7, 6); + tmp10[6] = ROUND_ELT(sharedMemory, tmp9, 6, 5, 4, 3, 2, 1, 0, 7); + tmp10[7] = ROUND_ELT(sharedMemory, tmp9, 7, 6, 5, 4, 3, 2, 1, 0); + + + d_tmp[threadIdx.x+56]=tmp10[threadIdx.x]; + + uint64_t tmp11[8]; + tmp11[0] = xor1(ROUND_ELT(sharedMemory, tmp10, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[8]); + tmp11[1] = ROUND_ELT(sharedMemory, tmp10, 1, 0, 7, 6, 5, 4, 3, 2); + tmp11[2] = ROUND_ELT(sharedMemory, tmp10, 2, 1, 0, 7, 6, 5, 4, 3); + tmp11[3] = ROUND_ELT(sharedMemory, tmp10, 3, 2, 1, 0, 7, 6, 5, 4); + tmp11[4] = ROUND_ELT(sharedMemory, tmp10, 4, 3, 2, 1, 0, 7, 6, 5); + tmp11[5] = ROUND_ELT(sharedMemory, tmp10, 5, 4, 3, 2, 1, 0, 7, 6); + tmp11[6] = ROUND_ELT(sharedMemory, tmp10, 6, 5, 4, 3, 2, 1, 0, 7); + tmp11[7] = ROUND_ELT(sharedMemory, tmp10, 7, 6, 5, 4, 3, 2, 1, 0); + + d_tmp[threadIdx.x+64]=tmp11[threadIdx.x]; + + if(threadIdx.x==1){ + tmp[0]=ROUND_ELT(sharedMemory,tmp11, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[1]=ROUND_ELT(sharedMemory,tmp11, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[4] = xor3(tmp[0],tmp[1],atLastCalc); + d_xtra[threadIdx.x]=tmp[4]; + } + } +} + +__global__ __launch_bounds__(threadsPerBlock,2) +void whirlpoolx_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) +{ + __shared__ uint64_t sharedMemory[2048]; + + whirlpoolx_getShared(sharedMemory); + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t n[8]; + uint64_t tmp[8]; + uint32_t nounce = startNounce + thread; + + n[1] = xor1(REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)),c_xtra[0]); + + uint32_t* n32 = (uint32_t*)&n[0]; + n[0]=sharedMemory[__byte_perm(n32[3], 0, 0x4443) + 1792]; + n[5]=sharedMemory[__byte_perm(n32[3], 0, 0x4440) + 1024]; + n[6]=sharedMemory[__byte_perm(n32[3], 0, 0x4441) + 1280]; + n[7]=sharedMemory[__byte_perm(n32[3], 0, 0x4442) + 1536]; + n[0]=xor1(c_tmp[0],n[0]); + n[1]=c_tmp[1]; + n[2]=c_tmp[2]; + n[3]=c_tmp[3]; + n[4]=c_tmp[4]; + n[5]=xor1(c_tmp[5],n[5]); + n[6]=xor1(c_tmp[6],n[6]); + n[7]=xor1(c_tmp[7],n[7]); + + tmp[0]=xor3(sharedMemory[__byte_perm(n32[10],0,0x4443)+768],sharedMemory[__byte_perm(n32[12],0,0x4442)+512],sharedMemory[__byte_perm(n32[14],0,0x4441)+256]); + tmp[1]=xor3(sharedMemory[__byte_perm(n32[11],0,0x4440)+1024],sharedMemory[__byte_perm(n32[12],0,0x4443)+768],sharedMemory[__byte_perm(n32[14],0,0x4442)+512]); + tmp[2]=xor3(sharedMemory[__byte_perm(n32[11],0,0x4441)+1280],sharedMemory[__byte_perm(n32[13],0,0x4440)+1024],sharedMemory[__byte_perm(n32[14],0,0x4443)+768]); + tmp[3]=xor3(sharedMemory[__byte_perm(n32[11],0,0x4442)+1536],sharedMemory[__byte_perm(n32[13],0,0x4441)+1280],sharedMemory[__byte_perm(n32[15],0,0x4440)+1024]); + 
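// tmp[0..7] gather only the table lookups that depend on the nonce words; the nonce-independent part of this round was stored in c_tmp[16..23] by whirlpoolx_gpu_precompute + 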
tmp[4]=xor3(sharedMemory[__byte_perm(n32[11],0,0x4443)+1792],sharedMemory[__byte_perm(n32[13],0,0x4442)+1536],sharedMemory[__byte_perm(n32[15],0,0x4441)+1280]); + tmp[5]=xor3(sharedMemory[__byte_perm(n32[10],0,0x4440)],sharedMemory[__byte_perm(n32[13],0,0x4443)+1792],sharedMemory[__byte_perm(n32[15],0,0x4442)+1536]); + tmp[6]=xor3(sharedMemory[__byte_perm(n32[12],0,0x4440)],sharedMemory[__byte_perm(n32[10],0,0x4441)+256],sharedMemory[__byte_perm(n32[15],0,0x4443)+1792]); + tmp[7]=xor3(sharedMemory[__byte_perm(n32[14],0,0x4440)],sharedMemory[__byte_perm(n32[12],0,0x4441)+256],sharedMemory[__byte_perm(n32[10],0,0x4442)+ 512]); + + tmp[0]=xor3(sharedMemory[__byte_perm(n32[ 0], 0, 0x4440)],tmp[0],c_tmp[0+16]); + tmp[1]=xor3(sharedMemory[__byte_perm(n32[ 0], 0, 0x4441) + 256],tmp[1],c_tmp[1+16]); + tmp[2]=xor3(sharedMemory[__byte_perm(n32[ 0], 0, 0x4442) + 512],tmp[2],c_tmp[2+16]); + tmp[3]=xor3(sharedMemory[__byte_perm(n32[ 0], 0, 0x4443) + 768],tmp[3],c_tmp[3+16]); + tmp[4]=xor3(sharedMemory[__byte_perm(n32[ 1], 0, 0x4440) + 1024],tmp[4],c_tmp[4+16]); + tmp[5]=xor3(sharedMemory[__byte_perm(n32[ 1], 0, 0x4441) + 1280],tmp[5],c_tmp[5+16]); + tmp[6]=xor3(sharedMemory[__byte_perm(n32[ 1], 0, 0x4442) + 1536],tmp[6],c_tmp[6+16]); + tmp[7]=xor3(sharedMemory[__byte_perm(n32[ 1], 0, 0x4443) + 1792],tmp[7],c_tmp[7+16]); + + n[0]=tmp[0]; + n[1]=tmp[1]; + n[2]=tmp[2]; + n[3]=tmp[3]; + n[4]=tmp[4]; + n[5]=tmp[5]; + n[6]=tmp[6]; + n[7]=tmp[7]; + + tmp[0] = xor1(ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+8]); + tmp[1] = xor1(ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+8]); + tmp[2] = xor1(ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+8]); + tmp[3] = xor1(ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+8]); + tmp[4] = xor1(ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+8]); + tmp[5] = xor1(ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+8]); + tmp[6] = xor1(ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+8]); + tmp[7] = xor1(ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+8]); + + n[0] = xor1(ROUND_ELT(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+24]); + n[1] = xor1(ROUND_ELT(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+24]); + n[2] = xor1(ROUND_ELT(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+24]); + n[3] = xor1(ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+24]); + n[4] = xor1(ROUND_ELT(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+24]); + n[5] = xor1(ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+24]); + n[6] = xor1(ROUND_ELT(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+24]); + n[7] = xor1(ROUND_ELT(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+24]); + + tmp[0] = xor1(ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+32]); + tmp[1] = xor1(ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+32]); + tmp[2] = xor1(ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+32]); + tmp[3] = xor1(ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+32]); + tmp[4] = xor1(ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+32]); + tmp[5] = xor1(ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+32]); + tmp[6] = xor1(ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+32]); + tmp[7] = xor1(ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+32]); + + n[0] = xor1(ROUND_ELT(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+40]); + n[1] = xor1(ROUND_ELT(sharedMemory, tmp, 1, 0, 7, 
6, 5, 4, 3, 2), c_tmp[1+40]); + n[2] = xor1(ROUND_ELT(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+40]); + n[3] = xor1(ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+40]); + n[4] = xor1(ROUND_ELT(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+40]); + n[5] = xor1(ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+40]); + n[6] = xor1(ROUND_ELT(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+40]); + n[7] = xor1(ROUND_ELT(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+40]); + + tmp[0] = xor1(ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+48]); + tmp[1] = xor1(ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+48]); + tmp[2] = xor1(ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+48]); + tmp[3] = xor1(ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+48]); + tmp[4] = xor1(ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+48]); + tmp[5] = xor1(ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+48]); + tmp[6] = xor1(ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+48]); + tmp[7] = xor1(ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+48]); + + n[0] = xor1(ROUND_ELT(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+56]); + n[1] = xor1(ROUND_ELT(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+56]); + n[2] = xor1(ROUND_ELT(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+56]); + n[3] = xor1(ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+56]); + n[4] = xor1(ROUND_ELT(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+56]); + n[5] = xor1(ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+56]); + n[6] = xor1(ROUND_ELT(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+56]); + n[7] = xor1(ROUND_ELT(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+56]); + + tmp[0] = xor1(ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+64]); + tmp[1] = xor1(ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+64]); + tmp[2] = xor1(ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+64]); + tmp[3] = xor1(ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+64]); + tmp[4] = xor1(ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+64]); + tmp[5] = xor1(ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+64]); + tmp[6] = xor1(ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+64]); + tmp[7] = xor1(ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+64]); + + if (xor3(c_xtra[1], ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4), ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6)) <= pTarget[3]) { + atomicMin(&resNounce[0], nounce); + } + } +} + +__host__ +extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads) +{ + cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice); + cudaMalloc(&d_WXNonce[thr_id], sizeof(uint32_t)); + cudaMallocHost(&h_wxnounce[thr_id], sizeof(uint32_t)); + cudaMalloc(&d_xtra[thr_id], 8 * sizeof(uint64_t)); + CUDA_SAFE_CALL(cudaMalloc(&d_tmp[thr_id], 8 * 9 * sizeof(uint64_t))); // d_tmp[threadIdx.x+64] (7+64) +} + +__host__ +extern void whirlpoolx_cpu_free(int thr_id) +{ + cudaFree(d_WXNonce[thr_id]); + cudaFreeHost(h_wxnounce[thr_id]); + cudaFree(d_xtra[thr_id]); + cudaFree(d_tmp[thr_id]); +} + +__host__ +void whirlpoolx_setBlock_80(void *pdata, const void *ptarget) +{ + uint64_t PaddedMessage[16]; + memcpy(PaddedMessage, pdata, 80); + 
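// c_PaddedMessage80 holds the 80-byte block header followed by the whirlpool padding set below; the nonce word itself is patched per thread in whirlpoolx_gpu_hash + 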
memset((uint8_t*)&PaddedMessage+80, 0, 48); + ((uint8_t*)PaddedMessage)[80] = 0x80; /* ending */ + cudaMemcpyToSymbol(pTarget, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); +} + +__host__ +void whirlpoolx_precompute(int thr_id) +{ + dim3 grid(1); + dim3 block(256); + + whirlpoolx_gpu_precompute <<<grid, block>>>(8, d_xtra[thr_id], d_tmp[thr_id]); + cudaThreadSynchronize(); + + cudaMemcpyToSymbol(c_xtra, d_xtra[thr_id], 8 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_tmp, d_tmp[thr_id], 8 * 9 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice)); +} + +__host__ +uint32_t whirlpoolx_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce) +{ + dim3 grid((threads + threadsPerBlock-1) / threadsPerBlock); + dim3 block(threadsPerBlock); + + cudaMemset(d_WXNonce[thr_id], 0xff, sizeof(uint32_t)); + + whirlpoolx_gpu_hash<<<grid, block>>>(threads, startNounce, d_WXNonce[thr_id]); + cudaThreadSynchronize(); + + cudaMemcpy(h_wxnounce[thr_id], d_WXNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + return *(h_wxnounce[thr_id]); +} diff --git a/x15/cuda_x14_shabal512.cu b/x15/cuda_x14_shabal512.cu new file mode 100644 index 0000000000..a1d5a8da01 --- /dev/null +++ b/x15/cuda_x14_shabal512.cu @@ -0,0 +1,475 @@ +/* + * Shabal-512 for X14/X15 + */ +#include "cuda_helper.h" + +/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */ +/* + * Shabal implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +/* + * Part of this code was automatically generated (the part between + * the "BEGIN" and "END" markers). + */ + +#define sM 16 + +#define C32 SPH_C32 +#define T32 SPH_T32 + +#define O1 13 +#define O2 9 +#define O3 6 + +/* + * We copy the state into local variables, so that the compiler knows + * that it can optimize them at will. + */ + +/* BEGIN -- automatically generated code. 
*/ + +#define INPUT_BLOCK_ADD do { \ + B0 = T32(B0 + M0); \ + B1 = T32(B1 + M1); \ + B2 = T32(B2 + M2); \ + B3 = T32(B3 + M3); \ + B4 = T32(B4 + M4); \ + B5 = T32(B5 + M5); \ + B6 = T32(B6 + M6); \ + B7 = T32(B7 + M7); \ + B8 = T32(B8 + M8); \ + B9 = T32(B9 + M9); \ + BA = T32(BA + MA); \ + BB = T32(BB + MB); \ + BC = T32(BC + MC); \ + BD = T32(BD + MD); \ + BE = T32(BE + ME); \ + BF = T32(BF + MF); \ + } while (0) + +#define INPUT_BLOCK_SUB do { \ + C0 = T32(C0 - M0); \ + C1 = T32(C1 - M1); \ + C2 = T32(C2 - M2); \ + C3 = T32(C3 - M3); \ + C4 = T32(C4 - M4); \ + C5 = T32(C5 - M5); \ + C6 = T32(C6 - M6); \ + C7 = T32(C7 - M7); \ + C8 = T32(C8 - M8); \ + C9 = T32(C9 - M9); \ + CA = T32(CA - MA); \ + CB = T32(CB - MB); \ + CC = T32(CC - MC); \ + CD = T32(CD - MD); \ + CE = T32(CE - ME); \ + CF = T32(CF - MF); \ + } while (0) + +#define XOR_W do { \ + A00 ^= Wlow; \ + A01 ^= Whigh; \ + } while (0) + +#define SWAP(v1, v2) do { \ + uint32_t tmp = (v1); \ + (v1) = (v2); \ + (v2) = tmp; \ + } while (0) + +#define SWAP_BC do { \ + SWAP(B0, C0); \ + SWAP(B1, C1); \ + SWAP(B2, C2); \ + SWAP(B3, C3); \ + SWAP(B4, C4); \ + SWAP(B5, C5); \ + SWAP(B6, C6); \ + SWAP(B7, C7); \ + SWAP(B8, C8); \ + SWAP(B9, C9); \ + SWAP(BA, CA); \ + SWAP(BB, CB); \ + SWAP(BC, CC); \ + SWAP(BD, CD); \ + SWAP(BE, CE); \ + SWAP(BF, CF); \ + } while (0) + +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ + xa0 = T32((xa0 \ + ^ (ROTL32(xa1, 15) * 5U) \ + ^ xc) * 3U) \ + ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ + xb0 = T32(~(ROTL32(xb0, 1) ^ xa0)); \ + } while (0) + +#define PERM_STEP_0 do { \ + PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_1 do { \ + PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_2 do { \ + PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ + 
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define APPLY_P do { \ + B0 = T32(B0 << 17) | (B0 >> 15); \ + B1 = T32(B1 << 17) | (B1 >> 15); \ + B2 = T32(B2 << 17) | (B2 >> 15); \ + B3 = T32(B3 << 17) | (B3 >> 15); \ + B4 = T32(B4 << 17) | (B4 >> 15); \ + B5 = T32(B5 << 17) | (B5 >> 15); \ + B6 = T32(B6 << 17) | (B6 >> 15); \ + B7 = T32(B7 << 17) | (B7 >> 15); \ + B8 = T32(B8 << 17) | (B8 >> 15); \ + B9 = T32(B9 << 17) | (B9 >> 15); \ + BA = T32(BA << 17) | (BA >> 15); \ + BB = T32(BB << 17) | (BB >> 15); \ + BC = T32(BC << 17) | (BC >> 15); \ + BD = T32(BD << 17) | (BD >> 15); \ + BE = T32(BE << 17) | (BE >> 15); \ + BF = T32(BF << 17) | (BF >> 15); \ + PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + A0B = T32(A0B + C6); \ + A0A = T32(A0A + C5); \ + A09 = T32(A09 + C4); \ + A08 = T32(A08 + C3); \ + A07 = T32(A07 + C2); \ + A06 = T32(A06 + C1); \ + A05 = T32(A05 + C0); \ + A04 = T32(A04 + CF); \ + A03 = T32(A03 + CE); \ + A02 = T32(A02 + CD); \ + A01 = T32(A01 + CC); \ + A00 = T32(A00 + CB); \ + A0B = T32(A0B + CA); \ + A0A = T32(A0A + C9); \ + A09 = T32(A09 + C8); \ + A08 = T32(A08 + C7); \ + A07 = T32(A07 + C6); \ + A06 = T32(A06 + C5); \ + A05 = T32(A05 + C4); \ + A04 = T32(A04 + C3); \ + A03 = T32(A03 + C2); \ + A02 = T32(A02 + C1); \ + A01 = T32(A01 + C0); \ + A00 = T32(A00 + CF); \ + A0B = T32(A0B + CE); \ + A0A = T32(A0A + CD); \ + A09 = T32(A09 + CC); \ + A08 = T32(A08 + CB); \ + A07 = T32(A07 + CA); \ + A06 = T32(A06 + C9); \ + A05 = T32(A05 + C8); \ + A04 = T32(A04 + C7); \ + A03 = T32(A03 + C6); \ + A02 = T32(A02 + C5); \ + A01 = T32(A01 + C4); \ + A00 = T32(A00 + C3); \ + } while (0) + +#define INCR_W do { \ + if ((Wlow = T32(Wlow + 1)) == 0) \ + Whigh = T32(Whigh + 1); \ + } while (0) + + +#if 0 /* other hash sizes init */ + +static const uint32_t A_init_192[] = { + C32(0xFD749ED4), C32(0xB798E530), C32(0x33904B6F), C32(0x46BDA85E), + C32(0x076934B4), C32(0x454B4058), C32(0x77F74527), C32(0xFB4CF465), + C32(0x62931DA9), C32(0xE778C8DB), C32(0x22B3998E), C32(0xAC15CFB9) +}; + +static const uint32_t B_init_192[] = { + C32(0x58BCBAC4), C32(0xEC47A08E), C32(0xAEE933B2), C32(0xDFCBC824), + C32(0xA7944804), C32(0xBF65BDB0), C32(0x5A9D4502), C32(0x59979AF7), + C32(0xC5CEA54E), C32(0x4B6B8150), C32(0x16E71909), C32(0x7D632319), + C32(0x930573A0), C32(0xF34C63D1), C32(0xCAF914B4), C32(0xFDD6612C) +}; + +static const uint32_t C_init_192[] = { + C32(0x61550878), C32(0x89EF2B75), C32(0xA1660C46), C32(0x7EF3855B), + C32(0x7297B58C), C32(0x1BC67793), C32(0x7FB1C723), C32(0xB66FC640), + C32(0x1A48B71C), C32(0xF0976D17), C32(0x088CE80A), C32(0xA454EDF3), + C32(0x1C096BF4), C32(0xAC76224B), C32(0x5215781C), C32(0xCD5D2669) +}; + +static const uint32_t A_init_224[] = { + C32(0xA5201467), C32(0xA9B8D94A), C32(0xD4CED997), C32(0x68379D7B), + C32(0xA7FC73BA), C32(0xF1A2546B), C32(0x606782BF), C32(0xE0BCFD0F), + C32(0x2F25374E), C32(0x069A149F), C32(0x5E2DFF25), C32(0xFAECF061) +}; + +static const uint32_t B_init_224[] = { + C32(0xEC9905D8), C32(0xF21850CF), C32(0xC0A746C8), C32(0x21DAD498), + C32(0x35156EEB), C32(0x088C97F2), C32(0x26303E40), C32(0x8A2D4FB5), + C32(0xFEEE44B6), C32(0x8A1E9573), 
C32(0x7B81111A), C32(0xCBC139F0), + C32(0xA3513861), C32(0x1D2C362E), C32(0x918C580E), C32(0xB58E1B9C) +}; + +static const uint32_t C_init_224[] = { + C32(0xE4B573A1), C32(0x4C1A0880), C32(0x1E907C51), C32(0x04807EFD), + C32(0x3AD8CDE5), C32(0x16B21302), C32(0x02512C53), C32(0x2204CB18), + C32(0x99405F2D), C32(0xE5B648A1), C32(0x70AB1D43), C32(0xA10C25C2), + C32(0x16F1AC05), C32(0x38BBEB56), C32(0x9B01DC60), C32(0xB1096D83) +}; + +static const uint32_t A_init_256[] = { + C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191), + C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C), + C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A) +}; + +static const uint32_t B_init_256[] = { + C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F), + C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002), + C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890), + C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5) +}; + +static const uint32_t C_init_256[] = { + C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55), + C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433), + C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F), + C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60) +}; + +static const uint32_t A_init_384[] = { + C32(0xC8FCA331), C32(0xE55C504E), C32(0x003EBF26), C32(0xBB6B8D83), + C32(0x7B0448C1), C32(0x41B82789), C32(0x0A7C9601), C32(0x8D659CFF), + C32(0xB6E2673E), C32(0xCA54C77B), C32(0x1460FD7E), C32(0x3FCB8F2D) +}; + +static const uint32_t B_init_384[] = { + C32(0x527291FC), C32(0x2A16455F), C32(0x78E627E5), C32(0x944F169F), + C32(0x1CA6F016), C32(0xA854EA25), C32(0x8DB98ABE), C32(0xF2C62641), + C32(0x30117DCB), C32(0xCF5C4309), C32(0x93711A25), C32(0xF9F671B8), + C32(0xB01D2116), C32(0x333F4B89), C32(0xB285D165), C32(0x86829B36) +}; + +static const uint32_t C_init_384[] = { + C32(0xF764B11A), C32(0x76172146), C32(0xCEF6934D), C32(0xC6D28399), + C32(0xFE095F61), C32(0x5E6018B4), C32(0x5048ECF5), C32(0x51353261), + C32(0x6E6E36DC), C32(0x63130DAD), C32(0xA9C69BD6), C32(0x1E90EA0C), + C32(0x7C35073B), C32(0x28D95E6D), C32(0xAA340E0D), C32(0xCB3DEE70) +}; +#endif + +__device__ __constant__ +static const uint32_t d_A512[] = { + C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), + C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), + C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) +}; + +__device__ __constant__ +static const uint32_t d_B512[] = { + C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), + C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), + C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), + C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) +}; + +__device__ __constant__ +static const uint32_t d_C512[] = { + C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), + C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), + C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), + C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) +}; + +/***************************************************/ +// GPU Hash Function +__global__ void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + __syncthreads(); + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + uint32_t 
nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; // [hashPosition * 8] + + uint32_t A00 = d_A512[0], A01 = d_A512[1], A02 = d_A512[2], A03 = d_A512[3], + A04 = d_A512[4], A05 = d_A512[5], A06 = d_A512[6], A07 = d_A512[7], + A08 = d_A512[8], A09 = d_A512[9], A0A = d_A512[10], A0B = d_A512[11]; + uint32_t B0 = d_B512[0], B1 = d_B512[1], B2 = d_B512[2], B3 = d_B512[3], + B4 = d_B512[4], B5 = d_B512[5], B6 = d_B512[6], B7 = d_B512[7], + B8 = d_B512[8], B9 = d_B512[9], BA = d_B512[10], BB = d_B512[11], + BC = d_B512[12], BD = d_B512[13], BE = d_B512[14], BF = d_B512[15]; + uint32_t C0 = d_C512[0], C1 = d_C512[1], C2 = d_C512[2], C3 = d_C512[3], + C4 = d_C512[4], C5 = d_C512[5], C6 = d_C512[6], C7 = d_C512[7], + C8 = d_C512[8], C9 = d_C512[9], CA = d_C512[10], CB = d_C512[11], + CC = d_C512[12], CD = d_C512[13], CE = d_C512[14], CF = d_C512[15]; + uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; + uint32_t Wlow = 1, Whigh = 0; + + M0 = Hash[0]; + M1 = Hash[1]; + M2 = Hash[2]; + M3 = Hash[3]; + M4 = Hash[4]; + M5 = Hash[5]; + M6 = Hash[6]; + M7 = Hash[7]; + + M8 = Hash[8]; + M9 = Hash[9]; + MA = Hash[10]; + MB = Hash[11]; + MC = Hash[12]; + MD = Hash[13]; + ME = Hash[14]; + MF = Hash[15]; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + INCR_W; + + M0 = 0x80; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + + for (uint8_t i = 0; i < 3; i ++) + { + SWAP_BC; + XOR_W; + APPLY_P; + } + + Hash[0] = B0; + Hash[1] = B1; + Hash[2] = B2; + Hash[3] = B3; + Hash[4] = B4; + Hash[5] = B5; + Hash[6] = B6; + Hash[7] = B7; + + Hash[8] = B8; + Hash[9] = B9; + Hash[10] = BA; + Hash[11] = BB; + Hash[12] = BC; + Hash[13] = BD; + Hash[14] = BE; + Hash[15] = BF; + + //result = (Hash[3] <= target); + + uint32_t *outpHash = (uint32_t*)&g_hash[hashPosition << 3]; // [8 * hashPosition]; + + for (int i = 0; i < 16; i++) + outpHash[i] = Hash[i]; + } +} + +__host__ void x14_shabal512_cpu_init(int thr_id, uint32_t threads) +{ +} + +// #include +__host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + const uint32_t threadsperblock = 256; + + // compute how many thread blocks we need + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 0; + + // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + + x14_shabal512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + MyStreamSynchronize(NULL, order, thr_id); +} diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu new file mode 100644 index 0000000000..a541919711 --- /dev/null +++ b/x15/cuda_x15_whirlpool.cu @@ -0,0 +1,743 @@ +/** + * Whirlpool-512 CUDA implementation. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014-2016 djm34, tpruvot, SP, Provos Alexis + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * @author djm34 (initial draft) + * @author tpruvot (dual old/whirlpool modes, midstate) + * @author SP ("final" function opt and tuning) + * @author Provos Alexis (Applied partial shared memory utilization, precomputations, merging & tuning for 970/750ti under CUDA7.5 -> +93% increased throughput of whirlpool) + */ + + +// Change with caution, used by shared mem fetch +#define TPB80 384 +#define TPB64 384 + +extern "C" { +#include +#include +} + +#include +#include +#include + +#define xor3x(a,b,c) (a^b^c) + +#include "cuda_whirlpool_tables.cuh" + +__device__ static uint64_t b0[256]; +__device__ static uint64_t b7[256]; + +__constant__ static uint2 precomputed_round_key_64[72]; +__constant__ static uint2 precomputed_round_key_80[80]; + +__device__ static uint2 c_PaddedMessage80[16]; + +/** + * Round constants. 
+ */ +__device__ uint2 InitVector_RC[10]; + +static uint32_t *d_resNonce[MAX_GPUS] = { 0 }; + +//--------START OF WHIRLPOOL DEVICE MACROS--------------------------------------------------------------------------- +__device__ __forceinline__ +void static TRANSFER(uint2 *const __restrict__ dst,const uint2 *const __restrict__ src){ + dst[0] = src[ 0]; + dst[1] = src[ 1]; + dst[2] = src[ 2]; + dst[3] = src[ 3]; + dst[4] = src[ 4]; + dst[5] = src[ 5]; + dst[6] = src[ 6]; + dst[7] = src[ 7]; +} + +__device__ __forceinline__ +static uint2 d_ROUND_ELT_LDG(const uint2 sharedMemory[7][256],const uint2 *const __restrict__ in,const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7){ + uint2 ret = __ldg((uint2*)&b0[__byte_perm(in[i0].x, 0, 0x4440)]); + ret ^= sharedMemory[1][__byte_perm(in[i1].x, 0, 0x4441)]; + ret ^= sharedMemory[2][__byte_perm(in[i2].x, 0, 0x4442)]; + ret ^= sharedMemory[3][__byte_perm(in[i3].x, 0, 0x4443)]; + ret ^= sharedMemory[4][__byte_perm(in[i4].y, 0, 0x4440)]; + ret ^= ROR24(__ldg((uint2*)&b0[__byte_perm(in[i5].y, 0, 0x4441)])); + ret ^= ROR8(__ldg((uint2*)&b7[__byte_perm(in[i6].y, 0, 0x4442)])); + ret ^= __ldg((uint2*)&b7[__byte_perm(in[i7].y, 0, 0x4443)]); + return ret; +} + +__device__ __forceinline__ +static uint2 d_ROUND_ELT(const uint2 sharedMemory[7][256],const uint2 *const __restrict__ in,const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7){ + + uint2 ret = __ldg((uint2*)&b0[__byte_perm(in[i0].x, 0, 0x4440)]); + ret ^= sharedMemory[1][__byte_perm(in[i1].x, 0, 0x4441)]; + ret ^= sharedMemory[2][__byte_perm(in[i2].x, 0, 0x4442)]; + ret ^= sharedMemory[3][__byte_perm(in[i3].x, 0, 0x4443)]; + ret ^= sharedMemory[4][__byte_perm(in[i4].y, 0, 0x4440)]; + ret ^= sharedMemory[5][__byte_perm(in[i5].y, 0, 0x4441)]; + ret ^= ROR8(__ldg((uint2*)&b7[__byte_perm(in[i6].y, 0, 0x4442)])); + ret ^= __ldg((uint2*)&b7[__byte_perm(in[i7].y, 0, 0x4443)]); + return ret; +} + +__device__ __forceinline__ +static uint2 d_ROUND_ELT1_LDG(const uint2 sharedMemory[7][256],const uint2 *const __restrict__ in,const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7, const uint2 c0){ + + uint2 ret = __ldg((uint2*)&b0[__byte_perm(in[i0].x, 0, 0x4440)]); + ret ^= sharedMemory[1][__byte_perm(in[i1].x, 0, 0x4441)]; + ret ^= sharedMemory[2][__byte_perm(in[i2].x, 0, 0x4442)]; + ret ^= sharedMemory[3][__byte_perm(in[i3].x, 0, 0x4443)]; + ret ^= sharedMemory[4][__byte_perm(in[i4].y, 0, 0x4440)]; + ret ^= ROR24(__ldg((uint2*)&b0[__byte_perm(in[i5].y, 0, 0x4441)])); + ret ^= ROR8(__ldg((uint2*)&b7[__byte_perm(in[i6].y, 0, 0x4442)])); + ret ^= __ldg((uint2*)&b7[__byte_perm(in[i7].y, 0, 0x4443)]); + ret ^= c0; + return ret; +} + +__device__ __forceinline__ +static uint2 d_ROUND_ELT1(const uint2 sharedMemory[7][256],const uint2 *const __restrict__ in,const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7, const uint2 c0){ + uint2 ret = __ldg((uint2*)&b0[__byte_perm(in[i0].x, 0, 0x4440)]); + ret ^= sharedMemory[1][__byte_perm(in[i1].x, 0, 0x4441)]; + ret ^= sharedMemory[2][__byte_perm(in[i2].x, 0, 0x4442)]; + ret ^= sharedMemory[3][__byte_perm(in[i3].x, 0, 0x4443)]; + ret ^= sharedMemory[4][__byte_perm(in[i4].y, 0, 0x4440)]; + ret ^= sharedMemory[5][__byte_perm(in[i5].y, 0, 0x4441)]; + ret ^= ROR8(__ldg((uint2*)&b7[__byte_perm(in[i6].y, 0, 0x4442)]));//sharedMemory[6][__byte_perm(in[i6].y, 0, 
0x4442)] + ret ^= __ldg((uint2*)&b7[__byte_perm(in[i7].y, 0, 0x4443)]);//sharedMemory[7][__byte_perm(in[i7].y, 0, 0x4443)] + ret ^= c0; + return ret; +} + +//--------END OF WHIRLPOOL DEVICE MACROS----------------------------------------------------------------------------- + +//--------START OF WHIRLPOOL HOST MACROS----------------------------------------------------------------------------- + +#define table_skew(val,num) SPH_ROTL64(val,8*num) +#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF) + +#define ROUND_ELT(table, in, i0, i1, i2, i3, i4, i5, i6, i7) \ + (table[BYTE(in[i0], 0)] \ + ^ table_skew(table[BYTE(in[i1], 1)], 1) \ + ^ table_skew(table[BYTE(in[i2], 2)], 2) \ + ^ table_skew(table[BYTE(in[i3], 3)], 3) \ + ^ table_skew(table[BYTE(in[i4], 4)], 4) \ + ^ table_skew(table[BYTE(in[i5], 5)], 5) \ + ^ table_skew(table[BYTE(in[i6], 6)], 6) \ + ^ table_skew(table[BYTE(in[i7], 7)], 7)) + +#define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) do { \ + out[0] = ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1) ^ c0; \ + out[1] = ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2) ^ c1; \ + out[2] = ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3) ^ c2; \ + out[3] = ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4) ^ c3; \ + out[4] = ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5) ^ c4; \ + out[5] = ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6) ^ c5; \ + out[6] = ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7) ^ c6; \ + out[7] = ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0) ^ c7; \ + } while (0) + +__host__ +static void ROUND_KSCHED(const uint64_t *in,uint64_t *out,const uint64_t c){ + const uint64_t *a = in; + uint64_t *b = out; + ROUND(old1_T0, a, b, c, 0, 0, 0, 0, 0, 0, 0); +} + + +//--------END OF WHIRLPOOL HOST MACROS------------------------------------------------------------------------------- + +__host__ +void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode) +{ + uint64_t* table0 = NULL; + + switch (mode) { + case 0: /* x15 with rotated T1-T7 (based on T0) */ + table0 = (uint64_t*)plain_T0; + cudaMemcpyToSymbol(InitVector_RC, plain_RC, 10*sizeof(uint64_t),0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(precomputed_round_key_64, plain_precomputed_round_key_64, 72*sizeof(uint64_t),0, cudaMemcpyHostToDevice); + break; + case 1: /* old whirlpool */ + table0 = (uint64_t*)old1_T0; + cudaMemcpyToSymbol(InitVector_RC, old1_RC, 10*sizeof(uint64_t),0,cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(precomputed_round_key_64, old1_precomputed_round_key_64, 72*sizeof(uint64_t),0, cudaMemcpyHostToDevice); + break; + default: + applog(LOG_ERR,"Bad whirlpool mode"); + exit(0); + } + cudaMemcpyToSymbol(b0, table0, 256*sizeof(uint64_t),0, cudaMemcpyHostToDevice); + uint64_t table7[256]; + for(int i=0;i<256;i++){ + table7[i] = ROTR64(table0[i],8); + } + cudaMemcpyToSymbol(b7, table7, 256*sizeof(uint64_t),0, cudaMemcpyHostToDevice); + + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); + + cuda_get_arch(thr_id); +} + +__host__ +static void whirl_midstate(void *state, const void *input) +{ + sph_whirlpool_context ctx; + + sph_whirlpool1_init(&ctx); + sph_whirlpool1(&ctx, input, 64); + + memcpy(state, ctx.state, 64); +} + +__host__ +void whirlpool512_setBlock_80(void *pdata, const void *ptarget) +{ + uint64_t PaddedMessage[16]; + + memcpy(PaddedMessage, pdata, 80); + memset(((uint8_t*)&PaddedMessage)+80, 0, 48); + ((uint8_t*)&PaddedMessage)[80] = 0x80; /* ending */ + + // compute constant first block + uint64_t midstate[16] = { 0 }; + whirl_midstate(midstate, pdata); + 
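// the first 64-byte block of the header is hashed once on the CPU (sph_whirlpool1 above); its chaining value replaces PaddedMessage[0..7], so the GPU kernel only processes the final block that carries the nonce + 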
memcpy(PaddedMessage, midstate, 64); + + uint64_t round_constants[80]; + uint64_t n[8]; + + n[0] = PaddedMessage[0] ^ PaddedMessage[8]; //read data + n[1] = PaddedMessage[1] ^ PaddedMessage[9]; + n[2] = PaddedMessage[2] ^ 0x0000000000000080; //whirlpool + n[3] = PaddedMessage[3]; + n[4] = PaddedMessage[4]; + n[5] = PaddedMessage[5]; + n[6] = PaddedMessage[6]; + n[7] = PaddedMessage[7] ^ 0x8002000000000000; + + ROUND_KSCHED(PaddedMessage,round_constants,old1_RC[0]); + + for(int i=1;i<10;i++){ + ROUND_KSCHED(&round_constants[8*(i-1)],&round_constants[8*i],old1_RC[i]); + } + + //USE the same memory place to store keys and state + round_constants[ 0]^= old1_T0[BYTE(n[0], 0)] + ^ table_skew(old1_T0[BYTE(n[7], 1)], 1) ^ table_skew(old1_T0[BYTE(n[6], 2)], 2) ^ table_skew(old1_T0[BYTE(n[5], 3)], 3) + ^ table_skew(old1_T0[BYTE(n[4], 4)], 4) ^ table_skew(old1_T0[BYTE(n[3], 5)], 5) ^ table_skew(old1_T0[BYTE(n[2], 6)], 6); + + round_constants[ 1]^= old1_T0[BYTE(n[1], 0)] + ^ table_skew(old1_T0[BYTE(n[0], 1)], 1) ^ table_skew(old1_T0[BYTE(n[7], 2)], 2) ^ table_skew(old1_T0[BYTE(n[6], 3)], 3) + ^ table_skew(old1_T0[BYTE(n[5], 4)], 4) ^ table_skew(old1_T0[BYTE(n[4], 5)], 5) ^ table_skew(old1_T0[BYTE(n[3], 6)], 6) + ^ table_skew(old1_T0[BYTE(n[2], 7)], 7); + + round_constants[ 2]^= old1_T0[BYTE(n[2], 0)] + ^ table_skew(old1_T0[BYTE(n[1], 1)], 1) ^ table_skew(old1_T0[BYTE(n[0], 2)], 2) ^ table_skew(old1_T0[BYTE(n[7], 3)], 3) + ^ table_skew(old1_T0[BYTE(n[6], 4)], 4) ^ table_skew(old1_T0[BYTE(n[5], 5)], 5) ^ table_skew(old1_T0[BYTE(n[4], 6)], 6) + ^ table_skew(old1_T0[BYTE(n[3], 7)], 7); + + round_constants[ 3]^= old1_T0[BYTE(n[3], 0)] + ^ table_skew(old1_T0[BYTE(n[2], 1)], 1) ^ table_skew(old1_T0[BYTE(n[1], 2)], 2) ^ table_skew(old1_T0[BYTE(n[0], 3)], 3) + ^ table_skew(old1_T0[BYTE(n[7], 4)], 4) ^ table_skew(old1_T0[BYTE(n[6], 5)], 5) ^ table_skew(old1_T0[BYTE(n[5], 6)], 6) + ^ table_skew(old1_T0[BYTE(n[4], 7)], 7); + + round_constants[ 4]^= old1_T0[BYTE(n[4], 0)] + ^ table_skew(old1_T0[BYTE(n[3], 1)], 1) ^ table_skew(old1_T0[BYTE(n[2], 2)], 2) ^ table_skew(old1_T0[BYTE(n[1], 3)], 3) + ^ table_skew(old1_T0[BYTE(n[0], 4)], 4) ^ table_skew(old1_T0[BYTE(n[7], 5)], 5) ^ table_skew(old1_T0[BYTE(n[6], 6)], 6) + ^ table_skew(old1_T0[BYTE(n[5], 7)], 7); + + round_constants[ 5]^= old1_T0[BYTE(n[5], 0)] + ^ table_skew(old1_T0[BYTE(n[4], 1)], 1) ^ table_skew(old1_T0[BYTE(n[3], 2)], 2) ^ table_skew(old1_T0[BYTE(n[2], 3)], 3) + ^ table_skew(old1_T0[BYTE(n[0], 5)], 5) ^ table_skew(old1_T0[BYTE(n[7], 6)], 6) ^ table_skew(old1_T0[BYTE(n[6], 7)], 7); + + round_constants[ 6]^= old1_T0[BYTE(n[6], 0)] + ^ table_skew(old1_T0[BYTE(n[5], 1)], 1) ^ table_skew(old1_T0[BYTE(n[4], 2)], 2) ^ table_skew(old1_T0[BYTE(n[3], 3)], 3) + ^ table_skew(old1_T0[BYTE(n[2], 4)], 4) ^ table_skew(old1_T0[BYTE(n[0], 6)], 6) ^ table_skew(old1_T0[BYTE(n[7], 7)], 7); + + round_constants[ 7]^= old1_T0[BYTE(n[7], 0)] + ^ table_skew(old1_T0[BYTE(n[6], 1)], 1) ^ table_skew(old1_T0[BYTE(n[5], 2)], 2) ^ table_skew(old1_T0[BYTE(n[4], 3)], 3) + ^ table_skew(old1_T0[BYTE(n[3], 4)], 4) ^ table_skew(old1_T0[BYTE(n[2], 5)], 5) ^ table_skew(old1_T0[BYTE(n[0], 7)], 7); + + for(int i=1;i<5;i++) + n[i] = round_constants[i]; + + round_constants[ 8]^= table_skew(old1_T0[BYTE(n[4], 4)], 4) + ^ table_skew(old1_T0[BYTE(n[3], 5)], 5) ^ table_skew(old1_T0[BYTE(n[2], 6)], 6) ^ table_skew(old1_T0[BYTE(n[1], 7)], 7); + + round_constants[ 9]^= old1_T0[BYTE(n[1], 0)] + ^ table_skew(old1_T0[BYTE(n[4], 5)], 5) ^ table_skew(old1_T0[BYTE(n[3], 6)], 6) ^ table_skew(old1_T0[BYTE(n[2], 
7)], 7); + + round_constants[10]^= old1_T0[BYTE(n[2], 0)] + ^ table_skew(old1_T0[BYTE(n[1], 1)], 1) ^ table_skew(old1_T0[BYTE(n[4], 6)], 6) ^ table_skew(old1_T0[BYTE(n[3], 7)], 7); + + round_constants[11]^= old1_T0[BYTE(n[3], 0)] + ^ table_skew(old1_T0[BYTE(n[2], 1)], 1) ^ table_skew(old1_T0[BYTE(n[1], 2)], 2) ^ table_skew(old1_T0[BYTE(n[4], 7)], 7); + + round_constants[12]^= old1_T0[BYTE(n[4], 0)] + ^ table_skew(old1_T0[BYTE(n[3], 1)], 1) ^ table_skew(old1_T0[BYTE(n[2], 2)], 2) ^ table_skew(old1_T0[BYTE(n[1], 3)], 3); + + round_constants[13]^= table_skew(old1_T0[BYTE(n[4], 1)], 1) ^ table_skew(old1_T0[BYTE(n[3], 2)], 2) + ^ table_skew(old1_T0[BYTE(n[2], 3)], 3) ^ table_skew(old1_T0[BYTE(n[1], 4)], 4); + + round_constants[14]^= table_skew(old1_T0[BYTE(n[4], 2)], 2) ^ table_skew(old1_T0[BYTE(n[3], 3)], 3) + ^ table_skew(old1_T0[BYTE(n[2], 4)], 4) ^ table_skew(old1_T0[BYTE(n[1], 5)], 5); + + round_constants[15]^= table_skew(old1_T0[BYTE(n[4], 3)], 3) ^ table_skew(old1_T0[BYTE(n[3], 4)], 4) + ^ table_skew(old1_T0[BYTE(n[2], 5)], 5) ^ table_skew(old1_T0[BYTE(n[1], 6)], 6); + + PaddedMessage[0] ^= PaddedMessage[8]; + + cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice); + + cudaMemcpyToSymbol(precomputed_round_key_80, round_constants, 80*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} + +__host__ +extern void x15_whirlpool_cpu_free(int thr_id) +{ + if (d_resNonce[thr_id]) + cudaFree(d_resNonce[thr_id]); +} + +__global__ +__launch_bounds__(TPB80,2) +void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t* resNonce, const uint64_t target) +{ + __shared__ uint2 sharedMemory[7][256]; + + if (threadIdx.x < 256) { + const uint2 tmp = __ldg((uint2*)&b0[threadIdx.x]); + sharedMemory[0][threadIdx.x] = tmp; + sharedMemory[1][threadIdx.x] = ROL8(tmp); + sharedMemory[2][threadIdx.x] = ROL16(tmp); + sharedMemory[3][threadIdx.x] = ROL24(tmp); + sharedMemory[4][threadIdx.x] = SWAPUINT2(tmp); + sharedMemory[5][threadIdx.x] = ROR24(tmp); + sharedMemory[6][threadIdx.x] = ROR16(tmp); + } + + __syncthreads(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads){ + + uint2 hash[8], state[8],n[8], tmp[8]; + uint32_t nonce = cuda_swab32(startNounce + thread); + uint2 temp = c_PaddedMessage80[9]; + temp.y = nonce; + + /// round 2 /////// + ////////////////////////////////// + temp = temp ^ c_PaddedMessage80[1]; + + *(uint2x4*)&n[ 0] = *(uint2x4*)&precomputed_round_key_80[ 0]; + *(uint2x4*)&n[ 4] = *(uint2x4*)&precomputed_round_key_80[ 4]; + *(uint2x4*)&tmp[ 0] = *(uint2x4*)&precomputed_round_key_80[ 8]; + *(uint2x4*)&tmp[ 4] = *(uint2x4*)&precomputed_round_key_80[12]; + + n[ 0]^= __ldg((uint2*)&b7[__byte_perm(temp.y, 0, 0x4443)]); + n[ 5]^= sharedMemory[4][__byte_perm(temp.y, 0, 0x4440)]; + n[ 6]^= sharedMemory[5][__byte_perm(temp.y, 0, 0x4441)]; + n[ 7]^= sharedMemory[6][__byte_perm(temp.y, 0, 0x4442)]; + + tmp[ 0]^= __ldg((uint2*)&b0[__byte_perm(n[0].x, 0, 0x4440)]); + tmp[ 0]^= sharedMemory[1][__byte_perm(n[7].x, 0, 0x4441)]; + tmp[ 0]^= sharedMemory[2][__byte_perm(n[6].x, 0, 0x4442)]; + tmp[ 0]^= sharedMemory[3][__byte_perm(n[5].x, 0, 0x4443)]; + + tmp[ 1]^= sharedMemory[1][__byte_perm(n[0].x, 0, 0x4441)]; + tmp[ 1]^= sharedMemory[2][__byte_perm(n[7].x, 0, 0x4442)]; + tmp[ 1]^= sharedMemory[3][__byte_perm(n[6].x, 0, 0x4443)]; + tmp[ 1]^= sharedMemory[4][__byte_perm(n[5].y, 0, 0x4440)]; + + tmp[ 2]^= sharedMemory[2][__byte_perm(n[0].x, 0, 0x4442)]; + tmp[ 2]^= sharedMemory[3][__byte_perm(n[7].x, 0, 0x4443)]; + tmp[ 
2]^= sharedMemory[4][__byte_perm(n[6].y, 0, 0x4440)]; + tmp[ 2]^= sharedMemory[5][__byte_perm(n[5].y, 0, 0x4441)]; + + tmp[ 3]^= sharedMemory[3][__byte_perm(n[0].x, 0, 0x4443)]; + tmp[ 3]^= sharedMemory[4][__byte_perm(n[7].y, 0, 0x4440)]; + tmp[ 3]^= ROR24(__ldg((uint2*)&b0[__byte_perm(n[6].y, 0, 0x4441)])); + tmp[ 3]^= ROR8(__ldg((uint2*)&b7[__byte_perm(n[5].y, 0, 0x4442)])); + + tmp[ 4]^= sharedMemory[4][__byte_perm(n[0].y, 0, 0x4440)]; + tmp[ 4]^= sharedMemory[5][__byte_perm(n[7].y, 0, 0x4441)]; + tmp[ 4]^= ROR8(__ldg((uint2*)&b7[__byte_perm(n[6].y, 0, 0x4442)])); + tmp[ 4]^= __ldg((uint2*)&b7[__byte_perm(n[5].y, 0, 0x4443)]); + + tmp[ 5]^= __ldg((uint2*)&b0[__byte_perm(n[5].x, 0, 0x4440)]); + tmp[ 5]^= sharedMemory[5][__byte_perm(n[0].y, 0, 0x4441)]; + tmp[ 5]^= sharedMemory[6][__byte_perm(n[7].y, 0, 0x4442)]; + tmp[ 5]^= __ldg((uint2*)&b7[__byte_perm(n[6].y, 0, 0x4443)]); + + tmp[ 6]^= __ldg((uint2*)&b0[__byte_perm(n[6].x, 0, 0x4440)]); + tmp[ 6]^= sharedMemory[1][__byte_perm(n[5].x, 0, 0x4441)]; + tmp[ 6]^= sharedMemory[6][__byte_perm(n[0].y, 0, 0x4442)]; + tmp[ 6]^= __ldg((uint2*)&b7[__byte_perm(n[7].y, 0, 0x4443)]); + + tmp[ 7]^= __ldg((uint2*)&b0[__byte_perm(n[7].x, 0, 0x4440)]); + tmp[ 7]^= sharedMemory[1][__byte_perm(n[6].x, 0, 0x4441)]; + tmp[ 7]^= sharedMemory[2][__byte_perm(n[5].x, 0, 0x4442)]; + tmp[ 7]^= __ldg((uint2*)&b7[__byte_perm(n[0].y, 0, 0x4443)]); + + TRANSFER(n, tmp); + + for (int i=2; i<10; i++) { + tmp[ 0] = d_ROUND_ELT1_LDG(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, precomputed_round_key_80[i*8+0]); + tmp[ 1] = d_ROUND_ELT1( sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, precomputed_round_key_80[i*8+1]); + tmp[ 2] = d_ROUND_ELT1( sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, precomputed_round_key_80[i*8+2]); + tmp[ 3] = d_ROUND_ELT1_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, precomputed_round_key_80[i*8+3]); + tmp[ 4] = d_ROUND_ELT1_LDG(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, precomputed_round_key_80[i*8+4]); + tmp[ 5] = d_ROUND_ELT1( sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, precomputed_round_key_80[i*8+5]); + tmp[ 6] = d_ROUND_ELT1( sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, precomputed_round_key_80[i*8+6]); + tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, precomputed_round_key_80[i*8+7]); + TRANSFER(n, tmp); + } + + state[0] = c_PaddedMessage80[0] ^ n[0]; + state[1] = c_PaddedMessage80[1] ^ n[1] ^ vectorize(REPLACE_HIDWORD(devectorize(c_PaddedMessage80[9]),nonce)); + state[2] = c_PaddedMessage80[2] ^ n[2] ^ vectorize(0x0000000000000080); + state[3] = c_PaddedMessage80[3] ^ n[3]; + state[4] = c_PaddedMessage80[4] ^ n[4]; + state[5] = c_PaddedMessage80[5] ^ n[5]; + state[6] = c_PaddedMessage80[6] ^ n[6]; + state[7] = c_PaddedMessage80[7] ^ n[7] ^ vectorize(0x8002000000000000); + + #pragma unroll 2 + for(int r=0;r<2;r++){ + #pragma unroll 8 + for(int i=0;i<8;i++) + hash[ i] = n[ i] = state[ i]; + + uint2 h[8] = { + {0xC0EE0B30,0x672990AF},{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828}, + {0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828} + }; + + tmp[ 0] = d_ROUND_ELT1_LDG(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, h[0]); + tmp[ 1] = d_ROUND_ELT1(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, h[1]); + tmp[ 2] = d_ROUND_ELT1(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, h[2]); + tmp[ 3] = d_ROUND_ELT1_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, h[3]); + tmp[ 4] = d_ROUND_ELT1(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, h[4]); + tmp[ 5] = d_ROUND_ELT1_LDG(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, h[5]); + tmp[ 6] = 
d_ROUND_ELT1(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, h[6]); + tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, h[7]); + TRANSFER(n, tmp); + // #pragma unroll 10 + for (int i=1; i <10; i++){ + tmp[ 0] = d_ROUND_ELT1_LDG(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, precomputed_round_key_64[(i-1)*8+0]); + tmp[ 1] = d_ROUND_ELT1( sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, precomputed_round_key_64[(i-1)*8+1]); + tmp[ 2] = d_ROUND_ELT1( sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, precomputed_round_key_64[(i-1)*8+2]); + tmp[ 3] = d_ROUND_ELT1_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, precomputed_round_key_64[(i-1)*8+3]); + tmp[ 4] = d_ROUND_ELT1( sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, precomputed_round_key_64[(i-1)*8+4]); + tmp[ 5] = d_ROUND_ELT1( sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, precomputed_round_key_64[(i-1)*8+5]); + tmp[ 6] = d_ROUND_ELT1( sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, precomputed_round_key_64[(i-1)*8+6]); + tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, precomputed_round_key_64[(i-1)*8+7]); + TRANSFER(n, tmp); + } + #pragma unroll 8 + for (int i=0; i<8; i++) + state[i] = n[i] ^ hash[i]; + + #pragma unroll 6 + for (int i=1; i<7; i++) + n[i]=vectorize(0); + + n[0] = vectorize(0x80); + n[7] = vectorize(0x2000000000000); + + #pragma unroll 8 + for (int i=0; i < 8; i++) { + h[i] = state[i]; + n[i] = n[i] ^ h[i]; + } + + // #pragma unroll 10 + for (int i=0; i < 10; i++) { + tmp[ 0] = d_ROUND_ELT1(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1, InitVector_RC[i]); + tmp[ 1] = d_ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[ 2] = d_ROUND_ELT_LDG(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[ 3] = d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[ 4] = d_ROUND_ELT_LDG(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[ 5] = d_ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[ 6] = d_ROUND_ELT_LDG(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp[ 7] = d_ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + TRANSFER(h, tmp); + tmp[ 0] = d_ROUND_ELT1(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, tmp[0]); + tmp[ 1] = d_ROUND_ELT1(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, tmp[1]); + tmp[ 2] = d_ROUND_ELT1_LDG(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, tmp[2]); + tmp[ 3] = d_ROUND_ELT1(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, tmp[3]); + tmp[ 4] = d_ROUND_ELT1(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, tmp[4]); + tmp[ 5] = d_ROUND_ELT1(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, tmp[5]); + tmp[ 6] = d_ROUND_ELT1(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, tmp[6]); + tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, tmp[7]); + TRANSFER(n, tmp); + } + + state[0] = xor3x(state[0], n[0], vectorize(0x80)); + state[1] = state[1]^ n[1]; + state[2] = state[2]^ n[2]; + state[3] = state[3]^ n[3]; + state[4] = state[4]^ n[4]; + state[5] = state[5]^ n[5]; + state[6] = state[6]^ n[6]; + state[7] = xor3x(state[7], n[7], vectorize(0x2000000000000)); + } + + uint2 h[8] = { + {0xC0EE0B30,0x672990AF},{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828}, + {0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828} + }; + + #pragma unroll 8 + for(int i=0;i<8;i++) + n[i]=hash[i] = state[ i]; + + tmp[ 0] = d_ROUND_ELT1(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, h[0]); + tmp[ 1] = d_ROUND_ELT1_LDG(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, h[1]); + tmp[ 2] = d_ROUND_ELT1(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, h[2]); + tmp[ 3] = d_ROUND_ELT1_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, h[3]); + tmp[ 4] = 
d_ROUND_ELT1(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, h[4]); + tmp[ 5] = d_ROUND_ELT1_LDG(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, h[5]); + tmp[ 6] = d_ROUND_ELT1(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, h[6]); + tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, h[7]); + TRANSFER(n, tmp); +// #pragma unroll 10 + for (int i=1; i <10; i++){ + tmp[ 0] = d_ROUND_ELT1_LDG(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, precomputed_round_key_64[(i-1)*8+0]); + tmp[ 1] = d_ROUND_ELT1( sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, precomputed_round_key_64[(i-1)*8+1]); + tmp[ 2] = d_ROUND_ELT1( sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, precomputed_round_key_64[(i-1)*8+2]); + tmp[ 3] = d_ROUND_ELT1_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, precomputed_round_key_64[(i-1)*8+3]); + tmp[ 4] = d_ROUND_ELT1( sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, precomputed_round_key_64[(i-1)*8+4]); + tmp[ 5] = d_ROUND_ELT1( sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, precomputed_round_key_64[(i-1)*8+5]); + tmp[ 6] = d_ROUND_ELT1( sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, precomputed_round_key_64[(i-1)*8+6]); + tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, precomputed_round_key_64[(i-1)*8+7]); + TRANSFER(n, tmp); + } + + #pragma unroll 8 + for (int i=0; i<8; i++) + n[ i] = h[i] = n[i] ^ hash[i]; + + uint2 backup = h[ 3]; + + n[0]^= vectorize(0x80); + n[7]^= vectorize(0x2000000000000); + +// #pragma unroll 8 + for (int i=0; i < 8; i++) { + tmp[ 0] = d_ROUND_ELT1(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1, InitVector_RC[i]); + tmp[ 1] = d_ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[ 2] = d_ROUND_ELT_LDG(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[ 3] = d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[ 4] = d_ROUND_ELT_LDG(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[ 5] = d_ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[ 6] = d_ROUND_ELT_LDG(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp[ 7] = d_ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + TRANSFER(h, tmp); + tmp[ 0] = d_ROUND_ELT1(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, tmp[0]); + tmp[ 1] = d_ROUND_ELT1(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, tmp[1]); + tmp[ 2] = d_ROUND_ELT1_LDG(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, tmp[2]); + tmp[ 3] = d_ROUND_ELT1(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, tmp[3]); + tmp[ 4] = d_ROUND_ELT1(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, tmp[4]); + tmp[ 5] = d_ROUND_ELT1(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, tmp[5]); + tmp[ 6] = d_ROUND_ELT1(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, tmp[6]); + tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, tmp[7]); + TRANSFER(n, tmp); + } + tmp[ 0] = d_ROUND_ELT1(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1, InitVector_RC[8]); + tmp[ 1] = d_ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[ 2] = d_ROUND_ELT_LDG(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[ 3] = d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[ 4] = d_ROUND_ELT_LDG(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[ 5] = d_ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[ 6] = d_ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp[ 7] = d_ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + TRANSFER(h, tmp); + tmp[ 0] = d_ROUND_ELT1(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, tmp[0]); + tmp[ 1] = d_ROUND_ELT1(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, tmp[1]); + tmp[ 2] = d_ROUND_ELT1(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, tmp[2]); + tmp[ 3] = d_ROUND_ELT1(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, tmp[3]); + tmp[ 4] = 
d_ROUND_ELT1(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, tmp[4]); + tmp[ 5] = d_ROUND_ELT1(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, tmp[5]); + tmp[ 6] = d_ROUND_ELT1_LDG(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, tmp[6]); + tmp[ 7] = d_ROUND_ELT1(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, tmp[7]); + + n[ 3] = backup ^ d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4) + ^ d_ROUND_ELT(sharedMemory,tmp, 3, 2, 1, 0, 7, 6, 5, 4); + + if(devectorize(n[3]) <= target) { + uint32_t tmp = atomicExch(&resNonce[0], thread); + if (tmp != UINT32_MAX) + resNonce[1] = tmp; + } + + } // thread < threads +} + +/* only for whirlpool algo, no data out!! */ +__host__ +void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *h_resNonces, const uint64_t target) +{ + dim3 grid((threads + TPB80-1) / TPB80); + dim3 block(TPB80); + + cudaMemset(d_resNonce[thr_id], 0xff, 2*sizeof(uint32_t)); + + oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_resNonce[thr_id], target); + + cudaMemcpy(h_resNonces, d_resNonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost); + if (h_resNonces[0] != UINT32_MAX) h_resNonces[0] += startNounce; + if (h_resNonces[1] != UINT32_MAX) h_resNonces[1] += startNounce; +} + +__global__ +__launch_bounds__(TPB64,2) +void x15_whirlpool_gpu_hash_64(uint32_t threads, uint64_t *g_hash) +{ + __shared__ uint2 sharedMemory[7][256]; + + if (threadIdx.x < 256) { + const uint2 tmp = __ldg((uint2*)&b0[threadIdx.x]); + sharedMemory[0][threadIdx.x] = tmp; + sharedMemory[1][threadIdx.x] = ROL8(tmp); + sharedMemory[2][threadIdx.x] = ROL16(tmp); + sharedMemory[3][threadIdx.x] = ROL24(tmp); + sharedMemory[4][threadIdx.x] = SWAPUINT2(tmp); + sharedMemory[5][threadIdx.x] = ROR24(tmp); + sharedMemory[6][threadIdx.x] = ROR16(tmp); + } + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads){ + + uint2 hash[8], n[8], h[ 8]; + uint2 tmp[8] = { + {0xC0EE0B30,0x672990AF},{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828}, + {0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828},{0x28282828,0x28282828} + }; + + *(uint2x4*)&hash[ 0] = __ldg4((uint2x4*)&g_hash[(thread<<3) + 0]); + *(uint2x4*)&hash[ 4] = __ldg4((uint2x4*)&g_hash[(thread<<3) + 4]); + + __syncthreads(); + + #pragma unroll 8 + for(int i=0;i<8;i++) + n[i]=hash[i]; + + tmp[ 0]^= d_ROUND_ELT(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1); + tmp[ 1]^= d_ROUND_ELT_LDG(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[ 2]^= d_ROUND_ELT(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[ 3]^= d_ROUND_ELT_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[ 4]^= d_ROUND_ELT(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[ 5]^= d_ROUND_ELT_LDG(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[ 6]^= d_ROUND_ELT(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7); + tmp[ 7]^= d_ROUND_ELT_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0); + for (int i=1; i <10; i++){ + TRANSFER(n, tmp); + tmp[ 0] = d_ROUND_ELT1_LDG(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, precomputed_round_key_64[(i-1)*8+0]); + tmp[ 1] = d_ROUND_ELT1( sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, precomputed_round_key_64[(i-1)*8+1]); + tmp[ 2] = d_ROUND_ELT1( sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, precomputed_round_key_64[(i-1)*8+2]); + tmp[ 3] = d_ROUND_ELT1_LDG(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, precomputed_round_key_64[(i-1)*8+3]); + tmp[ 4] = d_ROUND_ELT1( sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, precomputed_round_key_64[(i-1)*8+4]); + tmp[ 5] = d_ROUND_ELT1( sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, precomputed_round_key_64[(i-1)*8+5]);
+ tmp[ 6] = d_ROUND_ELT1( sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, precomputed_round_key_64[(i-1)*8+6]); + tmp[ 7] = d_ROUND_ELT1_LDG(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, precomputed_round_key_64[(i-1)*8+7]); + } + + TRANSFER(h, tmp); + #pragma unroll 8 + for (int i=0; i<8; i++) + hash[ i] = h[i] = h[i] ^ hash[i]; + + #pragma unroll 6 + for (int i=1; i<7; i++) + n[i]=vectorize(0); + + n[0] = vectorize(0x80); + n[7] = vectorize(0x2000000000000); + + #pragma unroll 8 + for (int i=0; i < 8; i++) { + n[i] = n[i] ^ h[i]; + } + +// #pragma unroll 10 + for (int i=0; i < 10; i++) { + tmp[ 0] = InitVector_RC[i]; + tmp[ 0]^= d_ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1); + tmp[ 1] = d_ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[ 2] = d_ROUND_ELT_LDG(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[ 3] = d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[ 4] = d_ROUND_ELT_LDG(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[ 5] = d_ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[ 6] = d_ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp[ 7] = d_ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + TRANSFER(h, tmp); + tmp[ 0] = d_ROUND_ELT1(sharedMemory,n, 0, 7, 6, 5, 4, 3, 2, 1, tmp[0]); + tmp[ 1] = d_ROUND_ELT1_LDG(sharedMemory,n, 1, 0, 7, 6, 5, 4, 3, 2, tmp[1]); + tmp[ 2] = d_ROUND_ELT1(sharedMemory,n, 2, 1, 0, 7, 6, 5, 4, 3, tmp[2]); + tmp[ 3] = d_ROUND_ELT1(sharedMemory,n, 3, 2, 1, 0, 7, 6, 5, 4, tmp[3]); + tmp[ 4] = d_ROUND_ELT1_LDG(sharedMemory,n, 4, 3, 2, 1, 0, 7, 6, 5, tmp[4]); + tmp[ 5] = d_ROUND_ELT1(sharedMemory,n, 5, 4, 3, 2, 1, 0, 7, 6, tmp[5]); + tmp[ 6] = d_ROUND_ELT1_LDG(sharedMemory,n, 6, 5, 4, 3, 2, 1, 0, 7, tmp[6]); + tmp[ 7] = d_ROUND_ELT1(sharedMemory,n, 7, 6, 5, 4, 3, 2, 1, 0, tmp[7]); + TRANSFER(n, tmp); + } + + hash[0] = xor3x(hash[0], n[0], vectorize(0x80)); + hash[1] = hash[1]^ n[1]; + hash[2] = hash[2]^ n[2]; + hash[3] = hash[3]^ n[3]; + hash[4] = hash[4]^ n[4]; + hash[5] = hash[5]^ n[5]; + hash[6] = hash[6]^ n[6]; + hash[7] = xor3x(hash[7], n[7], vectorize(0x2000000000000)); + + *(uint2x4*)&g_hash[(thread<<3)+ 0] = *(uint2x4*)&hash[ 0]; + *(uint2x4*)&g_hash[(thread<<3)+ 4] = *(uint2x4*)&hash[ 4]; + } +} + +__host__ +static void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) +{ + dim3 grid((threads + TPB64-1) / TPB64); + dim3 block(TPB64); + + x15_whirlpool_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash); +} + +__host__ +void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + x15_whirlpool_cpu_hash_64(thr_id, threads, d_hash); +} + diff --git a/x15/cuda_x15_whirlpool_sm3.cu b/x15/cuda_x15_whirlpool_sm3.cu new file mode 100644 index 0000000000..3110a694ed --- /dev/null +++ b/x15/cuda_x15_whirlpool_sm3.cu @@ -0,0 +1,2419 @@ +/** + * Whirlpool-512 CUDA implementation.
(better for SM 3.0) + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014-2016 djm34, tpruvot, SP + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * @author djm34 (initial draft) + * @author tpruvot (dual old/whirlpool modes, midstate) + * @author SP ("final" function opt and tuning) + */ +#include +#include + +// don't change, used by shared mem fetch! +#define threadsperblock 256 + +#include +#include +#include "cuda_whirlpool_tables.cuh" + +extern __device__ __device_builtin__ void __threadfence_block(void); + +__device__ static uint64_t c_PaddedMessage80[16]; +__device__ static uint32_t pTarget[8]; + +static uint32_t *h_wnounce[MAX_GPUS] = { 0 }; +static uint32_t *d_WNonce[MAX_GPUS] = { 0 }; + +#define HOST_MIDSTATE 1 +#define USE_ALL_TABLES 1 + +__constant__ static uint64_t mixTob0Tox[256]; +#if USE_ALL_TABLES +__constant__ static uint64_t mixTob1Tox[256]; +__constant__ static uint64_t mixTob2Tox[256]; +__constant__ static uint64_t mixTob3Tox[256]; +__constant__ static uint64_t mixTob4Tox[256]; +__constant__ static uint64_t mixTob5Tox[256]; +__constant__ static uint64_t mixTob6Tox[256]; +__constant__ static uint64_t mixTob7Tox[256]; +#endif + +#if USE_ALL_TABLES +static const uint64_t old1_T1[256] = { + SPH_C64(0xD8C0781828181878), SPH_C64(0x2605AF23652323AF), + SPH_C64(0xB87EF9C657C6C6F9), SPH_C64(0xFB136FE825E8E86F), + SPH_C64(0xCB4CA187948787A1), SPH_C64(0x11A962B8D5B8B862), + SPH_C64(0x0908050103010105), SPH_C64(0x0D426E4FD14F4F6E), + SPH_C64(0x9BADEE365A3636EE), SPH_C64(0xFF5904A6F7A6A604), + SPH_C64(0x0CDEBDD26BD2D2BD), SPH_C64(0x0EFB06F502F5F506), + SPH_C64(0x96EF80798B797980), SPH_C64(0x305FCE6FB16F6FCE), + SPH_C64(0x6DFCEF91AE9191EF), SPH_C64(0xF8AA0752F6525207), + SPH_C64(0x4727FD60A06060FD), SPH_C64(0x358976BCD9BCBC76), + SPH_C64(0x37ACCD9BB09B9BCD), SPH_C64(0x8A048C8E8F8E8E8C), + SPH_C64(0xD27115A3F8A3A315), SPH_C64(0x6C603C0C140C0C3C), + SPH_C64(0x84FF8A7B8D7B7B8A), SPH_C64(0x80B5E1355F3535E1), + SPH_C64(0xF5E8691D271D1D69), SPH_C64(0xB35347E03DE0E047), + SPH_C64(0x21F6ACD764D7D7AC), SPH_C64(0x9C5EEDC25BC2C2ED), + SPH_C64(0x436D962E722E2E96), SPH_C64(0x29627A4BDD4B4B7A), + SPH_C64(0x5DA321FE1FFEFE21), SPH_C64(0xD5821657F9575716), + SPH_C64(0xBDA841153F151541), SPH_C64(0xE89FB677997777B6), + SPH_C64(0x92A5EB37593737EB), SPH_C64(0x9E7B56E532E5E556), + SPH_C64(0x138CD99FBC9F9FD9), SPH_C64(0x23D317F00DF0F017), + 
SPH_C64(0x206A7F4ADE4A4A7F), SPH_C64(0x449E95DA73DADA95), + SPH_C64(0xA2FA2558E8585825), SPH_C64(0xCF06CAC946C9C9CA), + SPH_C64(0x7C558D297B29298D), SPH_C64(0x5A50220A1E0A0A22), + SPH_C64(0x50E14FB1CEB1B14F), SPH_C64(0xC9691AA0FDA0A01A), + SPH_C64(0x147FDA6BBD6B6BDA), SPH_C64(0xD95CAB85928585AB), + SPH_C64(0x3C8173BDDABDBD73), SPH_C64(0x8FD2345DE75D5D34), + SPH_C64(0x9080501030101050), SPH_C64(0x07F303F401F4F403), + SPH_C64(0xDD16C0CB40CBCBC0), SPH_C64(0xD3EDC63E423E3EC6), + SPH_C64(0x2D2811050F050511), SPH_C64(0x781FE667A96767E6), + SPH_C64(0x977353E431E4E453), SPH_C64(0x0225BB27692727BB), + SPH_C64(0x73325841C3414158), SPH_C64(0xA72C9D8B808B8B9D), + SPH_C64(0xF65101A7F4A7A701), SPH_C64(0xB2CF947D877D7D94), + SPH_C64(0x49DCFB95A29595FB), SPH_C64(0x568E9FD875D8D89F), + SPH_C64(0x708B30FB10FBFB30), SPH_C64(0xCD2371EE2FEEEE71), + SPH_C64(0xBBC7917C847C7C91), SPH_C64(0x7117E366AA6666E3), + SPH_C64(0x7BA68EDD7ADDDD8E), SPH_C64(0xAFB84B173917174B), + SPH_C64(0x45024647C9474746), SPH_C64(0x1A84DC9EBF9E9EDC), + SPH_C64(0xD41EC5CA43CACAC5), SPH_C64(0x5875992D772D2D99), + SPH_C64(0x2E9179BFDCBFBF79), SPH_C64(0x3F381B070907071B), + SPH_C64(0xAC0123ADEAADAD23), SPH_C64(0xB0EA2F5AEE5A5A2F), + SPH_C64(0xEF6CB583988383B5), SPH_C64(0xB685FF33553333FF), + SPH_C64(0x5C3FF263A56363F2), SPH_C64(0x12100A020602020A), + SPH_C64(0x933938AAE3AAAA38), SPH_C64(0xDEAFA871937171A8), + SPH_C64(0xC60ECFC845C8C8CF), SPH_C64(0xD1C87D192B19197D), + SPH_C64(0x3B727049DB494970), SPH_C64(0x5F869AD976D9D99A), + SPH_C64(0x31C31DF20BF2F21D), SPH_C64(0xA84B48E338E3E348), + SPH_C64(0xB9E22A5BED5B5B2A), SPH_C64(0xBC34928885888892), + SPH_C64(0x3EA4C89AB39A9AC8), SPH_C64(0x0B2DBE266A2626BE), + SPH_C64(0xBF8DFA32563232FA), SPH_C64(0x59E94AB0CDB0B04A), + SPH_C64(0xF21B6AE926E9E96A), SPH_C64(0x7778330F110F0F33), + SPH_C64(0x33E6A6D562D5D5A6), SPH_C64(0xF474BA809D8080BA), + SPH_C64(0x27997CBEDFBEBE7C), SPH_C64(0xEB26DECD4ACDCDDE), + SPH_C64(0x89BDE4345C3434E4), SPH_C64(0x327A7548D8484875), + SPH_C64(0x54AB24FF1CFFFF24), SPH_C64(0x8DF78F7A8E7A7A8F), + SPH_C64(0x64F4EA90AD9090EA), SPH_C64(0x9DC23E5FE15F5F3E), + SPH_C64(0x3D1DA020602020A0), SPH_C64(0x0F67D568B86868D5), + SPH_C64(0xCAD0721A2E1A1A72), SPH_C64(0xB7192CAEEFAEAE2C), + SPH_C64(0x7DC95EB4C1B4B45E), SPH_C64(0xCE9A1954FC545419), + SPH_C64(0x7FECE593A89393E5), SPH_C64(0x2F0DAA22662222AA), + SPH_C64(0x6307E964AC6464E9), SPH_C64(0x2ADB12F10EF1F112), + SPH_C64(0xCCBFA273957373A2), SPH_C64(0x82905A123612125A), + SPH_C64(0x7A3A5D40C040405D), SPH_C64(0x4840280818080828), + SPH_C64(0x9556E8C358C3C3E8), SPH_C64(0xDF337BEC29ECEC7B), + SPH_C64(0x4D9690DB70DBDB90), SPH_C64(0xC0611FA1FEA1A11F), + SPH_C64(0x911C838D8A8D8D83), SPH_C64(0xC8F5C93D473D3DC9), + SPH_C64(0x5BCCF197A49797F1), SPH_C64(0x0000000000000000), + SPH_C64(0xF936D4CF4CCFCFD4), SPH_C64(0x6E45872B7D2B2B87), + SPH_C64(0xE197B3769A7676B3), SPH_C64(0xE664B0829B8282B0), + SPH_C64(0x28FEA9D667D6D6A9), SPH_C64(0xC3D8771B2D1B1B77), + SPH_C64(0x74C15BB5C2B5B55B), SPH_C64(0xBE1129AFECAFAF29), + SPH_C64(0x1D77DF6ABE6A6ADF), SPH_C64(0xEABA0D50F050500D), + SPH_C64(0x57124C45CF45454C), SPH_C64(0x38CB18F308F3F318), + SPH_C64(0xAD9DF030503030F0), SPH_C64(0xC42B74EF2CEFEF74), + SPH_C64(0xDAE5C33F413F3FC3), SPH_C64(0xC7921C55FF55551C), + SPH_C64(0xDB7910A2FBA2A210), SPH_C64(0xE90365EA23EAEA65), + SPH_C64(0x6A0FEC65AF6565EC), SPH_C64(0x03B968BAD3BABA68), + SPH_C64(0x4A65932F712F2F93), SPH_C64(0x8E4EE7C05DC0C0E7), + SPH_C64(0x60BE81DE7FDEDE81), SPH_C64(0xFCE06C1C241C1C6C), + SPH_C64(0x46BB2EFD1AFDFD2E), SPH_C64(0x1F52644DD74D4D64), + 
SPH_C64(0x76E4E092AB9292E0), SPH_C64(0xFA8FBC759F7575BC), + SPH_C64(0x36301E060A06061E), SPH_C64(0xAE24988A838A8A98), + SPH_C64(0x4BF940B2CBB2B240), SPH_C64(0x856359E637E6E659), + SPH_C64(0x7E70360E120E0E36), SPH_C64(0xE7F8631F211F1F63), + SPH_C64(0x5537F762A66262F7), SPH_C64(0x3AEEA3D461D4D4A3), + SPH_C64(0x812932A8E5A8A832), SPH_C64(0x52C4F496A79696F4), + SPH_C64(0x629B3AF916F9F93A), SPH_C64(0xA366F6C552C5C5F6), + SPH_C64(0x1035B1256F2525B1), SPH_C64(0xABF22059EB595920), + SPH_C64(0xD054AE84918484AE), SPH_C64(0xC5B7A772967272A7), + SPH_C64(0xECD5DD394B3939DD), SPH_C64(0x165A614CD44C4C61), + SPH_C64(0x94CA3B5EE25E5E3B), SPH_C64(0x9FE7857888787885), + SPH_C64(0xE5DDD838483838D8), SPH_C64(0x9814868C898C8C86), + SPH_C64(0x17C6B2D16ED1D1B2), SPH_C64(0xE4410BA5F2A5A50B), + SPH_C64(0xA1434DE23BE2E24D), SPH_C64(0x4E2FF861A36161F8), + SPH_C64(0x42F145B3C8B3B345), SPH_C64(0x3415A521632121A5), + SPH_C64(0x0894D69CB99C9CD6), SPH_C64(0xEEF0661E221E1E66), + SPH_C64(0x61225243C5434352), SPH_C64(0xB176FCC754C7C7FC), + SPH_C64(0x4FB32BFC19FCFC2B), SPH_C64(0x242014040C040414), + SPH_C64(0xE3B20851F3515108), SPH_C64(0x25BCC799B69999C7), + SPH_C64(0x224FC46DB76D6DC4), SPH_C64(0x6568390D170D0D39), + SPH_C64(0x798335FA13FAFA35), SPH_C64(0x69B684DF7CDFDF84), + SPH_C64(0xA9D79B7E827E7E9B), SPH_C64(0x193DB4246C2424B4), + SPH_C64(0xFEC5D73B4D3B3BD7), SPH_C64(0x9A313DABE0ABAB3D), + SPH_C64(0xF03ED1CE4FCECED1), SPH_C64(0x9988551133111155), + SPH_C64(0x830C898F8C8F8F89), SPH_C64(0x044A6B4ED24E4E6B), + SPH_C64(0x66D151B7C4B7B751), SPH_C64(0xE00B60EB20EBEB60), + SPH_C64(0xC1FDCC3C443C3CCC), SPH_C64(0xFD7CBF819E8181BF), + SPH_C64(0x40D4FE94A19494FE), SPH_C64(0x1CEB0CF704F7F70C), + SPH_C64(0x18A167B9D6B9B967), SPH_C64(0x8B985F133513135F), + SPH_C64(0x517D9C2C742C2C9C), SPH_C64(0x05D6B8D368D3D3B8), + SPH_C64(0x8C6B5CE734E7E75C), SPH_C64(0x3957CB6EB26E6ECB), + SPH_C64(0xAA6EF3C451C4C4F3), SPH_C64(0x1B180F030503030F), + SPH_C64(0xDC8A1356FA565613), SPH_C64(0x5E1A4944CC444449), + SPH_C64(0xA0DF9E7F817F7F9E), SPH_C64(0x882137A9E6A9A937), + SPH_C64(0x674D822A7E2A2A82), SPH_C64(0x0AB16DBBD0BBBB6D), + SPH_C64(0x8746E2C15EC1C1E2), SPH_C64(0xF1A20253F5535302), + SPH_C64(0x72AE8BDC79DCDC8B), SPH_C64(0x5358270B1D0B0B27), + SPH_C64(0x019CD39DBA9D9DD3), SPH_C64(0x2B47C16CB46C6CC1), + SPH_C64(0xA495F531533131F5), SPH_C64(0xF387B9749C7474B9), + SPH_C64(0x15E309F607F6F609), SPH_C64(0x4C0A4346CA464643), + SPH_C64(0xA50926ACE9ACAC26), SPH_C64(0xB53C978986898997), + SPH_C64(0xB4A044143C141444), SPH_C64(0xBA5B42E13EE1E142), + SPH_C64(0xA6B04E163A16164E), SPH_C64(0xF7CDD23A4E3A3AD2), + SPH_C64(0x066FD069BB6969D0), SPH_C64(0x41482D091B09092D), + SPH_C64(0xD7A7AD70907070AD), SPH_C64(0x6FD954B6C7B6B654), + SPH_C64(0x1ECEB7D06DD0D0B7), SPH_C64(0xD63B7EED2AEDED7E), + SPH_C64(0xE22EDBCC49CCCCDB), SPH_C64(0x682A5742C6424257), + SPH_C64(0x2CB4C298B59898C2), SPH_C64(0xED490EA4F1A4A40E), + SPH_C64(0x755D882878282888), SPH_C64(0x86DA315CE45C5C31), + SPH_C64(0x6B933FF815F8F83F), SPH_C64(0xC244A486978686A4) +}; + +static const uint64_t old1_T2[256] = { + SPH_C64(0xC0781828181878D8), SPH_C64(0x05AF23652323AF26), + SPH_C64(0x7EF9C657C6C6F9B8), SPH_C64(0x136FE825E8E86FFB), + SPH_C64(0x4CA187948787A1CB), SPH_C64(0xA962B8D5B8B86211), + SPH_C64(0x0805010301010509), SPH_C64(0x426E4FD14F4F6E0D), + SPH_C64(0xADEE365A3636EE9B), SPH_C64(0x5904A6F7A6A604FF), + SPH_C64(0xDEBDD26BD2D2BD0C), SPH_C64(0xFB06F502F5F5060E), + SPH_C64(0xEF80798B79798096), SPH_C64(0x5FCE6FB16F6FCE30), + SPH_C64(0xFCEF91AE9191EF6D), SPH_C64(0xAA0752F6525207F8), + SPH_C64(0x27FD60A06060FD47), 
SPH_C64(0x8976BCD9BCBC7635), + SPH_C64(0xACCD9BB09B9BCD37), SPH_C64(0x048C8E8F8E8E8C8A), + SPH_C64(0x7115A3F8A3A315D2), SPH_C64(0x603C0C140C0C3C6C), + SPH_C64(0xFF8A7B8D7B7B8A84), SPH_C64(0xB5E1355F3535E180), + SPH_C64(0xE8691D271D1D69F5), SPH_C64(0x5347E03DE0E047B3), + SPH_C64(0xF6ACD764D7D7AC21), SPH_C64(0x5EEDC25BC2C2ED9C), + SPH_C64(0x6D962E722E2E9643), SPH_C64(0x627A4BDD4B4B7A29), + SPH_C64(0xA321FE1FFEFE215D), SPH_C64(0x821657F9575716D5), + SPH_C64(0xA841153F151541BD), SPH_C64(0x9FB677997777B6E8), + SPH_C64(0xA5EB37593737EB92), SPH_C64(0x7B56E532E5E5569E), + SPH_C64(0x8CD99FBC9F9FD913), SPH_C64(0xD317F00DF0F01723), + SPH_C64(0x6A7F4ADE4A4A7F20), SPH_C64(0x9E95DA73DADA9544), + SPH_C64(0xFA2558E8585825A2), SPH_C64(0x06CAC946C9C9CACF), + SPH_C64(0x558D297B29298D7C), SPH_C64(0x50220A1E0A0A225A), + SPH_C64(0xE14FB1CEB1B14F50), SPH_C64(0x691AA0FDA0A01AC9), + SPH_C64(0x7FDA6BBD6B6BDA14), SPH_C64(0x5CAB85928585ABD9), + SPH_C64(0x8173BDDABDBD733C), SPH_C64(0xD2345DE75D5D348F), + SPH_C64(0x8050103010105090), SPH_C64(0xF303F401F4F40307), + SPH_C64(0x16C0CB40CBCBC0DD), SPH_C64(0xEDC63E423E3EC6D3), + SPH_C64(0x2811050F0505112D), SPH_C64(0x1FE667A96767E678), + SPH_C64(0x7353E431E4E45397), SPH_C64(0x25BB27692727BB02), + SPH_C64(0x325841C341415873), SPH_C64(0x2C9D8B808B8B9DA7), + SPH_C64(0x5101A7F4A7A701F6), SPH_C64(0xCF947D877D7D94B2), + SPH_C64(0xDCFB95A29595FB49), SPH_C64(0x8E9FD875D8D89F56), + SPH_C64(0x8B30FB10FBFB3070), SPH_C64(0x2371EE2FEEEE71CD), + SPH_C64(0xC7917C847C7C91BB), SPH_C64(0x17E366AA6666E371), + SPH_C64(0xA68EDD7ADDDD8E7B), SPH_C64(0xB84B173917174BAF), + SPH_C64(0x024647C947474645), SPH_C64(0x84DC9EBF9E9EDC1A), + SPH_C64(0x1EC5CA43CACAC5D4), SPH_C64(0x75992D772D2D9958), + SPH_C64(0x9179BFDCBFBF792E), SPH_C64(0x381B070907071B3F), + SPH_C64(0x0123ADEAADAD23AC), SPH_C64(0xEA2F5AEE5A5A2FB0), + SPH_C64(0x6CB583988383B5EF), SPH_C64(0x85FF33553333FFB6), + SPH_C64(0x3FF263A56363F25C), SPH_C64(0x100A020602020A12), + SPH_C64(0x3938AAE3AAAA3893), SPH_C64(0xAFA871937171A8DE), + SPH_C64(0x0ECFC845C8C8CFC6), SPH_C64(0xC87D192B19197DD1), + SPH_C64(0x727049DB4949703B), SPH_C64(0x869AD976D9D99A5F), + SPH_C64(0xC31DF20BF2F21D31), SPH_C64(0x4B48E338E3E348A8), + SPH_C64(0xE22A5BED5B5B2AB9), SPH_C64(0x34928885888892BC), + SPH_C64(0xA4C89AB39A9AC83E), SPH_C64(0x2DBE266A2626BE0B), + SPH_C64(0x8DFA32563232FABF), SPH_C64(0xE94AB0CDB0B04A59), + SPH_C64(0x1B6AE926E9E96AF2), SPH_C64(0x78330F110F0F3377), + SPH_C64(0xE6A6D562D5D5A633), SPH_C64(0x74BA809D8080BAF4), + SPH_C64(0x997CBEDFBEBE7C27), SPH_C64(0x26DECD4ACDCDDEEB), + SPH_C64(0xBDE4345C3434E489), SPH_C64(0x7A7548D848487532), + SPH_C64(0xAB24FF1CFFFF2454), SPH_C64(0xF78F7A8E7A7A8F8D), + SPH_C64(0xF4EA90AD9090EA64), SPH_C64(0xC23E5FE15F5F3E9D), + SPH_C64(0x1DA020602020A03D), SPH_C64(0x67D568B86868D50F), + SPH_C64(0xD0721A2E1A1A72CA), SPH_C64(0x192CAEEFAEAE2CB7), + SPH_C64(0xC95EB4C1B4B45E7D), SPH_C64(0x9A1954FC545419CE), + SPH_C64(0xECE593A89393E57F), SPH_C64(0x0DAA22662222AA2F), + SPH_C64(0x07E964AC6464E963), SPH_C64(0xDB12F10EF1F1122A), + SPH_C64(0xBFA273957373A2CC), SPH_C64(0x905A123612125A82), + SPH_C64(0x3A5D40C040405D7A), SPH_C64(0x4028081808082848), + SPH_C64(0x56E8C358C3C3E895), SPH_C64(0x337BEC29ECEC7BDF), + SPH_C64(0x9690DB70DBDB904D), SPH_C64(0x611FA1FEA1A11FC0), + SPH_C64(0x1C838D8A8D8D8391), SPH_C64(0xF5C93D473D3DC9C8), + SPH_C64(0xCCF197A49797F15B), SPH_C64(0x0000000000000000), + SPH_C64(0x36D4CF4CCFCFD4F9), SPH_C64(0x45872B7D2B2B876E), + SPH_C64(0x97B3769A7676B3E1), SPH_C64(0x64B0829B8282B0E6), + SPH_C64(0xFEA9D667D6D6A928), 
SPH_C64(0xD8771B2D1B1B77C3), + SPH_C64(0xC15BB5C2B5B55B74), SPH_C64(0x1129AFECAFAF29BE), + SPH_C64(0x77DF6ABE6A6ADF1D), SPH_C64(0xBA0D50F050500DEA), + SPH_C64(0x124C45CF45454C57), SPH_C64(0xCB18F308F3F31838), + SPH_C64(0x9DF030503030F0AD), SPH_C64(0x2B74EF2CEFEF74C4), + SPH_C64(0xE5C33F413F3FC3DA), SPH_C64(0x921C55FF55551CC7), + SPH_C64(0x7910A2FBA2A210DB), SPH_C64(0x0365EA23EAEA65E9), + SPH_C64(0x0FEC65AF6565EC6A), SPH_C64(0xB968BAD3BABA6803), + SPH_C64(0x65932F712F2F934A), SPH_C64(0x4EE7C05DC0C0E78E), + SPH_C64(0xBE81DE7FDEDE8160), SPH_C64(0xE06C1C241C1C6CFC), + SPH_C64(0xBB2EFD1AFDFD2E46), SPH_C64(0x52644DD74D4D641F), + SPH_C64(0xE4E092AB9292E076), SPH_C64(0x8FBC759F7575BCFA), + SPH_C64(0x301E060A06061E36), SPH_C64(0x24988A838A8A98AE), + SPH_C64(0xF940B2CBB2B2404B), SPH_C64(0x6359E637E6E65985), + SPH_C64(0x70360E120E0E367E), SPH_C64(0xF8631F211F1F63E7), + SPH_C64(0x37F762A66262F755), SPH_C64(0xEEA3D461D4D4A33A), + SPH_C64(0x2932A8E5A8A83281), SPH_C64(0xC4F496A79696F452), + SPH_C64(0x9B3AF916F9F93A62), SPH_C64(0x66F6C552C5C5F6A3), + SPH_C64(0x35B1256F2525B110), SPH_C64(0xF22059EB595920AB), + SPH_C64(0x54AE84918484AED0), SPH_C64(0xB7A772967272A7C5), + SPH_C64(0xD5DD394B3939DDEC), SPH_C64(0x5A614CD44C4C6116), + SPH_C64(0xCA3B5EE25E5E3B94), SPH_C64(0xE78578887878859F), + SPH_C64(0xDDD838483838D8E5), SPH_C64(0x14868C898C8C8698), + SPH_C64(0xC6B2D16ED1D1B217), SPH_C64(0x410BA5F2A5A50BE4), + SPH_C64(0x434DE23BE2E24DA1), SPH_C64(0x2FF861A36161F84E), + SPH_C64(0xF145B3C8B3B34542), SPH_C64(0x15A521632121A534), + SPH_C64(0x94D69CB99C9CD608), SPH_C64(0xF0661E221E1E66EE), + SPH_C64(0x225243C543435261), SPH_C64(0x76FCC754C7C7FCB1), + SPH_C64(0xB32BFC19FCFC2B4F), SPH_C64(0x2014040C04041424), + SPH_C64(0xB20851F3515108E3), SPH_C64(0xBCC799B69999C725), + SPH_C64(0x4FC46DB76D6DC422), SPH_C64(0x68390D170D0D3965), + SPH_C64(0x8335FA13FAFA3579), SPH_C64(0xB684DF7CDFDF8469), + SPH_C64(0xD79B7E827E7E9BA9), SPH_C64(0x3DB4246C2424B419), + SPH_C64(0xC5D73B4D3B3BD7FE), SPH_C64(0x313DABE0ABAB3D9A), + SPH_C64(0x3ED1CE4FCECED1F0), SPH_C64(0x8855113311115599), + SPH_C64(0x0C898F8C8F8F8983), SPH_C64(0x4A6B4ED24E4E6B04), + SPH_C64(0xD151B7C4B7B75166), SPH_C64(0x0B60EB20EBEB60E0), + SPH_C64(0xFDCC3C443C3CCCC1), SPH_C64(0x7CBF819E8181BFFD), + SPH_C64(0xD4FE94A19494FE40), SPH_C64(0xEB0CF704F7F70C1C), + SPH_C64(0xA167B9D6B9B96718), SPH_C64(0x985F133513135F8B), + SPH_C64(0x7D9C2C742C2C9C51), SPH_C64(0xD6B8D368D3D3B805), + SPH_C64(0x6B5CE734E7E75C8C), SPH_C64(0x57CB6EB26E6ECB39), + SPH_C64(0x6EF3C451C4C4F3AA), SPH_C64(0x180F030503030F1B), + SPH_C64(0x8A1356FA565613DC), SPH_C64(0x1A4944CC4444495E), + SPH_C64(0xDF9E7F817F7F9EA0), SPH_C64(0x2137A9E6A9A93788), + SPH_C64(0x4D822A7E2A2A8267), SPH_C64(0xB16DBBD0BBBB6D0A), + SPH_C64(0x46E2C15EC1C1E287), SPH_C64(0xA20253F5535302F1), + SPH_C64(0xAE8BDC79DCDC8B72), SPH_C64(0x58270B1D0B0B2753), + SPH_C64(0x9CD39DBA9D9DD301), SPH_C64(0x47C16CB46C6CC12B), + SPH_C64(0x95F531533131F5A4), SPH_C64(0x87B9749C7474B9F3), + SPH_C64(0xE309F607F6F60915), SPH_C64(0x0A4346CA4646434C), + SPH_C64(0x0926ACE9ACAC26A5), SPH_C64(0x3C978986898997B5), + SPH_C64(0xA044143C141444B4), SPH_C64(0x5B42E13EE1E142BA), + SPH_C64(0xB04E163A16164EA6), SPH_C64(0xCDD23A4E3A3AD2F7), + SPH_C64(0x6FD069BB6969D006), SPH_C64(0x482D091B09092D41), + SPH_C64(0xA7AD70907070ADD7), SPH_C64(0xD954B6C7B6B6546F), + SPH_C64(0xCEB7D06DD0D0B71E), SPH_C64(0x3B7EED2AEDED7ED6), + SPH_C64(0x2EDBCC49CCCCDBE2), SPH_C64(0x2A5742C642425768), + SPH_C64(0xB4C298B59898C22C), SPH_C64(0x490EA4F1A4A40EED), + SPH_C64(0x5D88287828288875), 
SPH_C64(0xDA315CE45C5C3186), + SPH_C64(0x933FF815F8F83F6B), SPH_C64(0x44A486978686A4C2) +}; + +static const uint64_t old1_T3[256] = { + SPH_C64(0x781828181878D8C0), SPH_C64(0xAF23652323AF2605), + SPH_C64(0xF9C657C6C6F9B87E), SPH_C64(0x6FE825E8E86FFB13), + SPH_C64(0xA187948787A1CB4C), SPH_C64(0x62B8D5B8B86211A9), + SPH_C64(0x0501030101050908), SPH_C64(0x6E4FD14F4F6E0D42), + SPH_C64(0xEE365A3636EE9BAD), SPH_C64(0x04A6F7A6A604FF59), + SPH_C64(0xBDD26BD2D2BD0CDE), SPH_C64(0x06F502F5F5060EFB), + SPH_C64(0x80798B79798096EF), SPH_C64(0xCE6FB16F6FCE305F), + SPH_C64(0xEF91AE9191EF6DFC), SPH_C64(0x0752F6525207F8AA), + SPH_C64(0xFD60A06060FD4727), SPH_C64(0x76BCD9BCBC763589), + SPH_C64(0xCD9BB09B9BCD37AC), SPH_C64(0x8C8E8F8E8E8C8A04), + SPH_C64(0x15A3F8A3A315D271), SPH_C64(0x3C0C140C0C3C6C60), + SPH_C64(0x8A7B8D7B7B8A84FF), SPH_C64(0xE1355F3535E180B5), + SPH_C64(0x691D271D1D69F5E8), SPH_C64(0x47E03DE0E047B353), + SPH_C64(0xACD764D7D7AC21F6), SPH_C64(0xEDC25BC2C2ED9C5E), + SPH_C64(0x962E722E2E96436D), SPH_C64(0x7A4BDD4B4B7A2962), + SPH_C64(0x21FE1FFEFE215DA3), SPH_C64(0x1657F9575716D582), + SPH_C64(0x41153F151541BDA8), SPH_C64(0xB677997777B6E89F), + SPH_C64(0xEB37593737EB92A5), SPH_C64(0x56E532E5E5569E7B), + SPH_C64(0xD99FBC9F9FD9138C), SPH_C64(0x17F00DF0F01723D3), + SPH_C64(0x7F4ADE4A4A7F206A), SPH_C64(0x95DA73DADA95449E), + SPH_C64(0x2558E8585825A2FA), SPH_C64(0xCAC946C9C9CACF06), + SPH_C64(0x8D297B29298D7C55), SPH_C64(0x220A1E0A0A225A50), + SPH_C64(0x4FB1CEB1B14F50E1), SPH_C64(0x1AA0FDA0A01AC969), + SPH_C64(0xDA6BBD6B6BDA147F), SPH_C64(0xAB85928585ABD95C), + SPH_C64(0x73BDDABDBD733C81), SPH_C64(0x345DE75D5D348FD2), + SPH_C64(0x5010301010509080), SPH_C64(0x03F401F4F40307F3), + SPH_C64(0xC0CB40CBCBC0DD16), SPH_C64(0xC63E423E3EC6D3ED), + SPH_C64(0x11050F0505112D28), SPH_C64(0xE667A96767E6781F), + SPH_C64(0x53E431E4E4539773), SPH_C64(0xBB27692727BB0225), + SPH_C64(0x5841C34141587332), SPH_C64(0x9D8B808B8B9DA72C), + SPH_C64(0x01A7F4A7A701F651), SPH_C64(0x947D877D7D94B2CF), + SPH_C64(0xFB95A29595FB49DC), SPH_C64(0x9FD875D8D89F568E), + SPH_C64(0x30FB10FBFB30708B), SPH_C64(0x71EE2FEEEE71CD23), + SPH_C64(0x917C847C7C91BBC7), SPH_C64(0xE366AA6666E37117), + SPH_C64(0x8EDD7ADDDD8E7BA6), SPH_C64(0x4B173917174BAFB8), + SPH_C64(0x4647C94747464502), SPH_C64(0xDC9EBF9E9EDC1A84), + SPH_C64(0xC5CA43CACAC5D41E), SPH_C64(0x992D772D2D995875), + SPH_C64(0x79BFDCBFBF792E91), SPH_C64(0x1B070907071B3F38), + SPH_C64(0x23ADEAADAD23AC01), SPH_C64(0x2F5AEE5A5A2FB0EA), + SPH_C64(0xB583988383B5EF6C), SPH_C64(0xFF33553333FFB685), + SPH_C64(0xF263A56363F25C3F), SPH_C64(0x0A020602020A1210), + SPH_C64(0x38AAE3AAAA389339), SPH_C64(0xA871937171A8DEAF), + SPH_C64(0xCFC845C8C8CFC60E), SPH_C64(0x7D192B19197DD1C8), + SPH_C64(0x7049DB4949703B72), SPH_C64(0x9AD976D9D99A5F86), + SPH_C64(0x1DF20BF2F21D31C3), SPH_C64(0x48E338E3E348A84B), + SPH_C64(0x2A5BED5B5B2AB9E2), SPH_C64(0x928885888892BC34), + SPH_C64(0xC89AB39A9AC83EA4), SPH_C64(0xBE266A2626BE0B2D), + SPH_C64(0xFA32563232FABF8D), SPH_C64(0x4AB0CDB0B04A59E9), + SPH_C64(0x6AE926E9E96AF21B), SPH_C64(0x330F110F0F337778), + SPH_C64(0xA6D562D5D5A633E6), SPH_C64(0xBA809D8080BAF474), + SPH_C64(0x7CBEDFBEBE7C2799), SPH_C64(0xDECD4ACDCDDEEB26), + SPH_C64(0xE4345C3434E489BD), SPH_C64(0x7548D8484875327A), + SPH_C64(0x24FF1CFFFF2454AB), SPH_C64(0x8F7A8E7A7A8F8DF7), + SPH_C64(0xEA90AD9090EA64F4), SPH_C64(0x3E5FE15F5F3E9DC2), + SPH_C64(0xA020602020A03D1D), SPH_C64(0xD568B86868D50F67), + SPH_C64(0x721A2E1A1A72CAD0), SPH_C64(0x2CAEEFAEAE2CB719), + SPH_C64(0x5EB4C1B4B45E7DC9), SPH_C64(0x1954FC545419CE9A), 
+ SPH_C64(0xE593A89393E57FEC), SPH_C64(0xAA22662222AA2F0D), + SPH_C64(0xE964AC6464E96307), SPH_C64(0x12F10EF1F1122ADB), + SPH_C64(0xA273957373A2CCBF), SPH_C64(0x5A123612125A8290), + SPH_C64(0x5D40C040405D7A3A), SPH_C64(0x2808180808284840), + SPH_C64(0xE8C358C3C3E89556), SPH_C64(0x7BEC29ECEC7BDF33), + SPH_C64(0x90DB70DBDB904D96), SPH_C64(0x1FA1FEA1A11FC061), + SPH_C64(0x838D8A8D8D83911C), SPH_C64(0xC93D473D3DC9C8F5), + SPH_C64(0xF197A49797F15BCC), SPH_C64(0x0000000000000000), + SPH_C64(0xD4CF4CCFCFD4F936), SPH_C64(0x872B7D2B2B876E45), + SPH_C64(0xB3769A7676B3E197), SPH_C64(0xB0829B8282B0E664), + SPH_C64(0xA9D667D6D6A928FE), SPH_C64(0x771B2D1B1B77C3D8), + SPH_C64(0x5BB5C2B5B55B74C1), SPH_C64(0x29AFECAFAF29BE11), + SPH_C64(0xDF6ABE6A6ADF1D77), SPH_C64(0x0D50F050500DEABA), + SPH_C64(0x4C45CF45454C5712), SPH_C64(0x18F308F3F31838CB), + SPH_C64(0xF030503030F0AD9D), SPH_C64(0x74EF2CEFEF74C42B), + SPH_C64(0xC33F413F3FC3DAE5), SPH_C64(0x1C55FF55551CC792), + SPH_C64(0x10A2FBA2A210DB79), SPH_C64(0x65EA23EAEA65E903), + SPH_C64(0xEC65AF6565EC6A0F), SPH_C64(0x68BAD3BABA6803B9), + SPH_C64(0x932F712F2F934A65), SPH_C64(0xE7C05DC0C0E78E4E), + SPH_C64(0x81DE7FDEDE8160BE), SPH_C64(0x6C1C241C1C6CFCE0), + SPH_C64(0x2EFD1AFDFD2E46BB), SPH_C64(0x644DD74D4D641F52), + SPH_C64(0xE092AB9292E076E4), SPH_C64(0xBC759F7575BCFA8F), + SPH_C64(0x1E060A06061E3630), SPH_C64(0x988A838A8A98AE24), + SPH_C64(0x40B2CBB2B2404BF9), SPH_C64(0x59E637E6E6598563), + SPH_C64(0x360E120E0E367E70), SPH_C64(0x631F211F1F63E7F8), + SPH_C64(0xF762A66262F75537), SPH_C64(0xA3D461D4D4A33AEE), + SPH_C64(0x32A8E5A8A8328129), SPH_C64(0xF496A79696F452C4), + SPH_C64(0x3AF916F9F93A629B), SPH_C64(0xF6C552C5C5F6A366), + SPH_C64(0xB1256F2525B11035), SPH_C64(0x2059EB595920ABF2), + SPH_C64(0xAE84918484AED054), SPH_C64(0xA772967272A7C5B7), + SPH_C64(0xDD394B3939DDECD5), SPH_C64(0x614CD44C4C61165A), + SPH_C64(0x3B5EE25E5E3B94CA), SPH_C64(0x8578887878859FE7), + SPH_C64(0xD838483838D8E5DD), SPH_C64(0x868C898C8C869814), + SPH_C64(0xB2D16ED1D1B217C6), SPH_C64(0x0BA5F2A5A50BE441), + SPH_C64(0x4DE23BE2E24DA143), SPH_C64(0xF861A36161F84E2F), + SPH_C64(0x45B3C8B3B34542F1), SPH_C64(0xA521632121A53415), + SPH_C64(0xD69CB99C9CD60894), SPH_C64(0x661E221E1E66EEF0), + SPH_C64(0x5243C54343526122), SPH_C64(0xFCC754C7C7FCB176), + SPH_C64(0x2BFC19FCFC2B4FB3), SPH_C64(0x14040C0404142420), + SPH_C64(0x0851F3515108E3B2), SPH_C64(0xC799B69999C725BC), + SPH_C64(0xC46DB76D6DC4224F), SPH_C64(0x390D170D0D396568), + SPH_C64(0x35FA13FAFA357983), SPH_C64(0x84DF7CDFDF8469B6), + SPH_C64(0x9B7E827E7E9BA9D7), SPH_C64(0xB4246C2424B4193D), + SPH_C64(0xD73B4D3B3BD7FEC5), SPH_C64(0x3DABE0ABAB3D9A31), + SPH_C64(0xD1CE4FCECED1F03E), SPH_C64(0x5511331111559988), + SPH_C64(0x898F8C8F8F89830C), SPH_C64(0x6B4ED24E4E6B044A), + SPH_C64(0x51B7C4B7B75166D1), SPH_C64(0x60EB20EBEB60E00B), + SPH_C64(0xCC3C443C3CCCC1FD), SPH_C64(0xBF819E8181BFFD7C), + SPH_C64(0xFE94A19494FE40D4), SPH_C64(0x0CF704F7F70C1CEB), + SPH_C64(0x67B9D6B9B96718A1), SPH_C64(0x5F133513135F8B98), + SPH_C64(0x9C2C742C2C9C517D), SPH_C64(0xB8D368D3D3B805D6), + SPH_C64(0x5CE734E7E75C8C6B), SPH_C64(0xCB6EB26E6ECB3957), + SPH_C64(0xF3C451C4C4F3AA6E), SPH_C64(0x0F030503030F1B18), + SPH_C64(0x1356FA565613DC8A), SPH_C64(0x4944CC4444495E1A), + SPH_C64(0x9E7F817F7F9EA0DF), SPH_C64(0x37A9E6A9A9378821), + SPH_C64(0x822A7E2A2A82674D), SPH_C64(0x6DBBD0BBBB6D0AB1), + SPH_C64(0xE2C15EC1C1E28746), SPH_C64(0x0253F5535302F1A2), + SPH_C64(0x8BDC79DCDC8B72AE), SPH_C64(0x270B1D0B0B275358), + SPH_C64(0xD39DBA9D9DD3019C), SPH_C64(0xC16CB46C6CC12B47), + 
SPH_C64(0xF531533131F5A495), SPH_C64(0xB9749C7474B9F387), + SPH_C64(0x09F607F6F60915E3), SPH_C64(0x4346CA4646434C0A), + SPH_C64(0x26ACE9ACAC26A509), SPH_C64(0x978986898997B53C), + SPH_C64(0x44143C141444B4A0), SPH_C64(0x42E13EE1E142BA5B), + SPH_C64(0x4E163A16164EA6B0), SPH_C64(0xD23A4E3A3AD2F7CD), + SPH_C64(0xD069BB6969D0066F), SPH_C64(0x2D091B09092D4148), + SPH_C64(0xAD70907070ADD7A7), SPH_C64(0x54B6C7B6B6546FD9), + SPH_C64(0xB7D06DD0D0B71ECE), SPH_C64(0x7EED2AEDED7ED63B), + SPH_C64(0xDBCC49CCCCDBE22E), SPH_C64(0x5742C6424257682A), + SPH_C64(0xC298B59898C22CB4), SPH_C64(0x0EA4F1A4A40EED49), + SPH_C64(0x882878282888755D), SPH_C64(0x315CE45C5C3186DA), + SPH_C64(0x3FF815F8F83F6B93), SPH_C64(0xA486978686A4C244) +}; + +static const uint64_t old1_T4[256] = { + SPH_C64(0x1828181878D8C078), SPH_C64(0x23652323AF2605AF), + SPH_C64(0xC657C6C6F9B87EF9), SPH_C64(0xE825E8E86FFB136F), + SPH_C64(0x87948787A1CB4CA1), SPH_C64(0xB8D5B8B86211A962), + SPH_C64(0x0103010105090805), SPH_C64(0x4FD14F4F6E0D426E), + SPH_C64(0x365A3636EE9BADEE), SPH_C64(0xA6F7A6A604FF5904), + SPH_C64(0xD26BD2D2BD0CDEBD), SPH_C64(0xF502F5F5060EFB06), + SPH_C64(0x798B79798096EF80), SPH_C64(0x6FB16F6FCE305FCE), + SPH_C64(0x91AE9191EF6DFCEF), SPH_C64(0x52F6525207F8AA07), + SPH_C64(0x60A06060FD4727FD), SPH_C64(0xBCD9BCBC76358976), + SPH_C64(0x9BB09B9BCD37ACCD), SPH_C64(0x8E8F8E8E8C8A048C), + SPH_C64(0xA3F8A3A315D27115), SPH_C64(0x0C140C0C3C6C603C), + SPH_C64(0x7B8D7B7B8A84FF8A), SPH_C64(0x355F3535E180B5E1), + SPH_C64(0x1D271D1D69F5E869), SPH_C64(0xE03DE0E047B35347), + SPH_C64(0xD764D7D7AC21F6AC), SPH_C64(0xC25BC2C2ED9C5EED), + SPH_C64(0x2E722E2E96436D96), SPH_C64(0x4BDD4B4B7A29627A), + SPH_C64(0xFE1FFEFE215DA321), SPH_C64(0x57F9575716D58216), + SPH_C64(0x153F151541BDA841), SPH_C64(0x77997777B6E89FB6), + SPH_C64(0x37593737EB92A5EB), SPH_C64(0xE532E5E5569E7B56), + SPH_C64(0x9FBC9F9FD9138CD9), SPH_C64(0xF00DF0F01723D317), + SPH_C64(0x4ADE4A4A7F206A7F), SPH_C64(0xDA73DADA95449E95), + SPH_C64(0x58E8585825A2FA25), SPH_C64(0xC946C9C9CACF06CA), + SPH_C64(0x297B29298D7C558D), SPH_C64(0x0A1E0A0A225A5022), + SPH_C64(0xB1CEB1B14F50E14F), SPH_C64(0xA0FDA0A01AC9691A), + SPH_C64(0x6BBD6B6BDA147FDA), SPH_C64(0x85928585ABD95CAB), + SPH_C64(0xBDDABDBD733C8173), SPH_C64(0x5DE75D5D348FD234), + SPH_C64(0x1030101050908050), SPH_C64(0xF401F4F40307F303), + SPH_C64(0xCB40CBCBC0DD16C0), SPH_C64(0x3E423E3EC6D3EDC6), + SPH_C64(0x050F0505112D2811), SPH_C64(0x67A96767E6781FE6), + SPH_C64(0xE431E4E453977353), SPH_C64(0x27692727BB0225BB), + SPH_C64(0x41C3414158733258), SPH_C64(0x8B808B8B9DA72C9D), + SPH_C64(0xA7F4A7A701F65101), SPH_C64(0x7D877D7D94B2CF94), + SPH_C64(0x95A29595FB49DCFB), SPH_C64(0xD875D8D89F568E9F), + SPH_C64(0xFB10FBFB30708B30), SPH_C64(0xEE2FEEEE71CD2371), + SPH_C64(0x7C847C7C91BBC791), SPH_C64(0x66AA6666E37117E3), + SPH_C64(0xDD7ADDDD8E7BA68E), SPH_C64(0x173917174BAFB84B), + SPH_C64(0x47C9474746450246), SPH_C64(0x9EBF9E9EDC1A84DC), + SPH_C64(0xCA43CACAC5D41EC5), SPH_C64(0x2D772D2D99587599), + SPH_C64(0xBFDCBFBF792E9179), SPH_C64(0x070907071B3F381B), + SPH_C64(0xADEAADAD23AC0123), SPH_C64(0x5AEE5A5A2FB0EA2F), + SPH_C64(0x83988383B5EF6CB5), SPH_C64(0x33553333FFB685FF), + SPH_C64(0x63A56363F25C3FF2), SPH_C64(0x020602020A12100A), + SPH_C64(0xAAE3AAAA38933938), SPH_C64(0x71937171A8DEAFA8), + SPH_C64(0xC845C8C8CFC60ECF), SPH_C64(0x192B19197DD1C87D), + SPH_C64(0x49DB4949703B7270), SPH_C64(0xD976D9D99A5F869A), + SPH_C64(0xF20BF2F21D31C31D), SPH_C64(0xE338E3E348A84B48), + SPH_C64(0x5BED5B5B2AB9E22A), SPH_C64(0x8885888892BC3492), + SPH_C64(0x9AB39A9AC83EA4C8), 
SPH_C64(0x266A2626BE0B2DBE), + SPH_C64(0x32563232FABF8DFA), SPH_C64(0xB0CDB0B04A59E94A), + SPH_C64(0xE926E9E96AF21B6A), SPH_C64(0x0F110F0F33777833), + SPH_C64(0xD562D5D5A633E6A6), SPH_C64(0x809D8080BAF474BA), + SPH_C64(0xBEDFBEBE7C27997C), SPH_C64(0xCD4ACDCDDEEB26DE), + SPH_C64(0x345C3434E489BDE4), SPH_C64(0x48D8484875327A75), + SPH_C64(0xFF1CFFFF2454AB24), SPH_C64(0x7A8E7A7A8F8DF78F), + SPH_C64(0x90AD9090EA64F4EA), SPH_C64(0x5FE15F5F3E9DC23E), + SPH_C64(0x20602020A03D1DA0), SPH_C64(0x68B86868D50F67D5), + SPH_C64(0x1A2E1A1A72CAD072), SPH_C64(0xAEEFAEAE2CB7192C), + SPH_C64(0xB4C1B4B45E7DC95E), SPH_C64(0x54FC545419CE9A19), + SPH_C64(0x93A89393E57FECE5), SPH_C64(0x22662222AA2F0DAA), + SPH_C64(0x64AC6464E96307E9), SPH_C64(0xF10EF1F1122ADB12), + SPH_C64(0x73957373A2CCBFA2), SPH_C64(0x123612125A82905A), + SPH_C64(0x40C040405D7A3A5D), SPH_C64(0x0818080828484028), + SPH_C64(0xC358C3C3E89556E8), SPH_C64(0xEC29ECEC7BDF337B), + SPH_C64(0xDB70DBDB904D9690), SPH_C64(0xA1FEA1A11FC0611F), + SPH_C64(0x8D8A8D8D83911C83), SPH_C64(0x3D473D3DC9C8F5C9), + SPH_C64(0x97A49797F15BCCF1), SPH_C64(0x0000000000000000), + SPH_C64(0xCF4CCFCFD4F936D4), SPH_C64(0x2B7D2B2B876E4587), + SPH_C64(0x769A7676B3E197B3), SPH_C64(0x829B8282B0E664B0), + SPH_C64(0xD667D6D6A928FEA9), SPH_C64(0x1B2D1B1B77C3D877), + SPH_C64(0xB5C2B5B55B74C15B), SPH_C64(0xAFECAFAF29BE1129), + SPH_C64(0x6ABE6A6ADF1D77DF), SPH_C64(0x50F050500DEABA0D), + SPH_C64(0x45CF45454C57124C), SPH_C64(0xF308F3F31838CB18), + SPH_C64(0x30503030F0AD9DF0), SPH_C64(0xEF2CEFEF74C42B74), + SPH_C64(0x3F413F3FC3DAE5C3), SPH_C64(0x55FF55551CC7921C), + SPH_C64(0xA2FBA2A210DB7910), SPH_C64(0xEA23EAEA65E90365), + SPH_C64(0x65AF6565EC6A0FEC), SPH_C64(0xBAD3BABA6803B968), + SPH_C64(0x2F712F2F934A6593), SPH_C64(0xC05DC0C0E78E4EE7), + SPH_C64(0xDE7FDEDE8160BE81), SPH_C64(0x1C241C1C6CFCE06C), + SPH_C64(0xFD1AFDFD2E46BB2E), SPH_C64(0x4DD74D4D641F5264), + SPH_C64(0x92AB9292E076E4E0), SPH_C64(0x759F7575BCFA8FBC), + SPH_C64(0x060A06061E36301E), SPH_C64(0x8A838A8A98AE2498), + SPH_C64(0xB2CBB2B2404BF940), SPH_C64(0xE637E6E659856359), + SPH_C64(0x0E120E0E367E7036), SPH_C64(0x1F211F1F63E7F863), + SPH_C64(0x62A66262F75537F7), SPH_C64(0xD461D4D4A33AEEA3), + SPH_C64(0xA8E5A8A832812932), SPH_C64(0x96A79696F452C4F4), + SPH_C64(0xF916F9F93A629B3A), SPH_C64(0xC552C5C5F6A366F6), + SPH_C64(0x256F2525B11035B1), SPH_C64(0x59EB595920ABF220), + SPH_C64(0x84918484AED054AE), SPH_C64(0x72967272A7C5B7A7), + SPH_C64(0x394B3939DDECD5DD), SPH_C64(0x4CD44C4C61165A61), + SPH_C64(0x5EE25E5E3B94CA3B), SPH_C64(0x78887878859FE785), + SPH_C64(0x38483838D8E5DDD8), SPH_C64(0x8C898C8C86981486), + SPH_C64(0xD16ED1D1B217C6B2), SPH_C64(0xA5F2A5A50BE4410B), + SPH_C64(0xE23BE2E24DA1434D), SPH_C64(0x61A36161F84E2FF8), + SPH_C64(0xB3C8B3B34542F145), SPH_C64(0x21632121A53415A5), + SPH_C64(0x9CB99C9CD60894D6), SPH_C64(0x1E221E1E66EEF066), + SPH_C64(0x43C5434352612252), SPH_C64(0xC754C7C7FCB176FC), + SPH_C64(0xFC19FCFC2B4FB32B), SPH_C64(0x040C040414242014), + SPH_C64(0x51F3515108E3B208), SPH_C64(0x99B69999C725BCC7), + SPH_C64(0x6DB76D6DC4224FC4), SPH_C64(0x0D170D0D39656839), + SPH_C64(0xFA13FAFA35798335), SPH_C64(0xDF7CDFDF8469B684), + SPH_C64(0x7E827E7E9BA9D79B), SPH_C64(0x246C2424B4193DB4), + SPH_C64(0x3B4D3B3BD7FEC5D7), SPH_C64(0xABE0ABAB3D9A313D), + SPH_C64(0xCE4FCECED1F03ED1), SPH_C64(0x1133111155998855), + SPH_C64(0x8F8C8F8F89830C89), SPH_C64(0x4ED24E4E6B044A6B), + SPH_C64(0xB7C4B7B75166D151), SPH_C64(0xEB20EBEB60E00B60), + SPH_C64(0x3C443C3CCCC1FDCC), SPH_C64(0x819E8181BFFD7CBF), + SPH_C64(0x94A19494FE40D4FE), 
SPH_C64(0xF704F7F70C1CEB0C), + SPH_C64(0xB9D6B9B96718A167), SPH_C64(0x133513135F8B985F), + SPH_C64(0x2C742C2C9C517D9C), SPH_C64(0xD368D3D3B805D6B8), + SPH_C64(0xE734E7E75C8C6B5C), SPH_C64(0x6EB26E6ECB3957CB), + SPH_C64(0xC451C4C4F3AA6EF3), SPH_C64(0x030503030F1B180F), + SPH_C64(0x56FA565613DC8A13), SPH_C64(0x44CC4444495E1A49), + SPH_C64(0x7F817F7F9EA0DF9E), SPH_C64(0xA9E6A9A937882137), + SPH_C64(0x2A7E2A2A82674D82), SPH_C64(0xBBD0BBBB6D0AB16D), + SPH_C64(0xC15EC1C1E28746E2), SPH_C64(0x53F5535302F1A202), + SPH_C64(0xDC79DCDC8B72AE8B), SPH_C64(0x0B1D0B0B27535827), + SPH_C64(0x9DBA9D9DD3019CD3), SPH_C64(0x6CB46C6CC12B47C1), + SPH_C64(0x31533131F5A495F5), SPH_C64(0x749C7474B9F387B9), + SPH_C64(0xF607F6F60915E309), SPH_C64(0x46CA4646434C0A43), + SPH_C64(0xACE9ACAC26A50926), SPH_C64(0x8986898997B53C97), + SPH_C64(0x143C141444B4A044), SPH_C64(0xE13EE1E142BA5B42), + SPH_C64(0x163A16164EA6B04E), SPH_C64(0x3A4E3A3AD2F7CDD2), + SPH_C64(0x69BB6969D0066FD0), SPH_C64(0x091B09092D41482D), + SPH_C64(0x70907070ADD7A7AD), SPH_C64(0xB6C7B6B6546FD954), + SPH_C64(0xD06DD0D0B71ECEB7), SPH_C64(0xED2AEDED7ED63B7E), + SPH_C64(0xCC49CCCCDBE22EDB), SPH_C64(0x42C6424257682A57), + SPH_C64(0x98B59898C22CB4C2), SPH_C64(0xA4F1A4A40EED490E), + SPH_C64(0x2878282888755D88), SPH_C64(0x5CE45C5C3186DA31), + SPH_C64(0xF815F8F83F6B933F), SPH_C64(0x86978686A4C244A4) +}; + +static const uint64_t old1_T5[256] = { + SPH_C64(0x28181878D8C07818), SPH_C64(0x652323AF2605AF23), + SPH_C64(0x57C6C6F9B87EF9C6), SPH_C64(0x25E8E86FFB136FE8), + SPH_C64(0x948787A1CB4CA187), SPH_C64(0xD5B8B86211A962B8), + SPH_C64(0x0301010509080501), SPH_C64(0xD14F4F6E0D426E4F), + SPH_C64(0x5A3636EE9BADEE36), SPH_C64(0xF7A6A604FF5904A6), + SPH_C64(0x6BD2D2BD0CDEBDD2), SPH_C64(0x02F5F5060EFB06F5), + SPH_C64(0x8B79798096EF8079), SPH_C64(0xB16F6FCE305FCE6F), + SPH_C64(0xAE9191EF6DFCEF91), SPH_C64(0xF6525207F8AA0752), + SPH_C64(0xA06060FD4727FD60), SPH_C64(0xD9BCBC76358976BC), + SPH_C64(0xB09B9BCD37ACCD9B), SPH_C64(0x8F8E8E8C8A048C8E), + SPH_C64(0xF8A3A315D27115A3), SPH_C64(0x140C0C3C6C603C0C), + SPH_C64(0x8D7B7B8A84FF8A7B), SPH_C64(0x5F3535E180B5E135), + SPH_C64(0x271D1D69F5E8691D), SPH_C64(0x3DE0E047B35347E0), + SPH_C64(0x64D7D7AC21F6ACD7), SPH_C64(0x5BC2C2ED9C5EEDC2), + SPH_C64(0x722E2E96436D962E), SPH_C64(0xDD4B4B7A29627A4B), + SPH_C64(0x1FFEFE215DA321FE), SPH_C64(0xF9575716D5821657), + SPH_C64(0x3F151541BDA84115), SPH_C64(0x997777B6E89FB677), + SPH_C64(0x593737EB92A5EB37), SPH_C64(0x32E5E5569E7B56E5), + SPH_C64(0xBC9F9FD9138CD99F), SPH_C64(0x0DF0F01723D317F0), + SPH_C64(0xDE4A4A7F206A7F4A), SPH_C64(0x73DADA95449E95DA), + SPH_C64(0xE8585825A2FA2558), SPH_C64(0x46C9C9CACF06CAC9), + SPH_C64(0x7B29298D7C558D29), SPH_C64(0x1E0A0A225A50220A), + SPH_C64(0xCEB1B14F50E14FB1), SPH_C64(0xFDA0A01AC9691AA0), + SPH_C64(0xBD6B6BDA147FDA6B), SPH_C64(0x928585ABD95CAB85), + SPH_C64(0xDABDBD733C8173BD), SPH_C64(0xE75D5D348FD2345D), + SPH_C64(0x3010105090805010), SPH_C64(0x01F4F40307F303F4), + SPH_C64(0x40CBCBC0DD16C0CB), SPH_C64(0x423E3EC6D3EDC63E), + SPH_C64(0x0F0505112D281105), SPH_C64(0xA96767E6781FE667), + SPH_C64(0x31E4E453977353E4), SPH_C64(0x692727BB0225BB27), + SPH_C64(0xC341415873325841), SPH_C64(0x808B8B9DA72C9D8B), + SPH_C64(0xF4A7A701F65101A7), SPH_C64(0x877D7D94B2CF947D), + SPH_C64(0xA29595FB49DCFB95), SPH_C64(0x75D8D89F568E9FD8), + SPH_C64(0x10FBFB30708B30FB), SPH_C64(0x2FEEEE71CD2371EE), + SPH_C64(0x847C7C91BBC7917C), SPH_C64(0xAA6666E37117E366), + SPH_C64(0x7ADDDD8E7BA68EDD), SPH_C64(0x3917174BAFB84B17), + SPH_C64(0xC947474645024647), SPH_C64(0xBF9E9EDC1A84DC9E), 
+ SPH_C64(0x43CACAC5D41EC5CA), SPH_C64(0x772D2D995875992D), + SPH_C64(0xDCBFBF792E9179BF), SPH_C64(0x0907071B3F381B07), + SPH_C64(0xEAADAD23AC0123AD), SPH_C64(0xEE5A5A2FB0EA2F5A), + SPH_C64(0x988383B5EF6CB583), SPH_C64(0x553333FFB685FF33), + SPH_C64(0xA56363F25C3FF263), SPH_C64(0x0602020A12100A02), + SPH_C64(0xE3AAAA38933938AA), SPH_C64(0x937171A8DEAFA871), + SPH_C64(0x45C8C8CFC60ECFC8), SPH_C64(0x2B19197DD1C87D19), + SPH_C64(0xDB4949703B727049), SPH_C64(0x76D9D99A5F869AD9), + SPH_C64(0x0BF2F21D31C31DF2), SPH_C64(0x38E3E348A84B48E3), + SPH_C64(0xED5B5B2AB9E22A5B), SPH_C64(0x85888892BC349288), + SPH_C64(0xB39A9AC83EA4C89A), SPH_C64(0x6A2626BE0B2DBE26), + SPH_C64(0x563232FABF8DFA32), SPH_C64(0xCDB0B04A59E94AB0), + SPH_C64(0x26E9E96AF21B6AE9), SPH_C64(0x110F0F337778330F), + SPH_C64(0x62D5D5A633E6A6D5), SPH_C64(0x9D8080BAF474BA80), + SPH_C64(0xDFBEBE7C27997CBE), SPH_C64(0x4ACDCDDEEB26DECD), + SPH_C64(0x5C3434E489BDE434), SPH_C64(0xD8484875327A7548), + SPH_C64(0x1CFFFF2454AB24FF), SPH_C64(0x8E7A7A8F8DF78F7A), + SPH_C64(0xAD9090EA64F4EA90), SPH_C64(0xE15F5F3E9DC23E5F), + SPH_C64(0x602020A03D1DA020), SPH_C64(0xB86868D50F67D568), + SPH_C64(0x2E1A1A72CAD0721A), SPH_C64(0xEFAEAE2CB7192CAE), + SPH_C64(0xC1B4B45E7DC95EB4), SPH_C64(0xFC545419CE9A1954), + SPH_C64(0xA89393E57FECE593), SPH_C64(0x662222AA2F0DAA22), + SPH_C64(0xAC6464E96307E964), SPH_C64(0x0EF1F1122ADB12F1), + SPH_C64(0x957373A2CCBFA273), SPH_C64(0x3612125A82905A12), + SPH_C64(0xC040405D7A3A5D40), SPH_C64(0x1808082848402808), + SPH_C64(0x58C3C3E89556E8C3), SPH_C64(0x29ECEC7BDF337BEC), + SPH_C64(0x70DBDB904D9690DB), SPH_C64(0xFEA1A11FC0611FA1), + SPH_C64(0x8A8D8D83911C838D), SPH_C64(0x473D3DC9C8F5C93D), + SPH_C64(0xA49797F15BCCF197), SPH_C64(0x0000000000000000), + SPH_C64(0x4CCFCFD4F936D4CF), SPH_C64(0x7D2B2B876E45872B), + SPH_C64(0x9A7676B3E197B376), SPH_C64(0x9B8282B0E664B082), + SPH_C64(0x67D6D6A928FEA9D6), SPH_C64(0x2D1B1B77C3D8771B), + SPH_C64(0xC2B5B55B74C15BB5), SPH_C64(0xECAFAF29BE1129AF), + SPH_C64(0xBE6A6ADF1D77DF6A), SPH_C64(0xF050500DEABA0D50), + SPH_C64(0xCF45454C57124C45), SPH_C64(0x08F3F31838CB18F3), + SPH_C64(0x503030F0AD9DF030), SPH_C64(0x2CEFEF74C42B74EF), + SPH_C64(0x413F3FC3DAE5C33F), SPH_C64(0xFF55551CC7921C55), + SPH_C64(0xFBA2A210DB7910A2), SPH_C64(0x23EAEA65E90365EA), + SPH_C64(0xAF6565EC6A0FEC65), SPH_C64(0xD3BABA6803B968BA), + SPH_C64(0x712F2F934A65932F), SPH_C64(0x5DC0C0E78E4EE7C0), + SPH_C64(0x7FDEDE8160BE81DE), SPH_C64(0x241C1C6CFCE06C1C), + SPH_C64(0x1AFDFD2E46BB2EFD), SPH_C64(0xD74D4D641F52644D), + SPH_C64(0xAB9292E076E4E092), SPH_C64(0x9F7575BCFA8FBC75), + SPH_C64(0x0A06061E36301E06), SPH_C64(0x838A8A98AE24988A), + SPH_C64(0xCBB2B2404BF940B2), SPH_C64(0x37E6E659856359E6), + SPH_C64(0x120E0E367E70360E), SPH_C64(0x211F1F63E7F8631F), + SPH_C64(0xA66262F75537F762), SPH_C64(0x61D4D4A33AEEA3D4), + SPH_C64(0xE5A8A832812932A8), SPH_C64(0xA79696F452C4F496), + SPH_C64(0x16F9F93A629B3AF9), SPH_C64(0x52C5C5F6A366F6C5), + SPH_C64(0x6F2525B11035B125), SPH_C64(0xEB595920ABF22059), + SPH_C64(0x918484AED054AE84), SPH_C64(0x967272A7C5B7A772), + SPH_C64(0x4B3939DDECD5DD39), SPH_C64(0xD44C4C61165A614C), + SPH_C64(0xE25E5E3B94CA3B5E), SPH_C64(0x887878859FE78578), + SPH_C64(0x483838D8E5DDD838), SPH_C64(0x898C8C869814868C), + SPH_C64(0x6ED1D1B217C6B2D1), SPH_C64(0xF2A5A50BE4410BA5), + SPH_C64(0x3BE2E24DA1434DE2), SPH_C64(0xA36161F84E2FF861), + SPH_C64(0xC8B3B34542F145B3), SPH_C64(0x632121A53415A521), + SPH_C64(0xB99C9CD60894D69C), SPH_C64(0x221E1E66EEF0661E), + SPH_C64(0xC543435261225243), SPH_C64(0x54C7C7FCB176FCC7), + 
SPH_C64(0x19FCFC2B4FB32BFC), SPH_C64(0x0C04041424201404), + SPH_C64(0xF3515108E3B20851), SPH_C64(0xB69999C725BCC799), + SPH_C64(0xB76D6DC4224FC46D), SPH_C64(0x170D0D396568390D), + SPH_C64(0x13FAFA35798335FA), SPH_C64(0x7CDFDF8469B684DF), + SPH_C64(0x827E7E9BA9D79B7E), SPH_C64(0x6C2424B4193DB424), + SPH_C64(0x4D3B3BD7FEC5D73B), SPH_C64(0xE0ABAB3D9A313DAB), + SPH_C64(0x4FCECED1F03ED1CE), SPH_C64(0x3311115599885511), + SPH_C64(0x8C8F8F89830C898F), SPH_C64(0xD24E4E6B044A6B4E), + SPH_C64(0xC4B7B75166D151B7), SPH_C64(0x20EBEB60E00B60EB), + SPH_C64(0x443C3CCCC1FDCC3C), SPH_C64(0x9E8181BFFD7CBF81), + SPH_C64(0xA19494FE40D4FE94), SPH_C64(0x04F7F70C1CEB0CF7), + SPH_C64(0xD6B9B96718A167B9), SPH_C64(0x3513135F8B985F13), + SPH_C64(0x742C2C9C517D9C2C), SPH_C64(0x68D3D3B805D6B8D3), + SPH_C64(0x34E7E75C8C6B5CE7), SPH_C64(0xB26E6ECB3957CB6E), + SPH_C64(0x51C4C4F3AA6EF3C4), SPH_C64(0x0503030F1B180F03), + SPH_C64(0xFA565613DC8A1356), SPH_C64(0xCC4444495E1A4944), + SPH_C64(0x817F7F9EA0DF9E7F), SPH_C64(0xE6A9A937882137A9), + SPH_C64(0x7E2A2A82674D822A), SPH_C64(0xD0BBBB6D0AB16DBB), + SPH_C64(0x5EC1C1E28746E2C1), SPH_C64(0xF5535302F1A20253), + SPH_C64(0x79DCDC8B72AE8BDC), SPH_C64(0x1D0B0B275358270B), + SPH_C64(0xBA9D9DD3019CD39D), SPH_C64(0xB46C6CC12B47C16C), + SPH_C64(0x533131F5A495F531), SPH_C64(0x9C7474B9F387B974), + SPH_C64(0x07F6F60915E309F6), SPH_C64(0xCA4646434C0A4346), + SPH_C64(0xE9ACAC26A50926AC), SPH_C64(0x86898997B53C9789), + SPH_C64(0x3C141444B4A04414), SPH_C64(0x3EE1E142BA5B42E1), + SPH_C64(0x3A16164EA6B04E16), SPH_C64(0x4E3A3AD2F7CDD23A), + SPH_C64(0xBB6969D0066FD069), SPH_C64(0x1B09092D41482D09), + SPH_C64(0x907070ADD7A7AD70), SPH_C64(0xC7B6B6546FD954B6), + SPH_C64(0x6DD0D0B71ECEB7D0), SPH_C64(0x2AEDED7ED63B7EED), + SPH_C64(0x49CCCCDBE22EDBCC), SPH_C64(0xC6424257682A5742), + SPH_C64(0xB59898C22CB4C298), SPH_C64(0xF1A4A40EED490EA4), + SPH_C64(0x78282888755D8828), SPH_C64(0xE45C5C3186DA315C), + SPH_C64(0x15F8F83F6B933FF8), SPH_C64(0x978686A4C244A486) +}; + +static const uint64_t old1_T6[256] = { + SPH_C64(0x181878D8C0781828), SPH_C64(0x2323AF2605AF2365), + SPH_C64(0xC6C6F9B87EF9C657), SPH_C64(0xE8E86FFB136FE825), + SPH_C64(0x8787A1CB4CA18794), SPH_C64(0xB8B86211A962B8D5), + SPH_C64(0x0101050908050103), SPH_C64(0x4F4F6E0D426E4FD1), + SPH_C64(0x3636EE9BADEE365A), SPH_C64(0xA6A604FF5904A6F7), + SPH_C64(0xD2D2BD0CDEBDD26B), SPH_C64(0xF5F5060EFB06F502), + SPH_C64(0x79798096EF80798B), SPH_C64(0x6F6FCE305FCE6FB1), + SPH_C64(0x9191EF6DFCEF91AE), SPH_C64(0x525207F8AA0752F6), + SPH_C64(0x6060FD4727FD60A0), SPH_C64(0xBCBC76358976BCD9), + SPH_C64(0x9B9BCD37ACCD9BB0), SPH_C64(0x8E8E8C8A048C8E8F), + SPH_C64(0xA3A315D27115A3F8), SPH_C64(0x0C0C3C6C603C0C14), + SPH_C64(0x7B7B8A84FF8A7B8D), SPH_C64(0x3535E180B5E1355F), + SPH_C64(0x1D1D69F5E8691D27), SPH_C64(0xE0E047B35347E03D), + SPH_C64(0xD7D7AC21F6ACD764), SPH_C64(0xC2C2ED9C5EEDC25B), + SPH_C64(0x2E2E96436D962E72), SPH_C64(0x4B4B7A29627A4BDD), + SPH_C64(0xFEFE215DA321FE1F), SPH_C64(0x575716D5821657F9), + SPH_C64(0x151541BDA841153F), SPH_C64(0x7777B6E89FB67799), + SPH_C64(0x3737EB92A5EB3759), SPH_C64(0xE5E5569E7B56E532), + SPH_C64(0x9F9FD9138CD99FBC), SPH_C64(0xF0F01723D317F00D), + SPH_C64(0x4A4A7F206A7F4ADE), SPH_C64(0xDADA95449E95DA73), + SPH_C64(0x585825A2FA2558E8), SPH_C64(0xC9C9CACF06CAC946), + SPH_C64(0x29298D7C558D297B), SPH_C64(0x0A0A225A50220A1E), + SPH_C64(0xB1B14F50E14FB1CE), SPH_C64(0xA0A01AC9691AA0FD), + SPH_C64(0x6B6BDA147FDA6BBD), SPH_C64(0x8585ABD95CAB8592), + SPH_C64(0xBDBD733C8173BDDA), SPH_C64(0x5D5D348FD2345DE7), + SPH_C64(0x1010509080501030), 
SPH_C64(0xF4F40307F303F401), + SPH_C64(0xCBCBC0DD16C0CB40), SPH_C64(0x3E3EC6D3EDC63E42), + SPH_C64(0x0505112D2811050F), SPH_C64(0x6767E6781FE667A9), + SPH_C64(0xE4E453977353E431), SPH_C64(0x2727BB0225BB2769), + SPH_C64(0x41415873325841C3), SPH_C64(0x8B8B9DA72C9D8B80), + SPH_C64(0xA7A701F65101A7F4), SPH_C64(0x7D7D94B2CF947D87), + SPH_C64(0x9595FB49DCFB95A2), SPH_C64(0xD8D89F568E9FD875), + SPH_C64(0xFBFB30708B30FB10), SPH_C64(0xEEEE71CD2371EE2F), + SPH_C64(0x7C7C91BBC7917C84), SPH_C64(0x6666E37117E366AA), + SPH_C64(0xDDDD8E7BA68EDD7A), SPH_C64(0x17174BAFB84B1739), + SPH_C64(0x47474645024647C9), SPH_C64(0x9E9EDC1A84DC9EBF), + SPH_C64(0xCACAC5D41EC5CA43), SPH_C64(0x2D2D995875992D77), + SPH_C64(0xBFBF792E9179BFDC), SPH_C64(0x07071B3F381B0709), + SPH_C64(0xADAD23AC0123ADEA), SPH_C64(0x5A5A2FB0EA2F5AEE), + SPH_C64(0x8383B5EF6CB58398), SPH_C64(0x3333FFB685FF3355), + SPH_C64(0x6363F25C3FF263A5), SPH_C64(0x02020A12100A0206), + SPH_C64(0xAAAA38933938AAE3), SPH_C64(0x7171A8DEAFA87193), + SPH_C64(0xC8C8CFC60ECFC845), SPH_C64(0x19197DD1C87D192B), + SPH_C64(0x4949703B727049DB), SPH_C64(0xD9D99A5F869AD976), + SPH_C64(0xF2F21D31C31DF20B), SPH_C64(0xE3E348A84B48E338), + SPH_C64(0x5B5B2AB9E22A5BED), SPH_C64(0x888892BC34928885), + SPH_C64(0x9A9AC83EA4C89AB3), SPH_C64(0x2626BE0B2DBE266A), + SPH_C64(0x3232FABF8DFA3256), SPH_C64(0xB0B04A59E94AB0CD), + SPH_C64(0xE9E96AF21B6AE926), SPH_C64(0x0F0F337778330F11), + SPH_C64(0xD5D5A633E6A6D562), SPH_C64(0x8080BAF474BA809D), + SPH_C64(0xBEBE7C27997CBEDF), SPH_C64(0xCDCDDEEB26DECD4A), + SPH_C64(0x3434E489BDE4345C), SPH_C64(0x484875327A7548D8), + SPH_C64(0xFFFF2454AB24FF1C), SPH_C64(0x7A7A8F8DF78F7A8E), + SPH_C64(0x9090EA64F4EA90AD), SPH_C64(0x5F5F3E9DC23E5FE1), + SPH_C64(0x2020A03D1DA02060), SPH_C64(0x6868D50F67D568B8), + SPH_C64(0x1A1A72CAD0721A2E), SPH_C64(0xAEAE2CB7192CAEEF), + SPH_C64(0xB4B45E7DC95EB4C1), SPH_C64(0x545419CE9A1954FC), + SPH_C64(0x9393E57FECE593A8), SPH_C64(0x2222AA2F0DAA2266), + SPH_C64(0x6464E96307E964AC), SPH_C64(0xF1F1122ADB12F10E), + SPH_C64(0x7373A2CCBFA27395), SPH_C64(0x12125A82905A1236), + SPH_C64(0x40405D7A3A5D40C0), SPH_C64(0x0808284840280818), + SPH_C64(0xC3C3E89556E8C358), SPH_C64(0xECEC7BDF337BEC29), + SPH_C64(0xDBDB904D9690DB70), SPH_C64(0xA1A11FC0611FA1FE), + SPH_C64(0x8D8D83911C838D8A), SPH_C64(0x3D3DC9C8F5C93D47), + SPH_C64(0x9797F15BCCF197A4), SPH_C64(0x0000000000000000), + SPH_C64(0xCFCFD4F936D4CF4C), SPH_C64(0x2B2B876E45872B7D), + SPH_C64(0x7676B3E197B3769A), SPH_C64(0x8282B0E664B0829B), + SPH_C64(0xD6D6A928FEA9D667), SPH_C64(0x1B1B77C3D8771B2D), + SPH_C64(0xB5B55B74C15BB5C2), SPH_C64(0xAFAF29BE1129AFEC), + SPH_C64(0x6A6ADF1D77DF6ABE), SPH_C64(0x50500DEABA0D50F0), + SPH_C64(0x45454C57124C45CF), SPH_C64(0xF3F31838CB18F308), + SPH_C64(0x3030F0AD9DF03050), SPH_C64(0xEFEF74C42B74EF2C), + SPH_C64(0x3F3FC3DAE5C33F41), SPH_C64(0x55551CC7921C55FF), + SPH_C64(0xA2A210DB7910A2FB), SPH_C64(0xEAEA65E90365EA23), + SPH_C64(0x6565EC6A0FEC65AF), SPH_C64(0xBABA6803B968BAD3), + SPH_C64(0x2F2F934A65932F71), SPH_C64(0xC0C0E78E4EE7C05D), + SPH_C64(0xDEDE8160BE81DE7F), SPH_C64(0x1C1C6CFCE06C1C24), + SPH_C64(0xFDFD2E46BB2EFD1A), SPH_C64(0x4D4D641F52644DD7), + SPH_C64(0x9292E076E4E092AB), SPH_C64(0x7575BCFA8FBC759F), + SPH_C64(0x06061E36301E060A), SPH_C64(0x8A8A98AE24988A83), + SPH_C64(0xB2B2404BF940B2CB), SPH_C64(0xE6E659856359E637), + SPH_C64(0x0E0E367E70360E12), SPH_C64(0x1F1F63E7F8631F21), + SPH_C64(0x6262F75537F762A6), SPH_C64(0xD4D4A33AEEA3D461), + SPH_C64(0xA8A832812932A8E5), SPH_C64(0x9696F452C4F496A7), + SPH_C64(0xF9F93A629B3AF916), 
SPH_C64(0xC5C5F6A366F6C552), + SPH_C64(0x2525B11035B1256F), SPH_C64(0x595920ABF22059EB), + SPH_C64(0x8484AED054AE8491), SPH_C64(0x7272A7C5B7A77296), + SPH_C64(0x3939DDECD5DD394B), SPH_C64(0x4C4C61165A614CD4), + SPH_C64(0x5E5E3B94CA3B5EE2), SPH_C64(0x7878859FE7857888), + SPH_C64(0x3838D8E5DDD83848), SPH_C64(0x8C8C869814868C89), + SPH_C64(0xD1D1B217C6B2D16E), SPH_C64(0xA5A50BE4410BA5F2), + SPH_C64(0xE2E24DA1434DE23B), SPH_C64(0x6161F84E2FF861A3), + SPH_C64(0xB3B34542F145B3C8), SPH_C64(0x2121A53415A52163), + SPH_C64(0x9C9CD60894D69CB9), SPH_C64(0x1E1E66EEF0661E22), + SPH_C64(0x43435261225243C5), SPH_C64(0xC7C7FCB176FCC754), + SPH_C64(0xFCFC2B4FB32BFC19), SPH_C64(0x040414242014040C), + SPH_C64(0x515108E3B20851F3), SPH_C64(0x9999C725BCC799B6), + SPH_C64(0x6D6DC4224FC46DB7), SPH_C64(0x0D0D396568390D17), + SPH_C64(0xFAFA35798335FA13), SPH_C64(0xDFDF8469B684DF7C), + SPH_C64(0x7E7E9BA9D79B7E82), SPH_C64(0x2424B4193DB4246C), + SPH_C64(0x3B3BD7FEC5D73B4D), SPH_C64(0xABAB3D9A313DABE0), + SPH_C64(0xCECED1F03ED1CE4F), SPH_C64(0x1111559988551133), + SPH_C64(0x8F8F89830C898F8C), SPH_C64(0x4E4E6B044A6B4ED2), + SPH_C64(0xB7B75166D151B7C4), SPH_C64(0xEBEB60E00B60EB20), + SPH_C64(0x3C3CCCC1FDCC3C44), SPH_C64(0x8181BFFD7CBF819E), + SPH_C64(0x9494FE40D4FE94A1), SPH_C64(0xF7F70C1CEB0CF704), + SPH_C64(0xB9B96718A167B9D6), SPH_C64(0x13135F8B985F1335), + SPH_C64(0x2C2C9C517D9C2C74), SPH_C64(0xD3D3B805D6B8D368), + SPH_C64(0xE7E75C8C6B5CE734), SPH_C64(0x6E6ECB3957CB6EB2), + SPH_C64(0xC4C4F3AA6EF3C451), SPH_C64(0x03030F1B180F0305), + SPH_C64(0x565613DC8A1356FA), SPH_C64(0x4444495E1A4944CC), + SPH_C64(0x7F7F9EA0DF9E7F81), SPH_C64(0xA9A937882137A9E6), + SPH_C64(0x2A2A82674D822A7E), SPH_C64(0xBBBB6D0AB16DBBD0), + SPH_C64(0xC1C1E28746E2C15E), SPH_C64(0x535302F1A20253F5), + SPH_C64(0xDCDC8B72AE8BDC79), SPH_C64(0x0B0B275358270B1D), + SPH_C64(0x9D9DD3019CD39DBA), SPH_C64(0x6C6CC12B47C16CB4), + SPH_C64(0x3131F5A495F53153), SPH_C64(0x7474B9F387B9749C), + SPH_C64(0xF6F60915E309F607), SPH_C64(0x4646434C0A4346CA), + SPH_C64(0xACAC26A50926ACE9), SPH_C64(0x898997B53C978986), + SPH_C64(0x141444B4A044143C), SPH_C64(0xE1E142BA5B42E13E), + SPH_C64(0x16164EA6B04E163A), SPH_C64(0x3A3AD2F7CDD23A4E), + SPH_C64(0x6969D0066FD069BB), SPH_C64(0x09092D41482D091B), + SPH_C64(0x7070ADD7A7AD7090), SPH_C64(0xB6B6546FD954B6C7), + SPH_C64(0xD0D0B71ECEB7D06D), SPH_C64(0xEDED7ED63B7EED2A), + SPH_C64(0xCCCCDBE22EDBCC49), SPH_C64(0x424257682A5742C6), + SPH_C64(0x9898C22CB4C298B5), SPH_C64(0xA4A40EED490EA4F1), + SPH_C64(0x282888755D882878), SPH_C64(0x5C5C3186DA315CE4), + SPH_C64(0xF8F83F6B933FF815), SPH_C64(0x8686A4C244A48697) +}; + +static const uint64_t old1_T7[256] = { + SPH_C64(0x1878D8C078182818), SPH_C64(0x23AF2605AF236523), + SPH_C64(0xC6F9B87EF9C657C6), SPH_C64(0xE86FFB136FE825E8), + SPH_C64(0x87A1CB4CA1879487), SPH_C64(0xB86211A962B8D5B8), + SPH_C64(0x0105090805010301), SPH_C64(0x4F6E0D426E4FD14F), + SPH_C64(0x36EE9BADEE365A36), SPH_C64(0xA604FF5904A6F7A6), + SPH_C64(0xD2BD0CDEBDD26BD2), SPH_C64(0xF5060EFB06F502F5), + SPH_C64(0x798096EF80798B79), SPH_C64(0x6FCE305FCE6FB16F), + SPH_C64(0x91EF6DFCEF91AE91), SPH_C64(0x5207F8AA0752F652), + SPH_C64(0x60FD4727FD60A060), SPH_C64(0xBC76358976BCD9BC), + SPH_C64(0x9BCD37ACCD9BB09B), SPH_C64(0x8E8C8A048C8E8F8E), + SPH_C64(0xA315D27115A3F8A3), SPH_C64(0x0C3C6C603C0C140C), + SPH_C64(0x7B8A84FF8A7B8D7B), SPH_C64(0x35E180B5E1355F35), + SPH_C64(0x1D69F5E8691D271D), SPH_C64(0xE047B35347E03DE0), + SPH_C64(0xD7AC21F6ACD764D7), SPH_C64(0xC2ED9C5EEDC25BC2), + SPH_C64(0x2E96436D962E722E), SPH_C64(0x4B7A29627A4BDD4B), 
+ SPH_C64(0xFE215DA321FE1FFE), SPH_C64(0x5716D5821657F957), + SPH_C64(0x1541BDA841153F15), SPH_C64(0x77B6E89FB6779977), + SPH_C64(0x37EB92A5EB375937), SPH_C64(0xE5569E7B56E532E5), + SPH_C64(0x9FD9138CD99FBC9F), SPH_C64(0xF01723D317F00DF0), + SPH_C64(0x4A7F206A7F4ADE4A), SPH_C64(0xDA95449E95DA73DA), + SPH_C64(0x5825A2FA2558E858), SPH_C64(0xC9CACF06CAC946C9), + SPH_C64(0x298D7C558D297B29), SPH_C64(0x0A225A50220A1E0A), + SPH_C64(0xB14F50E14FB1CEB1), SPH_C64(0xA01AC9691AA0FDA0), + SPH_C64(0x6BDA147FDA6BBD6B), SPH_C64(0x85ABD95CAB859285), + SPH_C64(0xBD733C8173BDDABD), SPH_C64(0x5D348FD2345DE75D), + SPH_C64(0x1050908050103010), SPH_C64(0xF40307F303F401F4), + SPH_C64(0xCBC0DD16C0CB40CB), SPH_C64(0x3EC6D3EDC63E423E), + SPH_C64(0x05112D2811050F05), SPH_C64(0x67E6781FE667A967), + SPH_C64(0xE453977353E431E4), SPH_C64(0x27BB0225BB276927), + SPH_C64(0x415873325841C341), SPH_C64(0x8B9DA72C9D8B808B), + SPH_C64(0xA701F65101A7F4A7), SPH_C64(0x7D94B2CF947D877D), + SPH_C64(0x95FB49DCFB95A295), SPH_C64(0xD89F568E9FD875D8), + SPH_C64(0xFB30708B30FB10FB), SPH_C64(0xEE71CD2371EE2FEE), + SPH_C64(0x7C91BBC7917C847C), SPH_C64(0x66E37117E366AA66), + SPH_C64(0xDD8E7BA68EDD7ADD), SPH_C64(0x174BAFB84B173917), + SPH_C64(0x474645024647C947), SPH_C64(0x9EDC1A84DC9EBF9E), + SPH_C64(0xCAC5D41EC5CA43CA), SPH_C64(0x2D995875992D772D), + SPH_C64(0xBF792E9179BFDCBF), SPH_C64(0x071B3F381B070907), + SPH_C64(0xAD23AC0123ADEAAD), SPH_C64(0x5A2FB0EA2F5AEE5A), + SPH_C64(0x83B5EF6CB5839883), SPH_C64(0x33FFB685FF335533), + SPH_C64(0x63F25C3FF263A563), SPH_C64(0x020A12100A020602), + SPH_C64(0xAA38933938AAE3AA), SPH_C64(0x71A8DEAFA8719371), + SPH_C64(0xC8CFC60ECFC845C8), SPH_C64(0x197DD1C87D192B19), + SPH_C64(0x49703B727049DB49), SPH_C64(0xD99A5F869AD976D9), + SPH_C64(0xF21D31C31DF20BF2), SPH_C64(0xE348A84B48E338E3), + SPH_C64(0x5B2AB9E22A5BED5B), SPH_C64(0x8892BC3492888588), + SPH_C64(0x9AC83EA4C89AB39A), SPH_C64(0x26BE0B2DBE266A26), + SPH_C64(0x32FABF8DFA325632), SPH_C64(0xB04A59E94AB0CDB0), + SPH_C64(0xE96AF21B6AE926E9), SPH_C64(0x0F337778330F110F), + SPH_C64(0xD5A633E6A6D562D5), SPH_C64(0x80BAF474BA809D80), + SPH_C64(0xBE7C27997CBEDFBE), SPH_C64(0xCDDEEB26DECD4ACD), + SPH_C64(0x34E489BDE4345C34), SPH_C64(0x4875327A7548D848), + SPH_C64(0xFF2454AB24FF1CFF), SPH_C64(0x7A8F8DF78F7A8E7A), + SPH_C64(0x90EA64F4EA90AD90), SPH_C64(0x5F3E9DC23E5FE15F), + SPH_C64(0x20A03D1DA0206020), SPH_C64(0x68D50F67D568B868), + SPH_C64(0x1A72CAD0721A2E1A), SPH_C64(0xAE2CB7192CAEEFAE), + SPH_C64(0xB45E7DC95EB4C1B4), SPH_C64(0x5419CE9A1954FC54), + SPH_C64(0x93E57FECE593A893), SPH_C64(0x22AA2F0DAA226622), + SPH_C64(0x64E96307E964AC64), SPH_C64(0xF1122ADB12F10EF1), + SPH_C64(0x73A2CCBFA2739573), SPH_C64(0x125A82905A123612), + SPH_C64(0x405D7A3A5D40C040), SPH_C64(0x0828484028081808), + SPH_C64(0xC3E89556E8C358C3), SPH_C64(0xEC7BDF337BEC29EC), + SPH_C64(0xDB904D9690DB70DB), SPH_C64(0xA11FC0611FA1FEA1), + SPH_C64(0x8D83911C838D8A8D), SPH_C64(0x3DC9C8F5C93D473D), + SPH_C64(0x97F15BCCF197A497), SPH_C64(0x0000000000000000), + SPH_C64(0xCFD4F936D4CF4CCF), SPH_C64(0x2B876E45872B7D2B), + SPH_C64(0x76B3E197B3769A76), SPH_C64(0x82B0E664B0829B82), + SPH_C64(0xD6A928FEA9D667D6), SPH_C64(0x1B77C3D8771B2D1B), + SPH_C64(0xB55B74C15BB5C2B5), SPH_C64(0xAF29BE1129AFECAF), + SPH_C64(0x6ADF1D77DF6ABE6A), SPH_C64(0x500DEABA0D50F050), + SPH_C64(0x454C57124C45CF45), SPH_C64(0xF31838CB18F308F3), + SPH_C64(0x30F0AD9DF0305030), SPH_C64(0xEF74C42B74EF2CEF), + SPH_C64(0x3FC3DAE5C33F413F), SPH_C64(0x551CC7921C55FF55), + SPH_C64(0xA210DB7910A2FBA2), SPH_C64(0xEA65E90365EA23EA), + 
SPH_C64(0x65EC6A0FEC65AF65), SPH_C64(0xBA6803B968BAD3BA), + SPH_C64(0x2F934A65932F712F), SPH_C64(0xC0E78E4EE7C05DC0), + SPH_C64(0xDE8160BE81DE7FDE), SPH_C64(0x1C6CFCE06C1C241C), + SPH_C64(0xFD2E46BB2EFD1AFD), SPH_C64(0x4D641F52644DD74D), + SPH_C64(0x92E076E4E092AB92), SPH_C64(0x75BCFA8FBC759F75), + SPH_C64(0x061E36301E060A06), SPH_C64(0x8A98AE24988A838A), + SPH_C64(0xB2404BF940B2CBB2), SPH_C64(0xE659856359E637E6), + SPH_C64(0x0E367E70360E120E), SPH_C64(0x1F63E7F8631F211F), + SPH_C64(0x62F75537F762A662), SPH_C64(0xD4A33AEEA3D461D4), + SPH_C64(0xA832812932A8E5A8), SPH_C64(0x96F452C4F496A796), + SPH_C64(0xF93A629B3AF916F9), SPH_C64(0xC5F6A366F6C552C5), + SPH_C64(0x25B11035B1256F25), SPH_C64(0x5920ABF22059EB59), + SPH_C64(0x84AED054AE849184), SPH_C64(0x72A7C5B7A7729672), + SPH_C64(0x39DDECD5DD394B39), SPH_C64(0x4C61165A614CD44C), + SPH_C64(0x5E3B94CA3B5EE25E), SPH_C64(0x78859FE785788878), + SPH_C64(0x38D8E5DDD8384838), SPH_C64(0x8C869814868C898C), + SPH_C64(0xD1B217C6B2D16ED1), SPH_C64(0xA50BE4410BA5F2A5), + SPH_C64(0xE24DA1434DE23BE2), SPH_C64(0x61F84E2FF861A361), + SPH_C64(0xB34542F145B3C8B3), SPH_C64(0x21A53415A5216321), + SPH_C64(0x9CD60894D69CB99C), SPH_C64(0x1E66EEF0661E221E), + SPH_C64(0x435261225243C543), SPH_C64(0xC7FCB176FCC754C7), + SPH_C64(0xFC2B4FB32BFC19FC), SPH_C64(0x0414242014040C04), + SPH_C64(0x5108E3B20851F351), SPH_C64(0x99C725BCC799B699), + SPH_C64(0x6DC4224FC46DB76D), SPH_C64(0x0D396568390D170D), + SPH_C64(0xFA35798335FA13FA), SPH_C64(0xDF8469B684DF7CDF), + SPH_C64(0x7E9BA9D79B7E827E), SPH_C64(0x24B4193DB4246C24), + SPH_C64(0x3BD7FEC5D73B4D3B), SPH_C64(0xAB3D9A313DABE0AB), + SPH_C64(0xCED1F03ED1CE4FCE), SPH_C64(0x1155998855113311), + SPH_C64(0x8F89830C898F8C8F), SPH_C64(0x4E6B044A6B4ED24E), + SPH_C64(0xB75166D151B7C4B7), SPH_C64(0xEB60E00B60EB20EB), + SPH_C64(0x3CCCC1FDCC3C443C), SPH_C64(0x81BFFD7CBF819E81), + SPH_C64(0x94FE40D4FE94A194), SPH_C64(0xF70C1CEB0CF704F7), + SPH_C64(0xB96718A167B9D6B9), SPH_C64(0x135F8B985F133513), + SPH_C64(0x2C9C517D9C2C742C), SPH_C64(0xD3B805D6B8D368D3), + SPH_C64(0xE75C8C6B5CE734E7), SPH_C64(0x6ECB3957CB6EB26E), + SPH_C64(0xC4F3AA6EF3C451C4), SPH_C64(0x030F1B180F030503), + SPH_C64(0x5613DC8A1356FA56), SPH_C64(0x44495E1A4944CC44), + SPH_C64(0x7F9EA0DF9E7F817F), SPH_C64(0xA937882137A9E6A9), + SPH_C64(0x2A82674D822A7E2A), SPH_C64(0xBB6D0AB16DBBD0BB), + SPH_C64(0xC1E28746E2C15EC1), SPH_C64(0x5302F1A20253F553), + SPH_C64(0xDC8B72AE8BDC79DC), SPH_C64(0x0B275358270B1D0B), + SPH_C64(0x9DD3019CD39DBA9D), SPH_C64(0x6CC12B47C16CB46C), + SPH_C64(0x31F5A495F5315331), SPH_C64(0x74B9F387B9749C74), + SPH_C64(0xF60915E309F607F6), SPH_C64(0x46434C0A4346CA46), + SPH_C64(0xAC26A50926ACE9AC), SPH_C64(0x8997B53C97898689), + SPH_C64(0x1444B4A044143C14), SPH_C64(0xE142BA5B42E13EE1), + SPH_C64(0x164EA6B04E163A16), SPH_C64(0x3AD2F7CDD23A4E3A), + SPH_C64(0x69D0066FD069BB69), SPH_C64(0x092D41482D091B09), + SPH_C64(0x70ADD7A7AD709070), SPH_C64(0xB6546FD954B6C7B6), + SPH_C64(0xD0B71ECEB7D06DD0), SPH_C64(0xED7ED63B7EED2AED), + SPH_C64(0xCCDBE22EDBCC49CC), SPH_C64(0x4257682A5742C642), + SPH_C64(0x98C22CB4C298B598), SPH_C64(0xA40EED490EA4F1A4), + SPH_C64(0x2888755D88287828), SPH_C64(0x5C3186DA315CE45C), + SPH_C64(0xF83F6B933FF815F8), SPH_C64(0x86A4C244A4869786) +}; + +static const uint64_t plain_T1[256] = { + SPH_C64(0x3078C018601818D8), SPH_C64(0x46AF05238C232326), + SPH_C64(0x91F97EC63FC6C6B8), SPH_C64(0xCD6F13E887E8E8FB), + SPH_C64(0x13A14C87268787CB), SPH_C64(0x6D62A9B8DAB8B811), + SPH_C64(0x0205080104010109), SPH_C64(0x9E6E424F214F4F0D), + SPH_C64(0x6CEEAD36D836369B), 
SPH_C64(0x510459A6A2A6A6FF), + SPH_C64(0xB9BDDED26FD2D20C), SPH_C64(0xF706FBF5F3F5F50E), + SPH_C64(0xF280EF79F9797996), SPH_C64(0xDECE5F6FA16F6F30), + SPH_C64(0x3FEFFC917E91916D), SPH_C64(0xA407AA52555252F8), + SPH_C64(0xC0FD27609D606047), SPH_C64(0x657689BCCABCBC35), + SPH_C64(0x2BCDAC9B569B9B37), SPH_C64(0x018C048E028E8E8A), + SPH_C64(0x5B1571A3B6A3A3D2), SPH_C64(0x183C600C300C0C6C), + SPH_C64(0xF68AFF7BF17B7B84), SPH_C64(0x6AE1B535D4353580), + SPH_C64(0x3A69E81D741D1DF5), SPH_C64(0xDD4753E0A7E0E0B3), + SPH_C64(0xB3ACF6D77BD7D721), SPH_C64(0x99ED5EC22FC2C29C), + SPH_C64(0x5C966D2EB82E2E43), SPH_C64(0x967A624B314B4B29), + SPH_C64(0xE121A3FEDFFEFE5D), SPH_C64(0xAE168257415757D5), + SPH_C64(0x2A41A815541515BD), SPH_C64(0xEEB69F77C17777E8), + SPH_C64(0x6EEBA537DC373792), SPH_C64(0xD7567BE5B3E5E59E), + SPH_C64(0x23D98C9F469F9F13), SPH_C64(0xFD17D3F0E7F0F023), + SPH_C64(0x947F6A4A354A4A20), SPH_C64(0xA9959EDA4FDADA44), + SPH_C64(0xB025FA587D5858A2), SPH_C64(0x8FCA06C903C9C9CF), + SPH_C64(0x528D5529A429297C), SPH_C64(0x1422500A280A0A5A), + SPH_C64(0x7F4FE1B1FEB1B150), SPH_C64(0x5D1A69A0BAA0A0C9), + SPH_C64(0xD6DA7F6BB16B6B14), SPH_C64(0x17AB5C852E8585D9), + SPH_C64(0x677381BDCEBDBD3C), SPH_C64(0xBA34D25D695D5D8F), + SPH_C64(0x2050801040101090), SPH_C64(0xF503F3F4F7F4F407), + SPH_C64(0x8BC016CB0BCBCBDD), SPH_C64(0x7CC6ED3EF83E3ED3), + SPH_C64(0x0A1128051405052D), SPH_C64(0xCEE61F6781676778), + SPH_C64(0xD55373E4B7E4E497), SPH_C64(0x4EBB25279C272702), + SPH_C64(0x8258324119414173), SPH_C64(0x0B9D2C8B168B8BA7), + SPH_C64(0x530151A7A6A7A7F6), SPH_C64(0xFA94CF7DE97D7DB2), + SPH_C64(0x37FBDC956E959549), SPH_C64(0xAD9F8ED847D8D856), + SPH_C64(0xEB308BFBCBFBFB70), SPH_C64(0xC17123EE9FEEEECD), + SPH_C64(0xF891C77CED7C7CBB), SPH_C64(0xCCE3176685666671), + SPH_C64(0xA78EA6DD53DDDD7B), SPH_C64(0x2E4BB8175C1717AF), + SPH_C64(0x8E46024701474745), SPH_C64(0x21DC849E429E9E1A), + SPH_C64(0x89C51ECA0FCACAD4), SPH_C64(0x5A99752DB42D2D58), + SPH_C64(0x637991BFC6BFBF2E), SPH_C64(0x0E1B38071C07073F), + SPH_C64(0x472301AD8EADADAC), SPH_C64(0xB42FEA5A755A5AB0), + SPH_C64(0x1BB56C83368383EF), SPH_C64(0x66FF8533CC3333B6), + SPH_C64(0xC6F23F639163635C), SPH_C64(0x040A100208020212), + SPH_C64(0x493839AA92AAAA93), SPH_C64(0xE2A8AF71D97171DE), + SPH_C64(0x8DCF0EC807C8C8C6), SPH_C64(0x327DC819641919D1), + SPH_C64(0x927072493949493B), SPH_C64(0xAF9A86D943D9D95F), + SPH_C64(0xF91DC3F2EFF2F231), SPH_C64(0xDB484BE3ABE3E3A8), + SPH_C64(0xB62AE25B715B5BB9), SPH_C64(0x0D9234881A8888BC), + SPH_C64(0x29C8A49A529A9A3E), SPH_C64(0x4CBE2D269826260B), + SPH_C64(0x64FA8D32C83232BF), SPH_C64(0x7D4AE9B0FAB0B059), + SPH_C64(0xCF6A1BE983E9E9F2), SPH_C64(0x1E33780F3C0F0F77), + SPH_C64(0xB7A6E6D573D5D533), SPH_C64(0x1DBA74803A8080F4), + SPH_C64(0x617C99BEC2BEBE27), SPH_C64(0x87DE26CD13CDCDEB), + SPH_C64(0x68E4BD34D0343489), SPH_C64(0x90757A483D484832), + SPH_C64(0xE324ABFFDBFFFF54), SPH_C64(0xF48FF77AF57A7A8D), + SPH_C64(0x3DEAF4907A909064), SPH_C64(0xBE3EC25F615F5F9D), + SPH_C64(0x40A01D208020203D), SPH_C64(0xD0D56768BD68680F), + SPH_C64(0x3472D01A681A1ACA), SPH_C64(0x412C19AE82AEAEB7), + SPH_C64(0x755EC9B4EAB4B47D), SPH_C64(0xA8199A544D5454CE), + SPH_C64(0x3BE5EC937693937F), SPH_C64(0x44AA0D228822222F), + SPH_C64(0xC8E907648D646463), SPH_C64(0xFF12DBF1E3F1F12A), + SPH_C64(0xE6A2BF73D17373CC), SPH_C64(0x245A901248121282), + SPH_C64(0x805D3A401D40407A), SPH_C64(0x1028400820080848), + SPH_C64(0x9BE856C32BC3C395), SPH_C64(0xC57B33EC97ECECDF), + SPH_C64(0xAB9096DB4BDBDB4D), SPH_C64(0x5F1F61A1BEA1A1C0), + SPH_C64(0x07831C8D0E8D8D91), 
SPH_C64(0x7AC9F53DF43D3DC8), + SPH_C64(0x33F1CC976697975B), SPH_C64(0x0000000000000000), + SPH_C64(0x83D436CF1BCFCFF9), SPH_C64(0x5687452BAC2B2B6E), + SPH_C64(0xECB39776C57676E1), SPH_C64(0x19B06482328282E6), + SPH_C64(0xB1A9FED67FD6D628), SPH_C64(0x3677D81B6C1B1BC3), + SPH_C64(0x775BC1B5EEB5B574), SPH_C64(0x432911AF86AFAFBE), + SPH_C64(0xD4DF776AB56A6A1D), SPH_C64(0xA00DBA505D5050EA), + SPH_C64(0x8A4C124509454557), SPH_C64(0xFB18CBF3EBF3F338), + SPH_C64(0x60F09D30C03030AD), SPH_C64(0xC3742BEF9BEFEFC4), + SPH_C64(0x7EC3E53FFC3F3FDA), SPH_C64(0xAA1C9255495555C7), + SPH_C64(0x591079A2B2A2A2DB), SPH_C64(0xC96503EA8FEAEAE9), + SPH_C64(0xCAEC0F658965656A), SPH_C64(0x6968B9BAD2BABA03), + SPH_C64(0x5E93652FBC2F2F4A), SPH_C64(0x9DE74EC027C0C08E), + SPH_C64(0xA181BEDE5FDEDE60), SPH_C64(0x386CE01C701C1CFC), + SPH_C64(0xE72EBBFDD3FDFD46), SPH_C64(0x9A64524D294D4D1F), + SPH_C64(0x39E0E49272929276), SPH_C64(0xEABC8F75C97575FA), + SPH_C64(0x0C1E300618060636), SPH_C64(0x0998248A128A8AAE), + SPH_C64(0x7940F9B2F2B2B24B), SPH_C64(0xD15963E6BFE6E685), + SPH_C64(0x1C36700E380E0E7E), SPH_C64(0x3E63F81F7C1F1FE7), + SPH_C64(0xC4F7376295626255), SPH_C64(0xB5A3EED477D4D43A), + SPH_C64(0x4D3229A89AA8A881), SPH_C64(0x31F4C49662969652), + SPH_C64(0xEF3A9BF9C3F9F962), SPH_C64(0x97F666C533C5C5A3), + SPH_C64(0x4AB1352594252510), SPH_C64(0xB220F259795959AB), + SPH_C64(0x15AE54842A8484D0), SPH_C64(0xE4A7B772D57272C5), + SPH_C64(0x72DDD539E43939EC), SPH_C64(0x98615A4C2D4C4C16), + SPH_C64(0xBC3BCA5E655E5E94), SPH_C64(0xF085E778FD78789F), + SPH_C64(0x70D8DD38E03838E5), SPH_C64(0x0586148C0A8C8C98), + SPH_C64(0xBFB2C6D163D1D117), SPH_C64(0x570B41A5AEA5A5E4), + SPH_C64(0xD94D43E2AFE2E2A1), SPH_C64(0xC2F82F619961614E), + SPH_C64(0x7B45F1B3F6B3B342), SPH_C64(0x42A5152184212134), + SPH_C64(0x25D6949C4A9C9C08), SPH_C64(0x3C66F01E781E1EEE), + SPH_C64(0x8652224311434361), SPH_C64(0x93FC76C73BC7C7B1), + SPH_C64(0xE52BB3FCD7FCFC4F), SPH_C64(0x0814200410040424), + SPH_C64(0xA208B251595151E3), SPH_C64(0x2FC7BC995E999925), + SPH_C64(0xDAC44F6DA96D6D22), SPH_C64(0x1A39680D340D0D65), + SPH_C64(0xE93583FACFFAFA79), SPH_C64(0xA384B6DF5BDFDF69), + SPH_C64(0xFC9BD77EE57E7EA9), SPH_C64(0x48B43D2490242419), + SPH_C64(0x76D7C53BEC3B3BFE), SPH_C64(0x4B3D31AB96ABAB9A), + SPH_C64(0x81D13ECE1FCECEF0), SPH_C64(0x2255881144111199), + SPH_C64(0x03890C8F068F8F83), SPH_C64(0x9C6B4A4E254E4E04), + SPH_C64(0x7351D1B7E6B7B766), SPH_C64(0xCB600BEB8BEBEBE0), + SPH_C64(0x78CCFD3CF03C3CC1), SPH_C64(0x1FBF7C813E8181FD), + SPH_C64(0x35FED4946A949440), SPH_C64(0xF30CEBF7FBF7F71C), + SPH_C64(0x6F67A1B9DEB9B918), SPH_C64(0x265F98134C13138B), + SPH_C64(0x589C7D2CB02C2C51), SPH_C64(0xBBB8D6D36BD3D305), + SPH_C64(0xD35C6BE7BBE7E78C), SPH_C64(0xDCCB576EA56E6E39), + SPH_C64(0x95F36EC437C4C4AA), SPH_C64(0x060F18030C03031B), + SPH_C64(0xAC138A56455656DC), SPH_C64(0x88491A440D44445E), + SPH_C64(0xFE9EDF7FE17F7FA0), SPH_C64(0x4F3721A99EA9A988), + SPH_C64(0x54824D2AA82A2A67), SPH_C64(0x6B6DB1BBD6BBBB0A), + SPH_C64(0x9FE246C123C1C187), SPH_C64(0xA602A253515353F1), + SPH_C64(0xA58BAEDC57DCDC72), SPH_C64(0x1627580B2C0B0B53), + SPH_C64(0x27D39C9D4E9D9D01), SPH_C64(0xD8C1476CAD6C6C2B), + SPH_C64(0x62F59531C43131A4), SPH_C64(0xE8B98774CD7474F3), + SPH_C64(0xF109E3F6FFF6F615), SPH_C64(0x8C430A460546464C), + SPH_C64(0x452609AC8AACACA5), SPH_C64(0x0F973C891E8989B5), + SPH_C64(0x2844A014501414B4), SPH_C64(0xDF425BE1A3E1E1BA), + SPH_C64(0x2C4EB016581616A6), SPH_C64(0x74D2CD3AE83A3AF7), + SPH_C64(0xD2D06F69B9696906), SPH_C64(0x122D480924090941), + SPH_C64(0xE0ADA770DD7070D7), 
SPH_C64(0x7154D9B6E2B6B66F), + SPH_C64(0xBDB7CED067D0D01E), SPH_C64(0xC77E3BED93EDEDD6), + SPH_C64(0x85DB2ECC17CCCCE2), SPH_C64(0x84572A4215424268), + SPH_C64(0x2DC2B4985A98982C), SPH_C64(0x550E49A4AAA4A4ED), + SPH_C64(0x50885D28A0282875), SPH_C64(0xB831DA5C6D5C5C86), + SPH_C64(0xED3F93F8C7F8F86B), SPH_C64(0x11A44486228686C2) +}; + +static const uint64_t plain_T2[256] = { + SPH_C64(0x78C018601818D830), SPH_C64(0xAF05238C23232646), + SPH_C64(0xF97EC63FC6C6B891), SPH_C64(0x6F13E887E8E8FBCD), + SPH_C64(0xA14C87268787CB13), SPH_C64(0x62A9B8DAB8B8116D), + SPH_C64(0x0508010401010902), SPH_C64(0x6E424F214F4F0D9E), + SPH_C64(0xEEAD36D836369B6C), SPH_C64(0x0459A6A2A6A6FF51), + SPH_C64(0xBDDED26FD2D20CB9), SPH_C64(0x06FBF5F3F5F50EF7), + SPH_C64(0x80EF79F9797996F2), SPH_C64(0xCE5F6FA16F6F30DE), + SPH_C64(0xEFFC917E91916D3F), SPH_C64(0x07AA52555252F8A4), + SPH_C64(0xFD27609D606047C0), SPH_C64(0x7689BCCABCBC3565), + SPH_C64(0xCDAC9B569B9B372B), SPH_C64(0x8C048E028E8E8A01), + SPH_C64(0x1571A3B6A3A3D25B), SPH_C64(0x3C600C300C0C6C18), + SPH_C64(0x8AFF7BF17B7B84F6), SPH_C64(0xE1B535D43535806A), + SPH_C64(0x69E81D741D1DF53A), SPH_C64(0x4753E0A7E0E0B3DD), + SPH_C64(0xACF6D77BD7D721B3), SPH_C64(0xED5EC22FC2C29C99), + SPH_C64(0x966D2EB82E2E435C), SPH_C64(0x7A624B314B4B2996), + SPH_C64(0x21A3FEDFFEFE5DE1), SPH_C64(0x168257415757D5AE), + SPH_C64(0x41A815541515BD2A), SPH_C64(0xB69F77C17777E8EE), + SPH_C64(0xEBA537DC3737926E), SPH_C64(0x567BE5B3E5E59ED7), + SPH_C64(0xD98C9F469F9F1323), SPH_C64(0x17D3F0E7F0F023FD), + SPH_C64(0x7F6A4A354A4A2094), SPH_C64(0x959EDA4FDADA44A9), + SPH_C64(0x25FA587D5858A2B0), SPH_C64(0xCA06C903C9C9CF8F), + SPH_C64(0x8D5529A429297C52), SPH_C64(0x22500A280A0A5A14), + SPH_C64(0x4FE1B1FEB1B1507F), SPH_C64(0x1A69A0BAA0A0C95D), + SPH_C64(0xDA7F6BB16B6B14D6), SPH_C64(0xAB5C852E8585D917), + SPH_C64(0x7381BDCEBDBD3C67), SPH_C64(0x34D25D695D5D8FBA), + SPH_C64(0x5080104010109020), SPH_C64(0x03F3F4F7F4F407F5), + SPH_C64(0xC016CB0BCBCBDD8B), SPH_C64(0xC6ED3EF83E3ED37C), + SPH_C64(0x1128051405052D0A), SPH_C64(0xE61F6781676778CE), + SPH_C64(0x5373E4B7E4E497D5), SPH_C64(0xBB25279C2727024E), + SPH_C64(0x5832411941417382), SPH_C64(0x9D2C8B168B8BA70B), + SPH_C64(0x0151A7A6A7A7F653), SPH_C64(0x94CF7DE97D7DB2FA), + SPH_C64(0xFBDC956E95954937), SPH_C64(0x9F8ED847D8D856AD), + SPH_C64(0x308BFBCBFBFB70EB), SPH_C64(0x7123EE9FEEEECDC1), + SPH_C64(0x91C77CED7C7CBBF8), SPH_C64(0xE3176685666671CC), + SPH_C64(0x8EA6DD53DDDD7BA7), SPH_C64(0x4BB8175C1717AF2E), + SPH_C64(0x460247014747458E), SPH_C64(0xDC849E429E9E1A21), + SPH_C64(0xC51ECA0FCACAD489), SPH_C64(0x99752DB42D2D585A), + SPH_C64(0x7991BFC6BFBF2E63), SPH_C64(0x1B38071C07073F0E), + SPH_C64(0x2301AD8EADADAC47), SPH_C64(0x2FEA5A755A5AB0B4), + SPH_C64(0xB56C83368383EF1B), SPH_C64(0xFF8533CC3333B666), + SPH_C64(0xF23F639163635CC6), SPH_C64(0x0A10020802021204), + SPH_C64(0x3839AA92AAAA9349), SPH_C64(0xA8AF71D97171DEE2), + SPH_C64(0xCF0EC807C8C8C68D), SPH_C64(0x7DC819641919D132), + SPH_C64(0x7072493949493B92), SPH_C64(0x9A86D943D9D95FAF), + SPH_C64(0x1DC3F2EFF2F231F9), SPH_C64(0x484BE3ABE3E3A8DB), + SPH_C64(0x2AE25B715B5BB9B6), SPH_C64(0x9234881A8888BC0D), + SPH_C64(0xC8A49A529A9A3E29), SPH_C64(0xBE2D269826260B4C), + SPH_C64(0xFA8D32C83232BF64), SPH_C64(0x4AE9B0FAB0B0597D), + SPH_C64(0x6A1BE983E9E9F2CF), SPH_C64(0x33780F3C0F0F771E), + SPH_C64(0xA6E6D573D5D533B7), SPH_C64(0xBA74803A8080F41D), + SPH_C64(0x7C99BEC2BEBE2761), SPH_C64(0xDE26CD13CDCDEB87), + SPH_C64(0xE4BD34D034348968), SPH_C64(0x757A483D48483290), + SPH_C64(0x24ABFFDBFFFF54E3), SPH_C64(0x8FF77AF57A7A8DF4), 
+ SPH_C64(0xEAF4907A9090643D), SPH_C64(0x3EC25F615F5F9DBE), + SPH_C64(0xA01D208020203D40), SPH_C64(0xD56768BD68680FD0), + SPH_C64(0x72D01A681A1ACA34), SPH_C64(0x2C19AE82AEAEB741), + SPH_C64(0x5EC9B4EAB4B47D75), SPH_C64(0x199A544D5454CEA8), + SPH_C64(0xE5EC937693937F3B), SPH_C64(0xAA0D228822222F44), + SPH_C64(0xE907648D646463C8), SPH_C64(0x12DBF1E3F1F12AFF), + SPH_C64(0xA2BF73D17373CCE6), SPH_C64(0x5A90124812128224), + SPH_C64(0x5D3A401D40407A80), SPH_C64(0x2840082008084810), + SPH_C64(0xE856C32BC3C3959B), SPH_C64(0x7B33EC97ECECDFC5), + SPH_C64(0x9096DB4BDBDB4DAB), SPH_C64(0x1F61A1BEA1A1C05F), + SPH_C64(0x831C8D0E8D8D9107), SPH_C64(0xC9F53DF43D3DC87A), + SPH_C64(0xF1CC976697975B33), SPH_C64(0x0000000000000000), + SPH_C64(0xD436CF1BCFCFF983), SPH_C64(0x87452BAC2B2B6E56), + SPH_C64(0xB39776C57676E1EC), SPH_C64(0xB06482328282E619), + SPH_C64(0xA9FED67FD6D628B1), SPH_C64(0x77D81B6C1B1BC336), + SPH_C64(0x5BC1B5EEB5B57477), SPH_C64(0x2911AF86AFAFBE43), + SPH_C64(0xDF776AB56A6A1DD4), SPH_C64(0x0DBA505D5050EAA0), + SPH_C64(0x4C1245094545578A), SPH_C64(0x18CBF3EBF3F338FB), + SPH_C64(0xF09D30C03030AD60), SPH_C64(0x742BEF9BEFEFC4C3), + SPH_C64(0xC3E53FFC3F3FDA7E), SPH_C64(0x1C9255495555C7AA), + SPH_C64(0x1079A2B2A2A2DB59), SPH_C64(0x6503EA8FEAEAE9C9), + SPH_C64(0xEC0F658965656ACA), SPH_C64(0x68B9BAD2BABA0369), + SPH_C64(0x93652FBC2F2F4A5E), SPH_C64(0xE74EC027C0C08E9D), + SPH_C64(0x81BEDE5FDEDE60A1), SPH_C64(0x6CE01C701C1CFC38), + SPH_C64(0x2EBBFDD3FDFD46E7), SPH_C64(0x64524D294D4D1F9A), + SPH_C64(0xE0E4927292927639), SPH_C64(0xBC8F75C97575FAEA), + SPH_C64(0x1E3006180606360C), SPH_C64(0x98248A128A8AAE09), + SPH_C64(0x40F9B2F2B2B24B79), SPH_C64(0x5963E6BFE6E685D1), + SPH_C64(0x36700E380E0E7E1C), SPH_C64(0x63F81F7C1F1FE73E), + SPH_C64(0xF7376295626255C4), SPH_C64(0xA3EED477D4D43AB5), + SPH_C64(0x3229A89AA8A8814D), SPH_C64(0xF4C4966296965231), + SPH_C64(0x3A9BF9C3F9F962EF), SPH_C64(0xF666C533C5C5A397), + SPH_C64(0xB13525942525104A), SPH_C64(0x20F259795959ABB2), + SPH_C64(0xAE54842A8484D015), SPH_C64(0xA7B772D57272C5E4), + SPH_C64(0xDDD539E43939EC72), SPH_C64(0x615A4C2D4C4C1698), + SPH_C64(0x3BCA5E655E5E94BC), SPH_C64(0x85E778FD78789FF0), + SPH_C64(0xD8DD38E03838E570), SPH_C64(0x86148C0A8C8C9805), + SPH_C64(0xB2C6D163D1D117BF), SPH_C64(0x0B41A5AEA5A5E457), + SPH_C64(0x4D43E2AFE2E2A1D9), SPH_C64(0xF82F619961614EC2), + SPH_C64(0x45F1B3F6B3B3427B), SPH_C64(0xA515218421213442), + SPH_C64(0xD6949C4A9C9C0825), SPH_C64(0x66F01E781E1EEE3C), + SPH_C64(0x5222431143436186), SPH_C64(0xFC76C73BC7C7B193), + SPH_C64(0x2BB3FCD7FCFC4FE5), SPH_C64(0x1420041004042408), + SPH_C64(0x08B251595151E3A2), SPH_C64(0xC7BC995E9999252F), + SPH_C64(0xC44F6DA96D6D22DA), SPH_C64(0x39680D340D0D651A), + SPH_C64(0x3583FACFFAFA79E9), SPH_C64(0x84B6DF5BDFDF69A3), + SPH_C64(0x9BD77EE57E7EA9FC), SPH_C64(0xB43D249024241948), + SPH_C64(0xD7C53BEC3B3BFE76), SPH_C64(0x3D31AB96ABAB9A4B), + SPH_C64(0xD13ECE1FCECEF081), SPH_C64(0x5588114411119922), + SPH_C64(0x890C8F068F8F8303), SPH_C64(0x6B4A4E254E4E049C), + SPH_C64(0x51D1B7E6B7B76673), SPH_C64(0x600BEB8BEBEBE0CB), + SPH_C64(0xCCFD3CF03C3CC178), SPH_C64(0xBF7C813E8181FD1F), + SPH_C64(0xFED4946A94944035), SPH_C64(0x0CEBF7FBF7F71CF3), + SPH_C64(0x67A1B9DEB9B9186F), SPH_C64(0x5F98134C13138B26), + SPH_C64(0x9C7D2CB02C2C5158), SPH_C64(0xB8D6D36BD3D305BB), + SPH_C64(0x5C6BE7BBE7E78CD3), SPH_C64(0xCB576EA56E6E39DC), + SPH_C64(0xF36EC437C4C4AA95), SPH_C64(0x0F18030C03031B06), + SPH_C64(0x138A56455656DCAC), SPH_C64(0x491A440D44445E88), + SPH_C64(0x9EDF7FE17F7FA0FE), SPH_C64(0x3721A99EA9A9884F), + 
SPH_C64(0x824D2AA82A2A6754), SPH_C64(0x6DB1BBD6BBBB0A6B), + SPH_C64(0xE246C123C1C1879F), SPH_C64(0x02A253515353F1A6), + SPH_C64(0x8BAEDC57DCDC72A5), SPH_C64(0x27580B2C0B0B5316), + SPH_C64(0xD39C9D4E9D9D0127), SPH_C64(0xC1476CAD6C6C2BD8), + SPH_C64(0xF59531C43131A462), SPH_C64(0xB98774CD7474F3E8), + SPH_C64(0x09E3F6FFF6F615F1), SPH_C64(0x430A460546464C8C), + SPH_C64(0x2609AC8AACACA545), SPH_C64(0x973C891E8989B50F), + SPH_C64(0x44A014501414B428), SPH_C64(0x425BE1A3E1E1BADF), + SPH_C64(0x4EB016581616A62C), SPH_C64(0xD2CD3AE83A3AF774), + SPH_C64(0xD06F69B9696906D2), SPH_C64(0x2D48092409094112), + SPH_C64(0xADA770DD7070D7E0), SPH_C64(0x54D9B6E2B6B66F71), + SPH_C64(0xB7CED067D0D01EBD), SPH_C64(0x7E3BED93EDEDD6C7), + SPH_C64(0xDB2ECC17CCCCE285), SPH_C64(0x572A421542426884), + SPH_C64(0xC2B4985A98982C2D), SPH_C64(0x0E49A4AAA4A4ED55), + SPH_C64(0x885D28A028287550), SPH_C64(0x31DA5C6D5C5C86B8), + SPH_C64(0x3F93F8C7F8F86BED), SPH_C64(0xA44486228686C211) +}; + +static const uint64_t plain_T3[256] = { + SPH_C64(0xC018601818D83078), SPH_C64(0x05238C23232646AF), + SPH_C64(0x7EC63FC6C6B891F9), SPH_C64(0x13E887E8E8FBCD6F), + SPH_C64(0x4C87268787CB13A1), SPH_C64(0xA9B8DAB8B8116D62), + SPH_C64(0x0801040101090205), SPH_C64(0x424F214F4F0D9E6E), + SPH_C64(0xAD36D836369B6CEE), SPH_C64(0x59A6A2A6A6FF5104), + SPH_C64(0xDED26FD2D20CB9BD), SPH_C64(0xFBF5F3F5F50EF706), + SPH_C64(0xEF79F9797996F280), SPH_C64(0x5F6FA16F6F30DECE), + SPH_C64(0xFC917E91916D3FEF), SPH_C64(0xAA52555252F8A407), + SPH_C64(0x27609D606047C0FD), SPH_C64(0x89BCCABCBC356576), + SPH_C64(0xAC9B569B9B372BCD), SPH_C64(0x048E028E8E8A018C), + SPH_C64(0x71A3B6A3A3D25B15), SPH_C64(0x600C300C0C6C183C), + SPH_C64(0xFF7BF17B7B84F68A), SPH_C64(0xB535D43535806AE1), + SPH_C64(0xE81D741D1DF53A69), SPH_C64(0x53E0A7E0E0B3DD47), + SPH_C64(0xF6D77BD7D721B3AC), SPH_C64(0x5EC22FC2C29C99ED), + SPH_C64(0x6D2EB82E2E435C96), SPH_C64(0x624B314B4B29967A), + SPH_C64(0xA3FEDFFEFE5DE121), SPH_C64(0x8257415757D5AE16), + SPH_C64(0xA815541515BD2A41), SPH_C64(0x9F77C17777E8EEB6), + SPH_C64(0xA537DC3737926EEB), SPH_C64(0x7BE5B3E5E59ED756), + SPH_C64(0x8C9F469F9F1323D9), SPH_C64(0xD3F0E7F0F023FD17), + SPH_C64(0x6A4A354A4A20947F), SPH_C64(0x9EDA4FDADA44A995), + SPH_C64(0xFA587D5858A2B025), SPH_C64(0x06C903C9C9CF8FCA), + SPH_C64(0x5529A429297C528D), SPH_C64(0x500A280A0A5A1422), + SPH_C64(0xE1B1FEB1B1507F4F), SPH_C64(0x69A0BAA0A0C95D1A), + SPH_C64(0x7F6BB16B6B14D6DA), SPH_C64(0x5C852E8585D917AB), + SPH_C64(0x81BDCEBDBD3C6773), SPH_C64(0xD25D695D5D8FBA34), + SPH_C64(0x8010401010902050), SPH_C64(0xF3F4F7F4F407F503), + SPH_C64(0x16CB0BCBCBDD8BC0), SPH_C64(0xED3EF83E3ED37CC6), + SPH_C64(0x28051405052D0A11), SPH_C64(0x1F6781676778CEE6), + SPH_C64(0x73E4B7E4E497D553), SPH_C64(0x25279C2727024EBB), + SPH_C64(0x3241194141738258), SPH_C64(0x2C8B168B8BA70B9D), + SPH_C64(0x51A7A6A7A7F65301), SPH_C64(0xCF7DE97D7DB2FA94), + SPH_C64(0xDC956E95954937FB), SPH_C64(0x8ED847D8D856AD9F), + SPH_C64(0x8BFBCBFBFB70EB30), SPH_C64(0x23EE9FEEEECDC171), + SPH_C64(0xC77CED7C7CBBF891), SPH_C64(0x176685666671CCE3), + SPH_C64(0xA6DD53DDDD7BA78E), SPH_C64(0xB8175C1717AF2E4B), + SPH_C64(0x0247014747458E46), SPH_C64(0x849E429E9E1A21DC), + SPH_C64(0x1ECA0FCACAD489C5), SPH_C64(0x752DB42D2D585A99), + SPH_C64(0x91BFC6BFBF2E6379), SPH_C64(0x38071C07073F0E1B), + SPH_C64(0x01AD8EADADAC4723), SPH_C64(0xEA5A755A5AB0B42F), + SPH_C64(0x6C83368383EF1BB5), SPH_C64(0x8533CC3333B666FF), + SPH_C64(0x3F639163635CC6F2), SPH_C64(0x100208020212040A), + SPH_C64(0x39AA92AAAA934938), SPH_C64(0xAF71D97171DEE2A8), + SPH_C64(0x0EC807C8C8C68DCF), 
SPH_C64(0xC819641919D1327D), + SPH_C64(0x72493949493B9270), SPH_C64(0x86D943D9D95FAF9A), + SPH_C64(0xC3F2EFF2F231F91D), SPH_C64(0x4BE3ABE3E3A8DB48), + SPH_C64(0xE25B715B5BB9B62A), SPH_C64(0x34881A8888BC0D92), + SPH_C64(0xA49A529A9A3E29C8), SPH_C64(0x2D269826260B4CBE), + SPH_C64(0x8D32C83232BF64FA), SPH_C64(0xE9B0FAB0B0597D4A), + SPH_C64(0x1BE983E9E9F2CF6A), SPH_C64(0x780F3C0F0F771E33), + SPH_C64(0xE6D573D5D533B7A6), SPH_C64(0x74803A8080F41DBA), + SPH_C64(0x99BEC2BEBE27617C), SPH_C64(0x26CD13CDCDEB87DE), + SPH_C64(0xBD34D034348968E4), SPH_C64(0x7A483D4848329075), + SPH_C64(0xABFFDBFFFF54E324), SPH_C64(0xF77AF57A7A8DF48F), + SPH_C64(0xF4907A9090643DEA), SPH_C64(0xC25F615F5F9DBE3E), + SPH_C64(0x1D208020203D40A0), SPH_C64(0x6768BD68680FD0D5), + SPH_C64(0xD01A681A1ACA3472), SPH_C64(0x19AE82AEAEB7412C), + SPH_C64(0xC9B4EAB4B47D755E), SPH_C64(0x9A544D5454CEA819), + SPH_C64(0xEC937693937F3BE5), SPH_C64(0x0D228822222F44AA), + SPH_C64(0x07648D646463C8E9), SPH_C64(0xDBF1E3F1F12AFF12), + SPH_C64(0xBF73D17373CCE6A2), SPH_C64(0x901248121282245A), + SPH_C64(0x3A401D40407A805D), SPH_C64(0x4008200808481028), + SPH_C64(0x56C32BC3C3959BE8), SPH_C64(0x33EC97ECECDFC57B), + SPH_C64(0x96DB4BDBDB4DAB90), SPH_C64(0x61A1BEA1A1C05F1F), + SPH_C64(0x1C8D0E8D8D910783), SPH_C64(0xF53DF43D3DC87AC9), + SPH_C64(0xCC976697975B33F1), SPH_C64(0x0000000000000000), + SPH_C64(0x36CF1BCFCFF983D4), SPH_C64(0x452BAC2B2B6E5687), + SPH_C64(0x9776C57676E1ECB3), SPH_C64(0x6482328282E619B0), + SPH_C64(0xFED67FD6D628B1A9), SPH_C64(0xD81B6C1B1BC33677), + SPH_C64(0xC1B5EEB5B574775B), SPH_C64(0x11AF86AFAFBE4329), + SPH_C64(0x776AB56A6A1DD4DF), SPH_C64(0xBA505D5050EAA00D), + SPH_C64(0x1245094545578A4C), SPH_C64(0xCBF3EBF3F338FB18), + SPH_C64(0x9D30C03030AD60F0), SPH_C64(0x2BEF9BEFEFC4C374), + SPH_C64(0xE53FFC3F3FDA7EC3), SPH_C64(0x9255495555C7AA1C), + SPH_C64(0x79A2B2A2A2DB5910), SPH_C64(0x03EA8FEAEAE9C965), + SPH_C64(0x0F658965656ACAEC), SPH_C64(0xB9BAD2BABA036968), + SPH_C64(0x652FBC2F2F4A5E93), SPH_C64(0x4EC027C0C08E9DE7), + SPH_C64(0xBEDE5FDEDE60A181), SPH_C64(0xE01C701C1CFC386C), + SPH_C64(0xBBFDD3FDFD46E72E), SPH_C64(0x524D294D4D1F9A64), + SPH_C64(0xE4927292927639E0), SPH_C64(0x8F75C97575FAEABC), + SPH_C64(0x3006180606360C1E), SPH_C64(0x248A128A8AAE0998), + SPH_C64(0xF9B2F2B2B24B7940), SPH_C64(0x63E6BFE6E685D159), + SPH_C64(0x700E380E0E7E1C36), SPH_C64(0xF81F7C1F1FE73E63), + SPH_C64(0x376295626255C4F7), SPH_C64(0xEED477D4D43AB5A3), + SPH_C64(0x29A89AA8A8814D32), SPH_C64(0xC4966296965231F4), + SPH_C64(0x9BF9C3F9F962EF3A), SPH_C64(0x66C533C5C5A397F6), + SPH_C64(0x3525942525104AB1), SPH_C64(0xF259795959ABB220), + SPH_C64(0x54842A8484D015AE), SPH_C64(0xB772D57272C5E4A7), + SPH_C64(0xD539E43939EC72DD), SPH_C64(0x5A4C2D4C4C169861), + SPH_C64(0xCA5E655E5E94BC3B), SPH_C64(0xE778FD78789FF085), + SPH_C64(0xDD38E03838E570D8), SPH_C64(0x148C0A8C8C980586), + SPH_C64(0xC6D163D1D117BFB2), SPH_C64(0x41A5AEA5A5E4570B), + SPH_C64(0x43E2AFE2E2A1D94D), SPH_C64(0x2F619961614EC2F8), + SPH_C64(0xF1B3F6B3B3427B45), SPH_C64(0x15218421213442A5), + SPH_C64(0x949C4A9C9C0825D6), SPH_C64(0xF01E781E1EEE3C66), + SPH_C64(0x2243114343618652), SPH_C64(0x76C73BC7C7B193FC), + SPH_C64(0xB3FCD7FCFC4FE52B), SPH_C64(0x2004100404240814), + SPH_C64(0xB251595151E3A208), SPH_C64(0xBC995E9999252FC7), + SPH_C64(0x4F6DA96D6D22DAC4), SPH_C64(0x680D340D0D651A39), + SPH_C64(0x83FACFFAFA79E935), SPH_C64(0xB6DF5BDFDF69A384), + SPH_C64(0xD77EE57E7EA9FC9B), SPH_C64(0x3D249024241948B4), + SPH_C64(0xC53BEC3B3BFE76D7), SPH_C64(0x31AB96ABAB9A4B3D), + SPH_C64(0x3ECE1FCECEF081D1), 
SPH_C64(0x8811441111992255), + SPH_C64(0x0C8F068F8F830389), SPH_C64(0x4A4E254E4E049C6B), + SPH_C64(0xD1B7E6B7B7667351), SPH_C64(0x0BEB8BEBEBE0CB60), + SPH_C64(0xFD3CF03C3CC178CC), SPH_C64(0x7C813E8181FD1FBF), + SPH_C64(0xD4946A94944035FE), SPH_C64(0xEBF7FBF7F71CF30C), + SPH_C64(0xA1B9DEB9B9186F67), SPH_C64(0x98134C13138B265F), + SPH_C64(0x7D2CB02C2C51589C), SPH_C64(0xD6D36BD3D305BBB8), + SPH_C64(0x6BE7BBE7E78CD35C), SPH_C64(0x576EA56E6E39DCCB), + SPH_C64(0x6EC437C4C4AA95F3), SPH_C64(0x18030C03031B060F), + SPH_C64(0x8A56455656DCAC13), SPH_C64(0x1A440D44445E8849), + SPH_C64(0xDF7FE17F7FA0FE9E), SPH_C64(0x21A99EA9A9884F37), + SPH_C64(0x4D2AA82A2A675482), SPH_C64(0xB1BBD6BBBB0A6B6D), + SPH_C64(0x46C123C1C1879FE2), SPH_C64(0xA253515353F1A602), + SPH_C64(0xAEDC57DCDC72A58B), SPH_C64(0x580B2C0B0B531627), + SPH_C64(0x9C9D4E9D9D0127D3), SPH_C64(0x476CAD6C6C2BD8C1), + SPH_C64(0x9531C43131A462F5), SPH_C64(0x8774CD7474F3E8B9), + SPH_C64(0xE3F6FFF6F615F109), SPH_C64(0x0A460546464C8C43), + SPH_C64(0x09AC8AACACA54526), SPH_C64(0x3C891E8989B50F97), + SPH_C64(0xA014501414B42844), SPH_C64(0x5BE1A3E1E1BADF42), + SPH_C64(0xB016581616A62C4E), SPH_C64(0xCD3AE83A3AF774D2), + SPH_C64(0x6F69B9696906D2D0), SPH_C64(0x480924090941122D), + SPH_C64(0xA770DD7070D7E0AD), SPH_C64(0xD9B6E2B6B66F7154), + SPH_C64(0xCED067D0D01EBDB7), SPH_C64(0x3BED93EDEDD6C77E), + SPH_C64(0x2ECC17CCCCE285DB), SPH_C64(0x2A42154242688457), + SPH_C64(0xB4985A98982C2DC2), SPH_C64(0x49A4AAA4A4ED550E), + SPH_C64(0x5D28A02828755088), SPH_C64(0xDA5C6D5C5C86B831), + SPH_C64(0x93F8C7F8F86BED3F), SPH_C64(0x4486228686C211A4) +}; + +static const uint64_t plain_T4[256] = { + SPH_C64(0x18601818D83078C0), SPH_C64(0x238C23232646AF05), + SPH_C64(0xC63FC6C6B891F97E), SPH_C64(0xE887E8E8FBCD6F13), + SPH_C64(0x87268787CB13A14C), SPH_C64(0xB8DAB8B8116D62A9), + SPH_C64(0x0104010109020508), SPH_C64(0x4F214F4F0D9E6E42), + SPH_C64(0x36D836369B6CEEAD), SPH_C64(0xA6A2A6A6FF510459), + SPH_C64(0xD26FD2D20CB9BDDE), SPH_C64(0xF5F3F5F50EF706FB), + SPH_C64(0x79F9797996F280EF), SPH_C64(0x6FA16F6F30DECE5F), + SPH_C64(0x917E91916D3FEFFC), SPH_C64(0x52555252F8A407AA), + SPH_C64(0x609D606047C0FD27), SPH_C64(0xBCCABCBC35657689), + SPH_C64(0x9B569B9B372BCDAC), SPH_C64(0x8E028E8E8A018C04), + SPH_C64(0xA3B6A3A3D25B1571), SPH_C64(0x0C300C0C6C183C60), + SPH_C64(0x7BF17B7B84F68AFF), SPH_C64(0x35D43535806AE1B5), + SPH_C64(0x1D741D1DF53A69E8), SPH_C64(0xE0A7E0E0B3DD4753), + SPH_C64(0xD77BD7D721B3ACF6), SPH_C64(0xC22FC2C29C99ED5E), + SPH_C64(0x2EB82E2E435C966D), SPH_C64(0x4B314B4B29967A62), + SPH_C64(0xFEDFFEFE5DE121A3), SPH_C64(0x57415757D5AE1682), + SPH_C64(0x15541515BD2A41A8), SPH_C64(0x77C17777E8EEB69F), + SPH_C64(0x37DC3737926EEBA5), SPH_C64(0xE5B3E5E59ED7567B), + SPH_C64(0x9F469F9F1323D98C), SPH_C64(0xF0E7F0F023FD17D3), + SPH_C64(0x4A354A4A20947F6A), SPH_C64(0xDA4FDADA44A9959E), + SPH_C64(0x587D5858A2B025FA), SPH_C64(0xC903C9C9CF8FCA06), + SPH_C64(0x29A429297C528D55), SPH_C64(0x0A280A0A5A142250), + SPH_C64(0xB1FEB1B1507F4FE1), SPH_C64(0xA0BAA0A0C95D1A69), + SPH_C64(0x6BB16B6B14D6DA7F), SPH_C64(0x852E8585D917AB5C), + SPH_C64(0xBDCEBDBD3C677381), SPH_C64(0x5D695D5D8FBA34D2), + SPH_C64(0x1040101090205080), SPH_C64(0xF4F7F4F407F503F3), + SPH_C64(0xCB0BCBCBDD8BC016), SPH_C64(0x3EF83E3ED37CC6ED), + SPH_C64(0x051405052D0A1128), SPH_C64(0x6781676778CEE61F), + SPH_C64(0xE4B7E4E497D55373), SPH_C64(0x279C2727024EBB25), + SPH_C64(0x4119414173825832), SPH_C64(0x8B168B8BA70B9D2C), + SPH_C64(0xA7A6A7A7F6530151), SPH_C64(0x7DE97D7DB2FA94CF), + SPH_C64(0x956E95954937FBDC), SPH_C64(0xD847D8D856AD9F8E), 
+ SPH_C64(0xFBCBFBFB70EB308B), SPH_C64(0xEE9FEEEECDC17123), + SPH_C64(0x7CED7C7CBBF891C7), SPH_C64(0x6685666671CCE317), + SPH_C64(0xDD53DDDD7BA78EA6), SPH_C64(0x175C1717AF2E4BB8), + SPH_C64(0x47014747458E4602), SPH_C64(0x9E429E9E1A21DC84), + SPH_C64(0xCA0FCACAD489C51E), SPH_C64(0x2DB42D2D585A9975), + SPH_C64(0xBFC6BFBF2E637991), SPH_C64(0x071C07073F0E1B38), + SPH_C64(0xAD8EADADAC472301), SPH_C64(0x5A755A5AB0B42FEA), + SPH_C64(0x83368383EF1BB56C), SPH_C64(0x33CC3333B666FF85), + SPH_C64(0x639163635CC6F23F), SPH_C64(0x0208020212040A10), + SPH_C64(0xAA92AAAA93493839), SPH_C64(0x71D97171DEE2A8AF), + SPH_C64(0xC807C8C8C68DCF0E), SPH_C64(0x19641919D1327DC8), + SPH_C64(0x493949493B927072), SPH_C64(0xD943D9D95FAF9A86), + SPH_C64(0xF2EFF2F231F91DC3), SPH_C64(0xE3ABE3E3A8DB484B), + SPH_C64(0x5B715B5BB9B62AE2), SPH_C64(0x881A8888BC0D9234), + SPH_C64(0x9A529A9A3E29C8A4), SPH_C64(0x269826260B4CBE2D), + SPH_C64(0x32C83232BF64FA8D), SPH_C64(0xB0FAB0B0597D4AE9), + SPH_C64(0xE983E9E9F2CF6A1B), SPH_C64(0x0F3C0F0F771E3378), + SPH_C64(0xD573D5D533B7A6E6), SPH_C64(0x803A8080F41DBA74), + SPH_C64(0xBEC2BEBE27617C99), SPH_C64(0xCD13CDCDEB87DE26), + SPH_C64(0x34D034348968E4BD), SPH_C64(0x483D48483290757A), + SPH_C64(0xFFDBFFFF54E324AB), SPH_C64(0x7AF57A7A8DF48FF7), + SPH_C64(0x907A9090643DEAF4), SPH_C64(0x5F615F5F9DBE3EC2), + SPH_C64(0x208020203D40A01D), SPH_C64(0x68BD68680FD0D567), + SPH_C64(0x1A681A1ACA3472D0), SPH_C64(0xAE82AEAEB7412C19), + SPH_C64(0xB4EAB4B47D755EC9), SPH_C64(0x544D5454CEA8199A), + SPH_C64(0x937693937F3BE5EC), SPH_C64(0x228822222F44AA0D), + SPH_C64(0x648D646463C8E907), SPH_C64(0xF1E3F1F12AFF12DB), + SPH_C64(0x73D17373CCE6A2BF), SPH_C64(0x1248121282245A90), + SPH_C64(0x401D40407A805D3A), SPH_C64(0x0820080848102840), + SPH_C64(0xC32BC3C3959BE856), SPH_C64(0xEC97ECECDFC57B33), + SPH_C64(0xDB4BDBDB4DAB9096), SPH_C64(0xA1BEA1A1C05F1F61), + SPH_C64(0x8D0E8D8D9107831C), SPH_C64(0x3DF43D3DC87AC9F5), + SPH_C64(0x976697975B33F1CC), SPH_C64(0x0000000000000000), + SPH_C64(0xCF1BCFCFF983D436), SPH_C64(0x2BAC2B2B6E568745), + SPH_C64(0x76C57676E1ECB397), SPH_C64(0x82328282E619B064), + SPH_C64(0xD67FD6D628B1A9FE), SPH_C64(0x1B6C1B1BC33677D8), + SPH_C64(0xB5EEB5B574775BC1), SPH_C64(0xAF86AFAFBE432911), + SPH_C64(0x6AB56A6A1DD4DF77), SPH_C64(0x505D5050EAA00DBA), + SPH_C64(0x45094545578A4C12), SPH_C64(0xF3EBF3F338FB18CB), + SPH_C64(0x30C03030AD60F09D), SPH_C64(0xEF9BEFEFC4C3742B), + SPH_C64(0x3FFC3F3FDA7EC3E5), SPH_C64(0x55495555C7AA1C92), + SPH_C64(0xA2B2A2A2DB591079), SPH_C64(0xEA8FEAEAE9C96503), + SPH_C64(0x658965656ACAEC0F), SPH_C64(0xBAD2BABA036968B9), + SPH_C64(0x2FBC2F2F4A5E9365), SPH_C64(0xC027C0C08E9DE74E), + SPH_C64(0xDE5FDEDE60A181BE), SPH_C64(0x1C701C1CFC386CE0), + SPH_C64(0xFDD3FDFD46E72EBB), SPH_C64(0x4D294D4D1F9A6452), + SPH_C64(0x927292927639E0E4), SPH_C64(0x75C97575FAEABC8F), + SPH_C64(0x06180606360C1E30), SPH_C64(0x8A128A8AAE099824), + SPH_C64(0xB2F2B2B24B7940F9), SPH_C64(0xE6BFE6E685D15963), + SPH_C64(0x0E380E0E7E1C3670), SPH_C64(0x1F7C1F1FE73E63F8), + SPH_C64(0x6295626255C4F737), SPH_C64(0xD477D4D43AB5A3EE), + SPH_C64(0xA89AA8A8814D3229), SPH_C64(0x966296965231F4C4), + SPH_C64(0xF9C3F9F962EF3A9B), SPH_C64(0xC533C5C5A397F666), + SPH_C64(0x25942525104AB135), SPH_C64(0x59795959ABB220F2), + SPH_C64(0x842A8484D015AE54), SPH_C64(0x72D57272C5E4A7B7), + SPH_C64(0x39E43939EC72DDD5), SPH_C64(0x4C2D4C4C1698615A), + SPH_C64(0x5E655E5E94BC3BCA), SPH_C64(0x78FD78789FF085E7), + SPH_C64(0x38E03838E570D8DD), SPH_C64(0x8C0A8C8C98058614), + SPH_C64(0xD163D1D117BFB2C6), SPH_C64(0xA5AEA5A5E4570B41), + 
SPH_C64(0xE2AFE2E2A1D94D43), SPH_C64(0x619961614EC2F82F), + SPH_C64(0xB3F6B3B3427B45F1), SPH_C64(0x218421213442A515), + SPH_C64(0x9C4A9C9C0825D694), SPH_C64(0x1E781E1EEE3C66F0), + SPH_C64(0x4311434361865222), SPH_C64(0xC73BC7C7B193FC76), + SPH_C64(0xFCD7FCFC4FE52BB3), SPH_C64(0x0410040424081420), + SPH_C64(0x51595151E3A208B2), SPH_C64(0x995E9999252FC7BC), + SPH_C64(0x6DA96D6D22DAC44F), SPH_C64(0x0D340D0D651A3968), + SPH_C64(0xFACFFAFA79E93583), SPH_C64(0xDF5BDFDF69A384B6), + SPH_C64(0x7EE57E7EA9FC9BD7), SPH_C64(0x249024241948B43D), + SPH_C64(0x3BEC3B3BFE76D7C5), SPH_C64(0xAB96ABAB9A4B3D31), + SPH_C64(0xCE1FCECEF081D13E), SPH_C64(0x1144111199225588), + SPH_C64(0x8F068F8F8303890C), SPH_C64(0x4E254E4E049C6B4A), + SPH_C64(0xB7E6B7B7667351D1), SPH_C64(0xEB8BEBEBE0CB600B), + SPH_C64(0x3CF03C3CC178CCFD), SPH_C64(0x813E8181FD1FBF7C), + SPH_C64(0x946A94944035FED4), SPH_C64(0xF7FBF7F71CF30CEB), + SPH_C64(0xB9DEB9B9186F67A1), SPH_C64(0x134C13138B265F98), + SPH_C64(0x2CB02C2C51589C7D), SPH_C64(0xD36BD3D305BBB8D6), + SPH_C64(0xE7BBE7E78CD35C6B), SPH_C64(0x6EA56E6E39DCCB57), + SPH_C64(0xC437C4C4AA95F36E), SPH_C64(0x030C03031B060F18), + SPH_C64(0x56455656DCAC138A), SPH_C64(0x440D44445E88491A), + SPH_C64(0x7FE17F7FA0FE9EDF), SPH_C64(0xA99EA9A9884F3721), + SPH_C64(0x2AA82A2A6754824D), SPH_C64(0xBBD6BBBB0A6B6DB1), + SPH_C64(0xC123C1C1879FE246), SPH_C64(0x53515353F1A602A2), + SPH_C64(0xDC57DCDC72A58BAE), SPH_C64(0x0B2C0B0B53162758), + SPH_C64(0x9D4E9D9D0127D39C), SPH_C64(0x6CAD6C6C2BD8C147), + SPH_C64(0x31C43131A462F595), SPH_C64(0x74CD7474F3E8B987), + SPH_C64(0xF6FFF6F615F109E3), SPH_C64(0x460546464C8C430A), + SPH_C64(0xAC8AACACA5452609), SPH_C64(0x891E8989B50F973C), + SPH_C64(0x14501414B42844A0), SPH_C64(0xE1A3E1E1BADF425B), + SPH_C64(0x16581616A62C4EB0), SPH_C64(0x3AE83A3AF774D2CD), + SPH_C64(0x69B9696906D2D06F), SPH_C64(0x0924090941122D48), + SPH_C64(0x70DD7070D7E0ADA7), SPH_C64(0xB6E2B6B66F7154D9), + SPH_C64(0xD067D0D01EBDB7CE), SPH_C64(0xED93EDEDD6C77E3B), + SPH_C64(0xCC17CCCCE285DB2E), SPH_C64(0x421542426884572A), + SPH_C64(0x985A98982C2DC2B4), SPH_C64(0xA4AAA4A4ED550E49), + SPH_C64(0x28A028287550885D), SPH_C64(0x5C6D5C5C86B831DA), + SPH_C64(0xF8C7F8F86BED3F93), SPH_C64(0x86228686C211A444) +}; + +static const uint64_t plain_T5[256] = { + SPH_C64(0x601818D83078C018), SPH_C64(0x8C23232646AF0523), + SPH_C64(0x3FC6C6B891F97EC6), SPH_C64(0x87E8E8FBCD6F13E8), + SPH_C64(0x268787CB13A14C87), SPH_C64(0xDAB8B8116D62A9B8), + SPH_C64(0x0401010902050801), SPH_C64(0x214F4F0D9E6E424F), + SPH_C64(0xD836369B6CEEAD36), SPH_C64(0xA2A6A6FF510459A6), + SPH_C64(0x6FD2D20CB9BDDED2), SPH_C64(0xF3F5F50EF706FBF5), + SPH_C64(0xF9797996F280EF79), SPH_C64(0xA16F6F30DECE5F6F), + SPH_C64(0x7E91916D3FEFFC91), SPH_C64(0x555252F8A407AA52), + SPH_C64(0x9D606047C0FD2760), SPH_C64(0xCABCBC35657689BC), + SPH_C64(0x569B9B372BCDAC9B), SPH_C64(0x028E8E8A018C048E), + SPH_C64(0xB6A3A3D25B1571A3), SPH_C64(0x300C0C6C183C600C), + SPH_C64(0xF17B7B84F68AFF7B), SPH_C64(0xD43535806AE1B535), + SPH_C64(0x741D1DF53A69E81D), SPH_C64(0xA7E0E0B3DD4753E0), + SPH_C64(0x7BD7D721B3ACF6D7), SPH_C64(0x2FC2C29C99ED5EC2), + SPH_C64(0xB82E2E435C966D2E), SPH_C64(0x314B4B29967A624B), + SPH_C64(0xDFFEFE5DE121A3FE), SPH_C64(0x415757D5AE168257), + SPH_C64(0x541515BD2A41A815), SPH_C64(0xC17777E8EEB69F77), + SPH_C64(0xDC3737926EEBA537), SPH_C64(0xB3E5E59ED7567BE5), + SPH_C64(0x469F9F1323D98C9F), SPH_C64(0xE7F0F023FD17D3F0), + SPH_C64(0x354A4A20947F6A4A), SPH_C64(0x4FDADA44A9959EDA), + SPH_C64(0x7D5858A2B025FA58), SPH_C64(0x03C9C9CF8FCA06C9), + SPH_C64(0xA429297C528D5529), 
SPH_C64(0x280A0A5A1422500A), + SPH_C64(0xFEB1B1507F4FE1B1), SPH_C64(0xBAA0A0C95D1A69A0), + SPH_C64(0xB16B6B14D6DA7F6B), SPH_C64(0x2E8585D917AB5C85), + SPH_C64(0xCEBDBD3C677381BD), SPH_C64(0x695D5D8FBA34D25D), + SPH_C64(0x4010109020508010), SPH_C64(0xF7F4F407F503F3F4), + SPH_C64(0x0BCBCBDD8BC016CB), SPH_C64(0xF83E3ED37CC6ED3E), + SPH_C64(0x1405052D0A112805), SPH_C64(0x81676778CEE61F67), + SPH_C64(0xB7E4E497D55373E4), SPH_C64(0x9C2727024EBB2527), + SPH_C64(0x1941417382583241), SPH_C64(0x168B8BA70B9D2C8B), + SPH_C64(0xA6A7A7F6530151A7), SPH_C64(0xE97D7DB2FA94CF7D), + SPH_C64(0x6E95954937FBDC95), SPH_C64(0x47D8D856AD9F8ED8), + SPH_C64(0xCBFBFB70EB308BFB), SPH_C64(0x9FEEEECDC17123EE), + SPH_C64(0xED7C7CBBF891C77C), SPH_C64(0x85666671CCE31766), + SPH_C64(0x53DDDD7BA78EA6DD), SPH_C64(0x5C1717AF2E4BB817), + SPH_C64(0x014747458E460247), SPH_C64(0x429E9E1A21DC849E), + SPH_C64(0x0FCACAD489C51ECA), SPH_C64(0xB42D2D585A99752D), + SPH_C64(0xC6BFBF2E637991BF), SPH_C64(0x1C07073F0E1B3807), + SPH_C64(0x8EADADAC472301AD), SPH_C64(0x755A5AB0B42FEA5A), + SPH_C64(0x368383EF1BB56C83), SPH_C64(0xCC3333B666FF8533), + SPH_C64(0x9163635CC6F23F63), SPH_C64(0x08020212040A1002), + SPH_C64(0x92AAAA93493839AA), SPH_C64(0xD97171DEE2A8AF71), + SPH_C64(0x07C8C8C68DCF0EC8), SPH_C64(0x641919D1327DC819), + SPH_C64(0x3949493B92707249), SPH_C64(0x43D9D95FAF9A86D9), + SPH_C64(0xEFF2F231F91DC3F2), SPH_C64(0xABE3E3A8DB484BE3), + SPH_C64(0x715B5BB9B62AE25B), SPH_C64(0x1A8888BC0D923488), + SPH_C64(0x529A9A3E29C8A49A), SPH_C64(0x9826260B4CBE2D26), + SPH_C64(0xC83232BF64FA8D32), SPH_C64(0xFAB0B0597D4AE9B0), + SPH_C64(0x83E9E9F2CF6A1BE9), SPH_C64(0x3C0F0F771E33780F), + SPH_C64(0x73D5D533B7A6E6D5), SPH_C64(0x3A8080F41DBA7480), + SPH_C64(0xC2BEBE27617C99BE), SPH_C64(0x13CDCDEB87DE26CD), + SPH_C64(0xD034348968E4BD34), SPH_C64(0x3D48483290757A48), + SPH_C64(0xDBFFFF54E324ABFF), SPH_C64(0xF57A7A8DF48FF77A), + SPH_C64(0x7A9090643DEAF490), SPH_C64(0x615F5F9DBE3EC25F), + SPH_C64(0x8020203D40A01D20), SPH_C64(0xBD68680FD0D56768), + SPH_C64(0x681A1ACA3472D01A), SPH_C64(0x82AEAEB7412C19AE), + SPH_C64(0xEAB4B47D755EC9B4), SPH_C64(0x4D5454CEA8199A54), + SPH_C64(0x7693937F3BE5EC93), SPH_C64(0x8822222F44AA0D22), + SPH_C64(0x8D646463C8E90764), SPH_C64(0xE3F1F12AFF12DBF1), + SPH_C64(0xD17373CCE6A2BF73), SPH_C64(0x48121282245A9012), + SPH_C64(0x1D40407A805D3A40), SPH_C64(0x2008084810284008), + SPH_C64(0x2BC3C3959BE856C3), SPH_C64(0x97ECECDFC57B33EC), + SPH_C64(0x4BDBDB4DAB9096DB), SPH_C64(0xBEA1A1C05F1F61A1), + SPH_C64(0x0E8D8D9107831C8D), SPH_C64(0xF43D3DC87AC9F53D), + SPH_C64(0x6697975B33F1CC97), SPH_C64(0x0000000000000000), + SPH_C64(0x1BCFCFF983D436CF), SPH_C64(0xAC2B2B6E5687452B), + SPH_C64(0xC57676E1ECB39776), SPH_C64(0x328282E619B06482), + SPH_C64(0x7FD6D628B1A9FED6), SPH_C64(0x6C1B1BC33677D81B), + SPH_C64(0xEEB5B574775BC1B5), SPH_C64(0x86AFAFBE432911AF), + SPH_C64(0xB56A6A1DD4DF776A), SPH_C64(0x5D5050EAA00DBA50), + SPH_C64(0x094545578A4C1245), SPH_C64(0xEBF3F338FB18CBF3), + SPH_C64(0xC03030AD60F09D30), SPH_C64(0x9BEFEFC4C3742BEF), + SPH_C64(0xFC3F3FDA7EC3E53F), SPH_C64(0x495555C7AA1C9255), + SPH_C64(0xB2A2A2DB591079A2), SPH_C64(0x8FEAEAE9C96503EA), + SPH_C64(0x8965656ACAEC0F65), SPH_C64(0xD2BABA036968B9BA), + SPH_C64(0xBC2F2F4A5E93652F), SPH_C64(0x27C0C08E9DE74EC0), + SPH_C64(0x5FDEDE60A181BEDE), SPH_C64(0x701C1CFC386CE01C), + SPH_C64(0xD3FDFD46E72EBBFD), SPH_C64(0x294D4D1F9A64524D), + SPH_C64(0x7292927639E0E492), SPH_C64(0xC97575FAEABC8F75), + SPH_C64(0x180606360C1E3006), SPH_C64(0x128A8AAE0998248A), + SPH_C64(0xF2B2B24B7940F9B2), 
SPH_C64(0xBFE6E685D15963E6), + SPH_C64(0x380E0E7E1C36700E), SPH_C64(0x7C1F1FE73E63F81F), + SPH_C64(0x95626255C4F73762), SPH_C64(0x77D4D43AB5A3EED4), + SPH_C64(0x9AA8A8814D3229A8), SPH_C64(0x6296965231F4C496), + SPH_C64(0xC3F9F962EF3A9BF9), SPH_C64(0x33C5C5A397F666C5), + SPH_C64(0x942525104AB13525), SPH_C64(0x795959ABB220F259), + SPH_C64(0x2A8484D015AE5484), SPH_C64(0xD57272C5E4A7B772), + SPH_C64(0xE43939EC72DDD539), SPH_C64(0x2D4C4C1698615A4C), + SPH_C64(0x655E5E94BC3BCA5E), SPH_C64(0xFD78789FF085E778), + SPH_C64(0xE03838E570D8DD38), SPH_C64(0x0A8C8C980586148C), + SPH_C64(0x63D1D117BFB2C6D1), SPH_C64(0xAEA5A5E4570B41A5), + SPH_C64(0xAFE2E2A1D94D43E2), SPH_C64(0x9961614EC2F82F61), + SPH_C64(0xF6B3B3427B45F1B3), SPH_C64(0x8421213442A51521), + SPH_C64(0x4A9C9C0825D6949C), SPH_C64(0x781E1EEE3C66F01E), + SPH_C64(0x1143436186522243), SPH_C64(0x3BC7C7B193FC76C7), + SPH_C64(0xD7FCFC4FE52BB3FC), SPH_C64(0x1004042408142004), + SPH_C64(0x595151E3A208B251), SPH_C64(0x5E9999252FC7BC99), + SPH_C64(0xA96D6D22DAC44F6D), SPH_C64(0x340D0D651A39680D), + SPH_C64(0xCFFAFA79E93583FA), SPH_C64(0x5BDFDF69A384B6DF), + SPH_C64(0xE57E7EA9FC9BD77E), SPH_C64(0x9024241948B43D24), + SPH_C64(0xEC3B3BFE76D7C53B), SPH_C64(0x96ABAB9A4B3D31AB), + SPH_C64(0x1FCECEF081D13ECE), SPH_C64(0x4411119922558811), + SPH_C64(0x068F8F8303890C8F), SPH_C64(0x254E4E049C6B4A4E), + SPH_C64(0xE6B7B7667351D1B7), SPH_C64(0x8BEBEBE0CB600BEB), + SPH_C64(0xF03C3CC178CCFD3C), SPH_C64(0x3E8181FD1FBF7C81), + SPH_C64(0x6A94944035FED494), SPH_C64(0xFBF7F71CF30CEBF7), + SPH_C64(0xDEB9B9186F67A1B9), SPH_C64(0x4C13138B265F9813), + SPH_C64(0xB02C2C51589C7D2C), SPH_C64(0x6BD3D305BBB8D6D3), + SPH_C64(0xBBE7E78CD35C6BE7), SPH_C64(0xA56E6E39DCCB576E), + SPH_C64(0x37C4C4AA95F36EC4), SPH_C64(0x0C03031B060F1803), + SPH_C64(0x455656DCAC138A56), SPH_C64(0x0D44445E88491A44), + SPH_C64(0xE17F7FA0FE9EDF7F), SPH_C64(0x9EA9A9884F3721A9), + SPH_C64(0xA82A2A6754824D2A), SPH_C64(0xD6BBBB0A6B6DB1BB), + SPH_C64(0x23C1C1879FE246C1), SPH_C64(0x515353F1A602A253), + SPH_C64(0x57DCDC72A58BAEDC), SPH_C64(0x2C0B0B531627580B), + SPH_C64(0x4E9D9D0127D39C9D), SPH_C64(0xAD6C6C2BD8C1476C), + SPH_C64(0xC43131A462F59531), SPH_C64(0xCD7474F3E8B98774), + SPH_C64(0xFFF6F615F109E3F6), SPH_C64(0x0546464C8C430A46), + SPH_C64(0x8AACACA5452609AC), SPH_C64(0x1E8989B50F973C89), + SPH_C64(0x501414B42844A014), SPH_C64(0xA3E1E1BADF425BE1), + SPH_C64(0x581616A62C4EB016), SPH_C64(0xE83A3AF774D2CD3A), + SPH_C64(0xB9696906D2D06F69), SPH_C64(0x24090941122D4809), + SPH_C64(0xDD7070D7E0ADA770), SPH_C64(0xE2B6B66F7154D9B6), + SPH_C64(0x67D0D01EBDB7CED0), SPH_C64(0x93EDEDD6C77E3BED), + SPH_C64(0x17CCCCE285DB2ECC), SPH_C64(0x1542426884572A42), + SPH_C64(0x5A98982C2DC2B498), SPH_C64(0xAAA4A4ED550E49A4), + SPH_C64(0xA028287550885D28), SPH_C64(0x6D5C5C86B831DA5C), + SPH_C64(0xC7F8F86BED3F93F8), SPH_C64(0x228686C211A44486) +}; + +static const uint64_t plain_T6[256] = { + SPH_C64(0x1818D83078C01860), SPH_C64(0x23232646AF05238C), + SPH_C64(0xC6C6B891F97EC63F), SPH_C64(0xE8E8FBCD6F13E887), + SPH_C64(0x8787CB13A14C8726), SPH_C64(0xB8B8116D62A9B8DA), + SPH_C64(0x0101090205080104), SPH_C64(0x4F4F0D9E6E424F21), + SPH_C64(0x36369B6CEEAD36D8), SPH_C64(0xA6A6FF510459A6A2), + SPH_C64(0xD2D20CB9BDDED26F), SPH_C64(0xF5F50EF706FBF5F3), + SPH_C64(0x797996F280EF79F9), SPH_C64(0x6F6F30DECE5F6FA1), + SPH_C64(0x91916D3FEFFC917E), SPH_C64(0x5252F8A407AA5255), + SPH_C64(0x606047C0FD27609D), SPH_C64(0xBCBC35657689BCCA), + SPH_C64(0x9B9B372BCDAC9B56), SPH_C64(0x8E8E8A018C048E02), + SPH_C64(0xA3A3D25B1571A3B6), SPH_C64(0x0C0C6C183C600C30), 
+ SPH_C64(0x7B7B84F68AFF7BF1), SPH_C64(0x3535806AE1B535D4), + SPH_C64(0x1D1DF53A69E81D74), SPH_C64(0xE0E0B3DD4753E0A7), + SPH_C64(0xD7D721B3ACF6D77B), SPH_C64(0xC2C29C99ED5EC22F), + SPH_C64(0x2E2E435C966D2EB8), SPH_C64(0x4B4B29967A624B31), + SPH_C64(0xFEFE5DE121A3FEDF), SPH_C64(0x5757D5AE16825741), + SPH_C64(0x1515BD2A41A81554), SPH_C64(0x7777E8EEB69F77C1), + SPH_C64(0x3737926EEBA537DC), SPH_C64(0xE5E59ED7567BE5B3), + SPH_C64(0x9F9F1323D98C9F46), SPH_C64(0xF0F023FD17D3F0E7), + SPH_C64(0x4A4A20947F6A4A35), SPH_C64(0xDADA44A9959EDA4F), + SPH_C64(0x5858A2B025FA587D), SPH_C64(0xC9C9CF8FCA06C903), + SPH_C64(0x29297C528D5529A4), SPH_C64(0x0A0A5A1422500A28), + SPH_C64(0xB1B1507F4FE1B1FE), SPH_C64(0xA0A0C95D1A69A0BA), + SPH_C64(0x6B6B14D6DA7F6BB1), SPH_C64(0x8585D917AB5C852E), + SPH_C64(0xBDBD3C677381BDCE), SPH_C64(0x5D5D8FBA34D25D69), + SPH_C64(0x1010902050801040), SPH_C64(0xF4F407F503F3F4F7), + SPH_C64(0xCBCBDD8BC016CB0B), SPH_C64(0x3E3ED37CC6ED3EF8), + SPH_C64(0x05052D0A11280514), SPH_C64(0x676778CEE61F6781), + SPH_C64(0xE4E497D55373E4B7), SPH_C64(0x2727024EBB25279C), + SPH_C64(0x4141738258324119), SPH_C64(0x8B8BA70B9D2C8B16), + SPH_C64(0xA7A7F6530151A7A6), SPH_C64(0x7D7DB2FA94CF7DE9), + SPH_C64(0x95954937FBDC956E), SPH_C64(0xD8D856AD9F8ED847), + SPH_C64(0xFBFB70EB308BFBCB), SPH_C64(0xEEEECDC17123EE9F), + SPH_C64(0x7C7CBBF891C77CED), SPH_C64(0x666671CCE3176685), + SPH_C64(0xDDDD7BA78EA6DD53), SPH_C64(0x1717AF2E4BB8175C), + SPH_C64(0x4747458E46024701), SPH_C64(0x9E9E1A21DC849E42), + SPH_C64(0xCACAD489C51ECA0F), SPH_C64(0x2D2D585A99752DB4), + SPH_C64(0xBFBF2E637991BFC6), SPH_C64(0x07073F0E1B38071C), + SPH_C64(0xADADAC472301AD8E), SPH_C64(0x5A5AB0B42FEA5A75), + SPH_C64(0x8383EF1BB56C8336), SPH_C64(0x3333B666FF8533CC), + SPH_C64(0x63635CC6F23F6391), SPH_C64(0x020212040A100208), + SPH_C64(0xAAAA93493839AA92), SPH_C64(0x7171DEE2A8AF71D9), + SPH_C64(0xC8C8C68DCF0EC807), SPH_C64(0x1919D1327DC81964), + SPH_C64(0x49493B9270724939), SPH_C64(0xD9D95FAF9A86D943), + SPH_C64(0xF2F231F91DC3F2EF), SPH_C64(0xE3E3A8DB484BE3AB), + SPH_C64(0x5B5BB9B62AE25B71), SPH_C64(0x8888BC0D9234881A), + SPH_C64(0x9A9A3E29C8A49A52), SPH_C64(0x26260B4CBE2D2698), + SPH_C64(0x3232BF64FA8D32C8), SPH_C64(0xB0B0597D4AE9B0FA), + SPH_C64(0xE9E9F2CF6A1BE983), SPH_C64(0x0F0F771E33780F3C), + SPH_C64(0xD5D533B7A6E6D573), SPH_C64(0x8080F41DBA74803A), + SPH_C64(0xBEBE27617C99BEC2), SPH_C64(0xCDCDEB87DE26CD13), + SPH_C64(0x34348968E4BD34D0), SPH_C64(0x48483290757A483D), + SPH_C64(0xFFFF54E324ABFFDB), SPH_C64(0x7A7A8DF48FF77AF5), + SPH_C64(0x9090643DEAF4907A), SPH_C64(0x5F5F9DBE3EC25F61), + SPH_C64(0x20203D40A01D2080), SPH_C64(0x68680FD0D56768BD), + SPH_C64(0x1A1ACA3472D01A68), SPH_C64(0xAEAEB7412C19AE82), + SPH_C64(0xB4B47D755EC9B4EA), SPH_C64(0x5454CEA8199A544D), + SPH_C64(0x93937F3BE5EC9376), SPH_C64(0x22222F44AA0D2288), + SPH_C64(0x646463C8E907648D), SPH_C64(0xF1F12AFF12DBF1E3), + SPH_C64(0x7373CCE6A2BF73D1), SPH_C64(0x121282245A901248), + SPH_C64(0x40407A805D3A401D), SPH_C64(0x0808481028400820), + SPH_C64(0xC3C3959BE856C32B), SPH_C64(0xECECDFC57B33EC97), + SPH_C64(0xDBDB4DAB9096DB4B), SPH_C64(0xA1A1C05F1F61A1BE), + SPH_C64(0x8D8D9107831C8D0E), SPH_C64(0x3D3DC87AC9F53DF4), + SPH_C64(0x97975B33F1CC9766), SPH_C64(0x0000000000000000), + SPH_C64(0xCFCFF983D436CF1B), SPH_C64(0x2B2B6E5687452BAC), + SPH_C64(0x7676E1ECB39776C5), SPH_C64(0x8282E619B0648232), + SPH_C64(0xD6D628B1A9FED67F), SPH_C64(0x1B1BC33677D81B6C), + SPH_C64(0xB5B574775BC1B5EE), SPH_C64(0xAFAFBE432911AF86), + SPH_C64(0x6A6A1DD4DF776AB5), SPH_C64(0x5050EAA00DBA505D), + 
SPH_C64(0x4545578A4C124509), SPH_C64(0xF3F338FB18CBF3EB), + SPH_C64(0x3030AD60F09D30C0), SPH_C64(0xEFEFC4C3742BEF9B), + SPH_C64(0x3F3FDA7EC3E53FFC), SPH_C64(0x5555C7AA1C925549), + SPH_C64(0xA2A2DB591079A2B2), SPH_C64(0xEAEAE9C96503EA8F), + SPH_C64(0x65656ACAEC0F6589), SPH_C64(0xBABA036968B9BAD2), + SPH_C64(0x2F2F4A5E93652FBC), SPH_C64(0xC0C08E9DE74EC027), + SPH_C64(0xDEDE60A181BEDE5F), SPH_C64(0x1C1CFC386CE01C70), + SPH_C64(0xFDFD46E72EBBFDD3), SPH_C64(0x4D4D1F9A64524D29), + SPH_C64(0x92927639E0E49272), SPH_C64(0x7575FAEABC8F75C9), + SPH_C64(0x0606360C1E300618), SPH_C64(0x8A8AAE0998248A12), + SPH_C64(0xB2B24B7940F9B2F2), SPH_C64(0xE6E685D15963E6BF), + SPH_C64(0x0E0E7E1C36700E38), SPH_C64(0x1F1FE73E63F81F7C), + SPH_C64(0x626255C4F7376295), SPH_C64(0xD4D43AB5A3EED477), + SPH_C64(0xA8A8814D3229A89A), SPH_C64(0x96965231F4C49662), + SPH_C64(0xF9F962EF3A9BF9C3), SPH_C64(0xC5C5A397F666C533), + SPH_C64(0x2525104AB1352594), SPH_C64(0x5959ABB220F25979), + SPH_C64(0x8484D015AE54842A), SPH_C64(0x7272C5E4A7B772D5), + SPH_C64(0x3939EC72DDD539E4), SPH_C64(0x4C4C1698615A4C2D), + SPH_C64(0x5E5E94BC3BCA5E65), SPH_C64(0x78789FF085E778FD), + SPH_C64(0x3838E570D8DD38E0), SPH_C64(0x8C8C980586148C0A), + SPH_C64(0xD1D117BFB2C6D163), SPH_C64(0xA5A5E4570B41A5AE), + SPH_C64(0xE2E2A1D94D43E2AF), SPH_C64(0x61614EC2F82F6199), + SPH_C64(0xB3B3427B45F1B3F6), SPH_C64(0x21213442A5152184), + SPH_C64(0x9C9C0825D6949C4A), SPH_C64(0x1E1EEE3C66F01E78), + SPH_C64(0x4343618652224311), SPH_C64(0xC7C7B193FC76C73B), + SPH_C64(0xFCFC4FE52BB3FCD7), SPH_C64(0x0404240814200410), + SPH_C64(0x5151E3A208B25159), SPH_C64(0x9999252FC7BC995E), + SPH_C64(0x6D6D22DAC44F6DA9), SPH_C64(0x0D0D651A39680D34), + SPH_C64(0xFAFA79E93583FACF), SPH_C64(0xDFDF69A384B6DF5B), + SPH_C64(0x7E7EA9FC9BD77EE5), SPH_C64(0x24241948B43D2490), + SPH_C64(0x3B3BFE76D7C53BEC), SPH_C64(0xABAB9A4B3D31AB96), + SPH_C64(0xCECEF081D13ECE1F), SPH_C64(0x1111992255881144), + SPH_C64(0x8F8F8303890C8F06), SPH_C64(0x4E4E049C6B4A4E25), + SPH_C64(0xB7B7667351D1B7E6), SPH_C64(0xEBEBE0CB600BEB8B), + SPH_C64(0x3C3CC178CCFD3CF0), SPH_C64(0x8181FD1FBF7C813E), + SPH_C64(0x94944035FED4946A), SPH_C64(0xF7F71CF30CEBF7FB), + SPH_C64(0xB9B9186F67A1B9DE), SPH_C64(0x13138B265F98134C), + SPH_C64(0x2C2C51589C7D2CB0), SPH_C64(0xD3D305BBB8D6D36B), + SPH_C64(0xE7E78CD35C6BE7BB), SPH_C64(0x6E6E39DCCB576EA5), + SPH_C64(0xC4C4AA95F36EC437), SPH_C64(0x03031B060F18030C), + SPH_C64(0x5656DCAC138A5645), SPH_C64(0x44445E88491A440D), + SPH_C64(0x7F7FA0FE9EDF7FE1), SPH_C64(0xA9A9884F3721A99E), + SPH_C64(0x2A2A6754824D2AA8), SPH_C64(0xBBBB0A6B6DB1BBD6), + SPH_C64(0xC1C1879FE246C123), SPH_C64(0x5353F1A602A25351), + SPH_C64(0xDCDC72A58BAEDC57), SPH_C64(0x0B0B531627580B2C), + SPH_C64(0x9D9D0127D39C9D4E), SPH_C64(0x6C6C2BD8C1476CAD), + SPH_C64(0x3131A462F59531C4), SPH_C64(0x7474F3E8B98774CD), + SPH_C64(0xF6F615F109E3F6FF), SPH_C64(0x46464C8C430A4605), + SPH_C64(0xACACA5452609AC8A), SPH_C64(0x8989B50F973C891E), + SPH_C64(0x1414B42844A01450), SPH_C64(0xE1E1BADF425BE1A3), + SPH_C64(0x1616A62C4EB01658), SPH_C64(0x3A3AF774D2CD3AE8), + SPH_C64(0x696906D2D06F69B9), SPH_C64(0x090941122D480924), + SPH_C64(0x7070D7E0ADA770DD), SPH_C64(0xB6B66F7154D9B6E2), + SPH_C64(0xD0D01EBDB7CED067), SPH_C64(0xEDEDD6C77E3BED93), + SPH_C64(0xCCCCE285DB2ECC17), SPH_C64(0x42426884572A4215), + SPH_C64(0x98982C2DC2B4985A), SPH_C64(0xA4A4ED550E49A4AA), + SPH_C64(0x28287550885D28A0), SPH_C64(0x5C5C86B831DA5C6D), + SPH_C64(0xF8F86BED3F93F8C7), SPH_C64(0x8686C211A4448622) +}; + +static const uint64_t plain_T7[256] = { + SPH_C64(0x18D83078C0186018), 
SPH_C64(0x232646AF05238C23), + SPH_C64(0xC6B891F97EC63FC6), SPH_C64(0xE8FBCD6F13E887E8), + SPH_C64(0x87CB13A14C872687), SPH_C64(0xB8116D62A9B8DAB8), + SPH_C64(0x0109020508010401), SPH_C64(0x4F0D9E6E424F214F), + SPH_C64(0x369B6CEEAD36D836), SPH_C64(0xA6FF510459A6A2A6), + SPH_C64(0xD20CB9BDDED26FD2), SPH_C64(0xF50EF706FBF5F3F5), + SPH_C64(0x7996F280EF79F979), SPH_C64(0x6F30DECE5F6FA16F), + SPH_C64(0x916D3FEFFC917E91), SPH_C64(0x52F8A407AA525552), + SPH_C64(0x6047C0FD27609D60), SPH_C64(0xBC35657689BCCABC), + SPH_C64(0x9B372BCDAC9B569B), SPH_C64(0x8E8A018C048E028E), + SPH_C64(0xA3D25B1571A3B6A3), SPH_C64(0x0C6C183C600C300C), + SPH_C64(0x7B84F68AFF7BF17B), SPH_C64(0x35806AE1B535D435), + SPH_C64(0x1DF53A69E81D741D), SPH_C64(0xE0B3DD4753E0A7E0), + SPH_C64(0xD721B3ACF6D77BD7), SPH_C64(0xC29C99ED5EC22FC2), + SPH_C64(0x2E435C966D2EB82E), SPH_C64(0x4B29967A624B314B), + SPH_C64(0xFE5DE121A3FEDFFE), SPH_C64(0x57D5AE1682574157), + SPH_C64(0x15BD2A41A8155415), SPH_C64(0x77E8EEB69F77C177), + SPH_C64(0x37926EEBA537DC37), SPH_C64(0xE59ED7567BE5B3E5), + SPH_C64(0x9F1323D98C9F469F), SPH_C64(0xF023FD17D3F0E7F0), + SPH_C64(0x4A20947F6A4A354A), SPH_C64(0xDA44A9959EDA4FDA), + SPH_C64(0x58A2B025FA587D58), SPH_C64(0xC9CF8FCA06C903C9), + SPH_C64(0x297C528D5529A429), SPH_C64(0x0A5A1422500A280A), + SPH_C64(0xB1507F4FE1B1FEB1), SPH_C64(0xA0C95D1A69A0BAA0), + SPH_C64(0x6B14D6DA7F6BB16B), SPH_C64(0x85D917AB5C852E85), + SPH_C64(0xBD3C677381BDCEBD), SPH_C64(0x5D8FBA34D25D695D), + SPH_C64(0x1090205080104010), SPH_C64(0xF407F503F3F4F7F4), + SPH_C64(0xCBDD8BC016CB0BCB), SPH_C64(0x3ED37CC6ED3EF83E), + SPH_C64(0x052D0A1128051405), SPH_C64(0x6778CEE61F678167), + SPH_C64(0xE497D55373E4B7E4), SPH_C64(0x27024EBB25279C27), + SPH_C64(0x4173825832411941), SPH_C64(0x8BA70B9D2C8B168B), + SPH_C64(0xA7F6530151A7A6A7), SPH_C64(0x7DB2FA94CF7DE97D), + SPH_C64(0x954937FBDC956E95), SPH_C64(0xD856AD9F8ED847D8), + SPH_C64(0xFB70EB308BFBCBFB), SPH_C64(0xEECDC17123EE9FEE), + SPH_C64(0x7CBBF891C77CED7C), SPH_C64(0x6671CCE317668566), + SPH_C64(0xDD7BA78EA6DD53DD), SPH_C64(0x17AF2E4BB8175C17), + SPH_C64(0x47458E4602470147), SPH_C64(0x9E1A21DC849E429E), + SPH_C64(0xCAD489C51ECA0FCA), SPH_C64(0x2D585A99752DB42D), + SPH_C64(0xBF2E637991BFC6BF), SPH_C64(0x073F0E1B38071C07), + SPH_C64(0xADAC472301AD8EAD), SPH_C64(0x5AB0B42FEA5A755A), + SPH_C64(0x83EF1BB56C833683), SPH_C64(0x33B666FF8533CC33), + SPH_C64(0x635CC6F23F639163), SPH_C64(0x0212040A10020802), + SPH_C64(0xAA93493839AA92AA), SPH_C64(0x71DEE2A8AF71D971), + SPH_C64(0xC8C68DCF0EC807C8), SPH_C64(0x19D1327DC8196419), + SPH_C64(0x493B927072493949), SPH_C64(0xD95FAF9A86D943D9), + SPH_C64(0xF231F91DC3F2EFF2), SPH_C64(0xE3A8DB484BE3ABE3), + SPH_C64(0x5BB9B62AE25B715B), SPH_C64(0x88BC0D9234881A88), + SPH_C64(0x9A3E29C8A49A529A), SPH_C64(0x260B4CBE2D269826), + SPH_C64(0x32BF64FA8D32C832), SPH_C64(0xB0597D4AE9B0FAB0), + SPH_C64(0xE9F2CF6A1BE983E9), SPH_C64(0x0F771E33780F3C0F), + SPH_C64(0xD533B7A6E6D573D5), SPH_C64(0x80F41DBA74803A80), + SPH_C64(0xBE27617C99BEC2BE), SPH_C64(0xCDEB87DE26CD13CD), + SPH_C64(0x348968E4BD34D034), SPH_C64(0x483290757A483D48), + SPH_C64(0xFF54E324ABFFDBFF), SPH_C64(0x7A8DF48FF77AF57A), + SPH_C64(0x90643DEAF4907A90), SPH_C64(0x5F9DBE3EC25F615F), + SPH_C64(0x203D40A01D208020), SPH_C64(0x680FD0D56768BD68), + SPH_C64(0x1ACA3472D01A681A), SPH_C64(0xAEB7412C19AE82AE), + SPH_C64(0xB47D755EC9B4EAB4), SPH_C64(0x54CEA8199A544D54), + SPH_C64(0x937F3BE5EC937693), SPH_C64(0x222F44AA0D228822), + SPH_C64(0x6463C8E907648D64), SPH_C64(0xF12AFF12DBF1E3F1), + SPH_C64(0x73CCE6A2BF73D173), 
SPH_C64(0x1282245A90124812), + SPH_C64(0x407A805D3A401D40), SPH_C64(0x0848102840082008), + SPH_C64(0xC3959BE856C32BC3), SPH_C64(0xECDFC57B33EC97EC), + SPH_C64(0xDB4DAB9096DB4BDB), SPH_C64(0xA1C05F1F61A1BEA1), + SPH_C64(0x8D9107831C8D0E8D), SPH_C64(0x3DC87AC9F53DF43D), + SPH_C64(0x975B33F1CC976697), SPH_C64(0x0000000000000000), + SPH_C64(0xCFF983D436CF1BCF), SPH_C64(0x2B6E5687452BAC2B), + SPH_C64(0x76E1ECB39776C576), SPH_C64(0x82E619B064823282), + SPH_C64(0xD628B1A9FED67FD6), SPH_C64(0x1BC33677D81B6C1B), + SPH_C64(0xB574775BC1B5EEB5), SPH_C64(0xAFBE432911AF86AF), + SPH_C64(0x6A1DD4DF776AB56A), SPH_C64(0x50EAA00DBA505D50), + SPH_C64(0x45578A4C12450945), SPH_C64(0xF338FB18CBF3EBF3), + SPH_C64(0x30AD60F09D30C030), SPH_C64(0xEFC4C3742BEF9BEF), + SPH_C64(0x3FDA7EC3E53FFC3F), SPH_C64(0x55C7AA1C92554955), + SPH_C64(0xA2DB591079A2B2A2), SPH_C64(0xEAE9C96503EA8FEA), + SPH_C64(0x656ACAEC0F658965), SPH_C64(0xBA036968B9BAD2BA), + SPH_C64(0x2F4A5E93652FBC2F), SPH_C64(0xC08E9DE74EC027C0), + SPH_C64(0xDE60A181BEDE5FDE), SPH_C64(0x1CFC386CE01C701C), + SPH_C64(0xFD46E72EBBFDD3FD), SPH_C64(0x4D1F9A64524D294D), + SPH_C64(0x927639E0E4927292), SPH_C64(0x75FAEABC8F75C975), + SPH_C64(0x06360C1E30061806), SPH_C64(0x8AAE0998248A128A), + SPH_C64(0xB24B7940F9B2F2B2), SPH_C64(0xE685D15963E6BFE6), + SPH_C64(0x0E7E1C36700E380E), SPH_C64(0x1FE73E63F81F7C1F), + SPH_C64(0x6255C4F737629562), SPH_C64(0xD43AB5A3EED477D4), + SPH_C64(0xA8814D3229A89AA8), SPH_C64(0x965231F4C4966296), + SPH_C64(0xF962EF3A9BF9C3F9), SPH_C64(0xC5A397F666C533C5), + SPH_C64(0x25104AB135259425), SPH_C64(0x59ABB220F2597959), + SPH_C64(0x84D015AE54842A84), SPH_C64(0x72C5E4A7B772D572), + SPH_C64(0x39EC72DDD539E439), SPH_C64(0x4C1698615A4C2D4C), + SPH_C64(0x5E94BC3BCA5E655E), SPH_C64(0x789FF085E778FD78), + SPH_C64(0x38E570D8DD38E038), SPH_C64(0x8C980586148C0A8C), + SPH_C64(0xD117BFB2C6D163D1), SPH_C64(0xA5E4570B41A5AEA5), + SPH_C64(0xE2A1D94D43E2AFE2), SPH_C64(0x614EC2F82F619961), + SPH_C64(0xB3427B45F1B3F6B3), SPH_C64(0x213442A515218421), + SPH_C64(0x9C0825D6949C4A9C), SPH_C64(0x1EEE3C66F01E781E), + SPH_C64(0x4361865222431143), SPH_C64(0xC7B193FC76C73BC7), + SPH_C64(0xFC4FE52BB3FCD7FC), SPH_C64(0x0424081420041004), + SPH_C64(0x51E3A208B2515951), SPH_C64(0x99252FC7BC995E99), + SPH_C64(0x6D22DAC44F6DA96D), SPH_C64(0x0D651A39680D340D), + SPH_C64(0xFA79E93583FACFFA), SPH_C64(0xDF69A384B6DF5BDF), + SPH_C64(0x7EA9FC9BD77EE57E), SPH_C64(0x241948B43D249024), + SPH_C64(0x3BFE76D7C53BEC3B), SPH_C64(0xAB9A4B3D31AB96AB), + SPH_C64(0xCEF081D13ECE1FCE), SPH_C64(0x1199225588114411), + SPH_C64(0x8F8303890C8F068F), SPH_C64(0x4E049C6B4A4E254E), + SPH_C64(0xB7667351D1B7E6B7), SPH_C64(0xEBE0CB600BEB8BEB), + SPH_C64(0x3CC178CCFD3CF03C), SPH_C64(0x81FD1FBF7C813E81), + SPH_C64(0x944035FED4946A94), SPH_C64(0xF71CF30CEBF7FBF7), + SPH_C64(0xB9186F67A1B9DEB9), SPH_C64(0x138B265F98134C13), + SPH_C64(0x2C51589C7D2CB02C), SPH_C64(0xD305BBB8D6D36BD3), + SPH_C64(0xE78CD35C6BE7BBE7), SPH_C64(0x6E39DCCB576EA56E), + SPH_C64(0xC4AA95F36EC437C4), SPH_C64(0x031B060F18030C03), + SPH_C64(0x56DCAC138A564556), SPH_C64(0x445E88491A440D44), + SPH_C64(0x7FA0FE9EDF7FE17F), SPH_C64(0xA9884F3721A99EA9), + SPH_C64(0x2A6754824D2AA82A), SPH_C64(0xBB0A6B6DB1BBD6BB), + SPH_C64(0xC1879FE246C123C1), SPH_C64(0x53F1A602A2535153), + SPH_C64(0xDC72A58BAEDC57DC), SPH_C64(0x0B531627580B2C0B), + SPH_C64(0x9D0127D39C9D4E9D), SPH_C64(0x6C2BD8C1476CAD6C), + SPH_C64(0x31A462F59531C431), SPH_C64(0x74F3E8B98774CD74), + SPH_C64(0xF615F109E3F6FFF6), SPH_C64(0x464C8C430A460546), + SPH_C64(0xACA5452609AC8AAC), 
SPH_C64(0x89B50F973C891E89), + SPH_C64(0x14B42844A0145014), SPH_C64(0xE1BADF425BE1A3E1), + SPH_C64(0x16A62C4EB0165816), SPH_C64(0x3AF774D2CD3AE83A), + SPH_C64(0x6906D2D06F69B969), SPH_C64(0x0941122D48092409), + SPH_C64(0x70D7E0ADA770DD70), SPH_C64(0xB66F7154D9B6E2B6), + SPH_C64(0xD01EBDB7CED067D0), SPH_C64(0xEDD6C77E3BED93ED), + SPH_C64(0xCCE285DB2ECC17CC), SPH_C64(0x426884572A421542), + SPH_C64(0x982C2DC2B4985A98), SPH_C64(0xA4ED550E49A4AAA4), + SPH_C64(0x287550885D28A028), SPH_C64(0x5C86B831DA5C6D5C), + SPH_C64(0xF86BED3F93F8C7F8), SPH_C64(0x86C211A444862286) +}; +#endif /* USE_ALL_TABLES */ + + +/** + * Round constants. + */ +__device__ uint64_t InitVector_RC[10]; + +/* ====================================================================== */ + + +#define TRANSFER(dst, src) { \ + dst[0] = src ## 0; \ + dst[1] = src ## 1; \ + dst[2] = src ## 2; \ + dst[3] = src ## 3; \ + dst[4] = src ## 4; \ + dst[5] = src ## 5; \ + dst[6] = src ## 6; \ + dst[7] = src ## 7; \ +} + +#if !USE_ALL_TABLES +#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF) + +/* method disabled to reduce code size */ +__device__ __forceinline__ +static uint64_t table_skew(uint64_t val, int num) { + return ROTL64(val, 8 * num); +} + +__device__ __forceinline__ +static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ in, + int i0,int i1,int i2,int i3,int i4,int i5,int i6,int i7) +{ + uint32_t idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7; + idx0 = BYTE(in[i0], 0); + idx1 = BYTE(in[i1], 1); + idx2 = BYTE(in[i2], 2); + idx3 = BYTE(in[i3], 3); + idx4 = BYTE(in[i4], 4); + idx5 = BYTE(in[i5], 5); + idx6 = BYTE(in[i6], 6); + idx7 = BYTE(in[i7], 7); + + return xor8( + sharedMemory[idx0], + table_skew(sharedMemory[idx1], 1), + table_skew(sharedMemory[idx2], 2), + table_skew(sharedMemory[idx3], 3), + table_skew(sharedMemory[idx4], 4), + table_skew(sharedMemory[idx5], 5), + table_skew(sharedMemory[idx6], 6), + table_skew(sharedMemory[idx7], 7) + ); +} + +#else + +__device__ __forceinline__ +static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ in, +const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7) +{ + uint32_t* in32 = (uint32_t*)in; + return (sharedMemory[__byte_perm(in32[(i0 << 1)], 0, 0x4440)] ^ sharedMemory[__byte_perm(in32[(i1 << 1)], 0, 0x4441) + 256] ^ + sharedMemory[__byte_perm(in32[(i2 << 1)], 0, 0x4442) + 512] ^ sharedMemory[__byte_perm(in32[(i3 << 1)], 0, 0x4443) + 768] ^ + sharedMemory[__byte_perm(in32[(i4 << 1) + 1], 0, 0x4440) + 1024] ^ sharedMemory[__byte_perm(in32[(i5 << 1) + 1], 0, 0x4441) + 1280] ^ + sharedMemory[__byte_perm(in32[(i6 << 1) + 1], 0, 0x4442) + 1536] ^ sharedMemory[__byte_perm(in32[(i7 << 1) + 1], 0, 0x4443) + 1792]); +} +#endif /* USE_ALL_TABLES */ + +#define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) { \ + out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1), c0); \ + out ## 1 = xor1(ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2), c1); \ + out ## 2 = xor1(ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3), c2); \ + out ## 3 = xor1(ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4), c3); \ + out ## 4 = xor1(ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5), c4); \ + out ## 5 = xor1(ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6), c5); \ + out ## 6 = xor1(ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7), c6); \ + out ## 7 = xor1(ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0), c7); \ +} + +#define ROUND1(table, in, out, c) { \ + out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1), c); \ + 
out ## 1 = ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2); \ + out ## 2 = ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3); \ + out ## 3 = ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4); \ + out ## 4 = ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5); \ + out ## 5 = ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6); \ + out ## 6 = ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7); \ + out ## 7 = ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0); \ +} + +#define ROUND_KSCHED(table, in, out, c) \ + ROUND1(table, in, out, c) \ + TRANSFER(in, out) + +#define ROUND_WENC(table, in, key, out) \ + ROUND(table, in, out, key[0], key[1], key[2],key[3], key[4], key[5], key[6], key[7]) \ + TRANSFER(in, out) + + +__global__ +void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab) +{ + __shared__ uint64_t sharedMemory[2048]; + + if (threadIdx.x < 256) { + sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; + #if USE_ALL_TABLES + sharedMemory[threadIdx.x+256] = mixTob1Tox[threadIdx.x]; + sharedMemory[threadIdx.x+512] = mixTob2Tox[threadIdx.x]; + sharedMemory[threadIdx.x+768] = mixTob3Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1024] = mixTob4Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1280] = mixTob5Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1536] = mixTob6Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; + #endif + } + //__threadfence_block(); // ensure shared mem is ready + __syncthreads(); + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t n[8]; + uint64_t h[8]; + uint32_t nonce = startNounce + thread; + nonce = swab ? cuda_swab32(nonce) : nonce; + +#if HOST_MIDSTATE + uint64_t state[8]; + #pragma unroll 8 + for (int i=0; i < 8; i++) { + //state[i] = c_PaddedMessage80[i]; + AS_UINT2(&state[i]) = AS_UINT2(&c_PaddedMessage80[i]); + } +#else + #pragma unroll 8 + for (int i=0; i<8; i++) { + n[i] = c_PaddedMessage80[i]; // read data + h[i] = 0; // read state + } + + #pragma unroll 1 + for (unsigned r=0; r < 10; r++) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); + ROUND_WENC(sharedMemory, n, h, tmp); + } + + uint64_t state[8]; + #pragma unroll 8 + for (int i=0; i < 8; i++) { + state[i] = xor1(n[i],c_PaddedMessage80[i]); + } +#endif + + /// round 2 /////// + ////////////////////////////////// + n[0] = c_PaddedMessage80[8]; //read data + n[1] = REPLACE_HIDWORD(c_PaddedMessage80[9], nonce); //whirlpool + n[2] = 0x0000000000000080; //whirlpool + n[3] = 0; + n[4] = 0; + n[5] = 0; + n[6] = 0; + n[7] = 0x8002000000000000; + + #pragma unroll 8 + for (int i=0; i<8; i++) { + h[i] = state[i]; //read state + n[i] = xor1(n[i],h[i]); + } + +// #pragma unroll + for (unsigned r=0; r < 10; r++) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); + ROUND_WENC(sharedMemory, n, h, tmp); + } + + state[0] = xor3(state[0], n[0], c_PaddedMessage80[8]); + state[1] = xor3(state[1], n[1], REPLACE_HIDWORD(c_PaddedMessage80[9], nonce) ); + state[2] = xor3(state[2], n[2], 0x0000000000000080); + state[3] = xor1(state[3], n[3]); + state[4] = xor1(state[4], n[4]); + state[5] = xor1(state[5], n[5]); + state[6] = xor1(state[6], n[6]); + state[7] = xor3(state[7], n[7], 0x8002000000000000); + + uint64_t* outHash = &(((uint64_t*)outputHash)[(size_t)8 * thread]); + #pragma unroll 8 + for (unsigned i = 0; i < 8; i++) + outHash[i] = state[i]; + + } // thread < threads +} + +__global__ +void 
x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + __shared__ uint64_t sharedMemory[2048]; + + if (threadIdx.x < 256) { + sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; + #if USE_ALL_TABLES + sharedMemory[threadIdx.x+256] = mixTob1Tox[threadIdx.x]; + sharedMemory[threadIdx.x+512] = mixTob2Tox[threadIdx.x]; + sharedMemory[threadIdx.x+768] = mixTob3Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1024] = mixTob4Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1280] = mixTob5Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1536] = mixTob6Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; + #endif + } + __threadfence_block(); // ensure shared mem is ready + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); + uint32_t hashPosition = (nounce - startNounce) << 3; + uint64_t hash[8], state[8], n[8], h[8] = { 0 }; + uint8_t i; + + #pragma unroll 8 + for (i=0; i<8; i++) + n[i] = hash[i] = g_hash[hashPosition + i]; + +// #pragma unroll 10 + for (i=0; i < 10; i++) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[i]); + ROUND_WENC(sharedMemory, n, h, tmp); + } + + #pragma unroll 8 + for (i=0; i<8; i++) + state[i] = xor1(n[i], hash[i]); + + #pragma unroll 6 + for (i=1; i<7; i++) + n[i]=0; + + n[0] = 0x80; + n[7] = 0x2000000000000; + + #pragma unroll 8 + for (i=0; i < 8; i++) { + h[i] = state[i]; + n[i] = xor1(n[i], h[i]); + } + +// #pragma unroll 10 + for (i=0; i < 10; i++) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[i]); + ROUND_WENC(sharedMemory, n, h, tmp); + } + + state[0] = xor3(state[0], n[0], 0x80); + state[1] = xor1(state[1], n[1]); + state[2] = xor1(state[2], n[2]); + state[3] = xor1(state[3], n[3]); + state[4] = xor1(state[4], n[4]); + state[5] = xor1(state[5], n[5]); + state[6] = xor1(state[6], n[6]); + state[7] = xor3(state[7], n[7], 0x2000000000000); + + #pragma unroll 8 + for (i=0; i < 8; i++) + g_hash[hashPosition + i] = state[i]; + } +} + +__global__ +void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNounce) +{ + __shared__ uint64_t sharedMemory[2048]; + + if (threadIdx.x < 256) + { + sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x]; + #if USE_ALL_TABLES + sharedMemory[threadIdx.x+256] = mixTob1Tox[threadIdx.x]; + sharedMemory[threadIdx.x+512] = mixTob2Tox[threadIdx.x]; + sharedMemory[threadIdx.x+768] = mixTob3Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1024] = mixTob4Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1280] = mixTob5Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1536] = mixTob6Tox[threadIdx.x]; + sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; + #endif + } + __threadfence_block(); // ensure shared mem is ready + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nonce = startNounce + thread; + uint64_t *inpHash = (uint64_t*) &g_hash[(size_t)8 * thread]; + uint64_t h8[8]; + + #pragma unroll 8 + for (int i=0; i<8; i++) { + h8[i] = inpHash[i]; + } + + uint64_t state[8]; + uint64_t n[8]; + uint64_t h[8]; + + #pragma unroll 8 + for (int i=0; i<8; i++) { + n[i] = h8[i]; + h[i] = 0; + } + +// #pragma unroll 10 + for (unsigned r=0; r < 10; r++) { + uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ROUND_KSCHED(sharedMemory, h, 
tmp, InitVector_RC[r]);
+		ROUND_WENC(sharedMemory, n, h, tmp);
+	}
+
+	#pragma unroll 8
+	for (int i=0; i<8; i++) {
+		state[i] = xor1(n[i], h8[i]);
+		n[i]=0;
+	}
+
+	n[0] = 0x80;
+	n[7] = 0x2000000000000;
+
+	#pragma unroll 8
+	for (int i=0; i<8; i++) {
+		h[i] = state[i];
+		n[i] = xor1(n[i], h[i]);
+	}
+
+// #pragma unroll 10
+	for (unsigned r=0; r < 10; r++) {
+		uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+		ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
+		ROUND_WENC(sharedMemory, n, h, tmp);
+	}
+
+	state[0] = xor3(state[0], n[0], 0x80);
+	state[1] = xor1(state[1], n[1]);
+	state[2] = xor1(state[2], n[2]);
+	state[3] = xor1(state[3], n[3]);
+	state[4] = xor1(state[4], n[4]);
+	state[5] = xor1(state[5], n[5]);
+	state[6] = xor1(state[6], n[6]);
+	state[7] = xor3(state[7], n[7], 0x2000000000000);
+
+	bool rc = (state[3] <= ((uint64_t*)pTarget)[3]);
+	if (rc && resNounce[0] > nonce)
+		resNounce[0] = nonce;
+	}
+}
+
+__host__
+void whirlpool512_init_sm3(int thr_id, uint32_t threads, int mode)
+{
+	switch (mode) {
+	case 0: /* x15 with rotated T1-T7 (based on T0) */
+		cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice);
+#if USE_ALL_TABLES
+		cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256*8), 0, cudaMemcpyHostToDevice);
+#endif
+		break;
+
+	case 1: /* old whirlpool */
+		cudaMemcpyToSymbol(InitVector_RC, old1_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob0Tox, old1_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice);
+#if USE_ALL_TABLES
+		cudaMemcpyToSymbol(mixTob1Tox, old1_T1, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob2Tox, old1_T2, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob3Tox, old1_T3, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob4Tox, old1_T4, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob5Tox, old1_T5, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob6Tox, old1_T6, (256*8), 0, cudaMemcpyHostToDevice);
+		cudaMemcpyToSymbol(mixTob7Tox, old1_T7, (256*8), 0, cudaMemcpyHostToDevice);
+#endif
+		cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t));
+		cudaMallocHost(&h_wnounce[thr_id], sizeof(uint32_t));
+		break;
+	}
+}
+
+__host__
+void whirlpool512_free_sm3(int thr_id)
+{
+	if (h_wnounce[thr_id]) {
+		cudaFree(d_WNonce[thr_id]);
+		cudaFreeHost(h_wnounce[thr_id]);
+		h_wnounce[thr_id] = NULL;
+	}
+}
+
+__host__
+void whirlpool512_hash_64_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+{
+	dim3 grid((threads + threadsperblock-1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	x15_whirlpool_gpu_hash_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+
+	//MyStreamSynchronize(NULL, order, thr_id);
+}
+
+__host__
+extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+{
+	uint32_t result = 0xffffffff;
+
+	dim3 grid((threads + threadsperblock-1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	cudaMemset(d_WNonce[thr_id], 0xff, sizeof(uint32_t));
+
+	oldwhirlpool_gpu_finalhash_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_WNonce[thr_id]);
+
+	cudaMemcpy(h_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+	result = *h_wnounce[thr_id];
+
+	return result;
+}
+
+__host__
+void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash)
+{
+	dim3 grid((threads + threadsperblock-1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	if (threads < 256)
+		applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
+
+	oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_outputHash, 1);
+}
+
+extern void whirl_midstate(void *state, const void *input);
+
+__host__
+void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget)
+{
+	unsigned char PaddedMessage[128];
+
+	memcpy(PaddedMessage, pdata, 80);
+	memset(PaddedMessage+80, 0, 48);
+	PaddedMessage[80] = 0x80; /* ending */
+
+#if HOST_MIDSTATE
+	// compute constant first block
+	unsigned char midstate[64] = { 0 };
+	whirl_midstate(midstate, pdata);
+	memcpy(PaddedMessage, midstate, 64);
+#endif
+
+	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
+}
+
+// ------------------------------------------------------------------------------------------------
+
+__host__
+void x16_whirlpool512_init(int thr_id, uint32_t threads)
+{
+	cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice);
+#if USE_ALL_TABLES
+	cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256 * 8), 0, cudaMemcpyHostToDevice);
+#endif
+}
+
+extern void whirlpool_midstate(void *state, const void *input);
+
+__host__
+void x16_whirlpool512_setBlock_80(void *pdata)
+{
+	unsigned char PaddedMessage[128];
+
+	memcpy(PaddedMessage, pdata, 80);
+	memset(PaddedMessage + 80, 0, 48);
+	PaddedMessage[80] = 0x80; /* ending */
+
+#if HOST_MIDSTATE
+	// compute constant first block
+	unsigned char midstate[64] = { 0 };
+	whirlpool_midstate(midstate, pdata);
+	memcpy(PaddedMessage, midstate, 64);
+#endif
+
+	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
+}
+
+__host__
+void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_outputHash)
+{
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	if (threads < 256)
+		applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
+
+	oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_outputHash, 1);
+}
diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu
new file mode 100644
index 0000000000..ae5bc996e4
--- /dev/null
+++ b/x15/whirlpool.cu
@@ -0,0 +1,181 @@
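Editor's aside, before the x15/whirlpool.cu wrapper that follows: the host functions above all share one launch recipe, namely size the grid as ceil(threads / threadsperblock), launch the kernel with <<<grid, block>>>, and, for the final-hash variant, clear a single device word to 0xffffffff, then copy it back to recover the first candidate nonce. The standalone sketch below only illustrates that recipe; the kernel name, the TPB value and the fake "target test" are invented for the example, and it uses atomicMin where the kernels above use a plain compare-and-store.

#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>

#define TPB 256  // threads per block, playing the role of threadsperblock above

// Toy kernel: each thread derives its nonce from the launch offset and
// records the smallest "matching" one, like resNounce[0] in the kernels above.
__global__ void launch_pattern_demo(uint32_t threads, uint32_t startNonce, uint32_t *resNonce)
{
	uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads) {
		uint32_t nonce = startNonce + thread;
		if ((nonce & 0xFFFF) == 0)        // stand-in for "hash meets the target"
			atomicMin(resNonce, nonce);   // keep the smallest candidate, race-free
	}
}

int main()
{
	const uint32_t threads = 1U << 20;
	uint32_t h_res = UINT32_MAX, *d_res = NULL;

	cudaMalloc(&d_res, sizeof(uint32_t));
	cudaMemset(d_res, 0xff, sizeof(uint32_t));     // 0xffffffff means "nothing found"

	dim3 grid((threads + TPB - 1) / TPB);          // round up to whole blocks
	dim3 block(TPB);
	launch_pattern_demo <<<grid, block>>> (threads, 0, d_res);

	cudaMemcpy(&h_res, d_res, sizeof(uint32_t), cudaMemcpyDeviceToHost);
	printf("first matching nonce: %08x\n", h_res);
	cudaFree(d_res);
	return 0;
}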
+/* + * whirlpool routine + */ +extern "C" { +#include +#include +} + +#include + +//#define SM3_VARIANT + +#ifdef SM3_VARIANT +static uint32_t *d_hash[MAX_GPUS]; +extern void whirlpool512_init_sm3(int thr_id, uint32_t threads, int mode); +extern void whirlpool512_free_sm3(int thr_id); +extern void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget); +extern void whirlpool512_hash_64_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +//#define _DEBUG +#define _DEBUG_PREFIX "whirl" +#include +#else +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode); +extern void x15_whirlpool_cpu_free(int thr_id); +extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget); +extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, const uint64_t target); +#endif + + +// CPU Hash function +extern "C" void wcoinhash(void *state, const void *input) +{ + sph_whirlpool_context ctx_whirlpool; + + unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + #define hashB hash+64 + + memset(hash, 0, sizeof hash); + + sph_whirlpool1_init(&ctx_whirlpool); + sph_whirlpool1(&ctx_whirlpool, input, 80); + sph_whirlpool1_close(&ctx_whirlpool, hash); + + sph_whirlpool1_init(&ctx_whirlpool); + sph_whirlpool1(&ctx_whirlpool, hash, 64); + sph_whirlpool1_close(&ctx_whirlpool, hashB); + + sph_whirlpool1_init(&ctx_whirlpool); + sph_whirlpool1(&ctx_whirlpool, hashB, 64); + sph_whirlpool1_close(&ctx_whirlpool, hash); + + sph_whirlpool1_init(&ctx_whirlpool); + sph_whirlpool1(&ctx_whirlpool, hash, 64); + sph_whirlpool1_close(&ctx_whirlpool, hash); + + memcpy(state, hash, 32); +} + +void whirl_midstate(void *state, const void *input) +{ + sph_whirlpool_context ctx; + + sph_whirlpool1_init(&ctx); + sph_whirlpool1(&ctx, input, 64); + + memcpy(state, ctx.state, 64); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(128) endiandata[20]; + uint32_t* pdata = work->data; + uint32_t* ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + if (init[thr_id]) throughput = max(throughput, 256); // shared mem requirement + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + if (!init[thr_id]) { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); +#ifdef SM3_VARIANT + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); + whirlpool512_init_sm3(thr_id, throughput, 1 /* old whirlpool */); +#else + x15_whirlpool_cpu_init(thr_id, throughput, 1 /* old whirlpool */); +#endif + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) { + be32enc(&endiandata[k], pdata[k]); + } + +#ifdef SM3_VARIANT + whirlpool512_setBlock_80_sm3((void*)endiandata, 
ptarget); +#else + whirlpool512_setBlock_80((void*)endiandata, ptarget); +#endif + + do { +#ifdef SM3_VARIANT + int order = 1; + whirlpool512_hash_80_sm3(thr_id, throughput, pdata[19], d_hash[thr_id]); + TRACE64(" 80 :", d_hash); + whirlpool512_hash_64_sm3(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE64(" 64 :", d_hash); + whirlpool512_hash_64_sm3(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE64(" 64 :", d_hash); + work->nonces[0] = whirlpool512_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); +#else + whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], work->nonces, *(uint64_t*)&ptarget[6]); +#endif + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != UINT32_MAX && bench_algo < 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + wcoinhash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + pdata[19] = work->nonces[0] + 1; // cursor + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_whirl(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + +#ifdef SM3_VARIANT + cudaFree(d_hash[thr_id]); + whirlpool512_free_sm3(thr_id); +#else + x15_whirlpool_cpu_free(thr_id); +#endif + init[thr_id] = false; + + cudaDeviceSynchronize(); +} + diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu new file mode 100644 index 0000000000..e9ec79ba56 --- /dev/null +++ b/x15/whirlpoolx.cu @@ -0,0 +1,127 @@ +/* + * whirlpool routine (djm) + * whirlpoolx routine (provos alexis, tpruvot) + */ +extern "C" { +#include "sph/sph_whirlpool.h" +} + +#include "miner.h" +#include "cuda_helper.h" + +static uint32_t *d_hash[MAX_GPUS] = { 0 }; + +extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads); +extern void whirlpoolx_cpu_free(int thr_id); +extern void whirlpoolx_setBlock_80(void *pdata, const void *ptarget); +extern uint32_t whirlpoolx_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); +extern void whirlpoolx_precompute(int thr_id); + +// CPU Hash function +extern "C" void whirlxHash(void *state, const void *input) +{ + sph_whirlpool_context ctx_whirlpool; + + unsigned char hash[64]; + unsigned char hash_xored[32]; + + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, input, 80); + sph_whirlpool_close(&ctx_whirlpool, hash); + + // compress the 48 first bytes of the hash to 32 + for (int i = 0; i < 32; i++) { + hash_xored[i] = hash[i] ^ hash[i + 16]; + } + memcpy(state, hash_xored, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t endiandata[20]; + int intensity = is_windows() ? 
20 : 22; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + if (!init[thr_id]) { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); + + whirlpoolx_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) { + be32enc(&endiandata[k], pdata[k]); + } + + whirlpoolx_setBlock_80((void*)endiandata, ptarget); + whirlpoolx_precompute(thr_id); + + do { + uint32_t foundNonce = whirlpoolx_cpu_hash(thr_id, throughput, pdata[19]); + + *(hashes_done) = pdata[19] - first_nonce + throughput; + + if (foundNonce != UINT32_MAX && bench_algo < 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + whirlxHash(vhash64, endiandata); + + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + work_set_target_ratio(work, vhash64); + pdata[19] = foundNonce; + return 1; + } else { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *(hashes_done) = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_whirlx(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + whirlpoolx_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x15/x14.cu b/x15/x14.cu new file mode 100644 index 0000000000..4232c6906f --- /dev/null +++ b/x15/x14.cu @@ -0,0 +1,270 @@ +/* + * X14 algorithm + * Added in ccminer by Tanguy Pruvot - 2014 + */ + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" + +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +} + +#include "miner.h" + +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +// Memory for the hash functions +static uint32_t *d_hash[MAX_GPUS] = { 0 }; + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + + +// X14 CPU Hash function +extern "C" void x14hash(void *output, const void *input) +{ + unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + 
#define hashB hash+64 + + memset(hash, 0, sizeof hash); + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, hash, 64); + sph_bmw512_close(&ctx_bmw, hashB); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, hashB, 64); + sph_groestl512_close(&ctx_groestl, hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hashB); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, hashB, 64); + sph_jh512_close(&ctx_jh, hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, hash, 64); + sph_keccak512_close(&ctx_keccak, hashB); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, hashB, 64); + sph_luffa512_close(&ctx_luffa, hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, hash, 64); + sph_cubehash512_close(&ctx_cubehash, hashB); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, hashB, 64); + sph_shavite512_close(&ctx_shavite, hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, hash, 64); + sph_simd512_close(&ctx_simd, hashB); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, hashB, 64); + sph_echo512_close(&ctx_echo, hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, hash, 64); + sph_hamsi512_close(&ctx_hamsi, hashB); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, hashB, 64); + sph_fugue512_close(&ctx_fugue, hash); + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, hash, 64); + sph_shabal512_close(&ctx_shabal, hash); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t endiandata[20]; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + 
x13_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput); + + CUDA_LOG_ERROR(); + + init[thr_id] = true; + } + + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + CUDA_LOG_ERROR(); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + /* check now with the CPU to confirm */ + be32enc(&endiandata[19], work->nonces[0]); + x14hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x14hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + CUDA_LOG_ERROR(); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_x14(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + + cudaFree(d_hash[thr_id]); + d_hash[thr_id] = NULL; + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x15/x15.cu b/x15/x15.cu new file mode 100644 index 0000000000..cdfbd81f00 --- /dev/null +++ b/x15/x15.cu @@ -0,0 +1,275 @@ +/* + * X15 
algorithm (CHC, BBC, X15C) + * Added in ccminer by Tanguy Pruvot - 2014 + */ + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" + +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +} + +#include "miner.h" + +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +// Memory for the hash functions +static uint32_t *d_hash[MAX_GPUS] = { 0 }; + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + + +// X15 CPU Hash function +extern "C" void x15hash(void *output, const void *input) +{ + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + + unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + #define hashB hash+64 + + memset(hash, 0, sizeof hash); + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, hash, 64); + sph_bmw512_close(&ctx_bmw, hashB); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, hashB, 64); + sph_groestl512_close(&ctx_groestl, hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hashB); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, hashB, 64); + sph_jh512_close(&ctx_jh, hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, hash, 64); + sph_keccak512_close(&ctx_keccak, hashB); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, hashB, 64); + sph_luffa512_close(&ctx_luffa, hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, hash, 64); + sph_cubehash512_close(&ctx_cubehash, hashB); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, hashB, 64); + sph_shavite512_close(&ctx_shavite, hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, hash, 64); + sph_simd512_close(&ctx_simd, hashB); + + 
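+	/* Editor's note: the sph_* stages above and below ping-pong between the two
+	 * 64-byte halves of hash[128] (hashB is defined as hash+64): each stage hashes
+	 * the half written by the previous stage into the other half, and only the
+	 * first 32 bytes of the final whirlpool output are copied to *output. */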
sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, hashB, 64); + sph_echo512_close(&ctx_echo, hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, hash, 64); + sph_hamsi512_close(&ctx_hamsi, hashB); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, hashB, 64); + sph_fugue512_close(&ctx_fugue, hash); + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, hash, 64); + sph_shabal512_close(&ctx_shabal, hashB); + + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, hashB, 64); + sph_whirlpool_close(&ctx_whirlpool, hash); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + uint32_t endiandata[20]; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x00FF; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + 
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + /* check now with the CPU to confirm */ + be32enc(&endiandata[19], work->nonces[0]); + x15hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x15hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_x15(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x16/cuda_x16.h b/x16/cuda_x16.h new file mode 100644 index 0000000000..67b205a2a3 --- /dev/null +++ b/x16/cuda_x16.h @@ -0,0 +1,80 @@ +#include "x11/cuda_x11.h" + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash); + +extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, const int outlen); + +void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t 
*d_nonceVector, uint32_t *d_outputHash, int order); + +// ---- optimised but non compatible kernels + +void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + +// ---- 80 bytes kernels + +void quark_bmw512_cpu_setBlock_80(void *pdata); +void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); + +void groestl512_setBlock_80(int thr_id, uint32_t *endiandata); +void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void skein512_cpu_setBlock_80(void *pdata); +void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap); + +void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); +void qubit_luffa512_cpu_setBlock_80(void *pdata); +void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); + +void jh512_setBlock_80(int thr_id, uint32_t *endiandata); +void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void keccak512_setBlock_80(int thr_id, uint32_t *endiandata); +void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x11_shavite512_setBlock_80(void *pdata); +void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); + +void x16_shabal512_setBlock_80(void *pdata); +void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_simd512_setBlock_80(void *pdata); +void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_echo512_cuda_init(int thr_id, const uint32_t threads); +void x16_echo512_setBlock_80(void *pdata); +void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_hamsi512_setBlock_80(void *pdata); +void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_fugue512_cpu_init(int thr_id, uint32_t threads); +void x16_fugue512_cpu_free(int thr_id); +void x16_fugue512_setBlock_80(void *pdata); +void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_whirlpool512_init(int thr_id, uint32_t threads); +void x16_whirlpool512_setBlock_80(void* endiandata); +void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_sha512_setBlock_80(void *pdata); +void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + diff --git a/x16/cuda_x16_echo512.cu b/x16/cuda_x16_echo512.cu new file mode 100644 index 0000000000..5e6013d2ab --- /dev/null +++ b/x16/cuda_x16_echo512.cu @@ -0,0 +1,214 @@ +/** + * echo512-80 cuda kernel for X16R algorithm + * + * tpruvot 2018 - GPL code + */ + +#include +#include + +#include "cuda_helper.h" + +extern __device__ __device_builtin__ void __threadfence_block(void); + +#include "../x11/cuda_x11_aes.cuh" + +__device__ __forceinline__ void AES_2ROUND(const uint32_t* __restrict__ sharedMemory, + uint32_t &x0, uint32_t &x1, uint32_t &x2, 
uint32_t &x3, + uint32_t &k0) +{ + uint32_t y0, y1, y2, y3; + + aes_round(sharedMemory, + x0, x1, x2, x3, + k0, + y0, y1, y2, y3); + + aes_round(sharedMemory, + y0, y1, y2, y3, + x0, x1, x2, x3); + + k0++; +} + +__device__ +static void echo_round(uint32_t* const sharedMemory, uint32_t *W, uint32_t &k0) +{ + // Big Sub Words + #pragma unroll 16 + for (int idx = 0; idx < 16; idx++) { + AES_2ROUND(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0); + } + + // Shift Rows + #pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t t[4]; + /// 1, 5, 9, 13 + t[0] = W[i + 4]; + t[1] = W[i + 8]; + t[2] = W[i + 24]; + t[3] = W[i + 60]; + + W[i + 4] = W[i + 20]; + W[i + 8] = W[i + 40]; + W[i + 24] = W[i + 56]; + W[i + 60] = W[i + 44]; + + W[i + 20] = W[i + 36]; + W[i + 40] = t[1]; + W[i + 56] = t[2]; + W[i + 44] = W[i + 28]; + + W[i + 28] = W[i + 12]; + W[i + 12] = t[3]; + W[i + 36] = W[i + 52]; + W[i + 52] = t[0]; + } + + // Mix Columns + #pragma unroll 4 + for (int i = 0; i < 4; i++) + { + #pragma unroll 4 + for (int idx = 0; idx < 64; idx += 16) + { + uint32_t a[4]; + a[0] = W[idx + i]; + a[1] = W[idx + i + 4]; + a[2] = W[idx + i + 8]; + a[3] = W[idx + i + 12]; + + uint32_t ab = a[0] ^ a[1]; + uint32_t bc = a[1] ^ a[2]; + uint32_t cd = a[2] ^ a[3]; + + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[idx + i] = bc ^ a[3] ^ abx; + W[idx + i + 4] = a[0] ^ cd ^ bcx; + W[idx + i + 8] = ab ^ a[3] ^ cdx; + W[idx + i + 12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx); + } + } +} + +__device__ __forceinline__ +void cuda_echo_round_80(uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ data, const uint32_t nonce, uint32_t *hash) +{ + uint32_t h[29]; // <= 127 bytes input + + #pragma unroll 8 + for (int i = 0; i < 18; i += 2) + AS_UINT2(&h[i]) = AS_UINT2(&data[i]); + h[18] = data[18]; + h[19] = cuda_swab32(nonce); + h[20] = 0x80; + h[21] = h[22] = h[23] = h[24] = h[25] = h[26] = 0; + //((uint8_t*)h)[80] = 0x80; + //((uint8_t*)h)[128-17] = 0x02; + //((uint8_t*)h)[128-16] = 0x80; + //((uint8_t*)h)[128-15] = 0x02; + h[27] = 0x2000000; + h[28] = 0x280; + //h[29] = h[30] = h[31] = 0; + + uint32_t k0 = 640; // bitlen + uint32_t W[64]; + + #pragma unroll 8 + for (int i = 0; i < 32; i+=4) { + W[i] = 512; // L + W[i+1] = 0; // H + W[i+2] = 0; // X + W[i+3] = 0; + } + + uint32_t Z[16]; + #pragma unroll + for (int i = 0; i<16; i++) Z[i] = W[i]; + #pragma unroll + for (int i = 32; i<61; i++) W[i] = h[i - 32]; + #pragma unroll + for (int i = 61; i<64; i++) W[i] = 0; + + for (int i = 0; i < 10; i++) + echo_round(sharedMemory, W, k0); + + #pragma unroll 16 + for (int i = 0; i < 16; i++) { + Z[i] ^= h[i] ^ W[i] ^ W[i + 32]; + } + + #pragma unroll 8 + for (int i = 0; i < 16; i += 2) + AS_UINT2(&hash[i]) = AS_UINT2(&Z[i]); +} + +__device__ __forceinline__ +void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 128) { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; + sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; + + sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; + 
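+		/* Editor's note: 128 threads cooperate here; each loads the words at index
+		 * threadIdx.x and threadIdx.x + 128 of each of the four 256-word AES tables
+		 * (d_AES0..d_AES3), so all 1024 shared words are populated. */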
sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+	}
+}
+
+__host__
+void x16_echo512_cuda_init(int thr_id, const uint32_t threads)
+{
+	aes_cpu_init(thr_id);
+}
+
+__constant__ static uint32_t c_PaddedMessage80[20];
+
+__host__
+void x16_echo512_setBlock_80(void *endiandata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+__global__ __launch_bounds__(128, 7) /* will force 72 registers */
+void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash)
+{
+	__shared__ uint32_t sharedMemory[1024];
+
+	echo_gpu_init(sharedMemory);
+	__threadfence_block();
+
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint64_t hashPosition = thread;
+		uint32_t *pHash = (uint32_t*)&g_hash[hashPosition<<3];
+
+		cuda_echo_round_80(sharedMemory, c_PaddedMessage80, startNonce + thread, pHash);
+	}
+}
+
+__host__
+void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = 128;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_echo512_gpu_hash_80<<<grid, block>>>(threads, startNonce, (uint64_t*)d_hash);
+}
diff --git a/x16/cuda_x16_echo512_64.cu b/x16/cuda_x16_echo512_64.cu
new file mode 100644
index 0000000000..3a0f268725
--- /dev/null
+++ b/x16/cuda_x16_echo512_64.cu
@@ -0,0 +1,260 @@
+/**
+ * Echo512-64 kernel for maxwell, based on alexis work
+ */
+
+#include
+#include
+#include
+
+#define INTENSIVE_GMF
+#include "tribus/cuda_echo512_aes.cuh"
+
+#ifdef __INTELLISENSE__
+#define __byte_perm(x, y, b) x
+#define atomicExch(p,y) (*p) = y
+#endif
+
+__device__
+static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0)
+{
+	// Big Sub Words
+	#pragma unroll 16
+	for (int idx = 0; idx < 16; idx++)
+		AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0);
+
+	// Shift Rows
+	#pragma unroll 4
+	for (int i = 0; i < 4; i++){
+		uint32_t t[4];
+		/// 1, 5, 9, 13
+		t[0] = W[i+ 4];
+		t[1] = W[i+ 8];
+		t[2] = W[i+24];
+		t[3] = W[i+60];
+		W[i + 4] = W[i + 20];
+		W[i + 8] = W[i + 40];
+		W[i +24] = W[i + 56];
+		W[i +60] = W[i + 44];
+
+		W[i +20] = W[i +36];
+		W[i +40] = t[1];
+		W[i +56] = t[2];
+		W[i +44] = W[i +28];
+
+		W[i +28] = W[i +12];
+		W[i +12] = t[3];
+		W[i +36] = W[i +52];
+		W[i +52] = t[0];
+	}
+	// Mix Columns
+	#pragma unroll 4
+	for (int i = 0; i < 4; i++){ // loop over 2*uint32_t at a time
+		#pragma unroll 4
+		for (int idx = 0; idx < 64; idx += 16){ // loop over the elements
+			uint32_t a[4];
+			a[0] = W[idx + i];
+			a[1] = W[idx + i + 4];
+			a[2] = W[idx + i + 8];
+			a[3] = W[idx + i +12];
+
+			uint32_t ab = a[0] ^ a[1];
+			uint32_t bc = a[1] ^ a[2];
+			uint32_t cd = a[2] ^ a[3];
+
+			uint32_t t, t2, t3;
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+			uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[idx + i] = bc ^ a[3] ^ abx;
+			W[idx + i + 4] = a[0] ^ cd ^ bcx;
+			W[idx + i + 8] = ab ^ a[3] ^ cdx;
+			W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
+		}
+	}
+}
+
+__global__ __launch_bounds__(128, 5) /* will force 80 registers */
+static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t* g_hash, uint32_t* const d_filter, const uint32_t filter_val)
+{ + __shared__ uint32_t sharedMemory[4][256]; + + aes_gpu_init128(sharedMemory); + __syncthreads(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t k0; + uint32_t h[16]; + uint32_t hash[16]; + if (thread < threads) + { + // phi2 filter (2 hash chain branches) + if (d_filter && d_filter[thread] != filter_val) return; + + uint32_t *Hash = &g_hash[thread<<4]; + + *(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]); + *(uint2x4*)&h[ 8] = __ldg4((uint2x4*)&Hash[ 8]); + + *(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0]; + *(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8]; + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //8-12 + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //21-25 + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //34-38 + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + //58-61 + }; + + k0 = 520; + + #pragma unroll 4 + for (uint32_t idx = 0; idx < 16; idx += 4) { + AES_2ROUND(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); + } + k0 += 4; + + uint32_t W[64]; + + #pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) + { + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i + 8]; + uint32_t d = P[i + 8]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + + uint32_t t = (ab & 0x80808080); + uint32_t t2 = (bc & 0x80808080); + uint32_t t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[i] = abx ^ bc ^ d; + W[i + 4] = bcx ^ a ^ cd; + W[i + 8] = cdx ^ ab ^ d; + W[i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[i +12]; + b = h[i + 4]; + c = P[i +16]; + d = P[i +20]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[16 + i] = bc ^ d ^ abx; + W[16 + i + 4] = a ^ cd ^ bcx; + W[16 + i + 8] = d ^ ab ^ cdx; + W[16 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + + a = h[i]; + b = P[24 + i + 0]; + c = P[24 + i + 4]; + d = P[24 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[32 + i] = bc ^ d ^ abx; + W[32 + i + 4] = a ^ cd ^ bcx; + W[32 + i + 8] = d ^ ab ^ cdx; + W[32 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + + a = P[36 + i ]; + b = P[36 + i + 4]; + c = P[36 + i + 8]; + d = h[i + 12]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[48 + i] = bc ^ d ^ abx; + W[48 + i + 4] = a ^ cd ^ bcx; + W[48 + i + 8] = d ^ ab ^ cdx; + W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + } + + for (int k = 1; k < 10; k++) + echo_round_alexis(sharedMemory,W,k0); + + #pragma unroll 4 + for (int i = 0; 
i < 16; i += 4) + { + W[i] ^= W[32 + i] ^ 512; + W[i + 1] ^= W[32 + i + 1]; + W[i + 2] ^= W[32 + i + 2]; + W[i + 3] ^= W[32 + i + 3]; + } + *(uint2x4*)&Hash[ 0] = *(uint2x4*)&hash[ 0] ^ *(uint2x4*)&W[ 0]; + *(uint2x4*)&Hash[ 8] = *(uint2x4*)&hash[ 8] ^ *(uint2x4*)&W[ 8]; + } +} + +__host__ +void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x16_echo512_gpu_hash_64 <<>> (threads, d_hash, NULL, 0); +} + +__host__ +void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter) +{ + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + x16_echo512_gpu_hash_64 <<>> (threads, g_hash, d_filter, 0); +} \ No newline at end of file diff --git a/x16/cuda_x16_fugue512.cu b/x16/cuda_x16_fugue512.cu new file mode 100644 index 0000000000..7f3438c2bd --- /dev/null +++ b/x16/cuda_x16_fugue512.cu @@ -0,0 +1,467 @@ + +#include +#include + +#define TPB 256 + +/* + * fugue512-80 x16r kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2018 tpruvot + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + */ + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, m) (x|y) +#define tex1Dfetch(t, n) (n) +#define __CUDACC__ +#include +#endif + +// store allocated textures device addresses +static unsigned int* d_textures[MAX_GPUS][1]; + +#define mixtab0(x) mixtabs[(x)] +#define mixtab1(x) mixtabs[(x)+256] +#define mixtab2(x) mixtabs[(x)+512] +#define mixtab3(x) mixtabs[(x)+768] + +static texture mixTab0Tex; + +static const uint32_t mixtab0[] = { + 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39, + 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3, + 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed, + 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d, + 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, + 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54, + 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, + 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, + 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, + 0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, + 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, + 0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, + 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, + 0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, + 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9, + 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, + 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c, + 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, + 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516, + 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, + 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b, + 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, + 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, + 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, + 0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, + 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, + 0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, + 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, + 0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, + 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51, + 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 
0x686803bb, + 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258 +}; + +#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ + x22 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ +} + +#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ +} + +#define SMIX(x0, x1, x2, x3) { \ + uint32_t tmp; \ + uint32_t r0 = 0; \ + uint32_t r1 = 0; \ + uint32_t r2 = 0; \ + uint32_t r3 = 0; \ + uint32_t c0 = mixtab0(x0 >> 24); \ + tmp = mixtab1((x0 >> 16) & 0xFF); \ + c0 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x0 >> 8) & 0xFF); \ + c0 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x0 & 0xFF); \ + c0 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x1 >> 24); \ + uint32_t c1 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x1 >> 16) & 0xFF); \ + c1 ^= tmp; \ + tmp = mixtab2((x1 >> 8) & 0xFF); \ + c1 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x1 & 0xFF); \ + c1 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x2 >> 24); \ + uint32_t c2 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x2 >> 16) & 0xFF); \ + c2 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x2 >> 8) & 0xFF); \ + c2 ^= tmp; \ + tmp = mixtab3(x2 & 0xFF); \ + c2 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x3 >> 24); \ + uint32_t c3 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x3 >> 16) & 0xFF); \ + c3 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x3 >> 8) & 0xFF); \ + c3 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x3 & 0xFF); \ + c3 ^= tmp; \ + x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \ + | ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \ + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) \ + | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \ + x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \ + | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \ + x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) \ + | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); \ +} + +#define SUB_ROR3 { \ + B33 = S33, B34 = S34, B35 = S35; \ + S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \ + S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \ + S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \ + S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \ +} + +#define SUB_ROR8 { \ + B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ + S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \ + S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \ + S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \ + S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \ +} + +#define SUB_ROR9 { \ + B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ + S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \ + S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \ + S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = 
S01; S09 = S00; \ + S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \ +} + +#define SUB_ROR9_3 { \ + SUB_ROR3; SUB_ROR3; SUB_ROR3; \ +} + +#define SUB_ROR12 { /* to fix */ \ + B24 = S00; B25 = S01; B26 = S02; B27 = S03; B28 = S04; B29 = S05; B30 = S06; B31 = S07; B32 = S08; B33 = S09; B34 = S10; B35 = S11; \ + S00 = S12; S01 = S13; S02 = S14; S03 = S15; S04 = S16; S05 = S17; S06 = S18; S07 = S19; S08 = S20; S09 = S21; S10 = S22; S11 = S23; \ + S12 = S24; S13 = S25; S14 = S26; S15 = S27; S16 = S28; S17 = S29; S18 = S30; S19 = S31; S20 = S32; S21 = S33; S22 = S34; S23 = S35; \ + S24 = B24; S25 = B25; S26 = B26; S27 = B27; S28 = B28; S29 = B29; S30 = B30; S31 = B31; S32 = B32; S33 = B33; S34 = B34; S35 = B35; \ +} + +#define FUGUE512_3(x, y, z) { \ + TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ + \ + TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \ + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \ + SMIX(S21, S22, S23, S24); \ + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \ + SMIX(S18, S19, S20, S21); \ + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \ + SMIX(S15, S16, S17, S18); \ + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \ + SMIX(S12, S13, S14, S15); \ + \ + TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \ + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \ + SMIX(S09, S10, S11, S12); \ + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \ + SMIX(S06, S07, S08, S09); \ + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \ + SMIX(S03, S04, S05, S06); \ + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \ + SMIX(S00, S01, S02, S03); \ +} + +#define FUGUE512_F(w, x, y, z) { \ + TIX4(w, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ + \ + TIX4(x, S24, S25, S28, S31, S32, S10, S12, S15, S18); \ + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \ + SMIX(S21, S22, S23, S24); \ + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \ + SMIX(S18, S19, S20, S21); \ + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \ + SMIX(S15, S16, S17, S18); \ + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \ + SMIX(S12, S13, S14, S15); \ + \ + TIX4(y, S12, S13, S16, S19, S20, S34, S00, S03, S06); \ + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \ + SMIX(S09, S10, S11, S12); \ + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \ + SMIX(S06, S07, S08, S09); \ + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \ + SMIX(S03, S04, S05, S06); \ + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \ + SMIX(S00, S01, S02, S03); \ + \ + TIX4(z, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, 
S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ +} + +#undef ROL8 +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ +uint32_t ROL8(const uint32_t a) { + return __byte_perm(a, 0, 0x2103); +} +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a) { + return __byte_perm(a, 0, 0x0321); +} +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a) { + return __byte_perm(a, 0, 0x1032); +} +#else +#define ROL8(u) ROTL32(u, 8) +#define ROR8(u) ROTR32(u, 8) +#define ROL16(u) ROTL32(u,16) +#endif + +//#define AS_UINT4(addr) *((uint4*)(addr)) + +__constant__ static uint64_t c_PaddedMessage80[10]; + +__host__ +void x16_fugue512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +/***************************************************/ + +__global__ +__launch_bounds__(TPB) +void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +{ + __shared__ uint32_t mixtabs[1024]; + + // load shared mem (with 256 threads) + const uint32_t thr = threadIdx.x & 0xFF; + const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr); + mixtabs[thr] = tmp; + mixtabs[thr+256] = ROR8(tmp); + mixtabs[thr+512] = ROL16(tmp); + mixtabs[thr+768] = ROL8(tmp); +#if TPB <= 256 + if (blockDim.x < 256) { + const uint32_t thr = (threadIdx.x + 0x80) & 0xFF; + const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr); + mixtabs[thr] = tmp; + mixtabs[thr + 256] = ROR8(tmp); + mixtabs[thr + 512] = ROL16(tmp); + mixtabs[thr + 768] = ROL8(tmp); + } +#endif + + __syncthreads(); + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t Data[20]; + + #pragma unroll + for(int i = 0; i < 10; i++) + AS_UINT2(&Data[i * 2]) = AS_UINT2(&c_PaddedMessage80[i]); + Data[19] = (startNonce + thread); + + uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11; + uint32_t S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23; + uint32_t S24, S25, S26, S27, S28, S29, S30, S31, S32, S33, S34, S35; + //uint32_t B24, B25, B26, + uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; + //const uint64_t bc = 640 bits to hash + //const uint32_t bclo = (uint32_t)(bc); + //const uint32_t bchi = (uint32_t)(bc >> 32); + + S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0; + S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; + S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027; + S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1; + S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f; + S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567; + + FUGUE512_3((Data[ 0]), (Data[ 1]), (Data[ 2])); + FUGUE512_3((Data[ 3]), (Data[ 4]), (Data[ 5])); + FUGUE512_3((Data[ 6]), (Data[ 7]), (Data[ 8])); + FUGUE512_3((Data[ 9]), (Data[10]), (Data[11])); + FUGUE512_3((Data[12]), (Data[13]), (Data[14])); + FUGUE512_3((Data[15]), (Data[16]), (Data[17])); + FUGUE512_F((Data[18]), (Data[19]), 0/*bchi*/, (80*8)/*bclo*/); + + // rotate right state by 3 dwords (S00 = S33, S03 = S00) + SUB_ROR3; + SUB_ROR9; + + #pragma unroll 32 + for (int i = 0; i < 32; i++) { + SUB_ROR3; + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + } + #pragma unroll 13 + for (int i = 0; i < 13; i++) { + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + 
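+		/* the XOR'ed column drifts by one word each sub-round: S09->S10 here,
+		   then S18->S19, then S27->S28; the final sub-round uses SUB_ROR8
+		   instead of SUB_ROR9 */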
S10 ^= S00; + S18 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S28 ^= S00; + SUB_ROR8; + SMIX(S00, S01, S02, S03); + } + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + + Data[ 0] = cuda_swab32(S01); + Data[ 1] = cuda_swab32(S02); + Data[ 2] = cuda_swab32(S03); + Data[ 3] = cuda_swab32(S04); + Data[ 4] = cuda_swab32(S09); + Data[ 5] = cuda_swab32(S10); + Data[ 6] = cuda_swab32(S11); + Data[ 7] = cuda_swab32(S12); + Data[ 8] = cuda_swab32(S18); + Data[ 9] = cuda_swab32(S19); + Data[10] = cuda_swab32(S20); + Data[11] = cuda_swab32(S21); + Data[12] = cuda_swab32(S27); + Data[13] = cuda_swab32(S28); + Data[14] = cuda_swab32(S29); + Data[15] = cuda_swab32(S30); + + const size_t hashPosition = thread; + uint64_t* pHash = &g_hash[hashPosition << 3]; + #pragma unroll 4 + for(int i = 0; i < 4; i++) + AS_UINT4(&pHash[i * 2]) = AS_UINT4(&Data[i * 4]); + } +} + +#define texDef(id, texname, texmem, texsource, texsize) { \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} + +__host__ +void x16_fugue512_cpu_init(int thr_id, uint32_t threads) +{ + texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256); +} + +__host__ +void x16_fugue512_cpu_free(int thr_id) +{ + cudaFree(d_textures[thr_id][0]); +} + +__host__ +void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = TPB; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x16_fugue512_gpu_hash_80 <<>> (threads, startNonce, (uint64_t*)d_hash); +} diff --git a/x16/cuda_x16_shabal512.cu b/x16/cuda_x16_shabal512.cu new file mode 100644 index 0000000000..ca00b50737 --- /dev/null +++ b/x16/cuda_x16_shabal512.cu @@ -0,0 +1,350 @@ +/* +* Shabal-512 for X16R +* tpruvot 2018, based on alexis x14 and xevan kernlx code +*/ + +#include +#include +#include + +typedef uint32_t sph_u32; + +#define C32(x) (x) +#define T32(x) (x) + +#define INPUT_BLOCK_ADD do { \ + B0 = T32(B0 + M0); \ + B1 = T32(B1 + M1); \ + B2 = T32(B2 + M2); \ + B3 = T32(B3 + M3); \ + B4 = T32(B4 + M4); \ + B5 = T32(B5 + M5); \ + B6 = T32(B6 + M6); \ + B7 = T32(B7 + M7); \ + B8 = T32(B8 + M8); \ + B9 = T32(B9 + M9); \ + BA = T32(BA + MA); \ + BB = T32(BB + MB); \ + BC = T32(BC + MC); \ + BD = T32(BD + MD); \ + BE = T32(BE + ME); \ + BF = T32(BF + MF); \ + } while (0) + +#define INPUT_BLOCK_SUB do { \ + C0 = T32(C0 - M0); \ + C1 = T32(C1 - M1); \ + C2 = T32(C2 - M2); \ + C3 = T32(C3 - M3); \ + C4 = T32(C4 - M4); \ + C5 = T32(C5 - M5); \ + C6 = T32(C6 - M6); \ + C7 = T32(C7 - M7); \ + C8 = T32(C8 - M8); \ + C9 = T32(C9 - M9); \ + CA = T32(CA - MA); \ + CB = T32(CB - MB); \ + CC = T32(CC - MC); \ + CD = T32(CD - MD); \ + CE = T32(CE - ME); \ + CF = T32(CF - MF); \ + } while (0) + +#define XOR_W do { \ + A00 ^= Wlow; \ + A01 ^= Whigh; \ + } while (0) + +#define SWAP(v1, v2) do { \ + sph_u32 tmp = (v1); \ + (v1) = (v2); \ + (v2) = tmp; \ + } while (0) + +#define SWAP_BC do { \ + SWAP(B0, C0); \ + SWAP(B1, C1); \ + SWAP(B2, 
C2); \ + SWAP(B3, C3); \ + SWAP(B4, C4); \ + SWAP(B5, C5); \ + SWAP(B6, C6); \ + SWAP(B7, C7); \ + SWAP(B8, C8); \ + SWAP(B9, C9); \ + SWAP(BA, CA); \ + SWAP(BB, CB); \ + SWAP(BC, CC); \ + SWAP(BD, CD); \ + SWAP(BE, CE); \ + SWAP(BF, CF); \ + } while (0) + +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ + xa0 = T32((xa0 \ + ^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \ + ^ xc) * 3U) \ + ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ + xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \ + } while (0) + +#define PERM_STEP_0 do { \ + PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_1 do { \ + PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_2 do { \ + PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define APPLY_P do { \ + B0 = T32(B0 << 17) | (B0 >> 15); \ + B1 = T32(B1 << 17) | (B1 >> 15); \ + B2 = T32(B2 << 17) | (B2 >> 15); \ + B3 = T32(B3 << 17) | (B3 >> 15); \ + B4 = T32(B4 << 17) | (B4 >> 15); \ + B5 = T32(B5 << 17) | (B5 >> 15); \ + B6 = T32(B6 << 17) | (B6 >> 15); \ + B7 = T32(B7 << 17) | (B7 >> 15); \ + B8 = T32(B8 << 17) | (B8 >> 15); \ + B9 = T32(B9 << 17) | (B9 >> 15); \ + BA = T32(BA << 17) | (BA >> 15); \ + BB = T32(BB << 17) | (BB >> 15); \ + BC = T32(BC << 17) | (BC >> 15); \ + BD = T32(BD << 17) | (BD >> 15); \ + BE = T32(BE << 17) | (BE >> 15); \ + BF = T32(BF << 17) | (BF >> 15); \ + 
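+		/* three passes of 16 PERM_ELT steps (48 total), then 36 additions folding C back into A */ \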
PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + A0B = T32(A0B + C6); \ + A0A = T32(A0A + C5); \ + A09 = T32(A09 + C4); \ + A08 = T32(A08 + C3); \ + A07 = T32(A07 + C2); \ + A06 = T32(A06 + C1); \ + A05 = T32(A05 + C0); \ + A04 = T32(A04 + CF); \ + A03 = T32(A03 + CE); \ + A02 = T32(A02 + CD); \ + A01 = T32(A01 + CC); \ + A00 = T32(A00 + CB); \ + A0B = T32(A0B + CA); \ + A0A = T32(A0A + C9); \ + A09 = T32(A09 + C8); \ + A08 = T32(A08 + C7); \ + A07 = T32(A07 + C6); \ + A06 = T32(A06 + C5); \ + A05 = T32(A05 + C4); \ + A04 = T32(A04 + C3); \ + A03 = T32(A03 + C2); \ + A02 = T32(A02 + C1); \ + A01 = T32(A01 + C0); \ + A00 = T32(A00 + CF); \ + A0B = T32(A0B + CE); \ + A0A = T32(A0A + CD); \ + A09 = T32(A09 + CC); \ + A08 = T32(A08 + CB); \ + A07 = T32(A07 + CA); \ + A06 = T32(A06 + C9); \ + A05 = T32(A05 + C8); \ + A04 = T32(A04 + C7); \ + A03 = T32(A03 + C6); \ + A02 = T32(A02 + C5); \ + A01 = T32(A01 + C4); \ + A00 = T32(A00 + C3); \ + } while (0) + +#define INCR_W do { \ + if ((Wlow = T32(Wlow + 1)) == 0) \ + Whigh = T32(Whigh + 1); \ + } while (0) + +__constant__ static const sph_u32 A_init_512[] = { + C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), + C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), + C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) +}; + +__constant__ static const sph_u32 B_init_512[] = { + C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), + C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), + C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), + C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) +}; + +__constant__ static const sph_u32 C_init_512[] = { + C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), + C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), + C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), + C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) +}; + +__constant__ static uint32_t c_PaddedMessage80[20]; + +__host__ +void x16_shabal512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +#define TPB_SHABAL 256 + +__global__ __launch_bounds__(TPB_SHABAL, 2) +void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + uint32_t B[] = { + 0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08, + 0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B + }; + uint32_t M[16]; + + if (thread < threads) + { + // todo: try __ldc + *(uint2x4*)&M[0] = *(uint2x4*)&c_PaddedMessage80[0]; + *(uint2x4*)&M[8] = *(uint2x4*)&c_PaddedMessage80[8]; + + sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[ 2], A03 = A_init_512[ 3]; + sph_u32 A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[ 6], A07 = A_init_512[ 7]; + sph_u32 A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11]; + + sph_u32 B0 = B_init_512[ 0], B1 = B_init_512[ 1], B2 = B_init_512[ 2], B3 = B_init_512 [3]; + sph_u32 B4 = B_init_512[ 4], B5 = B_init_512[ 5], B6 = B_init_512[ 6], B7 = B_init_512[ 7]; + sph_u32 B8 = B_init_512[ 8], B9 = B_init_512[ 9], BA = B_init_512[10], BB = B_init_512[11]; + sph_u32 BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15]; + + sph_u32 C0 = 
C_init_512[ 0], C1 = C_init_512[ 1], C2 = C_init_512[ 2], C3 = C_init_512[ 3]; + sph_u32 C4 = C_init_512[ 4], C5 = C_init_512[ 5], C6 = C_init_512[ 6], C7 = C_init_512[ 7]; + sph_u32 C8 = C_init_512[ 8], C9 = C_init_512[ 9], CA = C_init_512[10], CB = C_init_512[11]; + sph_u32 CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15]; + + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; + sph_u32 Wlow = 1, Whigh = 0; + + M0 = M[ 0]; + M1 = M[ 1]; + M2 = M[ 2]; + M3 = M[ 3]; + M4 = M[ 4]; + M5 = M[ 5]; + M6 = M[ 6]; + M7 = M[ 7]; + M8 = M[ 8]; + M9 = M[ 9]; + MA = M[10]; + MB = M[11]; + MC = M[12]; + MD = M[13]; + ME = M[14]; + MF = M[15]; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + INCR_W; + + M0 = c_PaddedMessage80[16]; + M1 = c_PaddedMessage80[17]; + M2 = c_PaddedMessage80[18]; + M3 = cuda_swab32(startNonce + thread); + M4 = 0x80; + M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + + for (unsigned i = 0; i < 3; i++) { + SWAP_BC; + XOR_W; + APPLY_P; + } + + B[ 0] = B0; + B[ 1] = B1; + B[ 2] = B2; + B[ 3] = B3; + B[ 4] = B4; + B[ 5] = B5; + B[ 6] = B6; + B[ 7] = B7; + B[ 8] = B8; + B[ 9] = B9; + B[10] = BA; + B[11] = BB; + B[12] = BC; + B[13] = BD; + B[14] = BE; + B[15] = BF; + + // output + uint64_t hashPosition = thread; + uint32_t *Hash = &g_hash[hashPosition << 4]; + *(uint2x4*)&Hash[0] = *(uint2x4*)&B[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&B[8]; + } +} + +__host__ +void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = TPB_SHABAL; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + x16_shabal512_gpu_hash_80 <<>>(threads, startNonce, d_hash); +} diff --git a/x16/cuda_x16_simd512_80.cu b/x16/cuda_x16_simd512_80.cu new file mode 100644 index 0000000000..142180a39b --- /dev/null +++ b/x16/cuda_x16_simd512_80.cu @@ -0,0 +1,1836 @@ +/** + * SIMD512 CUDA IMPLEMENTATION based on sph simd code + * tpruvot 2018 (with the help of kernelx xevan code) + */ + +#include +#include +#include + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#define __byte_perm(x, y, m) (x|y) +#endif + +#define TPB50_1 128 +#define TPB50_2 128 +#define TPB52_1 128 +#define TPB52_2 128 + +#define sph_u32 uint32_t +#define sph_s32 int32_t +typedef uint32_t u32; +typedef int32_t s32; + +#define C32 SPH_C32 +#define T32 SPH_T32 +#define ROL32 ROTL32 +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +/* + * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive. 
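+ *
+ * A minimal host-side sketch (illustrative only, not part of this file) that
+ * regenerates the table; `tab` stands for a hypothetical int32_t[256]:
+ *
+ *   int32_t alpha = 1;
+ *   for (int i = 0; i < 256; i++) {
+ *       tab[i] = alpha;               // tab[i] == 41^i mod 257
+ *       alpha = (alpha * 41) % 257;
+ *   }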
+ */ +__constant__ static const s32 alpha_tab[] = { + 1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130, 190, 80, 196, 69, + 2, 82, 21, 90, 92, 174, 195, 28, 120, 37, 232, 3, 123, 160, 135, 138, + 4, 164, 42, 180, 184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19, + 8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12, 235, 126, 26, 38, + 16, 142, 168, 206, 222, 107, 18, 224, 189, 39, 57, 24, 213, 252, 52, 76, + 32, 27, 79, 155, 187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152, + 64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96, 81, 237, 208, 47, + 128, 108, 59, 106, 234, 85, 144, 250, 227, 55, 199, 192, 162, 217, 159, 94, + 256, 216, 118, 212, 211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188, + 255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254, 134, 97, 122, 119, + 253, 93, 215, 77, 73, 166, 124, 201, 17, 183, 50, 251, 11, 194, 244, 238, + 249, 186, 173, 154, 146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219, + 241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233, 44, 5, 205, 181, + 225, 230, 178, 102, 70, 43, 221, 66, 136, 179, 143, 209, 88, 10, 153, 105, + 193, 203, 99, 204, 140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210, + 129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65, 95, 40, 98, 163 +}; + +/* + * Ranges: + * REDS1: from -32768..98302 to -383..383 + * REDS2: from -2^31..2^31-1 to -32768..98302 + */ +#define REDS1(x) (((x) & 0x00FF) - ((x) >> 8)) +#define REDS2(x) (((x) & 0xFFFF) + ((x) >> 16)) + +/* + * If, upon entry, the values of q[] are all in the -N..N range (where + * N >= 98302) then the new values of q[] are in the -2N..2N range. + * + * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608. + */ +#define FFT_LOOP_16_8(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 16]; \ + q[(rb)] = m + n; \ + q[(rb) + 16] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 16]; \ + t = REDS2(n * alpha_tab[0 + 1 * 8]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 16] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 16]; \ + t = REDS2(n * alpha_tab[0 + 2 * 8]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 16] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 16]; \ + t = REDS2(n * alpha_tab[0 + 3 * 8]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 16] = m - t; \ + \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 16]; \ + t = REDS2(n * alpha_tab[32 + 0 * 8]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 16] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 16]; \ + t = REDS2(n * alpha_tab[32 + 1 * 8]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 16] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 16]; \ + t = REDS2(n * alpha_tab[32 + 2 * 8]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 16] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 16]; \ + t = REDS2(n * alpha_tab[32 + 3 * 8]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 16] = m - t; \ + \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 + 0 + 16]; \ + t = REDS2(n * alpha_tab[64 + 0 * 8]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 16] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 16]; \ + t = REDS2(n * alpha_tab[64 + 1 * 8]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 16] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 16]; \ + t = REDS2(n * alpha_tab[64 + 2 * 8]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 16] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 16]; \ + t = REDS2(n * alpha_tab[64 + 3 * 8]); \ + q[(rb) + 8 + 
3] = m + t; \ + q[(rb) + 8 + 3 + 16] = m - t; \ + \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 16]; \ + t = REDS2(n * alpha_tab[96 + 0 * 8]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 16] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 16]; \ + t = REDS2(n * alpha_tab[96 + 1 * 8]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 16] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 16]; \ + t = REDS2(n * alpha_tab[96 + 2 * 8]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 16] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 16]; \ + t = REDS2(n * alpha_tab[96 + 3 * 8]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 16] = m - t; \ + } while (0) + +#define FFT_LOOP_32_4(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 32]; \ + q[(rb)] = m + n; \ + q[(rb) + 32] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 32]; \ + t = REDS2(n * alpha_tab[0 + 1 * 4]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 32] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 32]; \ + t = REDS2(n * alpha_tab[0 + 2 * 4]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 32] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 32]; \ + t = REDS2(n * alpha_tab[0 + 3 * 4]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 32]; \ + t = REDS2(n * alpha_tab[16 + 0 * 4]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 32] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 32]; \ + t = REDS2(n * alpha_tab[16 + 1 * 4]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 32] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 32]; \ + t = REDS2(n * alpha_tab[16 + 2 * 4]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 32] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 32]; \ + t = REDS2(n * alpha_tab[16 + 3 * 4]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 + 0 + 32]; \ + t = REDS2(n * alpha_tab[32 + 0 * 4]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 32] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 32]; \ + t = REDS2(n * alpha_tab[32 + 1 * 4]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 32] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 32]; \ + t = REDS2(n * alpha_tab[32 + 2 * 4]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 32] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 32]; \ + t = REDS2(n * alpha_tab[32 + 3 * 4]); \ + q[(rb) + 8 + 3] = m + t; \ + q[(rb) + 8 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 32]; \ + t = REDS2(n * alpha_tab[48 + 0 * 4]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 32] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 32]; \ + t = REDS2(n * alpha_tab[48 + 1 * 4]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 32] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 32]; \ + t = REDS2(n * alpha_tab[48 + 2 * 4]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 32] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 32]; \ + t = REDS2(n * alpha_tab[48 + 3 * 4]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 16 + 0]; \ + n = q[(rb) + 16 + 0 + 32]; \ + t = REDS2(n * alpha_tab[64 + 0 * 4]); \ + q[(rb) + 16 + 0] = m + t; \ + q[(rb) + 16 + 0 + 32] = m - t; \ + m = q[(rb) + 16 + 1]; \ + n = q[(rb) + 16 + 1 + 32]; \ + t = REDS2(n * alpha_tab[64 + 1 * 4]); \ 
+ q[(rb) + 16 + 1] = m + t; \ + q[(rb) + 16 + 1 + 32] = m - t; \ + m = q[(rb) + 16 + 2]; \ + n = q[(rb) + 16 + 2 + 32]; \ + t = REDS2(n * alpha_tab[64 + 2 * 4]); \ + q[(rb) + 16 + 2] = m + t; \ + q[(rb) + 16 + 2 + 32] = m - t; \ + m = q[(rb) + 16 + 3]; \ + n = q[(rb) + 16 + 3 + 32]; \ + t = REDS2(n * alpha_tab[64 + 3 * 4]); \ + q[(rb) + 16 + 3] = m + t; \ + q[(rb) + 16 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 20 + 0]; \ + n = q[(rb) + 20 + 0 + 32]; \ + t = REDS2(n * alpha_tab[80 + 0 * 4]); \ + q[(rb) + 20 + 0] = m + t; \ + q[(rb) + 20 + 0 + 32] = m - t; \ + m = q[(rb) + 20 + 1]; \ + n = q[(rb) + 20 + 1 + 32]; \ + t = REDS2(n * alpha_tab[80 + 1 * 4]); \ + q[(rb) + 20 + 1] = m + t; \ + q[(rb) + 20 + 1 + 32] = m - t; \ + m = q[(rb) + 20 + 2]; \ + n = q[(rb) + 20 + 2 + 32]; \ + t = REDS2(n * alpha_tab[80 + 2 * 4]); \ + q[(rb) + 20 + 2] = m + t; \ + q[(rb) + 20 + 2 + 32] = m - t; \ + m = q[(rb) + 20 + 3]; \ + n = q[(rb) + 20 + 3 + 32]; \ + t = REDS2(n * alpha_tab[80 + 3 * 4]); \ + q[(rb) + 20 + 3] = m + t; \ + q[(rb) + 20 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 24 + 0]; \ + n = q[(rb) + 24 + 0 + 32]; \ + t = REDS2(n * alpha_tab[96 + 0 * 4]); \ + q[(rb) + 24 + 0] = m + t; \ + q[(rb) + 24 + 0 + 32] = m - t; \ + m = q[(rb) + 24 + 1]; \ + n = q[(rb) + 24 + 1 + 32]; \ + t = REDS2(n * alpha_tab[96 + 1 * 4]); \ + q[(rb) + 24 + 1] = m + t; \ + q[(rb) + 24 + 1 + 32] = m - t; \ + m = q[(rb) + 24 + 2]; \ + n = q[(rb) + 24 + 2 + 32]; \ + t = REDS2(n * alpha_tab[96 + 2 * 4]); \ + q[(rb) + 24 + 2] = m + t; \ + q[(rb) + 24 + 2 + 32] = m - t; \ + m = q[(rb) + 24 + 3]; \ + n = q[(rb) + 24 + 3 + 32]; \ + t = REDS2(n * alpha_tab[96 + 3 * 4]); \ + q[(rb) + 24 + 3] = m + t; \ + q[(rb) + 24 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 28 + 0]; \ + n = q[(rb) + 28 + 0 + 32]; \ + t = REDS2(n * alpha_tab[112 + 0 * 4]); \ + q[(rb) + 28 + 0] = m + t; \ + q[(rb) + 28 + 0 + 32] = m - t; \ + m = q[(rb) + 28 + 1]; \ + n = q[(rb) + 28 + 1 + 32]; \ + t = REDS2(n * alpha_tab[112 + 1 * 4]); \ + q[(rb) + 28 + 1] = m + t; \ + q[(rb) + 28 + 1 + 32] = m - t; \ + m = q[(rb) + 28 + 2]; \ + n = q[(rb) + 28 + 2 + 32]; \ + t = REDS2(n * alpha_tab[112 + 2 * 4]); \ + q[(rb) + 28 + 2] = m + t; \ + q[(rb) + 28 + 2 + 32] = m - t; \ + m = q[(rb) + 28 + 3]; \ + n = q[(rb) + 28 + 3 + 32]; \ + t = REDS2(n * alpha_tab[112 + 3 * 4]); \ + q[(rb) + 28 + 3] = m + t; \ + q[(rb) + 28 + 3 + 32] = m - t; \ + } while (0) + +#define FFT_LOOP_64_2(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 64]; \ + q[(rb)] = m + n; \ + q[(rb) + 64] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 64]; \ + t = REDS2(n * alpha_tab[0 + 1 * 2]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 64] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 64]; \ + t = REDS2(n * alpha_tab[0 + 2 * 2]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 64] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 64]; \ + t = REDS2(n * alpha_tab[0 + 3 * 2]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 64]; \ + t = REDS2(n * alpha_tab[8 + 0 * 2]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 64] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 64]; \ + t = REDS2(n * alpha_tab[8 + 1 * 2]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 64] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 64]; \ + t = REDS2(n * alpha_tab[8 + 2 * 2]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 64] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 64]; \ + t = 
REDS2(n * alpha_tab[8 + 3 * 2]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 + 0 + 64]; \ + t = REDS2(n * alpha_tab[16 + 0 * 2]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 64] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 64]; \ + t = REDS2(n * alpha_tab[16 + 1 * 2]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 64] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 64]; \ + t = REDS2(n * alpha_tab[16 + 2 * 2]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 64] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 64]; \ + t = REDS2(n * alpha_tab[16 + 3 * 2]); \ + q[(rb) + 8 + 3] = m + t; \ + q[(rb) + 8 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 64]; \ + t = REDS2(n * alpha_tab[24 + 0 * 2]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 64] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 64]; \ + t = REDS2(n * alpha_tab[24 + 1 * 2]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 64] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 64]; \ + t = REDS2(n * alpha_tab[24 + 2 * 2]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 64] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 64]; \ + t = REDS2(n * alpha_tab[24 + 3 * 2]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 16 + 0]; \ + n = q[(rb) + 16 + 0 + 64]; \ + t = REDS2(n * alpha_tab[32 + 0 * 2]); \ + q[(rb) + 16 + 0] = m + t; \ + q[(rb) + 16 + 0 + 64] = m - t; \ + m = q[(rb) + 16 + 1]; \ + n = q[(rb) + 16 + 1 + 64]; \ + t = REDS2(n * alpha_tab[32 + 1 * 2]); \ + q[(rb) + 16 + 1] = m + t; \ + q[(rb) + 16 + 1 + 64] = m - t; \ + m = q[(rb) + 16 + 2]; \ + n = q[(rb) + 16 + 2 + 64]; \ + t = REDS2(n * alpha_tab[32 + 2 * 2]); \ + q[(rb) + 16 + 2] = m + t; \ + q[(rb) + 16 + 2 + 64] = m - t; \ + m = q[(rb) + 16 + 3]; \ + n = q[(rb) + 16 + 3 + 64]; \ + t = REDS2(n * alpha_tab[32 + 3 * 2]); \ + q[(rb) + 16 + 3] = m + t; \ + q[(rb) + 16 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 20 + 0]; \ + n = q[(rb) + 20 + 0 + 64]; \ + t = REDS2(n * alpha_tab[40 + 0 * 2]); \ + q[(rb) + 20 + 0] = m + t; \ + q[(rb) + 20 + 0 + 64] = m - t; \ + m = q[(rb) + 20 + 1]; \ + n = q[(rb) + 20 + 1 + 64]; \ + t = REDS2(n * alpha_tab[40 + 1 * 2]); \ + q[(rb) + 20 + 1] = m + t; \ + q[(rb) + 20 + 1 + 64] = m - t; \ + m = q[(rb) + 20 + 2]; \ + n = q[(rb) + 20 + 2 + 64]; \ + t = REDS2(n * alpha_tab[40 + 2 * 2]); \ + q[(rb) + 20 + 2] = m + t; \ + q[(rb) + 20 + 2 + 64] = m - t; \ + m = q[(rb) + 20 + 3]; \ + n = q[(rb) + 20 + 3 + 64]; \ + t = REDS2(n * alpha_tab[40 + 3 * 2]); \ + q[(rb) + 20 + 3] = m + t; \ + q[(rb) + 20 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 24 + 0]; \ + n = q[(rb) + 24 + 0 + 64]; \ + t = REDS2(n * alpha_tab[48 + 0 * 2]); \ + q[(rb) + 24 + 0] = m + t; \ + q[(rb) + 24 + 0 + 64] = m - t; \ + m = q[(rb) + 24 + 1]; \ + n = q[(rb) + 24 + 1 + 64]; \ + t = REDS2(n * alpha_tab[48 + 1 * 2]); \ + q[(rb) + 24 + 1] = m + t; \ + q[(rb) + 24 + 1 + 64] = m - t; \ + m = q[(rb) + 24 + 2]; \ + n = q[(rb) + 24 + 2 + 64]; \ + t = REDS2(n * alpha_tab[48 + 2 * 2]); \ + q[(rb) + 24 + 2] = m + t; \ + q[(rb) + 24 + 2 + 64] = m - t; \ + m = q[(rb) + 24 + 3]; \ + n = q[(rb) + 24 + 3 + 64]; \ + t = REDS2(n * alpha_tab[48 + 3 * 2]); \ + q[(rb) + 24 + 3] = m + t; \ + q[(rb) + 24 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 28 + 0]; \ + n = q[(rb) + 28 + 0 + 64]; \ + t = REDS2(n * alpha_tab[56 + 0 * 2]); \ + q[(rb) + 28 + 0] = m + t; \ + q[(rb) + 28 + 0 + 64] = m - t; \ + m 
= q[(rb) + 28 + 1]; \ + n = q[(rb) + 28 + 1 + 64]; \ + t = REDS2(n * alpha_tab[56 + 1 * 2]); \ + q[(rb) + 28 + 1] = m + t; \ + q[(rb) + 28 + 1 + 64] = m - t; \ + m = q[(rb) + 28 + 2]; \ + n = q[(rb) + 28 + 2 + 64]; \ + t = REDS2(n * alpha_tab[56 + 2 * 2]); \ + q[(rb) + 28 + 2] = m + t; \ + q[(rb) + 28 + 2 + 64] = m - t; \ + m = q[(rb) + 28 + 3]; \ + n = q[(rb) + 28 + 3 + 64]; \ + t = REDS2(n * alpha_tab[56 + 3 * 2]); \ + q[(rb) + 28 + 3] = m + t; \ + q[(rb) + 28 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 32 + 0]; \ + n = q[(rb) + 32 + 0 + 64]; \ + t = REDS2(n * alpha_tab[64 + 0 * 2]); \ + q[(rb) + 32 + 0] = m + t; \ + q[(rb) + 32 + 0 + 64] = m - t; \ + m = q[(rb) + 32 + 1]; \ + n = q[(rb) + 32 + 1 + 64]; \ + t = REDS2(n * alpha_tab[64 + 1 * 2]); \ + q[(rb) + 32 + 1] = m + t; \ + q[(rb) + 32 + 1 + 64] = m - t; \ + m = q[(rb) + 32 + 2]; \ + n = q[(rb) + 32 + 2 + 64]; \ + t = REDS2(n * alpha_tab[64 + 2 * 2]); \ + q[(rb) + 32 + 2] = m + t; \ + q[(rb) + 32 + 2 + 64] = m - t; \ + m = q[(rb) + 32 + 3]; \ + n = q[(rb) + 32 + 3 + 64]; \ + t = REDS2(n * alpha_tab[64 + 3 * 2]); \ + q[(rb) + 32 + 3] = m + t; \ + q[(rb) + 32 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 36 + 0]; \ + n = q[(rb) + 36 + 0 + 64]; \ + t = REDS2(n * alpha_tab[72 + 0 * 2]); \ + q[(rb) + 36 + 0] = m + t; \ + q[(rb) + 36 + 0 + 64] = m - t; \ + m = q[(rb) + 36 + 1]; \ + n = q[(rb) + 36 + 1 + 64]; \ + t = REDS2(n * alpha_tab[72 + 1 * 2]); \ + q[(rb) + 36 + 1] = m + t; \ + q[(rb) + 36 + 1 + 64] = m - t; \ + m = q[(rb) + 36 + 2]; \ + n = q[(rb) + 36 + 2 + 64]; \ + t = REDS2(n * alpha_tab[72 + 2 * 2]); \ + q[(rb) + 36 + 2] = m + t; \ + q[(rb) + 36 + 2 + 64] = m - t; \ + m = q[(rb) + 36 + 3]; \ + n = q[(rb) + 36 + 3 + 64]; \ + t = REDS2(n * alpha_tab[72 + 3 * 2]); \ + q[(rb) + 36 + 3] = m + t; \ + q[(rb) + 36 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 40 + 0]; \ + n = q[(rb) + 40 + 0 + 64]; \ + t = REDS2(n * alpha_tab[80 + 0 * 2]); \ + q[(rb) + 40 + 0] = m + t; \ + q[(rb) + 40 + 0 + 64] = m - t; \ + m = q[(rb) + 40 + 1]; \ + n = q[(rb) + 40 + 1 + 64]; \ + t = REDS2(n * alpha_tab[80 + 1 * 2]); \ + q[(rb) + 40 + 1] = m + t; \ + q[(rb) + 40 + 1 + 64] = m - t; \ + m = q[(rb) + 40 + 2]; \ + n = q[(rb) + 40 + 2 + 64]; \ + t = REDS2(n * alpha_tab[80 + 2 * 2]); \ + q[(rb) + 40 + 2] = m + t; \ + q[(rb) + 40 + 2 + 64] = m - t; \ + m = q[(rb) + 40 + 3]; \ + n = q[(rb) + 40 + 3 + 64]; \ + t = REDS2(n * alpha_tab[80 + 3 * 2]); \ + q[(rb) + 40 + 3] = m + t; \ + q[(rb) + 40 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 44 + 0]; \ + n = q[(rb) + 44 + 0 + 64]; \ + t = REDS2(n * alpha_tab[88 + 0 * 2]); \ + q[(rb) + 44 + 0] = m + t; \ + q[(rb) + 44 + 0 + 64] = m - t; \ + m = q[(rb) + 44 + 1]; \ + n = q[(rb) + 44 + 1 + 64]; \ + t = REDS2(n * alpha_tab[88 + 1 * 2]); \ + q[(rb) + 44 + 1] = m + t; \ + q[(rb) + 44 + 1 + 64] = m - t; \ + m = q[(rb) + 44 + 2]; \ + n = q[(rb) + 44 + 2 + 64]; \ + t = REDS2(n * alpha_tab[88 + 2 * 2]); \ + q[(rb) + 44 + 2] = m + t; \ + q[(rb) + 44 + 2 + 64] = m - t; \ + m = q[(rb) + 44 + 3]; \ + n = q[(rb) + 44 + 3 + 64]; \ + t = REDS2(n * alpha_tab[88 + 3 * 2]); \ + q[(rb) + 44 + 3] = m + t; \ + q[(rb) + 44 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 48 + 0]; \ + n = q[(rb) + 48 + 0 + 64]; \ + t = REDS2(n * alpha_tab[96 + 0 * 2]); \ + q[(rb) + 48 + 0] = m + t; \ + q[(rb) + 48 + 0 + 64] = m - t; \ + m = q[(rb) + 48 + 1]; \ + n = q[(rb) + 48 + 1 + 64]; \ + t = REDS2(n * alpha_tab[96 + 1 * 2]); \ + q[(rb) + 48 + 1] = m + t; \ + q[(rb) + 48 + 1 + 64] = m - t; \ + m = q[(rb) + 48 + 2]; \ + n = q[(rb) + 48 + 2 + 64]; \ + t = REDS2(n * alpha_tab[96 + 2 * 
2]); \ + q[(rb) + 48 + 2] = m + t; \ + q[(rb) + 48 + 2 + 64] = m - t; \ + m = q[(rb) + 48 + 3]; \ + n = q[(rb) + 48 + 3 + 64]; \ + t = REDS2(n * alpha_tab[96 + 3 * 2]); \ + q[(rb) + 48 + 3] = m + t; \ + q[(rb) + 48 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 52 + 0]; \ + n = q[(rb) + 52 + 0 + 64]; \ + t = REDS2(n * alpha_tab[104 + 0 * 2]); \ + q[(rb) + 52 + 0] = m + t; \ + q[(rb) + 52 + 0 + 64] = m - t; \ + m = q[(rb) + 52 + 1]; \ + n = q[(rb) + 52 + 1 + 64]; \ + t = REDS2(n * alpha_tab[104 + 1 * 2]); \ + q[(rb) + 52 + 1] = m + t; \ + q[(rb) + 52 + 1 + 64] = m - t; \ + m = q[(rb) + 52 + 2]; \ + n = q[(rb) + 52 + 2 + 64]; \ + t = REDS2(n * alpha_tab[104 + 2 * 2]); \ + q[(rb) + 52 + 2] = m + t; \ + q[(rb) + 52 + 2 + 64] = m - t; \ + m = q[(rb) + 52 + 3]; \ + n = q[(rb) + 52 + 3 + 64]; \ + t = REDS2(n * alpha_tab[104 + 3 * 2]); \ + q[(rb) + 52 + 3] = m + t; \ + q[(rb) + 52 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 56 + 0]; \ + n = q[(rb) + 56 + 0 + 64]; \ + t = REDS2(n * alpha_tab[112 + 0 * 2]); \ + q[(rb) + 56 + 0] = m + t; \ + q[(rb) + 56 + 0 + 64] = m - t; \ + m = q[(rb) + 56 + 1]; \ + n = q[(rb) + 56 + 1 + 64]; \ + t = REDS2(n * alpha_tab[112 + 1 * 2]); \ + q[(rb) + 56 + 1] = m + t; \ + q[(rb) + 56 + 1 + 64] = m - t; \ + m = q[(rb) + 56 + 2]; \ + n = q[(rb) + 56 + 2 + 64]; \ + t = REDS2(n * alpha_tab[112 + 2 * 2]); \ + q[(rb) + 56 + 2] = m + t; \ + q[(rb) + 56 + 2 + 64] = m - t; \ + m = q[(rb) + 56 + 3]; \ + n = q[(rb) + 56 + 3 + 64]; \ + t = REDS2(n * alpha_tab[112 + 3 * 2]); \ + q[(rb) + 56 + 3] = m + t; \ + q[(rb) + 56 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 60 + 0]; \ + n = q[(rb) + 60 + 0 + 64]; \ + t = REDS2(n * alpha_tab[120 + 0 * 2]); \ + q[(rb) + 60 + 0] = m + t; \ + q[(rb) + 60 + 0 + 64] = m - t; \ + m = q[(rb) + 60 + 1]; \ + n = q[(rb) + 60 + 1 + 64]; \ + t = REDS2(n * alpha_tab[120 + 1 * 2]); \ + q[(rb) + 60 + 1] = m + t; \ + q[(rb) + 60 + 1 + 64] = m - t; \ + m = q[(rb) + 60 + 2]; \ + n = q[(rb) + 60 + 2 + 64]; \ + t = REDS2(n * alpha_tab[120 + 2 * 2]); \ + q[(rb) + 60 + 2] = m + t; \ + q[(rb) + 60 + 2 + 64] = m - t; \ + m = q[(rb) + 60 + 3]; \ + n = q[(rb) + 60 + 3 + 64]; \ + t = REDS2(n * alpha_tab[120 + 3 * 2]); \ + q[(rb) + 60 + 3] = m + t; \ + q[(rb) + 60 + 3 + 64] = m - t; \ + } while (0) + +#define FFT_LOOP_128_1(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 128]; \ + q[(rb)] = m + n; \ + q[(rb) + 128] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 128]; \ + t = REDS2(n * alpha_tab[0 + 1 * 1]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 128] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 128]; \ + t = REDS2(n * alpha_tab[0 + 2 * 1]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 128] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 128]; \ + t = REDS2(n * alpha_tab[0 + 3 * 1]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 128] = m - t; \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 128]; \ + t = REDS2(n * alpha_tab[4 + 0 * 1]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 128] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 128]; \ + t = REDS2(n * alpha_tab[4 + 1 * 1]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 128] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 128]; \ + t = REDS2(n * alpha_tab[4 + 2 * 1]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 128] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 128]; \ + t = REDS2(n * alpha_tab[4 + 3 * 1]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 128] = m - t; \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 
+ 0 + 128]; \ + t = REDS2(n * alpha_tab[8 + 0 * 1]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 128] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 128]; \ + t = REDS2(n * alpha_tab[8 + 1 * 1]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 128] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 128]; \ + t = REDS2(n * alpha_tab[8 + 2 * 1]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 128] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 128]; \ + t = REDS2(n * alpha_tab[8 + 3 * 1]); \ + q[(rb) + 8 + 3] = m + t; \ + q[(rb) + 8 + 3 + 128] = m - t; \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 128]; \ + t = REDS2(n * alpha_tab[12 + 0 * 1]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 128] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 128]; \ + t = REDS2(n * alpha_tab[12 + 1 * 1]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 128] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 128]; \ + t = REDS2(n * alpha_tab[12 + 2 * 1]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 128] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 128]; \ + t = REDS2(n * alpha_tab[12 + 3 * 1]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 128] = m - t; \ + m = q[(rb) + 16 + 0]; \ + n = q[(rb) + 16 + 0 + 128]; \ + t = REDS2(n * alpha_tab[16 + 0 * 1]); \ + q[(rb) + 16 + 0] = m + t; \ + q[(rb) + 16 + 0 + 128] = m - t; \ + m = q[(rb) + 16 + 1]; \ + n = q[(rb) + 16 + 1 + 128]; \ + t = REDS2(n * alpha_tab[16 + 1 * 1]); \ + q[(rb) + 16 + 1] = m + t; \ + q[(rb) + 16 + 1 + 128] = m - t; \ + m = q[(rb) + 16 + 2]; \ + n = q[(rb) + 16 + 2 + 128]; \ + t = REDS2(n * alpha_tab[16 + 2 * 1]); \ + q[(rb) + 16 + 2] = m + t; \ + q[(rb) + 16 + 2 + 128] = m - t; \ + m = q[(rb) + 16 + 3]; \ + n = q[(rb) + 16 + 3 + 128]; \ + t = REDS2(n * alpha_tab[16 + 3 * 1]); \ + q[(rb) + 16 + 3] = m + t; \ + q[(rb) + 16 + 3 + 128] = m - t; \ + m = q[(rb) + 20 + 0]; \ + n = q[(rb) + 20 + 0 + 128]; \ + t = REDS2(n * alpha_tab[20 + 0 * 1]); \ + q[(rb) + 20 + 0] = m + t; \ + q[(rb) + 20 + 0 + 128] = m - t; \ + m = q[(rb) + 20 + 1]; \ + n = q[(rb) + 20 + 1 + 128]; \ + t = REDS2(n * alpha_tab[20 + 1 * 1]); \ + q[(rb) + 20 + 1] = m + t; \ + q[(rb) + 20 + 1 + 128] = m - t; \ + m = q[(rb) + 20 + 2]; \ + n = q[(rb) + 20 + 2 + 128]; \ + t = REDS2(n * alpha_tab[20 + 2 * 1]); \ + q[(rb) + 20 + 2] = m + t; \ + q[(rb) + 20 + 2 + 128] = m - t; \ + m = q[(rb) + 20 + 3]; \ + n = q[(rb) + 20 + 3 + 128]; \ + t = REDS2(n * alpha_tab[20 + 3 * 1]); \ + q[(rb) + 20 + 3] = m + t; \ + q[(rb) + 20 + 3 + 128] = m - t; \ + m = q[(rb) + 24 + 0]; \ + n = q[(rb) + 24 + 0 + 128]; \ + t = REDS2(n * alpha_tab[24 + 0 * 1]); \ + q[(rb) + 24 + 0] = m + t; \ + q[(rb) + 24 + 0 + 128] = m - t; \ + m = q[(rb) + 24 + 1]; \ + n = q[(rb) + 24 + 1 + 128]; \ + t = REDS2(n * alpha_tab[24 + 1 * 1]); \ + q[(rb) + 24 + 1] = m + t; \ + q[(rb) + 24 + 1 + 128] = m - t; \ + m = q[(rb) + 24 + 2]; \ + n = q[(rb) + 24 + 2 + 128]; \ + t = REDS2(n * alpha_tab[24 + 2 * 1]); \ + q[(rb) + 24 + 2] = m + t; \ + q[(rb) + 24 + 2 + 128] = m - t; \ + m = q[(rb) + 24 + 3]; \ + n = q[(rb) + 24 + 3 + 128]; \ + t = REDS2(n * alpha_tab[24 + 3 * 1]); \ + q[(rb) + 24 + 3] = m + t; \ + q[(rb) + 24 + 3 + 128] = m - t; \ + m = q[(rb) + 28 + 0]; \ + n = q[(rb) + 28 + 0 + 128]; \ + t = REDS2(n * alpha_tab[28 + 0 * 1]); \ + q[(rb) + 28 + 0] = m + t; \ + q[(rb) + 28 + 0 + 128] = m - t; \ + m = q[(rb) + 28 + 1]; \ + n = q[(rb) + 28 + 1 + 128]; \ + t = REDS2(n * alpha_tab[28 + 1 * 1]); \ + q[(rb) + 28 + 1] = m + t; \ 
+ q[(rb) + 28 + 1 + 128] = m - t; \ + m = q[(rb) + 28 + 2]; \ + n = q[(rb) + 28 + 2 + 128]; \ + t = REDS2(n * alpha_tab[28 + 2 * 1]); \ + q[(rb) + 28 + 2] = m + t; \ + q[(rb) + 28 + 2 + 128] = m - t; \ + m = q[(rb) + 28 + 3]; \ + n = q[(rb) + 28 + 3 + 128]; \ + t = REDS2(n * alpha_tab[28 + 3 * 1]); \ + q[(rb) + 28 + 3] = m + t; \ + q[(rb) + 28 + 3 + 128] = m - t; \ + m = q[(rb) + 32 + 0]; \ + n = q[(rb) + 32 + 0 + 128]; \ + t = REDS2(n * alpha_tab[32 + 0 * 1]); \ + q[(rb) + 32 + 0] = m + t; \ + q[(rb) + 32 + 0 + 128] = m - t; \ + m = q[(rb) + 32 + 1]; \ + n = q[(rb) + 32 + 1 + 128]; \ + t = REDS2(n * alpha_tab[32 + 1 * 1]); \ + q[(rb) + 32 + 1] = m + t; \ + q[(rb) + 32 + 1 + 128] = m - t; \ + m = q[(rb) + 32 + 2]; \ + n = q[(rb) + 32 + 2 + 128]; \ + t = REDS2(n * alpha_tab[32 + 2 * 1]); \ + q[(rb) + 32 + 2] = m + t; \ + q[(rb) + 32 + 2 + 128] = m - t; \ + m = q[(rb) + 32 + 3]; \ + n = q[(rb) + 32 + 3 + 128]; \ + t = REDS2(n * alpha_tab[32 + 3 * 1]); \ + q[(rb) + 32 + 3] = m + t; \ + q[(rb) + 32 + 3 + 128] = m - t; \ + m = q[(rb) + 36 + 0]; \ + n = q[(rb) + 36 + 0 + 128]; \ + t = REDS2(n * alpha_tab[36 + 0 * 1]); \ + q[(rb) + 36 + 0] = m + t; \ + q[(rb) + 36 + 0 + 128] = m - t; \ + m = q[(rb) + 36 + 1]; \ + n = q[(rb) + 36 + 1 + 128]; \ + t = REDS2(n * alpha_tab[36 + 1 * 1]); \ + q[(rb) + 36 + 1] = m + t; \ + q[(rb) + 36 + 1 + 128] = m - t; \ + m = q[(rb) + 36 + 2]; \ + n = q[(rb) + 36 + 2 + 128]; \ + t = REDS2(n * alpha_tab[36 + 2 * 1]); \ + q[(rb) + 36 + 2] = m + t; \ + q[(rb) + 36 + 2 + 128] = m - t; \ + m = q[(rb) + 36 + 3]; \ + n = q[(rb) + 36 + 3 + 128]; \ + t = REDS2(n * alpha_tab[36 + 3 * 1]); \ + q[(rb) + 36 + 3] = m + t; \ + q[(rb) + 36 + 3 + 128] = m - t; \ + m = q[(rb) + 40 + 0]; \ + n = q[(rb) + 40 + 0 + 128]; \ + t = REDS2(n * alpha_tab[40 + 0 * 1]); \ + q[(rb) + 40 + 0] = m + t; \ + q[(rb) + 40 + 0 + 128] = m - t; \ + m = q[(rb) + 40 + 1]; \ + n = q[(rb) + 40 + 1 + 128]; \ + t = REDS2(n * alpha_tab[40 + 1 * 1]); \ + q[(rb) + 40 + 1] = m + t; \ + q[(rb) + 40 + 1 + 128] = m - t; \ + m = q[(rb) + 40 + 2]; \ + n = q[(rb) + 40 + 2 + 128]; \ + t = REDS2(n * alpha_tab[40 + 2 * 1]); \ + q[(rb) + 40 + 2] = m + t; \ + q[(rb) + 40 + 2 + 128] = m - t; \ + m = q[(rb) + 40 + 3]; \ + n = q[(rb) + 40 + 3 + 128]; \ + t = REDS2(n * alpha_tab[40 + 3 * 1]); \ + q[(rb) + 40 + 3] = m + t; \ + q[(rb) + 40 + 3 + 128] = m - t; \ + m = q[(rb) + 44 + 0]; \ + n = q[(rb) + 44 + 0 + 128]; \ + t = REDS2(n * alpha_tab[44 + 0 * 1]); \ + q[(rb) + 44 + 0] = m + t; \ + q[(rb) + 44 + 0 + 128] = m - t; \ + m = q[(rb) + 44 + 1]; \ + n = q[(rb) + 44 + 1 + 128]; \ + t = REDS2(n * alpha_tab[44 + 1 * 1]); \ + q[(rb) + 44 + 1] = m + t; \ + q[(rb) + 44 + 1 + 128] = m - t; \ + m = q[(rb) + 44 + 2]; \ + n = q[(rb) + 44 + 2 + 128]; \ + t = REDS2(n * alpha_tab[44 + 2 * 1]); \ + q[(rb) + 44 + 2] = m + t; \ + q[(rb) + 44 + 2 + 128] = m - t; \ + m = q[(rb) + 44 + 3]; \ + n = q[(rb) + 44 + 3 + 128]; \ + t = REDS2(n * alpha_tab[44 + 3 * 1]); \ + q[(rb) + 44 + 3] = m + t; \ + q[(rb) + 44 + 3 + 128] = m - t; \ + m = q[(rb) + 48 + 0]; \ + n = q[(rb) + 48 + 0 + 128]; \ + t = REDS2(n * alpha_tab[48 + 0 * 1]); \ + q[(rb) + 48 + 0] = m + t; \ + q[(rb) + 48 + 0 + 128] = m - t; \ + m = q[(rb) + 48 + 1]; \ + n = q[(rb) + 48 + 1 + 128]; \ + t = REDS2(n * alpha_tab[48 + 1 * 1]); \ + q[(rb) + 48 + 1] = m + t; \ + q[(rb) + 48 + 1 + 128] = m - t; \ + m = q[(rb) + 48 + 2]; \ + n = q[(rb) + 48 + 2 + 128]; \ + t = REDS2(n * alpha_tab[48 + 2 * 1]); \ + q[(rb) + 48 + 2] = m + t; \ + q[(rb) + 48 + 2 + 128] = m - t; \ + m = q[(rb) + 48 + 3]; \ + n = 
q[(rb) + 48 + 3 + 128]; \ + t = REDS2(n * alpha_tab[48 + 3 * 1]); \ + q[(rb) + 48 + 3] = m + t; \ + q[(rb) + 48 + 3 + 128] = m - t; \ + m = q[(rb) + 52 + 0]; \ + n = q[(rb) + 52 + 0 + 128]; \ + t = REDS2(n * alpha_tab[52 + 0 * 1]); \ + q[(rb) + 52 + 0] = m + t; \ + q[(rb) + 52 + 0 + 128] = m - t; \ + m = q[(rb) + 52 + 1]; \ + n = q[(rb) + 52 + 1 + 128]; \ + t = REDS2(n * alpha_tab[52 + 1 * 1]); \ + q[(rb) + 52 + 1] = m + t; \ + q[(rb) + 52 + 1 + 128] = m - t; \ + m = q[(rb) + 52 + 2]; \ + n = q[(rb) + 52 + 2 + 128]; \ + t = REDS2(n * alpha_tab[52 + 2 * 1]); \ + q[(rb) + 52 + 2] = m + t; \ + q[(rb) + 52 + 2 + 128] = m - t; \ + m = q[(rb) + 52 + 3]; \ + n = q[(rb) + 52 + 3 + 128]; \ + t = REDS2(n * alpha_tab[52 + 3 * 1]); \ + q[(rb) + 52 + 3] = m + t; \ + q[(rb) + 52 + 3 + 128] = m - t; \ + m = q[(rb) + 56 + 0]; \ + n = q[(rb) + 56 + 0 + 128]; \ + t = REDS2(n * alpha_tab[56 + 0 * 1]); \ + q[(rb) + 56 + 0] = m + t; \ + q[(rb) + 56 + 0 + 128] = m - t; \ + m = q[(rb) + 56 + 1]; \ + n = q[(rb) + 56 + 1 + 128]; \ + t = REDS2(n * alpha_tab[56 + 1 * 1]); \ + q[(rb) + 56 + 1] = m + t; \ + q[(rb) + 56 + 1 + 128] = m - t; \ + m = q[(rb) + 56 + 2]; \ + n = q[(rb) + 56 + 2 + 128]; \ + t = REDS2(n * alpha_tab[56 + 2 * 1]); \ + q[(rb) + 56 + 2] = m + t; \ + q[(rb) + 56 + 2 + 128] = m - t; \ + m = q[(rb) + 56 + 3]; \ + n = q[(rb) + 56 + 3 + 128]; \ + t = REDS2(n * alpha_tab[56 + 3 * 1]); \ + q[(rb) + 56 + 3] = m + t; \ + q[(rb) + 56 + 3 + 128] = m - t; \ + m = q[(rb) + 60 + 0]; \ + n = q[(rb) + 60 + 0 + 128]; \ + t = REDS2(n * alpha_tab[60 + 0 * 1]); \ + q[(rb) + 60 + 0] = m + t; \ + q[(rb) + 60 + 0 + 128] = m - t; \ + m = q[(rb) + 60 + 1]; \ + n = q[(rb) + 60 + 1 + 128]; \ + t = REDS2(n * alpha_tab[60 + 1 * 1]); \ + q[(rb) + 60 + 1] = m + t; \ + q[(rb) + 60 + 1 + 128] = m - t; \ + m = q[(rb) + 60 + 2]; \ + n = q[(rb) + 60 + 2 + 128]; \ + t = REDS2(n * alpha_tab[60 + 2 * 1]); \ + q[(rb) + 60 + 2] = m + t; \ + q[(rb) + 60 + 2 + 128] = m - t; \ + m = q[(rb) + 60 + 3]; \ + n = q[(rb) + 60 + 3 + 128]; \ + t = REDS2(n * alpha_tab[60 + 3 * 1]); \ + q[(rb) + 60 + 3] = m + t; \ + q[(rb) + 60 + 3 + 128] = m - t; \ + m = q[(rb) + 64 + 0]; \ + n = q[(rb) + 64 + 0 + 128]; \ + t = REDS2(n * alpha_tab[64 + 0 * 1]); \ + q[(rb) + 64 + 0] = m + t; \ + q[(rb) + 64 + 0 + 128] = m - t; \ + m = q[(rb) + 64 + 1]; \ + n = q[(rb) + 64 + 1 + 128]; \ + t = REDS2(n * alpha_tab[64 + 1 * 1]); \ + q[(rb) + 64 + 1] = m + t; \ + q[(rb) + 64 + 1 + 128] = m - t; \ + m = q[(rb) + 64 + 2]; \ + n = q[(rb) + 64 + 2 + 128]; \ + t = REDS2(n * alpha_tab[64 + 2 * 1]); \ + q[(rb) + 64 + 2] = m + t; \ + q[(rb) + 64 + 2 + 128] = m - t; \ + m = q[(rb) + 64 + 3]; \ + n = q[(rb) + 64 + 3 + 128]; \ + t = REDS2(n * alpha_tab[64 + 3 * 1]); \ + q[(rb) + 64 + 3] = m + t; \ + q[(rb) + 64 + 3 + 128] = m - t; \ + m = q[(rb) + 68 + 0]; \ + n = q[(rb) + 68 + 0 + 128]; \ + t = REDS2(n * alpha_tab[68 + 0 * 1]); \ + q[(rb) + 68 + 0] = m + t; \ + q[(rb) + 68 + 0 + 128] = m - t; \ + m = q[(rb) + 68 + 1]; \ + n = q[(rb) + 68 + 1 + 128]; \ + t = REDS2(n * alpha_tab[68 + 1 * 1]); \ + q[(rb) + 68 + 1] = m + t; \ + q[(rb) + 68 + 1 + 128] = m - t; \ + m = q[(rb) + 68 + 2]; \ + n = q[(rb) + 68 + 2 + 128]; \ + t = REDS2(n * alpha_tab[68 + 2 * 1]); \ + q[(rb) + 68 + 2] = m + t; \ + q[(rb) + 68 + 2 + 128] = m - t; \ + m = q[(rb) + 68 + 3]; \ + n = q[(rb) + 68 + 3 + 128]; \ + t = REDS2(n * alpha_tab[68 + 3 * 1]); \ + q[(rb) + 68 + 3] = m + t; \ + q[(rb) + 68 + 3 + 128] = m - t; \ + m = q[(rb) + 72 + 0]; \ + n = q[(rb) + 72 + 0 + 128]; \ + t = REDS2(n * alpha_tab[72 + 0 * 1]); \ 
+ q[(rb) + 72 + 0] = m + t; \ + q[(rb) + 72 + 0 + 128] = m - t; \ + m = q[(rb) + 72 + 1]; \ + n = q[(rb) + 72 + 1 + 128]; \ + t = REDS2(n * alpha_tab[72 + 1 * 1]); \ + q[(rb) + 72 + 1] = m + t; \ + q[(rb) + 72 + 1 + 128] = m - t; \ + m = q[(rb) + 72 + 2]; \ + n = q[(rb) + 72 + 2 + 128]; \ + t = REDS2(n * alpha_tab[72 + 2 * 1]); \ + q[(rb) + 72 + 2] = m + t; \ + q[(rb) + 72 + 2 + 128] = m - t; \ + m = q[(rb) + 72 + 3]; \ + n = q[(rb) + 72 + 3 + 128]; \ + t = REDS2(n * alpha_tab[72 + 3 * 1]); \ + q[(rb) + 72 + 3] = m + t; \ + q[(rb) + 72 + 3 + 128] = m - t; \ + m = q[(rb) + 76 + 0]; \ + n = q[(rb) + 76 + 0 + 128]; \ + t = REDS2(n * alpha_tab[76 + 0 * 1]); \ + q[(rb) + 76 + 0] = m + t; \ + q[(rb) + 76 + 0 + 128] = m - t; \ + m = q[(rb) + 76 + 1]; \ + n = q[(rb) + 76 + 1 + 128]; \ + t = REDS2(n * alpha_tab[76 + 1 * 1]); \ + q[(rb) + 76 + 1] = m + t; \ + q[(rb) + 76 + 1 + 128] = m - t; \ + m = q[(rb) + 76 + 2]; \ + n = q[(rb) + 76 + 2 + 128]; \ + t = REDS2(n * alpha_tab[76 + 2 * 1]); \ + q[(rb) + 76 + 2] = m + t; \ + q[(rb) + 76 + 2 + 128] = m - t; \ + m = q[(rb) + 76 + 3]; \ + n = q[(rb) + 76 + 3 + 128]; \ + t = REDS2(n * alpha_tab[76 + 3 * 1]); \ + q[(rb) + 76 + 3] = m + t; \ + q[(rb) + 76 + 3 + 128] = m - t; \ + m = q[(rb) + 80 + 0]; \ + n = q[(rb) + 80 + 0 + 128]; \ + t = REDS2(n * alpha_tab[80 + 0 * 1]); \ + q[(rb) + 80 + 0] = m + t; \ + q[(rb) + 80 + 0 + 128] = m - t; \ + m = q[(rb) + 80 + 1]; \ + n = q[(rb) + 80 + 1 + 128]; \ + t = REDS2(n * alpha_tab[80 + 1 * 1]); \ + q[(rb) + 80 + 1] = m + t; \ + q[(rb) + 80 + 1 + 128] = m - t; \ + m = q[(rb) + 80 + 2]; \ + n = q[(rb) + 80 + 2 + 128]; \ + t = REDS2(n * alpha_tab[80 + 2 * 1]); \ + q[(rb) + 80 + 2] = m + t; \ + q[(rb) + 80 + 2 + 128] = m - t; \ + m = q[(rb) + 80 + 3]; \ + n = q[(rb) + 80 + 3 + 128]; \ + t = REDS2(n * alpha_tab[80 + 3 * 1]); \ + q[(rb) + 80 + 3] = m + t; \ + q[(rb) + 80 + 3 + 128] = m - t; \ + m = q[(rb) + 84 + 0]; \ + n = q[(rb) + 84 + 0 + 128]; \ + t = REDS2(n * alpha_tab[84 + 0 * 1]); \ + q[(rb) + 84 + 0] = m + t; \ + q[(rb) + 84 + 0 + 128] = m - t; \ + m = q[(rb) + 84 + 1]; \ + n = q[(rb) + 84 + 1 + 128]; \ + t = REDS2(n * alpha_tab[84 + 1 * 1]); \ + q[(rb) + 84 + 1] = m + t; \ + q[(rb) + 84 + 1 + 128] = m - t; \ + m = q[(rb) + 84 + 2]; \ + n = q[(rb) + 84 + 2 + 128]; \ + t = REDS2(n * alpha_tab[84 + 2 * 1]); \ + q[(rb) + 84 + 2] = m + t; \ + q[(rb) + 84 + 2 + 128] = m - t; \ + m = q[(rb) + 84 + 3]; \ + n = q[(rb) + 84 + 3 + 128]; \ + t = REDS2(n * alpha_tab[84 + 3 * 1]); \ + q[(rb) + 84 + 3] = m + t; \ + q[(rb) + 84 + 3 + 128] = m - t; \ + m = q[(rb) + 88 + 0]; \ + n = q[(rb) + 88 + 0 + 128]; \ + t = REDS2(n * alpha_tab[88 + 0 * 1]); \ + q[(rb) + 88 + 0] = m + t; \ + q[(rb) + 88 + 0 + 128] = m - t; \ + m = q[(rb) + 88 + 1]; \ + n = q[(rb) + 88 + 1 + 128]; \ + t = REDS2(n * alpha_tab[88 + 1 * 1]); \ + q[(rb) + 88 + 1] = m + t; \ + q[(rb) + 88 + 1 + 128] = m - t; \ + m = q[(rb) + 88 + 2]; \ + n = q[(rb) + 88 + 2 + 128]; \ + t = REDS2(n * alpha_tab[88 + 2 * 1]); \ + q[(rb) + 88 + 2] = m + t; \ + q[(rb) + 88 + 2 + 128] = m - t; \ + m = q[(rb) + 88 + 3]; \ + n = q[(rb) + 88 + 3 + 128]; \ + t = REDS2(n * alpha_tab[88 + 3 * 1]); \ + q[(rb) + 88 + 3] = m + t; \ + q[(rb) + 88 + 3 + 128] = m - t; \ + m = q[(rb) + 92 + 0]; \ + n = q[(rb) + 92 + 0 + 128]; \ + t = REDS2(n * alpha_tab[92 + 0 * 1]); \ + q[(rb) + 92 + 0] = m + t; \ + q[(rb) + 92 + 0 + 128] = m - t; \ + m = q[(rb) + 92 + 1]; \ + n = q[(rb) + 92 + 1 + 128]; \ + t = REDS2(n * alpha_tab[92 + 1 * 1]); \ + q[(rb) + 92 + 1] = m + t; \ + q[(rb) + 92 + 1 + 128] = m - t; \ + 
m = q[(rb) + 92 + 2]; \ + n = q[(rb) + 92 + 2 + 128]; \ + t = REDS2(n * alpha_tab[92 + 2 * 1]); \ + q[(rb) + 92 + 2] = m + t; \ + q[(rb) + 92 + 2 + 128] = m - t; \ + m = q[(rb) + 92 + 3]; \ + n = q[(rb) + 92 + 3 + 128]; \ + t = REDS2(n * alpha_tab[92 + 3 * 1]); \ + q[(rb) + 92 + 3] = m + t; \ + q[(rb) + 92 + 3 + 128] = m - t; \ + m = q[(rb) + 96 + 0]; \ + n = q[(rb) + 96 + 0 + 128]; \ + t = REDS2(n * alpha_tab[96 + 0 * 1]); \ + q[(rb) + 96 + 0] = m + t; \ + q[(rb) + 96 + 0 + 128] = m - t; \ + m = q[(rb) + 96 + 1]; \ + n = q[(rb) + 96 + 1 + 128]; \ + t = REDS2(n * alpha_tab[96 + 1 * 1]); \ + q[(rb) + 96 + 1] = m + t; \ + q[(rb) + 96 + 1 + 128] = m - t; \ + m = q[(rb) + 96 + 2]; \ + n = q[(rb) + 96 + 2 + 128]; \ + t = REDS2(n * alpha_tab[96 + 2 * 1]); \ + q[(rb) + 96 + 2] = m + t; \ + q[(rb) + 96 + 2 + 128] = m - t; \ + m = q[(rb) + 96 + 3]; \ + n = q[(rb) + 96 + 3 + 128]; \ + t = REDS2(n * alpha_tab[96 + 3 * 1]); \ + q[(rb) + 96 + 3] = m + t; \ + q[(rb) + 96 + 3 + 128] = m - t; \ + m = q[(rb) + 100 + 0]; \ + n = q[(rb) + 100 + 0 + 128]; \ + t = REDS2(n * alpha_tab[100 + 0 * 1]); \ + q[(rb) + 100 + 0] = m + t; \ + q[(rb) + 100 + 0 + 128] = m - t; \ + m = q[(rb) + 100 + 1]; \ + n = q[(rb) + 100 + 1 + 128]; \ + t = REDS2(n * alpha_tab[100 + 1 * 1]); \ + q[(rb) + 100 + 1] = m + t; \ + q[(rb) + 100 + 1 + 128] = m - t; \ + m = q[(rb) + 100 + 2]; \ + n = q[(rb) + 100 + 2 + 128]; \ + t = REDS2(n * alpha_tab[100 + 2 * 1]); \ + q[(rb) + 100 + 2] = m + t; \ + q[(rb) + 100 + 2 + 128] = m - t; \ + m = q[(rb) + 100 + 3]; \ + n = q[(rb) + 100 + 3 + 128]; \ + t = REDS2(n * alpha_tab[100 + 3 * 1]); \ + q[(rb) + 100 + 3] = m + t; \ + q[(rb) + 100 + 3 + 128] = m - t; \ + m = q[(rb) + 104 + 0]; \ + n = q[(rb) + 104 + 0 + 128]; \ + t = REDS2(n * alpha_tab[104 + 0 * 1]); \ + q[(rb) + 104 + 0] = m + t; \ + q[(rb) + 104 + 0 + 128] = m - t; \ + m = q[(rb) + 104 + 1]; \ + n = q[(rb) + 104 + 1 + 128]; \ + t = REDS2(n * alpha_tab[104 + 1 * 1]); \ + q[(rb) + 104 + 1] = m + t; \ + q[(rb) + 104 + 1 + 128] = m - t; \ + m = q[(rb) + 104 + 2]; \ + n = q[(rb) + 104 + 2 + 128]; \ + t = REDS2(n * alpha_tab[104 + 2 * 1]); \ + q[(rb) + 104 + 2] = m + t; \ + q[(rb) + 104 + 2 + 128] = m - t; \ + m = q[(rb) + 104 + 3]; \ + n = q[(rb) + 104 + 3 + 128]; \ + t = REDS2(n * alpha_tab[104 + 3 * 1]); \ + q[(rb) + 104 + 3] = m + t; \ + q[(rb) + 104 + 3 + 128] = m - t; \ + m = q[(rb) + 108 + 0]; \ + n = q[(rb) + 108 + 0 + 128]; \ + t = REDS2(n * alpha_tab[108 + 0 * 1]); \ + q[(rb) + 108 + 0] = m + t; \ + q[(rb) + 108 + 0 + 128] = m - t; \ + m = q[(rb) + 108 + 1]; \ + n = q[(rb) + 108 + 1 + 128]; \ + t = REDS2(n * alpha_tab[108 + 1 * 1]); \ + q[(rb) + 108 + 1] = m + t; \ + q[(rb) + 108 + 1 + 128] = m - t; \ + m = q[(rb) + 108 + 2]; \ + n = q[(rb) + 108 + 2 + 128]; \ + t = REDS2(n * alpha_tab[108 + 2 * 1]); \ + q[(rb) + 108 + 2] = m + t; \ + q[(rb) + 108 + 2 + 128] = m - t; \ + m = q[(rb) + 108 + 3]; \ + n = q[(rb) + 108 + 3 + 128]; \ + t = REDS2(n * alpha_tab[108 + 3 * 1]); \ + q[(rb) + 108 + 3] = m + t; \ + q[(rb) + 108 + 3 + 128] = m - t; \ + m = q[(rb) + 112 + 0]; \ + n = q[(rb) + 112 + 0 + 128]; \ + t = REDS2(n * alpha_tab[112 + 0 * 1]); \ + q[(rb) + 112 + 0] = m + t; \ + q[(rb) + 112 + 0 + 128] = m - t; \ + m = q[(rb) + 112 + 1]; \ + n = q[(rb) + 112 + 1 + 128]; \ + t = REDS2(n * alpha_tab[112 + 1 * 1]); \ + q[(rb) + 112 + 1] = m + t; \ + q[(rb) + 112 + 1 + 128] = m - t; \ + m = q[(rb) + 112 + 2]; \ + n = q[(rb) + 112 + 2 + 128]; \ + t = REDS2(n * alpha_tab[112 + 2 * 1]); \ + q[(rb) + 112 + 2] = m + t; \ + q[(rb) + 112 + 2 + 128] = m - 
t; \ + m = q[(rb) + 112 + 3]; \ + n = q[(rb) + 112 + 3 + 128]; \ + t = REDS2(n * alpha_tab[112 + 3 * 1]); \ + q[(rb) + 112 + 3] = m + t; \ + q[(rb) + 112 + 3 + 128] = m - t; \ + m = q[(rb) + 116 + 0]; \ + n = q[(rb) + 116 + 0 + 128]; \ + t = REDS2(n * alpha_tab[116 + 0 * 1]); \ + q[(rb) + 116 + 0] = m + t; \ + q[(rb) + 116 + 0 + 128] = m - t; \ + m = q[(rb) + 116 + 1]; \ + n = q[(rb) + 116 + 1 + 128]; \ + t = REDS2(n * alpha_tab[116 + 1 * 1]); \ + q[(rb) + 116 + 1] = m + t; \ + q[(rb) + 116 + 1 + 128] = m - t; \ + m = q[(rb) + 116 + 2]; \ + n = q[(rb) + 116 + 2 + 128]; \ + t = REDS2(n * alpha_tab[116 + 2 * 1]); \ + q[(rb) + 116 + 2] = m + t; \ + q[(rb) + 116 + 2 + 128] = m - t; \ + m = q[(rb) + 116 + 3]; \ + n = q[(rb) + 116 + 3 + 128]; \ + t = REDS2(n * alpha_tab[116 + 3 * 1]); \ + q[(rb) + 116 + 3] = m + t; \ + q[(rb) + 116 + 3 + 128] = m - t; \ + m = q[(rb) + 120 + 0]; \ + n = q[(rb) + 120 + 0 + 128]; \ + t = REDS2(n * alpha_tab[120 + 0 * 1]); \ + q[(rb) + 120 + 0] = m + t; \ + q[(rb) + 120 + 0 + 128] = m - t; \ + m = q[(rb) + 120 + 1]; \ + n = q[(rb) + 120 + 1 + 128]; \ + t = REDS2(n * alpha_tab[120 + 1 * 1]); \ + q[(rb) + 120 + 1] = m + t; \ + q[(rb) + 120 + 1 + 128] = m - t; \ + m = q[(rb) + 120 + 2]; \ + n = q[(rb) + 120 + 2 + 128]; \ + t = REDS2(n * alpha_tab[120 + 2 * 1]); \ + q[(rb) + 120 + 2] = m + t; \ + q[(rb) + 120 + 2 + 128] = m - t; \ + m = q[(rb) + 120 + 3]; \ + n = q[(rb) + 120 + 3 + 128]; \ + t = REDS2(n * alpha_tab[120 + 3 * 1]); \ + q[(rb) + 120 + 3] = m + t; \ + q[(rb) + 120 + 3 + 128] = m - t; \ + m = q[(rb) + 124 + 0]; \ + n = q[(rb) + 124 + 0 + 128]; \ + t = REDS2(n * alpha_tab[124 + 0 * 1]); \ + q[(rb) + 124 + 0] = m + t; \ + q[(rb) + 124 + 0 + 128] = m - t; \ + m = q[(rb) + 124 + 1]; \ + n = q[(rb) + 124 + 1 + 128]; \ + t = REDS2(n * alpha_tab[124 + 1 * 1]); \ + q[(rb) + 124 + 1] = m + t; \ + q[(rb) + 124 + 1 + 128] = m - t; \ + m = q[(rb) + 124 + 2]; \ + n = q[(rb) + 124 + 2 + 128]; \ + t = REDS2(n * alpha_tab[124 + 2 * 1]); \ + q[(rb) + 124 + 2] = m + t; \ + q[(rb) + 124 + 2 + 128] = m - t; \ + m = q[(rb) + 124 + 3]; \ + n = q[(rb) + 124 + 3 + 128]; \ + t = REDS2(n * alpha_tab[124 + 3 * 1]); \ + q[(rb) + 124 + 3] = m + t; \ + q[(rb) + 124 + 3 + 128] = m - t; \ + } while (0) + +/* + * Output ranges: + * d0: min= 0 max= 1020 + * d1: min= -67 max= 4587 + * d2: min=-4335 max= 4335 + * d3: min=-4147 max= 507 + * d4: min= -510 max= 510 + * d5: min= -252 max= 4402 + * d6: min=-4335 max= 4335 + * d7: min=-4332 max= 322 + */ +#define FFT8(xb, xs, d) do { \ + s32 x0 = x[(xb)]; \ + s32 x1 = x[(xb) + (xs)]; \ + s32 x2 = x[(xb) + 2 * (xs)]; \ + s32 x3 = x[(xb) + 3 * (xs)]; \ + s32 a0 = x0 + x2; \ + s32 a1 = x0 + (x2 << 4); \ + s32 a2 = x0 - x2; \ + s32 a3 = x0 - (x2 << 4); \ + s32 b0 = x1 + x3; \ + s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \ + s32 b2 = (x1 << 4) - (x3 << 4); \ + s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \ + d ## 0 = a0 + b0; \ + d ## 1 = a1 + b1; \ + d ## 2 = a2 + b2; \ + d ## 3 = a3 + b3; \ + d ## 4 = a0 - b0; \ + d ## 5 = a1 - b1; \ + d ## 6 = a2 - b2; \ + d ## 7 = a3 - b3; \ + } while (0) + +/* + * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced + * to some shifting. 
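A note on why the shift trick works: the transform here is a number-theoretic FFT over Z/257, and 2^8 = 256 is congruent to -1 while 2^16 is congruent to 1 (mod 257), so alpha = 2 is a 16th root of unity and multiplying by alpha^i is just a left shift by i. That is exactly what the (d2_j << j) terms in FFT16 below do, with the REDS1/REDS2 helpers used throughout folding the shifted-out high bits back in later. A minimal host-side sketch of the underlying identity (illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdio.h>

    /* naive modular exponentiation, small inputs only */
    static int pow_mod257(int base, int e)
    {
        int r = 1;
        for (int i = 0; i < e; i++)
            r = (r * base) % 257;
        return r;
    }

    int main(void)
    {
        assert(pow_mod257(2, 8)  == 256);  /* 2^8  == -1 (mod 257) */
        assert(pow_mod257(2, 16) == 1);    /* 2^16 ==  1 (mod 257), so 2 has order 16 */
        /* hence (x << j) equals x * alpha^j mod 257, up to a later reduction */
        printf("alpha = 2 is a 16th root of unity mod 257\n");
        return 0;
    }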
+ * + * Output: within -591471..591723 + */ +#define FFT16(xb, xs, rb) do { \ + s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \ + s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \ + FFT8(xb, (xs) << 1, d1_); \ + FFT8((xb) + (xs), (xs) << 1, d2_); \ + q[(rb) + 0] = d1_0 + d2_0; \ + q[(rb) + 1] = d1_1 + (d2_1 << 1); \ + q[(rb) + 2] = d1_2 + (d2_2 << 2); \ + q[(rb) + 3] = d1_3 + (d2_3 << 3); \ + q[(rb) + 4] = d1_4 + (d2_4 << 4); \ + q[(rb) + 5] = d1_5 + (d2_5 << 5); \ + q[(rb) + 6] = d1_6 + (d2_6 << 6); \ + q[(rb) + 7] = d1_7 + (d2_7 << 7); \ + q[(rb) + 8] = d1_0 - d2_0; \ + q[(rb) + 9] = d1_1 - (d2_1 << 1); \ + q[(rb) + 10] = d1_2 - (d2_2 << 2); \ + q[(rb) + 11] = d1_3 - (d2_3 << 3); \ + q[(rb) + 12] = d1_4 - (d2_4 << 4); \ + q[(rb) + 13] = d1_5 - (d2_5 << 5); \ + q[(rb) + 14] = d1_6 - (d2_6 << 6); \ + q[(rb) + 15] = d1_7 - (d2_7 << 7); \ + } while (0) + +/* + * Output range: |q| <= 1183446 + */ +#define FFT32(xb, xs, rb, id) do { \ + FFT16(xb, (xs) << 1, rb); \ + FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \ + FFT_LOOP_16_8(rb); \ + } while (0) + +/* + * Output range: |q| <= 2366892 + */ +#define FFT64(xb, xs, rb) do { \ + FFT32(xb, (xs) << 1, (rb), label_a); \ + FFT32((xb) + (xs), (xs) << 1, (rb) + 32, label_b); \ + FFT_LOOP_32_4(rb); \ + } while (0) + +/* + * Output range: |q| <= 9467568 + */ +#define FFT256(xb, xs, rb, id) do { \ + FFT64((xb) + ((xs) * 0), (xs) << 2, (rb + 0)); \ + FFT64((xb) + ((xs) * 2), (xs) << 2, (rb + 64)); \ + FFT_LOOP_64_2(rb); \ + FFT64((xb) + ((xs) * 1), (xs) << 2, (rb + 128)); \ + FFT64((xb) + ((xs) * 3), (xs) << 2, (rb + 192)); \ + FFT_LOOP_64_2((rb) + 128); \ + FFT_LOOP_128_1(rb); \ + } while (0) + +/* + * beta^(255*i) mod 257 + */ +__constant__ static const unsigned short yoff_b_n[] = { + 1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172, + 23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101, + 15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10, + 88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230, + 225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150, + 35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109, + 34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194, + 11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93, + 253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83, + 165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110, + 197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217, + 162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108, + 128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171, + 117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78, + 121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252, + 213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142, + 16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182, + 111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74, + 240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160, + 123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82, + 2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87, + 46, 45, 139, 41 +}; + +#define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \ + + ((u32)((h) * (mm)) << 16)) + +#define W_BIG(sb, o1, o2, mm) \ + (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 6 + o1], 
q[16 * (sb) + 2 * 6 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm) + +#define WB_0_0 W_BIG( 4, 0, 1, 185) +#define WB_0_1 W_BIG( 6, 0, 1, 185) +#define WB_0_2 W_BIG( 0, 0, 1, 185) +#define WB_0_3 W_BIG( 2, 0, 1, 185) +#define WB_0_4 W_BIG( 7, 0, 1, 185) +#define WB_0_5 W_BIG( 5, 0, 1, 185) +#define WB_0_6 W_BIG( 3, 0, 1, 185) +#define WB_0_7 W_BIG( 1, 0, 1, 185) +#define WB_1_0 W_BIG(15, 0, 1, 185) +#define WB_1_1 W_BIG(11, 0, 1, 185) +#define WB_1_2 W_BIG(12, 0, 1, 185) +#define WB_1_3 W_BIG( 8, 0, 1, 185) +#define WB_1_4 W_BIG( 9, 0, 1, 185) +#define WB_1_5 W_BIG(13, 0, 1, 185) +#define WB_1_6 W_BIG(10, 0, 1, 185) +#define WB_1_7 W_BIG(14, 0, 1, 185) +#define WB_2_0 W_BIG(17, -256, -128, 233) +#define WB_2_1 W_BIG(18, -256, -128, 233) +#define WB_2_2 W_BIG(23, -256, -128, 233) +#define WB_2_3 W_BIG(20, -256, -128, 233) +#define WB_2_4 W_BIG(22, -256, -128, 233) +#define WB_2_5 W_BIG(21, -256, -128, 233) +#define WB_2_6 W_BIG(16, -256, -128, 233) +#define WB_2_7 W_BIG(19, -256, -128, 233) +#define WB_3_0 W_BIG(30, -383, -255, 233) +#define WB_3_1 W_BIG(24, -383, -255, 233) +#define WB_3_2 W_BIG(25, -383, -255, 233) +#define WB_3_3 W_BIG(31, -383, -255, 233) +#define WB_3_4 W_BIG(27, -383, -255, 233) +#define WB_3_5 W_BIG(29, -383, -255, 233) +#define WB_3_6 W_BIG(28, -383, -255, 233) +#define WB_3_7 W_BIG(26, -383, -255, 233) + +#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) + +#define PP4_0_0 1 +#define PP4_0_1 0 +#define PP4_0_2 3 +#define PP4_0_3 2 +#define PP4_1_0 2 +#define PP4_1_1 3 +#define PP4_1_2 0 +#define PP4_1_3 1 +#define PP4_2_0 3 +#define PP4_2_1 2 +#define PP4_2_2 1 +#define PP4_2_3 0 + +#define PP8_0_0 1 +#define PP8_0_1 0 +#define PP8_0_2 3 +#define PP8_0_3 2 +#define PP8_0_4 5 +#define PP8_0_5 4 +#define PP8_0_6 7 +#define PP8_0_7 6 + +#define PP8_1_0 6 +#define PP8_1_1 7 +#define PP8_1_2 4 +#define PP8_1_3 5 +#define PP8_1_4 2 +#define PP8_1_5 3 +#define PP8_1_6 0 +#define PP8_1_7 1 + +#define PP8_2_0 2 +#define PP8_2_1 3 +#define PP8_2_2 0 +#define PP8_2_3 1 +#define PP8_2_4 6 +#define PP8_2_5 7 +#define PP8_2_6 4 +#define PP8_2_7 5 + +#define PP8_3_0 3 +#define PP8_3_1 2 +#define PP8_3_2 1 +#define PP8_3_3 0 +#define PP8_3_4 7 +#define PP8_3_5 6 +#define PP8_3_6 5 +#define PP8_3_7 4 + +#define PP8_4_0 5 +#define PP8_4_1 4 +#define PP8_4_2 7 +#define PP8_4_3 6 +#define PP8_4_4 1 +#define PP8_4_5 0 +#define PP8_4_6 3 +#define PP8_4_7 2 + +#define PP8_5_0 7 +#define PP8_5_1 6 +#define PP8_5_2 5 +#define PP8_5_3 4 +#define PP8_5_4 3 +#define PP8_5_5 2 +#define PP8_5_6 1 +#define PP8_5_7 0 + +#define PP8_6_0 4 +#define PP8_6_1 5 +#define PP8_6_2 6 +#define PP8_6_3 7 +#define PP8_6_4 0 +#define PP8_6_5 1 +#define PP8_6_6 2 +#define PP8_6_7 3 + +#define STEP_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA ## n; \ + } while (0) + +#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + u32 tA4 = ROL32(A4, r); \ + u32 tA5 = ROL32(A5, r); \ + u32 tA6 = ROL32(A6, r); \ + u32 tA7 = ROL32(A7, r); \ + STEP_ELT(0, w0, fun, s, pp8b); \ + STEP_ELT(1, w1, fun, s, pp8b); \ + STEP_ELT(2, w2, fun, s, pp8b); \ + STEP_ELT(3, w3, fun, s, pp8b); \ + STEP_ELT(4, w4, fun, s, pp8b); \ + STEP_ELT(5, w5, fun, s, pp8b); \ + 
STEP_ELT(6, w6, fun, s, pp8b); \ + STEP_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +#define SIMD_M3_0_0 0_ +#define SIMD_M3_1_0 1_ +#define SIMD_M3_2_0 2_ +#define SIMD_M3_3_0 0_ +#define SIMD_M3_4_0 1_ +#define SIMD_M3_5_0 2_ +#define SIMD_M3_6_0 0_ +#define SIMD_M3_7_0 1_ + +#define SIMD_M3_0_1 1_ +#define SIMD_M3_1_1 2_ +#define SIMD_M3_2_1 0_ +#define SIMD_M3_3_1 1_ +#define SIMD_M3_4_1 2_ +#define SIMD_M3_5_1 0_ +#define SIMD_M3_6_1 1_ +#define SIMD_M3_7_1 2_ + +#define SIMD_M3_0_2 2_ +#define SIMD_M3_1_2 0_ +#define SIMD_M3_2_2 1_ +#define SIMD_M3_3_2 2_ +#define SIMD_M3_4_2 0_ +#define SIMD_M3_5_2 1_ +#define SIMD_M3_6_2 2_ +#define SIMD_M3_7_2 0_ + +#define M7_0_0 0_ +#define M7_1_0 1_ +#define M7_2_0 2_ +#define M7_3_0 3_ +#define M7_4_0 4_ +#define M7_5_0 5_ +#define M7_6_0 6_ +#define M7_7_0 0_ + +#define M7_0_1 1_ +#define M7_1_1 2_ +#define M7_2_1 3_ +#define M7_3_1 4_ +#define M7_4_1 5_ +#define M7_5_1 6_ +#define M7_6_1 0_ +#define M7_7_1 1_ + +#define M7_0_2 2_ +#define M7_1_2 3_ +#define M7_2_2 4_ +#define M7_3_2 5_ +#define M7_4_2 6_ +#define M7_5_2 0_ +#define M7_6_2 1_ +#define M7_7_2 2_ + +#define M7_0_3 3_ +#define M7_1_3 4_ +#define M7_2_3 5_ +#define M7_3_3 6_ +#define M7_4_3 0_ +#define M7_5_3 1_ +#define M7_6_3 2_ +#define M7_7_3 3_ + +#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b) + +#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \ + STEP_BIG_(WB_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \ + } while (0) + +//__constant__ static const s32 SIMD_Q_64[] = { +// 4, 28, -80, -120, -47, -126, 45, -123, -92, -127, -70, 23, -23, -24, 40, -125, 101, 122, 34, -24, -119, 110, -121, -112, 32, 24, 51, 73, -117, -64, -21, 42, -60, 16, 5, 85, 107, 52, -44, -96, 42, 127, -18, -108, -47, 26, 91, 117, 112, 46, 87, 79, 126, -120, 65, -24, 121, 29, 118, -7, -53, 85, -98, -117, 32, 115, -47, -116, 63, 16, -108, 49, -119, 57, -110, 4, -76, -76, -42, -86, 58, 115, 4, 4, -83, -51, -37, 116, 32, 15, 36, -42, 73, -99, 94, 87, 60, -20, 67, 12, -76, 55, 117, -68, -82, -80, 93, -20, 92, -21, -128, -91, -11, 84, -28, 76, 94, -124, 37, 93, 17, -78, -106, -29, 88, -15, -47, 102, -4, -28, 80, 120, 47, 126, -45, 123, 92, 127, 70, -23, 23, 24, -40, 125, -101, -122, -34, 24, 119, -110, 121, 112, -32, -24, -51, -73, 117, 64, 21, -42, 60, -16, -5, -85, -107, -52, 44, 96, -42, -127, 18, 108, 47, -26, -91, -117, -112, -46, -87, -79, -126, 120, -65, 24, -121, -29, -118, 7, 53, -85, 98, 117, -32, -115, 47, 116, -63, -16, 108, -49, 119, -57, 110, -4, 76, 76, 42, 86, -58, -115, -4, -4, 83, 51, 37, -116, -32, -15, -36, 42, -73, 99, -94, -87, -60, 20, -67, -12, 76, -55, -117, 68, 82, 80, -93, 20, -92, 21, 128, 91, 11, -84, 28, -76, -94, 124, -37, -93, -17, 78, 106, 29, -88, 15, 47, -102 +//}; +__constant__ static const s32 SIMD_Q_80[] = { + -125, -101, 48, 8, 81, 2, -84, 5, 36, 1, 58, -106, 105, 104, -89, 3, -28, -7, -95, 104, 9, -19, 7, 16, -97, -105, -78, -56, 11, 64, 107, -87, 68, -113, -124, -44, -22, -77, 84, 32, -87, -2, 110, 20, 81, -103, -38, -12, -17, -83, 
-42, -50, -3, 8, -64, 104, -8, -100, -11, 121, 75, -44, 30, 11, -97, -14, 81, 12, -66, -113, 20, -80, 9, -72, 18, -125, 52, 52, 86, 42, -71, -14, -125, -125, 45, 77, 91, -13, -97, -114, -93, 86, -56, 29, -35, -42, -69, 108, -62, -117, 52, -74, -12, 60, 46, 48, -36, 108, -37, 107, 0, 37, 117, -45, 100, -53, -35, 4, -92, -36, -112, 50, 22, 99, -41, 113, 81, -27, 124, 100, -49, -9, -82, -3, 83, -6, -37, -2, -59, 105, -106, -105, 88, -4, 27, 6, 94, -105, -10, 18, -8, -17, 96, 104, 77, 55, -12, -65, -108, 86, -69, 112, 123, 43, 21, 76, -85, -33, 86, 1, -111, -21, -82, 102, 37, 11, 16, 82, 41, 49, 2, -9, 63, -105, 7, 99, 10, -122, -76, 43, -31, -12, 96, 13, -82, -13, 65, 112, -21, 79, -10, 71, -19, 124, -53, -53, -87, -43, 70, 13, 124, 124, -46, -78, -92, 12, 96, 113, 92, -87, 55, -30, 34, 41, 68, -109, 61, 116, -53, 73, 11, -61, -47, -49, 35, -109, 36, -108, -1, -38, -118, 44, -101, 52, 34, -5, 91, 35, 111, -51, -23, -100, 40, -114, -82, 26 +}; + +__constant__ static uint32_t c_PaddedMessage80[20]; + +__host__ +void x16_simd512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +#define TPB_SIMD 128 +__global__ +__launch_bounds__(TPB_SIMD,1) +static void x16_simd512_gpu_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_outputhash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t A[20]; + #pragma unroll 10 + for (int i=0; i < 20; i += 2) + AS_UINT2(&A[i]) = AS_UINT2(&c_PaddedMessage80[i]); + A[19] = cuda_swab32(startNonce + thread); + + // simd + unsigned char x[128]; + #pragma unroll + for (int i = 0; i < 20; i += 2) + AS_UINT2(&x[i*4]) = AS_UINT2(&A[i]); + #pragma unroll + for(int i = 80; i < 128; i+=4) AS_U32(&x[i]) = 0; + + // SIMD_IV512 + u32 A0 = 0x0BA16B95, A1 = 0x72F999AD, A2 = 0x9FECC2AE, A3 = 0xBA3264FC, A4 = 0x5E894929, A5 = 0x8E9F30E5, A6 = 0x2F1DAA37, A7 = 0xF0F2C558; + u32 B0 = 0xAC506643, B1 = 0xA90635A5, B2 = 0xE25B878B, B3 = 0xAAB7878F, B4 = 0x88817F7A, B5 = 0x0A02892B, B6 = 0x559A7550, B7 = 0x598F657E; + u32 C0 = 0x7EEF60A1, C1 = 0x6B70E3E8, C2 = 0x9C1714D1, C3 = 0xB958E2A8, C4 = 0xAB02675E, C5 = 0xED1C014F, C6 = 0xCD8D65BB, C7 = 0xFDB7A257; + u32 D0 = 0x09254899, D1 = 0xD699C7BC, D2 = 0x9019B6DC, D3 = 0x2B9022E4, D4 = 0x8FA14956, D5 = 0x21BF9BD3, D6 = 0xB94D0943, D7 = 0x6FFDDC22; + + s32 q[256]; + FFT256(0, 1, 0, ll1); + + #pragma unroll + for (int i = 0; i < 256; i ++) { + s32 tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
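After the FFT, the kernel adds the per-index offset from yoff_b_n (beta^(255*i) mod 257) to each coefficient and then reduces it to a balanced representative around zero (roughly -128..128); the ternary assigning q[i] in this loop is that last step, and it keeps the later 16-bit packing in INNER()/W_BIG() within range. A small host-side sketch of the balanced reduction (illustrative only, not part of the patch):

    #include <stdio.h>

    /* map x to its balanced representative mod 257, i.e. a value in [-128, 128] */
    static int balanced_mod257(int x)
    {
        int r = x % 257;                  /* C remainder, may be negative */
        if (r < 0) r += 257;              /* canonical range 0..256 */
        return (r <= 128) ? r : r - 257;  /* balanced range -128..128 */
    }

    int main(void)
    {
        printf("%d\n", balanced_mod257(200));   /* -57, since 200 - 257 = -57 */
        printf("%d\n", balanced_mod257(-300));  /* -43, since -300 + 257 = -43 */
        return 0;
    }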
tq : tq - 257); + } + + A0 ^= A[ 0]; + A1 ^= A[ 1]; + A2 ^= A[ 2]; + A3 ^= A[ 3]; + A4 ^= A[ 4]; + A5 ^= A[ 5]; + A6 ^= A[ 6]; + A7 ^= A[ 7]; + B0 ^= A[ 8]; + B1 ^= A[ 9]; + B2 ^= A[10]; + B3 ^= A[11]; + B4 ^= A[12]; + B5 ^= A[13]; + B6 ^= A[14]; + B7 ^= A[15]; + C0 ^= A[16]; + C1 ^= A[17]; + C2 ^= A[18]; + C3 ^= A[19]; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + IF, 4, 13, PP8_4_); + + STEP_BIG( + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + IF, 13, 10, PP8_5_); + + STEP_BIG( + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + IF, 10, 25, PP8_6_); + + STEP_BIG( + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), + IF, 25, 4, PP8_0_); + + // Second round + + u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; + u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; + u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; + u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7; + + #define q SIMD_Q_80 + + A0 ^= 0x280; // bitlen + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + COPY_A0, COPY_A1, COPY_A2, COPY_A3, + COPY_A4, COPY_A5, COPY_A6, COPY_A7, + IF, 4, 13, PP8_4_); + + STEP_BIG( + COPY_B0, COPY_B1, COPY_B2, COPY_B3, + COPY_B4, COPY_B5, COPY_B6, COPY_B7, + IF, 13, 10, PP8_5_); + + STEP_BIG( + COPY_C0, COPY_C1, COPY_C2, COPY_C3, + COPY_C4, COPY_C5, COPY_C6, COPY_C7, + IF, 10, 25, PP8_6_); + + STEP_BIG( + COPY_D0, COPY_D1, COPY_D2, COPY_D3, + COPY_D4, COPY_D5, COPY_D6, COPY_D7, + IF, 25, 4, PP8_0_); + + #undef q + + A[ 0] = A0; + A[ 1] = A1; + A[ 2] = A2; + A[ 3] = A3; + A[ 4] = A4; + A[ 5] = A5; + A[ 6] = A6; + A[ 7] = A7; + A[ 8] = B0; + A[ 9] = B1; + A[10] = B2; + A[11] = B3; + A[12] = B4; + A[13] = B5; + A[14] = B6; + A[15] = B7; + + const uint64_t hashPosition = thread; + uint32_t *Hash = (uint32_t*)(&g_outputhash[(size_t)8 * hashPosition]); + #pragma unroll + for (int i=0; i < 16; i += 2) + *(uint2*)&Hash[i] = *(uint2*)&A[i]; + } +} + +/***************************************************/ + +__host__ +void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash) +{ + const uint32_t tpb = 128; + const dim3 grid((threads + tpb - 1) / tpb); + const dim3 block(tpb); + x16_simd512_gpu_80 <<>> (threads, startNonce, (uint64_t*) d_hash); +} diff --git a/x16/x16r.cu b/x16/x16r.cu new file mode 100644 index 0000000000..2caa5d0f1b --- /dev/null +++ b/x16/x16r.cu @@ -0,0 +1,623 @@ +/** + * X16R algorithm (X16 with Randomized chain order) + * + * tpruvot 2018 - GPL code + */ + +#include +#include +#include + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include 
"sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" + +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_sha2.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x16.h" + +static uint32_t *d_hash[MAX_GPUS]; + +enum Algo { + BLAKE = 0, + BMW, + GROESTL, + JH, + KECCAK, + SKEIN, + LUFFA, + CUBEHASH, + SHAVITE, + SIMD, + ECHO, + HAMSI, + FUGUE, + SHABAL, + WHIRLPOOL, + SHA512, + HASH_FUNC_COUNT +}; + +static const char* algo_strings[] = { + "blake", + "bmw512", + "groestl", + "jh512", + "keccak", + "skein", + "luffa", + "cube", + "shavite", + "simd", + "echo", + "hamsi", + "fugue", + "shabal", + "whirlpool", + "sha512", + NULL +}; + +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 }; + +static void getAlgoString(const uint32_t* prevblock, char *output) +{ + char *sptr = output; + uint8_t* data = (uint8_t*)prevblock; + + for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) { + uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed + uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4; + if (algoDigit >= 10) + sprintf(sptr, "%c", 'A' + (algoDigit - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoDigit); + sptr++; + } + *sptr = '\0'; +} + +// X16R CPU Hash (Validation) +extern "C" void x16r_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[128]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + + void *in = (void*) input; + int size = 80; + + uint32_t *in32 = (uint32_t*) input; + getAlgoString(&in32[1], hashOrder); + + for (int i = 0; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, in, size); + sph_luffa512_close(&ctx_luffa, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, in, size); + sph_cubehash512_close(&ctx_cubehash, hash); + break; + case SHAVITE: + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, in, size); + sph_shavite512_close(&ctx_shavite, hash); + break; + case SIMD: + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, in, size); + sph_simd512_close(&ctx_simd, hash); + break; + case ECHO: + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, in, size); + sph_echo512_close(&ctx_echo, hash); + break; + case HAMSI: + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, in, size); + sph_hamsi512_close(&ctx_hamsi, hash); + break; + case FUGUE: + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, in, size); + sph_fugue512_close(&ctx_fugue, hash); + break; + case SHABAL: + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, in, size); + sph_shabal512_close(&ctx_shabal, hash); + break; + case WHIRLPOOL: + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, in, size); + sph_whirlpool_close(&ctx_whirlpool, hash); + break; + case SHA512: + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512,(const void*) in, size); + sph_sha512_close(&ctx_sha512,(void*) hash); + break; + } + in = (void*) hash; + size = 64; + } + memcpy(output, hash, 32); +} + +void whirlpool_midstate(void *state, const void *input) +{ + sph_whirlpool_context ctx; + + sph_whirlpool_init(&ctx); + sph_whirlpool(&ctx, input, 64); + + memcpy(state, ctx.state, 64); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +//#define _DEBUG +#define _DEBUG_PREFIX "x16r-" +#include "cuda_debug.cuh" + +//static int algo80_tests[HASH_FUNC_COUNT] = { 0 }; +//static int algo64_tests[HASH_FUNC_COUNT] = { 0 }; +static int algo80_fails[HASH_FUNC_COUNT] = { 0 }; + +extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
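For clarity on the chain-order derivation above: getAlgoString() reads the 16 nibbles of the previous block hash (bytes 7 down to 0 of the region starting at in32[1], high nibble first) and emits one hex character per nibble; each character is later decoded with elem >= 'A' ? elem - 'A' + 10 : elem - '0' and indexes the Algo enum (0 = blake ... F = sha512), so in X16R the same algorithm may appear several times or not at all in one chain. A standalone rendition of the same mapping for host-side experimentation (illustrative, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    /* reproduce the x16r nibble-to-order mapping: bytes 7..0, high nibble first */
    static void x16r_order(const uint8_t *prevblock /* 8 bytes */, char *out /* 17 bytes */)
    {
        for (int j = 0; j < 16; j++) {
            uint8_t b = (15 - j) >> 1;                                  /* walk bytes 7..0 */
            uint8_t d = (j & 1) ? (prevblock[b] & 0xF) : (prevblock[b] >> 4);
            out[j] = (d >= 10) ? ('A' + d - 10) : ('0' + d);
        }
        out[16] = '\0';
    }

    int main(void)
    {
        const uint8_t prev[8] = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF };
        char order[17];
        x16r_order(prev, order);
        printf("%s\n", order);   /* prints EFCDAB8967452301 */
        return 0;
    }

This byte order is also why the benchmark override below pokes ((uint8_t*)pdata)[8]: after be32enc(), that byte lands at data[7] of the region handed to getAlgoString() on a little-endian host, i.e. it supplies hashOrder[0] and hashOrder[1], and 0xAA selects echo for both the 80-byte and the first 64-byte stage, as the comment there notes.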
20 : 19; + if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); // 64 + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); // 64 + x16_echo512_cuda_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x16_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x16_whirlpool512_init(thr_id, throughput); + x17_sha512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + if (opt_benchmark) { + ((uint32_t*)ptarget)[7] = 0x003f; + //((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64 + ((uint8_t*)pdata)[8] = 0xAA; // hashOrder[0] = 'A'; for echo 80 + 64 + //((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64 + } + uint32_t _ALIGN(64) endiandata[20]; + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + uint32_t ntime = swab32(pdata[17]); + if (s_ntime != ntime) { + getAlgoString(&endiandata[1], hashOrder); + s_ntime = ntime; + if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime); + } + + cuda_check_cpu_setTarget(ptarget); + + char elem = hashOrder[0]; + const uint8_t algo80 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + x11_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + x16_hamsi512_setBlock_80((void*)endiandata); + break; + case FUGUE: + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + x16_sha512_setBlock_80(endiandata); + break; + default: { + return -1; + } + } + + int warn = 0; + + do { + int order = 0; + + // Hash with CUDA + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake80:"); + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw80 :"); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("grstl80:"); + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("jh51280:"); + break; + case KECCAK: + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("kecck80:"); + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + TRACE("skein80:"); + break; + case LUFFA: + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("luffa80:"); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("cube 80:"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("simd512:"); + break; + case ECHO: + x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("echo :"); + break; + case HAMSI: + x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("hamsi :"); + break; + case FUGUE: + x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("fugue :"); + break; + case SHABAL: + x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("shabal :"); + break; + case WHIRLPOOL: + x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("whirl :"); + break; + case SHA512: + x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + + for (int i = 1; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("cube :"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + break; + case ECHO: + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } + TRACE("echo :"); + break; + case HAMSI: + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("hamsi :"); + break; + case FUGUE: + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("fugue :"); + break; + case SHABAL: + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case WHIRLPOOL: + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case SHA512: + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); +#ifdef _DEBUG + uint32_t _ALIGN(64) dhash[8]; + be32enc(&endiandata[19], pdata[19]); + x16r_hash(dhash, endiandata); + applog_hash(dhash); + return -1; +#endif + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x16r_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x16r_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } +#if 0 + gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]); + + algo80_tests[algo80] += work->valid_nonces; + char oks64[128] = { 0 }; + char oks80[128] = { 0 }; + char fails[128] = { 0 }; + for (int a = 0; a < 
HASH_FUNC_COUNT; a++) { + const char elem = hashOrder[a]; + const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + if (a > 0) algo64_tests[algo64] += work->valid_nonces; + sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99); + sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99); + sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99); + } + applog(LOG_INFO, "K64: %s", oks64); + applog(LOG_INFO, "K80: %s", oks80); + applog(LOG_ERR, "F80: %s", fails); +#endif + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + // x11+ coins could do some random error, but not on retry + gpu_increment_reject(thr_id); + algo80_fails[algo80]++; + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s", + work->nonces[0], algo_strings[algo80], hashOrder); + warn = 0; + } + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_x16r(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ? + x15_whirlpool_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x16/x16s.cu b/x16/x16s.cu new file mode 100644 index 0000000000..382de413b3 --- /dev/null +++ b/x16/x16s.cu @@ -0,0 +1,602 @@ +/** + * X16S algorithm (X16 with Shuffled chain order) + * + * tpruvot 2018 - GPL code + */ + +#include +#include +#include + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" + +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_sha2.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x16.h" + +static uint32_t *d_hash[MAX_GPUS]; + +enum Algo { + BLAKE = 0, + BMW, + GROESTL, + JH, + KECCAK, + SKEIN, + LUFFA, + CUBEHASH, + SHAVITE, + SIMD, + ECHO, + HAMSI, + FUGUE, + SHABAL, + WHIRLPOOL, + SHA512, + HASH_FUNC_COUNT +}; + +static const char* algo_strings[] = { + "blake", + "bmw512", + "groestl", + "jh512", + "keccak", + "skein", + "luffa", + "cube", + "shavite", + "simd", + "echo", + "hamsi", + "fugue", + "shabal", + "whirlpool", + "sha512", + NULL +}; + +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 }; + +static void getAlgoString(const uint32_t* prevblock, char *output) +{ + uint8_t* data = (uint8_t*)prevblock; + + strcpy(output, "0123456789ABCDEF"); + + for (uint8_t i = 0; i < HASH_FUNC_COUNT; i++) { + uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed + uint8_t algoDigit = (i & 1) ? 
data[b] & 0xF : data[b] >> 4; + int offset = (int) algoDigit; + char oldVal = output[offset]; + for(int j=offset; j-->0;) + output[j+1] = output[j]; + output[0] = oldVal; + } +} + +// X16S CPU Hash (Validation) +extern "C" void x16s_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[128]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + + void *in = (void*) input; + int size = 80; + + uint32_t *in32 = (uint32_t*) input; + getAlgoString(&in32[1], hashOrder); + + for (int i = 0; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, in, size); + sph_luffa512_close(&ctx_luffa, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, in, size); + sph_cubehash512_close(&ctx_cubehash, hash); + break; + case SHAVITE: + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, in, size); + sph_shavite512_close(&ctx_shavite, hash); + break; + case SIMD: + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, in, size); + sph_simd512_close(&ctx_simd, hash); + break; + case ECHO: + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, in, size); + sph_echo512_close(&ctx_echo, hash); + break; + case HAMSI: + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, in, size); + sph_hamsi512_close(&ctx_hamsi, hash); + break; + case FUGUE: + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, in, size); + sph_fugue512_close(&ctx_fugue, hash); + break; + case SHABAL: + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, in, size); + sph_shabal512_close(&ctx_shabal, hash); + break; + case WHIRLPOOL: + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, in, size); + sph_whirlpool_close(&ctx_whirlpool, hash); + break; + case SHA512: + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512,(const void*) in, size); + sph_sha512_close(&ctx_sha512,(void*) hash); + break; + } + in = (void*) hash; + size = 64; + } + memcpy(output, hash, 32); +} + +#if 0 /* in x16r */ +void whirlpool_midstate(void *state, const void *input) +{ + sph_whirlpool_context ctx; + + 
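Unlike X16R, the getAlgoString() above starts from the fixed string "0123456789ABCDEF" and, for each nibble of the previous block hash, moves the character at that index to the front, so the result is always a permutation: every one of the 16 algorithms runs exactly once per hash, only the order changes. A host-side sketch of the same move-to-front shuffle (illustrative, not part of the patch):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* reproduce the x16s shuffle: each nibble moves one algorithm to the front */
    static void x16s_order(const uint8_t *prevblock /* 8 bytes */, char *out /* 17 bytes */)
    {
        strcpy(out, "0123456789ABCDEF");
        for (int i = 0; i < 16; i++) {
            uint8_t b = (15 - i) >> 1;                                  /* bytes 7..0, high nibble first */
            uint8_t d = (i & 1) ? (prevblock[b] & 0xF) : (prevblock[b] >> 4);
            char c = out[d];
            memmove(out + 1, out, d);   /* shift out[0..d-1] right by one */
            out[0] = c;                 /* selected algorithm goes to the front */
        }
    }

    int main(void)
    {
        const uint8_t prev[8] = { 0, 0, 0, 0, 0, 0, 0, 0xE0 };
        char order[17];
        x16s_order(prev, order);
        printf("%s\n", order);   /* prints E0123456789ABCDF: nibble 0xE moves 'E' to the front,
                                    the remaining nibbles are 0 and leave the order unchanged */
        return 0;
    }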
sph_whirlpool_init(&ctx); + sph_whirlpool(&ctx, input, 64); + + memcpy(state, ctx.state, 64); +} +#endif + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +//#define _DEBUG +#define _DEBUG_PREFIX "x16s-" +#include "cuda_debug.cuh" + +extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19; + if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); // 64 + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); // 64 + x16_echo512_cuda_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x16_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x16_whirlpool512_init(thr_id, throughput); + x17_sha512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + if (opt_benchmark) { + ((uint32_t*)ptarget)[7] = 0x003f; + //((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0x0A; // hashOrder[0] = 'A'; for echo 64 + //((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64 + } + uint32_t _ALIGN(64) endiandata[20]; + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + uint32_t ntime = swab32(pdata[17]); + if (s_ntime != ntime) { + getAlgoString(&endiandata[1], hashOrder); + s_ntime = ntime; + if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime); + } + + cuda_check_cpu_setTarget(ptarget); + + char elem = hashOrder[0]; + const uint8_t algo80 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + x11_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + x16_hamsi512_setBlock_80((void*)endiandata); + break; + case FUGUE: + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + x16_sha512_setBlock_80(endiandata); + break; + default: { + return -1; + } + } + + int warn = 0; + + do { + int order = 0; + + // Hash with CUDA + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake80:"); + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw80 :"); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("grstl80:"); + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("jh51280:"); + break; + case KECCAK: + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("kecck80:"); + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + TRACE("skein80:"); + break; + case LUFFA: + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("luffa80:"); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("cube 80:"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("simd512:"); + break; + case ECHO: + x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("echo :"); + break; + case HAMSI: + x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("hamsi :"); + break; + case FUGUE: + x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("fugue :"); + break; + case SHABAL: + x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("shabal :"); + break; + case WHIRLPOOL: + x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("whirl :"); + break; + case SHA512: + x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + + for (int i = 1; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("cube :"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + break; + case ECHO: + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } + TRACE("echo :"); + break; + case HAMSI: + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("hamsi :"); + break; + case FUGUE: + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("fugue :"); + break; + case SHABAL: + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case WHIRLPOOL: + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case SHA512: + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); +#ifdef _DEBUG + uint32_t _ALIGN(64) dhash[8]; + be32enc(&endiandata[19], pdata[19]); + x16s_hash(dhash, endiandata); + applog_hash(dhash); + return -1; +#endif + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x16s_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x16s_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + //gpulog(LOG_INFO, thr_id, "hash found with %s 80 (%s)!", algo_strings[algo80], hashOrder); + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + // x11+ coins could do some random error, but not on retry + 
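// on the first failure we skip past the bad nonce and keep scanning; a warning is only logged if it happens again + 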
gpu_increment_reject(thr_id); + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s", + work->nonces[0], algo_strings[algo80], hashOrder); + warn = 0; + } + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_x16s(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ? + x15_whirlpool_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x17/cuda_x17_haval256.cu b/x17/cuda_x17_haval256.cu new file mode 100644 index 0000000000..74b311e236 --- /dev/null +++ b/x17/cuda_x17_haval256.cu @@ -0,0 +1,351 @@ +/* + * haval-256 kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014 djm34 + * 2016 tpruvot + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + */ +#include +#include + +#include "cuda_helper.h" + +#define F1(x6, x5, x4, x3, x2, x1, x0) \ + (((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0)) + +#define F2(x6, x5, x4, x3, x2, x1, x0) \ + (((x2) & (((x1) & ~(x3)) ^ ((x4) & (x5)) ^ (x6) ^ (x0))) \ + ^ ((x4) & ((x1) ^ (x5))) ^ ((x3 & (x5)) ^ (x0))) + +#define F3(x6, x5, x4, x3, x2, x1, x0) \ + (((x3) & (((x1) & (x2)) ^ (x6) ^ (x0))) \ + ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ (x0)) + +#define F4(x6, x5, x4, x3, x2, x1, x0) \ + (((x3) & (((x1) & (x2)) ^ ((x4) | (x6)) ^ (x5))) \ + ^ ((x4) & ((~(x2) & (x5)) ^ (x1) ^ (x6) ^ (x0))) \ + ^ ((x2) & (x6)) ^ (x0)) + +#define F5(x6, x5, x4, x3, x2, x1, x0) \ + (((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \ + ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6))) + +#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \ + F1(x3, x4, x1, x0, x5, x2, x6) +#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \ + F2(x6, x2, x1, x0, x3, x4, x5) +#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \ + F3(x2, x6, x0, x4, x3, x1, x5) +#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \ + F4(x1, x5, x3, x2, x0, x4, x6) +#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \ + F5(x2, x5, x0, x6, x4, x3, x1) + +#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ + uint32_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ + (x7) = (uint32_t)(ROTR32(t, 7) + ROTR32((x7), 11) + (w) + (c)); \ +} + +#define PASS1(n, in) { \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], 0U); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 1], 0U); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[ 2], 0U); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], 0U); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], 0U); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[ 5], 0U); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[ 6], 0U); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[ 7], 0U); \ + \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 8], 0U); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], 0U); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[10], 0U); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[11], 0U); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[12], 0U); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[13], 0U); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[14], 0U); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[15], 0U); \ + \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[16], 0U); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[17], 0U); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[18], 0U); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[19], 0U); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[20], 0U); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[21], 0U); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[22], 0U); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[23], 0U); \ + \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[24], 0U); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[25], 0U); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[26], 0U); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[27], 0U); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[28], 0U); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[29], 0U); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[30], 0U); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[31], 0U); \ +} + +#define PASS2(n, in) { \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], 0x452821E6); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[14], 
0x38D01377); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[26], 0xBE5466CF); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[18], 0x34E90C6C); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[11], 0xC0AC29B7); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[28], 0xC97C50DD); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 7], 0x3F84D5B5); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[16], 0xB5470917); \ + \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], 0x9216D5D9); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[23], 0x8979FB1B); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[20], 0xD1310BA6); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[22], 0x98DFB5AC); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], 0x2FFD72DB); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[10], 0xD01ADFB7); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 4], 0xB8E1AFED); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 8], 0x6A267E96); \ + \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[30], 0xBA7C9045); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], 0xF12C7F99); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[21], 0x24A19947); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[ 9], 0xB3916CF7); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 0x0801F2E2); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[24], 0x858EFC16); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[29], 0x636920D8); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 6], 0x71574E69); \ + \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[19], 0xA458FEA3); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[12], 0xF4933D7E); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[15], 0x0D95748F); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[13], 0x728EB658); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], 0x718BCD58); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[25], 0x82154AEE); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[31], 0x7B54A41D); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[27], 0xC25A59B5); \ +} + +#define PASS3(n, in) { \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[19], 0x9C30D539); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], 0x2AF26013); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 4], 0xC5D1B023); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[20], 0x286085F0); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[28], 0xCA417918); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[17], 0xB8DB38EF); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 8], 0x8E79DCB0); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[22], 0x603A180E); \ + \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[29], 0x6C9E0E8B); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[14], 0xB01E8A3E); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[25], 0xD71577C1); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[12], 0xBD314B27); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[24], 0x78AF2FDA); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[30], 0x55605C60); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[16], 0xE65525F3); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[26], 0xAA55AB94); \ + \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[31], 0x57489862); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[15], 0x63E81440); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 7], 0x55CA396A); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], 0x2AAB10B6); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], 0xB4CC5C34); \ + STEP(n, 3, s2, s1, 
s0, s7, s6, s5, s4, s3, in[ 0], 0x1141E8CE); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[18], 0xA15486AF); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[27], 0x7C72E993); \ + \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[13], 0xB3EE1411); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], 0x636FBC2A); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[21], 0x2BA9C55D); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[10], 0x741831F6); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[23], 0xCE5C3E16); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[11], 0x9B87931E); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 5], 0xAFD6BA33); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[ 2], 0x6C24CF5C); \ +} + +#define PASS4(n, in) { \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[24], 0x7A325381); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 4], 0x28958677); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 0], 0x3B8F4898); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[14], 0x6B4BB9AF); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], 0xC4BFE81B); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[ 7], 0x66282193); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[28], 0x61D809CC); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[23], 0xFB21A991); \ + \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[26], 0x487CAC60); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], 0x5DEC8032); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[30], 0xEF845D5D); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[20], 0xE98575B1); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[18], 0xDC262302); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[25], 0xEB651B88); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[19], 0x23893E81); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 3], 0xD396ACC5); \ + \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[22], 0x0F6D6FF3); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[11], 0x83F44239); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[31], 0x2E0B4482); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[21], 0xA4842004); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 8], 0x69C8F04A); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[27], 0x9E1F9B5E); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[12], 0x21C66842); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 9], 0xF6E96C9A); \ + \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[ 1], 0x670C9C61); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[29], 0xABD388F0); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 5], 0x6A51A0D2); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[15], 0xD8542F68); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 0x960FA728); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[10], 0xAB5133A3); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[16], 0x6EEF0B6C); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[13], 0x137A3BE4); \ +} + +#define PASS5(n, in) { \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[27], 0xBA3BF050); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], 0x7EFB2A98); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[21], 0xA1F1651D); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[26], 0x39AF0176); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 0x66CA593E); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[11], 0x82430E88); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[20], 0x8CEE8619); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[29], 0x456F9FB4); \ + \ + STEP(n, 5, s7, s6, s5, 
s4, s3, s2, s1, s0, in[19], 0x7D84A5C3); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 0], 0x3B8B5EBE); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[12], 0xE06F75D8); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[ 7], 0x85C12073); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[13], 0x401A449F); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 8], 0x56C16AA6); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[31], 0x4ED3AA62); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[10], 0x363F7706); \ + \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], 0x1BFEDF72); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], 0x429B023D); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[14], 0x37D0D724); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[30], 0xD00A1248); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[18], 0xDB0FEAD3); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 6], 0x49F1C09B); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[28], 0x075372C9); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[24], 0x80991B7B); \ + \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 2], 0x25D479D8); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[23], 0xF6E8DEF7); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[16], 0xE3FE501A); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[22], 0xB6794C3B); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], 0x976CE0BD); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 1], 0x04C006BA); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[25], 0xC1A94FB6); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[15], 0x409F60C4); \ +} + +__global__ /* __launch_bounds__(256, 6) */ +void x17_haval256_gpu_hash_64(const uint32_t threads, uint64_t *g_hash, const int outlen) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint64_t hashPosition = thread*8U; + uint64_t *pHash = &g_hash[hashPosition]; + + uint32_t s0, s1, s2, s3, s4, s5, s6, s7; + const uint32_t u0 = s0 = 0x243F6A88; + const uint32_t u1 = s1 = 0x85A308D3; + const uint32_t u2 = s2 = 0x13198A2E; + const uint32_t u3 = s3 = 0x03707344; + const uint32_t u4 = s4 = 0xA4093822; + const uint32_t u5 = s5 = 0x299F31D0; + const uint32_t u6 = s6 = 0x082EFA98; + const uint32_t u7 = s7 = 0xEC4E6C89; + + union { + uint32_t h4[16]; + uint64_t h8[8]; + } hash; + + #pragma unroll + for (int i=0; i<8; i++) { + hash.h8[i] = pHash[i]; + } + + ///////// input big ///////////////////// + + uint32_t buf[32]; + + #pragma unroll + for (int i=0; i<16; i++) + buf[i] = hash.h4[i]; + + buf[16] = 0x00000001; + + #pragma unroll + for (int i=17; i<29; i++) + buf[i] = 0; + + buf[29] = 0x40290000; + buf[30] = 0x00000200; + buf[31] = 0; + + PASS1(5, buf); + PASS2(5, buf); + PASS3(5, buf); + PASS4(5, buf); + PASS5(5, buf); + + hash.h4[0] = s0 + u0; + hash.h4[1] = s1 + u1; + hash.h4[2] = s2 + u2; + hash.h4[3] = s3 + u3; + hash.h4[4] = s4 + u4; + hash.h4[5] = s5 + u5; + hash.h4[6] = s6 + u6; + hash.h4[7] = s7 + u7; + + pHash[0] = hash.h8[0]; + pHash[1] = hash.h8[1]; + pHash[2] = hash.h8[2]; + pHash[3] = hash.h8[3]; + + if (outlen == 512) { + pHash[4] = 0; //hash.h8[4]; + pHash[5] = 0; //hash.h8[5]; + pHash[6] = 0; //hash.h8[6]; + pHash[7] = 0; //hash.h8[7]; + } + } +} + +__host__ +void x17_haval256_cpu_init(int thr_id, uint32_t threads) +{ +} + +__host__ +void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + 
threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x17_haval256_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash, outlen); +} diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu new file mode 100644 index 0000000000..a0757d0e5b --- /dev/null +++ b/x17/cuda_x17_sha512.cu @@ -0,0 +1,248 @@ +/* + * sha-512 cuda kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014 djm34 + * 2016 tpruvot + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + */ +#include + +#define NEED_HASH_512 + +#include "cuda_helper.h" + +#define SWAP64(u64) cuda_swab64(u64) + +static __constant__ uint64_t c_WB[80]; + +static const uint64_t WB[80] = { + 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, + 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, + 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, + 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694, + 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, + 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, + 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, + 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70, + 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, + 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B, + 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, + 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, + 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, + 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, + 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC, + 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, + 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, + 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B, + 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, + 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 +}; + +#define BSG5_0(x) xor3(ROTR64(x,28), ROTR64(x,34), 
ROTR64(x,39)) +#define SSG5_0(x) xor3(ROTR64(x, 1), ROTR64(x ,8), shr_t64(x,7)) +#define SSG5_1(x) xor3(ROTR64(x,19), ROTR64(x,61), shr_t64(x,6)) + +//#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) +#define MAJ(x, y, z) andor(x,y,z) + +__device__ __forceinline__ +uint64_t Tone(uint64_t* K, uint64_t* r, uint64_t* W, const int a, const int i) +{ + //asm("// TONE \n"); + const uint64_t e = r[(a+4) & 7]; + uint64_t BSG51 = xor3(ROTR64(e, 14), ROTR64(e, 18), ROTR64(e, 41)); + const uint64_t f = r[(a+5) & 7]; + const uint64_t g = r[(a+6) & 7]; + uint64_t CHl = ((f ^ g) & e) ^ g; // xandx(e, f, g); + return (r[(a+7) & 7] + BSG51 + CHl + K[i] + W[i]); +} + +#define SHA3_STEP(K, r, W, ord, i) { \ + const int a = (8 - ord) & 7; \ + uint64_t T1 = Tone(K, r, W, a, i); \ + r[(a+3) & 7] += T1; \ + uint64_t T2 = (BSG5_0(r[a]) + MAJ(r[a], r[(a+1) & 7], r[(a+2) & 7])); \ + r[(a+7) & 7] = T1 + T2; \ +} + +__global__ +/*__launch_bounds__(256, 4)*/ +void x17_sha512_gpu_hash_64(const uint32_t threads, uint64_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint64_t hashPosition = thread; + uint64_t *pHash = &g_hash[hashPosition*8U]; + + uint64_t W[80]; + #pragma unroll + for (int i = 0; i < 8; i ++) { + W[i] = SWAP64(pHash[i]); + } + W[8] = 0x8000000000000000; + + #pragma unroll 69 + for (int i = 9; i<78; i++) { + W[i] = 0U; + } + W[15] = 0x0000000000000200; + + #pragma unroll 64 + for (int i = 16; i < 80; i ++) { + W[i] = SSG5_1(W[i-2]) + W[i-7]; + W[i] += SSG5_0(W[i-15]) + W[i-16]; + } + + const uint64_t IV512[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 + }; + + uint64_t r[8]; + #pragma unroll + for (int i = 0; i < 8; i ++) { + r[i] = IV512[i]; + } + +#if CUDART_VERSION >= 7050 + #pragma unroll 10 +#endif + for (int i = 0; i < 80; i += 8) { + #pragma unroll + for (int ord = 0; ord < 8; ord++) { + SHA3_STEP(c_WB, r, W, ord, i+ord); + } + } + + #pragma unroll + for (int u = 0; u < 4; u ++) { + pHash[u] = SWAP64(r[u] + IV512[u]); + } + +#ifdef NEED_HASH_512 + #pragma unroll + for (int u = 4; u < 8; u ++) { + pHash[u] = SWAP64(r[u] + IV512[u]); + } +#endif + } +} + +__host__ +void x17_sha512_cpu_init(int thr_id, uint32_t threads) +{ + cudaMemcpyToSymbol(c_WB, WB, 80*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} + +__host__ +void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x17_sha512_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash); +} + +__constant__ +static uint64_t c_PaddedMessage80[10]; + +__global__ +/*__launch_bounds__(256, 4)*/ +void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t W[80]; + #pragma unroll + for (int i = 0; i < 9; i ++) { + W[i] = SWAP64(c_PaddedMessage80[i]); + } + const uint32_t nonce = startNonce + thread; + //((uint32_t*)W)[19] = cuda_swab32(nonce); + W[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nonce)); + W[9] = cuda_swab64(W[9]); + W[10] = 0x8000000000000000; + + #pragma unroll + for (int i = 11; i<15; i++) { + W[i] = 0U; + } + W[15] = 0x0000000000000280; + + #pragma unroll 64 + for (int i = 16; i < 80; i ++) { + W[i] = SSG5_1(W[i-2]) 
+ W[i-7]; + W[i] += SSG5_0(W[i-15]) + W[i-16]; + } + + const uint64_t IV512[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 + }; + + uint64_t r[8]; + #pragma unroll + for (int i = 0; i < 8; i++) { + r[i] = IV512[i]; + } + + #pragma unroll + for (int i = 0; i < 80; i++) { + SHA3_STEP(c_WB, r, W, i&7, i); + } + + const uint64_t hashPosition = thread; + uint64_t *pHash = &g_hash[hashPosition << 3]; + #pragma unroll + for (int u = 0; u < 8; u ++) { + pHash[u] = SWAP64(r[u] + IV512[u]); + } + } +} + +__host__ +void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x16_sha512_gpu_hash_80 <<>> (threads, startNounce, (uint64_t*)d_hash); +} + +__host__ +void x16_sha512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} \ No newline at end of file diff --git a/x17/hmq17.cu b/x17/hmq17.cu new file mode 100644 index 0000000000..8fdbcdf4ff --- /dev/null +++ b/x17/hmq17.cu @@ -0,0 +1,542 @@ +/** + * HMQ1725 algorithm + * @author tpruvot@github 02-2017 + */ + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_sha2.h" +#include "sph/sph_haval.h" +} + +#include +#include + +#include "x11/cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_hash_br2[MAX_GPUS]; +static uint32_t *d_tempBranch[MAX_GPUS]; + +extern void quark_bmw512_cpu_setBlock_80(void *pdata); +extern void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +extern void x11_luffa512_cpu_init(int thr_id, uint32_t threads); +extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t 
*d_hash); + +extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen); + +struct hmq_contexts +{ + sph_blake512_context blake1, blake2; + sph_bmw512_context bmw1, bmw2, bmw3; + sph_groestl512_context groestl1, groestl2; + sph_skein512_context skein1, skein2; + sph_jh512_context jh1, jh2; + sph_keccak512_context keccak1, keccak2; + sph_luffa512_context luffa1, luffa2; + sph_cubehash512_context cubehash; + sph_shavite512_context shavite1, shavite2; + sph_simd512_context simd1, simd2; + sph_echo512_context echo1, echo2; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue1, fugue2; + sph_shabal512_context shabal; + sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4; + sph_sha512_context sha1, sha2; + sph_haval256_5_context haval1, haval2; +}; + +static __thread hmq_contexts base_contexts; +static __thread bool hmq_context_init = false; + +static void init_contexts(hmq_contexts *ctx) +{ + sph_bmw512_init(&ctx->bmw1); + sph_bmw512_init(&ctx->bmw2); + sph_bmw512_init(&ctx->bmw2); + sph_bmw512_init(&ctx->bmw3); + sph_whirlpool_init(&ctx->whirlpool1); + sph_whirlpool_init(&ctx->whirlpool2); + sph_whirlpool_init(&ctx->whirlpool3); + sph_whirlpool_init(&ctx->whirlpool4); + sph_groestl512_init(&ctx->groestl1); + sph_groestl512_init(&ctx->groestl2); + sph_skein512_init(&ctx->skein1); + sph_skein512_init(&ctx->skein2); + sph_jh512_init(&ctx->jh1); + sph_jh512_init(&ctx->jh2); + sph_keccak512_init(&ctx->keccak1); + sph_keccak512_init(&ctx->keccak2); + sph_blake512_init(&ctx->blake1); + sph_blake512_init(&ctx->blake2); + sph_luffa512_init(&ctx->luffa1); + sph_luffa512_init(&ctx->luffa2); + sph_cubehash512_init(&ctx->cubehash); + sph_shavite512_init(&ctx->shavite1); + sph_shavite512_init(&ctx->shavite2); + sph_simd512_init(&ctx->simd1); + sph_simd512_init(&ctx->simd2); + sph_echo512_init(&ctx->echo1); + sph_echo512_init(&ctx->echo2); + sph_hamsi512_init(&ctx->hamsi); + sph_fugue512_init(&ctx->fugue1); + sph_fugue512_init(&ctx->fugue2); + sph_shabal512_init(&ctx->shabal); + sph_sha512_init(&ctx->sha1); + sph_sha512_init(&ctx->sha2); + sph_haval256_5_init(&ctx->haval1); + sph_haval256_5_init(&ctx->haval2); +} + +// CPU Check +extern "C" void hmq17hash(void *output, const void *input) +{ + uint32_t _ALIGN(64) hash[32]; + + const uint32_t mask = 24; + + hmq_contexts ctx; + if (!hmq_context_init) { + init_contexts(&base_contexts); + hmq_context_init = true; + } + memcpy(&ctx, &base_contexts, sizeof(hmq_contexts)); + + sph_bmw512(&ctx.bmw1, input, 80); + sph_bmw512_close(&ctx.bmw1, hash); + + sph_whirlpool(&ctx.whirlpool1, hash, 64); + sph_whirlpool_close(&ctx.whirlpool1, hash); + + if (hash[0] & mask) { + sph_groestl512(&ctx.groestl1, hash, 64); + sph_groestl512_close(&ctx.groestl1, hash); + } else { + sph_skein512(&ctx.skein1, hash, 64); + sph_skein512_close(&ctx.skein1, hash); + } + + sph_jh512(&ctx.jh1, hash, 64); + sph_jh512_close(&ctx.jh1, hash); + sph_keccak512(&ctx.keccak1, hash, 64); + sph_keccak512_close(&ctx.keccak1, hash); + + if (hash[0] & mask) { + sph_blake512(&ctx.blake1, hash, 64); + sph_blake512_close(&ctx.blake1, hash); + } else { + sph_bmw512(&ctx.bmw2, hash, 64); + sph_bmw512_close(&ctx.bmw2, hash); + } + + sph_luffa512(&ctx.luffa1, hash, 64); + sph_luffa512_close(&ctx.luffa1, hash); + + sph_cubehash512(&ctx.cubehash, hash, 64); + sph_cubehash512_close(&ctx.cubehash, hash); + + if (hash[0] & mask) { + sph_keccak512(&ctx.keccak2, hash, 
64); + sph_keccak512_close(&ctx.keccak2, hash); + } else { + sph_jh512(&ctx.jh2, hash, 64); + sph_jh512_close(&ctx.jh2, hash); + } + + sph_shavite512(&ctx.shavite1, hash, 64); + sph_shavite512_close(&ctx.shavite1, hash); + + sph_simd512(&ctx.simd1, hash, 64); + sph_simd512_close(&ctx.simd1, hash); + //applog_hash(hash); + + if (hash[0] & mask) { + sph_whirlpool(&ctx.whirlpool2, hash, 64); + sph_whirlpool_close(&ctx.whirlpool2, hash); + } else { + sph_haval256_5(&ctx.haval1, hash, 64); + sph_haval256_5_close(&ctx.haval1, hash); + memset(&hash[8], 0, 32); + } + + sph_echo512(&ctx.echo1, hash, 64); + sph_echo512_close(&ctx.echo1, hash); + + sph_blake512(&ctx.blake2, hash, 64); + sph_blake512_close(&ctx.blake2, hash); + //applog_hash(hash); + + if (hash[0] & mask) { + sph_shavite512(&ctx.shavite2, hash, 64); + sph_shavite512_close(&ctx.shavite2, hash); + } else { + sph_luffa512(&ctx.luffa2, hash, 64); + sph_luffa512_close(&ctx.luffa2, hash); + } + + sph_hamsi512(&ctx.hamsi, hash, 64); + sph_hamsi512_close(&ctx.hamsi, hash); + + sph_fugue512(&ctx.fugue1, hash, 64); + sph_fugue512_close(&ctx.fugue1, hash); + //applog_hash(hash); + + if (hash[0] & mask) { + sph_echo512(&ctx.echo2, hash, 64); + sph_echo512_close(&ctx.echo2, hash); + } else { + sph_simd512(&ctx.simd2, hash, 64); + sph_simd512_close(&ctx.simd2, hash); + } + + sph_shabal512(&ctx.shabal, hash, 64); + sph_shabal512_close(&ctx.shabal, hash); + + sph_whirlpool(&ctx.whirlpool3, hash, 64); + sph_whirlpool_close(&ctx.whirlpool3, hash); + //applog_hash(hash); + + if (hash[0] & mask) { + sph_fugue512(&ctx.fugue2, hash, 64); + sph_fugue512_close(&ctx.fugue2, hash); + } else { + sph_sha512(&ctx.sha1, hash, 64); + sph_sha512_close(&ctx.sha1, hash); + } + + sph_groestl512(&ctx.groestl2, hash, 64); + sph_groestl512_close(&ctx.groestl2, hash); + + sph_sha512(&ctx.sha2, hash, 64); + sph_sha512_close(&ctx.sha2, hash); + //applog_hash(hash); + + if (hash[0] & mask) { + sph_haval256_5(&ctx.haval2, hash, 64); + sph_haval256_5_close(&ctx.haval2, hash); + memset(&hash[8], 0, 32); + } else { + sph_whirlpool(&ctx.whirlpool4, hash, 64); + sph_whirlpool_close(&ctx.whirlpool4, hash); + } + //applog_hash(hash); + + sph_bmw512(&ctx.bmw3, hash, 64); + sph_bmw512_close(&ctx.bmw3, hash); + + memcpy(output, hash, 32); +} + +__global__ __launch_bounds__(128, 8) +void hmq_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_hash[offset]); + d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 24U; + if (d_NonceBranch[thread]) return; + // uint4 = 4x uint32_t = 16 bytes + uint4 *pdst = (uint4*) (&d_branch2[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__global__ __launch_bounds__(128, 8) +void hmq_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads && !d_NonceBranch[thread]) + { + const uint32_t offset = thread * 16U; + uint4 *pdst = (uint4*) (&d_hash[offset]); + uint4 *psrc = (uint4*) (&d_branch2[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__host__ +uint32_t hmq_filter_cpu(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2) +{ + const 
uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // extract algo permutation hashes to a second branch buffer + hmq_filter_gpu <<<grid, block>>> (threads, inpHashes, d_branch2, d_tempBranch[thr_id]); + return threads; +} + +__host__ +void hmq_merge_cpu(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // put back second branch hashes to the common buffer d_hash + hmq_merge_gpu <<<grid, block>>> (threads, outpHashes, d_branch2, d_tempBranch[thr_id]); +} + +static bool init[MAX_GPUS] = { 0 }; + +//#define _DEBUG +#define _DEBUG_PREFIX "hmq-" +#include "cuda_debug.cuh" + +extern "C" int scanhash_hmq17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", + throughput2intensity(throughput), throughput); + + quark_bmw512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x17_haval256_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x17_sha512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t) 64 * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_tempBranch[thr_id], sizeof(uint32_t) * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + int warn = 0; + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_bmw512_cpu_setBlock_80(endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // Hash with CUDA + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw512 "); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("whirl "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], 
NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + TRACE("cube "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash_br2[thr_id], 512); order++; + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("fugue "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("whirl "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash_br2[thr_id]); order++; + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 512); order++; + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, 
d_hash[thr_id], d_hash_br2[thr_id]); + TRACE("hav/wh "); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw512 => "); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + hmq17hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0 && work->nonces[1] != work->nonces[0]) { + be32enc(&endiandata[19], work->nonces[1]); + hmq17hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + } else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + } + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + // x11+ coins could do some random error, but not on retry + gpu_increment_reject(thr_id); + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + warn = 0; + } + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_hmq17(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_hash_br2[thr_id]); + cudaFree(d_tempBranch[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x17/sonoa.cu b/x17/sonoa.cu new file mode 100644 index 0000000000..153f787cef --- /dev/null +++ b/x17/sonoa.cu @@ -0,0 +1,632 @@ +/** + * x97 SONO + **/ + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_sha2.h" +#include "sph/sph_haval.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +#define NBN 2 + +static uint32_t *d_hash[MAX_GPUS]; + +extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, 
uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen); + +// CPU Hash Validation +extern "C" void sonoa_hash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[64]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + sph_haval256_5_context ctx_haval; + + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*)hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 
64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + 
sph_shavite512_close(&ctx_shavite, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512, (const void*)hash, 64); + sph_sha512_close(&ctx_sha512, (void*)hash); + + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + 
sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + sph_sha512(&ctx_sha512, (const void*)hash, 64); + sph_sha512_close(&ctx_sha512, (void*)hash); + + sph_haval256_5_init(&ctx_haval); + sph_haval256_5(&ctx_haval, (const void*)hash, 64); + sph_haval256_5_close(&ctx_haval, (void*)hash); + + memcpy(output, hash, 32); +} + +#define x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash) \ + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \ + if (use_compat_kernels[thr_id]) x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \ + else x16_echo512_cpu_hash_64(thr_id, throughput, d_hash) + + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + uint32_t default_throughput = 1 << 18; + if (device_sm[dev_id] <= 500) default_throughput = 1 << 18; + else if (device_sm[dev_id] <= 520) default_throughput = 1 << 18; + else if (device_sm[dev_id] > 520) default_throughput = (1 << 19) + (1 << 18); + + uint32_t throughput = cuda_default_throughput(thr_id, default_throughput); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + throughput &= 0xFFFFFF00; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO,thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + 
x13_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x17_sha512_cpu_init(thr_id, throughput); + x17_haval256_cpu_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint64_t) * throughput)); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + int warn = 0; + uint32_t _ALIGN(64) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + 
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, 
throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 256); order++; + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + sonoa_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + sonoa_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + warn = 0; + } + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +extern "C" void free_sonoa(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaDeviceSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x17/x17.cu b/x17/x17.cu new file mode 100644 index 0000000000..3536cdcde7 --- /dev/null +++ b/x17/x17.cu @@ -0,0 +1,316 @@ +/** + * X17 algorithm (X15 + sha512 + haval256) + */ + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" + +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" + +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" + +#include "sph/sph_sha2.h" +#include "sph/sph_haval.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; + +extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t 
threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen); + + +// X17 CPU Hash (Validation) +extern "C" void x17hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[128]; + + // x11 + hamsi12-fugue13-shabal14-whirlpool15-sha512-haval256 + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + sph_haval256_5_context ctx_haval; + + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, (const void*) hash, 64); + sph_luffa512_close (&ctx_luffa, hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, (const void*) hash, 64); + 
sph_hamsi512_close(&ctx_hamsi, hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, (const void*) hash, 64); + sph_fugue512_close(&ctx_fugue, hash); + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, (const void*) hash, 64); + sph_shabal512_close(&ctx_shabal, hash); + + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool (&ctx_whirlpool, (const void*) hash, 64); + sph_whirlpool_close(&ctx_whirlpool, hash); + + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512,(const void*) hash, 64); + sph_sha512_close(&ctx_sha512,(void*) hash); + + sph_haval256_5_init(&ctx_haval); + sph_haval256_5(&ctx_haval,(const void*) hash, 64); + sph_haval256_5_close(&ctx_haval,hash); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x17_sha512_cpu_init(thr_id, throughput); + x17_haval256_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t _ALIGN(64) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + int warn = 0; + + do { + int order = 0; + + // Hash with CUDA + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + 
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 256); order++; + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x17hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x17hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + // x11+ coins could do some random error, but not on retry + gpu_increment_reject(thr_id); + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + warn = 0; + } + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_x17(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/zr5.cu b/zr5.cu new file mode 100644 index 0000000000..11140b0fff --- /dev/null +++ b/zr5.cu @@ -0,0 +1,514 @@ +/* Ziftrcoin ZR5 CUDA Implementation, (c) tpruvot 2015 */ + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +} + +#include "miner.h" +#include "cuda_helper.h" + +#include +#include + +#define ZR_BLAKE 0 +#define ZR_GROESTL 1 +#define ZR_JH512 2 +#define ZR_SKEIN 3 + +#define POK_BOOL_MASK 0x00008000 +#define POK_DATA_MASK 0xFFFF0000 + +static uint32_t* d_hash[MAX_GPUS]; +static uint16_t* d_poks[MAX_GPUS]; + +static uint32_t**d_buffers[MAX_GPUS]; +static uint8_t* d_permut[MAX_GPUS]; + +static uint32_t* d_blake[MAX_GPUS]; +static 
uint32_t* d_groes[MAX_GPUS]; +static uint32_t* d_jh512[MAX_GPUS]; +static uint32_t* d_skein[MAX_GPUS]; + +static uint8_t* d_txs[MAX_GPUS]; +__constant__ uint16_t c_txlens[POK_MAX_TXS]; + +__constant__ uint8_t c_permut[24][4]; +static const uint8_t permut[24][4] = { + {0, 1, 2, 3}, + {0, 1, 3, 2}, + {0, 2, 1, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + {1, 0, 2, 3}, + {1, 0, 3, 2}, + {1, 2, 0, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + {2, 0, 1, 3}, + {2, 0, 3, 1}, + {2, 1, 0, 3}, + {2, 1, 3, 0}, + {2, 3, 0, 1}, + {2, 3, 1, 0}, + {3, 0, 1, 2}, + {3, 0, 2, 1}, + {3, 1, 0, 2}, + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 2, 1, 0} +}; + +// CPU HASH +extern "C" void zr5hash(void *output, const void *input) +{ + sph_keccak512_context ctx_keccak; + sph_blake512_context ctx_blake; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_skein512_context ctx_skein; + + uchar _ALIGN(64) hash[64]; + uint32_t *phash = (uint32_t *) hash; + uint32_t norder; + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*) input, 80); + sph_keccak512_close(&ctx_keccak, (void*) phash); + + norder = phash[0] % ARRAY_SIZE(permut); /* % 24 */ + + for(int i = 0; i < 4; i++) + { + switch (permut[norder][i]) { + case ZR_BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, (const void*) phash, 64); + sph_blake512_close(&ctx_blake, phash); + break; + case ZR_GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const void*) phash, 64); + sph_groestl512_close(&ctx_groestl, phash); + break; + case ZR_JH512: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*) phash, 64); + sph_jh512_close(&ctx_jh, phash); + break; + case ZR_SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*) phash, 64); + sph_skein512_close(&ctx_skein, phash); + break; + default: + break; + } + } + memcpy(output, phash, 32); +} + +extern "C" void zr5hash_pok(void *output, uint32_t *pdata) +{ + uint32_t _ALIGN(64) hash[8]; + const uint32_t version = (pdata[0] & (~POK_DATA_MASK)) | (use_pok ? 
POK_BOOL_MASK : 0); + + pdata[0] = version; + zr5hash(hash, pdata); + + // fill PoK + pdata[0] = version | (hash[0] & POK_DATA_MASK); + zr5hash(hash, pdata); + + memcpy(output, hash, 32); +} + +// ------------------------------------------------------------------------------------------------ + +__global__ __launch_bounds__(128, 8) +void zr5_init_vars_gpu(uint32_t threads, uint32_t* d_hash, uint8_t* d_permut, uint32_t** d_buffers, + uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint32_t *phash = &d_hash[offset]; + + // store the algos order for other procs + const uint8_t norder = (phash[0] % ARRAY_SIZE(permut)); + const uint8_t algo = c_permut[norder][0]; + d_permut[thread] = norder; + + // init array for other procs + d_buffers[0] = d_blake; + d_buffers[1] = d_groes; + d_buffers[2] = d_jh512; + d_buffers[3] = d_skein; + + // Copy From d_hash to the first algo buffer + // uint4 = 4x uint32_t = 16 bytes + uint4 *psrc = (uint4*) phash; + uint4 *pdst = (uint4*) (d_buffers[algo] + offset); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__host__ +void zr5_init_vars(int thr_id, uint32_t threads) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + zr5_init_vars_gpu <<>> ( + threads, d_hash[thr_id], d_permut[thr_id], d_buffers[thr_id], + d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id] + ); +} + + +__global__ __launch_bounds__(128, 8) +void zr5_move_data_to_hash_gpu(const uint32_t threads, const int rnd, uint32_t** const d_buffers, uint8_t *d_permut, uint32_t *d_hash) +{ + // copy 64 bytes hash from/to the right algo buffers + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint8_t norder = d_permut[thread]; + const uint8_t algodst = c_permut[norder][rnd]; + const uint8_t algosrc = c_permut[norder][rnd-1]; + + const uint32_t offset = thread * (64 / 4); + + // uint4 = 4x uint32_t = 16 bytes + uint4 *psrc = (uint4*) (d_buffers[algosrc] + offset); + uint4 *pdst = (uint4*) (d_buffers[algodst] + offset); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__host__ +void zr5_move_data_to_hash(int thr_id, uint32_t threads, int rnd) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + zr5_move_data_to_hash_gpu <<>> (threads, rnd, d_buffers[thr_id], d_permut[thr_id], d_hash[thr_id]); +} + + +__global__ __launch_bounds__(128, 8) +void zr5_get_poks_gpu(uint32_t threads, uint32_t** const d_buffers, uint8_t* const d_permut, uint16_t *d_poks) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint8_t norder = d_permut[thread]; + const uint8_t algosrc = c_permut[norder][3]; + + // copy only pok + const uint32_t offset = thread * 16U; // 64 / 4; + uint16_t* hash0 = (uint16_t*) (d_buffers[algosrc] + offset); + d_poks[thread] = hash0[1]; + } +} + +__global__ __launch_bounds__(128, 4) +void zr5_get_poks_xor_gpu(uint32_t threads, uint32_t** const d_buffers, uint8_t* d_permut, uint16_t* d_poks, uint8_t* d_txs, uint8_t txs) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint8_t norder = 
d_permut[thread]; + const uint8_t algo = c_permut[norder][3]; + const uint8_t ntx = norder % txs; // generally 0 on testnet... + const uint32_t offset = thread * 16U; // 64 / 4; + uint32_t* hash = (uint32_t*) (d_buffers[algo] + offset); + uint32_t randNdx = hash[1] % c_txlens[ntx]; + uint8_t* ptx = &d_txs[POK_MAX_TX_SZ*ntx] + randNdx; + uint32_t x = 0x100UL * ptx[3] + ptx[2]; + + d_poks[thread] = x ^ (hash[2] >> 16); + } +} + +__host__ +void zr5_get_poks(int thr_id, uint32_t threads, uint16_t* d_poks, struct work* work) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + uint8_t txs = (uint8_t) work->tx_count; + + if (txs && use_pok) + { + uint32_t txlens[POK_MAX_TXS]; + uint8_t* txdata = (uint8_t*) calloc(POK_MAX_TXS, POK_MAX_TX_SZ); + if (!txdata) { + applog(LOG_ERR, "%s: error, memory alloc failure", __func__); + return; + } + // create blocs to copy on device + for (uint8_t tx=0; tx < txs; tx++) { + txlens[tx] = (uint32_t) (work->txs[tx].len - 3U); + memcpy(&txdata[POK_MAX_TX_SZ*tx], work->txs[tx].data, min(POK_MAX_TX_SZ, txlens[tx]+3U)); + } + cudaMemcpy(d_txs[thr_id], txdata, txs * POK_MAX_TX_SZ, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_txlens, txlens, txs * sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + zr5_get_poks_xor_gpu <<>> (threads, d_buffers[thr_id], d_permut[thr_id], d_poks, d_txs[thr_id], txs); + free(txdata); + } else { + zr5_get_poks_gpu <<>> (threads, d_buffers[thr_id], d_permut[thr_id], d_poks); + } +} + + +__global__ __launch_bounds__(128, 8) +void zr5_final_round_data_gpu(uint32_t threads, uint32_t** const d_buffers, uint8_t* const d_permut, uint32_t *d_hash, uint16_t *d_poks) +{ + // after the 4 algos rounds, copy back hash to d_hash + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint8_t norder = d_permut[thread]; + const uint8_t algosrc = c_permut[norder][3]; + const uint32_t offset = thread * 16U; // 64 / 4; + + // copy only hash[4..7] + uint2 *psrc = (uint2*) (d_buffers[algosrc] + offset); + uint2 *phash = (uint2*) (&d_hash[offset]); + + phash[2] = psrc[2]; + phash[3] = psrc[3]; + } +} + +__host__ +void zr5_final_round(int thr_id, uint32_t threads) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + zr5_final_round_data_gpu <<>> (threads, d_buffers[thr_id], d_permut[thr_id], d_hash[thr_id], d_poks[thr_id]); +} + + +extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads); +extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen); + +extern void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); +extern void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks); + +extern void quark_blake512_cpu_init(int thr_id, uint32_t threads); +extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_blake512_cpu_free(int thr_id); + +extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); +extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); + +extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); +extern 
void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_zr5(int thr_id, struct work *work, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) tmpdata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t oldp0 = pdata[0]; + const uint32_t version = (oldp0 & (~POK_DATA_MASK)) | (use_pok ? POK_BOOL_MASK : 0); + const uint32_t first_nonce = pdata[19]; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 18); + throughput = min(throughput, (1U << 20)-1024); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x0000ff; + + memcpy(tmpdata, pdata, 80); + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + // constants + cudaMemcpyToSymbol(c_permut, permut, 24*4, 0, cudaMemcpyHostToDevice); + + // hash buffer = keccak hash 64 required + cudaMalloc(&d_hash[thr_id], 64 * throughput); + cudaMalloc(&d_poks[thr_id], sizeof(uint16_t) * throughput); + cudaMalloc(&d_permut[thr_id], sizeof(uint8_t) * throughput); + cudaMalloc(&d_buffers[thr_id], 4 * sizeof(uint32_t*)); + + // data buffers for the 4 rounds + cudaMalloc(&d_blake[thr_id], 64 * throughput); + cudaMalloc(&d_groes[thr_id], 64 * throughput); + cudaMalloc(&d_jh512[thr_id], 64 * throughput); + cudaMalloc(&d_skein[thr_id], 64 * throughput); + + cudaMalloc(&d_txs[thr_id], POK_MAX_TXS * POK_MAX_TX_SZ); + + jackpot_keccak512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + init[thr_id] = true; + } + + tmpdata[0] = version; + jackpot_keccak512_cpu_setBlock((void*)tmpdata, 80); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // Keccak512 Hash with CUDA + zr5_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + zr5_init_vars(thr_id, throughput); + + for (int rnd=0; rnd<4; rnd++) { + if (rnd > 0) + zr5_move_data_to_hash(thr_id, throughput, rnd); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++); + } + + // store on device d_poks all hash[0] prefixes + zr5_get_poks(thr_id, throughput, d_poks[thr_id], work); + + // Keccak512 with pok + zr5_keccak512_cpu_hash_pok(thr_id, throughput, pdata[19], pdata, d_hash[thr_id], d_poks[thr_id]); + zr5_init_vars(thr_id, throughput); + + for (int rnd=0; rnd<4; rnd++) { + if (rnd > 0) + zr5_move_data_to_hash(thr_id, throughput, rnd); + 
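+			// note (clarifying comment, not a behavior change): each ZR5 round launches all
+			// four algo kernels over every thread; zr5_init_vars()/zr5_move_data_to_hash()
+			// route each thread's 64-byte state into the buffer of the algo it actually needs
+			// next, so only that buffer's result is carried forward for the thread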
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++); + } + zr5_final_round(thr_id, throughput); + + // do not scan results on interuption + if (work_restart[thr_id].restart) + return -1; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + uint32_t oldp19 = pdata[19]; + uint32_t offset = work->nonces[0] - pdata[19]; + uint32_t pok = 0; + uint16_t h_pok; + + *hashes_done = pdata[19] - first_nonce + throughput; + + cudaMemcpy(&h_pok, d_poks[thr_id] + offset, sizeof(uint16_t), cudaMemcpyDeviceToHost); + pok = version | (0x10000UL * h_pok); + pdata[0] = pok; pdata[19] = work->nonces[0]; + zr5hash(vhash, pdata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, oldp19, d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + offset = work->nonces[1] - oldp19; + cudaMemcpy(&h_pok, d_poks[thr_id] + offset, sizeof(uint16_t), cudaMemcpyDeviceToHost); + pok = version | (0x10000UL * h_pok); + memcpy(tmpdata, pdata, 80); + tmpdata[0] = pok; tmpdata[19] = work->nonces[1]; + zr5hash(vhash, tmpdata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + bn_set_target_ratio(work, vhash, 1); + pdata[19] = max(pdata[19], work->nonces[1]); // cursor + pdata[20] = pok; // second nonce "pok" + work->valid_nonces++; + } + pdata[19]++; + } + return work->valid_nonces; + } + else if (vhash[7] > ptarget[7]) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[0] = oldp0; + } + } else + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + pdata[0] = oldp0; + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} + +// cleanup +extern "C" void free_zr5(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + cudaFree(d_poks[thr_id]); + cudaFree(d_permut[thr_id]); + cudaFree(d_buffers[thr_id]); + + cudaFree(d_blake[thr_id]); + cudaFree(d_groes[thr_id]); + cudaFree(d_jh512[thr_id]); + cudaFree(d_skein[thr_id]); + + cudaFree(d_txs[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +}
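The per-thread routing above (d_permut, d_buffers, zr5_init_vars, zr5_move_data_to_hash) realizes on the GPU the same order selection that zr5hash() performs on the CPU: word 0 of the keccak-512 hash of the 80-byte header, taken modulo 24, selects one of the 24 orderings of blake/groestl/jh/skein. A minimal standalone sketch of just that selection, duplicating the permut table purely for illustration (not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* display names; indices match ZR_BLAKE..ZR_SKEIN in zr5.cu */
    static const char *algo_names[4] = { "blake", "groestl", "jh", "skein" };

    /* same 24-entry permutation table as zr5.cu */
    static const uint8_t permut[24][4] = {
        {0,1,2,3}, {0,1,3,2}, {0,2,1,3}, {0,2,3,1}, {0,3,1,2}, {0,3,2,1},
        {1,0,2,3}, {1,0,3,2}, {1,2,0,3}, {1,2,3,0}, {1,3,0,2}, {1,3,2,0},
        {2,0,1,3}, {2,0,3,1}, {2,1,0,3}, {2,1,3,0}, {2,3,0,1}, {2,3,1,0},
        {3,0,1,2}, {3,0,2,1}, {3,1,0,2}, {3,1,2,0}, {3,2,0,1}, {3,2,1,0}
    };

    int main(void)
    {
        /* stand-in value for word 0 of the keccak-512 hash of the header */
        uint32_t keccak_word0 = 0x12345678u;
        uint8_t norder = (uint8_t)(keccak_word0 % 24);

        /* print the algo order this thread/nonce would follow */
        printf("order %u:", norder);
        for (int i = 0; i < 4; i++)
            printf(" %s", algo_names[permut[norder][i]]);
        printf("\n");
        return 0;
    }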